diff options
1128 files changed, 15211 insertions, 12167 deletions
@@ -25,6 +25,8 @@ Aleksey Gorelov <aleksey_gorelov@phoenix.com> Alexander Lobakin <alobakin@pm.me> <alobakin@dlink.ru> Alexander Lobakin <alobakin@pm.me> <alobakin@marvell.com> Alexander Lobakin <alobakin@pm.me> <bloodyreaper@yandex.ru> +Alexander Mikhalitsyn <alexander@mihalicyn.com> <alexander.mikhalitsyn@virtuozzo.com> +Alexander Mikhalitsyn <alexander@mihalicyn.com> <aleksandr.mikhalitsyn@canonical.com> Alexandre Belloni <alexandre.belloni@bootlin.com> <alexandre.belloni@free-electrons.com> Alexei Starovoitov <ast@kernel.org> <alexei.starovoitov@gmail.com> Alexei Starovoitov <ast@kernel.org> <ast@fb.com> @@ -130,6 +132,7 @@ Domen Puncer <domen@coderock.org> Douglas Gilbert <dougg@torque.net> Ed L. Cashin <ecashin@coraid.com> Erik Kaneda <erik.kaneda@intel.com> <erik.schmauss@intel.com> +Eugen Hristev <eugen.hristev@collabora.com> <eugen.hristev@microchip.com> Evgeniy Polyakov <johnpol@2ka.mipt.ru> Ezequiel Garcia <ezequiel@vanguardiasur.com.ar> <ezequiel@collabora.com> Felipe W Damasio <felipewd@terra.com.br> @@ -214,6 +217,7 @@ Jisheng Zhang <jszhang@kernel.org> <jszhang@marvell.com> Jisheng Zhang <jszhang@kernel.org> <Jisheng.Zhang@synaptics.com> Johan Hovold <johan@kernel.org> <jhovold@gmail.com> Johan Hovold <johan@kernel.org> <johan@hovoldconsulting.com> +John Crispin <john@phrozen.org> <blogic@openwrt.org> John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> John Stultz <johnstul@us.ibm.com> Jordan Crouse <jordan@cosmicpenguin.net> <jcrouse@codeaurora.org> @@ -1173,6 +1173,10 @@ D: Future Domain TMC-16x0 SCSI driver (author) D: APM driver (early port) D: DRM drivers (author of several) +N: Veaceslav Falico +E: vfalico@gmail.com +D: Co-maintainer and co-author of the network bonding driver. 
+ N: János Farkas E: chexum@shadow.banki.hu D: romfs, various (mostly networking) fixes @@ -4179,6 +4183,10 @@ S: B-1206 Jingmao Guojigongyu S: 16 Baliqiao Nanjie, Beijing 101100 S: People's Repulic of China +N: Vlad Yasevich +E: vyasevich@gmail.com +D: SCTP protocol maintainer. + N: Aviad Yehezkel E: aviadye@nvidia.com D: Kernel TLS implementation and offload support. diff --git a/Documentation/ABI/testing/sysfs-fs-erofs b/Documentation/ABI/testing/sysfs-fs-erofs index bb4681a01811..284224d1b56f 100644 --- a/Documentation/ABI/testing/sysfs-fs-erofs +++ b/Documentation/ABI/testing/sysfs-fs-erofs @@ -4,7 +4,8 @@ Contact: "Huang Jianan" <huangjianan@oppo.com> Description: Shows all enabled kernel features. Supported features: zero_padding, compr_cfgs, big_pcluster, chunked_file, - device_table, compr_head2, sb_chksum. + device_table, compr_head2, sb_chksum, ztailpacking, + dedupe, fragments. What: /sys/fs/erofs/<disk>/sync_decompress Date: November 2021 diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index c8ae7c897f14..74cec76be9f2 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1245,13 +1245,17 @@ PAGE_SIZE multiple when read back. This is a simple interface to trigger memory reclaim in the target cgroup. - This file accepts a string which contains the number of bytes to - reclaim. + This file accepts a single key, the number of bytes to reclaim. + No nested keys are currently supported. Example:: echo "1G" > memory.reclaim + The interface can be later extended with nested keys to + configure the reclaim behavior. For example, specify the + type of memory to reclaim from (anon, file, ..). + Please note that the kernel can over or under reclaim from the target cgroup. If less bytes are reclaimed than the specified amount, -EAGAIN is returned. @@ -1263,13 +1267,6 @@ PAGE_SIZE multiple when read back. 
This means that the networking layer will not adapt based on reclaim induced by memory.reclaim. - This file also allows the user to specify the nodes to reclaim from, - via the 'nodes=' key, for example:: - - echo "1G nodes=0,1" > memory.reclaim - - The above instructs the kernel to reclaim memory from nodes 0,1. - memory.peak A read-only single value file which exists on non-root cgroups. diff --git a/Documentation/admin-guide/hw-vuln/cross-thread-rsb.rst b/Documentation/admin-guide/hw-vuln/cross-thread-rsb.rst new file mode 100644 index 000000000000..875616d675fe --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/cross-thread-rsb.rst @@ -0,0 +1,91 @@ + +.. SPDX-License-Identifier: GPL-2.0 + +Cross-Thread Return Address Predictions +======================================= + +Certain AMD and Hygon processors are subject to a cross-thread return address +predictions vulnerability. When running in SMT mode and one sibling thread +transitions out of C0 state, the other sibling thread could use return target +predictions from the sibling thread that transitioned out of C0. + +The Spectre v2 mitigations protect the Linux kernel, as it fills the return +address prediction entries with safe targets when context switching to the idle +thread. However, KVM does allow a VMM to prevent exiting guest mode when +transitioning out of C0. This could result in a guest-controlled return target +being consumed by the sibling thread. + +Affected processors +------------------- + +The following CPUs are vulnerable: + + - AMD Family 17h processors + - Hygon Family 18h processors + +Related CVEs +------------ + +The following CVE entry is related to this issue: + + ============== ======================================= + CVE-2022-27672 Cross-Thread Return Address Predictions + ============== ======================================= + +Problem +------- + +Affected SMT-capable processors support 1T and 2T modes of execution when SMT +is enabled. 
In 2T mode, both threads in a core are executing code. For the +processor core to enter 1T mode, it is required that one of the threads +requests to transition out of the C0 state. This can be communicated with the +HLT instruction or with an MWAIT instruction that requests non-C0. +When the thread re-enters the C0 state, the processor transitions back +to 2T mode, assuming the other thread is also still in C0 state. + +In affected processors, the return address predictor (RAP) is partitioned +depending on the SMT mode. For instance, in 2T mode each thread uses a private +16-entry RAP, but in 1T mode, the active thread uses a 32-entry RAP. Upon +transition between 1T/2T mode, the RAP contents are not modified but the RAP +pointers (which control the next return target to use for predictions) may +change. This behavior may result in return targets from one SMT thread being +used by RET predictions in the sibling thread following a 1T/2T switch. In +particular, a RET instruction executed immediately after a transition to 1T may +use a return target from the thread that just became idle. In theory, this +could lead to information disclosure if the return targets used do not come +from trustworthy code. + +Attack scenarios +---------------- + +An attack can be mounted on affected processors by performing a series of CALL +instructions with targeted return locations and then transitioning out of C0 +state. + +Mitigation mechanism +-------------------- + +Before entering idle state, the kernel context switches to the idle thread. The +context switch fills the RAP entries (referred to as the RSB in Linux) with safe +targets by performing a sequence of CALL instructions. + +Prevent a guest VM from directly putting the processor into an idle state by +intercepting HLT and MWAIT instructions. + +Both mitigations are required to fully address this issue. 
+ +Mitigation control on the kernel command line +--------------------------------------------- + +Use existing Spectre v2 mitigations that will fill the RSB on context switch. + +Mitigation control for KVM - module parameter +--------------------------------------------- + +By default, the KVM hypervisor mitigates this issue by intercepting guest +attempts to transition out of C0. A VMM can use the KVM_CAP_X86_DISABLE_EXITS +capability to override those interceptions, but since this is not common, the +mitigation that covers this path is not enabled by default. + +The mitigation for the KVM_CAP_X86_DISABLE_EXITS capability can be turned on +using the boolean module parameter mitigate_smt_rsb, e.g. ``kvm.mitigate_smt_rsb=1``. diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 4df436e7c417..e0614760a99e 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -18,3 +18,4 @@ are configurable at compile, boot or run time. 
core-scheduling.rst l1d_flush.rst processor_mmio_stale_data.rst + cross-thread-rsb.rst diff --git a/Documentation/devicetree/bindings/.gitignore b/Documentation/devicetree/bindings/.gitignore index a77719968a7e..51ddb26d93f0 100644 --- a/Documentation/devicetree/bindings/.gitignore +++ b/Documentation/devicetree/bindings/.gitignore @@ -2,3 +2,8 @@ *.example.dts /processed-schema*.yaml /processed-schema*.json + +# +# We don't want to ignore the following even if they are dot-files +# +!.yamllint diff --git a/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml b/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml index 9f7d3e11aacb..8449e14af9f3 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml @@ -108,7 +108,7 @@ properties: msi-controller: description: - Only present if the Message Based Interrupt functionnality is + Only present if the Message Based Interrupt functionality is being exposed by the HW, and the mbi-ranges property present. mbi-ranges: diff --git a/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml b/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml index 0a7aa29563c1..21c8ea08ff0a 100644 --- a/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml +++ b/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml @@ -40,6 +40,8 @@ properties: description: Indicates that the setting of RTC time is allowed by the host CPU. 
+ wakeup-source: true + required: - compatible - reg diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index ef183387da20..eccd327e6df5 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -1277,8 +1277,8 @@ the file contents themselves, as described below: For the read path (->read_folio()) of regular files, filesystems can read the ciphertext into the page cache and decrypt it in-place. The -page lock must be held until decryption has finished, to prevent the -page from becoming visible to userspace prematurely. +folio lock must be held until decryption has finished, to prevent the +folio from becoming visible to userspace prematurely. For the write path (->writepage()) of regular files, filesystems cannot encrypt data in-place in the page cache, since the cached diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index cb8e7573882a..ede672dedf11 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -118,10 +118,11 @@ as follows: - ``hash_algorithm`` must be the identifier for the hash algorithm to use for the Merkle tree, such as FS_VERITY_HASH_ALG_SHA256. See ``include/uapi/linux/fsverity.h`` for the list of possible values. -- ``block_size`` must be the Merkle tree block size. Currently, this - must be equal to the system page size, which is usually 4096 bytes. - Other sizes may be supported in the future. This value is not - necessarily the same as the filesystem block size. +- ``block_size`` is the Merkle tree block size, in bytes. In Linux + v6.3 and later, this can be any power of 2 between (inclusively) + 1024 and the minimum of the system page size and the filesystem + block size. In earlier versions, the page size was the only allowed + value. - ``salt_size`` is the size of the salt in bytes, or 0 if no salt is provided. 
The salt is a value that is prepended to every hashed block; it can be used to personalize the hashing for a particular @@ -161,6 +162,7 @@ FS_IOC_ENABLE_VERITY can fail with the following errors: - ``EBUSY``: this ioctl is already running on the file - ``EEXIST``: the file already has verity enabled - ``EFAULT``: the caller provided inaccessible memory +- ``EFBIG``: the file is too large to enable verity on - ``EINTR``: the operation was interrupted by a fatal signal - ``EINVAL``: unsupported version, hash algorithm, or block size; or reserved bits are set; or the file descriptor refers to neither a @@ -495,9 +497,11 @@ To create verity files on an ext4 filesystem, the filesystem must have been formatted with ``-O verity`` or had ``tune2fs -O verity`` run on it. "verity" is an RO_COMPAT filesystem feature, so once set, old kernels will only be able to mount the filesystem readonly, and old -versions of e2fsck will be unable to check the filesystem. Moreover, -currently ext4 only supports mounting a filesystem with the "verity" -feature when its block size is equal to PAGE_SIZE (often 4096 bytes). +versions of e2fsck will be unable to check the filesystem. + +Originally, an ext4 filesystem with the "verity" feature could only be +mounted when its block size was equal to the system page size +(typically 4096 bytes). In Linux v6.3, this limitation was removed. ext4 sets the EXT4_VERITY_FL on-disk inode flag on verity files. It can only be set by `FS_IOC_ENABLE_VERITY`_, and it cannot be cleared. @@ -518,9 +522,7 @@ support paging multi-gigabyte xattrs into memory, and to support encrypting xattrs. Note that the verity metadata *must* be encrypted when the file is, since it contains hashes of the plaintext data. -Currently, ext4 verity only supports the case where the Merkle tree -block size, filesystem block size, and page size are all the same. It -also only supports extent-based files. +ext4 only allows verity on extent-based files. 
f2fs ---- @@ -538,11 +540,10 @@ Like ext4, f2fs stores the verity metadata (Merkle tree and fsverity_descriptor) past the end of the file, starting at the first 64K boundary beyond i_size. See explanation for ext4 above. Moreover, f2fs supports at most 4096 bytes of xattr entries per inode -which wouldn't be enough for even a single Merkle tree block. +which usually wouldn't be enough for even a single Merkle tree block. -Currently, f2fs verity only supports a Merkle tree block size of 4096. -Also, f2fs doesn't support enabling verity on files that currently -have atomic or volatile writes pending. +f2fs doesn't support enabling verity on files that currently have +atomic or volatile writes pending. btrfs ----- @@ -567,51 +568,48 @@ Pagecache ~~~~~~~~~ For filesystems using Linux's pagecache, the ``->read_folio()`` and -``->readahead()`` methods must be modified to verify pages before they -are marked Uptodate. Merely hooking ``->read_iter()`` would be +``->readahead()`` methods must be modified to verify folios before +they are marked Uptodate. Merely hooking ``->read_iter()`` would be insufficient, since ``->read_iter()`` is not used for memory maps. -Therefore, fs/verity/ provides a function fsverity_verify_page() which -verifies a page that has been read into the pagecache of a verity -inode, but is still locked and not Uptodate, so it's not yet readable -by userspace. As needed to do the verification, -fsverity_verify_page() will call back into the filesystem to read -Merkle tree pages via fsverity_operations::read_merkle_tree_page(). +Therefore, fs/verity/ provides the function fsverity_verify_blocks() +which verifies data that has been read into the pagecache of a verity +inode. The containing folio must still be locked and not Uptodate, so +it's not yet readable by userspace. As needed to do the verification, +fsverity_verify_blocks() will call back into the filesystem to read +hash blocks via fsverity_operations::read_merkle_tree_page(). 
-fsverity_verify_page() returns false if verification failed; in this -case, the filesystem must not set the page Uptodate. Following this, +fsverity_verify_blocks() returns false if verification failed; in this +case, the filesystem must not set the folio Uptodate. Following this, as per the usual Linux pagecache behavior, attempts by userspace to -read() from the part of the file containing the page will fail with -EIO, and accesses to the page within a memory map will raise SIGBUS. - -fsverity_verify_page() currently only supports the case where the -Merkle tree block size is equal to PAGE_SIZE (often 4096 bytes). +read() from the part of the file containing the folio will fail with +EIO, and accesses to the folio within a memory map will raise SIGBUS. -In principle, fsverity_verify_page() verifies the entire path in the -Merkle tree from the data page to the root hash. However, for -efficiency the filesystem may cache the hash pages. Therefore, -fsverity_verify_page() only ascends the tree reading hash pages until -an already-verified hash page is seen, as indicated by the PageChecked -bit being set. It then verifies the path to that page. +In principle, verifying a data block requires verifying the entire +path in the Merkle tree from the data block to the root hash. +However, for efficiency the filesystem may cache the hash blocks. +Therefore, fsverity_verify_blocks() only ascends the tree reading hash +blocks until an already-verified hash block is seen. It then verifies +the path to that block. This optimization, which is also used by dm-verity, results in excellent sequential read performance. This is because usually (e.g. -127 in 128 times for 4K blocks and SHA-256) the hash page from the +127 in 128 times for 4K blocks and SHA-256) the hash block from the bottom level of the tree will already be cached and checked from -reading a previous data page. However, random reads perform worse. +reading a previous data block. However, random reads perform worse. 
Block device based filesystems ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Block device based filesystems (e.g. ext4 and f2fs) in Linux also use the pagecache, so the above subsection applies too. However, they -also usually read many pages from a file at once, grouped into a +also usually read many data blocks from a file at once, grouped into a structure called a "bio". To make it easier for these types of filesystems to support fs-verity, fs/verity/ also provides a function -fsverity_verify_bio() which verifies all pages in a bio. +fsverity_verify_bio() which verifies all data blocks in a bio. ext4 and f2fs also support encryption. If a verity file is also -encrypted, the pages must be decrypted before being verified. To +encrypted, the data must be decrypted before being verified. To support this, these filesystems allocate a "post-read context" for each bio and store it in ``->bi_private``:: @@ -626,14 +624,14 @@ each bio and store it in ``->bi_private``:: verity, or both is enabled. After the bio completes, for each needed postprocessing step the filesystem enqueues the bio_post_read_ctx on a workqueue, and then the workqueue work does the decryption or -verification. Finally, pages where no decryption or verity error -occurred are marked Uptodate, and the pages are unlocked. +verification. Finally, folios where no decryption or verity error +occurred are marked Uptodate, and the folios are unlocked. On many filesystems, files can contain holes. Normally, -``->readahead()`` simply zeroes holes and sets the corresponding pages -Uptodate; no bios are issued. To prevent this case from bypassing -fs-verity, these filesystems use fsverity_verify_page() to verify hole -pages. +``->readahead()`` simply zeroes hole blocks and considers the +corresponding data to be up-to-date; no bios are issued. To prevent +this case from bypassing fs-verity, filesystems use +fsverity_verify_blocks() to verify hole blocks. 
Filesystems also disable direct I/O on verity files, since otherwise direct I/O would bypass fs-verity. @@ -644,7 +642,7 @@ Userspace utility This document focuses on the kernel, but a userspace utility for fs-verity can be found at: - https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/fsverity-utils.git + https://git.kernel.org/pub/scm/fs/fsverity/fsverity-utils.git See the README.md file in the fsverity-utils source tree for details, including examples of setting up fs-verity protected files. @@ -793,9 +791,9 @@ weren't already directly answered in other parts of this document. :A: There are many reasons why this is not possible or would be very difficult, including the following: - - To prevent bypassing verification, pages must not be marked + - To prevent bypassing verification, folios must not be marked Uptodate until they've been verified. Currently, each - filesystem is responsible for marking pages Uptodate via + filesystem is responsible for marking folios Uptodate via ``->readahead()``. Therefore, currently it's not possible for the VFS to do the verification on its own. Changing this would require significant changes to the VFS and all filesystems. 
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 36fa2a83d714..7de7a7272a5e 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -56,35 +56,35 @@ inode_operations prototypes:: - int (*create) (struct inode *,struct dentry *,umode_t, bool); + int (*create) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t, bool); struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); - int (*symlink) (struct inode *,struct dentry *,const char *); - int (*mkdir) (struct inode *,struct dentry *,umode_t); + int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *,const char *); + int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t); int (*rmdir) (struct inode *,struct dentry *); - int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); - int (*rename) (struct inode *, struct dentry *, + int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t,dev_t); + int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); const char *(*get_link) (struct dentry *, struct inode *, struct delayed_call *); void (*truncate) (struct inode *); - int (*permission) (struct inode *, int, unsigned int); + int (*permission) (struct mnt_idmap *, struct inode *, int, unsigned int); struct posix_acl * (*get_inode_acl)(struct inode *, int, bool); - int (*setattr) (struct dentry *, struct iattr *); - int (*getattr) (const struct path *, struct kstat *, u32, unsigned int); + int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *); + int (*getattr) (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*fiemap)(struct inode *, struct 
fiemap_extent_info *, u64 start, u64 len); void (*update_time)(struct inode *, struct timespec *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); - int (*tmpfile) (struct user_namespace *, struct inode *, + int (*tmpfile) (struct mnt_idmap *, struct inode *, struct file *, umode_t); - int (*fileattr_set)(struct user_namespace *mnt_userns, + int (*fileattr_set)(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); - struct posix_acl * (*get_acl)(struct user_namespace *, struct dentry *, int); + struct posix_acl * (*get_acl)(struct mnt_idmap *, struct dentry *, int); locking rules: all may block @@ -135,7 +135,7 @@ prototypes:: struct inode *inode, const char *name, void *buffer, size_t size); int (*set)(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags); diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 2c15e7053113..c53f30251a66 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -421,31 +421,31 @@ As of kernel 2.6.22, the following members are defined: .. 
code-block:: c struct inode_operations { - int (*create) (struct user_namespace *, struct inode *,struct dentry *, umode_t, bool); + int (*create) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t, bool); struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); - int (*symlink) (struct user_namespace *, struct inode *,struct dentry *,const char *); - int (*mkdir) (struct user_namespace *, struct inode *,struct dentry *,umode_t); + int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *,const char *); + int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t); int (*rmdir) (struct inode *,struct dentry *); - int (*mknod) (struct user_namespace *, struct inode *,struct dentry *,umode_t,dev_t); - int (*rename) (struct user_namespace *, struct inode *, struct dentry *, + int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t,dev_t); + int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); const char *(*get_link) (struct dentry *, struct inode *, struct delayed_call *); - int (*permission) (struct user_namespace *, struct inode *, int); + int (*permission) (struct mnt_idmap *, struct inode *, int); struct posix_acl * (*get_inode_acl)(struct inode *, int, bool); - int (*setattr) (struct user_namespace *, struct dentry *, struct iattr *); - int (*getattr) (struct user_namespace *, const struct path *, struct kstat *, u32, unsigned int); + int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *); + int (*getattr) (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); void (*update_time)(struct inode *, struct timespec *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file 
*, unsigned open_flag, umode_t create_mode); - int (*tmpfile) (struct user_namespace *, struct inode *, struct file *, umode_t); - struct posix_acl * (*get_acl)(struct user_namespace *, struct dentry *, int); - int (*set_acl)(struct user_namespace *, struct dentry *, struct posix_acl *, int); - int (*fileattr_set)(struct user_namespace *mnt_userns, + int (*tmpfile) (struct mnt_idmap *, struct inode *, struct file *, umode_t); + struct posix_acl * (*get_acl)(struct mnt_idmap *, struct dentry *, int); + int (*set_acl)(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); + int (*fileattr_set)(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); }; diff --git a/Documentation/networking/device_drivers/ethernet/intel/ice.rst b/Documentation/networking/device_drivers/ethernet/intel/ice.rst index dc2e60ced927..b481b81f3be5 100644 --- a/Documentation/networking/device_drivers/ethernet/intel/ice.rst +++ b/Documentation/networking/device_drivers/ethernet/intel/ice.rst @@ -819,7 +819,7 @@ NAPI ---- This driver supports NAPI (Rx polling mode). For more information on NAPI, see -https://www.linuxfoundation.org/collaborate/workgroups/networking/napi +https://wiki.linuxfoundation.org/networking/napi MACVLAN diff --git a/Documentation/networking/device_drivers/ethernet/wangxun/txgbe.rst b/Documentation/networking/device_drivers/ethernet/wangxun/txgbe.rst index eaa87dbe8848..d052ef40fe36 100644 --- a/Documentation/networking/device_drivers/ethernet/wangxun/txgbe.rst +++ b/Documentation/networking/device_drivers/ethernet/wangxun/txgbe.rst @@ -16,5 +16,5 @@ Contents Support ======= -If you got any problem, contact Wangxun support team via support@trustnetic.com +If you got any problem, contact Wangxun support team via nic-support@net-swift.com and Cc: netdev. 
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9807b05a1b57..0a67cb738013 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8070,9 +8070,13 @@ considering the state as complete. VMM needs to ensure that the dirty state is final and avoid missing dirty pages from another ioctl ordered after the bitmap collection. -NOTE: One example of using the backup bitmap is saving arm64 vgic/its -tables through KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_SAVE_TABLES} command on -KVM device "kvm-arm-vgic-its" when dirty ring is enabled. +NOTE: Multiple examples of using the backup bitmap: (1) save vgic/its +tables through command KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_SAVE_TABLES} on +KVM device "kvm-arm-vgic-its". (2) restore vgic/its tables through +command KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_RESTORE_TABLES} on KVM device +"kvm-arm-vgic-its". VGICv3 LPI pending status is restored. (3) save +vgic3 pending table through KVM_DEV_ARM_VGIC_{GRP_CTRL, SAVE_PENDING_TABLES} +command on KVM device "kvm-arm-vgic-v3". 
8.30 KVM_CAP_XEN_HVM -------------------- diff --git a/MAINTAINERS b/MAINTAINERS index 8a5c25c20d00..82938ca70466 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1097,7 +1097,6 @@ S: Maintained F: drivers/dma/ptdma/ AMD SEATTLE DEVICE TREE SUPPORT -M: Brijesh Singh <brijeshkumar.singh@amd.com> M: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> M: Tom Lendacky <thomas.lendacky@amd.com> S: Supported @@ -2212,6 +2211,9 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git X: drivers/media/i2c/ +F: arch/arm64/boot/dts/freescale/ +X: arch/arm64/boot/dts/freescale/fsl-* +X: arch/arm64/boot/dts/freescale/qoriq-* N: imx N: mxs @@ -2450,11 +2452,14 @@ F: drivers/rtc/rtc-mt7622.c ARM/Mediatek SoC support M: Matthias Brugger <matthias.bgg@gmail.com> +R: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com> +L: linux-kernel@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-mediatek@lists.infradead.org (moderated for non-subscribers) S: Maintained W: https://mtk.wiki.kernel.org/ -C: irc://chat.freenode.net/linux-mediatek +C: irc://irc.libera.chat/linux-mediatek +F: arch/arm/boot/dts/mt2* F: arch/arm/boot/dts/mt6* F: arch/arm/boot/dts/mt7* F: arch/arm/boot/dts/mt8* @@ -2462,7 +2467,7 @@ F: arch/arm/mach-mediatek/ F: arch/arm64/boot/dts/mediatek/ F: drivers/soc/mediatek/ N: mtk -N: mt[678] +N: mt[2678] K: mediatek ARM/Mediatek USB3 PHY DRIVER @@ -3766,7 +3771,6 @@ F: net/bluetooth/ BONDING DRIVER M: Jay Vosburgh <j.vosburgh@gmail.com> -M: Veaceslav Falico <vfalico@gmail.com> M: Andy Gospodarek <andy@greyhouse.net> L: netdev@vger.kernel.org S: Supported @@ -7743,6 +7747,7 @@ R: Jeffle Xu <jefflexu@linux.alibaba.com> L: linux-erofs@lists.ozlabs.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git +F: Documentation/ABI/testing/sysfs-fs-erofs F: 
Documentation/filesystems/erofs.rst F: fs/erofs/ F: include/trace/events/erofs.h @@ -8198,7 +8203,7 @@ F: drivers/fpga/microchip-spi.c FPU EMULATOR M: Bill Metzenthen <billm@melbpc.org.au> S: Maintained -W: http://floatingpoint.sourceforge.net/emulator/index.html +W: https://floatingpoint.billm.au/ F: arch/x86/math-emu/ FRAMEBUFFER CORE @@ -9992,7 +9997,7 @@ S: Maintained T: git://git.kernel.org/pub/scm/linux/kernel/git/vfs/idmapping.git F: Documentation/filesystems/idmappings.rst F: tools/testing/selftests/mount_setattr/ -F: include/linux/mnt_idmapping.h +F: include/linux/mnt_idmapping.* IDT VersaClock 5 CLOCK DRIVER M: Luca Ceresoli <luca@lucaceresoli.net> @@ -14604,7 +14609,6 @@ F: tools/testing/selftests/net/ipsec.c NETWORKING [IPv4/IPv6] M: "David S. Miller" <davem@davemloft.net> -M: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org> M: David Ahern <dsahern@kernel.org> L: netdev@vger.kernel.org S: Maintained @@ -15661,7 +15665,7 @@ OPENRISC ARCHITECTURE M: Jonas Bonn <jonas@southpole.se> M: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi> M: Stafford Horne <shorne@gmail.com> -L: openrisc@lists.librecores.org +L: linux-openrisc@vger.kernel.org S: Maintained W: http://openrisc.io T: git https://github.com/openrisc/linux.git @@ -16117,7 +16121,7 @@ F: drivers/pci/controller/pci-v3-semi.c PCI ENDPOINT SUBSYSTEM M: Lorenzo Pieralisi <lpieralisi@kernel.org> -R: Krzysztof Wilczyński <kw@linux.com> +M: Krzysztof Wilczyński <kw@linux.com> R: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> R: Kishon Vijay Abraham I <kishon@kernel.org> L: linux-pci@vger.kernel.org @@ -16125,7 +16129,7 @@ S: Supported Q: https://patchwork.kernel.org/project/linux-pci/list/ B: https://bugzilla.kernel.org C: irc://irc.oftc.net/linux-pci -T: git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git F: Documentation/PCI/endpoint/* F: Documentation/misc-devices/pci-endpoint-test.rst F: 
drivers/misc/pci_endpoint_test.c @@ -16160,7 +16164,7 @@ S: Supported Q: https://patchwork.kernel.org/project/linux-pci/list/ B: https://bugzilla.kernel.org C: irc://irc.oftc.net/linux-pci -T: git git://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git F: Documentation/driver-api/pci/p2pdma.rst F: drivers/pci/p2pdma.c F: include/linux/pci-p2pdma.h @@ -16182,14 +16186,14 @@ F: drivers/pci/controller/pci-xgene-msi.c PCI NATIVE HOST BRIDGE AND ENDPOINT DRIVERS M: Lorenzo Pieralisi <lpieralisi@kernel.org> +M: Krzysztof Wilczyński <kw@linux.com> R: Rob Herring <robh@kernel.org> -R: Krzysztof Wilczyński <kw@linux.com> L: linux-pci@vger.kernel.org S: Supported Q: https://patchwork.kernel.org/project/linux-pci/list/ B: https://bugzilla.kernel.org C: irc://irc.oftc.net/linux-pci -T: git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git F: Documentation/devicetree/bindings/pci/ F: drivers/pci/controller/ F: drivers/pci/pci-bridge-emul.c @@ -16202,7 +16206,7 @@ S: Supported Q: https://patchwork.kernel.org/project/linux-pci/list/ B: https://bugzilla.kernel.org C: irc://irc.oftc.net/linux-pci -T: git git://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git F: Documentation/PCI/ F: Documentation/devicetree/bindings/pci/ F: arch/x86/kernel/early-quirks.c @@ -18231,6 +18235,7 @@ L: rust-for-linux@vger.kernel.org S: Supported W: https://github.com/Rust-for-Linux/linux B: https://github.com/Rust-for-Linux/linux/issues +C: zulip://rust-for-linux.zulipchat.com T: git https://github.com/Rust-for-Linux/linux.git rust-next F: Documentation/rust/ F: rust/ @@ -18687,9 +18692,9 @@ F: drivers/target/ F: include/target/ SCTP PROTOCOL -M: Vlad Yasevich <vyasevich@gmail.com> M: Neil Horman <nhorman@tuxdriver.com> M: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> +M: 
Xin Long <lucien.xin@gmail.com> L: linux-sctp@vger.kernel.org S: Maintained W: http://lksctp.sourceforge.net @@ -20087,6 +20092,7 @@ F: drivers/watchdog/sunplus_wdt.c SUPERH M: Yoshinori Sato <ysato@users.sourceforge.jp> M: Rich Felker <dalias@libc.org> +M: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> L: linux-sh@vger.kernel.org S: Maintained Q: http://patchwork.kernel.org/project/linux-sh/list/ @@ -20319,8 +20325,7 @@ S: Maintained F: drivers/platform/x86/system76_acpi.c SYSV FILESYSTEM -M: Christoph Hellwig <hch@infradead.org> -S: Maintained +S: Orphan F: Documentation/filesystems/sysv-fs.rst F: fs/sysv/ F: include/linux/sysv_fs.h @@ -21727,6 +21732,7 @@ F: include/uapi/linux/uvcvideo.h USB WEBCAM GADGET M: Laurent Pinchart <laurent.pinchart@ideasonboard.com> +M: Daniel Scally <dan.scally@ideasonboard.com> L: linux-usb@vger.kernel.org S: Maintained F: drivers/usb/gadget/function/*uvc* @@ -21814,11 +21820,9 @@ W: http://en.wikipedia.org/wiki/Util-linux T: git git://git.kernel.org/pub/scm/utils/util-linux/util-linux.git UUID HELPERS -M: Christoph Hellwig <hch@lst.de> R: Andy Shevchenko <andriy.shevchenko@linux.intel.com> L: linux-kernel@vger.kernel.org S: Maintained -T: git git://git.infradead.org/users/hch/uuid.git F: include/linux/uuid.h F: include/uapi/linux/uuid.h F: lib/test_uuid.c @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 2 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* @@ -1602,7 +1602,7 @@ endif # CONFIG_MODULES CLEAN_FILES += include/ksym vmlinux.symvers modules-only.symvers \ modules.builtin modules.builtin.modinfo modules.nsdeps \ compile_commands.json .thinlto-cache rust/test rust/doc \ - .vmlinux.objs .vmlinux.export.c + rust-project.json .vmlinux.objs .vmlinux.export.c # Directories & files removed with 'make mrproper' MRPROPER_FILES += include/config include/generated \ diff --git a/arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts b/arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts index 
d1971ddf06a5..7f755e5a4624 100644 --- a/arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts +++ b/arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts @@ -751,7 +751,7 @@ }; pca9849@75 { - compatible = "nxp,pca849"; + compatible = "nxp,pca9849"; reg = <0x75>; #address-cells = <1>; #size-cells = <0>; diff --git a/arch/arm/boot/dts/imx7d-smegw01.dts b/arch/arm/boot/dts/imx7d-smegw01.dts index 546268b8d0b1..c0f00f5db11e 100644 --- a/arch/arm/boot/dts/imx7d-smegw01.dts +++ b/arch/arm/boot/dts/imx7d-smegw01.dts @@ -198,6 +198,7 @@ &usbotg2 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_usbotg2>; + over-current-active-low; dr_mode = "host"; status = "okay"; }; @@ -374,7 +375,7 @@ pinctrl_usbotg2: usbotg2grp { fsl,pins = < - MX7D_PAD_UART3_RTS_B__USB_OTG2_OC 0x04 + MX7D_PAD_UART3_RTS_B__USB_OTG2_OC 0x5c >; }; diff --git a/arch/arm/boot/dts/nuvoton-wpcm450.dtsi b/arch/arm/boot/dts/nuvoton-wpcm450.dtsi index b637241316bb..fd671c7a1e5d 100644 --- a/arch/arm/boot/dts/nuvoton-wpcm450.dtsi +++ b/arch/arm/boot/dts/nuvoton-wpcm450.dtsi @@ -480,6 +480,7 @@ reg = <0xc8000000 0x1000>, <0xc0000000 0x4000000>; reg-names = "control", "memory"; clocks = <&clk 0>; + nuvoton,shm = <&shm>; status = "disabled"; }; diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi index 487b0e03d4b4..2ca76b69add7 100644 --- a/arch/arm/boot/dts/rk3288.dtsi +++ b/arch/arm/boot/dts/rk3288.dtsi @@ -1181,6 +1181,7 @@ clock-names = "dp", "pclk"; phys = <&edp_phy>; phy-names = "dp"; + power-domains = <&power RK3288_PD_VIO>; resets = <&cru SRST_EDP>; reset-names = "dp"; rockchip,grf = <&grf>; diff --git a/arch/arm/boot/dts/stihxxx-b2120.dtsi b/arch/arm/boot/dts/stihxxx-b2120.dtsi index 920a0bad7494..8d9a2dfa76f1 100644 --- a/arch/arm/boot/dts/stihxxx-b2120.dtsi +++ b/arch/arm/boot/dts/stihxxx-b2120.dtsi @@ -178,7 +178,7 @@ tsin-num = <0>; serial-not-parallel; i2c-bus = <&ssc2>; - reset-gpios = <&pio15 4 GPIO_ACTIVE_HIGH>; + reset-gpios = <&pio15 4 GPIO_ACTIVE_LOW>; dvb-card = <STV0367_TDA18212_NIMA_1>; 
}; }; diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index 68112c172025..006163195d67 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -73,6 +73,7 @@ #include <linux/syscalls.h> #include <linux/errno.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/cred.h> #include <linux/fcntl.h> #include <linux/eventpoll.h> diff --git a/arch/arm64/boot/dts/amlogic/meson-axg.dtsi b/arch/arm64/boot/dts/amlogic/meson-axg.dtsi index 1648e67afbb6..417523dc4cc0 100644 --- a/arch/arm64/boot/dts/amlogic/meson-axg.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-axg.dtsi @@ -1886,7 +1886,7 @@ sd_emmc_b: sd@5000 { compatible = "amlogic,meson-axg-mmc"; reg = <0x0 0x5000 0x0 0x800>; - interrupts = <GIC_SPI 217 IRQ_TYPE_EDGE_RISING>; + interrupts = <GIC_SPI 217 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; clocks = <&clkc CLKID_SD_EMMC_B>, <&clkc CLKID_SD_EMMC_B_CLK0>, @@ -1898,7 +1898,7 @@ sd_emmc_c: mmc@7000 { compatible = "amlogic,meson-axg-mmc"; reg = <0x0 0x7000 0x0 0x800>; - interrupts = <GIC_SPI 218 IRQ_TYPE_EDGE_RISING>; + interrupts = <GIC_SPI 218 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; clocks = <&clkc CLKID_SD_EMMC_C>, <&clkc CLKID_SD_EMMC_C_CLK0>, diff --git a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi index 9dbd50820b1c..7f55d97f6c28 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi @@ -2324,7 +2324,7 @@ sd_emmc_a: sd@ffe03000 { compatible = "amlogic,meson-axg-mmc"; reg = <0x0 0xffe03000 0x0 0x800>; - interrupts = <GIC_SPI 189 IRQ_TYPE_EDGE_RISING>; + interrupts = <GIC_SPI 189 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; clocks = <&clkc CLKID_SD_EMMC_A>, <&clkc CLKID_SD_EMMC_A_CLK0>, @@ -2336,7 +2336,7 @@ sd_emmc_b: sd@ffe05000 { compatible = "amlogic,meson-axg-mmc"; reg = <0x0 0xffe05000 0x0 0x800>; - interrupts = <GIC_SPI 190 IRQ_TYPE_EDGE_RISING>; + interrupts = 
<GIC_SPI 190 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; clocks = <&clkc CLKID_SD_EMMC_B>, <&clkc CLKID_SD_EMMC_B_CLK0>, @@ -2348,7 +2348,7 @@ sd_emmc_c: mmc@ffe07000 { compatible = "amlogic,meson-axg-mmc"; reg = <0x0 0xffe07000 0x0 0x800>; - interrupts = <GIC_SPI 191 IRQ_TYPE_EDGE_RISING>; + interrupts = <GIC_SPI 191 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; clocks = <&clkc CLKID_SD_EMMC_C>, <&clkc CLKID_SD_EMMC_C_CLK0>, diff --git a/arch/arm64/boot/dts/amlogic/meson-gx.dtsi b/arch/arm64/boot/dts/amlogic/meson-gx.dtsi index e3c12e0be99d..5eed15035b67 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gx.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gx.dtsi @@ -603,21 +603,21 @@ sd_emmc_a: mmc@70000 { compatible = "amlogic,meson-gx-mmc", "amlogic,meson-gxbb-mmc"; reg = <0x0 0x70000 0x0 0x800>; - interrupts = <GIC_SPI 216 IRQ_TYPE_EDGE_RISING>; + interrupts = <GIC_SPI 216 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; }; sd_emmc_b: mmc@72000 { compatible = "amlogic,meson-gx-mmc", "amlogic,meson-gxbb-mmc"; reg = <0x0 0x72000 0x0 0x800>; - interrupts = <GIC_SPI 217 IRQ_TYPE_EDGE_RISING>; + interrupts = <GIC_SPI 217 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; }; sd_emmc_c: mmc@74000 { compatible = "amlogic,meson-gx-mmc", "amlogic,meson-gxbb-mmc"; reg = <0x0 0x74000 0x0 0x800>; - interrupts = <GIC_SPI 218 IRQ_TYPE_EDGE_RISING>; + interrupts = <GIC_SPI 218 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; }; }; diff --git a/arch/arm64/boot/dts/freescale/imx8dxl.dtsi b/arch/arm64/boot/dts/freescale/imx8dxl.dtsi index 0c64b9194621..214f21bd0cb4 100644 --- a/arch/arm64/boot/dts/freescale/imx8dxl.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8dxl.dtsi @@ -164,7 +164,7 @@ sc_pwrkey: keys { compatible = "fsl,imx8qxp-sc-key", "fsl,imx-sc-key"; - linux,keycode = <KEY_POWER>; + linux,keycodes = <KEY_POWER>; wakeup-source; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-data-modul-edm-sbc.dts b/arch/arm64/boot/dts/freescale/imx8mm-data-modul-edm-sbc.dts index 752f409a30b1..9889319d4f04 100644 
--- a/arch/arm64/boot/dts/freescale/imx8mm-data-modul-edm-sbc.dts +++ b/arch/arm64/boot/dts/freescale/imx8mm-data-modul-edm-sbc.dts @@ -88,6 +88,7 @@ pinctrl-names = "default"; pinctrl-0 = <&pinctrl_watchdog_gpio>; compatible = "linux,wdt-gpio"; + always-running; gpios = <&gpio1 8 GPIO_ACTIVE_HIGH>; hw_algo = "level"; /* Reset triggers in 2..3 seconds */ diff --git a/arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h b/arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h index 83c8f715cd90..b1f11098d248 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h +++ b/arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h @@ -602,7 +602,7 @@ #define MX8MM_IOMUXC_UART1_RXD_GPIO5_IO22 0x234 0x49C 0x000 0x5 0x0 #define MX8MM_IOMUXC_UART1_RXD_TPSMP_HDATA24 0x234 0x49C 0x000 0x7 0x0 #define MX8MM_IOMUXC_UART1_TXD_UART1_DCE_TX 0x238 0x4A0 0x000 0x0 0x0 -#define MX8MM_IOMUXC_UART1_TXD_UART1_DTE_RX 0x238 0x4A0 0x4F4 0x0 0x0 +#define MX8MM_IOMUXC_UART1_TXD_UART1_DTE_RX 0x238 0x4A0 0x4F4 0x0 0x1 #define MX8MM_IOMUXC_UART1_TXD_ECSPI3_MOSI 0x238 0x4A0 0x000 0x1 0x0 #define MX8MM_IOMUXC_UART1_TXD_GPIO5_IO23 0x238 0x4A0 0x000 0x5 0x0 #define MX8MM_IOMUXC_UART1_TXD_TPSMP_HDATA25 0x238 0x4A0 0x000 0x7 0x0 diff --git a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw72xx-0x-rs232-rts.dtso b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw72xx-0x-rs232-rts.dtso index 3ea73a6886ff..f6ad1a4b8b66 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw72xx-0x-rs232-rts.dtso +++ b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw72xx-0x-rs232-rts.dtso @@ -33,7 +33,6 @@ pinctrl-0 = <&pinctrl_uart2>; rts-gpios = <&gpio5 29 GPIO_ACTIVE_LOW>; cts-gpios = <&gpio5 28 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw73xx-0x-rs232-rts.dtso b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw73xx-0x-rs232-rts.dtso index 2fa635e1c1a8..1f8ea20dfafc 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw73xx-0x-rs232-rts.dtso +++ 
b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw73xx-0x-rs232-rts.dtso @@ -33,7 +33,6 @@ pinctrl-0 = <&pinctrl_uart2>; rts-gpios = <&gpio5 29 GPIO_ACTIVE_LOW>; cts-gpios = <&gpio5 28 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw73xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw73xx.dtsi index 244ef8d6cc68..7761d5671cb1 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw73xx.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw73xx.dtsi @@ -222,7 +222,6 @@ pinctrl-0 = <&pinctrl_uart3>, <&pinctrl_bten>; cts-gpios = <&gpio5 8 GPIO_ACTIVE_LOW>; rts-gpios = <&gpio5 9 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; bluetooth { diff --git a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7901.dts b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7901.dts index 6433c205f8dd..64b366e83fa1 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7901.dts +++ b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7901.dts @@ -733,7 +733,6 @@ dtr-gpios = <&gpio1 14 GPIO_ACTIVE_LOW>; dsr-gpios = <&gpio1 1 GPIO_ACTIVE_LOW>; dcd-gpios = <&gpio1 11 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; }; @@ -749,7 +748,6 @@ pinctrl-0 = <&pinctrl_uart3>, <&pinctrl_uart3_gpio>; cts-gpios = <&gpio4 10 GPIO_ACTIVE_LOW>; rts-gpios = <&gpio4 9 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; }; @@ -758,7 +756,6 @@ pinctrl-0 = <&pinctrl_uart4>, <&pinctrl_uart4_gpio>; cts-gpios = <&gpio5 11 GPIO_ACTIVE_LOW>; rts-gpios = <&gpio5 12 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7902.dts b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7902.dts index 32872b0b1aaf..e8bc1fccc47b 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7902.dts +++ b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7902.dts @@ -664,7 +664,6 @@ pinctrl-0 = <&pinctrl_uart1>, <&pinctrl_uart1_gpio>; rts-gpios = <&gpio4 10 GPIO_ACTIVE_LOW>; cts-gpios = 
<&gpio4 24 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; }; @@ -681,7 +680,6 @@ pinctrl-0 = <&pinctrl_uart3>, <&pinctrl_uart3_gpio>; rts-gpios = <&gpio2 1 GPIO_ACTIVE_LOW>; cts-gpios = <&gpio2 0 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; bluetooth { @@ -699,7 +697,6 @@ dtr-gpios = <&gpio4 3 GPIO_ACTIVE_LOW>; dsr-gpios = <&gpio4 4 GPIO_ACTIVE_LOW>; dcd-gpios = <&gpio4 6 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7903.dts b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7903.dts index 8ce562246a08..acc2ba8e00a8 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7903.dts +++ b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7903.dts @@ -581,7 +581,6 @@ dtr-gpios = <&gpio1 0 GPIO_ACTIVE_LOW>; dsr-gpios = <&gpio1 1 GPIO_ACTIVE_LOW>; dcd-gpios = <&gpio3 24 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi index 0d454e0e2f7c..702d87621bb4 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi @@ -98,6 +98,7 @@ off-on-delay = <500000>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_reg_eth>; + regulator-always-on; regulator-boot-on; regulator-max-microvolt = <3300000>; regulator-min-microvolt = <3300000>; diff --git a/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts b/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts index b9444e4a3d2d..7c12518dbc96 100644 --- a/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts +++ b/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts @@ -643,7 +643,6 @@ pinctrl-0 = <&pinctrl_uart3>, <&pinctrl_uart3_gpio>; rts-gpios = <&gpio2 1 GPIO_ACTIVE_LOW>; cts-gpios = <&gpio2 0 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; bluetooth { diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts 
b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts index ceeca4966fc5..8eb7d5ee38da 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts +++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts @@ -623,7 +623,6 @@ pinctrl-0 = <&pinctrl_uart3>, <&pinctrl_uart3_gpio>; cts-gpios = <&gpio3 21 GPIO_ACTIVE_LOW>; rts-gpios = <&gpio3 22 GPIO_ACTIVE_LOW>; - uart-has-rtscts; status = "okay"; bluetooth { diff --git a/arch/arm64/boot/dts/mediatek/mt8195.dtsi b/arch/arm64/boot/dts/mediatek/mt8195.dtsi index 5d31536f4c48..c10cfeb1214d 100644 --- a/arch/arm64/boot/dts/mediatek/mt8195.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8195.dtsi @@ -2146,7 +2146,7 @@ }; vdosys0: syscon@1c01a000 { - compatible = "mediatek,mt8195-mmsys", "syscon"; + compatible = "mediatek,mt8195-vdosys0", "mediatek,mt8195-mmsys", "syscon"; reg = <0 0x1c01a000 0 0x1000>; mboxes = <&gce0 0 CMDQ_THR_PRIO_4>; #clock-cells = <1>; @@ -2292,7 +2292,7 @@ }; vdosys1: syscon@1c100000 { - compatible = "mediatek,mt8195-mmsys", "syscon"; + compatible = "mediatek,mt8195-vdosys1", "syscon"; reg = <0 0x1c100000 0 0x1000>; #clock-cells = <1>; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3328-roc-cc.dts b/arch/arm64/boot/dts/rockchip/rk3328-roc-cc.dts index aa22a0c22265..5d5d9574088c 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-roc-cc.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-roc-cc.dts @@ -96,7 +96,6 @@ linux,default-trigger = "heartbeat"; gpios = <&rk805 1 GPIO_ACTIVE_LOW>; default-state = "on"; - mode = <0x23>; }; user_led: led-1 { @@ -104,7 +103,6 @@ linux,default-trigger = "mmc1"; gpios = <&rk805 0 GPIO_ACTIVE_LOW>; default-state = "off"; - mode = <0x05>; }; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399-op1-opp.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-op1-opp.dtsi index 6e29e74f6fc6..783120e9cebe 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-op1-opp.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-op1-opp.dtsi @@ -111,7 +111,7 @@ }; }; - dmc_opp_table: dmc_opp_table { + 
dmc_opp_table: opp-table-3 { compatible = "operating-points-v2"; opp00 { diff --git a/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts b/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts index 04403a76238b..a0795a2b1cb1 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts +++ b/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts @@ -104,6 +104,13 @@ }; }; +&cpu_alert0 { + temperature = <65000>; +}; +&cpu_alert1 { + temperature = <68000>; +}; + &cpu_l0 { cpu-supply = <&vdd_cpu_l>; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index 4391aea25984..1881b4b71f91 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -589,7 +589,7 @@ clocks = <&cru HCLK_M_CRYPTO0>, <&cru HCLK_S_CRYPTO0>, <&cru SCLK_CRYPTO0>; clock-names = "hclk_master", "hclk_slave", "sclk"; resets = <&cru SRST_CRYPTO0>, <&cru SRST_CRYPTO0_S>, <&cru SRST_CRYPTO0_M>; - reset-names = "master", "lave", "crypto"; + reset-names = "master", "slave", "crypto-rst"; }; crypto1: crypto@ff8b8000 { @@ -599,7 +599,7 @@ clocks = <&cru HCLK_M_CRYPTO1>, <&cru HCLK_S_CRYPTO1>, <&cru SCLK_CRYPTO1>; clock-names = "hclk_master", "hclk_slave", "sclk"; resets = <&cru SRST_CRYPTO1>, <&cru SRST_CRYPTO1_S>, <&cru SRST_CRYPTO1_M>; - reset-names = "master", "slave", "crypto"; + reset-names = "master", "slave", "crypto-rst"; }; i2c1: i2c@ff110000 { @@ -2241,13 +2241,11 @@ pcfg_input_pull_up: pcfg-input-pull-up { input-enable; bias-pull-up; - drive-strength = <2>; }; pcfg_input_pull_down: pcfg-input-pull-down { input-enable; bias-pull-down; - drive-strength = <2>; }; clock { diff --git a/arch/arm64/boot/dts/rockchip/rk3566-box-demo.dts b/arch/arm64/boot/dts/rockchip/rk3566-box-demo.dts index 4c7f9abd594f..d956496d5221 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-box-demo.dts +++ b/arch/arm64/boot/dts/rockchip/rk3566-box-demo.dts @@ -353,6 +353,17 @@ }; }; +&pmu_io_domains { + pmuio2-supply = <&vcc_3v3>; 
+ vccio1-supply = <&vcc_3v3>; + vccio3-supply = <&vcc_3v3>; + vccio4-supply = <&vcca_1v8>; + vccio5-supply = <&vcc_3v3>; + vccio6-supply = <&vcca_1v8>; + vccio7-supply = <&vcc_3v3>; + status = "okay"; +}; + &pwm0 { status = "okay"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3568-rock-3a.dts b/arch/arm64/boot/dts/rockchip/rk3568-rock-3a.dts index a1c5fdf7d68f..3c9d85257cc9 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568-rock-3a.dts +++ b/arch/arm64/boot/dts/rockchip/rk3568-rock-3a.dts @@ -571,6 +571,8 @@ }; &i2s1_8ch { + pinctrl-names = "default"; + pinctrl-0 = <&i2s1m0_sclktx &i2s1m0_lrcktx &i2s1m0_sdi0 &i2s1m0_sdo0>; rockchip,trcm-sync-tx-only; status = "okay"; }; @@ -730,14 +732,13 @@ disable-wp; pinctrl-names = "default"; pinctrl-0 = <&sdmmc0_bus4 &sdmmc0_clk &sdmmc0_cmd &sdmmc0_det>; - sd-uhs-sdr104; + sd-uhs-sdr50; vmmc-supply = <&vcc3v3_sd>; vqmmc-supply = <&vccio_sd>; status = "okay"; }; &sdmmc2 { - supports-sdio; bus-width = <4>; disable-wp; cap-sd-highspeed; diff --git a/arch/arm64/boot/dts/rockchip/rk356x.dtsi b/arch/arm64/boot/dts/rockchip/rk356x.dtsi index 5706c3e24f0a..c27f1c7f072d 100644 --- a/arch/arm64/boot/dts/rockchip/rk356x.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk356x.dtsi @@ -966,6 +966,7 @@ clock-names = "aclk_mst", "aclk_slv", "aclk_dbi", "pclk", "aux"; device_type = "pci"; + #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0 0 0 1 &pcie_intc 0>, <0 0 0 2 &pcie_intc 1>, diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index a5193f2146a6..dde06c0f97f3 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -1023,12 +1023,6 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event, return 0; } -static bool armv8pmu_filter(struct pmu *pmu, int cpu) -{ - struct arm_pmu *armpmu = to_arm_pmu(pmu); - return !cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus); -} - static void armv8pmu_reset(void *info) { struct arm_pmu *cpu_pmu = (struct 
arm_pmu *)info; @@ -1069,6 +1063,14 @@ static int __armv8_pmuv3_map_event(struct perf_event *event, &armv8_pmuv3_perf_cache_map, ARMV8_PMU_EVTYPE_EVENT); + /* + * CHAIN events only work when paired with an adjacent counter, and it + * never makes sense for a user to open one in isolation, as they'll be + * rotated arbitrarily. + */ + if (hw_event_id == ARMV8_PMUV3_PERFCTR_CHAIN) + return -EINVAL; + if (armv8pmu_event_is_64bit(event)) event->hw.flags |= ARMPMU_EVT_64BIT; @@ -1258,7 +1260,6 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name, cpu_pmu->stop = armv8pmu_stop; cpu_pmu->reset = armv8pmu_reset; cpu_pmu->set_event_filter = armv8pmu_set_event_filter; - cpu_pmu->filter = armv8pmu_filter; cpu_pmu->pmu.event_idx = armv8pmu_user_event_idx; diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 94a666dd1443..2642e9ce2819 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -2187,7 +2187,7 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | ite->collection->collection_id; val = cpu_to_le64(val); - return kvm_write_guest_lock(kvm, gpa, &val, ite_esz); + return vgic_write_guest_lock(kvm, gpa, &val, ite_esz); } /** @@ -2339,7 +2339,7 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | (dev->num_eventid_bits - 1)); val = cpu_to_le64(val); - return kvm_write_guest_lock(kvm, ptr, &val, dte_esz); + return vgic_write_guest_lock(kvm, ptr, &val, dte_esz); } /** @@ -2526,7 +2526,7 @@ static int vgic_its_save_cte(struct vgic_its *its, ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) | collection->collection_id); val = cpu_to_le64(val); - return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz); + return vgic_write_guest_lock(its->dev->kvm, gpa, &val, esz); } /* @@ -2607,7 +2607,7 @@ static int vgic_its_save_collection_table(struct vgic_its 
*its) */ val = 0; BUG_ON(cte_esz > sizeof(val)); - ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); + ret = vgic_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); return ret; } @@ -2743,7 +2743,6 @@ static int vgic_its_has_attr(struct kvm_device *dev, static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); - struct vgic_dist *dist = &kvm->arch.vgic; int ret = 0; if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */ @@ -2763,9 +2762,7 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) vgic_its_reset(kvm, its); break; case KVM_DEV_ARM_ITS_SAVE_TABLES: - dist->save_its_tables_in_progress = true; ret = abi->save_tables(its); - dist->save_its_tables_in_progress = false; break; case KVM_DEV_ARM_ITS_RESTORE_TABLES: ret = abi->restore_tables(its); @@ -2792,7 +2789,7 @@ bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - return dist->save_its_tables_in_progress; + return dist->table_write_in_progress; } static int vgic_its_set_attr(struct kvm_device *dev, diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 2624963cb95b..684bdfaad4a9 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -339,7 +339,7 @@ retry: if (status) { /* clear consumed data */ val &= ~(1 << bit_nr); - ret = kvm_write_guest_lock(kvm, ptr, &val, 1); + ret = vgic_write_guest_lock(kvm, ptr, &val, 1); if (ret) return ret; } @@ -434,7 +434,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) else val &= ~(1 << bit_nr); - ret = kvm_write_guest_lock(kvm, ptr, &val, 1); + ret = vgic_write_guest_lock(kvm, ptr, &val, 1); if (ret) goto out; } diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 23e280fa0a16..7f7f3c5ed85a 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -6,6 +6,7 @@ #define __KVM_ARM_VGIC_NEW_H__ #include 
<linux/irqchip/arm-gic-common.h> +#include <asm/kvm_mmu.h> #define PRODUCT_ID_KVM 0x4b /* ASCII code K */ #define IMPLEMENTER_ARM 0x43b @@ -131,6 +132,19 @@ static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq) return vgic_irq_get_lr_count(irq) > 1; } +static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa, + const void *data, unsigned long len) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + int ret; + + dist->table_write_in_progress = true; + ret = kvm_write_guest_lock(kvm, gpa, data, len); + dist->table_write_in_progress = false; + + return ret; +} + /* * This struct provides an intermediate representation of the fields contained * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index f6a502e8f02c..6e948d015332 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -170,6 +170,9 @@ ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, u asmlinkage long ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *tp) { + struct timespec64 rtn_tp; + s64 tick_ns; + /* * ia64's clock_gettime() syscall is implemented as a vdso call * fsys_clock_gettime(). 
Currently it handles only @@ -185,8 +188,8 @@ ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user * switch (which_clock) { case CLOCK_REALTIME: case CLOCK_MONOTONIC: - s64 tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq); - struct timespec64 rtn_tp = ns_to_timespec64(tick_ns); + tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq); + rtn_tp = ns_to_timespec64(tick_ns); return put_timespec64(&rtn_tp, tp); } diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c index 4dfe1f49c5c8..6817892a2c58 100644 --- a/arch/parisc/kernel/firmware.c +++ b/arch/parisc/kernel/firmware.c @@ -1303,7 +1303,7 @@ static char iodc_dbuf[4096] __page_aligned_bss; */ int pdc_iodc_print(const unsigned char *str, unsigned count) { - unsigned int i; + unsigned int i, found = 0; unsigned long flags; count = min_t(unsigned int, count, sizeof(iodc_dbuf)); @@ -1315,6 +1315,7 @@ int pdc_iodc_print(const unsigned char *str, unsigned count) iodc_dbuf[i+0] = '\r'; iodc_dbuf[i+1] = '\n'; i += 2; + found = 1; goto print; default: iodc_dbuf[i] = str[i]; @@ -1330,7 +1331,7 @@ print: __pa(pdc_result), 0, __pa(iodc_dbuf), i, 0); spin_unlock_irqrestore(&pdc_lock, flags); - return i; + return i - found; } #if !defined(BOOTLOADER) diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c index 69c62933e952..ceb45f51d52e 100644 --- a/arch/parisc/kernel/ptrace.c +++ b/arch/parisc/kernel/ptrace.c @@ -126,6 +126,12 @@ long arch_ptrace(struct task_struct *child, long request, unsigned long tmp; long ret = -EIO; + unsigned long user_regs_struct_size = sizeof(struct user_regs_struct); +#ifdef CONFIG_64BIT + if (is_compat_task()) + user_regs_struct_size /= 2; +#endif + switch (request) { /* Read the word at location addr in the USER area. 
For ptraced @@ -166,7 +172,7 @@ long arch_ptrace(struct task_struct *child, long request, addr >= sizeof(struct pt_regs)) break; if (addr == PT_IAOQ0 || addr == PT_IAOQ1) { - data |= 3; /* ensure userspace privilege */ + data |= PRIV_USER; /* ensure userspace privilege */ } if ((addr >= PT_GR1 && addr <= PT_GR31) || addr == PT_IAOQ0 || addr == PT_IAOQ1 || @@ -181,14 +187,14 @@ long arch_ptrace(struct task_struct *child, long request, return copy_regset_to_user(child, task_user_regset_view(current), REGSET_GENERAL, - 0, sizeof(struct user_regs_struct), + 0, user_regs_struct_size, datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, task_user_regset_view(current), REGSET_GENERAL, - 0, sizeof(struct user_regs_struct), + 0, user_regs_struct_size, datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ @@ -285,7 +291,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, if (addr >= sizeof(struct pt_regs)) break; if (addr == PT_IAOQ0+4 || addr == PT_IAOQ1+4) { - data |= 3; /* ensure userspace privilege */ + data |= PRIV_USER; /* ensure userspace privilege */ } if (addr >= PT_FR0 && addr <= PT_FR31 + 4) { /* Special case, fp regs are 64 bits anyway */ @@ -302,6 +308,11 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, } } break; + case PTRACE_GETREGS: + case PTRACE_SETREGS: + case PTRACE_GETFPREGS: + case PTRACE_SETFPREGS: + return arch_ptrace(child, request, addr, data); default: ret = compat_ptrace_request(child, request, addr, data); @@ -484,7 +495,7 @@ static void set_reg(struct pt_regs *regs, int num, unsigned long val) case RI(iaoq[0]): case RI(iaoq[1]): /* set 2 lowest bits to ensure userspace privilege: */ - regs->iaoq[num - RI(iaoq[0])] = val | 3; + regs->iaoq[num - RI(iaoq[0])] = val | PRIV_USER; return; case RI(sar): regs->sar = val; return; diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b8c4ac56bddc..7a5f8dbfbdd0 100644 --- 
a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -163,7 +163,6 @@ config PPC select ARCH_WANT_IRQS_OFF_ACTIVATE_MM select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_MODULES_DATA_IN_VMALLOC if PPC_BOOK3S_32 || PPC_8xx - select ARCH_WANTS_NO_INSTR select ARCH_WEAK_RELEASE_ACQUIRE select BINFMT_ELF select BUILDTIME_TABLE_SORT diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index dd39313242b4..2bbc0fcce04a 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -97,6 +97,8 @@ static inline void tlb_flush(struct mmu_gather *tlb) { if (radix_enabled()) radix__tlb_flush(tlb); + else + hash__tlb_flush(tlb); } #ifdef CONFIG_SMP diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 77fa88c2aed0..eb6d094083fd 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -173,6 +173,15 @@ static inline notrace unsigned long irq_soft_mask_or_return(unsigned long mask) return flags; } +static inline notrace unsigned long irq_soft_mask_andc_return(unsigned long mask) +{ + unsigned long flags = irq_soft_mask_return(); + + irq_soft_mask_set(flags & ~mask); + + return flags; +} + static inline unsigned long arch_local_save_flags(void) { return irq_soft_mask_return(); @@ -192,7 +201,7 @@ static inline void arch_local_irq_enable(void) static inline unsigned long arch_local_irq_save(void) { - return irq_soft_mask_set_return(IRQS_DISABLED); + return irq_soft_mask_or_return(IRQS_DISABLED); } static inline bool arch_irqs_disabled_flags(unsigned long flags) @@ -331,10 +340,11 @@ bool power_pmu_wants_prompt_pmi(void); * is a different soft-masked interrupt pending that requires hard * masking. 
*/ -static inline bool should_hard_irq_enable(void) +static inline bool should_hard_irq_enable(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { - WARN_ON(irq_soft_mask_return() == IRQS_ENABLED); + WARN_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED); + WARN_ON(!(get_paca()->irq_happened & PACA_IRQ_HARD_DIS)); WARN_ON(mfmsr() & MSR_EE); } @@ -347,8 +357,17 @@ static inline bool should_hard_irq_enable(void) * * TODO: Add test for 64e */ - if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !power_pmu_wants_prompt_pmi()) - return false; + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) { + if (!power_pmu_wants_prompt_pmi()) + return false; + /* + * If PMIs are disabled then IRQs should be disabled as well, + * so we shouldn't see this condition, check for it just in + * case because we are about to enable PMIs. + */ + if (WARN_ON_ONCE(regs->softe & IRQS_PMI_DISABLED)) + return false; + } if (get_paca()->irq_happened & PACA_IRQ_MUST_HARD_MASK) return false; @@ -358,18 +377,16 @@ static inline bool should_hard_irq_enable(void) /* * Do the hard enabling, only call this if should_hard_irq_enable is true. + * This allows PMI interrupts to profile irq handlers. */ static inline void do_hard_irq_enable(void) { - if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { - WARN_ON(irq_soft_mask_return() == IRQS_ENABLED); - WARN_ON(get_paca()->irq_happened & PACA_IRQ_MUST_HARD_MASK); - WARN_ON(mfmsr() & MSR_EE); - } /* - * This allows PMI interrupts (and watchdog soft-NMIs) through. - * There is no other reason to enable this way. + * Asynch interrupts come in with IRQS_ALL_DISABLED, + * PACA_IRQ_HARD_DIS, and MSR[EE]=0. 
*/ + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) + irq_soft_mask_andc_return(IRQS_PMI_DISABLED); get_paca()->irq_happened &= ~PACA_IRQ_HARD_DIS; __hard_irq_enable(); } @@ -452,7 +469,7 @@ static inline bool arch_irq_disabled_regs(struct pt_regs *regs) return !(regs->msr & MSR_EE); } -static __always_inline bool should_hard_irq_enable(void) +static __always_inline bool should_hard_irq_enable(struct pt_regs *regs) { return false; } diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c index f55c6fb34a3a..5712dd846263 100644 --- a/arch/powerpc/kernel/dbell.c +++ b/arch/powerpc/kernel/dbell.c @@ -27,7 +27,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(doorbell_exception) ppc_msgsync(); - if (should_hard_irq_enable()) + if (should_hard_irq_enable(regs)) do_hard_irq_enable(); kvmppc_clear_host_ipi(smp_processor_id()); diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S index d438ca74e96c..fdbee1093e2b 100644 --- a/arch/powerpc/kernel/head_85xx.S +++ b/arch/powerpc/kernel/head_85xx.S @@ -864,7 +864,7 @@ _GLOBAL(load_up_spe) * SPE unavailable trap from kernel - print a message, but let * the task use SPE in the kernel until it returns to user mode. 
*/ -KernelSPE: +SYM_FUNC_START_LOCAL(KernelSPE) lwz r3,_MSR(r1) oris r3,r3,MSR_SPE@h stw r3,_MSR(r1) /* enable use of SPE after return */ @@ -881,6 +881,7 @@ KernelSPE: #endif .align 4,0 +SYM_FUNC_END(KernelSPE) #endif /* CONFIG_SPE */ /* diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index fc6631a80527..0ec1581619db 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -50,16 +50,18 @@ static inline bool exit_must_hard_disable(void) */ static notrace __always_inline bool prep_irq_for_enabled_exit(bool restartable) { + bool must_hard_disable = (exit_must_hard_disable() || !restartable); + /* This must be done with RI=1 because tracing may touch vmaps */ trace_hardirqs_on(); - if (exit_must_hard_disable() || !restartable) + if (must_hard_disable) __hard_EE_RI_disable(); #ifdef CONFIG_PPC64 /* This pattern matches prep_irq_for_idle */ if (unlikely(lazy_irq_pending_nocheck())) { - if (exit_must_hard_disable() || !restartable) { + if (must_hard_disable) { local_paca->irq_happened |= PACA_IRQ_HARD_DIS; __hard_RI_enable(); } diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index c5b9ce887483..c9535f2760b5 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -238,7 +238,7 @@ static void __do_irq(struct pt_regs *regs, unsigned long oldsp) irq = static_call(ppc_get_irq)(); /* We can hard enable interrupts now to allow perf interrupts */ - if (should_hard_irq_enable()) + if (should_hard_irq_enable(regs)) do_hard_irq_enable(); /* And finally process it */ diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index d68de3618741..e26eb6618ae5 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -515,7 +515,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt) } /* Conditionally hard-enable interrupts. 
*/ - if (should_hard_irq_enable()) { + if (should_hard_irq_enable(regs)) { /* * Ensure a positive value is written to the decrementer, or * else some CPUs will continue to take decrementer exceptions. diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index af8854f9eae3..9be3e818a240 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -26,6 +26,7 @@ #include <asm/firmware.h> #include <asm/kexec_ranges.h> #include <asm/crashdump-ppc64.h> +#include <asm/mmzone.h> #include <asm/prom.h> struct umem_info { @@ -989,10 +990,13 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image) * linux,drconf-usable-memory properties. Get an approximate on the * number of usable memory entries and use for FDT size estimation. */ - usm_entries = ((memblock_end_of_DRAM() / drmem_lmb_size()) + - (2 * (resource_size(&crashk_res) / drmem_lmb_size()))); - - extra_size = (unsigned int)(usm_entries * sizeof(u64)); + if (drmem_lmb_size()) { + usm_entries = ((memory_hotplug_max() / drmem_lmb_size()) + + (2 * (resource_size(&crashk_res) / drmem_lmb_size()))); + extra_size = (unsigned int)(usm_entries * sizeof(u64)); + } else { + extra_size = 0; + } /* * Get the number of CPU nodes in the current DT. 
This allows to diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 0dce93ccaadf..e89281d3ba28 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -912,16 +912,15 @@ static int kvmppc_handle_debug(struct kvm_vcpu *vcpu) static void kvmppc_fill_pt_regs(struct pt_regs *regs) { - ulong r1, ip, msr, lr; + ulong r1, msr, lr; asm("mr %0, 1" : "=r"(r1)); asm("mflr %0" : "=r"(lr)); asm("mfmsr %0" : "=r"(msr)); - asm("bl 1f; 1: mflr %0" : "=r"(ip)); memset(regs, 0, sizeof(*regs)); regs->gpr[1] = r1; - regs->nip = ip; + regs->nip = _THIS_IP_; regs->msr = msr; regs->link = lr; } diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index cac727b01799..26245aaf12b8 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -234,6 +234,14 @@ void radix__mark_rodata_ro(void) end = (unsigned long)__end_rodata; radix__change_memory_range(start, end, _PAGE_WRITE); + + for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) { + end = start + PAGE_SIZE; + if (overlaps_interrupt_vector_text(start, end)) + radix__change_memory_range(start, end, _PAGE_WRITE); + else + break; + } } void radix__mark_initmem_nx(void) @@ -262,6 +270,22 @@ print_mapping(unsigned long start, unsigned long end, unsigned long size, bool e static unsigned long next_boundary(unsigned long addr, unsigned long end) { #ifdef CONFIG_STRICT_KERNEL_RWX + unsigned long stext_phys; + + stext_phys = __pa_symbol(_stext); + + // Relocatable kernel running at non-zero real address + if (stext_phys != 0) { + // The end of interrupts code at zero is a rodata boundary + unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys; + if (addr < end_intr) + return end_intr; + + // Start of relocated kernel text is a rodata boundary + if (addr < stext_phys) + return stext_phys; + } + if (addr < __pa_symbol(__srwx_boundary)) return __pa_symbol(__srwx_boundary); #endif diff --git 
a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 100e97daf76b..9d229ef7f86e 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -22,7 +22,7 @@ * Used to avoid races in counting the nest-pmu units during hotplug * register and unregister */ -static DEFINE_SPINLOCK(nest_init_lock); +static DEFINE_MUTEX(nest_init_lock); static DEFINE_PER_CPU(struct imc_pmu_ref *, local_nest_imc_refc); static struct imc_pmu **per_nest_pmu_arr; static cpumask_t nest_imc_cpumask; @@ -1629,7 +1629,7 @@ static void imc_common_mem_free(struct imc_pmu *pmu_ptr) static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) { if (pmu_ptr->domain == IMC_DOMAIN_NEST) { - spin_lock(&nest_init_lock); + mutex_lock(&nest_init_lock); if (nest_pmus == 1) { cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE); kfree(nest_imc_refc); @@ -1639,7 +1639,7 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) if (nest_pmus > 0) nest_pmus--; - spin_unlock(&nest_init_lock); + mutex_unlock(&nest_init_lock); } /* Free core_imc memory */ @@ -1796,11 +1796,11 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id * rest. To handle the cpuhotplug callback unregister, we track * the number of nest pmus in "nest_pmus". */ - spin_lock(&nest_init_lock); + mutex_lock(&nest_init_lock); if (nest_pmus == 0) { ret = init_nest_pmu_ref(); if (ret) { - spin_unlock(&nest_init_lock); + mutex_unlock(&nest_init_lock); kfree(per_nest_pmu_arr); per_nest_pmu_arr = NULL; goto err_free_mem; @@ -1808,7 +1808,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id /* Register for cpu hotplug notification. 
*/ ret = nest_pmu_cpumask_init(); if (ret) { - spin_unlock(&nest_init_lock); + mutex_unlock(&nest_init_lock); kfree(nest_imc_refc); kfree(per_nest_pmu_arr); per_nest_pmu_arr = NULL; @@ -1816,7 +1816,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id } } nest_pmus++; - spin_unlock(&nest_init_lock); + mutex_unlock(&nest_init_lock); break; case IMC_DOMAIN_CORE: ret = core_imc_pmu_cpumask_init(); diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index dbcfe361831a..ea807aa0c31a 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -92,7 +92,7 @@ out: } static int -spufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +spufs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -100,7 +100,7 @@ spufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if ((attr->ia_valid & ATTR_SIZE) && (attr->ia_size != inode->i_size)) return -EINVAL; - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -237,7 +237,7 @@ spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags, if (!inode) return -ENOSPC; - inode_init_owner(&init_user_ns, inode, dir, mode | S_IFDIR); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode | S_IFDIR); ctx = alloc_spu_context(SPUFS_I(dir)->i_gang); /* XXX gang */ SPUFS_I(inode)->i_ctx = ctx; if (!ctx) { @@ -468,7 +468,7 @@ spufs_mkgang(struct inode *dir, struct dentry *dentry, umode_t mode) goto out; ret = 0; - inode_init_owner(&init_user_ns, inode, dir, mode | S_IFDIR); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode | S_IFDIR); gang = alloc_spu_gang(); SPUFS_I(inode)->i_ctx = NULL; SPUFS_I(inode)->i_gang = gang; diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index faf2c2177094..82153960ac00 100644 --- a/arch/riscv/Makefile 
+++ b/arch/riscv/Makefile @@ -80,6 +80,9 @@ ifeq ($(CONFIG_PERF_EVENTS),y) KBUILD_CFLAGS += -fno-omit-frame-pointer endif +# Avoid generating .eh_frame sections. +KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables + KBUILD_CFLAGS_MODULE += $(call cc-option,-mno-relax) KBUILD_AFLAGS_MODULE += $(call as-option,-Wa$(comma)-mno-relax) diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index 86328e3acb02..64ad1937e714 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -70,7 +70,6 @@ static_assert(RISCV_ISA_EXT_ID_MAX <= RISCV_ISA_EXT_MAX); */ enum riscv_isa_ext_key { RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */ - RISCV_ISA_EXT_KEY_ZIHINTPAUSE, RISCV_ISA_EXT_KEY_SVINVAL, RISCV_ISA_EXT_KEY_MAX, }; @@ -91,8 +90,6 @@ static __always_inline int riscv_isa_ext2key(int num) return RISCV_ISA_EXT_KEY_FPU; case RISCV_ISA_EXT_d: return RISCV_ISA_EXT_KEY_FPU; - case RISCV_ISA_EXT_ZIHINTPAUSE: - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE; case RISCV_ISA_EXT_SVINVAL: return RISCV_ISA_EXT_KEY_SVINVAL; default: diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 4eba9a98d0e3..3e01f4f3ab08 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -721,6 +721,10 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); return __pmd(atomic_long_xchg((atomic_long_t *)pmdp, pmd_val(pmd))); } + +#define pmdp_collapse_flush pmdp_collapse_flush +extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h index fa70cfe507aa..14f5d27783b8 100644 --- a/arch/riscv/include/asm/vdso/processor.h +++ b/arch/riscv/include/asm/vdso/processor.h @@ -4,30 +4,26 @@ #ifndef __ASSEMBLY__ -#include <linux/jump_label.h> #include 
<asm/barrier.h> -#include <asm/hwcap.h> static inline void cpu_relax(void) { - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) { #ifdef __riscv_muldiv - int dummy; - /* In lieu of a halt instruction, induce a long-latency stall. */ - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy)); + int dummy; + /* In lieu of a halt instruction, induce a long-latency stall. */ + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy)); #endif - } else { - /* - * Reduce instruction retirement. - * This assumes the PC changes. - */ -#ifdef CONFIG_TOOLCHAIN_HAS_ZIHINTPAUSE - __asm__ __volatile__ ("pause"); + +#ifdef __riscv_zihintpause + /* + * Reduce instruction retirement. + * This assumes the PC changes. + */ + __asm__ __volatile__ ("pause"); #else - /* Encoding of the pause instruction */ - __asm__ __volatile__ (".4byte 0x100000F"); + /* Encoding of the pause instruction */ + __asm__ __volatile__ (".4byte 0x100000F"); #endif - } barrier(); } diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index f21592d20306..2bedec37d092 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -48,15 +48,35 @@ static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) post_kprobe_handler(p, kcb, regs); } +static bool __kprobes arch_check_kprobe(struct kprobe *p) +{ + unsigned long tmp = (unsigned long)p->addr - p->offset; + unsigned long addr = (unsigned long)p->addr; + + while (tmp <= addr) { + if (tmp == addr) + return true; + + tmp += GET_INSN_LENGTH(*(u16 *)tmp); + } + + return false; +} + int __kprobes arch_prepare_kprobe(struct kprobe *p) { - unsigned long probe_addr = (unsigned long)p->addr; + u16 *insn = (u16 *)p->addr; + + if ((unsigned long)insn & 0x1) + return -EILSEQ; - if (probe_addr & 0x1) + if (!arch_check_kprobe(p)) return -EILSEQ; /* copy instruction */ - p->opcode = *p->addr; + p->opcode = (kprobe_opcode_t)(*insn++); + if 
(GET_INSN_LENGTH(p->opcode) == 4) + p->opcode |= (kprobe_opcode_t)(*insn) << 16; /* decode instruction */ switch (riscv_probe_decode_insn(p->addr, &p->ainsn.api)) { diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 75c8dd64fc48..f9a5a7c90ff0 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -32,6 +32,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, fp = (unsigned long)__builtin_frame_address(0); sp = current_stack_pointer; pc = (unsigned long)walk_stackframe; + level = -1; } else { /* task blocked in __switch_to */ fp = task->thread.s[0]; @@ -43,7 +44,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, unsigned long low, high; struct stackframe *frame; - if (unlikely(!__kernel_text_address(pc) || (level++ >= 1 && !fn(arg, pc)))) + if (unlikely(!__kernel_text_address(pc) || (level++ >= 0 && !fn(arg, pc)))) break; /* Validate frame pointer */ diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 3cc07ed45aeb..fcd6145fbead 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -90,8 +90,10 @@ void flush_icache_pte(pte_t pte) if (PageHuge(page)) page = compound_head(page); - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) + if (!test_bit(PG_dcache_clean, &page->flags)) { flush_icache_all(); + set_bit(PG_dcache_clean, &page->flags); + } } #endif /* CONFIG_MMU */ diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c index 6645ead1a7c1..fef4e7328e49 100644 --- a/arch/riscv/mm/pgtable.c +++ b/arch/riscv/mm/pgtable.c @@ -81,3 +81,23 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + 
VM_BUG_ON(pmd_trans_huge(*pmdp)); + /* + * When leaf PTE entries (regular pages) are collapsed into a leaf + * PMD entry (huge page), a valid non-leaf PTE is converted into a + * valid leaf PTE at the level 1 page table. Since the sfence.vma + * forms that specify an address only apply to leaf PTEs, we need a + * global flush here. collapse_huge_page() assumes these flushes are + * eager, so just do the fence here. + */ + flush_tlb_mm(vma->vm_mm); + return pmd; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/s390/boot/decompressor.c b/arch/s390/boot/decompressor.c index 8dcd7af2911a..b519a1f045d8 100644 --- a/arch/s390/boot/decompressor.c +++ b/arch/s390/boot/decompressor.c @@ -80,6 +80,6 @@ void *decompress_kernel(void) void *output = (void *)decompress_offset; __decompress(_compressed_start, _compressed_end - _compressed_start, - NULL, NULL, output, 0, NULL, error); + NULL, NULL, output, vmlinux.image_size, NULL, error); return output; } diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 3161b9ccd2a5..b6276a3521d7 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -4,6 +4,7 @@ * Written by Niibe Yutaka and Paul Mundt */ OUTPUT_ARCH(sh) +#define RUNTIME_DISCARD_EXIT #include <asm/thread_info.h> #include <asm/cache.h> #include <asm/vmlinux.lds.h> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 85a63a41c471..d096b04bf80e 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2974,17 +2974,19 @@ unsigned long perf_misc_flags(struct pt_regs *regs) void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { - if (!x86_pmu_initialized()) { + /* This API doesn't currently support enumerating hybrid PMUs. */ + if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) || + !x86_pmu_initialized()) { memset(cap, 0, sizeof(*cap)); return; } - cap->version = x86_pmu.version; /* - * KVM doesn't support the hybrid PMU yet. 
- * Return the common value in global x86_pmu, - * which available for all cores. + * Note, hybrid CPU models get tracked as having hybrid PMUs even when + * all E-cores are disabled via BIOS. When E-cores are disabled, the + * base PMU holds the correct number of counters for P-cores. */ + cap->version = x86_pmu.version; cap->num_counters_gp = x86_pmu.num_counters; cap->num_counters_fixed = x86_pmu.num_counters_fixed; cap->bit_width_gp = x86_pmu.cntval_bits; diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 61012476d66e..8f39c46197b8 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -466,5 +466,6 @@ #define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */ #define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */ #define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ +#define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index b049d950612f..ca97442e8d49 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -39,7 +39,20 @@ static __always_inline unsigned long native_get_debugreg(int regno) asm("mov %%db6, %0" :"=r" (val)); break; case 7: - asm("mov %%db7, %0" :"=r" (val)); + /* + * Apply __FORCE_ORDER to DR7 reads to forbid re-ordering them + * with other code. + * + * This is needed because a DR7 access can cause a #VC exception + * when running under SEV-ES. Taking a #VC exception is not a + * safe thing to do just anywhere in the entry code and + * re-ordering might place the access into an unsafe location. + * + * This happened in the NMI handler, where the DR7 read was + * re-ordered to happen before the call to sev_es_ist_enter(), + * causing stack recursion. 
+ */ + asm volatile("mov %%db7, %0" : "=r" (val) : __FORCE_ORDER); break; default: BUG(); @@ -66,7 +79,16 @@ static __always_inline void native_set_debugreg(int regno, unsigned long value) asm("mov %0, %%db6" ::"r" (value)); break; case 7: - asm("mov %0, %%db7" ::"r" (value)); + /* + * Apply __FORCE_ORDER to DR7 writes to forbid re-ordering them + * with other code. + * + * While is didn't happen with a DR7 write (see the DR7 read + * comment above which explains where it happened), add the + * __FORCE_ORDER here too to avoid similar problems in the + * future. + */ + asm volatile("mov %0, %%db7" ::"r" (value), __FORCE_ORDER); break; default: BUG(); diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 347707d459c6..cbaf174d8efd 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -123,6 +123,8 @@ #define INTEL_FAM6_METEORLAKE 0xAC #define INTEL_FAM6_METEORLAKE_L 0xAA +#define INTEL_FAM6_LUNARLAKE_M 0xBD + /* "Small Core" Processors (Atom/E-Core) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9cfca3d7d0e2..f3cc7699e1e1 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1256,6 +1256,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define MMIO_SBDS BIT(2) /* CPU is affected by RETbleed, speculating where you would not expect it */ #define RETBLEED BIT(3) +/* CPU is affected by SMT (cross-thread) return predictions */ +#define SMT_RSB BIT(4) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), @@ -1287,8 +1289,8 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), - VULNBL_AMD(0x17, RETBLEED), - VULNBL_HYGON(0x18, RETBLEED), + VULNBL_AMD(0x17, RETBLEED | SMT_RSB), + VULNBL_HYGON(0x18, 
RETBLEED | SMT_RSB), {} }; @@ -1406,6 +1408,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) !(ia32_cap & ARCH_CAP_PBRSB_NO)) setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + if (cpu_matches(cpu_vuln_blacklist, SMT_RSB)) + setup_force_cpu_bug(X86_BUG_SMT_RSB); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index b36f3c367cb2..695873c0f50b 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -625,7 +625,7 @@ static int prepare_emulation(struct kprobe *p, struct insn *insn) /* 1 byte conditional jump */ p->ainsn.emulate_op = kprobe_emulate_jcc; p->ainsn.jcc.type = opcode & 0xf; - p->ainsn.rel32 = *(char *)insn->immediate.bytes; + p->ainsn.rel32 = insn->immediate.value; break; case 0x0f: opcode = insn->opcode.bytes[1]; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index cdb91009701d..ee67ba625094 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -165,15 +165,27 @@ static inline void kvm_init_pmu_capability(void) { bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; - perf_get_x86_pmu_capability(&kvm_pmu_cap); - - /* - * For Intel, only support guest architectural pmu - * on a host with architectural pmu. - */ - if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp) + /* + * Hybrid PMUs don't play nice with virtualization without careful + * configuration by userspace, and KVM's APIs for reporting supported + * vPMU features do not account for hybrid PMUs. Disable vPMU support + * for hybrid PMUs until KVM gains a way to let userspace opt-in. + */ + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) enable_pmu = false; + if (enable_pmu) { + perf_get_x86_pmu_capability(&kvm_pmu_cap); + + /* + * For Intel, only support guest architectural pmu + * on a host with architectural pmu. 
+ */ + if ((is_intel && !kvm_pmu_cap.version) || + !kvm_pmu_cap.num_counters_gp) + enable_pmu = false; + } + if (!enable_pmu) { memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); return; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index da4bbd043a7b..a2c299d47e69 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -191,6 +191,10 @@ module_param(enable_pmu, bool, 0444); bool __read_mostly eager_page_split = true; module_param(eager_page_split, bool, 0644); +/* Enable/disable SMT_RSB bug mitigation */ +bool __read_mostly mitigate_smt_rsb; +module_param(mitigate_smt_rsb, bool, 0444); + /* * Restoring the host value for MSRs that are only consumed when running in * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU @@ -4448,10 +4452,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_CLOCK_VALID_FLAGS; break; case KVM_CAP_X86_DISABLE_EXITS: - r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | - KVM_X86_DISABLE_EXITS_CSTATE; - if(kvm_can_mwait_in_guest()) - r |= KVM_X86_DISABLE_EXITS_MWAIT; + r = KVM_X86_DISABLE_EXITS_PAUSE; + + if (!mitigate_smt_rsb) { + r |= KVM_X86_DISABLE_EXITS_HLT | + KVM_X86_DISABLE_EXITS_CSTATE; + + if (kvm_can_mwait_in_guest()) + r |= KVM_X86_DISABLE_EXITS_MWAIT; + } break; case KVM_CAP_X86_SMM: if (!IS_ENABLED(CONFIG_KVM_SMM)) @@ -5254,12 +5263,11 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, { unsigned long val; + memset(dbgregs, 0, sizeof(*dbgregs)); memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); kvm_get_dr(vcpu, 6, &val); dbgregs->dr6 = val; dbgregs->dr7 = vcpu->arch.dr7; - dbgregs->flags = 0; - memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, @@ -6227,15 +6235,26 @@ split_irqchip_unlock: if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS) break; - if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && - kvm_can_mwait_in_guest()) - kvm->arch.mwait_in_guest = true; - if 
(cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) - kvm->arch.hlt_in_guest = true; if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) kvm->arch.pause_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) - kvm->arch.cstate_in_guest = true; + +#define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \ + "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests." + + if (!mitigate_smt_rsb) { + if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() && + (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE)) + pr_warn_once(SMT_RSB_MSG); + + if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && + kvm_can_mwait_in_guest()) + kvm->arch.mwait_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) + kvm->arch.hlt_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) + kvm->arch.cstate_in_guest = true; + } + r = 0; break; case KVM_CAP_MSR_PLATFORM_INFO: @@ -13456,6 +13475,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); static int __init kvm_x86_init(void) { kvm_mmu_x86_module_init(); + mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible(); return 0; } module_init(kvm_x86_init); diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index fb4b1b5e0dea..46de9cf5c91d 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -387,8 +387,7 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, u8 mtrr_type, uniform; mtrr_type = mtrr_type_lookup(start, end, &uniform); - if (mtrr_type != MTRR_TYPE_WRBACK && - mtrr_type != MTRR_TYPE_INVALID) + if (mtrr_type != MTRR_TYPE_WRBACK) return _PAGE_CACHE_MODE_UC_MINUS; return _PAGE_CACHE_MODE_WB; diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 7d9b15f0dbd5..0fbde0fc0628 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -769,8 +769,8 @@ static void __bfq_bic_change_cgroup(struct bfq_data *bfqd, * request from the old cgroup. 
*/ bfq_put_cooperator(sync_bfqq); - bfq_release_process_ref(bfqd, sync_bfqq); bic_set_bfqq(bic, NULL, true); + bfq_release_process_ref(bfqd, sync_bfqq); } } } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ccf2204477a5..380e9bda2e57 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5425,9 +5425,11 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bfqq = bic_to_bfqq(bic, false); if (bfqq) { - bfq_release_process_ref(bfqd, bfqq); + struct bfq_queue *old_bfqq = bfqq; + bfqq = bfq_get_queue(bfqd, bio, false, bic, true); bic_set_bfqq(bic, bfqq, false); + bfq_release_process_ref(bfqd, old_bfqq); } bfqq = bic_to_bfqq(bic, true); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4c94a6560f62..9ac1efb053e0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -2001,6 +2001,10 @@ void blk_cgroup_bio_start(struct bio *bio) struct blkg_iostat_set *bis; unsigned long flags; + /* Root-level stats are sourced from system-wide IO stats */ + if (!cgroup_parent(blkcg->css.cgroup)) + return; + cpu = get_cpu(); bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); flags = u64_stats_update_begin_irqsave(&bis->sync); diff --git a/block/blk-merge.c b/block/blk-merge.c index b7c193d67185..64bf7d9dd8e8 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -276,7 +276,7 @@ static bool bvec_split_segs(const struct queue_limits *lim, * responsible for ensuring that @bs is only destroyed after processing of the * split bio has finished. 
*/ -static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *segs, struct bio_set *bs, unsigned max_bytes) { struct bio_vec bv, bvprv, *bvprvp = NULL; @@ -336,6 +336,7 @@ split: bio_clear_polled(bio); return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); } +EXPORT_SYMBOL_GPL(bio_split_rw); /** * __bio_split_to_limits - split a bio to fit the queue limits diff --git a/block/blk-mq.c b/block/blk-mq.c index 9d463f7563bc..9c8dc70020bc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4069,8 +4069,9 @@ EXPORT_SYMBOL(blk_mq_init_queue); * blk_mq_destroy_queue - shutdown a request queue * @q: request queue to shutdown * - * This shuts down a request queue allocated by blk_mq_init_queue() and drops - * the initial reference. All future requests will failed with -ENODEV. + * This shuts down a request queue allocated by blk_mq_init_queue(). All future + * requests will be failed with -ENODEV. The caller is responsible for dropping + * the reference from blk_mq_init_queue() by calling blk_put_queue(). 
* * Context: can sleep */ diff --git a/certs/Makefile b/certs/Makefile index 9486ed924731..799ad7b9e68a 100644 --- a/certs/Makefile +++ b/certs/Makefile @@ -23,8 +23,8 @@ $(obj)/blacklist_hash_list: $(CONFIG_SYSTEM_BLACKLIST_HASH_LIST) FORCE targets += blacklist_hash_list quiet_cmd_extract_certs = CERT $@ - cmd_extract_certs = $(obj)/extract-cert $(extract-cert-in) $@ -extract-cert-in = $(or $(filter-out $(obj)/extract-cert, $(real-prereqs)),"") + cmd_extract_certs = $(obj)/extract-cert "$(extract-cert-in)" $@ +extract-cert-in = $(filter-out $(obj)/extract-cert, $(real-prereqs)) $(obj)/system_certificates.o: $(obj)/x509_certificate_list diff --git a/certs/blacklist.c b/certs/blacklist.c index 41f10601cc72..675dd7a8f07a 100644 --- a/certs/blacklist.c +++ b/certs/blacklist.c @@ -183,16 +183,19 @@ static int mark_raw_hash_blacklisted(const char *hash) { key_ref_t key; - key = key_create_or_update(make_key_ref(blacklist_keyring, true), - "blacklist", - hash, - NULL, - 0, - BLACKLIST_KEY_PERM, - KEY_ALLOC_NOT_IN_QUOTA | - KEY_ALLOC_BUILT_IN); + key = key_create(make_key_ref(blacklist_keyring, true), + "blacklist", + hash, + NULL, + 0, + BLACKLIST_KEY_PERM, + KEY_ALLOC_NOT_IN_QUOTA | + KEY_ALLOC_BUILT_IN); if (IS_ERR(key)) { - pr_err("Problem blacklisting hash (%ld)\n", PTR_ERR(key)); + if (PTR_ERR(key) == -EEXIST) + pr_warn("Duplicate blacklisted hash %s\n", hash); + else + pr_err("Problem blacklisting hash %s: %pe\n", hash, key); return PTR_ERR(key); } return 0; diff --git a/crypto/asymmetric_keys/Kconfig b/crypto/asymmetric_keys/Kconfig index 3df3fe4ed95f..1ef3b46d6f6e 100644 --- a/crypto/asymmetric_keys/Kconfig +++ b/crypto/asymmetric_keys/Kconfig @@ -83,6 +83,6 @@ config FIPS_SIGNATURE_SELFTEST for FIPS. 
depends on KEYS depends on ASYMMETRIC_KEY_TYPE - depends on PKCS7_MESSAGE_PARSER + depends on PKCS7_MESSAGE_PARSER=X509_CERTIFICATE_PARSER endif # ASYMMETRIC_KEY_TYPE diff --git a/crypto/asymmetric_keys/pkcs7_verify.c b/crypto/asymmetric_keys/pkcs7_verify.c index f6321c785714..4fa769c4bcdb 100644 --- a/crypto/asymmetric_keys/pkcs7_verify.c +++ b/crypto/asymmetric_keys/pkcs7_verify.c @@ -485,3 +485,4 @@ int pkcs7_supply_detached_data(struct pkcs7_message *pkcs7, pkcs7->data_len = datalen; return 0; } +EXPORT_SYMBOL_GPL(pkcs7_supply_detached_data); diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index 2f8352e88860..eca5671ad3f2 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -186,8 +186,28 @@ static int software_key_query(const struct kernel_pkey_params *params, len = crypto_akcipher_maxsize(tfm); info->key_size = len * 8; - info->max_data_size = len; - info->max_sig_size = len; + + if (strncmp(pkey->pkey_algo, "ecdsa", 5) == 0) { + /* + * ECDSA key sizes are much smaller than RSA, and thus could + * operate on (hashed) inputs that are larger than key size. + * For example SHA384-hashed input used with secp256r1 + * based keys. Set max_data_size to be at least as large as + * the largest supported hash size (SHA512) + */ + info->max_data_size = 64; + + /* + * Verify takes ECDSA-Sig (described in RFC 5480) as input, + * which is actually 2 'key_size'-bit integers encoded in + * ASN.1. Account for the ASN.1 encoding overhead here. 
+ */ + info->max_sig_size = 2 * (len + 3) + 2; + } else { + info->max_data_size = len; + info->max_sig_size = len; + } + info->max_enc_size = len; info->max_dec_size = len; info->supported_ops = (KEYCTL_SUPPORTS_ENCRYPT | diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index f1cc5ec6a3b6..4e48d6db05eb 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -3297,8 +3297,8 @@ void acpi_nfit_shutdown(void *data) mutex_lock(&acpi_desc->init_mutex); set_bit(ARS_CANCEL, &acpi_desc->scrub_flags); - cancel_delayed_work_sync(&acpi_desc->dwork); mutex_unlock(&acpi_desc->init_mutex); + cancel_delayed_work_sync(&acpi_desc->dwork); /* * Bounce the nvdimm bus lock to make sure any in-flight diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c index 09b2ce7e4c34..348d63d1e3d3 100644 --- a/drivers/android/binderfs.c +++ b/drivers/android/binderfs.c @@ -352,7 +352,7 @@ static inline bool is_binderfs_control_device(const struct dentry *dentry) return info->control_dentry == dentry; } -static int binderfs_rename(struct user_namespace *mnt_userns, +static int binderfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -361,7 +361,7 @@ static int binderfs_rename(struct user_namespace *mnt_userns, is_binderfs_control_device(new_dentry)) return -EPERM; - return simple_rename(&init_user_ns, old_dir, old_dentry, new_dir, + return simple_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); } diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 14a1c0d14916..3bb9bb483fe3 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -421,6 +421,7 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0x34d3), board_ahci_low_power }, /* Ice Lake LP AHCI */ { PCI_VDEVICE(INTEL, 0x02d3), board_ahci_low_power }, /* Comet Lake PCH-U AHCI */ { PCI_VDEVICE(INTEL, 0x02d7), board_ahci_low_power }, /* Comet Lake 
PCH RAID */ + { PCI_VDEVICE(INTEL, 0xa0d3), board_ahci_low_power }, /* Tiger Lake UP{3,4} AHCI */ /* JMicron 360/1/3/5/6, match class to avoid IDE function */ { PCI_VENDOR_ID_JMICRON, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 884ae73b11ea..c4c89d24f84c 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -3109,7 +3109,7 @@ int sata_down_spd_limit(struct ata_link *link, u32 spd_limit) */ if (spd > 1) mask &= (1 << (spd - 1)) - 1; - else + else if (link->sata_spd) return -EINVAL; /* were we already at the bottom? */ @@ -4045,6 +4045,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "Samsung SSD 870*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | ATA_HORKAGE_ZERO_AFTER_TRIM | ATA_HORKAGE_NO_NCQ_ON_ATI }, + { "SAMSUNG*MZ7LH*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | + ATA_HORKAGE_ZERO_AFTER_TRIM | + ATA_HORKAGE_NO_NCQ_ON_ATI, }, { "FCCT*M500*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | ATA_HORKAGE_ZERO_AFTER_TRIM }, diff --git a/drivers/ata/pata_octeon_cf.c b/drivers/ata/pata_octeon_cf.c index 35608a0cf552..4cbcdc5da038 100644 --- a/drivers/ata/pata_octeon_cf.c +++ b/drivers/ata/pata_octeon_cf.c @@ -67,7 +67,7 @@ module_param(enable_dma, int, 0444); MODULE_PARM_DESC(enable_dma, "Enable use of DMA on interfaces that support it (0=no dma [default], 1=use dma)"); -/** +/* * Convert nanosecond based time to setting used in the * boot bus timing register, based on timing multiple */ @@ -114,7 +114,7 @@ static void octeon_cf_set_boot_reg_cfg(int cs, unsigned int multiplier) cvmx_write_csr(CVMX_MIO_BOOT_REG_CFGX(cs), reg_cfg.u64); } -/** +/* * Called after libata determines the needed PIO mode. This * function programs the Octeon bootbus regions to support the * timing requirements of the PIO mode. @@ -278,7 +278,7 @@ static void octeon_cf_set_dmamode(struct ata_port *ap, struct ata_device *dev) cvmx_write_csr(cf_port->dma_base + DMA_TIM, dma_tim.u64); } -/** +/* * Handle an 8 bit I/O request. 
* * @qc: Queued command @@ -317,7 +317,7 @@ static unsigned int octeon_cf_data_xfer8(struct ata_queued_cmd *qc, return buflen; } -/** +/* * Handle a 16 bit I/O request. * * @qc: Queued command @@ -372,7 +372,7 @@ static unsigned int octeon_cf_data_xfer16(struct ata_queued_cmd *qc, return buflen; } -/** +/* * Read the taskfile for 16bit non-True IDE only. */ static void octeon_cf_tf_read16(struct ata_port *ap, struct ata_taskfile *tf) @@ -453,7 +453,7 @@ static int octeon_cf_softreset16(struct ata_link *link, unsigned int *classes, return 0; } -/** +/* * Load the taskfile for 16bit non-True IDE only. The device_addr is * not loaded, we do this as part of octeon_cf_exec_command16. */ @@ -525,7 +525,7 @@ static void octeon_cf_dma_setup(struct ata_queued_cmd *qc) ap->ops->sff_exec_command(ap, &qc->tf); } -/** +/* * Start a DMA transfer that was already setup * * @qc: Information about the DMA @@ -580,7 +580,7 @@ static void octeon_cf_dma_start(struct ata_queued_cmd *qc) cvmx_write_csr(cf_port->dma_base + DMA_CFG, mio_boot_dma_cfg.u64); } -/** +/* * * LOCKING: * spin_lock_irqsave(host lock) diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index e4bffeabf344..03e8a95f1f35 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -173,7 +173,7 @@ static int dev_mkdir(const char *name, umode_t mode) if (IS_ERR(dentry)) return PTR_ERR(dentry); - err = vfs_mkdir(&init_user_ns, d_inode(path.dentry), dentry, mode); + err = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode); if (!err) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; @@ -223,7 +223,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, if (IS_ERR(dentry)) return PTR_ERR(dentry); - err = vfs_mknod(&init_user_ns, d_inode(path.dentry), dentry, mode, + err = vfs_mknod(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode, dev->devt); if (!err) { struct iattr newattrs; @@ -233,7 +233,7 @@ static int handle_create(const char *nodename, 
umode_t mode, kuid_t uid, newattrs.ia_gid = gid; newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID; inode_lock(d_inode(dentry)); - notify_change(&init_user_ns, dentry, &newattrs, NULL); + notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); inode_unlock(d_inode(dentry)); /* mark as kernel-created inode */ @@ -254,7 +254,7 @@ static int dev_rmdir(const char *name) return PTR_ERR(dentry); if (d_really_is_positive(dentry)) { if (d_inode(dentry)->i_private == &thread) - err = vfs_rmdir(&init_user_ns, d_inode(parent.dentry), + err = vfs_rmdir(&nop_mnt_idmap, d_inode(parent.dentry), dentry); else err = -EPERM; @@ -341,9 +341,9 @@ static int handle_remove(const char *nodename, struct device *dev) newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; inode_lock(d_inode(dentry)); - notify_change(&init_user_ns, dentry, &newattrs, NULL); + notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); inode_unlock(d_inode(dentry)); - err = vfs_unlink(&init_user_ns, d_inode(parent.dentry), + err = vfs_unlink(&nop_mnt_idmap, d_inode(parent.dentry), dentry, NULL); if (!err || err == -ENOENT) deleted = 1; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index e54693204630..6368b56eacf1 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -137,7 +137,7 @@ struct ublk_device { char *__queues; - unsigned short queue_size; + unsigned int queue_size; struct ublksrv_ctrl_dev_info dev_info; struct blk_mq_tag_set tag_set; diff --git a/drivers/bus/sunxi-rsb.c b/drivers/bus/sunxi-rsb.c index 3aa91aed3bf7..226e87b85116 100644 --- a/drivers/bus/sunxi-rsb.c +++ b/drivers/bus/sunxi-rsb.c @@ -857,7 +857,13 @@ static int __init sunxi_rsb_init(void) return ret; } - return platform_driver_register(&sunxi_rsb_driver); + ret = platform_driver_register(&sunxi_rsb_driver); + if (ret) { + bus_unregister(&sunxi_rsb_bus); + return ret; + } + + return 0; } module_init(sunxi_rsb_init); diff --git a/drivers/char/tpm/eventlog/acpi.c b/drivers/char/tpm/eventlog/acpi.c index 
0913d3eb8d51..40360e599bc3 100644 --- a/drivers/char/tpm/eventlog/acpi.c +++ b/drivers/char/tpm/eventlog/acpi.c @@ -14,6 +14,7 @@ * Access to the event log extended by the TCG BIOS of PC platform */ +#include <linux/device.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/security.h> @@ -135,7 +136,7 @@ int tpm_read_log_acpi(struct tpm_chip *chip) } /* malloc EventLog space */ - log->bios_event_log = kmalloc(len, GFP_KERNEL); + log->bios_event_log = devm_kmalloc(&chip->dev, len, GFP_KERNEL); if (!log->bios_event_log) return -ENOMEM; @@ -160,7 +161,7 @@ int tpm_read_log_acpi(struct tpm_chip *chip) return format; err: - kfree(log->bios_event_log); + devm_kfree(&chip->dev, log->bios_event_log); log->bios_event_log = NULL; return ret; } diff --git a/drivers/char/tpm/eventlog/efi.c b/drivers/char/tpm/eventlog/efi.c index e6cb9d525e30..4e9d7c2bf32e 100644 --- a/drivers/char/tpm/eventlog/efi.c +++ b/drivers/char/tpm/eventlog/efi.c @@ -6,6 +6,7 @@ * Thiebaud Weksteen <tweek@google.com> */ +#include <linux/device.h> #include <linux/efi.h> #include <linux/tpm_eventlog.h> @@ -55,7 +56,7 @@ int tpm_read_log_efi(struct tpm_chip *chip) } /* malloc EventLog space */ - log->bios_event_log = kmemdup(log_tbl->log, log_size, GFP_KERNEL); + log->bios_event_log = devm_kmemdup(&chip->dev, log_tbl->log, log_size, GFP_KERNEL); if (!log->bios_event_log) { ret = -ENOMEM; goto out; @@ -76,7 +77,7 @@ int tpm_read_log_efi(struct tpm_chip *chip) MEMREMAP_WB); if (!final_tbl) { pr_err("Could not map UEFI TPM final log\n"); - kfree(log->bios_event_log); + devm_kfree(&chip->dev, log->bios_event_log); ret = -ENOMEM; goto out; } @@ -91,11 +92,11 @@ int tpm_read_log_efi(struct tpm_chip *chip) * Allocate memory for the 'combined log' where we will append the * 'final events log' to. 
*/ - tmp = krealloc(log->bios_event_log, - log_size + final_events_log_size, - GFP_KERNEL); + tmp = devm_krealloc(&chip->dev, log->bios_event_log, + log_size + final_events_log_size, + GFP_KERNEL); if (!tmp) { - kfree(log->bios_event_log); + devm_kfree(&chip->dev, log->bios_event_log); ret = -ENOMEM; goto out; } diff --git a/drivers/char/tpm/eventlog/of.c b/drivers/char/tpm/eventlog/of.c index a9ce66d09a75..930fe43d5daf 100644 --- a/drivers/char/tpm/eventlog/of.c +++ b/drivers/char/tpm/eventlog/of.c @@ -10,13 +10,44 @@ * Read the event log created by the firmware on PPC64 */ +#include <linux/device.h> #include <linux/slab.h> +#include <linux/io.h> +#include <linux/ioport.h> #include <linux/of.h> +#include <linux/of_address.h> +#include <linux/of_reserved_mem.h> #include <linux/tpm_eventlog.h> #include "../tpm.h" #include "common.h" +static int tpm_read_log_memory_region(struct tpm_chip *chip) +{ + struct device_node *node; + struct resource res; + int rc; + + node = of_parse_phandle(chip->dev.parent->of_node, "memory-region", 0); + if (!node) + return -ENODEV; + + rc = of_address_to_resource(node, 0, &res); + of_node_put(node); + if (rc) + return rc; + + chip->log.bios_event_log = devm_memremap(&chip->dev, res.start, resource_size(&res), + MEMREMAP_WB); + if (IS_ERR(chip->log.bios_event_log)) + return -ENOMEM; + + chip->log.bios_event_log_end = chip->log.bios_event_log + resource_size(&res); + + return chip->flags & TPM_CHIP_FLAG_TPM2 ? 
EFI_TCG2_EVENT_LOG_FORMAT_TCG_2 : + EFI_TCG2_EVENT_LOG_FORMAT_TCG_1_2; +} + int tpm_read_log_of(struct tpm_chip *chip) { struct device_node *np; @@ -38,7 +69,7 @@ int tpm_read_log_of(struct tpm_chip *chip) sizep = of_get_property(np, "linux,sml-size", NULL); basep = of_get_property(np, "linux,sml-base", NULL); if (sizep == NULL && basep == NULL) - return -ENODEV; + return tpm_read_log_memory_region(chip); if (sizep == NULL || basep == NULL) return -EIO; @@ -65,7 +96,7 @@ int tpm_read_log_of(struct tpm_chip *chip) return -EIO; } - log->bios_event_log = kmemdup(__va(base), size, GFP_KERNEL); + log->bios_event_log = devm_kmemdup(&chip->dev, __va(base), size, GFP_KERNEL); if (!log->bios_event_log) return -ENOMEM; diff --git a/drivers/char/tpm/st33zp24/i2c.c b/drivers/char/tpm/st33zp24/i2c.c index 8156bb2af78c..c4d0b744e3cc 100644 --- a/drivers/char/tpm/st33zp24/i2c.c +++ b/drivers/char/tpm/st33zp24/i2c.c @@ -101,8 +101,7 @@ static const struct st33zp24_phy_ops i2c_phy_ops = { * @return: 0 in case of success. * -1 in other case. 
*/ -static int st33zp24_i2c_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int st33zp24_i2c_probe(struct i2c_client *client) { struct st33zp24_i2c_phy *phy; @@ -161,7 +160,7 @@ static struct i2c_driver st33zp24_i2c_driver = { .of_match_table = of_match_ptr(of_st33zp24_i2c_match), .acpi_match_table = ACPI_PTR(st33zp24_i2c_acpi_match), }, - .probe = st33zp24_i2c_probe, + .probe_new = st33zp24_i2c_probe, .remove = st33zp24_i2c_remove, .id_table = st33zp24_i2c_id }; diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c index 741d8f3e8fb3..b99f55f2d4fd 100644 --- a/drivers/char/tpm/tpm-chip.c +++ b/drivers/char/tpm/tpm-chip.c @@ -267,7 +267,6 @@ static void tpm_dev_release(struct device *dev) idr_remove(&dev_nums_idr, chip->dev_num); mutex_unlock(&idr_lock); - kfree(chip->log.bios_event_log); kfree(chip->work_space.context_buf); kfree(chip->work_space.session_buf); kfree(chip->allocated_banks); diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index 65d03867e114..93545be190a5 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -777,10 +777,12 @@ out: int tpm2_find_cc(struct tpm_chip *chip, u32 cc) { + u32 cc_mask; int i; + cc_mask = 1 << TPM2_CC_ATTR_VENDOR | GENMASK(15, 0); for (i = 0; i < chip->nr_commands; i++) - if (cc == (chip->cc_attrs_tbl[i] & GENMASK(15, 0))) + if (cc == (chip->cc_attrs_tbl[i] & cc_mask)) return i; return -1; diff --git a/drivers/char/tpm/tpm_crb.c b/drivers/char/tpm/tpm_crb.c index 7e9da671a0e8..d43a0d7b97a8 100644 --- a/drivers/char/tpm/tpm_crb.c +++ b/drivers/char/tpm/tpm_crb.c @@ -98,6 +98,8 @@ struct crb_priv { u8 __iomem *rsp; u32 cmd_size; u32 smc_func_id; + u32 __iomem *pluton_start_addr; + u32 __iomem *pluton_reply_addr; }; struct tpm2_crb_smc { @@ -108,6 +110,11 @@ struct tpm2_crb_smc { u32 smc_func_id; }; +struct tpm2_crb_pluton { + u64 start_addr; + u64 reply_addr; +}; + static bool crb_wait_for_reg_32(u32 __iomem *reg, u32 mask, u32 value, 
unsigned long timeout) { @@ -127,6 +134,25 @@ static bool crb_wait_for_reg_32(u32 __iomem *reg, u32 mask, u32 value, return ((ioread32(reg) & mask) == value); } +static int crb_try_pluton_doorbell(struct crb_priv *priv, bool wait_for_complete) +{ + if (priv->sm != ACPI_TPM2_COMMAND_BUFFER_WITH_PLUTON) + return 0; + + if (!crb_wait_for_reg_32(priv->pluton_reply_addr, ~0, 1, TPM2_TIMEOUT_C)) + return -ETIME; + + iowrite32(1, priv->pluton_start_addr); + if (wait_for_complete == false) + return 0; + + if (!crb_wait_for_reg_32(priv->pluton_start_addr, + 0xffffffff, 0, 200)) + return -ETIME; + + return 0; +} + /** * __crb_go_idle - request tpm crb device to go the idle state * @@ -145,6 +171,8 @@ static bool crb_wait_for_reg_32(u32 __iomem *reg, u32 mask, u32 value, */ static int __crb_go_idle(struct device *dev, struct crb_priv *priv) { + int rc; + if ((priv->sm == ACPI_TPM2_START_METHOD) || (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_START_METHOD) || (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_ARM_SMC)) @@ -152,6 +180,10 @@ static int __crb_go_idle(struct device *dev, struct crb_priv *priv) iowrite32(CRB_CTRL_REQ_GO_IDLE, &priv->regs_t->ctrl_req); + rc = crb_try_pluton_doorbell(priv, true); + if (rc) + return rc; + if (!crb_wait_for_reg_32(&priv->regs_t->ctrl_req, CRB_CTRL_REQ_GO_IDLE/* mask */, 0, /* value */ @@ -188,12 +220,19 @@ static int crb_go_idle(struct tpm_chip *chip) */ static int __crb_cmd_ready(struct device *dev, struct crb_priv *priv) { + int rc; + if ((priv->sm == ACPI_TPM2_START_METHOD) || (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_START_METHOD) || (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_ARM_SMC)) return 0; iowrite32(CRB_CTRL_REQ_CMD_READY, &priv->regs_t->ctrl_req); + + rc = crb_try_pluton_doorbell(priv, true); + if (rc) + return rc; + if (!crb_wait_for_reg_32(&priv->regs_t->ctrl_req, CRB_CTRL_REQ_CMD_READY /* mask */, 0, /* value */ @@ -371,6 +410,10 @@ static int crb_send(struct tpm_chip *chip, u8 *buf, size_t len) return -E2BIG; } + /* Seems to be 
necessary for every command */ + if (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_PLUTON) + __crb_cmd_ready(&chip->dev, priv); + memcpy_toio(priv->cmd, buf, len); /* Make sure that cmd is populated before issuing start. */ @@ -394,7 +437,10 @@ static int crb_send(struct tpm_chip *chip, u8 *buf, size_t len) rc = tpm_crb_smc_start(&chip->dev, priv->smc_func_id); } - return rc; + if (rc) + return rc; + + return crb_try_pluton_doorbell(priv, false); } static void crb_cancel(struct tpm_chip *chip) @@ -524,15 +570,18 @@ static int crb_map_io(struct acpi_device *device, struct crb_priv *priv, return ret; acpi_dev_free_resource_list(&acpi_resource_list); - if (resource_type(iores_array) != IORESOURCE_MEM) { - dev_err(dev, FW_BUG "TPM2 ACPI table does not define a memory resource\n"); - return -EINVAL; - } else if (resource_type(iores_array + TPM_CRB_MAX_RESOURCES) == - IORESOURCE_MEM) { - dev_warn(dev, "TPM2 ACPI table defines too many memory resources\n"); - memset(iores_array + TPM_CRB_MAX_RESOURCES, - 0, sizeof(*iores_array)); - iores_array[TPM_CRB_MAX_RESOURCES].flags = 0; + /* Pluton doesn't appear to define ACPI memory regions */ + if (priv->sm != ACPI_TPM2_COMMAND_BUFFER_WITH_PLUTON) { + if (resource_type(iores_array) != IORESOURCE_MEM) { + dev_err(dev, FW_BUG "TPM2 ACPI table does not define a memory resource\n"); + return -EINVAL; + } else if (resource_type(iores_array + TPM_CRB_MAX_RESOURCES) == + IORESOURCE_MEM) { + dev_warn(dev, "TPM2 ACPI table defines too many memory resources\n"); + memset(iores_array + TPM_CRB_MAX_RESOURCES, + 0, sizeof(*iores_array)); + iores_array[TPM_CRB_MAX_RESOURCES].flags = 0; + } } iores = NULL; @@ -656,6 +705,22 @@ out_relinquish_locality: return ret; } +static int crb_map_pluton(struct device *dev, struct crb_priv *priv, + struct acpi_table_tpm2 *buf, struct tpm2_crb_pluton *crb_pluton) +{ + priv->pluton_start_addr = crb_map_res(dev, NULL, NULL, + crb_pluton->start_addr, 4); + if (IS_ERR(priv->pluton_start_addr)) + return 
PTR_ERR(priv->pluton_start_addr); + + priv->pluton_reply_addr = crb_map_res(dev, NULL, NULL, + crb_pluton->reply_addr, 4); + if (IS_ERR(priv->pluton_reply_addr)) + return PTR_ERR(priv->pluton_reply_addr); + + return 0; +} + static int crb_acpi_add(struct acpi_device *device) { struct acpi_table_tpm2 *buf; @@ -663,6 +728,7 @@ static int crb_acpi_add(struct acpi_device *device) struct tpm_chip *chip; struct device *dev = &device->dev; struct tpm2_crb_smc *crb_smc; + struct tpm2_crb_pluton *crb_pluton; acpi_status status; u32 sm; int rc; @@ -700,6 +766,20 @@ static int crb_acpi_add(struct acpi_device *device) priv->smc_func_id = crb_smc->smc_func_id; } + if (sm == ACPI_TPM2_COMMAND_BUFFER_WITH_PLUTON) { + if (buf->header.length < (sizeof(*buf) + sizeof(*crb_pluton))) { + dev_err(dev, + FW_BUG "TPM2 ACPI table has wrong size %u for start method type %d\n", + buf->header.length, + ACPI_TPM2_COMMAND_BUFFER_WITH_PLUTON); + return -EINVAL; + } + crb_pluton = ACPI_ADD_PTR(struct tpm2_crb_pluton, buf, sizeof(*buf)); + rc = crb_map_pluton(dev, priv, buf, crb_pluton); + if (rc) + return rc; + } + priv->sm = sm; priv->hid = acpi_device_hid(device); diff --git a/drivers/char/tpm/tpm_i2c_atmel.c b/drivers/char/tpm/tpm_i2c_atmel.c index 4be3677c1463..8f77154e0550 100644 --- a/drivers/char/tpm/tpm_i2c_atmel.c +++ b/drivers/char/tpm/tpm_i2c_atmel.c @@ -146,8 +146,7 @@ static const struct tpm_class_ops i2c_atmel = { .req_canceled = i2c_atmel_req_canceled, }; -static int i2c_atmel_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int i2c_atmel_probe(struct i2c_client *client) { struct tpm_chip *chip; struct device *dev = &client->dev; @@ -204,7 +203,7 @@ static SIMPLE_DEV_PM_OPS(i2c_atmel_pm_ops, tpm_pm_suspend, tpm_pm_resume); static struct i2c_driver i2c_atmel_driver = { .id_table = i2c_atmel_id, - .probe = i2c_atmel_probe, + .probe_new = i2c_atmel_probe, .remove = i2c_atmel_remove, .driver = { .name = I2C_DRIVER_NAME, diff --git 
a/drivers/char/tpm/tpm_i2c_infineon.c b/drivers/char/tpm/tpm_i2c_infineon.c index fd3c3661e646..7cdaff52a96d 100644 --- a/drivers/char/tpm/tpm_i2c_infineon.c +++ b/drivers/char/tpm/tpm_i2c_infineon.c @@ -681,8 +681,7 @@ MODULE_DEVICE_TABLE(of, tpm_tis_i2c_of_match); static SIMPLE_DEV_PM_OPS(tpm_tis_i2c_ops, tpm_pm_suspend, tpm_pm_resume); -static int tpm_tis_i2c_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int tpm_tis_i2c_probe(struct i2c_client *client) { int rc; struct device *dev = &(client->dev); @@ -717,7 +716,7 @@ static void tpm_tis_i2c_remove(struct i2c_client *client) static struct i2c_driver tpm_tis_i2c_driver = { .id_table = tpm_tis_i2c_table, - .probe = tpm_tis_i2c_probe, + .probe_new = tpm_tis_i2c_probe, .remove = tpm_tis_i2c_remove, .driver = { .name = "tpm_i2c_infineon", diff --git a/drivers/char/tpm/tpm_i2c_nuvoton.c b/drivers/char/tpm/tpm_i2c_nuvoton.c index 95c37350cc8e..a026e98add50 100644 --- a/drivers/char/tpm/tpm_i2c_nuvoton.c +++ b/drivers/char/tpm/tpm_i2c_nuvoton.c @@ -522,9 +522,9 @@ static int get_vid(struct i2c_client *client, u32 *res) return 0; } -static int i2c_nuvoton_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int i2c_nuvoton_probe(struct i2c_client *client) { + const struct i2c_device_id *id = i2c_client_get_device_id(client); int rc; struct tpm_chip *chip; struct device *dev = &client->dev; @@ -650,7 +650,7 @@ static SIMPLE_DEV_PM_OPS(i2c_nuvoton_pm_ops, tpm_pm_suspend, tpm_pm_resume); static struct i2c_driver i2c_nuvoton_driver = { .id_table = i2c_nuvoton_id, - .probe = i2c_nuvoton_probe, + .probe_new = i2c_nuvoton_probe, .remove = i2c_nuvoton_remove, .driver = { .name = "tpm_i2c_nuvoton", diff --git a/drivers/char/tpm/tpm_tis_i2c.c b/drivers/char/tpm/tpm_tis_i2c.c index f3a7251c8e38..c8c34adc14c0 100644 --- a/drivers/char/tpm/tpm_tis_i2c.c +++ b/drivers/char/tpm/tpm_tis_i2c.c @@ -312,8 +312,7 @@ static const struct tpm_tis_phy_ops tpm_i2c_phy_ops = { .verify_crc = 
tpm_tis_i2c_verify_crc, }; -static int tpm_tis_i2c_probe(struct i2c_client *dev, - const struct i2c_device_id *id) +static int tpm_tis_i2c_probe(struct i2c_client *dev) { struct tpm_tis_i2c_phy *phy; const u8 crc_enable = 1; @@ -380,7 +379,7 @@ static struct i2c_driver tpm_tis_i2c_driver = { .pm = &tpm_tis_pm, .of_match_table = of_match_ptr(of_tis_i2c_match), }, - .probe = tpm_tis_i2c_probe, + .probe_new = tpm_tis_i2c_probe, .remove = tpm_tis_i2c_remove, .id_table = tpm_tis_i2c_id, }; diff --git a/drivers/clk/ingenic/jz4760-cgu.c b/drivers/clk/ingenic/jz4760-cgu.c index ecd395ac8a28..e407f00bd594 100644 --- a/drivers/clk/ingenic/jz4760-cgu.c +++ b/drivers/clk/ingenic/jz4760-cgu.c @@ -58,7 +58,7 @@ jz4760_cgu_calc_m_n_od(const struct ingenic_cgu_pll_info *pll_info, unsigned long rate, unsigned long parent_rate, unsigned int *pm, unsigned int *pn, unsigned int *pod) { - unsigned int m, n, od, m_max = (1 << pll_info->m_bits) - 2; + unsigned int m, n, od, m_max = (1 << pll_info->m_bits) - 1; /* The frequency after the N divider must be between 1 and 50 MHz. */ n = parent_rate / (1 * MHZ); @@ -66,19 +66,17 @@ jz4760_cgu_calc_m_n_od(const struct ingenic_cgu_pll_info *pll_info, /* The N divider must be >= 2. 
*/ n = clamp_val(n, 2, 1 << pll_info->n_bits); - for (;; n >>= 1) { - od = (unsigned int)-1; + rate /= MHZ; + parent_rate /= MHZ; - do { - m = (rate / MHZ) * (1 << ++od) * n / (parent_rate / MHZ); - } while ((m > m_max || m & 1) && (od < 4)); - - if (od < 4 && m >= 4 && m <= m_max) - break; + for (m = m_max; m >= m_max && n >= 2; n--) { + m = rate * n / parent_rate; + od = m & 1; + m <<= od; } *pm = m; - *pn = n; + *pn = n + 1; *pod = 1 << od; } diff --git a/drivers/clk/microchip/clk-mpfs-ccc.c b/drivers/clk/microchip/clk-mpfs-ccc.c index 32aae880a14f..0ddc73e07be4 100644 --- a/drivers/clk/microchip/clk-mpfs-ccc.c +++ b/drivers/clk/microchip/clk-mpfs-ccc.c @@ -164,12 +164,11 @@ static int mpfs_ccc_register_outputs(struct device *dev, struct mpfs_ccc_out_hw_ for (unsigned int i = 0; i < num_clks; i++) { struct mpfs_ccc_out_hw_clock *out_hw = &out_hws[i]; - char *name = devm_kzalloc(dev, 23, GFP_KERNEL); + char *name = devm_kasprintf(dev, GFP_KERNEL, "%s_out%u", parent->name, i); if (!name) return -ENOMEM; - snprintf(name, 23, "%s_out%u", parent->name, i); out_hw->divider.hw.init = CLK_HW_INIT_HW(name, &parent->hw, &clk_divider_ops, 0); out_hw->divider.reg = data->pll_base[i / MPFS_CCC_OUTPUTS_PER_PLL] + out_hw->reg_offset; @@ -201,14 +200,13 @@ static int mpfs_ccc_register_plls(struct device *dev, struct mpfs_ccc_pll_hw_clo for (unsigned int i = 0; i < num_clks; i++) { struct mpfs_ccc_pll_hw_clock *pll_hw = &pll_hws[i]; - char *name = devm_kzalloc(dev, 18, GFP_KERNEL); - if (!name) + pll_hw->name = devm_kasprintf(dev, GFP_KERNEL, "ccc%s_pll%u", + strchrnul(dev->of_node->full_name, '@'), i); + if (!pll_hw->name) return -ENOMEM; pll_hw->base = data->pll_base[i]; - snprintf(name, 18, "ccc%s_pll%u", strchrnul(dev->of_node->full_name, '@'), i); - pll_hw->name = (const char *)name; pll_hw->hw.init = CLK_HW_INIT_PARENTS_DATA_FIXED_SIZE(pll_hw->name, pll_hw->parents, &mpfs_ccc_pll_ops, 0); diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c 
index 9505a812d6a1..d3f55ca06ed3 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -143,40 +143,42 @@ static unsigned long qcom_lmh_get_throttle_freq(struct qcom_cpufreq_data *data) return lval * xo_rate; } -/* Get the current frequency of the CPU (after throttling) */ -static unsigned int qcom_cpufreq_hw_get(unsigned int cpu) +/* Get the frequency requested by the cpufreq core for the CPU */ +static unsigned int qcom_cpufreq_get_freq(unsigned int cpu) { struct qcom_cpufreq_data *data; + const struct qcom_cpufreq_soc_data *soc_data; struct cpufreq_policy *policy; + unsigned int index; policy = cpufreq_cpu_get_raw(cpu); if (!policy) return 0; data = policy->driver_data; + soc_data = qcom_cpufreq.soc_data; - return qcom_lmh_get_throttle_freq(data) / HZ_PER_KHZ; + index = readl_relaxed(data->base + soc_data->reg_perf_state); + index = min(index, LUT_MAX_ENTRIES - 1); + + return policy->freq_table[index].frequency; } -/* Get the frequency requested by the cpufreq core for the CPU */ -static unsigned int qcom_cpufreq_get_freq(unsigned int cpu) +static unsigned int qcom_cpufreq_hw_get(unsigned int cpu) { struct qcom_cpufreq_data *data; - const struct qcom_cpufreq_soc_data *soc_data; struct cpufreq_policy *policy; - unsigned int index; policy = cpufreq_cpu_get_raw(cpu); if (!policy) return 0; data = policy->driver_data; - soc_data = qcom_cpufreq.soc_data; - index = readl_relaxed(data->base + soc_data->reg_perf_state); - index = min(index, LUT_MAX_ENTRIES - 1); + if (data->throttle_irq >= 0) + return qcom_lmh_get_throttle_freq(data) / HZ_PER_KHZ; - return policy->freq_table[index].frequency; + return qcom_cpufreq_get_freq(cpu); } static unsigned int qcom_cpufreq_hw_fast_switch(struct cpufreq_policy *policy, @@ -704,6 +706,8 @@ static int qcom_cpufreq_hw_driver_probe(struct platform_device *pdev) return -ENOMEM; qcom_cpufreq.soc_data = of_device_get_match_data(dev); + if (!qcom_cpufreq.soc_data) + return -ENODEV; clk_data = 
devm_kzalloc(dev, struct_size(clk_data, hws, num_domains), GFP_KERNEL); if (!clk_data) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 02f28da519e3..940f805b1534 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -131,7 +131,7 @@ static int cxl_region_decode_reset(struct cxl_region *cxlr, int count) struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct cxl_port *iter = cxled_to_port(cxled); struct cxl_ep *ep; - int rc; + int rc = 0; while (!is_cxl_root(to_cxl_port(iter->dev.parent))) iter = to_cxl_port(iter->dev.parent); @@ -143,7 +143,8 @@ static int cxl_region_decode_reset(struct cxl_region *cxlr, int count) cxl_rr = cxl_rr_load(iter, cxlr); cxld = cxl_rr->decoder; - rc = cxld->reset(cxld); + if (cxld->reset) + rc = cxld->reset(cxld); if (rc) return rc; } @@ -186,7 +187,8 @@ static int cxl_region_decode_commit(struct cxl_region *cxlr) iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) { cxl_rr = cxl_rr_load(iter, cxlr); cxld = cxl_rr->decoder; - cxld->reset(cxld); + if (cxld->reset) + cxld->reset(cxld); } cxled->cxld.reset(&cxled->cxld); @@ -991,10 +993,10 @@ static int cxl_port_setup_targets(struct cxl_port *port, int i, distance; /* - * Passthrough ports impose no distance requirements between + * Passthrough decoders impose no distance requirements between * peers */ - if (port->nr_dports == 1) + if (cxl_rr->nr_targets == 1) distance = 0; else distance = p->nr_targets / cxl_rr->nr_targets; diff --git a/drivers/dax/super.c b/drivers/dax/super.c index da4438f3188c..c4c4728a36e4 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -475,7 +475,7 @@ EXPORT_SYMBOL_GPL(put_dax); /** * dax_holder() - obtain the holder of a dax device * @dax_dev: a dax_device instance - + * * Return: the holder's data which represents the holder if registered, * otherwize NULL. 
*/ diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 406b4e26f538..0de0482cd36e 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -167,7 +167,7 @@ struct dma_fence *dma_fence_allocate_private_stub(void) 0, 0); set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, - &dma_fence_stub.flags); + &fence->flags); dma_fence_signal(fence); diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 9c89f7d53e99..958aa4662ccb 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -819,8 +819,10 @@ static int ioctl_send_response(struct client *client, union ioctl_arg *arg) r = container_of(resource, struct inbound_transaction_resource, resource); - if (is_fcp_request(r->request)) + if (is_fcp_request(r->request)) { + kfree(r->data); goto out; + } if (a->length != fw_get_response_length(r->request)) { ret = -EINVAL; diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index a2b0cbc8741c..1e0b016fdc2b 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -1007,6 +1007,8 @@ int __ref efi_mem_reserve_persistent(phys_addr_t addr, u64 size) /* first try to find a slot in an existing linked list entry */ for (prsv = efi_memreserve_root->next; prsv; ) { rsv = memremap(prsv, sizeof(*rsv), MEMREMAP_WB); + if (!rsv) + return -ENOMEM; index = atomic_fetch_add_unless(&rsv->count, 1, rsv->size); if (index < rsv->size) { rsv->entry[index].base = addr; diff --git a/drivers/firmware/efi/libstub/arm64.c b/drivers/firmware/efi/libstub/arm64.c index ff2d18c42ee7..4501652e11ab 100644 --- a/drivers/firmware/efi/libstub/arm64.c +++ b/drivers/firmware/efi/libstub/arm64.c @@ -19,10 +19,13 @@ static bool system_needs_vamap(void) const u8 *type1_family = efi_get_smbios_string(1, family); /* - * Ampere Altra machines crash in SetTime() if SetVirtualAddressMap() - * has not been called prior. 
+ * Ampere eMAG, Altra, and Altra Max machines crash in SetTime() if + * SetVirtualAddressMap() has not been called prior. */ - if (!type1_family || strcmp(type1_family, "Altra")) + if (!type1_family || ( + strcmp(type1_family, "eMAG") && + strcmp(type1_family, "Altra") && + strcmp(type1_family, "Altra Max"))) return false; efi_warn("Working around broken SetVirtualAddressMap()\n"); diff --git a/drivers/firmware/efi/memattr.c b/drivers/firmware/efi/memattr.c index 0a9aba5f9cef..f178b2984dfb 100644 --- a/drivers/firmware/efi/memattr.c +++ b/drivers/firmware/efi/memattr.c @@ -33,7 +33,7 @@ int __init efi_memattr_init(void) return -ENOMEM; } - if (tbl->version > 1) { + if (tbl->version > 2) { pr_warn("Unexpected EFI Memory Attributes table version %d\n", tbl->version); goto unmap; diff --git a/drivers/fpga/intel-m10-bmc-sec-update.c b/drivers/fpga/intel-m10-bmc-sec-update.c index 79d48852825e..03f1bd81c434 100644 --- a/drivers/fpga/intel-m10-bmc-sec-update.c +++ b/drivers/fpga/intel-m10-bmc-sec-update.c @@ -574,20 +574,27 @@ static int m10bmc_sec_probe(struct platform_device *pdev) len = scnprintf(buf, SEC_UPDATE_LEN_MAX, "secure-update%d", sec->fw_name_id); sec->fw_name = kmemdup_nul(buf, len, GFP_KERNEL); - if (!sec->fw_name) - return -ENOMEM; + if (!sec->fw_name) { + ret = -ENOMEM; + goto fw_name_fail; + } fwl = firmware_upload_register(THIS_MODULE, sec->dev, sec->fw_name, &m10bmc_ops, sec); if (IS_ERR(fwl)) { dev_err(sec->dev, "Firmware Upload driver failed to start\n"); - kfree(sec->fw_name); - xa_erase(&fw_upload_xa, sec->fw_name_id); - return PTR_ERR(fwl); + ret = PTR_ERR(fwl); + goto fw_uploader_fail; } sec->fwl = fwl; return 0; + +fw_uploader_fail: + kfree(sec->fw_name); +fw_name_fail: + xa_erase(&fw_upload_xa, sec->fw_name_id); + return ret; } static int m10bmc_sec_remove(struct platform_device *pdev) diff --git a/drivers/fpga/stratix10-soc.c b/drivers/fpga/stratix10-soc.c index 357cea58ec98..f7f01982a512 100644 --- a/drivers/fpga/stratix10-soc.c +++ 
b/drivers/fpga/stratix10-soc.c @@ -213,9 +213,9 @@ static int s10_ops_write_init(struct fpga_manager *mgr, /* Allocate buffers from the service layer's pool. */ for (i = 0; i < NUM_SVC_BUFS; i++) { kbuf = stratix10_svc_allocate_memory(priv->chan, SVC_BUF_SIZE); - if (!kbuf) { + if (IS_ERR(kbuf)) { s10_free_buffers(mgr); - ret = -ENOMEM; + ret = PTR_ERR(kbuf); goto init_done; } diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index ec7cfd4f52b1..e9917a45b005 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -1531,6 +1531,7 @@ config GPIO_MLXBF2 tristate "Mellanox BlueField 2 SoC GPIO" depends on (MELLANOX_PLATFORM && ARM64 && ACPI) || (64BIT && COMPILE_TEST) select GPIO_GENERIC + select GPIOLIB_IRQCHIP help Say Y here if you want GPIO support on Mellanox BlueField 2 SoC. diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c index 60514bc5454f..9e3893b19e4f 100644 --- a/drivers/gpio/gpio-sim.c +++ b/drivers/gpio/gpio-sim.c @@ -736,7 +736,7 @@ static void gpio_sim_remove_hogs(struct gpio_sim_device *dev) gpiod_remove_hogs(dev->hogs); - for (hog = dev->hogs; !hog->chip_label; hog++) { + for (hog = dev->hogs; hog->chip_label; hog++) { kfree(hog->chip_label); kfree(hog->line_name); } diff --git a/drivers/gpio/gpio-vf610.c b/drivers/gpio/gpio-vf610.c index 9db42f6a2043..9033db00c360 100644 --- a/drivers/gpio/gpio-vf610.c +++ b/drivers/gpio/gpio-vf610.c @@ -30,7 +30,6 @@ struct fsl_gpio_soc_data { struct vf610_gpio_port { struct gpio_chip gc; - struct irq_chip ic; void __iomem *base; void __iomem *gpio_base; const struct fsl_gpio_soc_data *sdata; @@ -207,20 +206,24 @@ static int vf610_gpio_irq_set_type(struct irq_data *d, u32 type) static void vf610_gpio_irq_mask(struct irq_data *d) { - struct vf610_gpio_port *port = - gpiochip_get_data(irq_data_get_irq_chip_data(d)); - void __iomem *pcr_base = port->base + PORT_PCR(d->hwirq); + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct vf610_gpio_port *port = gpiochip_get_data(gc); + 
irq_hw_number_t gpio_num = irqd_to_hwirq(d); + void __iomem *pcr_base = port->base + PORT_PCR(gpio_num); vf610_gpio_writel(0, pcr_base); + gpiochip_disable_irq(gc, gpio_num); } static void vf610_gpio_irq_unmask(struct irq_data *d) { - struct vf610_gpio_port *port = - gpiochip_get_data(irq_data_get_irq_chip_data(d)); - void __iomem *pcr_base = port->base + PORT_PCR(d->hwirq); + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct vf610_gpio_port *port = gpiochip_get_data(gc); + irq_hw_number_t gpio_num = irqd_to_hwirq(d); + void __iomem *pcr_base = port->base + PORT_PCR(gpio_num); - vf610_gpio_writel(port->irqc[d->hwirq] << PORT_PCR_IRQC_OFFSET, + gpiochip_enable_irq(gc, gpio_num); + vf610_gpio_writel(port->irqc[gpio_num] << PORT_PCR_IRQC_OFFSET, pcr_base); } @@ -237,6 +240,17 @@ static int vf610_gpio_irq_set_wake(struct irq_data *d, u32 enable) return 0; } +static const struct irq_chip vf610_irqchip = { + .name = "gpio-vf610", + .irq_ack = vf610_gpio_irq_ack, + .irq_mask = vf610_gpio_irq_mask, + .irq_unmask = vf610_gpio_irq_unmask, + .irq_set_type = vf610_gpio_irq_set_type, + .irq_set_wake = vf610_gpio_irq_set_wake, + .flags = IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; + static void vf610_gpio_disable_clk(void *data) { clk_disable_unprepare(data); @@ -249,7 +263,6 @@ static int vf610_gpio_probe(struct platform_device *pdev) struct vf610_gpio_port *port; struct gpio_chip *gc; struct gpio_irq_chip *girq; - struct irq_chip *ic; int i; int ret; @@ -315,14 +328,6 @@ static int vf610_gpio_probe(struct platform_device *pdev) gc->direction_output = vf610_gpio_direction_output; gc->set = vf610_gpio_set; - ic = &port->ic; - ic->name = "gpio-vf610"; - ic->irq_ack = vf610_gpio_irq_ack; - ic->irq_mask = vf610_gpio_irq_mask; - ic->irq_unmask = vf610_gpio_irq_unmask; - ic->irq_set_type = vf610_gpio_irq_set_type; - ic->irq_set_wake = vf610_gpio_irq_set_wake; - /* Mask all GPIO interrupts */ for (i = 0; i < gc->ngpio; i++) vf610_gpio_writel(0, port->base + 
PORT_PCR(i)); @@ -331,7 +336,7 @@ static int vf610_gpio_probe(struct platform_device *pdev) vf610_gpio_writel(~0, port->base + PORT_ISFR); girq = &gc->irq; - girq->chip = ic; + gpio_irq_chip_set_chip(girq, &vf610_irqchip); girq->parent_handler = vf610_gpio_irq_handler; girq->num_parents = 1; girq->parents = devm_kcalloc(&pdev->dev, 1, diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index 17c53f484280..34ff048e70d0 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -1637,6 +1637,18 @@ static const struct dmi_system_id gpiolib_acpi_quirks[] __initconst = { .ignore_wake = "ELAN0415:00@9", }, }, + { + /* + * Spurious wakeups from TP_ATTN# pin + * Found in BIOS 1.7.7 + */ + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "NH5xAx"), + }, + .driver_data = &(struct acpi_gpiolib_dmi_quirk) { + .ignore_wake = "SYNA1202:00@16", + }, + }, {} /* Terminating entry */ }; diff --git a/drivers/gpio/gpiolib-acpi.h b/drivers/gpio/gpiolib-acpi.h index 9475f99a9694..5a08693b8fb1 100644 --- a/drivers/gpio/gpiolib-acpi.h +++ b/drivers/gpio/gpiolib-acpi.h @@ -14,7 +14,6 @@ #include <linux/gpio/consumer.h> -struct acpi_device; struct device; struct fwnode_handle; diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 315cbdf61979..9abfb482b615 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -53,7 +53,8 @@ config DRM_DEBUG_MM config DRM_USE_DYNAMIC_DEBUG bool "use dynamic debug to implement drm.debug" - default y + default n + depends on BROKEN depends on DRM depends on DYNAMIC_DEBUG || DYNAMIC_DEBUG_CORE depends on JUMP_LABEL diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index e3e2e6e3b485..d148a1bd85e6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -243,6 +243,7 @@ extern int amdgpu_num_kcq; #define AMDGPU_VCNFW_LOG_SIZE (32 * 1024) extern int amdgpu_vcnfw_log; +extern int amdgpu_sg_display; #define AMDGPU_VM_MAX_NUM_CTX 
4096 #define AMDGPU_SG_THRESHOLD (256*1024*1024) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 7b5ce00f0602..7af3041ccd0e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1220,10 +1220,13 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) * next job actually sees the results from the previous one * before we start executing on the same scheduler ring. */ - if (!s_fence || s_fence->sched != sched) + if (!s_fence || s_fence->sched != sched) { + dma_fence_put(fence); continue; + } r = amdgpu_sync_fence(&p->gang_leader->explicit_sync, fence); + dma_fence_put(fence); if (r) return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2f28a8c02f64..fbf2f24169eb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4268,6 +4268,9 @@ exit: } adev->in_suspend = false; + if (adev->enable_mes) + amdgpu_mes_self_test(adev); + if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) DRM_WARN("smart shift update failed\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index cd4caaa29528..3fe277bc233f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -186,6 +186,7 @@ int amdgpu_num_kcq = -1; int amdgpu_smartshift_bias; int amdgpu_use_xgmi_p2p = 1; int amdgpu_vcnfw_log; +int amdgpu_sg_display = -1; /* auto */ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work); @@ -932,6 +933,16 @@ MODULE_PARM_DESC(vcnfw_log, "Enable vcnfw log(0 = disable (default value), 1 = e module_param_named(vcnfw_log, amdgpu_vcnfw_log, int, 0444); /** + * DOC: sg_display (int) + * Disable S/G (scatter/gather) display (i.e., display from system memory). + * This option is only relevant on APUs. 
Set this option to 0 to disable + * S/G display if you experience flickering or other issues under memory + * pressure and report the issue. + */ +MODULE_PARM_DESC(sg_display, "S/G Display (-1 = auto (default), 0 = disable)"); +module_param_named(sg_display, amdgpu_sg_display, int, 0444); + +/** * DOC: smu_pptable_id (int) * Used to override pptable id. id = 0 use VBIOS pptable. * id > 0 use the soft pptable with specicfied id. diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 00444203220d..faff4a3f96e6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -618,7 +618,13 @@ void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev) if (!ring || !ring->fence_drv.initialized) continue; - if (!ring->no_scheduler) + /* + * Notice we check for sched.ops since there's some + * override on the meaning of sched.ready by amdgpu. + * The natural check would be sched.ready, which is + * set as drm_sched_init() finishes... + */ + if (ring->sched.ops) drm_sched_fini(&ring->sched); for (j = 0; j <= ring->fence_drv.num_fences_mask; ++j) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index f752c7ae7f60..3989e755a5b4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -295,7 +295,7 @@ struct amdgpu_ring { #define amdgpu_ring_parse_cs(r, p, job, ib) ((r)->funcs->parse_cs((p), (job), (ib))) #define amdgpu_ring_patch_cs_in_place(r, p, job, ib) ((r)->funcs->patch_cs_in_place((p), (job), (ib))) #define amdgpu_ring_test_ring(r) (r)->funcs->test_ring((r)) -#define amdgpu_ring_test_ib(r, t) (r)->funcs->test_ib((r), (t)) +#define amdgpu_ring_test_ib(r, t) ((r)->funcs->test_ib ? 
(r)->funcs->test_ib((r), (t)) : 0) #define amdgpu_ring_get_rptr(r) (r)->funcs->get_rptr((r)) #define amdgpu_ring_get_wptr(r) (r)->funcs->get_wptr((r)) #define amdgpu_ring_set_wptr(r) (r)->funcs->set_wptr((r)) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index b5f3bba851db..01e42bdd8e4e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -974,7 +974,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params, trace_amdgpu_vm_update_ptes(params, frag_start, upd_end, min(nptes, 32u), dst, incr, upd_flags, - vm->task_info.pid, + vm->task_info.tgid, vm->immediate.fence_context); amdgpu_vm_pte_update_flags(params, to_amdgpu_bo_vm(pt), cursor.level, pe_start, dst, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index b9b57a66e113..66eb102cd88f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -790,8 +790,8 @@ static void gfx_v11_0_read_wave_data(struct amdgpu_device *adev, uint32_t simd, * zero here */ WARN_ON(simd != 0); - /* type 2 wave data */ - dst[(*no_fields)++] = 2; + /* type 3 wave data */ + dst[(*no_fields)++] = 3; dst[(*no_fields)++] = wave_read_ind(adev, wave, ixSQ_WAVE_STATUS); dst[(*no_fields)++] = wave_read_ind(adev, wave, ixSQ_WAVE_PC_LO); dst[(*no_fields)++] = wave_read_ind(adev, wave, ixSQ_WAVE_PC_HI); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index f202b45c413c..5dde6f82a1ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -6877,7 +6877,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = { .emit_gds_switch = gfx_v9_0_ring_emit_gds_switch, .emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush, .test_ring = gfx_v9_0_ring_test_ring, - .test_ib = gfx_v9_0_ring_test_ib, .insert_nop = amdgpu_ring_insert_nop, .pad_ib = amdgpu_ring_generic_pad_ib, 
.emit_switch_buffer = gfx_v9_ring_emit_sb, diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 5dff79e8f301..1c4787000a5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -1344,7 +1344,7 @@ static int mes_v11_0_late_init(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; /* it's only intended for use in mes_self_test case, not for s0ix and reset */ - if (!amdgpu_in_reset(adev) && !adev->in_s0ix && + if (!amdgpu_in_reset(adev) && !adev->in_s0ix && !adev->in_suspend && (adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))) amdgpu_mes_self_test(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c index 15eb3658d70e..09fdcd20cb91 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c @@ -337,7 +337,13 @@ const struct nbio_hdp_flush_reg nbio_v4_3_hdp_flush_reg = { static void nbio_v4_3_init_registers(struct amdgpu_device *adev) { - return; + if (adev->ip_versions[NBIO_HWIP][0] == IP_VERSION(4, 3, 0)) { + uint32_t data; + + data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF2_STRAP2); + data &= ~RCC_DEV0_EPF2_STRAP2__STRAP_NO_SOFT_RESET_DEV0_F2_MASK; + WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF2_STRAP2, data); + } } static u32 nbio_v4_3_get_rom_offset(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 5562670b7b52..7050238c4c48 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -640,7 +640,10 @@ static int soc21_common_early_init(void *handle) AMD_CG_SUPPORT_GFX_CGCG | AMD_CG_SUPPORT_GFX_CGLS | AMD_CG_SUPPORT_REPEATER_FGCG | - AMD_CG_SUPPORT_GFX_MGCG; + AMD_CG_SUPPORT_GFX_MGCG | + AMD_CG_SUPPORT_HDP_SD | + AMD_CG_SUPPORT_ATHUB_MGCG | + AMD_CG_SUPPORT_ATHUB_LS; adev->pg_flags = AMD_PG_SUPPORT_VCN | AMD_PG_SUPPORT_VCN_DPG | AMD_PG_SUPPORT_JPEG; diff --git 
a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index af37bc6ed1f5..9c7b69d377bd 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1184,24 +1184,38 @@ static void mmhub_read_system_context(struct amdgpu_device *adev, struct dc_phy_ memset(pa_config, 0, sizeof(*pa_config)); - logical_addr_low = min(adev->gmc.fb_start, adev->gmc.agp_start) >> 18; - pt_base = amdgpu_gmc_pd_addr(adev->gart.bo); - - if (adev->apu_flags & AMD_APU_IS_RAVEN2) - /* - * Raven2 has a HW issue that it is unable to use the vram which - * is out of MC_VM_SYSTEM_APERTURE_HIGH_ADDR. So here is the - * workaround that increase system aperture high address (add 1) - * to get rid of the VM fault and hardware hang. - */ - logical_addr_high = max((adev->gmc.fb_end >> 18) + 0x1, adev->gmc.agp_end >> 18); - else - logical_addr_high = max(adev->gmc.fb_end, adev->gmc.agp_end) >> 18; - agp_base = 0; agp_bot = adev->gmc.agp_start >> 24; agp_top = adev->gmc.agp_end >> 24; + /* AGP aperture is disabled */ + if (agp_bot == agp_top) { + logical_addr_low = adev->gmc.vram_start >> 18; + if (adev->apu_flags & AMD_APU_IS_RAVEN2) + /* + * Raven2 has a HW issue that it is unable to use the vram which + * is out of MC_VM_SYSTEM_APERTURE_HIGH_ADDR. So here is the + * workaround that increase system aperture high address (add 1) + * to get rid of the VM fault and hardware hang. + */ + logical_addr_high = (adev->gmc.fb_end >> 18) + 0x1; + else + logical_addr_high = adev->gmc.vram_end >> 18; + } else { + logical_addr_low = min(adev->gmc.fb_start, adev->gmc.agp_start) >> 18; + if (adev->apu_flags & AMD_APU_IS_RAVEN2) + /* + * Raven2 has a HW issue that it is unable to use the vram which + * is out of MC_VM_SYSTEM_APERTURE_HIGH_ADDR. So here is the + * workaround that increase system aperture high address (add 1) + * to get rid of the VM fault and hardware hang. 
+ */ + logical_addr_high = max((adev->gmc.fb_end >> 18) + 0x1, adev->gmc.agp_end >> 18); + else + logical_addr_high = max(adev->gmc.fb_end, adev->gmc.agp_end) >> 18; + } + + pt_base = amdgpu_gmc_pd_addr(adev->gart.bo); page_table_start.high_part = (u32)(adev->gmc.gart_start >> 44) & 0xF; page_table_start.low_part = (u32)(adev->gmc.gart_start >> 12); @@ -1503,6 +1517,8 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) case IP_VERSION(3, 0, 1): case IP_VERSION(3, 1, 2): case IP_VERSION(3, 1, 3): + case IP_VERSION(3, 1, 4): + case IP_VERSION(3, 1, 5): case IP_VERSION(3, 1, 6): init_data.flags.gpu_vm_support = true; break; @@ -1511,6 +1527,9 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) } break; } + if (init_data.flags.gpu_vm_support && + (amdgpu_sg_display == 0)) + init_data.flags.gpu_vm_support = false; if (init_data.flags.gpu_vm_support) adev->mode_info.gpu_vm_support = true; @@ -4501,6 +4520,17 @@ DEVICE_ATTR_WO(s3_debug); static int dm_early_init(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + struct amdgpu_mode_info *mode_info = &adev->mode_info; + struct atom_context *ctx = mode_info->atom_context; + int index = GetIndexIntoMasterTable(DATA, Object_Header); + u16 data_offset; + + /* if there is no object header, skip DM */ + if (!amdgpu_atom_parse_data_header(ctx, index, NULL, NULL, NULL, &data_offset)) { + adev->harvest_ip_mask |= AMD_HARVEST_IP_DMU_MASK; + dev_info(adev->dev, "No object header, skipping DM\n"); + return -ENOENT; + } switch (adev->asic_type) { #if defined(CONFIG_DRM_AMD_DC_SI) @@ -9628,7 +9658,11 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev, * `dcn10_can_pipe_disable_cursor`). By now, all modified planes are in * atomic state, so call drm helper to normalize zpos. 
*/ - drm_atomic_normalize_zpos(dev, state); + ret = drm_atomic_normalize_zpos(dev, state); + if (ret) { + drm_dbg(dev, "drm_atomic_normalize_zpos() failed\n"); + goto fail; + } /* Remove exiting planes if they are modified */ for_each_oldnew_plane_in_state_reverse(state, plane, old_plane_state, new_plane_state, i) { diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c index fe2023f18b7d..8f894c1d1d1e 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c @@ -3626,7 +3626,7 @@ void dcn10_set_cursor_position(struct pipe_ctx *pipe_ctx) (int)hubp->curs_attr.width || pos_cpy.x <= (int)hubp->curs_attr.width + pipe_ctx->plane_state->src_rect.x) { - pos_cpy.x = temp_x + viewport_width; + pos_cpy.x = 2 * viewport_width - temp_x; } } } else { diff --git a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c index f9ea1e86707f..79850a68f62a 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c @@ -874,8 +874,9 @@ static const struct dc_plane_cap plane_cap = { }, // 6:1 downscaling ratio: 1000/6 = 166.666 + // 4:1 downscaling ratio for ARGB888 to prevent underflow during P010 playback: 1000/4 = 250 .max_downscale_factor = { - .argb8888 = 167, + .argb8888 = 250, .nv12 = 167, .fp16 = 167 }, @@ -1763,7 +1764,7 @@ static bool dcn314_resource_construct( pool->base.underlay_pipe_index = NO_UNDERLAY_PIPE; pool->base.pipe_count = pool->base.res_cap->num_timing_generator; pool->base.mpcc_count = pool->base.res_cap->num_timing_generator; - dc->caps.max_downscale_ratio = 600; + dc->caps.max_downscale_ratio = 400; dc->caps.i2c_speed_in_khz = 100; dc->caps.i2c_speed_in_khz_hdcp = 100; dc->caps.max_cursor_size = 256; diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_init.c 
b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_init.c index dc4649458567..a4e9fd5307c6 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_init.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_init.c @@ -94,7 +94,7 @@ static const struct hw_sequencer_funcs dcn32_funcs = { .get_vupdate_offset_from_vsync = dcn10_get_vupdate_offset_from_vsync, .calc_vupdate_position = dcn10_calc_vupdate_position, .apply_idle_power_optimizations = dcn32_apply_idle_power_optimizations, - .does_plane_fit_in_mall = dcn30_does_plane_fit_in_mall, + .does_plane_fit_in_mall = NULL, .set_backlight_level = dcn21_set_backlight_level, .set_abm_immediate_disable = dcn21_set_abm_immediate_disable, .hardware_release = dcn30_hardware_release, diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn314/display_mode_vba_314.c b/drivers/gpu/drm/amd/display/dc/dml/dcn314/display_mode_vba_314.c index 950669f2c10d..cb7c0c878423 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn314/display_mode_vba_314.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn314/display_mode_vba_314.c @@ -3183,7 +3183,7 @@ static void DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerforman } else { v->MIN_DST_Y_NEXT_START[k] = v->VTotal[k] - v->VFrontPorch[k] + v->VTotal[k] - v->VActive[k] - v->VStartup[k]; } - v->MIN_DST_Y_NEXT_START[k] += dml_floor(4.0 * v->TSetup[k] / (double)v->HTotal[k] / v->PixelClock[k], 1.0) / 4.0; + v->MIN_DST_Y_NEXT_START[k] += dml_floor(4.0 * v->TSetup[k] / ((double)v->HTotal[k] / v->PixelClock[k]), 1.0) / 4.0; if (((v->VUpdateOffsetPix[k] + v->VUpdateWidthPix[k] + v->VReadyOffsetPix[k]) / v->HTotal[k]) <= (isInterlaceTiming ? 
dml_floor((v->VTotal[k] - v->VActive[k] - v->VFrontPorch[k] - v->VStartup[k]) / 2.0, 1.0) : diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c index 4a122925c3ae..92c18bfb98b3 100644 --- a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c @@ -532,6 +532,9 @@ enum dmub_status dmub_srv_hw_init(struct dmub_srv *dmub, if (dmub->hw_funcs.reset) dmub->hw_funcs.reset(dmub); + /* reset the cache of the last wptr as well now that hw is reset */ + dmub->inbox1_last_wptr = 0; + cw0.offset.quad_part = inst_fb->gpu_addr; cw0.region.base = DMUB_CW0_BASE; cw0.region.top = cw0.region.base + inst_fb->size - 1; @@ -649,6 +652,15 @@ enum dmub_status dmub_srv_hw_reset(struct dmub_srv *dmub) if (dmub->hw_funcs.reset) dmub->hw_funcs.reset(dmub); + /* mailboxes have been reset in hw, so reset the sw state as well */ + dmub->inbox1_last_wptr = 0; + dmub->inbox1_rb.wrpt = 0; + dmub->inbox1_rb.rptr = 0; + dmub->outbox0_rb.wrpt = 0; + dmub->outbox0_rb.rptr = 0; + dmub->outbox1_rb.wrpt = 0; + dmub->outbox1_rb.rptr = 0; + dmub->hw_init = false; return DMUB_STATUS_OK; diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index 236657eece47..2f3e239e623d 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -1991,6 +1991,8 @@ static int default_attr_update(struct amdgpu_device *adev, struct amdgpu_device_ case IP_VERSION(9, 4, 2): case IP_VERSION(10, 3, 0): case IP_VERSION(11, 0, 0): + case IP_VERSION(11, 0, 1): + case IP_VERSION(11, 0, 2): *states = ATTR_STATE_SUPPORTED; break; default: @@ -2007,14 +2009,16 @@ static int default_attr_update(struct amdgpu_device *adev, struct amdgpu_device_ gc_ver == IP_VERSION(10, 3, 0) || gc_ver == IP_VERSION(10, 1, 2) || gc_ver == IP_VERSION(11, 0, 0) || - gc_ver == IP_VERSION(11, 0, 2))) + gc_ver == IP_VERSION(11, 0, 2) || + gc_ver == IP_VERSION(11, 0, 3))) *states = 
ATTR_STATE_UNSUPPORTED; } else if (DEVICE_ATTR_IS(pp_dpm_dclk)) { if (!(gc_ver == IP_VERSION(10, 3, 1) || gc_ver == IP_VERSION(10, 3, 0) || gc_ver == IP_VERSION(10, 1, 2) || gc_ver == IP_VERSION(11, 0, 0) || - gc_ver == IP_VERSION(11, 0, 2))) + gc_ver == IP_VERSION(11, 0, 2) || + gc_ver == IP_VERSION(11, 0, 3))) *states = ATTR_STATE_UNSUPPORTED; } else if (DEVICE_ATTR_IS(pp_power_profile_mode)) { if (amdgpu_dpm_get_power_profile_mode(adev, NULL) == -EOPNOTSUPP) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index ca3beb5d8f27..6ab155023592 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -1500,6 +1500,20 @@ static int smu_disable_dpms(struct smu_context *smu) } /* + * For SMU 13.0.4/11, PMFW will handle the features disablement properly + * for gpu reset case. Driver involvement is unnecessary. + */ + if (amdgpu_in_reset(adev)) { + switch (adev->ip_versions[MP1_HWIP][0]) { + case IP_VERSION(13, 0, 4): + case IP_VERSION(13, 0, 11): + return 0; + default: + break; + } + } + + /* * For gpu reset, runpm and hibernation through BACO, * BACO feature has to be kept enabled. 
*/ diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_0.h index d6b964cf73bd..4bc7aee4d44f 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_0.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_0.h @@ -123,7 +123,8 @@ (1 << FEATURE_DS_FCLK_BIT) | \ (1 << FEATURE_DS_LCLK_BIT) | \ (1 << FEATURE_DS_DCFCLK_BIT) | \ - (1 << FEATURE_DS_UCLK_BIT)) + (1 << FEATURE_DS_UCLK_BIT) | \ + (1ULL << FEATURE_DS_VCN_BIT)) //For use with feature control messages typedef enum { @@ -522,9 +523,9 @@ typedef enum { TEMP_HOTSPOT_M, TEMP_MEM, TEMP_VR_GFX, - TEMP_VR_SOC, TEMP_VR_MEM0, TEMP_VR_MEM1, + TEMP_VR_SOC, TEMP_VR_U, TEMP_LIQUID0, TEMP_LIQUID1, diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_7.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_7.h index d6b13933a98f..48a3a3952ceb 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_7.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_7.h @@ -113,20 +113,21 @@ #define NUM_FEATURES 64 #define ALLOWED_FEATURE_CTRL_DEFAULT 0xFFFFFFFFFFFFFFFFULL -#define ALLOWED_FEATURE_CTRL_SCPM (1 << FEATURE_DPM_GFXCLK_BIT) | \ - (1 << FEATURE_DPM_GFX_POWER_OPTIMIZER_BIT) | \ - (1 << FEATURE_DPM_UCLK_BIT) | \ - (1 << FEATURE_DPM_FCLK_BIT) | \ - (1 << FEATURE_DPM_SOCCLK_BIT) | \ - (1 << FEATURE_DPM_MP0CLK_BIT) | \ - (1 << FEATURE_DPM_LINK_BIT) | \ - (1 << FEATURE_DPM_DCN_BIT) | \ - (1 << FEATURE_DS_GFXCLK_BIT) | \ - (1 << FEATURE_DS_SOCCLK_BIT) | \ - (1 << FEATURE_DS_FCLK_BIT) | \ - (1 << FEATURE_DS_LCLK_BIT) | \ - (1 << FEATURE_DS_DCFCLK_BIT) | \ - (1 << FEATURE_DS_UCLK_BIT) +#define ALLOWED_FEATURE_CTRL_SCPM ((1 << FEATURE_DPM_GFXCLK_BIT) | \ + (1 << FEATURE_DPM_GFX_POWER_OPTIMIZER_BIT) | \ + (1 << FEATURE_DPM_UCLK_BIT) | \ + (1 << FEATURE_DPM_FCLK_BIT) | \ + (1 << FEATURE_DPM_SOCCLK_BIT) | \ + (1 << FEATURE_DPM_MP0CLK_BIT) | \ + 
(1 << FEATURE_DPM_LINK_BIT) | \ + (1 << FEATURE_DPM_DCN_BIT) | \ + (1 << FEATURE_DS_GFXCLK_BIT) | \ + (1 << FEATURE_DS_SOCCLK_BIT) | \ + (1 << FEATURE_DS_FCLK_BIT) | \ + (1 << FEATURE_DS_LCLK_BIT) | \ + (1 << FEATURE_DS_DCFCLK_BIT) | \ + (1 << FEATURE_DS_UCLK_BIT) | \ + (1ULL << FEATURE_DS_VCN_BIT)) //For use with feature control messages typedef enum { diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h index e8c6febb8b64..992163e66f7b 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h @@ -28,11 +28,11 @@ #define SMU13_DRIVER_IF_VERSION_INV 0xFFFFFFFF #define SMU13_DRIVER_IF_VERSION_YELLOW_CARP 0x04 #define SMU13_DRIVER_IF_VERSION_ALDE 0x08 -#define SMU13_DRIVER_IF_VERSION_SMU_V13_0_0_0 0x34 +#define SMU13_DRIVER_IF_VERSION_SMU_V13_0_0_0 0x37 #define SMU13_DRIVER_IF_VERSION_SMU_V13_0_4 0x07 #define SMU13_DRIVER_IF_VERSION_SMU_V13_0_5 0x04 #define SMU13_DRIVER_IF_VERSION_SMU_V13_0_0_10 0x32 -#define SMU13_DRIVER_IF_VERSION_SMU_V13_0_7 0x35 +#define SMU13_DRIVER_IF_VERSION_SMU_V13_0_7 0x37 #define SMU13_DRIVER_IF_VERSION_SMU_V13_0_10 0x1D #define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500 //500ms diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index cf96c3f2affe..508e392547d7 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -407,6 +407,9 @@ static int smu_v13_0_0_setup_pptable(struct smu_context *smu) struct amdgpu_device *adev = smu->adev; int ret = 0; + if (amdgpu_sriov_vf(smu->adev)) + return 0; + ret = smu_v13_0_0_get_pptable_from_pmfw(smu, &smu_table->power_play_table, &smu_table->power_play_table_size); @@ -1257,6 +1260,9 @@ static int smu_v13_0_0_get_thermal_temperature_range(struct smu_context *smu, table_context->power_play_table; PPTable_t *pptable = smu->smu_table.driver_pptable; + if 
(amdgpu_sriov_vf(smu->adev)) + return 0; + if (!range) return -EINVAL; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c index e87db7e02e8a..9e1967d8049e 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c @@ -124,6 +124,7 @@ static struct cmn2asic_msg_mapping smu_v13_0_7_message_map[SMU_MSG_MAX_COUNT] = MSG_MAP(DFCstateControl, PPSMC_MSG_SetExternalClientDfCstateAllow, 0), MSG_MAP(ArmD3, PPSMC_MSG_ArmD3, 0), MSG_MAP(AllowGpo, PPSMC_MSG_SetGpoAllow, 0), + MSG_MAP(GetPptLimit, PPSMC_MSG_GetPptLimit, 0), }; static struct cmn2asic_mapping smu_v13_0_7_clk_map[SMU_CLK_COUNT] = { diff --git a/drivers/gpu/drm/ast/ast_mode.c b/drivers/gpu/drm/ast/ast_mode.c index c7443317c747..66a4a41c3fe9 100644 --- a/drivers/gpu/drm/ast/ast_mode.c +++ b/drivers/gpu/drm/ast/ast_mode.c @@ -714,7 +714,7 @@ static int ast_primary_plane_init(struct ast_private *ast) struct ast_plane *ast_primary_plane = &ast->primary_plane; struct drm_plane *primary_plane = &ast_primary_plane->base; void __iomem *vaddr = ast->vram; - u64 offset = ast->vram_base; + u64 offset = 0; /* with shmem, the primary plane is always at offset 0 */ unsigned long cursor_size = roundup(AST_HWC_SIZE + AST_HWC_SIGNATURE_SIZE, PAGE_SIZE); unsigned long size = ast->vram_fb_available - cursor_size; int ret; @@ -972,7 +972,7 @@ static int ast_cursor_plane_init(struct ast_private *ast) return -ENOMEM; vaddr = ast->vram + ast->vram_fb_available - size; - offset = ast->vram_base + ast->vram_fb_available - size; + offset = ast->vram_fb_available - size; ret = ast_plane_init(dev, ast_cursor_plane, vaddr, offset, size, 0x01, &ast_cursor_plane_funcs, diff --git a/drivers/gpu/drm/bridge/synopsys/dw-hdmi-i2s-audio.c b/drivers/gpu/drm/bridge/synopsys/dw-hdmi-i2s-audio.c index a2f0860b20bb..d751820c6da6 100644 --- a/drivers/gpu/drm/bridge/synopsys/dw-hdmi-i2s-audio.c +++ 
b/drivers/gpu/drm/bridge/synopsys/dw-hdmi-i2s-audio.c @@ -193,6 +193,7 @@ static int snd_dw_hdmi_probe(struct platform_device *pdev) struct hdmi_codec_pdata pdata; struct platform_device *platform; + memset(&pdata, 0, sizeof(pdata)); pdata.ops = &dw_hdmi_i2s_ops; pdata.i2s = 1; pdata.max_i2s_channels = 8; diff --git a/drivers/gpu/drm/drm_client.c b/drivers/gpu/drm/drm_client.c index fd67efe37c63..056ab9d5f313 100644 --- a/drivers/gpu/drm/drm_client.c +++ b/drivers/gpu/drm/drm_client.c @@ -233,21 +233,17 @@ void drm_client_dev_restore(struct drm_device *dev) static void drm_client_buffer_delete(struct drm_client_buffer *buffer) { - struct drm_device *dev = buffer->client->dev; - if (buffer->gem) { drm_gem_vunmap_unlocked(buffer->gem, &buffer->map); drm_gem_object_put(buffer->gem); } - if (buffer->handle) - drm_mode_destroy_dumb(dev, buffer->handle, buffer->client->file); - kfree(buffer); } static struct drm_client_buffer * -drm_client_buffer_create(struct drm_client_dev *client, u32 width, u32 height, u32 format) +drm_client_buffer_create(struct drm_client_dev *client, u32 width, u32 height, + u32 format, u32 *handle) { const struct drm_format_info *info = drm_format_info(format); struct drm_mode_create_dumb dumb_args = { }; @@ -269,16 +265,15 @@ drm_client_buffer_create(struct drm_client_dev *client, u32 width, u32 height, u if (ret) goto err_delete; - buffer->handle = dumb_args.handle; - buffer->pitch = dumb_args.pitch; - obj = drm_gem_object_lookup(client->file, dumb_args.handle); if (!obj) { ret = -ENOENT; goto err_delete; } + buffer->pitch = dumb_args.pitch; buffer->gem = obj; + *handle = dumb_args.handle; return buffer; @@ -365,7 +360,8 @@ static void drm_client_buffer_rmfb(struct drm_client_buffer *buffer) } static int drm_client_buffer_addfb(struct drm_client_buffer *buffer, - u32 width, u32 height, u32 format) + u32 width, u32 height, u32 format, + u32 handle) { struct drm_client_dev *client = buffer->client; struct drm_mode_fb_cmd fb_req = { }; @@ -377,7 
+373,7 @@ static int drm_client_buffer_addfb(struct drm_client_buffer *buffer, fb_req.depth = info->depth; fb_req.width = width; fb_req.height = height; - fb_req.handle = buffer->handle; + fb_req.handle = handle; fb_req.pitch = buffer->pitch; ret = drm_mode_addfb(client->dev, &fb_req, client->file); @@ -414,13 +410,24 @@ struct drm_client_buffer * drm_client_framebuffer_create(struct drm_client_dev *client, u32 width, u32 height, u32 format) { struct drm_client_buffer *buffer; + u32 handle; int ret; - buffer = drm_client_buffer_create(client, width, height, format); + buffer = drm_client_buffer_create(client, width, height, format, + &handle); if (IS_ERR(buffer)) return buffer; - ret = drm_client_buffer_addfb(buffer, width, height, format); + ret = drm_client_buffer_addfb(buffer, width, height, format, handle); + + /* + * The handle is only needed for creating the framebuffer, destroy it + * again to solve a circular dependency should anybody export the GEM + * object as DMA-buf. The framebuffer and our buffer structure are still + * holding references to the GEM object to prevent its destruction. 
+ */ + drm_mode_destroy_dumb(client->dev, handle, client->file); + if (ret) { drm_client_buffer_delete(buffer); return ERR_PTR(ret); diff --git a/drivers/gpu/drm/i915/display/intel_bios.c b/drivers/gpu/drm/i915/display/intel_bios.c index 572a4e3769f3..a491e6c38875 100644 --- a/drivers/gpu/drm/i915/display/intel_bios.c +++ b/drivers/gpu/drm/i915/display/intel_bios.c @@ -2466,6 +2466,22 @@ static enum port dvo_port_to_port(struct drm_i915_private *i915, dvo_port); } +static enum port +dsi_dvo_port_to_port(struct drm_i915_private *i915, u8 dvo_port) +{ + switch (dvo_port) { + case DVO_PORT_MIPIA: + return PORT_A; + case DVO_PORT_MIPIC: + if (DISPLAY_VER(i915) >= 11) + return PORT_B; + else + return PORT_C; + default: + return PORT_NONE; + } +} + static int parse_bdb_230_dp_max_link_rate(const int vbt_max_link_rate) { switch (vbt_max_link_rate) { @@ -3414,19 +3430,16 @@ bool intel_bios_is_dsi_present(struct drm_i915_private *i915, dvo_port = child->dvo_port; - if (dvo_port == DVO_PORT_MIPIA || - (dvo_port == DVO_PORT_MIPIB && DISPLAY_VER(i915) >= 11) || - (dvo_port == DVO_PORT_MIPIC && DISPLAY_VER(i915) < 11)) { - if (port) - *port = dvo_port - DVO_PORT_MIPIA; - return true; - } else if (dvo_port == DVO_PORT_MIPIB || - dvo_port == DVO_PORT_MIPIC || - dvo_port == DVO_PORT_MIPID) { + if (dsi_dvo_port_to_port(i915, dvo_port) == PORT_NONE) { drm_dbg_kms(&i915->drm, "VBT has unsupported DSI port %c\n", port_name(dvo_port - DVO_PORT_MIPIA)); + continue; } + + if (port) + *port = dsi_dvo_port_to_port(i915, dvo_port); + return true; } return false; @@ -3511,7 +3524,7 @@ bool intel_bios_get_dsc_params(struct intel_encoder *encoder, if (!(child->device_type & DEVICE_TYPE_MIPI_OUTPUT)) continue; - if (child->dvo_port - DVO_PORT_MIPIA == encoder->port) { + if (dsi_dvo_port_to_port(i915, child->dvo_port) == encoder->port) { if (!devdata->dsc) return false; diff --git a/drivers/gpu/drm/i915/display/intel_cdclk.c b/drivers/gpu/drm/i915/display/intel_cdclk.c index 
b74e36d76013..407a477939e5 100644 --- a/drivers/gpu/drm/i915/display/intel_cdclk.c +++ b/drivers/gpu/drm/i915/display/intel_cdclk.c @@ -1319,7 +1319,7 @@ static const struct intel_cdclk_vals adlp_cdclk_table[] = { { .refclk = 24000, .cdclk = 192000, .divider = 2, .ratio = 16 }, { .refclk = 24000, .cdclk = 312000, .divider = 2, .ratio = 26 }, { .refclk = 24000, .cdclk = 552000, .divider = 2, .ratio = 46 }, - { .refclk = 24400, .cdclk = 648000, .divider = 2, .ratio = 54 }, + { .refclk = 24000, .cdclk = 648000, .divider = 2, .ratio = 54 }, { .refclk = 38400, .cdclk = 179200, .divider = 3, .ratio = 14 }, { .refclk = 38400, .cdclk = 192000, .divider = 2, .ratio = 10 }, diff --git a/drivers/gpu/drm/i915/display/intel_fbdev.c b/drivers/gpu/drm/i915/display/intel_fbdev.c index 5575d7abdc09..f76c06b7f1d4 100644 --- a/drivers/gpu/drm/i915/display/intel_fbdev.c +++ b/drivers/gpu/drm/i915/display/intel_fbdev.c @@ -328,8 +328,20 @@ out_unlock: return ret; } +static int intelfb_dirty(struct drm_fb_helper *helper, struct drm_clip_rect *clip) +{ + if (!(clip->x1 < clip->x2 && clip->y1 < clip->y2)) + return 0; + + if (helper->fb->funcs->dirty) + return helper->fb->funcs->dirty(helper->fb, NULL, 0, 0, clip, 1); + + return 0; +} + static const struct drm_fb_helper_funcs intel_fb_helper_funcs = { .fb_probe = intelfb_create, + .fb_dirty = intelfb_dirty, }; static void intel_fbdev_destroy(struct intel_fbdev *ifbdev) diff --git a/drivers/gpu/drm/i915/display/skl_watermark.c b/drivers/gpu/drm/i915/display/skl_watermark.c index e0766d1be966..11554645e6ee 100644 --- a/drivers/gpu/drm/i915/display/skl_watermark.c +++ b/drivers/gpu/drm/i915/display/skl_watermark.c @@ -1587,7 +1587,8 @@ skl_crtc_allocate_plane_ddb(struct intel_atomic_state *state, skl_check_wm_level(&wm->wm[level], ddb); if (icl_need_wm1_wa(i915, plane_id) && - level == 1 && wm->wm[0].enable) { + level == 1 && !wm->wm[level].enable && + wm->wm[0].enable) { wm->wm[level].blocks = wm->wm[0].blocks; wm->wm[level].lines = 
wm->wm[0].lines; wm->wm[level].ignore_lines = wm->wm[0].ignore_lines; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 6250de9b9196..e4b78ab4773b 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -1861,11 +1861,19 @@ static int get_ppgtt(struct drm_i915_file_private *file_priv, vm = ctx->vm; GEM_BUG_ON(!vm); + /* + * Get a reference for the allocated handle. Once the handle is + * visible in the vm_xa table, userspace could try to close it + * from under our feet, so we need to hold the extra reference + * first. + */ + i915_vm_get(vm); + err = xa_alloc(&file_priv->vm_xa, &id, vm, xa_limit_32b, GFP_KERNEL); - if (err) + if (err) { + i915_vm_put(vm); return err; - - i915_vm_get(vm); + } GEM_BUG_ON(id == 0); /* reserved for invalid/unassigned ppgtt */ args->value = id; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index f266b68cf012..0f2e056c02dd 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -3483,6 +3483,13 @@ err_request: eb.composite_fence : &eb.requests[0]->fence); + if (unlikely(eb.gem_context->syncobj)) { + drm_syncobj_replace_fence(eb.gem_context->syncobj, + eb.composite_fence ? + eb.composite_fence : + &eb.requests[0]->fence); + } + if (out_fence) { if (err == 0) { fd_install(out_fence_fd, out_fence->file); @@ -3494,13 +3501,6 @@ err_request: } } - if (unlikely(eb.gem_context->syncobj)) { - drm_syncobj_replace_fence(eb.gem_context->syncobj, - eb.composite_fence ? 
- eb.composite_fence : - &eb.requests[0]->fence); - } - if (!out_fence && eb.composite_fence) dma_fence_put(eb.composite_fence); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 9c759df700ca..937728840428 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -579,7 +579,7 @@ static int shmem_object_init(struct intel_memory_region *mem, mapping_set_gfp_mask(mapping, mask); GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM)); - i915_gem_object_init(obj, &i915_gem_shmem_ops, &lock_class, 0); + i915_gem_object_init(obj, &i915_gem_shmem_ops, &lock_class, flags); obj->mem_flags |= I915_BO_FLAG_STRUCT_PAGE; obj->write_domain = I915_GEM_DOMAIN_CPU; obj->read_domains = I915_GEM_DOMAIN_CPU; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_tiling.c b/drivers/gpu/drm/i915/gem/i915_gem_tiling.c index fd42b89b7162..bc21b1c2350a 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_tiling.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_tiling.c @@ -305,10 +305,6 @@ i915_gem_object_set_tiling(struct drm_i915_gem_object *obj, spin_unlock(&obj->vma.lock); obj->tiling_and_stride = tiling | stride; - i915_gem_object_unlock(obj); - - /* Force the fence to be reacquired for GTT access */ - i915_gem_object_release_mmap_gtt(obj); /* Try to preallocate memory required to save swizzling on put-pages */ if (i915_gem_object_needs_bit17_swizzle(obj)) { @@ -321,6 +317,11 @@ i915_gem_object_set_tiling(struct drm_i915_gem_object *obj, obj->bit_17 = NULL; } + i915_gem_object_unlock(obj); + + /* Force the fence to be reacquired for GTT access */ + i915_gem_object_release_mmap_gtt(obj); + return 0; } diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index e94365b08f1e..2aa63ec521b8 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -528,7 +528,7 @@ retry: return rq; } -struct i915_request 
*intel_context_find_active_request(struct intel_context *ce) +struct i915_request *intel_context_get_active_request(struct intel_context *ce) { struct intel_context *parent = intel_context_to_parent(ce); struct i915_request *rq, *active = NULL; @@ -552,6 +552,8 @@ struct i915_request *intel_context_find_active_request(struct intel_context *ce) active = rq; } + if (active) + active = i915_request_get_rcu(active); spin_unlock_irqrestore(&parent->guc_state.lock, flags); return active; diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h index fb62b7b8cbcd..0a8d553da3f4 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.h +++ b/drivers/gpu/drm/i915/gt/intel_context.h @@ -268,8 +268,7 @@ int intel_context_prepare_remote_request(struct intel_context *ce, struct i915_request *intel_context_create_request(struct intel_context *ce); -struct i915_request * -intel_context_find_active_request(struct intel_context *ce); +struct i915_request *intel_context_get_active_request(struct intel_context *ce); static inline bool intel_context_is_barrier(const struct intel_context *ce) { diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index cbc8b857d5f7..7a4504ea35c3 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -248,8 +248,8 @@ void intel_engine_dump_active_requests(struct list_head *requests, ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now); -struct i915_request * -intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine); +void intel_engine_get_hung_entity(struct intel_engine_cs *engine, + struct intel_context **ce, struct i915_request **rq); u32 intel_engine_context_size(struct intel_gt *gt, u8 class); struct intel_context * diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index c33e0d72d670..d37931e16fd9 100644 --- 
a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -2094,17 +2094,6 @@ static void print_request_ring(struct drm_printer *m, struct i915_request *rq) } } -static unsigned long list_count(struct list_head *list) -{ - struct list_head *pos; - unsigned long count = 0; - - list_for_each(pos, list) - count++; - - return count; -} - static unsigned long read_ul(void *p, size_t x) { return *(unsigned long *)(p + x); @@ -2196,11 +2185,11 @@ void intel_engine_dump_active_requests(struct list_head *requests, } } -static void engine_dump_active_requests(struct intel_engine_cs *engine, struct drm_printer *m) +static void engine_dump_active_requests(struct intel_engine_cs *engine, + struct drm_printer *m) { + struct intel_context *hung_ce = NULL; struct i915_request *hung_rq = NULL; - struct intel_context *ce; - bool guc; /* * No need for an engine->irq_seqno_barrier() before the seqno reads. @@ -2209,27 +2198,22 @@ static void engine_dump_active_requests(struct intel_engine_cs *engine, struct d * But the intention here is just to report an instantaneous snapshot * so that's fine. 
*/ - lockdep_assert_held(&engine->sched_engine->lock); + intel_engine_get_hung_entity(engine, &hung_ce, &hung_rq); drm_printf(m, "\tRequests:\n"); - guc = intel_uc_uses_guc_submission(&engine->gt->uc); - if (guc) { - ce = intel_engine_get_hung_context(engine); - if (ce) - hung_rq = intel_context_find_active_request(ce); - } else { - hung_rq = intel_engine_execlist_find_hung_request(engine); - } - if (hung_rq) engine_dump_request(hung_rq, m, "\t\thung"); + else if (hung_ce) + drm_printf(m, "\t\tGot hung ce but no hung rq!\n"); - if (guc) + if (intel_uc_uses_guc_submission(&engine->gt->uc)) intel_guc_dump_active_requests(engine, hung_rq, m); else - intel_engine_dump_active_requests(&engine->sched_engine->requests, - hung_rq, m); + intel_execlists_dump_active_requests(engine, hung_rq, m); + + if (hung_rq) + i915_request_put(hung_rq); } void intel_engine_dump(struct intel_engine_cs *engine, @@ -2239,7 +2223,6 @@ void intel_engine_dump(struct intel_engine_cs *engine, struct i915_gpu_error * const error = &engine->i915->gpu_error; struct i915_request *rq; intel_wakeref_t wakeref; - unsigned long flags; ktime_t dummy; if (header) { @@ -2276,13 +2259,8 @@ void intel_engine_dump(struct intel_engine_cs *engine, i915_reset_count(error)); print_properties(engine, m); - spin_lock_irqsave(&engine->sched_engine->lock, flags); engine_dump_active_requests(engine, m); - drm_printf(m, "\tOn hold?: %lu\n", - list_count(&engine->sched_engine->hold)); - spin_unlock_irqrestore(&engine->sched_engine->lock, flags); - drm_printf(m, "\tMMIO base: 0x%08x\n", engine->mmio_base); wakeref = intel_runtime_pm_get_if_in_use(engine->uncore->rpm); if (wakeref) { @@ -2328,8 +2306,7 @@ intel_engine_create_virtual(struct intel_engine_cs **siblings, return siblings[0]->cops->create_virtual(siblings, count, flags); } -struct i915_request * -intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine) +static struct i915_request *engine_execlist_find_hung_request(struct intel_engine_cs *engine) 
{ struct i915_request *request, *active = NULL; @@ -2381,6 +2358,33 @@ intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine) return active; } +void intel_engine_get_hung_entity(struct intel_engine_cs *engine, + struct intel_context **ce, struct i915_request **rq) +{ + unsigned long flags; + + *ce = intel_engine_get_hung_context(engine); + if (*ce) { + intel_engine_clear_hung_context(engine); + + *rq = intel_context_get_active_request(*ce); + return; + } + + /* + * Getting here with GuC enabled means it is a forced error capture + * with no actual hang. So, no need to attempt the execlist search. + */ + if (intel_uc_uses_guc_submission(&engine->gt->uc)) + return; + + spin_lock_irqsave(&engine->sched_engine->lock, flags); + *rq = engine_execlist_find_hung_request(engine); + if (*rq) + *rq = i915_request_get_rcu(*rq); + spin_unlock_irqrestore(&engine->sched_engine->lock, flags); +} + void xehp_enable_ccs_engines(struct intel_engine_cs *engine) { /* diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c index 2daffa7c7dfd..21cb5b69d82e 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@ -4148,6 +4148,33 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine, spin_unlock_irqrestore(&sched_engine->lock, flags); } +static unsigned long list_count(struct list_head *list) +{ + struct list_head *pos; + unsigned long count = 0; + + list_for_each(pos, list) + count++; + + return count; +} + +void intel_execlists_dump_active_requests(struct intel_engine_cs *engine, + struct i915_request *hung_rq, + struct drm_printer *m) +{ + unsigned long flags; + + spin_lock_irqsave(&engine->sched_engine->lock, flags); + + intel_engine_dump_active_requests(&engine->sched_engine->requests, hung_rq, m); + + drm_printf(m, "\tOn hold?: %lu\n", + list_count(&engine->sched_engine->hold)); + + 
spin_unlock_irqrestore(&engine->sched_engine->lock, flags); +} + #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) #include "selftest_execlists.c" #endif diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h index a1aa92c983a5..d2c7d45ea062 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h @@ -32,6 +32,10 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine, int indent), unsigned int max); +void intel_execlists_dump_active_requests(struct intel_engine_cs *engine, + struct i915_request *hung_rq, + struct drm_printer *m); + bool intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine); diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index 949c19339015..a0740308555d 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -1355,6 +1355,13 @@ icl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) GAMT_CHKN_BIT_REG, GAMT_CHKN_DISABLE_L3_COH_PIPE); + /* + * Wa_1408615072:icl,ehl (vsunit) + * Wa_1407596294:icl,ehl (hsunit) + */ + wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, + VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS); + /* Wa_1407352427:icl,ehl */ wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, PSDUNIT_CLKGATE_DIS); @@ -2540,13 +2547,6 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) GEN11_ENABLE_32_PLANE_MODE); /* - * Wa_1408615072:icl,ehl (vsunit) - * Wa_1407596294:icl,ehl (hsunit) - */ - wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, - VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS); - - /* * Wa_1408767742:icl[a2..forever],ehl[all] * Wa_1605460711:icl[a0..c0] */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 0a42f1807f52..c10977cb06b9 100644 --- 
a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1702,7 +1702,7 @@ static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t st goto next_context; guilty = false; - rq = intel_context_find_active_request(ce); + rq = intel_context_get_active_request(ce); if (!rq) { head = ce->ring->tail; goto out_replay; @@ -1715,6 +1715,7 @@ static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t st head = intel_ring_wrap(ce->ring, rq->head); __i915_request_reset(rq, guilty); + i915_request_put(rq); out_replay: guc_reset_state(ce, head, guilty); next_context: @@ -4817,6 +4818,8 @@ void intel_guc_find_hung_context(struct intel_engine_cs *engine) xa_lock_irqsave(&guc->context_lookup, flags); xa_for_each(&guc->context_lookup, index, ce) { + bool found; + if (!kref_get_unless_zero(&ce->ref)) continue; @@ -4833,10 +4836,18 @@ void intel_guc_find_hung_context(struct intel_engine_cs *engine) goto next; } + found = false; + spin_lock(&ce->guc_state.lock); list_for_each_entry(rq, &ce->guc_state.requests, sched.link) { if (i915_test_request_state(rq) != I915_REQUEST_ACTIVE) continue; + found = true; + break; + } + spin_unlock(&ce->guc_state.lock); + + if (found) { intel_engine_set_hung_context(engine, ce); /* Can only cope with one hang at a time... 
*/ @@ -4844,6 +4855,7 @@ void intel_guc_find_hung_context(struct intel_engine_cs *engine) xa_lock(&guc->context_lookup); goto done; } + next: intel_context_put(ce); xa_lock(&guc->context_lookup); diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 9d5d5a397b64..b20bd6365615 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1596,43 +1596,20 @@ capture_engine(struct intel_engine_cs *engine, { struct intel_engine_capture_vma *capture = NULL; struct intel_engine_coredump *ee; - struct intel_context *ce; + struct intel_context *ce = NULL; struct i915_request *rq = NULL; - unsigned long flags; ee = intel_engine_coredump_alloc(engine, ALLOW_FAIL, dump_flags); if (!ee) return NULL; - ce = intel_engine_get_hung_context(engine); - if (ce) { - intel_engine_clear_hung_context(engine); - rq = intel_context_find_active_request(ce); - if (!rq || !i915_request_started(rq)) - goto no_request_capture; - } else { - /* - * Getting here with GuC enabled means it is a forced error capture - * with no actual hang. So, no need to attempt the execlist search. 
- */ - if (!intel_uc_uses_guc_submission(&engine->gt->uc)) { - spin_lock_irqsave(&engine->sched_engine->lock, flags); - rq = intel_engine_execlist_find_hung_request(engine); - spin_unlock_irqrestore(&engine->sched_engine->lock, - flags); - } - } - if (rq) - rq = i915_request_get_rcu(rq); - - if (!rq) + intel_engine_get_hung_entity(engine, &ce, &rq); + if (!rq || !i915_request_started(rq)) goto no_request_capture; capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); - if (!capture) { - i915_request_put(rq); + if (!capture) goto no_request_capture; - } if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE) intel_guc_capture_get_matching_node(engine->gt, ee, ce); @@ -1642,6 +1619,8 @@ capture_engine(struct intel_engine_cs *engine, return ee; no_request_capture: + if (rq) + i915_request_put(rq); kfree(ee); return NULL; } diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/fb.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/fb.h index 40768373cdd9..c5a4f49ee206 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/fb.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/fb.h @@ -97,6 +97,7 @@ int gp100_fb_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct n int gp102_fb_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fb **); int gp10b_fb_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fb **); int gv100_fb_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fb **); +int tu102_fb_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fb **); int ga100_fb_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fb **); int ga102_fb_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fb **); diff --git a/drivers/gpu/drm/nouveau/nvkm/core/firmware.c b/drivers/gpu/drm/nouveau/nvkm/core/firmware.c index fcf2a002f6cb..91fb494d4009 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/firmware.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/firmware.c 
@@ -151,6 +151,9 @@ nvkm_firmware_mem_page(struct nvkm_memory *memory) static enum nvkm_memory_target nvkm_firmware_mem_target(struct nvkm_memory *memory) { + if (nvkm_firmware_mem(memory)->device->func->tegra) + return NVKM_MEM_TARGET_NCOH; + return NVKM_MEM_TARGET_HOST; } diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 364fea320cb3..1c81e5b34d29 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2405,7 +2405,7 @@ nv162_chipset = { .bus = { 0x00000001, gf100_bus_new }, .devinit = { 0x00000001, tu102_devinit_new }, .fault = { 0x00000001, tu102_fault_new }, - .fb = { 0x00000001, gv100_fb_new }, + .fb = { 0x00000001, tu102_fb_new }, .fuse = { 0x00000001, gm107_fuse_new }, .gpio = { 0x00000001, gk104_gpio_new }, .gsp = { 0x00000001, gv100_gsp_new }, @@ -2440,7 +2440,7 @@ nv164_chipset = { .bus = { 0x00000001, gf100_bus_new }, .devinit = { 0x00000001, tu102_devinit_new }, .fault = { 0x00000001, tu102_fault_new }, - .fb = { 0x00000001, gv100_fb_new }, + .fb = { 0x00000001, tu102_fb_new }, .fuse = { 0x00000001, gm107_fuse_new }, .gpio = { 0x00000001, gk104_gpio_new }, .gsp = { 0x00000001, gv100_gsp_new }, @@ -2475,7 +2475,7 @@ nv166_chipset = { .bus = { 0x00000001, gf100_bus_new }, .devinit = { 0x00000001, tu102_devinit_new }, .fault = { 0x00000001, tu102_fault_new }, - .fb = { 0x00000001, gv100_fb_new }, + .fb = { 0x00000001, tu102_fb_new }, .fuse = { 0x00000001, gm107_fuse_new }, .gpio = { 0x00000001, gk104_gpio_new }, .gsp = { 0x00000001, gv100_gsp_new }, @@ -2510,7 +2510,7 @@ nv167_chipset = { .bus = { 0x00000001, gf100_bus_new }, .devinit = { 0x00000001, tu102_devinit_new }, .fault = { 0x00000001, tu102_fault_new }, - .fb = { 0x00000001, gv100_fb_new }, + .fb = { 0x00000001, tu102_fb_new }, .fuse = { 0x00000001, gm107_fuse_new }, .gpio = { 0x00000001, gk104_gpio_new }, .gsp = { 0x00000001, gv100_gsp_new }, @@ -2545,7 
+2545,7 @@ nv168_chipset = { .bus = { 0x00000001, gf100_bus_new }, .devinit = { 0x00000001, tu102_devinit_new }, .fault = { 0x00000001, tu102_fault_new }, - .fb = { 0x00000001, gv100_fb_new }, + .fb = { 0x00000001, tu102_fb_new }, .fuse = { 0x00000001, gm107_fuse_new }, .gpio = { 0x00000001, gk104_gpio_new }, .gsp = { 0x00000001, gv100_gsp_new }, diff --git a/drivers/gpu/drm/nouveau/nvkm/falcon/gm200.c b/drivers/gpu/drm/nouveau/nvkm/falcon/gm200.c index 393ade9f7e6c..b7da3ab44c27 100644 --- a/drivers/gpu/drm/nouveau/nvkm/falcon/gm200.c +++ b/drivers/gpu/drm/nouveau/nvkm/falcon/gm200.c @@ -48,6 +48,16 @@ gm200_flcn_pio_dmem_rd(struct nvkm_falcon *falcon, u8 port, const u8 *img, int l img += 4; len -= 4; } + + /* Sigh. Tegra PMU FW's init message... */ + if (len) { + u32 data = nvkm_falcon_rd32(falcon, 0x1c4 + (port * 8)); + + while (len--) { + *(u8 *)img++ = data & 0xff; + data >>= 8; + } + } } static void @@ -64,6 +74,8 @@ gm200_flcn_pio_dmem_wr(struct nvkm_falcon *falcon, u8 port, const u8 *img, int l img += 4; len -= 4; } + + WARN_ON(len); } static void @@ -74,7 +86,7 @@ gm200_flcn_pio_dmem_wr_init(struct nvkm_falcon *falcon, u8 port, bool sec, u32 d const struct nvkm_falcon_func_pio gm200_flcn_dmem_pio = { - .min = 4, + .min = 1, .max = 0x100, .wr_init = gm200_flcn_pio_dmem_wr_init, .wr = gm200_flcn_pio_dmem_wr, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/tu102.c b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/tu102.c index 634f64f88fc8..81a1ad2c88a7 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/tu102.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/tu102.c @@ -65,10 +65,33 @@ tu102_devinit_pll_set(struct nvkm_devinit *init, u32 type, u32 freq) return ret; } +static int +tu102_devinit_wait(struct nvkm_device *device) +{ + unsigned timeout = 50 + 2000; + + do { + if (nvkm_rd32(device, 0x118128) & 0x00000001) { + if ((nvkm_rd32(device, 0x118234) & 0x000000ff) == 0xff) + return 0; + } + + usleep_range(1000, 2000); + } while (timeout--); + + 
return -ETIMEDOUT; +} + int tu102_devinit_post(struct nvkm_devinit *base, bool post) { struct nv50_devinit *init = nv50_devinit(base); + int ret; + + ret = tu102_devinit_wait(init->base.subdev.device); + if (ret) + return ret; + gm200_devinit_preos(init, post); return 0; } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/Kbuild b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/Kbuild index 5d0bab8ecb43..6ba5120a2ebe 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/Kbuild @@ -32,6 +32,7 @@ nvkm-y += nvkm/subdev/fb/gp100.o nvkm-y += nvkm/subdev/fb/gp102.o nvkm-y += nvkm/subdev/fb/gp10b.o nvkm-y += nvkm/subdev/fb/gv100.o +nvkm-y += nvkm/subdev/fb/tu102.o nvkm-y += nvkm/subdev/fb/ga100.o nvkm-y += nvkm/subdev/fb/ga102.o diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ga102.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ga102.c index 8b7c8ea5e8a5..5a21b0ae4595 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ga102.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ga102.c @@ -40,12 +40,6 @@ ga102_fb_vpr_scrub(struct nvkm_fb *fb) return ret; } -static bool -ga102_fb_vpr_scrub_required(struct nvkm_fb *fb) -{ - return (nvkm_rd32(fb->subdev.device, 0x1fa80c) & 0x00000010) != 0; -} - static const struct nvkm_fb_func ga102_fb = { .dtor = gf100_fb_dtor, @@ -56,7 +50,7 @@ ga102_fb = { .sysmem.flush_page_init = gf100_fb_sysmem_flush_page_init, .ram_new = ga102_ram_new, .default_bigpage = 16, - .vpr.scrub_required = ga102_fb_vpr_scrub_required, + .vpr.scrub_required = tu102_fb_vpr_scrub_required, .vpr.scrub = ga102_fb_vpr_scrub, }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c index 1f0126437c1a..0e3c0a8f5d71 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c @@ -49,8 +49,3 @@ gv100_fb_new(struct nvkm_device *device, enum nvkm_subdev_type type, int inst, s } MODULE_FIRMWARE("nvidia/gv100/nvdec/scrubber.bin"); 
-MODULE_FIRMWARE("nvidia/tu102/nvdec/scrubber.bin"); -MODULE_FIRMWARE("nvidia/tu104/nvdec/scrubber.bin"); -MODULE_FIRMWARE("nvidia/tu106/nvdec/scrubber.bin"); -MODULE_FIRMWARE("nvidia/tu116/nvdec/scrubber.bin"); -MODULE_FIRMWARE("nvidia/tu117/nvdec/scrubber.bin"); diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h index ac03eac0f261..f517751f94ac 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h @@ -89,4 +89,6 @@ bool gp102_fb_vpr_scrub_required(struct nvkm_fb *); int gp102_fb_vpr_scrub(struct nvkm_fb *); int gv100_fb_init_page(struct nvkm_fb *); + +bool tu102_fb_vpr_scrub_required(struct nvkm_fb *); #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/tu102.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/tu102.c new file mode 100644 index 000000000000..be82af0364ee --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/tu102.c @@ -0,0 +1,55 @@ +/* + * Copyright 2018 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "gf100.h" +#include "ram.h" + +bool +tu102_fb_vpr_scrub_required(struct nvkm_fb *fb) +{ + return (nvkm_rd32(fb->subdev.device, 0x1fa80c) & 0x00000010) != 0; +} + +static const struct nvkm_fb_func +tu102_fb = { + .dtor = gf100_fb_dtor, + .oneinit = gf100_fb_oneinit, + .init = gm200_fb_init, + .init_page = gv100_fb_init_page, + .init_unkn = gp100_fb_init_unkn, + .sysmem.flush_page_init = gf100_fb_sysmem_flush_page_init, + .vpr.scrub_required = tu102_fb_vpr_scrub_required, + .vpr.scrub = gp102_fb_vpr_scrub, + .ram_new = gp100_ram_new, + .default_bigpage = 16, +}; + +int +tu102_fb_new(struct nvkm_device *device, enum nvkm_subdev_type type, int inst, struct nvkm_fb **pfb) +{ + return gp102_fb_new_(&tu102_fb, device, type, inst, pfb); +} + +MODULE_FIRMWARE("nvidia/tu102/nvdec/scrubber.bin"); +MODULE_FIRMWARE("nvidia/tu104/nvdec/scrubber.bin"); +MODULE_FIRMWARE("nvidia/tu106/nvdec/scrubber.bin"); +MODULE_FIRMWARE("nvidia/tu116/nvdec/scrubber.bin"); +MODULE_FIRMWARE("nvidia/tu117/nvdec/scrubber.bin"); diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c index a72403777329..2ed04da3621d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gm20b.c @@ -225,7 +225,7 @@ gm20b_pmu_init(struct nvkm_pmu *pmu) pmu->initmsg_received = false; - nvkm_falcon_load_dmem(falcon, &args, addr_args, sizeof(args), 0); + nvkm_falcon_pio_wr(falcon, (u8 *)&args, 0, 0, DMEM, addr_args, sizeof(args), 0, false); nvkm_falcon_start(falcon); return 0; } diff --git a/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c b/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c index 857a2f0420d7..c924f1124ebc 100644 --- 
a/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c +++ b/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c @@ -1193,14 +1193,11 @@ static int boe_panel_enter_sleep_mode(struct boe_panel *boe) return 0; } -static int boe_panel_unprepare(struct drm_panel *panel) +static int boe_panel_disable(struct drm_panel *panel) { struct boe_panel *boe = to_boe_panel(panel); int ret; - if (!boe->prepared) - return 0; - ret = boe_panel_enter_sleep_mode(boe); if (ret < 0) { dev_err(panel->dev, "failed to set panel off: %d\n", ret); @@ -1209,6 +1206,16 @@ static int boe_panel_unprepare(struct drm_panel *panel) msleep(150); + return 0; +} + +static int boe_panel_unprepare(struct drm_panel *panel) +{ + struct boe_panel *boe = to_boe_panel(panel); + + if (!boe->prepared) + return 0; + if (boe->desc->discharge_on_disable) { regulator_disable(boe->avee); regulator_disable(boe->avdd); @@ -1528,6 +1535,7 @@ static enum drm_panel_orientation boe_panel_get_orientation(struct drm_panel *pa } static const struct drm_panel_funcs boe_panel_funcs = { + .disable = boe_panel_disable, .unprepare = boe_panel_unprepare, .prepare = boe_panel_prepare, .enable = boe_panel_enable, diff --git a/drivers/gpu/drm/solomon/ssd130x.c b/drivers/gpu/drm/solomon/ssd130x.c index 53464afc2b9a..91f69e62430b 100644 --- a/drivers/gpu/drm/solomon/ssd130x.c +++ b/drivers/gpu/drm/solomon/ssd130x.c @@ -656,18 +656,8 @@ static const struct drm_crtc_helper_funcs ssd130x_crtc_helper_funcs = { .atomic_check = drm_crtc_helper_atomic_check, }; -static void ssd130x_crtc_reset(struct drm_crtc *crtc) -{ - struct drm_device *drm = crtc->dev; - struct ssd130x_device *ssd130x = drm_to_ssd130x(drm); - - ssd130x_init(ssd130x); - - drm_atomic_helper_crtc_reset(crtc); -} - static const struct drm_crtc_funcs ssd130x_crtc_funcs = { - .reset = ssd130x_crtc_reset, + .reset = drm_atomic_helper_crtc_reset, .destroy = drm_crtc_cleanup, .set_config = drm_atomic_helper_set_config, .page_flip = drm_atomic_helper_page_flip, @@ -686,6 +676,12 @@ static 
void ssd130x_encoder_helper_atomic_enable(struct drm_encoder *encoder, if (ret) return; + ret = ssd130x_init(ssd130x); + if (ret) { + ssd130x_power_off(ssd130x); + return; + } + ssd130x_write_cmd(ssd130x, 1, SSD130X_DISPLAY_ON); backlight_enable(ssd130x->bl_dev); diff --git a/drivers/gpu/drm/vc4/vc4_crtc.c b/drivers/gpu/drm/vc4/vc4_crtc.c index 0108613e79d5..7258975331ca 100644 --- a/drivers/gpu/drm/vc4/vc4_crtc.c +++ b/drivers/gpu/drm/vc4/vc4_crtc.c @@ -711,7 +711,7 @@ static int vc4_crtc_atomic_check(struct drm_crtc *crtc, struct vc4_encoder *vc4_encoder = to_vc4_encoder(encoder); if (vc4_encoder->type == VC4_ENCODER_TYPE_HDMI0) { - vc4_state->hvs_load = max(mode->clock * mode->hdisplay / mode->htotal + 1000, + vc4_state->hvs_load = max(mode->clock * mode->hdisplay / mode->htotal + 8000, mode->clock * 9 / 10) * 1000; } else { vc4_state->hvs_load = mode->clock * 1000; diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c index 12a00d644b61..7546103f1499 100644 --- a/drivers/gpu/drm/vc4/vc4_hdmi.c +++ b/drivers/gpu/drm/vc4/vc4_hdmi.c @@ -97,6 +97,10 @@ #define VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_1_SHIFT 8 #define VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_1_MASK VC4_MASK(15, 8) +#define VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_0_MASK VC4_MASK(7, 0) +#define VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_0_SET_AVMUTE BIT(0) +#define VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_0_CLEAR_AVMUTE BIT(4) + # define VC4_HD_M_SW_RST BIT(2) # define VC4_HD_M_ENABLE BIT(0) @@ -1306,7 +1310,6 @@ static void vc5_hdmi_set_timings(struct vc4_hdmi *vc4_hdmi, VC4_HDMI_VERTB_VBP)); unsigned long flags; unsigned char gcp; - bool gcp_en; u32 reg; int idx; @@ -1341,16 +1344,13 @@ static void vc5_hdmi_set_timings(struct vc4_hdmi *vc4_hdmi, switch (vc4_state->output_bpc) { case 12: gcp = 6; - gcp_en = true; break; case 10: gcp = 5; - gcp_en = true; break; case 8: default: - gcp = 4; - gcp_en = false; + gcp = 0; break; } @@ -1359,8 +1359,7 @@ static void vc5_hdmi_set_timings(struct 
vc4_hdmi *vc4_hdmi, * doesn't signal in GCP. */ if (vc4_state->output_format == VC4_HDMI_OUTPUT_YUV422) { - gcp = 4; - gcp_en = false; + gcp = 0; } reg = HDMI_READ(HDMI_DEEP_COLOR_CONFIG_1); @@ -1373,11 +1372,12 @@ static void vc5_hdmi_set_timings(struct vc4_hdmi *vc4_hdmi, reg = HDMI_READ(HDMI_GCP_WORD_1); reg &= ~VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_1_MASK; reg |= VC4_SET_FIELD(gcp, VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_1); + reg &= ~VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_0_MASK; + reg |= VC5_HDMI_GCP_WORD_1_GCP_SUBPACKET_BYTE_0_CLEAR_AVMUTE; HDMI_WRITE(HDMI_GCP_WORD_1, reg); reg = HDMI_READ(HDMI_GCP_CONFIG); - reg &= ~VC5_HDMI_GCP_CONFIG_GCP_ENABLE; - reg |= gcp_en ? VC5_HDMI_GCP_CONFIG_GCP_ENABLE : 0; + reg |= VC5_HDMI_GCP_CONFIG_GCP_ENABLE; HDMI_WRITE(HDMI_GCP_CONFIG, reg); reg = HDMI_READ(HDMI_MISC_CONTROL); @@ -3018,7 +3018,8 @@ static int vc4_hdmi_cec_init(struct vc4_hdmi *vc4_hdmi) } vc4_hdmi->cec_adap = cec_allocate_adapter(&vc4_hdmi_cec_adap_ops, - vc4_hdmi, "vc4", + vc4_hdmi, + vc4_hdmi->variant->card_name, CEC_CAP_DEFAULTS | CEC_CAP_CONNECTOR_INFO, 1); ret = PTR_ERR_OR_ZERO(vc4_hdmi->cec_adap); diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c index 8b92a45a3c89..bd5acc4a8687 100644 --- a/drivers/gpu/drm/vc4/vc4_plane.c +++ b/drivers/gpu/drm/vc4/vc4_plane.c @@ -340,7 +340,7 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state) { struct vc4_plane_state *vc4_state = to_vc4_plane_state(state); struct drm_framebuffer *fb = state->fb; - struct drm_gem_dma_object *bo = drm_fb_dma_get_gem_obj(fb, 0); + struct drm_gem_dma_object *bo; int num_planes = fb->format->num_planes; struct drm_crtc_state *crtc_state; u32 h_subsample = fb->format->hsub; @@ -359,8 +359,10 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state) if (ret) return ret; - for (i = 0; i < num_planes; i++) + for (i = 0; i < num_planes; i++) { + bo = drm_fb_dma_get_gem_obj(fb, i); vc4_state->offsets[i] = 
bo->dma_addr + fb->offsets[i]; + } /* * We don't support subpixel source positioning for scaling, diff --git a/drivers/gpu/drm/virtio/virtgpu_ioctl.c b/drivers/gpu/drm/virtio/virtgpu_ioctl.c index 9f4a90493aea..da45215a933d 100644 --- a/drivers/gpu/drm/virtio/virtgpu_ioctl.c +++ b/drivers/gpu/drm/virtio/virtgpu_ioctl.c @@ -126,7 +126,6 @@ static int virtio_gpu_execbuffer_ioctl(struct drm_device *dev, void *data, void __user *user_bo_handles = NULL; struct virtio_gpu_object_array *buflist = NULL; struct sync_file *sync_file; - int in_fence_fd = exbuf->fence_fd; int out_fence_fd = -1; void *buf; uint64_t fence_ctx; @@ -152,13 +151,11 @@ static int virtio_gpu_execbuffer_ioctl(struct drm_device *dev, void *data, ring_idx = exbuf->ring_idx; } - exbuf->fence_fd = -1; - virtio_gpu_create_context(dev, file); if (exbuf->flags & VIRTGPU_EXECBUF_FENCE_FD_IN) { struct dma_fence *in_fence; - in_fence = sync_file_get_fence(in_fence_fd); + in_fence = sync_file_get_fence(exbuf->fence_fd); if (!in_fence) return -EINVAL; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c index aa1cd5126a32..4dcf2eb7aa80 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c @@ -462,6 +462,9 @@ int vmw_bo_create(struct vmw_private *vmw, return -ENOMEM; } + /* + * vmw_bo_init will delete the *p_bo object if it fails + */ ret = vmw_bo_init(vmw, *p_bo, size, placement, interruptible, pin, bo_free); @@ -470,7 +473,6 @@ int vmw_bo_create(struct vmw_private *vmw, return ret; out_error: - kfree(*p_bo); *p_bo = NULL; return ret; } @@ -596,6 +598,7 @@ static int vmw_user_bo_synccpu_release(struct drm_file *filp, ttm_bo_put(&vmw_bo->base); } + drm_gem_object_put(&vmw_bo->base.base); return ret; } @@ -636,6 +639,7 @@ int vmw_user_bo_synccpu_ioctl(struct drm_device *dev, void *data, ret = vmw_user_bo_synccpu_grab(vbo, arg->flags); vmw_bo_unreference(&vbo); + drm_gem_object_put(&vbo->base.base); if (unlikely(ret != 0)) { if (ret == -ERESTARTSYS 
|| ret == -EBUSY) return -EBUSY; @@ -693,7 +697,7 @@ int vmw_bo_unref_ioctl(struct drm_device *dev, void *data, * struct vmw_buffer_object should be placed. * Return: Zero on success, Negative error code on error. * - * The vmw buffer object pointer will be refcounted. + * The vmw buffer object pointer will be refcounted (both ttm and gem) */ int vmw_user_bo_lookup(struct drm_file *filp, uint32_t handle, @@ -710,7 +714,6 @@ int vmw_user_bo_lookup(struct drm_file *filp, *out = gem_to_vmw_bo(gobj); ttm_bo_get(&(*out)->base); - drm_gem_object_put(gobj); return 0; } @@ -791,7 +794,8 @@ int vmw_dumb_create(struct drm_file *file_priv, ret = vmw_gem_object_create_with_handle(dev_priv, file_priv, args->size, &args->handle, &vbo); - + /* drop reference from allocate - handle holds it now */ + drm_gem_object_put(&vbo->base.base); return ret; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c index a44d53e33cdb..c0686283ffd1 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c @@ -1160,6 +1160,7 @@ static int vmw_translate_mob_ptr(struct vmw_private *dev_priv, } ret = vmw_validation_add_bo(sw_context->ctx, vmw_bo, true, false); ttm_bo_put(&vmw_bo->base); + drm_gem_object_put(&vmw_bo->base.base); if (unlikely(ret != 0)) return ret; @@ -1214,6 +1215,7 @@ static int vmw_translate_guest_ptr(struct vmw_private *dev_priv, } ret = vmw_validation_add_bo(sw_context->ctx, vmw_bo, false, false); ttm_bo_put(&vmw_bo->base); + drm_gem_object_put(&vmw_bo->base.base); if (unlikely(ret != 0)) return ret; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c b/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c index ce609e7d758f..4d2c28e39f4e 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c @@ -146,14 +146,12 @@ int vmw_gem_object_create_with_handle(struct vmw_private *dev_priv, &vmw_sys_placement : &vmw_vram_sys_placement, true, false, &vmw_gem_destroy, p_vbo); - - 
(*p_vbo)->base.base.funcs = &vmw_gem_object_funcs; if (ret != 0) goto out_no_bo; + (*p_vbo)->base.base.funcs = &vmw_gem_object_funcs; + ret = drm_gem_handle_create(filp, &(*p_vbo)->base.base, handle); - /* drop reference from allocate - handle holds it now */ - drm_gem_object_put(&(*p_vbo)->base.base); out_no_bo: return ret; } @@ -180,6 +178,8 @@ int vmw_gem_object_create_ioctl(struct drm_device *dev, void *data, rep->map_handle = drm_vma_node_offset_addr(&vbo->base.base.vma_node); rep->cur_gmr_id = handle; rep->cur_gmr_offset = 0; + /* drop reference from allocate - handle holds it now */ + drm_gem_object_put(&vbo->base.base); out_no_bo: return ret; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 257f090071f1..445d619e1fdc 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -1815,8 +1815,10 @@ static struct drm_framebuffer *vmw_kms_fb_create(struct drm_device *dev, err_out: /* vmw_user_lookup_handle takes one ref so does new_fb */ - if (bo) + if (bo) { vmw_bo_unreference(&bo); + drm_gem_object_put(&bo->base.base); + } if (surface) vmw_surface_unreference(&surface); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c b/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c index e9f5c89b4ca6..b5b311f2a91a 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c @@ -458,6 +458,7 @@ int vmw_overlay_ioctl(struct drm_device *dev, void *data, ret = vmw_overlay_update_stream(dev_priv, buf, arg, true); vmw_bo_unreference(&buf); + drm_gem_object_put(&buf->base.base); out_unlock: mutex_unlock(&overlay->mutex); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c b/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c index 108a496b5d18..51e83dfa1cac 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c @@ -807,6 +807,7 @@ static int vmw_shader_define(struct drm_device *dev, struct drm_file *file_priv, num_output_sig, tfile, 
shader_handle); out_bad_arg: vmw_bo_unreference(&buffer); + drm_gem_object_put(&buffer->base.base); return ret; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c index 3bc63ae768f3..dcfb003841b3 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c @@ -683,7 +683,7 @@ static void vmw_user_surface_base_release(struct ttm_base_object **p_base) container_of(base, struct vmw_user_surface, prime.base); struct vmw_resource *res = &user_srf->srf.res; - if (base->shareable && res && res->backup) + if (res && res->backup) drm_gem_object_put(&res->backup->base.base); *p_base = NULL; @@ -864,7 +864,11 @@ int vmw_surface_define_ioctl(struct drm_device *dev, void *data, goto out_unlock; } vmw_bo_reference(res->backup); - drm_gem_object_get(&res->backup->base.base); + /* + * We don't expose the handle to the userspace and surface + * already holds a gem reference + */ + drm_gem_handle_delete(file_priv, backup_handle); } tmp = vmw_resource_reference(&srf->res); @@ -1568,8 +1572,6 @@ vmw_gb_surface_define_internal(struct drm_device *dev, drm_vma_node_offset_addr(&res->backup->base.base.vma_node); rep->buffer_size = res->backup->base.base.size; rep->buffer_handle = backup_handle; - if (user_srf->prime.base.shareable) - drm_gem_object_get(&res->backup->base.base); } else { rep->buffer_map_handle = 0; rep->buffer_size = 0; diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_client.c b/drivers/hid/amd-sfh-hid/amd_sfh_client.c index 1fb0f7105fb2..c751d12f5df8 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_client.c +++ b/drivers/hid/amd-sfh-hid/amd_sfh_client.c @@ -227,6 +227,7 @@ int amd_sfh_hid_client_init(struct amd_mp2_dev *privdata) cl_data->num_hid_devices = amd_mp2_get_sensor_num(privdata, &cl_data->sensor_idx[0]); if (cl_data->num_hid_devices == 0) return -ENODEV; + cl_data->is_any_sensor_enabled = false; INIT_DELAYED_WORK(&cl_data->work, amd_sfh_work); INIT_DELAYED_WORK(&cl_data->work_buffer, 
amd_sfh_work_buffer); @@ -287,6 +288,7 @@ int amd_sfh_hid_client_init(struct amd_mp2_dev *privdata) status = amd_sfh_wait_for_response (privdata, cl_data->sensor_idx[i], SENSOR_ENABLED); if (status == SENSOR_ENABLED) { + cl_data->is_any_sensor_enabled = true; cl_data->sensor_sts[i] = SENSOR_ENABLED; rc = amdtp_hid_probe(cl_data->cur_hid_dev, cl_data); if (rc) { @@ -301,19 +303,26 @@ int amd_sfh_hid_client_init(struct amd_mp2_dev *privdata) cl_data->sensor_sts[i]); goto cleanup; } + } else { + cl_data->sensor_sts[i] = SENSOR_DISABLED; + dev_dbg(dev, "sid 0x%x (%s) status 0x%x\n", + cl_data->sensor_idx[i], + get_sensor_name(cl_data->sensor_idx[i]), + cl_data->sensor_sts[i]); } dev_dbg(dev, "sid 0x%x (%s) status 0x%x\n", cl_data->sensor_idx[i], get_sensor_name(cl_data->sensor_idx[i]), cl_data->sensor_sts[i]); } - if (mp2_ops->discovery_status && mp2_ops->discovery_status(privdata) == 0) { + if (!cl_data->is_any_sensor_enabled || + (mp2_ops->discovery_status && mp2_ops->discovery_status(privdata) == 0)) { amd_sfh_hid_client_deinit(privdata); for (i = 0; i < cl_data->num_hid_devices; i++) { devm_kfree(dev, cl_data->feature_report[i]); devm_kfree(dev, in_data->input_report[i]); devm_kfree(dev, cl_data->report_descr[i]); } - dev_warn(dev, "Failed to discover, sensors not enabled\n"); + dev_warn(dev, "Failed to discover, sensors not enabled is %d\n", cl_data->is_any_sensor_enabled); return -EOPNOTSUPP; } schedule_delayed_work(&cl_data->work_buffer, msecs_to_jiffies(AMD_SFH_IDLE_LOOP)); diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_hid.h b/drivers/hid/amd-sfh-hid/amd_sfh_hid.h index 3754fb423e3a..528036892c9d 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_hid.h +++ b/drivers/hid/amd-sfh-hid/amd_sfh_hid.h @@ -32,6 +32,7 @@ struct amd_input_data { struct amdtp_cl_data { u8 init_done; u32 cur_hid_dev; + bool is_any_sensor_enabled; u32 hid_dev_count; u32 num_hid_devices; struct device_info *hid_devices; diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 
3e1803592bd4..5c72aef3d3dd 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1202,6 +1202,7 @@ int hid_open_report(struct hid_device *device) __u8 *end; __u8 *next; int ret; + int i; static int (*dispatch_type[])(struct hid_parser *parser, struct hid_item *item) = { hid_parser_main, @@ -1252,6 +1253,8 @@ int hid_open_report(struct hid_device *device) goto err; } device->collection_size = HID_DEFAULT_NUM_COLLECTIONS; + for (i = 0; i < HID_DEFAULT_NUM_COLLECTIONS; i++) + device->collection[i].parent_idx = -1; ret = -EINVAL; while ((next = fetch_item(start, end, &item)) != NULL) { diff --git a/drivers/hid/hid-elecom.c b/drivers/hid/hid-elecom.c index e59e9911fc37..4fa45ee77503 100644 --- a/drivers/hid/hid-elecom.c +++ b/drivers/hid/hid-elecom.c @@ -12,6 +12,7 @@ * Copyright (c) 2017 Alex Manoussakis <amanou@gnu.org> * Copyright (c) 2017 Tomasz Kramkowski <tk@the-tk.com> * Copyright (c) 2020 YOSHIOKA Takuma <lo48576@hard-wi.red> + * Copyright (c) 2022 Takahiro Fujii <fujii@xaxxi.net> */ /* @@ -89,7 +90,7 @@ static __u8 *elecom_report_fixup(struct hid_device *hdev, __u8 *rdesc, case USB_DEVICE_ID_ELECOM_M_DT1URBK: case USB_DEVICE_ID_ELECOM_M_DT1DRBK: case USB_DEVICE_ID_ELECOM_M_HT1URBK: - case USB_DEVICE_ID_ELECOM_M_HT1DRBK: + case USB_DEVICE_ID_ELECOM_M_HT1DRBK_010D: /* * Report descriptor format: * 12: button bit count @@ -99,6 +100,16 @@ static __u8 *elecom_report_fixup(struct hid_device *hdev, __u8 *rdesc, */ mouse_button_fixup(hdev, rdesc, *rsize, 12, 30, 14, 20, 8); break; + case USB_DEVICE_ID_ELECOM_M_HT1DRBK_011C: + /* + * Report descriptor format: + * 22: button bit count + * 30: padding bit count + * 24: button report size + * 16: button usage maximum + */ + mouse_button_fixup(hdev, rdesc, *rsize, 22, 30, 24, 16, 8); + break; } return rdesc; } @@ -112,7 +123,8 @@ static const struct hid_device_id elecom_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1URBK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, 
USB_DEVICE_ID_ELECOM_M_DT1DRBK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1URBK) }, - { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1DRBK) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1DRBK_010D) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1DRBK_011C) }, { } }; MODULE_DEVICE_TABLE(hid, elecom_devices); diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 0f8c11842a3a..9e36b4cd905e 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -413,6 +413,8 @@ #define I2C_DEVICE_ID_HP_ENVY_X360_15T_DR100 0x29CF #define I2C_DEVICE_ID_HP_ENVY_X360_EU0009NV 0x2CF9 #define I2C_DEVICE_ID_HP_SPECTRE_X360_15 0x2817 +#define I2C_DEVICE_ID_HP_SPECTRE_X360_13_AW0020NG 0x29DF +#define I2C_DEVICE_ID_ASUS_TP420IA_TOUCHSCREEN 0x2BC8 #define USB_DEVICE_ID_ASUS_UX550VE_TOUCHSCREEN 0x2544 #define USB_DEVICE_ID_ASUS_UX550_TOUCHSCREEN 0x2706 #define I2C_DEVICE_ID_SURFACE_GO_TOUCHSCREEN 0x261A @@ -428,7 +430,8 @@ #define USB_DEVICE_ID_ELECOM_M_DT1URBK 0x00fe #define USB_DEVICE_ID_ELECOM_M_DT1DRBK 0x00ff #define USB_DEVICE_ID_ELECOM_M_HT1URBK 0x010c -#define USB_DEVICE_ID_ELECOM_M_HT1DRBK 0x010d +#define USB_DEVICE_ID_ELECOM_M_HT1DRBK_010D 0x010d +#define USB_DEVICE_ID_ELECOM_M_HT1DRBK_011C 0x011c #define USB_VENDOR_ID_DREAM_CHEEKY 0x1d34 #define USB_DEVICE_ID_DREAM_CHEEKY_WN 0x0004 diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 9b59e436df0a..77c8c49852b5 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -370,6 +370,8 @@ static const struct hid_device_id hid_battery_quirks[] = { { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_DINOVO_EDGE_KBD), HID_BATTERY_QUIRK_IGNORE }, + { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_ASUS_TP420IA_TOUCHSCREEN), + HID_BATTERY_QUIRK_IGNORE }, { HID_USB_DEVICE(USB_VENDOR_ID_ELAN, USB_DEVICE_ID_ASUS_UX550_TOUCHSCREEN), HID_BATTERY_QUIRK_IGNORE }, { HID_USB_DEVICE(USB_VENDOR_ID_ELAN, 
USB_DEVICE_ID_ASUS_UX550VE_TOUCHSCREEN), @@ -384,6 +386,8 @@ static const struct hid_device_id hid_battery_quirks[] = { HID_BATTERY_QUIRK_IGNORE }, { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_HP_SPECTRE_X360_15), HID_BATTERY_QUIRK_IGNORE }, + { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_HP_SPECTRE_X360_13_AW0020NG), + HID_BATTERY_QUIRK_IGNORE }, { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_SURFACE_GO_TOUCHSCREEN), HID_BATTERY_QUIRK_IGNORE }, { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_SURFACE_GO2_TOUCHSCREEN), diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c index abf2c95e4d0b..9c1ee8e91e0c 100644 --- a/drivers/hid/hid-logitech-hidpp.c +++ b/drivers/hid/hid-logitech-hidpp.c @@ -3978,7 +3978,8 @@ static void hidpp_connect_event(struct hidpp_device *hidpp) } hidpp_initialize_battery(hidpp); - hidpp_initialize_hires_scroll(hidpp); + if (!hid_is_usb(hidpp->hid_dev)) + hidpp_initialize_hires_scroll(hidpp); /* forward current battery state */ if (hidpp->capabilities & HIDPP_CAPABILITY_HIDPP10_BATTERY) { diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index be3ad02573de..5bc91f68b374 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -393,7 +393,8 @@ static const struct hid_device_id hid_have_special_driver[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1URBK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1DRBK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1URBK) }, - { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1DRBK) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1DRBK_010D) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1DRBK_011C) }, #endif #if IS_ENABLED(CONFIG_HID_ELO) { HID_USB_DEVICE(USB_VENDOR_ID_ELO, 0x0009) }, diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index cbe43e2567a7..64ac5bdee3a6 100644 --- a/drivers/hv/hv_balloon.c 
+++ b/drivers/hv/hv_balloon.c @@ -1963,7 +1963,7 @@ static void hv_balloon_debugfs_init(struct hv_dynmem_device *b) static void hv_balloon_debugfs_exit(struct hv_dynmem_device *b) { - debugfs_remove(debugfs_lookup("hv-balloon", NULL)); + debugfs_lookup_and_remove("hv-balloon", NULL); } #else diff --git a/drivers/iio/accel/hid-sensor-accel-3d.c b/drivers/iio/accel/hid-sensor-accel-3d.c index a2def6f9380a..5eac7ea19993 100644 --- a/drivers/iio/accel/hid-sensor-accel-3d.c +++ b/drivers/iio/accel/hid-sensor-accel-3d.c @@ -280,6 +280,7 @@ static int accel_3d_capture_sample(struct hid_sensor_hub_device *hsdev, hid_sensor_convert_timestamp( &accel_state->common_attributes, *(int64_t *)raw_data); + ret = 0; break; default: break; diff --git a/drivers/iio/adc/berlin2-adc.c b/drivers/iio/adc/berlin2-adc.c index 3d2e8b4db61a..a4e7c7eff5ac 100644 --- a/drivers/iio/adc/berlin2-adc.c +++ b/drivers/iio/adc/berlin2-adc.c @@ -298,8 +298,10 @@ static int berlin2_adc_probe(struct platform_device *pdev) int ret; indio_dev = devm_iio_device_alloc(&pdev->dev, sizeof(*priv)); - if (!indio_dev) + if (!indio_dev) { + of_node_put(parent_np); return -ENOMEM; + } priv = iio_priv(indio_dev); diff --git a/drivers/iio/adc/imx8qxp-adc.c b/drivers/iio/adc/imx8qxp-adc.c index 36777b827165..f5a0fc9e64c5 100644 --- a/drivers/iio/adc/imx8qxp-adc.c +++ b/drivers/iio/adc/imx8qxp-adc.c @@ -86,6 +86,8 @@ #define IMX8QXP_ADC_TIMEOUT msecs_to_jiffies(100) +#define IMX8QXP_ADC_MAX_FIFO_SIZE 16 + struct imx8qxp_adc { struct device *dev; void __iomem *regs; @@ -95,6 +97,7 @@ struct imx8qxp_adc { /* Serialise ADC channel reads */ struct mutex lock; struct completion completion; + u32 fifo[IMX8QXP_ADC_MAX_FIFO_SIZE]; }; #define IMX8QXP_ADC_CHAN(_idx) { \ @@ -238,8 +241,7 @@ static int imx8qxp_adc_read_raw(struct iio_dev *indio_dev, return ret; } - *val = FIELD_GET(IMX8QXP_ADC_RESFIFO_VAL_MASK, - readl(adc->regs + IMX8QXP_ADR_ADC_RESFIFO)); + *val = adc->fifo[0]; mutex_unlock(&adc->lock); return IIO_VAL_INT; @@ 
-265,10 +267,15 @@ static irqreturn_t imx8qxp_adc_isr(int irq, void *dev_id) { struct imx8qxp_adc *adc = dev_id; u32 fifo_count; + int i; fifo_count = FIELD_GET(IMX8QXP_ADC_FCTRL_FCOUNT_MASK, readl(adc->regs + IMX8QXP_ADR_ADC_FCTRL)); + for (i = 0; i < fifo_count; i++) + adc->fifo[i] = FIELD_GET(IMX8QXP_ADC_RESFIFO_VAL_MASK, + readl_relaxed(adc->regs + IMX8QXP_ADR_ADC_RESFIFO)); + if (fifo_count) complete(&adc->completion); diff --git a/drivers/iio/adc/stm32-dfsdm-adc.c b/drivers/iio/adc/stm32-dfsdm-adc.c index 6d21ea84fa82..a428bdb567d5 100644 --- a/drivers/iio/adc/stm32-dfsdm-adc.c +++ b/drivers/iio/adc/stm32-dfsdm-adc.c @@ -1520,6 +1520,7 @@ static const struct of_device_id stm32_dfsdm_adc_match[] = { }, {} }; +MODULE_DEVICE_TABLE(of, stm32_dfsdm_adc_match); static int stm32_dfsdm_adc_probe(struct platform_device *pdev) { diff --git a/drivers/iio/adc/twl6030-gpadc.c b/drivers/iio/adc/twl6030-gpadc.c index f53e8558b560..32873fb5f367 100644 --- a/drivers/iio/adc/twl6030-gpadc.c +++ b/drivers/iio/adc/twl6030-gpadc.c @@ -57,6 +57,18 @@ #define TWL6030_GPADCS BIT(1) #define TWL6030_GPADCR BIT(0) +#define USB_VBUS_CTRL_SET 0x04 +#define USB_ID_CTRL_SET 0x06 + +#define TWL6030_MISC1 0xE4 +#define VBUS_MEAS 0x01 +#define ID_MEAS 0x01 + +#define VAC_MEAS 0x04 +#define VBAT_MEAS 0x02 +#define BB_MEAS 0x01 + + /** * struct twl6030_chnl_calib - channel calibration * @gain: slope coefficient for ideal curve @@ -927,6 +939,26 @@ static int twl6030_gpadc_probe(struct platform_device *pdev) return ret; } + ret = twl_i2c_write_u8(TWL_MODULE_USB, VBUS_MEAS, USB_VBUS_CTRL_SET); + if (ret < 0) { + dev_err(dev, "failed to wire up inputs\n"); + return ret; + } + + ret = twl_i2c_write_u8(TWL_MODULE_USB, ID_MEAS, USB_ID_CTRL_SET); + if (ret < 0) { + dev_err(dev, "failed to wire up inputs\n"); + return ret; + } + + ret = twl_i2c_write_u8(TWL6030_MODULE_ID0, + VBAT_MEAS | BB_MEAS | VAC_MEAS, + TWL6030_MISC1); + if (ret < 0) { + dev_err(dev, "failed to wire up inputs\n"); + return ret; + 
} + indio_dev->name = DRIVER_NAME; indio_dev->info = &twl6030_gpadc_iio_info; indio_dev->modes = INDIO_DIRECT_MODE; diff --git a/drivers/iio/adc/xilinx-ams.c b/drivers/iio/adc/xilinx-ams.c index 5b4bdf3a26bb..a507d2e17079 100644 --- a/drivers/iio/adc/xilinx-ams.c +++ b/drivers/iio/adc/xilinx-ams.c @@ -1329,7 +1329,7 @@ static int ams_parse_firmware(struct iio_dev *indio_dev) dev_channels = devm_krealloc(dev, ams_channels, dev_size, GFP_KERNEL); if (!dev_channels) - ret = -ENOMEM; + return -ENOMEM; indio_dev->channels = dev_channels; indio_dev->num_channels = num_channels; diff --git a/drivers/iio/gyro/hid-sensor-gyro-3d.c b/drivers/iio/gyro/hid-sensor-gyro-3d.c index 8f0ad022c7f1..698c50da1f10 100644 --- a/drivers/iio/gyro/hid-sensor-gyro-3d.c +++ b/drivers/iio/gyro/hid-sensor-gyro-3d.c @@ -231,6 +231,7 @@ static int gyro_3d_capture_sample(struct hid_sensor_hub_device *hsdev, gyro_state->timestamp = hid_sensor_convert_timestamp(&gyro_state->common_attributes, *(s64 *)raw_data); + ret = 0; break; default: break; diff --git a/drivers/iio/imu/fxos8700_core.c b/drivers/iio/imu/fxos8700_core.c index 423cfe526f2a..6d189c4b9ff9 100644 --- a/drivers/iio/imu/fxos8700_core.c +++ b/drivers/iio/imu/fxos8700_core.c @@ -10,6 +10,7 @@ #include <linux/regmap.h> #include <linux/acpi.h> #include <linux/bitops.h> +#include <linux/bitfield.h> #include <linux/iio/iio.h> #include <linux/iio/sysfs.h> @@ -144,9 +145,8 @@ #define FXOS8700_NVM_DATA_BNK0 0xa7 /* Bit definitions for FXOS8700_CTRL_REG1 */ -#define FXOS8700_CTRL_ODR_MSK 0x38 #define FXOS8700_CTRL_ODR_MAX 0x00 -#define FXOS8700_CTRL_ODR_MIN GENMASK(4, 3) +#define FXOS8700_CTRL_ODR_MSK GENMASK(5, 3) /* Bit definitions for FXOS8700_M_CTRL_REG1 */ #define FXOS8700_HMS_MASK GENMASK(1, 0) @@ -320,7 +320,7 @@ static enum fxos8700_sensor fxos8700_to_sensor(enum iio_chan_type iio_type) switch (iio_type) { case IIO_ACCEL: return FXOS8700_ACCEL; - case IIO_ANGL_VEL: + case IIO_MAGN: return FXOS8700_MAGN; default: return -EINVAL; @@ 
-345,15 +345,35 @@ static int fxos8700_set_active_mode(struct fxos8700_data *data, static int fxos8700_set_scale(struct fxos8700_data *data, enum fxos8700_sensor t, int uscale) { - int i; + int i, ret, val; + bool active_mode; static const int scale_num = ARRAY_SIZE(fxos8700_accel_scale); struct device *dev = regmap_get_device(data->regmap); if (t == FXOS8700_MAGN) { - dev_err(dev, "Magnetometer scale is locked at 1200uT\n"); + dev_err(dev, "Magnetometer scale is locked at 0.001Gs\n"); return -EINVAL; } + /* + * When device is in active mode, it failed to set an ACCEL + * full-scale range(2g/4g/8g) in FXOS8700_XYZ_DATA_CFG. + * This is not align with the datasheet, but it is a fxos8700 + * chip behavier. Set the device in standby mode before setting + * an ACCEL full-scale range. + */ + ret = regmap_read(data->regmap, FXOS8700_CTRL_REG1, &val); + if (ret) + return ret; + + active_mode = val & FXOS8700_ACTIVE; + if (active_mode) { + ret = regmap_write(data->regmap, FXOS8700_CTRL_REG1, + val & ~FXOS8700_ACTIVE); + if (ret) + return ret; + } + for (i = 0; i < scale_num; i++) if (fxos8700_accel_scale[i].uscale == uscale) break; @@ -361,8 +381,12 @@ static int fxos8700_set_scale(struct fxos8700_data *data, if (i == scale_num) return -EINVAL; - return regmap_write(data->regmap, FXOS8700_XYZ_DATA_CFG, + ret = regmap_write(data->regmap, FXOS8700_XYZ_DATA_CFG, fxos8700_accel_scale[i].bits); + if (ret) + return ret; + return regmap_write(data->regmap, FXOS8700_CTRL_REG1, + active_mode); } static int fxos8700_get_scale(struct fxos8700_data *data, @@ -372,7 +396,7 @@ static int fxos8700_get_scale(struct fxos8700_data *data, static const int scale_num = ARRAY_SIZE(fxos8700_accel_scale); if (t == FXOS8700_MAGN) { - *uscale = 1200; /* Magnetometer is locked at 1200uT */ + *uscale = 1000; /* Magnetometer is locked at 0.001Gs */ return 0; } @@ -394,22 +418,61 @@ static int fxos8700_get_data(struct fxos8700_data *data, int chan_type, int axis, int *val) { u8 base, reg; + s16 tmp; 
int ret; - enum fxos8700_sensor type = fxos8700_to_sensor(chan_type); - base = type ? FXOS8700_OUT_X_MSB : FXOS8700_M_OUT_X_MSB; + /* + * Different register base addresses varies with channel types. + * This bug hasn't been noticed before because using an enum is + * really hard to read. Use an a switch statement to take over that. + */ + switch (chan_type) { + case IIO_ACCEL: + base = FXOS8700_OUT_X_MSB; + break; + case IIO_MAGN: + base = FXOS8700_M_OUT_X_MSB; + break; + default: + return -EINVAL; + } /* Block read 6 bytes of device output registers to avoid data loss */ ret = regmap_bulk_read(data->regmap, base, data->buf, - FXOS8700_DATA_BUF_SIZE); + sizeof(data->buf)); if (ret) return ret; /* Convert axis to buffer index */ reg = axis - IIO_MOD_X; + /* + * Convert to native endianness. The accel data and magn data + * are signed, so a forced type conversion is needed. + */ + tmp = be16_to_cpu(data->buf[reg]); + + /* + * ACCEL output data registers contain the X-axis, Y-axis, and Z-axis + * 14-bit left-justified sample data and MAGN output data registers + * contain the X-axis, Y-axis, and Z-axis 16-bit sample data. Apply + * a signed 2 bits right shift to the readback raw data from ACCEL + * output data register and keep that from MAGN sensor as the origin. + * Value should be extended to 32 bit. 
+ */ + switch (chan_type) { + case IIO_ACCEL: + tmp = tmp >> 2; + break; + case IIO_MAGN: + /* Nothing to do */ + break; + default: + return -EINVAL; + } + /* Convert to native endianness */ - *val = sign_extend32(be16_to_cpu(data->buf[reg]), 15); + *val = sign_extend32(tmp, 15); return 0; } @@ -445,10 +508,9 @@ static int fxos8700_set_odr(struct fxos8700_data *data, enum fxos8700_sensor t, if (i >= odr_num) return -EINVAL; - return regmap_update_bits(data->regmap, - FXOS8700_CTRL_REG1, - FXOS8700_CTRL_ODR_MSK + FXOS8700_ACTIVE, - fxos8700_odr[i].bits << 3 | active_mode); + val &= ~FXOS8700_CTRL_ODR_MSK; + val |= FIELD_PREP(FXOS8700_CTRL_ODR_MSK, fxos8700_odr[i].bits) | FXOS8700_ACTIVE; + return regmap_write(data->regmap, FXOS8700_CTRL_REG1, val); } static int fxos8700_get_odr(struct fxos8700_data *data, enum fxos8700_sensor t, @@ -461,7 +523,7 @@ static int fxos8700_get_odr(struct fxos8700_data *data, enum fxos8700_sensor t, if (ret) return ret; - val &= FXOS8700_CTRL_ODR_MSK; + val = FIELD_GET(FXOS8700_CTRL_ODR_MSK, val); for (i = 0; i < odr_num; i++) if (val == fxos8700_odr[i].bits) @@ -526,7 +588,7 @@ static IIO_CONST_ATTR(in_accel_sampling_frequency_available, static IIO_CONST_ATTR(in_magn_sampling_frequency_available, "1.5625 6.25 12.5 50 100 200 400 800"); static IIO_CONST_ATTR(in_accel_scale_available, "0.000244 0.000488 0.000976"); -static IIO_CONST_ATTR(in_magn_scale_available, "0.000001200"); +static IIO_CONST_ATTR(in_magn_scale_available, "0.001000"); static struct attribute *fxos8700_attrs[] = { &iio_const_attr_in_accel_sampling_frequency_available.dev_attr.attr, @@ -592,14 +654,19 @@ static int fxos8700_chip_init(struct fxos8700_data *data, bool use_spi) if (ret) return ret; - /* Max ODR (800Hz individual or 400Hz hybrid), active mode */ - ret = regmap_write(data->regmap, FXOS8700_CTRL_REG1, - FXOS8700_CTRL_ODR_MAX | FXOS8700_ACTIVE); + /* + * Set max full-scale range (+/-8G) for ACCEL sensor in chip + * initialization then activate the device. 
+ */ + ret = regmap_write(data->regmap, FXOS8700_XYZ_DATA_CFG, MODE_8G); if (ret) return ret; - /* Set for max full-scale range (+/-8G) */ - return regmap_write(data->regmap, FXOS8700_XYZ_DATA_CFG, MODE_8G); + /* Max ODR (800Hz individual or 400Hz hybrid), active mode */ + return regmap_update_bits(data->regmap, FXOS8700_CTRL_REG1, + FXOS8700_CTRL_ODR_MSK | FXOS8700_ACTIVE, + FIELD_PREP(FXOS8700_CTRL_ODR_MSK, FXOS8700_CTRL_ODR_MAX) | + FXOS8700_ACTIVE); } static void fxos8700_chip_uninit(void *data) diff --git a/drivers/iio/imu/st_lsm6dsx/Kconfig b/drivers/iio/imu/st_lsm6dsx/Kconfig index f6660847fb58..8c16cdacf2f2 100644 --- a/drivers/iio/imu/st_lsm6dsx/Kconfig +++ b/drivers/iio/imu/st_lsm6dsx/Kconfig @@ -4,6 +4,7 @@ config IIO_ST_LSM6DSX tristate "ST_LSM6DSx driver for STM 6-axis IMU MEMS sensors" depends on (I2C || SPI || I3C) select IIO_BUFFER + select IIO_TRIGGERED_BUFFER select IIO_KFIFO_BUF select IIO_ST_LSM6DSX_I2C if (I2C) select IIO_ST_LSM6DSX_SPI if (SPI_MASTER) diff --git a/drivers/iio/light/cm32181.c b/drivers/iio/light/cm32181.c index 001055d09750..b1674a5bfa36 100644 --- a/drivers/iio/light/cm32181.c +++ b/drivers/iio/light/cm32181.c @@ -440,6 +440,8 @@ static int cm32181_probe(struct i2c_client *client) if (!indio_dev) return -ENOMEM; + i2c_set_clientdata(client, indio_dev); + /* * Some ACPI systems list 2 I2C resources for the CM3218 sensor, the * SMBus Alert Response Address (ARA, 0x0c) and the actual I2C address. 
@@ -460,8 +462,6 @@ static int cm32181_probe(struct i2c_client *client) return PTR_ERR(client); } - i2c_set_clientdata(client, indio_dev); - cm32181 = iio_priv(indio_dev); cm32181->client = client; cm32181->dev = dev; @@ -490,7 +490,8 @@ static int cm32181_probe(struct i2c_client *client) static int cm32181_suspend(struct device *dev) { - struct i2c_client *client = to_i2c_client(dev); + struct cm32181_chip *cm32181 = iio_priv(dev_get_drvdata(dev)); + struct i2c_client *client = cm32181->client; return i2c_smbus_write_word_data(client, CM32181_REG_ADDR_CMD, CM32181_CMD_ALS_DISABLE); @@ -498,8 +499,8 @@ static int cm32181_suspend(struct device *dev) static int cm32181_resume(struct device *dev) { - struct i2c_client *client = to_i2c_client(dev); struct cm32181_chip *cm32181 = iio_priv(dev_get_drvdata(dev)); + struct i2c_client *client = cm32181->client; return i2c_smbus_write_word_data(client, CM32181_REG_ADDR_CMD, cm32181->conf_regs[CM32181_REG_ADDR_CMD]); diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index 43b26bc12288..39357dc2d229 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -26,8 +26,8 @@ int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) if (umem_dmabuf->sgt) goto wait_fence; - sgt = dma_buf_map_attachment_unlocked(umem_dmabuf->attach, - DMA_BIDIRECTIONAL); + sgt = dma_buf_map_attachment(umem_dmabuf->attach, + DMA_BIDIRECTIONAL); if (IS_ERR(sgt)) return PTR_ERR(sgt); @@ -103,8 +103,8 @@ void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) umem_dmabuf->last_sg_trim = 0; } - dma_buf_unmap_attachment_unlocked(umem_dmabuf->attach, umem_dmabuf->sgt, - DMA_BIDIRECTIONAL); + dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt, + DMA_BIDIRECTIONAL); umem_dmabuf->sgt = NULL; } diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index f5f9269fdc16..7c5d487ec916 100644 --- 
a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -1318,12 +1318,15 @@ static int user_exp_rcv_setup(struct hfi1_filedata *fd, unsigned long arg, addr = arg + offsetof(struct hfi1_tid_info, tidcnt); if (copy_to_user((void __user *)addr, &tinfo.tidcnt, sizeof(tinfo.tidcnt))) - return -EFAULT; + ret = -EFAULT; addr = arg + offsetof(struct hfi1_tid_info, length); - if (copy_to_user((void __user *)addr, &tinfo.length, + if (!ret && copy_to_user((void __user *)addr, &tinfo.length, sizeof(tinfo.length))) ret = -EFAULT; + + if (ret) + hfi1_user_exp_rcv_invalid(fd, &tinfo); } return ret; diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index b02f2f0809c8..350884d5f089 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -160,16 +160,11 @@ static void unpin_rcv_pages(struct hfi1_filedata *fd, static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf) { int pinned; - unsigned int npages; + unsigned int npages = tidbuf->npages; unsigned long vaddr = tidbuf->vaddr; struct page **pages = NULL; struct hfi1_devdata *dd = fd->uctxt->dd; - /* Get the number of pages the user buffer spans */ - npages = num_user_pages(vaddr, tidbuf->length); - if (!npages) - return -EINVAL; - if (npages > fd->uctxt->expected_count) { dd_dev_err(dd, "Expected buffer too big\n"); return -EINVAL; @@ -196,7 +191,6 @@ static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf) return pinned; } tidbuf->pages = pages; - tidbuf->npages = npages; fd->tid_n_pinned += pinned; return pinned; } @@ -274,6 +268,7 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, mutex_init(&tidbuf->cover_mutex); tidbuf->vaddr = tinfo->vaddr; tidbuf->length = tinfo->length; + tidbuf->npages = num_user_pages(tidbuf->vaddr, tidbuf->length); tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets), GFP_KERNEL); if (!tidbuf->psets) { diff 
--git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 7b086fe63a24..195aa9ea18b6 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -1722,6 +1722,9 @@ static int irdma_add_mqh_4(struct irdma_device *iwdev, continue; idev = in_dev_get(ip_dev); + if (!idev) + continue; + in_dev_for_each_ifa_rtnl(ifa, idev) { ibdev_dbg(&iwdev->ibdev, "CM: Allocating child CM Listener forIP=%pI4, vlan_id=%d, MAC=%pM\n", diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index ea15ec77e321..54b61930a7fd 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -289,7 +289,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, /* IB ports start with 1, MANA Ethernet ports start with 0 */ port = ucmd.port; - if (ucmd.port > mc->num_ports) + if (port < 1 || port > mc->num_ports) return -EINVAL; if (attr->cap.max_send_wr > MAX_SEND_BUFFERS_PER_QUEUE) { diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index c301b3be9f30..a2857accc427 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -276,8 +276,8 @@ iter_chunk: size = pa_end - pa_start + PAGE_SIZE; usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x", va_start, &pa_start, size, flags); - err = iommu_map(pd->domain, va_start, pa_start, - size, flags); + err = iommu_map_atomic(pd->domain, va_start, + pa_start, size, flags); if (err) { usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", va_start, &pa_start, size, err); @@ -293,8 +293,8 @@ iter_chunk: size = pa - pa_start + PAGE_SIZE; usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x\n", va_start, &pa_start, size, flags); - err = iommu_map(pd->domain, va_start, pa_start, - size, flags); + err = iommu_map_atomic(pd->domain, va_start, + pa_start, size, flags); if (err) { usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", 
va_start, &pa_start, size, err); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index ac25fc80fb33..f10d4bcf87d2 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -2200,6 +2200,14 @@ int ipoib_intf_init(struct ib_device *hca, u32 port, const char *name, rn->attach_mcast = ipoib_mcast_attach; rn->detach_mcast = ipoib_mcast_detach; rn->hca = hca; + + rc = netif_set_real_num_tx_queues(dev, 1); + if (rc) + goto out; + + rc = netif_set_real_num_rx_queues(dev, 1); + if (rc) + goto out; } priv->rn_ops = dev->netdev_ops; diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c index c76ba29da1e2..5adba0f754b6 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c @@ -312,9 +312,8 @@ void rtrs_srv_destroy_path_files(struct rtrs_srv_path *srv_path) if (srv_path->kobj.state_in_sysfs) { sysfs_remove_group(&srv_path->kobj, &rtrs_srv_path_attr_group); - kobject_del(&srv_path->kobj); kobject_put(&srv_path->kobj); + rtrs_srv_destroy_once_sysfs_root_folders(srv_path); } - rtrs_srv_destroy_once_sysfs_root_folders(srv_path); } diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c index fc3758a5bc1c..53e495223ea0 100644 --- a/drivers/media/common/videobuf2/videobuf2-core.c +++ b/drivers/media/common/videobuf2/videobuf2-core.c @@ -2149,8 +2149,6 @@ int vb2_core_streamon(struct vb2_queue *q, unsigned int type) if (ret) return ret; - q->streaming = 1; - /* * Tell driver to start streaming provided sufficient buffers * are available. 
@@ -2161,12 +2159,13 @@ int vb2_core_streamon(struct vb2_queue *q, unsigned int type) goto unprepare; } + q->streaming = 1; + dprintk(q, 3, "successful\n"); return 0; unprepare: call_void_qop(q, unprepare_streaming, q); - q->streaming = 0; return ret; } EXPORT_SYMBOL_GPL(vb2_core_streamon); diff --git a/drivers/media/v4l2-core/v4l2-ctrls-api.c b/drivers/media/v4l2-core/v4l2-ctrls-api.c index 3d3b6dc24ca6..002ea6588edf 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-api.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-api.c @@ -150,8 +150,8 @@ static int user_to_new(struct v4l2_ext_control *c, struct v4l2_ctrl *ctrl) * then return an error. */ if (strlen(ctrl->p_new.p_char) == ctrl->maximum && last) - ctrl->is_new = 1; return -ERANGE; + ctrl->is_new = 1; } return ret; default: diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c index babf21a0adeb..f191a2a76f3b 100644 --- a/drivers/mmc/core/sdio_bus.c +++ b/drivers/mmc/core/sdio_bus.c @@ -294,6 +294,12 @@ static void sdio_release_func(struct device *dev) if (!(func->card->quirks & MMC_QUIRK_NONSTD_SDIO)) sdio_free_func_cis(func); + /* + * We have now removed the link to the tuples in the + * card structure, so remove the reference. + */ + put_device(&func->card->dev); + kfree(func->info); kfree(func->tmpbuf); kfree(func); @@ -324,6 +330,12 @@ struct sdio_func *sdio_alloc_func(struct mmc_card *card) device_initialize(&func->dev); + /* + * We may link to tuples in the card structure, + * we need make sure we have a reference to it. 
+ */ + get_device(&func->card->dev); + func->dev.parent = &card->dev; func->dev.bus = &sdio_bus_type; func->dev.release = sdio_release_func; @@ -377,10 +389,9 @@ int sdio_add_func(struct sdio_func *func) */ void sdio_remove_func(struct sdio_func *func) { - if (!sdio_func_present(func)) - return; + if (sdio_func_present(func)) + device_del(&func->dev); - device_del(&func->dev); of_node_put(func->dev.of_node); put_device(&func->dev); } diff --git a/drivers/mmc/core/sdio_cis.c b/drivers/mmc/core/sdio_cis.c index a705ba6eff5b..afaa6cab1adc 100644 --- a/drivers/mmc/core/sdio_cis.c +++ b/drivers/mmc/core/sdio_cis.c @@ -404,12 +404,6 @@ int sdio_read_func_cis(struct sdio_func *func) return ret; /* - * Since we've linked to tuples in the card structure, - * we must make sure we have a reference to it. - */ - get_device(&func->card->dev); - - /* * Vendor/device id is optional for function CIS, so * copy it from the card structure as needed. */ @@ -434,11 +428,5 @@ void sdio_free_func_cis(struct sdio_func *func) } func->tuples = NULL; - - /* - * We have now removed the link to the tuples in the - * card structure, so remove the reference. - */ - put_device(&func->card->dev); } diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c index dc2db9c185ea..eda1e2ddcaca 100644 --- a/drivers/mmc/host/jz4740_mmc.c +++ b/drivers/mmc/host/jz4740_mmc.c @@ -1053,6 +1053,16 @@ static int jz4740_mmc_probe(struct platform_device* pdev) mmc->ops = &jz4740_mmc_ops; if (!mmc->f_max) mmc->f_max = JZ_MMC_CLK_RATE; + + /* + * There seems to be a problem with this driver on the JZ4760 and + * JZ4760B SoCs. There, when using the maximum rate supported (50 MHz), + * the communication fails with many SD cards. + * Until this bug is sorted out, limit the maximum rate to 24 MHz. 
+ */ + if (host->version == JZ_MMC_JZ4760 && mmc->f_max > JZ_MMC_CLK_RATE) + mmc->f_max = JZ_MMC_CLK_RATE; + mmc->f_min = mmc->f_max / 128; mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index 6e5ea0213b47..5c94ad4661ce 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -435,7 +435,8 @@ static int meson_mmc_clk_init(struct meson_host *host) clk_reg |= FIELD_PREP(CLK_CORE_PHASE_MASK, CLK_PHASE_180); clk_reg |= FIELD_PREP(CLK_TX_PHASE_MASK, CLK_PHASE_0); clk_reg |= FIELD_PREP(CLK_RX_PHASE_MASK, CLK_PHASE_0); - clk_reg |= CLK_IRQ_SDIO_SLEEP(host); + if (host->mmc->caps & MMC_CAP_SDIO_IRQ) + clk_reg |= CLK_IRQ_SDIO_SLEEP(host); writel(clk_reg, host->regs + SD_EMMC_CLOCK); /* get the mux parents */ @@ -948,16 +949,18 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id) { struct meson_host *host = dev_id; struct mmc_command *cmd; - u32 status, raw_status; + u32 status, raw_status, irq_mask = IRQ_EN_MASK; irqreturn_t ret = IRQ_NONE; + if (host->mmc->caps & MMC_CAP_SDIO_IRQ) + irq_mask |= IRQ_SDIO; raw_status = readl(host->regs + SD_EMMC_STATUS); - status = raw_status & (IRQ_EN_MASK | IRQ_SDIO); + status = raw_status & irq_mask; if (!status) { dev_dbg(host->dev, - "Unexpected IRQ! irq_en 0x%08lx - status 0x%08x\n", - IRQ_EN_MASK | IRQ_SDIO, raw_status); + "Unexpected IRQ! 
irq_en 0x%08x - status 0x%08x\n", + irq_mask, raw_status); return IRQ_NONE; } @@ -1204,6 +1207,11 @@ static int meson_mmc_probe(struct platform_device *pdev) goto free_host; } + mmc->caps |= MMC_CAP_CMD23; + + if (mmc->caps & MMC_CAP_SDIO_IRQ) + mmc->caps2 |= MMC_CAP2_SDIO_IRQ_NOTHREAD; + host->data = (struct meson_mmc_data *) of_device_get_match_data(&pdev->dev); if (!host->data) { @@ -1277,11 +1285,6 @@ static int meson_mmc_probe(struct platform_device *pdev) spin_lock_init(&host->lock); - mmc->caps |= MMC_CAP_CMD23; - - if (mmc->caps & MMC_CAP_SDIO_IRQ) - mmc->caps2 |= MMC_CAP2_SDIO_IRQ_NOTHREAD; - if (host->dram_access_quirk) { /* Limit segments to 1 due to low available sram memory */ mmc->max_segs = 1; diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index 106dd204b1a7..cc333ad67cac 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -1437,7 +1437,7 @@ static int mmc_spi_probe(struct spi_device *spi) status = mmc_add_host(mmc); if (status != 0) - goto fail_add_host; + goto fail_glue_init; /* * Index 0 is card detect @@ -1445,7 +1445,7 @@ static int mmc_spi_probe(struct spi_device *spi) */ status = mmc_gpiod_request_cd(mmc, NULL, 0, false, 1000); if (status == -EPROBE_DEFER) - goto fail_add_host; + goto fail_gpiod_request; if (!status) { /* * The platform has a CD GPIO signal that may support @@ -1460,7 +1460,7 @@ static int mmc_spi_probe(struct spi_device *spi) /* Index 1 is write protect/read only */ status = mmc_gpiod_request_ro(mmc, NULL, 1, 0); if (status == -EPROBE_DEFER) - goto fail_add_host; + goto fail_gpiod_request; if (!status) has_ro = true; @@ -1474,7 +1474,7 @@ static int mmc_spi_probe(struct spi_device *spi) ? 
", cd polling" : ""); return 0; -fail_add_host: +fail_gpiod_request: mmc_remove_host(mmc); fail_glue_init: mmc_spi_dma_free(host); diff --git a/drivers/net/bonding/bond_debugfs.c b/drivers/net/bonding/bond_debugfs.c index 4f9b4a18c74c..594094526648 100644 --- a/drivers/net/bonding/bond_debugfs.c +++ b/drivers/net/bonding/bond_debugfs.c @@ -76,7 +76,7 @@ void bond_debug_reregister(struct bonding *bond) d = debugfs_rename(bonding_debug_root, bond->debug_dir, bonding_debug_root, bond->dev->name); - if (d) { + if (!IS_ERR(d)) { bond->debug_dir = d; } else { netdev_warn(bond->dev, "failed to reregister, so just unregister old one\n"); diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ethtool.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ethtool.c index 3585f02575df..57eeb066a945 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ethtool.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ethtool.c @@ -48,6 +48,7 @@ mcp251xfd_ring_set_ringparam(struct net_device *ndev, priv->rx_obj_num = layout.cur_rx; priv->rx_obj_num_coalesce_irq = layout.rx_coalesce; priv->tx->obj_num = layout.cur_tx; + priv->tx_obj_num_coalesce_irq = layout.tx_coalesce; return 0; } diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig index c26755f662c1..f6f3b43dfb06 100644 --- a/drivers/net/dsa/Kconfig +++ b/drivers/net/dsa/Kconfig @@ -35,12 +35,13 @@ config NET_DSA_LANTIQ_GSWIP the xrx200 / VR9 SoC. config NET_DSA_MT7530 - tristate "MediaTek MT753x and MT7621 Ethernet switch support" + tristate "MediaTek MT7530 and MT7531 Ethernet switch support" select NET_DSA_TAG_MTK select MEDIATEK_GE_PHY help - This enables support for the MediaTek MT7530, MT7531, and MT7621 - Ethernet switch chips. + This enables support for the MediaTek MT7530 and MT7531 Ethernet + switch chips. Multi-chip module MT7530 in MT7621AT, MT7621DAT, + MT7621ST and MT7623AI SoCs is supported. 
config NET_DSA_MV88E6060 tristate "Marvell 88E6060 ethernet switch chip support" diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 908fa89444c9..338f238f2043 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1309,14 +1309,26 @@ mt7530_port_set_vlan_aware(struct dsa_switch *ds, int port) if (!priv->ports[port].pvid) mt7530_rmw(priv, MT7530_PVC_P(port), ACC_FRM_MASK, MT7530_VLAN_ACC_TAGGED); - } - /* Set the port as a user port which is to be able to recognize VID - * from incoming packets before fetching entry within the VLAN table. - */ - mt7530_rmw(priv, MT7530_PVC_P(port), VLAN_ATTR_MASK | PVC_EG_TAG_MASK, - VLAN_ATTR(MT7530_VLAN_USER) | - PVC_EG_TAG(MT7530_VLAN_EG_DISABLED)); + /* Set the port as a user port which is to be able to recognize + * VID from incoming packets before fetching entry within the + * VLAN table. + */ + mt7530_rmw(priv, MT7530_PVC_P(port), + VLAN_ATTR_MASK | PVC_EG_TAG_MASK, + VLAN_ATTR(MT7530_VLAN_USER) | + PVC_EG_TAG(MT7530_VLAN_EG_DISABLED)); + } else { + /* Also set CPU ports to the "user" VLAN port attribute, to + * allow VLAN classification, but keep the EG_TAG attribute as + * "consistent" (i.o.w. don't change its value) for packets + * received by the switch from the CPU, so that tagged packets + * are forwarded to user ports as tagged, and untagged as + * untagged. 
+ */ + mt7530_rmw(priv, MT7530_PVC_P(port), VLAN_ATTR_MASK, + VLAN_ATTR(MT7530_VLAN_USER)); + } } static void diff --git a/drivers/net/ethernet/broadcom/bgmac-bcma.c b/drivers/net/ethernet/broadcom/bgmac-bcma.c index 02bd3cf9a260..6e4f36aaf5db 100644 --- a/drivers/net/ethernet/broadcom/bgmac-bcma.c +++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c @@ -240,12 +240,12 @@ static int bgmac_probe(struct bcma_device *core) bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST; bgmac->feature_flags |= BGMAC_FEAT_FLW_CTRL1; bgmac->feature_flags |= BGMAC_FEAT_SW_TYPE_PHY; - if (ci->pkg == BCMA_PKG_ID_BCM47188 || - ci->pkg == BCMA_PKG_ID_BCM47186) { + if ((ci->id == BCMA_CHIP_ID_BCM5357 && ci->pkg == BCMA_PKG_ID_BCM47186) || + (ci->id == BCMA_CHIP_ID_BCM53572 && ci->pkg == BCMA_PKG_ID_BCM47188)) { bgmac->feature_flags |= BGMAC_FEAT_SW_TYPE_RGMII; bgmac->feature_flags |= BGMAC_FEAT_IOST_ATTACHED; } - if (ci->pkg == BCMA_PKG_ID_BCM5358) + if (ci->id == BCMA_CHIP_ID_BCM5357 && ci->pkg == BCMA_PKG_ID_BCM5358) bgmac->feature_flags |= BGMAC_FEAT_SW_TYPE_EPHYRMII; break; case BCMA_CHIP_ID_BCM53573: diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 240a7e8a7652..6c32f5c427b5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9274,10 +9274,14 @@ int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init) netdev_err(bp->dev, "ring reservation/IRQ init failure rc: %d\n", rc); return rc; } - if (tcs && (bp->tx_nr_rings_per_tc * tcs != bp->tx_nr_rings)) { + if (tcs && (bp->tx_nr_rings_per_tc * tcs != + bp->tx_nr_rings - bp->tx_nr_rings_xdp)) { netdev_err(bp->dev, "tx ring reservation failure\n"); netdev_reset_tc(bp->dev); - bp->tx_nr_rings_per_tc = bp->tx_nr_rings; + if (bp->tx_nr_rings_xdp) + bp->tx_nr_rings_per_tc = bp->tx_nr_rings_xdp; + else + bp->tx_nr_rings_per_tc = bp->tx_nr_rings; return -ENOMEM; } return 0; diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c index 72e42820713d..6cda31520c42 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -4627,25 +4627,26 @@ static int init_reset_optional(struct platform_device *pdev) if (ret) return dev_err_probe(&pdev->dev, ret, "failed to init SGMII PHY\n"); - } - ret = zynqmp_pm_is_function_supported(PM_IOCTL, IOCTL_SET_GEM_CONFIG); - if (!ret) { - u32 pm_info[2]; + ret = zynqmp_pm_is_function_supported(PM_IOCTL, IOCTL_SET_GEM_CONFIG); + if (!ret) { + u32 pm_info[2]; + + ret = of_property_read_u32_array(pdev->dev.of_node, "power-domains", + pm_info, ARRAY_SIZE(pm_info)); + if (ret) { + dev_err(&pdev->dev, "Failed to read power management information\n"); + goto err_out_phy_exit; + } + ret = zynqmp_pm_set_gem_config(pm_info[1], GEM_CONFIG_FIXED, 0); + if (ret) + goto err_out_phy_exit; - ret = of_property_read_u32_array(pdev->dev.of_node, "power-domains", - pm_info, ARRAY_SIZE(pm_info)); - if (ret) { - dev_err(&pdev->dev, "Failed to read power management information\n"); - goto err_out_phy_exit; + ret = zynqmp_pm_set_gem_config(pm_info[1], GEM_CONFIG_SGMII_MODE, 1); + if (ret) + goto err_out_phy_exit; } - ret = zynqmp_pm_set_gem_config(pm_info[1], GEM_CONFIG_FIXED, 0); - if (ret) - goto err_out_phy_exit; - ret = zynqmp_pm_set_gem_config(pm_info[1], GEM_CONFIG_SGMII_MODE, 1); - if (ret) - goto err_out_phy_exit; } /* Fully reset controller at hardware level if mapped in device tree */ diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 3f8032947d86..027fff9f7db0 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2410,6 +2410,9 @@ static int dpaa_eth_poll(struct napi_struct *napi, int budget) cleaned = qman_p_poll_dqrr(np->p, budget); + if (np->xdp_act & XDP_REDIRECT) + xdp_do_flush(); + if (cleaned < budget) { napi_complete_done(napi, cleaned); 
qman_p_irqsource_add(np->p, QM_PIRQ_DQRI); @@ -2417,9 +2420,6 @@ static int dpaa_eth_poll(struct napi_struct *napi, int budget) qman_p_irqsource_add(np->p, QM_PIRQ_DQRI); } - if (np->xdp_act & XDP_REDIRECT) - xdp_do_flush(); - return cleaned; } diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 0c35abb7d065..2e79d18fc3c7 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -1993,10 +1993,15 @@ static int dpaa2_eth_poll(struct napi_struct *napi, int budget) if (rx_cleaned >= budget || txconf_cleaned >= DPAA2_ETH_TXCONF_PER_NAPI) { work_done = budget; + if (ch->xdp.res & XDP_REDIRECT) + xdp_do_flush(); goto out; } } while (store_cleaned); + if (ch->xdp.res & XDP_REDIRECT) + xdp_do_flush(); + /* Update NET DIM with the values for this CDAN */ dpaa2_io_update_net_dim(ch->dpio, ch->stats.frames_per_cdan, ch->stats.bytes_per_cdan); @@ -2032,9 +2037,7 @@ out: txc_fq->dq_bytes = 0; } - if (ch->xdp.res & XDP_REDIRECT) - xdp_do_flush_map(); - else if (rx_cleaned && ch->xdp.res & XDP_TX) + if (rx_cleaned && ch->xdp.res & XDP_TX) dpaa2_eth_xdp_tx_flush(priv, ch, &priv->fq[flowid]); return work_done; diff --git a/drivers/net/ethernet/freescale/fman/fman_memac.c b/drivers/net/ethernet/freescale/fman/fman_memac.c index 9349f841bd06..587ad81a2dc3 100644 --- a/drivers/net/ethernet/freescale/fman/fman_memac.c +++ b/drivers/net/ethernet/freescale/fman/fman_memac.c @@ -1055,6 +1055,9 @@ static struct phylink_pcs *memac_pcs_create(struct device_node *mac_node, return ERR_PTR(-EPROBE_DEFER); pcs = lynx_pcs_create(mdiodev); + if (!pcs) + mdio_device_free(mdiodev); + return pcs; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 53d0083e35da..52eec0a50492 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -2921,7 +2921,7 @@ static int 
i40e_change_mtu(struct net_device *netdev, int new_mtu) struct i40e_pf *pf = vsi->back; if (i40e_enabled_xdp_vsi(vsi)) { - int frame_size = new_mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN; + int frame_size = new_mtu + I40E_PACKET_HDR_PAD; if (frame_size > i40e_max_xdp_frame_size(vsi)) return -EINVAL; @@ -13167,6 +13167,8 @@ static int i40e_ndo_bridge_setlink(struct net_device *dev, } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + if (!br_spec) + return -EINVAL; nla_for_each_nested(attr, br_spec, rem) { __u16 mode; diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 2f0b604abc5e..713069f809ec 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -880,7 +880,7 @@ void ice_set_ethtool_repr_ops(struct net_device *netdev); void ice_set_ethtool_safe_mode_ops(struct net_device *netdev); u16 ice_get_avail_txq_count(struct ice_pf *pf); u16 ice_get_avail_rxq_count(struct ice_pf *pf); -int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx); +int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx, bool locked); void ice_update_vsi_stats(struct ice_vsi *vsi); void ice_update_pf_stats(struct ice_pf *pf); void diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index d02b55b6aa9c..3e08847505ce 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -5524,7 +5524,7 @@ bool ice_fw_supports_report_dflt_cfg(struct ice_hw *hw) * returned by the firmware is a 16 bit * value, but is indexed * by [fls(speed) - 1] */ -static const u32 ice_aq_to_link_speed[15] = { +static const u32 ice_aq_to_link_speed[] = { SPEED_10, /* BIT(0) */ SPEED_100, SPEED_1000, @@ -5536,10 +5536,6 @@ static const u32 ice_aq_to_link_speed[15] = { SPEED_40000, SPEED_50000, SPEED_100000, /* BIT(10) */ - 0, - 0, - 0, - 0 /* BIT(14) */ }; /** @@ -5550,5 +5546,8 @@ static const u32 
ice_aq_to_link_speed[15] = { */ u32 ice_get_link_speed(u16 index) { + if (index >= ARRAY_SIZE(ice_aq_to_link_speed)) + return 0; + return ice_aq_to_link_speed[index]; } diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index 4f24d441c35e..0a55c552189a 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -441,7 +441,7 @@ int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked) goto out; } - ice_pf_dcb_recfg(pf); + ice_pf_dcb_recfg(pf, false); out: /* enable previously downed VSIs */ @@ -731,12 +731,13 @@ static int ice_dcb_noncontig_cfg(struct ice_pf *pf) /** * ice_pf_dcb_recfg - Reconfigure all VEBs and VSIs * @pf: pointer to the PF struct + * @locked: is adev device lock held * * Assumed caller has already disabled all VSIs before * calling this function. Reconfiguring DCB based on * local_dcbx_cfg. */ -void ice_pf_dcb_recfg(struct ice_pf *pf) +void ice_pf_dcb_recfg(struct ice_pf *pf, bool locked) { struct ice_dcbx_cfg *dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; struct iidc_event *event; @@ -783,14 +784,16 @@ void ice_pf_dcb_recfg(struct ice_pf *pf) if (vsi->type == ICE_VSI_PF) ice_dcbnl_set_all(vsi); } - /* Notify the AUX drivers that TC change is finished */ - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return; + if (!locked) { + /* Notify the AUX drivers that TC change is finished */ + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return; - set_bit(IIDC_EVENT_AFTER_TC_CHANGE, event->type); - ice_send_event_to_aux(pf, event); - kfree(event); + set_bit(IIDC_EVENT_AFTER_TC_CHANGE, event->type); + ice_send_event_to_aux(pf, event); + kfree(event); + } } /** @@ -1044,7 +1047,7 @@ ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, } /* changes in configuration update VSI */ - ice_pf_dcb_recfg(pf); + ice_pf_dcb_recfg(pf, false); /* enable previously downed VSIs */ ice_dcb_ena_dis_vsi(pf, 
true, true); diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.h b/drivers/net/ethernet/intel/ice/ice_dcb_lib.h index 4c421c842a13..800879a88c5e 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.h @@ -23,7 +23,7 @@ u8 ice_dcb_get_tc(struct ice_vsi *vsi, int queue_index); int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked); int ice_dcb_bwchk(struct ice_pf *pf, struct ice_dcbx_cfg *dcbcfg); -void ice_pf_dcb_recfg(struct ice_pf *pf); +void ice_pf_dcb_recfg(struct ice_pf *pf, bool locked); void ice_vsi_cfg_dcb_rings(struct ice_vsi *vsi); int ice_init_pf_dcb(struct ice_pf *pf, bool locked); void ice_update_dcb_stats(struct ice_pf *pf); @@ -128,7 +128,7 @@ static inline u8 ice_get_pfc_mode(struct ice_pf *pf) return 0; } -static inline void ice_pf_dcb_recfg(struct ice_pf *pf) { } +static inline void ice_pf_dcb_recfg(struct ice_pf *pf, bool locked) { } static inline void ice_vsi_cfg_dcb_rings(struct ice_vsi *vsi) { } static inline void ice_update_dcb_stats(struct ice_pf *pf) { } static inline void diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.c b/drivers/net/ethernet/intel/ice/ice_devlink.c index 8286e47b4bae..0fae0186bd85 100644 --- a/drivers/net/ethernet/intel/ice/ice_devlink.c +++ b/drivers/net/ethernet/intel/ice/ice_devlink.c @@ -899,7 +899,7 @@ static int ice_set_object_tx_priority(struct ice_port_info *pi, struct ice_sched { int status; - if (node->tx_priority >= 8) { + if (priority >= 8) { NL_SET_ERR_MSG_MOD(extack, "Priority should be less than 8"); return -EINVAL; } @@ -929,7 +929,7 @@ static int ice_set_object_tx_weight(struct ice_port_info *pi, struct ice_sched_n { int status; - if (node->tx_weight > 200 || node->tx_weight < 1) { + if (weight > 200 || weight < 1) { NL_SET_ERR_MSG_MOD(extack, "Weight must be between 1 and 200"); return -EINVAL; } diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 
4191994d8f3a..a359f1610fc1 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -3641,7 +3641,9 @@ static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch) struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; int new_rx = 0, new_tx = 0; + bool locked = false; u32 curr_combined; + int ret = 0; /* do not support changing channels in Safe Mode */ if (ice_is_safe_mode(pf)) { @@ -3705,15 +3707,33 @@ static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch) return -EINVAL; } - ice_vsi_recfg_qs(vsi, new_rx, new_tx); + if (pf->adev) { + mutex_lock(&pf->adev_mutex); + device_lock(&pf->adev->dev); + locked = true; + if (pf->adev->dev.driver) { + netdev_err(dev, "Cannot change channels when RDMA is active\n"); + ret = -EBUSY; + goto adev_unlock; + } + } + + ice_vsi_recfg_qs(vsi, new_rx, new_tx, locked); - if (!netif_is_rxfh_configured(dev)) - return ice_vsi_set_dflt_rss_lut(vsi, new_rx); + if (!netif_is_rxfh_configured(dev)) { + ret = ice_vsi_set_dflt_rss_lut(vsi, new_rx); + goto adev_unlock; + } /* Update rss_size due to change in Rx queues */ vsi->rss_size = ice_get_valid_rss_size(&pf->hw, new_rx); - return 0; +adev_unlock: + if (locked) { + device_unlock(&pf->adev->dev); + mutex_unlock(&pf->adev_mutex); + } + return ret; } /** diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 237ede2cffb0..8ec24f6cf6be 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -275,6 +275,8 @@ static int ice_set_promisc(struct ice_vsi *vsi, u8 promisc_m) if (status && status != -EEXIST) return status; + netdev_dbg(vsi->netdev, "set promisc filter bits for VSI %i: 0x%x\n", + vsi->vsi_num, promisc_m); return 0; } @@ -300,6 +302,8 @@ static int ice_clear_promisc(struct ice_vsi *vsi, u8 promisc_m) promisc_m, 0); } + netdev_dbg(vsi->netdev, "clear promisc filter bits for VSI %i: 
0x%x\n", + vsi->vsi_num, promisc_m); return status; } @@ -414,6 +418,16 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) } err = 0; vlan_ops->dis_rx_filtering(vsi); + + /* promiscuous mode implies allmulticast so + * that VSIs that are in promiscuous mode are + * subscribed to multicast packets coming to + * the port + */ + err = ice_set_promisc(vsi, + ICE_MCAST_PROMISC_BITS); + if (err) + goto out_promisc; } } else { /* Clear Rx filter to remove traffic from wire */ @@ -430,6 +444,18 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) NETIF_F_HW_VLAN_CTAG_FILTER) vlan_ops->ena_rx_filtering(vsi); } + + /* disable allmulti here, but only if allmulti is not + * still enabled for the netdev + */ + if (!(vsi->current_netdev_flags & IFF_ALLMULTI)) { + err = ice_clear_promisc(vsi, + ICE_MCAST_PROMISC_BITS); + if (err) { + netdev_err(netdev, "Error %d clearing multicast promiscuous on VSI %i\n", + err, vsi->vsi_num); + } + } } } goto exit; @@ -4195,12 +4221,13 @@ bool ice_is_wol_supported(struct ice_hw *hw) * @vsi: VSI being changed * @new_rx: new number of Rx queues * @new_tx: new number of Tx queues + * @locked: is adev device_lock held * * Only change the number of queues if new_tx, or new_rx is non-0. * * Returns 0 on success. 
*/ -int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx) +int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx, bool locked) { struct ice_pf *pf = vsi->back; int err = 0, timeout = 50; @@ -4229,7 +4256,7 @@ int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx) ice_vsi_close(vsi); ice_vsi_rebuild(vsi, false); - ice_pf_dcb_recfg(pf); + ice_pf_dcb_recfg(pf, locked); ice_vsi_open(vsi); done: clear_bit(ICE_CFG_BUSY, pf->state); @@ -5540,7 +5567,7 @@ static int __init ice_module_init(void) pr_info("%s\n", ice_driver_string); pr_info("%s\n", ice_copyright); - ice_wq = alloc_workqueue("%s", WQ_MEM_RECLAIM, 0, KBUILD_MODNAME); + ice_wq = alloc_workqueue("%s", 0, 0, KBUILD_MODNAME); if (!ice_wq) { pr_err("Failed to create workqueue\n"); return -ENOMEM; diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index 9b762f7972ce..61f844d22512 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -5420,7 +5420,7 @@ ice_add_adv_recipe(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, */ status = ice_add_special_words(rinfo, lkup_exts, ice_is_dvm_ena(hw)); if (status) - goto err_free_lkup_exts; + goto err_unroll; /* Group match words into recipes using preferred recipe grouping * criteria. 
diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index faba0f857cd9..95f392ab9670 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -1681,7 +1681,7 @@ ice_tc_forward_to_queue(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr, struct ice_vsi *ch_vsi = NULL; u16 queue = act->rx_queue; - if (queue > vsi->num_rxq) { + if (queue >= vsi->num_rxq) { NL_SET_ERR_MSG_MOD(fltr->extack, "Unable to add filter because specified queue is invalid"); return -EINVAL; diff --git a/drivers/net/ethernet/intel/ice/ice_vf_mbx.c b/drivers/net/ethernet/intel/ice/ice_vf_mbx.c index d4a4001b6e5d..f56fa94ff3d0 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_mbx.c +++ b/drivers/net/ethernet/intel/ice/ice_vf_mbx.c @@ -39,7 +39,7 @@ ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval, return ice_sq_send_cmd(hw, &hw->mailboxq, &desc, msg, msglen, cd); } -static const u32 ice_legacy_aq_to_vc_speed[15] = { +static const u32 ice_legacy_aq_to_vc_speed[] = { VIRTCHNL_LINK_SPEED_100MB, /* BIT(0) */ VIRTCHNL_LINK_SPEED_100MB, VIRTCHNL_LINK_SPEED_1GB, @@ -51,10 +51,6 @@ static const u32 ice_legacy_aq_to_vc_speed[15] = { VIRTCHNL_LINK_SPEED_40GB, VIRTCHNL_LINK_SPEED_40GB, VIRTCHNL_LINK_SPEED_40GB, - VIRTCHNL_LINK_SPEED_UNKNOWN, - VIRTCHNL_LINK_SPEED_UNKNOWN, - VIRTCHNL_LINK_SPEED_UNKNOWN, - VIRTCHNL_LINK_SPEED_UNKNOWN /* BIT(14) */ }; /** @@ -71,21 +67,20 @@ static const u32 ice_legacy_aq_to_vc_speed[15] = { */ u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed) { - u32 speed; + /* convert a BIT() value into an array index */ + u32 index = fls(link_speed) - 1; - if (adv_link_support) { - /* convert a BIT() value into an array index */ - speed = ice_get_link_speed(fls(link_speed) - 1); - } else { + if (adv_link_support) + return ice_get_link_speed(index); + else if (index < ARRAY_SIZE(ice_legacy_aq_to_vc_speed)) /* Virtchnl speeds are not 
defined for every speed supported in * the hardware. To maintain compatibility with older AVF * drivers, while reporting the speed the new speed values are * resolved to the closest known virtchnl speeds */ - speed = ice_legacy_aq_to_vc_speed[fls(link_speed) - 1]; - } + return ice_legacy_aq_to_vc_speed[index]; - return speed; + return VIRTCHNL_LINK_SPEED_UNKNOWN; } /* The mailbox overflow detection algorithm helps to check if there diff --git a/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c index 5ecc0ee9a78e..b1ffb81893d4 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c +++ b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c @@ -44,13 +44,17 @@ void ice_vf_vsi_init_vlan_ops(struct ice_vsi *vsi) /* outer VLAN ops regardless of port VLAN config */ vlan_ops->add_vlan = ice_vsi_add_vlan; - vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering; vlan_ops->ena_tx_filtering = ice_vsi_ena_tx_vlan_filtering; vlan_ops->dis_tx_filtering = ice_vsi_dis_tx_vlan_filtering; if (ice_vf_is_port_vlan_ena(vf)) { /* setup outer VLAN ops */ vlan_ops->set_port_vlan = ice_vsi_set_outer_port_vlan; + /* all Rx traffic should be in the domain of the + * assigned port VLAN, so prevent disabling Rx VLAN + * filtering + */ + vlan_ops->dis_rx_filtering = noop_vlan; vlan_ops->ena_rx_filtering = ice_vsi_ena_rx_vlan_filtering; @@ -63,6 +67,9 @@ void ice_vf_vsi_init_vlan_ops(struct ice_vsi *vsi) vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion; vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion; } else { + vlan_ops->dis_rx_filtering = + ice_vsi_dis_rx_vlan_filtering; + if (!test_bit(ICE_FLAG_VF_VLAN_PRUNING, pf->flags)) vlan_ops->ena_rx_filtering = noop_vlan; else @@ -96,7 +103,14 @@ void ice_vf_vsi_init_vlan_ops(struct ice_vsi *vsi) vlan_ops->set_port_vlan = ice_vsi_set_inner_port_vlan; vlan_ops->ena_rx_filtering = ice_vsi_ena_rx_vlan_filtering; + /* all Rx traffic should be in the domain of the + * 
assigned port VLAN, so prevent disabling Rx VLAN + * filtering + */ + vlan_ops->dis_rx_filtering = noop_vlan; } else { + vlan_ops->dis_rx_filtering = + ice_vsi_dis_rx_vlan_filtering; if (!test_bit(ICE_FLAG_VF_VLAN_PRUNING, pf->flags)) vlan_ops->ena_rx_filtering = noop_vlan; else diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 7105de6fb344..374b7f10b549 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -800,6 +800,7 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) struct ice_tx_desc *tx_desc; u16 cnt = xdp_ring->count; struct ice_tx_buf *tx_buf; + u16 completed_frames = 0; u16 xsk_frames = 0; u16 last_rs; int i; @@ -809,19 +810,21 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) if ((tx_desc->cmd_type_offset_bsz & cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE))) { if (last_rs >= ntc) - xsk_frames = last_rs - ntc + 1; + completed_frames = last_rs - ntc + 1; else - xsk_frames = last_rs + cnt - ntc + 1; + completed_frames = last_rs + cnt - ntc + 1; } - if (!xsk_frames) + if (!completed_frames) return; - if (likely(!xdp_ring->xdp_tx_active)) + if (likely(!xdp_ring->xdp_tx_active)) { + xsk_frames = completed_frames; goto skip; + } ntc = xdp_ring->next_to_clean; - for (i = 0; i < xsk_frames; i++) { + for (i = 0; i < completed_frames; i++) { tx_buf = &xdp_ring->tx_buf[ntc]; if (tx_buf->raw_buf) { @@ -837,7 +840,7 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) } skip: tx_desc->cmd_type_offset_bsz = 0; - xdp_ring->next_to_clean += xsk_frames; + xdp_ring->next_to_clean += completed_frames; if (xdp_ring->next_to_clean >= cnt) xdp_ring->next_to_clean -= cnt; if (xsk_frames) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 3c0c35ecea10..b5b443883da9 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2256,6 +2256,30 
@@ static void igb_enable_mas(struct igb_adapter *adapter) } } +#ifdef CONFIG_IGB_HWMON +/** + * igb_set_i2c_bb - Init I2C interface + * @hw: pointer to hardware structure + **/ +static void igb_set_i2c_bb(struct e1000_hw *hw) +{ + u32 ctrl_ext; + s32 i2cctl; + + ctrl_ext = rd32(E1000_CTRL_EXT); + ctrl_ext |= E1000_CTRL_I2C_ENA; + wr32(E1000_CTRL_EXT, ctrl_ext); + wrfl(); + + i2cctl = rd32(E1000_I2CPARAMS); + i2cctl |= E1000_I2CBB_EN + | E1000_I2C_CLK_OE_N + | E1000_I2C_DATA_OE_N; + wr32(E1000_I2CPARAMS, i2cctl); + wrfl(); +} +#endif + void igb_reset(struct igb_adapter *adapter) { struct pci_dev *pdev = adapter->pdev; @@ -2400,7 +2424,8 @@ void igb_reset(struct igb_adapter *adapter) * interface. */ if (adapter->ets) - mac->ops.init_thermal_sensor_thresh(hw); + igb_set_i2c_bb(hw); + mac->ops.init_thermal_sensor_thresh(hw); } } #endif @@ -3117,21 +3142,12 @@ static void igb_init_mas(struct igb_adapter *adapter) **/ static s32 igb_init_i2c(struct igb_adapter *adapter) { - struct e1000_hw *hw = &adapter->hw; s32 status = 0; - s32 i2cctl; /* I2C interface supported on i350 devices */ if (adapter->hw.mac.type != e1000_i350) return 0; - i2cctl = rd32(E1000_I2CPARAMS); - i2cctl |= E1000_I2CBB_EN - | E1000_I2C_CLK_OUT | E1000_I2C_CLK_OE_N - | E1000_I2C_DATA_OUT | E1000_I2C_DATA_OE_N; - wr32(E1000_I2CPARAMS, i2cctl); - wrfl(); - /* Initialize the i2c bus which is controlled by the registers. * This bus will use the i2c_algo_bit structure that implements * the protocol through toggling of the 4 bits in the register. @@ -3521,6 +3537,12 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->ets = true; else adapter->ets = false; + /* Only enable I2C bit banging if an external thermal + * sensor is supported. 
+ */ + if (adapter->ets) + igb_set_i2c_bb(hw); + hw->mac.ops.init_thermal_sensor_thresh(hw); if (igb_sysfs_init(adapter)) dev_err(&pdev->dev, "failed to allocate sysfs resources\n"); @@ -6794,7 +6816,7 @@ static void igb_perout(struct igb_adapter *adapter, int tsintr_tt) struct timespec64 ts; u32 tsauxc; - if (pin < 0 || pin >= IGB_N_PEROUT) + if (pin < 0 || pin >= IGB_N_SDP) return; spin_lock(&adapter->tmreg_lock); @@ -6802,7 +6824,7 @@ static void igb_perout(struct igb_adapter *adapter, int tsintr_tt) if (hw->mac.type == e1000_82580 || hw->mac.type == e1000_i354 || hw->mac.type == e1000_i350) { - s64 ns = timespec64_to_ns(&adapter->perout[pin].period); + s64 ns = timespec64_to_ns(&adapter->perout[tsintr_tt].period); u32 systiml, systimh, level_mask, level, rem; u64 systim, now; @@ -6850,8 +6872,8 @@ static void igb_perout(struct igb_adapter *adapter, int tsintr_tt) ts.tv_nsec = (u32)systim; ts.tv_sec = ((u32)(systim >> 32)) & 0xFF; } else { - ts = timespec64_add(adapter->perout[pin].start, - adapter->perout[pin].period); + ts = timespec64_add(adapter->perout[tsintr_tt].start, + adapter->perout[tsintr_tt].period); } /* u32 conversion of tv_sec is safe until y2106 */ @@ -6860,7 +6882,7 @@ static void igb_perout(struct igb_adapter *adapter, int tsintr_tt) tsauxc = rd32(E1000_TSAUXC); tsauxc |= TSAUXC_EN_TT0; wr32(E1000_TSAUXC, tsauxc); - adapter->perout[pin].start = ts; + adapter->perout[tsintr_tt].start = ts; spin_unlock(&adapter->tmreg_lock); } @@ -6874,7 +6896,7 @@ static void igb_extts(struct igb_adapter *adapter, int tsintr_tt) struct ptp_clock_event event; struct timespec64 ts; - if (pin < 0 || pin >= IGB_N_EXTTS) + if (pin < 0 || pin >= IGB_N_SDP) return; if (hw->mac.type == e1000_82580 || diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 44b1740dc098..1dd2a7fee8d4 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2942,7 +2942,9 @@ static bool 
igc_clean_tx_irq(struct igc_q_vector *q_vector, int napi_budget) if (tx_buffer->next_to_watch && time_after(jiffies, tx_buffer->time_stamp + (adapter->tx_timeout_factor * HZ)) && - !(rd32(IGC_STATUS) & IGC_STATUS_TXOFF)) { + !(rd32(IGC_STATUS) & IGC_STATUS_TXOFF) && + (rd32(IGC_TDH(tx_ring->reg_idx)) != + readl(tx_ring->tail))) { /* detected Tx unit hang */ netdev_err(tx_ring->netdev, "Detected Tx Unit Hang\n" @@ -5069,6 +5071,24 @@ static int igc_change_mtu(struct net_device *netdev, int new_mtu) } /** + * igc_tx_timeout - Respond to a Tx Hang + * @netdev: network interface device structure + * @txqueue: queue number that timed out + **/ +static void igc_tx_timeout(struct net_device *netdev, + unsigned int __always_unused txqueue) +{ + struct igc_adapter *adapter = netdev_priv(netdev); + struct igc_hw *hw = &adapter->hw; + + /* Do the reset outside of interrupt context */ + adapter->tx_timeout_count++; + schedule_work(&adapter->reset_task); + wr32(IGC_EICS, + (adapter->eims_enable_mask & ~adapter->eims_other)); +} + +/** * igc_get_stats64 - Get System Network Statistics * @netdev: network interface device structure * @stats: rtnl_link_stats64 pointer @@ -5495,7 +5515,7 @@ static void igc_watchdog_task(struct work_struct *work) case SPEED_100: case SPEED_1000: case SPEED_2500: - adapter->tx_timeout_factor = 7; + adapter->tx_timeout_factor = 1; break; } @@ -6320,6 +6340,7 @@ static const struct net_device_ops igc_netdev_ops = { .ndo_set_rx_mode = igc_set_rx_mode, .ndo_set_mac_address = igc_set_mac, .ndo_change_mtu = igc_change_mtu, + .ndo_tx_timeout = igc_tx_timeout, .ndo_get_stats64 = igc_get_stats64, .ndo_fix_features = igc_fix_features, .ndo_set_features = igc_set_features, diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index c34734d432e0..4e10ced736db 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -417,10 +417,12 @@ static int igc_ptp_verify_pin(struct 
ptp_clock_info *ptp, unsigned int pin, * * We need to convert the system time value stored in the RX/TXSTMP registers * into a hwtstamp which can be used by the upper level timestamping functions. + * + * Returns 0 on success. **/ -static void igc_ptp_systim_to_hwtstamp(struct igc_adapter *adapter, - struct skb_shared_hwtstamps *hwtstamps, - u64 systim) +static int igc_ptp_systim_to_hwtstamp(struct igc_adapter *adapter, + struct skb_shared_hwtstamps *hwtstamps, + u64 systim) { switch (adapter->hw.mac.type) { case igc_i225: @@ -430,8 +432,9 @@ static void igc_ptp_systim_to_hwtstamp(struct igc_adapter *adapter, systim & 0xFFFFFFFF); break; default: - break; + return -EINVAL; } + return 0; } /** @@ -652,7 +655,8 @@ static void igc_ptp_tx_hwtstamp(struct igc_adapter *adapter) regval = rd32(IGC_TXSTMPL); regval |= (u64)rd32(IGC_TXSTMPH) << 32; - igc_ptp_systim_to_hwtstamp(adapter, &shhwtstamps, regval); + if (igc_ptp_systim_to_hwtstamp(adapter, &shhwtstamps, regval)) + return; switch (adapter->link_speed) { case SPEED_10: diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index bc68b8f2176d..8736ca4b2628 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -73,6 +73,8 @@ #define IXGBE_RXBUFFER_4K 4096 #define IXGBE_MAX_RXBUFFER 16384 /* largest size for a single descriptor */ +#define IXGBE_PKT_HDR_PAD (ETH_HLEN + ETH_FCS_LEN + (VLAN_HLEN * 2)) + /* Attempt to maximize the headroom available for incoming frames. We * use a 2K buffer for receives and need 1536/1534 to store the data for * the frame. This leaves us with 512 bytes of room. 
From that we need diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index ab8370c413f3..4507fba8747a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6778,6 +6778,18 @@ static void ixgbe_free_all_rx_resources(struct ixgbe_adapter *adapter) } /** + * ixgbe_max_xdp_frame_size - returns the maximum allowed frame size for XDP + * @adapter: device handle, pointer to adapter + */ +static int ixgbe_max_xdp_frame_size(struct ixgbe_adapter *adapter) +{ + if (PAGE_SIZE >= 8192 || adapter->flags2 & IXGBE_FLAG2_RX_LEGACY) + return IXGBE_RXBUFFER_2K; + else + return IXGBE_RXBUFFER_3K; +} + +/** * ixgbe_change_mtu - Change the Maximum Transfer Unit * @netdev: network interface device structure * @new_mtu: new value for maximum frame size @@ -6788,18 +6800,12 @@ static int ixgbe_change_mtu(struct net_device *netdev, int new_mtu) { struct ixgbe_adapter *adapter = netdev_priv(netdev); - if (adapter->xdp_prog) { - int new_frame_size = new_mtu + ETH_HLEN + ETH_FCS_LEN + - VLAN_HLEN; - int i; - - for (i = 0; i < adapter->num_rx_queues; i++) { - struct ixgbe_ring *ring = adapter->rx_ring[i]; + if (ixgbe_enabled_xdp_adapter(adapter)) { + int new_frame_size = new_mtu + IXGBE_PKT_HDR_PAD; - if (new_frame_size > ixgbe_rx_bufsz(ring)) { - e_warn(probe, "Requested MTU size is not supported with XDP\n"); - return -EINVAL; - } + if (new_frame_size > ixgbe_max_xdp_frame_size(adapter)) { + e_warn(probe, "Requested MTU size is not supported with XDP\n"); + return -EINVAL; } } diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c index bda1a6fa2ec4..e4407f09c9d3 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c @@ -1500,6 +1500,9 @@ static const struct devlink_param rvu_af_dl_params[] = { BIT(DEVLINK_PARAM_CMODE_RUNTIME), 
rvu_af_dl_dwrr_mtu_get, rvu_af_dl_dwrr_mtu_set, rvu_af_dl_dwrr_mtu_validate), +}; + +static const struct devlink_param rvu_af_dl_param_exact_match[] = { DEVLINK_PARAM_DRIVER(RVU_AF_DEVLINK_PARAM_ID_NPC_EXACT_FEATURE_DISABLE, "npc_exact_feature_disable", DEVLINK_PARAM_TYPE_STRING, BIT(DEVLINK_PARAM_CMODE_RUNTIME), @@ -1556,7 +1559,6 @@ int rvu_register_dl(struct rvu *rvu) { struct rvu_devlink *rvu_dl; struct devlink *dl; - size_t size; int err; dl = devlink_alloc(&rvu_devlink_ops, sizeof(struct rvu_devlink), @@ -1578,21 +1580,32 @@ int rvu_register_dl(struct rvu *rvu) goto err_dl_health; } + err = devlink_params_register(dl, rvu_af_dl_params, ARRAY_SIZE(rvu_af_dl_params)); + if (err) { + dev_err(rvu->dev, + "devlink params register failed with error %d", err); + goto err_dl_health; + } + /* Register exact match devlink only for CN10K-B */ - size = ARRAY_SIZE(rvu_af_dl_params); if (!rvu_npc_exact_has_match_table(rvu)) - size -= 1; + goto done; - err = devlink_params_register(dl, rvu_af_dl_params, size); + err = devlink_params_register(dl, rvu_af_dl_param_exact_match, + ARRAY_SIZE(rvu_af_dl_param_exact_match)); if (err) { dev_err(rvu->dev, - "devlink params register failed with error %d", err); - goto err_dl_health; + "devlink exact match params register failed with error %d", err); + goto err_dl_exact_match; } +done: devlink_register(dl); return 0; +err_dl_exact_match: + devlink_params_unregister(dl, rvu_af_dl_params, ARRAY_SIZE(rvu_af_dl_params)); + err_dl_health: rvu_health_reporters_destroy(rvu); devlink_free(dl); @@ -1605,8 +1618,14 @@ void rvu_unregister_dl(struct rvu *rvu) struct devlink *dl = rvu_dl->dl; devlink_unregister(dl); - devlink_params_unregister(dl, rvu_af_dl_params, - ARRAY_SIZE(rvu_af_dl_params)); + + devlink_params_unregister(dl, rvu_af_dl_params, ARRAY_SIZE(rvu_af_dl_params)); + + /* Unregister exact match devlink only for CN10K-B */ + if (rvu_npc_exact_has_match_table(rvu)) + devlink_params_unregister(dl, rvu_af_dl_param_exact_match, + 
ARRAY_SIZE(rvu_af_dl_param_exact_match)); + rvu_health_reporters_destroy(rvu); devlink_free(dl); } diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index e3de9a53b2d9..e3123723522e 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -1570,8 +1570,8 @@ static struct page_pool *mtk_create_page_pool(struct mtk_eth *eth, if (IS_ERR(pp)) return pp; - err = __xdp_rxq_info_reg(xdp_q, &eth->dummy_dev, eth->rx_napi.napi_id, - id, PAGE_SIZE); + err = __xdp_rxq_info_reg(xdp_q, &eth->dummy_dev, id, + eth->rx_napi.napi_id, PAGE_SIZE); if (err < 0) goto err_free_pp; @@ -1870,7 +1870,9 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget, while (done < budget) { unsigned int pktlen, *rxdcsum; + bool has_hwaccel_tag = false; struct net_device *netdev; + u16 vlan_proto, vlan_tci; dma_addr_t dma_addr; u32 hash, reason; int mac = 0; @@ -2010,27 +2012,29 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget, if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) { if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { - if (trxd.rxd3 & RX_DMA_VTAG_V2) - __vlan_hwaccel_put_tag(skb, - htons(RX_DMA_VPID(trxd.rxd4)), - RX_DMA_VID(trxd.rxd4)); + if (trxd.rxd3 & RX_DMA_VTAG_V2) { + vlan_proto = RX_DMA_VPID(trxd.rxd4); + vlan_tci = RX_DMA_VID(trxd.rxd4); + has_hwaccel_tag = true; + } } else if (trxd.rxd2 & RX_DMA_VTAG) { - __vlan_hwaccel_put_tag(skb, htons(RX_DMA_VPID(trxd.rxd3)), - RX_DMA_VID(trxd.rxd3)); + vlan_proto = RX_DMA_VPID(trxd.rxd3); + vlan_tci = RX_DMA_VID(trxd.rxd3); + has_hwaccel_tag = true; } } /* When using VLAN untagging in combination with DSA, the * hardware treats the MTK special tag as a VLAN and untags it.
*/ - if (skb_vlan_tag_present(skb) && netdev_uses_dsa(netdev)) { - unsigned int port = ntohs(skb->vlan_proto) & GENMASK(2, 0); + if (has_hwaccel_tag && netdev_uses_dsa(netdev)) { + unsigned int port = vlan_proto & GENMASK(2, 0); if (port < ARRAY_SIZE(eth->dsa_meta) && eth->dsa_meta[port]) skb_dst_set_noref(skb, &eth->dsa_meta[port]->dst); - - __vlan_hwaccel_clear_tag(skb); + } else if (has_hwaccel_tag) { + __vlan_hwaccel_put_tag(skb, htons(vlan_proto), vlan_tci); } skb_record_rx_queue(skb, 0); @@ -3111,7 +3115,7 @@ static void mtk_gdm_config(struct mtk_eth *eth, u32 config) val |= config; - if (!i && eth->netdev[0] && netdev_uses_dsa(eth->netdev[0])) + if (eth->netdev[i] && netdev_uses_dsa(eth->netdev[i])) val |= MTK_GDMA_SPECIAL_TAG; mtk_w32(eth, val, MTK_GDMA_FWD_CFG(i)); diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h index 18a50529ce7b..2d9186d32bc0 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h @@ -519,7 +519,7 @@ #define SGMII_SPEED_10 FIELD_PREP(SGMII_SPEED_MASK, 0) #define SGMII_SPEED_100 FIELD_PREP(SGMII_SPEED_MASK, 1) #define SGMII_SPEED_1000 FIELD_PREP(SGMII_SPEED_MASK, 2) -#define SGMII_DUPLEX_FULL BIT(4) +#define SGMII_DUPLEX_HALF BIT(4) #define SGMII_IF_MODE_BIT5 BIT(5) #define SGMII_REMOTE_FAULT_DIS BIT(8) #define SGMII_CODE_SYNC_SET_VAL BIT(9) @@ -1036,11 +1036,13 @@ struct mtk_soc_data { * @regmap: The register map pointing at the range used to setup * SGMII modes * @ana_rgc3: The offset refers to register ANA_RGC3 related to regmap + * @interface: Currently configured interface mode * @pcs: Phylink PCS structure */ struct mtk_pcs { struct regmap *regmap; u32 ana_rgc3; + phy_interface_t interface; struct phylink_pcs pcs; }; diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.c b/drivers/net/ethernet/mediatek/mtk_ppe.c index 269208a841c7..1ff024f42444 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.c +++
b/drivers/net/ethernet/mediatek/mtk_ppe.c @@ -615,8 +615,7 @@ mtk_foe_entry_commit_subflow(struct mtk_ppe *ppe, struct mtk_flow_entry *entry, u32 ib1_mask = mtk_get_ib1_pkt_type_mask(ppe->eth) | MTK_FOE_IB1_UDP; int type; - flow_info = kzalloc(offsetof(struct mtk_flow_entry, l2_data.end), - GFP_ATOMIC); + flow_info = kzalloc(sizeof(*flow_info), GFP_ATOMIC); if (!flow_info) return; diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.h b/drivers/net/ethernet/mediatek/mtk_ppe.h index ea64fac1d425..b5e432031340 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.h +++ b/drivers/net/ethernet/mediatek/mtk_ppe.h @@ -279,7 +279,6 @@ struct mtk_flow_entry { struct { struct mtk_flow_entry *base_flow; struct hlist_node list; - struct {} end; } l2_data; }; struct rhash_head node; diff --git a/drivers/net/ethernet/mediatek/mtk_sgmii.c b/drivers/net/ethernet/mediatek/mtk_sgmii.c index 5c286f2c9418..bb00de1003ac 100644 --- a/drivers/net/ethernet/mediatek/mtk_sgmii.c +++ b/drivers/net/ethernet/mediatek/mtk_sgmii.c @@ -43,11 +43,6 @@ static int mtk_pcs_config(struct phylink_pcs *pcs, unsigned int mode, int advertise, link_timer; bool changed, use_an; - if (interface == PHY_INTERFACE_MODE_2500BASEX) - rgc3 = RG_PHY_SPEED_3_125G; - else - rgc3 = 0; - advertise = phylink_mii_c22_pcs_encode_advertisement(interface, advertising); if (advertise < 0) @@ -88,9 +83,22 @@ static int mtk_pcs_config(struct phylink_pcs *pcs, unsigned int mode, bmcr = 0; } - /* Configure the underlying interface speed */ - regmap_update_bits(mpcs->regmap, mpcs->ana_rgc3, - RG_PHY_SPEED_3_125G, rgc3); + if (mpcs->interface != interface) { + /* PHYA power down */ + regmap_update_bits(mpcs->regmap, SGMSYS_QPHY_PWR_STATE_CTRL, + SGMII_PHYA_PWD, SGMII_PHYA_PWD); + + if (interface == PHY_INTERFACE_MODE_2500BASEX) + rgc3 = RG_PHY_SPEED_3_125G; + else + rgc3 = 0; + + /* Configure the underlying interface speed */ + regmap_update_bits(mpcs->regmap, mpcs->ana_rgc3, + RG_PHY_SPEED_3_125G, rgc3); + + mpcs->interface = 
interface; + } /* Update the advertisement, noting whether it has changed */ regmap_update_bits_check(mpcs->regmap, SGMSYS_PCS_ADVERTISE, @@ -108,9 +116,17 @@ static int mtk_pcs_config(struct phylink_pcs *pcs, unsigned int mode, regmap_update_bits(mpcs->regmap, SGMSYS_PCS_CONTROL_1, SGMII_AN_RESTART | SGMII_AN_ENABLE, bmcr); - /* Release PHYA power down state */ - regmap_update_bits(mpcs->regmap, SGMSYS_QPHY_PWR_STATE_CTRL, - SGMII_PHYA_PWD, 0); + /* Release PHYA power down state + * Only removing bit SGMII_PHYA_PWD isn't enough. + * There are cases when the SGMII_PHYA_PWD register contains 0x9 which + * prevents SGMII from working. The SGMII still shows link but no traffic + * can flow. Writing 0x0 to the PHYA_PWD register fix the issue. 0x0 was + * taken from a good working state of the SGMII interface. + * Unknown how much the QPHY needs but it is racy without a sleep. + * Tested on mt7622 & mt7986. + */ + usleep_range(50, 100); + regmap_write(mpcs->regmap, SGMSYS_QPHY_PWR_STATE_CTRL, 0); return changed; } @@ -138,11 +154,11 @@ static void mtk_pcs_link_up(struct phylink_pcs *pcs, unsigned int mode, else sgm_mode = SGMII_SPEED_1000; - if (duplex == DUPLEX_FULL) - sgm_mode |= SGMII_DUPLEX_FULL; + if (duplex != DUPLEX_FULL) + sgm_mode |= SGMII_DUPLEX_HALF; regmap_update_bits(mpcs->regmap, SGMSYS_SGMII_MODE, - SGMII_DUPLEX_FULL | SGMII_SPEED_MASK, + SGMII_DUPLEX_HALF | SGMII_SPEED_MASK, sgm_mode); } } @@ -171,6 +187,8 @@ int mtk_sgmii_init(struct mtk_sgmii *ss, struct device_node *r, u32 ana_rgc3) return PTR_ERR(ss->pcs[i].regmap); ss->pcs[i].pcs.ops = &mtk_pcs_ops; + ss->pcs[i].pcs.poll = true; + ss->pcs[i].interface = PHY_INTERFACE_MODE_NA; } return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 3e232a65a0c3..bb95b40d25eb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -245,8 +245,9 @@ void 
mlx5_pages_debugfs_init(struct mlx5_core_dev *dev) pages = dev->priv.dbg.pages_debugfs; debugfs_create_u32("fw_pages_total", 0400, pages, &dev->priv.fw_pages); - debugfs_create_u32("fw_pages_vfs", 0400, pages, &dev->priv.vfs_pages); - debugfs_create_u32("fw_pages_host_pf", 0400, pages, &dev->priv.host_pf_pages); + debugfs_create_u32("fw_pages_vfs", 0400, pages, &dev->priv.page_counters[MLX5_VF]); + debugfs_create_u32("fw_pages_sfs", 0400, pages, &dev->priv.page_counters[MLX5_SF]); + debugfs_create_u32("fw_pages_host_pf", 0400, pages, &dev->priv.page_counters[MLX5_HOST_PF]); debugfs_create_u32("fw_pages_alloc_failed", 0400, pages, &dev->priv.fw_pages_alloc_failed); debugfs_create_u32("fw_pages_give_dropped", 0400, pages, &dev->priv.give_pages_dropped); debugfs_create_u32("fw_pages_reclaim_discard", 0400, pages, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c index 21831386b26e..5b05b884b5fb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c @@ -64,6 +64,7 @@ static int mlx5_query_mtrc_caps(struct mlx5_fw_tracer *tracer) MLX5_GET(mtrc_cap, out, num_string_trace); tracer->str_db.num_string_db = MLX5_GET(mtrc_cap, out, num_string_db); tracer->owner = !!MLX5_GET(mtrc_cap, out, trace_owner); + tracer->str_db.loaded = false; for (i = 0; i < tracer->str_db.num_string_db; i++) { mtrc_cap_sp = MLX5_ADDR_OF(mtrc_cap, out, string_db_param[i]); @@ -756,6 +757,7 @@ static int mlx5_fw_tracer_set_mtrc_conf(struct mlx5_fw_tracer *tracer) if (err) mlx5_core_warn(dev, "FWTracer: Failed to set tracer configurations %d\n", err); + tracer->buff.consumer_index = 0; return err; } @@ -820,7 +822,6 @@ static void mlx5_fw_tracer_ownership_change(struct work_struct *work) mlx5_core_dbg(tracer->dev, "FWTracer: ownership changed, current=(%d)\n", tracer->owner); if (tracer->owner) { tracer->owner = false; - tracer->buff.consumer_index 
= 0; return; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c index 464eb3a18450..cdc87ecae5d3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c @@ -87,7 +87,7 @@ void mlx5_ec_cleanup(struct mlx5_core_dev *dev) mlx5_host_pf_cleanup(dev); - err = mlx5_wait_for_pages(dev, &dev->priv.host_pf_pages); + err = mlx5_wait_for_pages(dev, &dev->priv.page_counters[MLX5_HOST_PF]); if (err) mlx5_core_warn(dev, "Timeout reclaiming external host PF pages err(%d)\n", err); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c index 8099a21e674c..ce85b48d327d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c @@ -438,10 +438,6 @@ static int mlx5_esw_bridge_switchdev_event(struct notifier_block *nb, switch (event) { case SWITCHDEV_FDB_ADD_TO_BRIDGE: - /* only handle the event on native eswtich of representor */ - if (!mlx5_esw_bridge_is_local(dev, rep, esw)) - break; - fdb_info = container_of(info, struct switchdev_notifier_fdb_info, info); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index 1892ccb889b3..7cd36f4ac3ef 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -443,7 +443,7 @@ void mlx5e_enable_cvlan_filter(struct mlx5e_flow_steering *fs, bool promisc) void mlx5e_disable_cvlan_filter(struct mlx5e_flow_steering *fs, bool promisc) { - if (fs->vlan->cvlan_filter_disabled) + if (!fs->vlan || fs->vlan->cvlan_filter_disabled) return; fs->vlan->cvlan_filter_disabled = true; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index abcc614b6191..6c24f33a5ea5 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -591,7 +591,8 @@ static int mlx5e_init_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param rq->ix = c->ix; rq->channel = c; rq->mdev = mdev; - rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + rq->hw_mtu = + MLX5E_SW2HW_MTU(params, params->sw_mtu) - ETH_FCS_LEN * !params->scatter_fcs_en; rq->xdpsq = &c->rq_xdpsq; rq->stats = &c->priv->channel_stats[c->ix]->rq; rq->ptp_cyc2time = mlx5_rq_ts_translator(mdev); @@ -1014,35 +1015,6 @@ int mlx5e_flush_rq(struct mlx5e_rq *rq, int curr_state) return mlx5e_rq_to_ready(rq, curr_state); } -static int mlx5e_modify_rq_scatter_fcs(struct mlx5e_rq *rq, bool enable) -{ - struct mlx5_core_dev *mdev = rq->mdev; - - void *in; - void *rqc; - int inlen; - int err; - - inlen = MLX5_ST_SZ_BYTES(modify_rq_in); - in = kvzalloc(inlen, GFP_KERNEL); - if (!in) - return -ENOMEM; - - rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); - - MLX5_SET(modify_rq_in, in, rq_state, MLX5_RQC_STATE_RDY); - MLX5_SET64(modify_rq_in, in, modify_bitmask, - MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_SCATTER_FCS); - MLX5_SET(rqc, rqc, scatter_fcs, enable); - MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RDY); - - err = mlx5_core_modify_rq(mdev, rq->rqn, in); - - kvfree(in); - - return err; -} - static int mlx5e_modify_rq_vsd(struct mlx5e_rq *rq, bool vsd) { struct mlx5_core_dev *mdev = rq->mdev; @@ -3314,20 +3286,6 @@ static void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv) mlx5e_destroy_tises(priv); } -static int mlx5e_modify_channels_scatter_fcs(struct mlx5e_channels *chs, bool enable) -{ - int err = 0; - int i; - - for (i = 0; i < chs->num; i++) { - err = mlx5e_modify_rq_scatter_fcs(&chs->c[i]->rq, enable); - if (err) - return err; - } - - return 0; -} - static int mlx5e_modify_channels_vsd(struct mlx5e_channels *chs, bool vsd) { int err; @@ -3903,41 +3861,27 @@ static int mlx5e_set_rx_port_ts(struct mlx5_core_dev *mdev, bool enable) return 
mlx5_set_ports_check(mdev, in, sizeof(in)); } +static int mlx5e_set_rx_port_ts_wrap(struct mlx5e_priv *priv, void *ctx) +{ + struct mlx5_core_dev *mdev = priv->mdev; + bool enable = *(bool *)ctx; + + return mlx5e_set_rx_port_ts(mdev, enable); +} + static int set_feature_rx_fcs(struct net_device *netdev, bool enable) { struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5e_channels *chs = &priv->channels; - struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_params new_params; int err; mutex_lock(&priv->state_lock); - if (enable) { - err = mlx5e_set_rx_port_ts(mdev, false); - if (err) - goto out; - - chs->params.scatter_fcs_en = true; - err = mlx5e_modify_channels_scatter_fcs(chs, true); - if (err) { - chs->params.scatter_fcs_en = false; - mlx5e_set_rx_port_ts(mdev, true); - } - } else { - chs->params.scatter_fcs_en = false; - err = mlx5e_modify_channels_scatter_fcs(chs, false); - if (err) { - chs->params.scatter_fcs_en = true; - goto out; - } - err = mlx5e_set_rx_port_ts(mdev, true); - if (err) { - mlx5_core_warn(mdev, "Failed to set RX port timestamp %d\n", err); - err = 0; - } - } - -out: + new_params = chs->params; + new_params.scatter_fcs_en = enable; + err = mlx5e_safe_switch_params(priv, &new_params, mlx5e_set_rx_port_ts_wrap, + &new_params.scatter_fcs_en, true); mutex_unlock(&priv->state_lock); return err; } @@ -4074,6 +4018,10 @@ static netdev_features_t mlx5e_fix_uplink_rep_features(struct net_device *netdev if (netdev->features & NETIF_F_GRO_HW) netdev_warn(netdev, "Disabling HW_GRO, not supported in switchdev mode\n"); + features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; + if (netdev->features & NETIF_F_HW_VLAN_CTAG_FILTER) + netdev_warn(netdev, "Disabling HW_VLAN CTAG FILTERING, not supported in switchdev mode\n"); + return features; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c index b176648d1343..3cdcb0e0b20f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c 
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c @@ -1715,7 +1715,7 @@ void mlx5_esw_bridge_fdb_update_used(struct net_device *dev, u16 vport_num, u16 struct mlx5_esw_bridge *bridge; port = mlx5_esw_bridge_port_lookup(vport_num, esw_owner_vhca_id, br_offloads); - if (!port || port->flags & MLX5_ESW_BRIDGE_PORT_FLAG_PEER) + if (!port) return; bridge = port->bridge; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c index eff92dc0927c..e09518f887a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c @@ -189,16 +189,16 @@ static inline int mlx5_ptys_rate_enum_to_int(enum mlx5_ptys_rate rate) } } -static int mlx5i_get_speed_settings(u16 ib_link_width_oper, u16 ib_proto_oper) +static u32 mlx5i_get_speed_settings(u16 ib_link_width_oper, u16 ib_proto_oper) { int rate, width; rate = mlx5_ptys_rate_enum_to_int(ib_proto_oper); if (rate < 0) - return -EINVAL; + return SPEED_UNKNOWN; width = mlx5_ptys_width_enum_to_int(ib_link_width_oper); if (width < 0) - return -EINVAL; + return SPEED_UNKNOWN; return rate * width; } @@ -221,16 +221,13 @@ static int mlx5i_get_link_ksettings(struct net_device *netdev, ethtool_link_ksettings_zero_link_mode(link_ksettings, advertising); speed = mlx5i_get_speed_settings(ib_link_width_oper, ib_proto_oper); - if (speed < 0) - return -EINVAL; + link_ksettings->base.speed = speed; + link_ksettings->base.duplex = speed == SPEED_UNKNOWN ? 
DUPLEX_UNKNOWN : DUPLEX_FULL; - link_ksettings->base.duplex = DUPLEX_FULL; link_ksettings->base.port = PORT_OTHER; link_ksettings->base.autoneg = AUTONEG_DISABLE; - link_ksettings->base.speed = speed; - return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 3d5f2a4b1fed..4e1b5757528a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -2110,7 +2110,7 @@ static int __init mlx5_init(void) mlx5_core_verify_params(); mlx5_register_debugfs(); - err = pci_register_driver(&mlx5_core_driver); + err = mlx5e_init(); if (err) goto err_debug; @@ -2118,16 +2118,16 @@ static int __init mlx5_init(void) if (err) goto err_sf; - err = mlx5e_init(); + err = pci_register_driver(&mlx5_core_driver); if (err) - goto err_en; + goto err_pci; return 0; -err_en: +err_pci: mlx5_sf_driver_unregister(); err_sf: - pci_unregister_driver(&mlx5_core_driver); + mlx5e_cleanup(); err_debug: mlx5_unregister_debugfs(); return err; @@ -2135,9 +2135,9 @@ err_debug: static void __exit mlx5_cleanup(void) { - mlx5e_cleanup(); - mlx5_sf_driver_unregister(); pci_unregister_driver(&mlx5_core_driver); + mlx5_sf_driver_unregister(); + mlx5e_cleanup(); mlx5_unregister_debugfs(); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 60596357bfc7..0eb50be175cc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -74,6 +74,14 @@ static u32 get_function(u16 func_id, bool ec_function) return (u32)func_id | (ec_function << 16); } +static u16 func_id_to_type(struct mlx5_core_dev *dev, u16 func_id, bool ec_function) +{ + if (!func_id) + return mlx5_core_is_ecpf(dev) && !ec_function ? MLX5_HOST_PF : MLX5_PF; + + return func_id <= mlx5_core_max_vfs(dev) ? 
MLX5_VF : MLX5_SF; +} + static struct rb_root *page_root_per_function(struct mlx5_core_dev *dev, u32 function) { struct rb_root *root; @@ -332,6 +340,7 @@ static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, u32 out[MLX5_ST_SZ_DW(manage_pages_out)] = {0}; int inlen = MLX5_ST_SZ_BYTES(manage_pages_in); int notify_fail = event; + u16 func_type; u64 addr; int err; u32 *in; @@ -383,11 +392,9 @@ retry: goto out_dropped; } + func_type = func_id_to_type(dev, func_id, ec_function); + dev->priv.page_counters[func_type] += npages; dev->priv.fw_pages += npages; - if (func_id) - dev->priv.vfs_pages += npages; - else if (mlx5_core_is_ecpf(dev) && !ec_function) - dev->priv.host_pf_pages += npages; mlx5_core_dbg(dev, "npages %d, ec_function %d, func_id 0x%x, err %d\n", npages, ec_function, func_id, err); @@ -414,6 +421,7 @@ static void release_all_pages(struct mlx5_core_dev *dev, u16 func_id, struct rb_root *root; struct rb_node *p; int npages = 0; + u16 func_type; root = xa_load(&dev->priv.page_root_xa, function); if (WARN_ON_ONCE(!root)) @@ -428,11 +436,9 @@ static void release_all_pages(struct mlx5_core_dev *dev, u16 func_id, free_fwp(dev, fwp, fwp->free_count); } + func_type = func_id_to_type(dev, func_id, ec_function); + dev->priv.page_counters[func_type] -= npages; dev->priv.fw_pages -= npages; - if (func_id) - dev->priv.vfs_pages -= npages; - else if (mlx5_core_is_ecpf(dev) && !ec_function) - dev->priv.host_pf_pages -= npages; mlx5_core_dbg(dev, "npages %d, ec_function %d, func_id 0x%x\n", npages, ec_function, func_id); @@ -498,6 +504,7 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, int outlen = MLX5_ST_SZ_BYTES(manage_pages_out); u32 in[MLX5_ST_SZ_DW(manage_pages_in)] = {}; int num_claimed; + u16 func_type; u32 *out; int err; int i; @@ -549,11 +556,9 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, if (nclaimed) *nclaimed = num_claimed; + func_type = func_id_to_type(dev, func_id, 
ec_function); + dev->priv.page_counters[func_type] -= num_claimed; dev->priv.fw_pages -= num_claimed; - if (func_id) - dev->priv.vfs_pages -= num_claimed; - else if (mlx5_core_is_ecpf(dev) && !ec_function) - dev->priv.host_pf_pages -= num_claimed; out_free: kvfree(out); @@ -706,12 +711,12 @@ int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev) WARN(dev->priv.fw_pages, "FW pages counter is %d after reclaiming all pages\n", dev->priv.fw_pages); - WARN(dev->priv.vfs_pages, + WARN(dev->priv.page_counters[MLX5_VF], "VFs FW pages counter is %d after reclaiming all pages\n", - dev->priv.vfs_pages); - WARN(dev->priv.host_pf_pages, + dev->priv.page_counters[MLX5_VF]); + WARN(dev->priv.page_counters[MLX5_HOST_PF], "External host PF FW pages counter is %d after reclaiming all pages\n", - dev->priv.host_pf_pages); + dev->priv.page_counters[MLX5_HOST_PF]); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index c0e6c487c63c..3008e9ce2bbf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -147,7 +147,7 @@ mlx5_device_disable_sriov(struct mlx5_core_dev *dev, int num_vfs, bool clear_vf) mlx5_eswitch_disable_sriov(dev->priv.eswitch, clear_vf); - if (mlx5_wait_for_pages(dev, &dev->priv.vfs_pages)) + if (mlx5_wait_for_pages(dev, &dev->priv.page_counters[MLX5_VF])) mlx5_core_warn(dev, "timeout reclaiming VFs pages\n"); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c index b851141e03de..042ca0349124 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c @@ -1138,12 +1138,14 @@ dr_rule_create_rule_nic(struct mlx5dr_rule *rule, rule->flow_source)) return 0; + mlx5dr_domain_nic_lock(nic_dmn); + ret = mlx5dr_matcher_select_builders(matcher, nic_matcher, dr_rule_get_ipv(¶m->outer), 
dr_rule_get_ipv(¶m->inner)); if (ret) - return ret; + goto err_unlock; hw_ste_arr_is_opt = nic_matcher->num_of_builders <= DR_RULE_MAX_STES_OPTIMIZED; if (likely(hw_ste_arr_is_opt)) { @@ -1152,12 +1154,12 @@ dr_rule_create_rule_nic(struct mlx5dr_rule *rule, hw_ste_arr = kzalloc((nic_matcher->num_of_builders + DR_ACTION_MAX_STES) * DR_STE_SIZE, GFP_KERNEL); - if (!hw_ste_arr) - return -ENOMEM; + if (!hw_ste_arr) { + ret = -ENOMEM; + goto err_unlock; + } } - mlx5dr_domain_nic_lock(nic_dmn); - ret = mlx5dr_matcher_add_to_tbl_nic(dmn, nic_matcher); if (ret) goto free_hw_ste; @@ -1223,7 +1225,10 @@ dr_rule_create_rule_nic(struct mlx5dr_rule *rule, mlx5dr_domain_nic_unlock(nic_dmn); - goto out; + if (unlikely(!hw_ste_arr_is_opt)) + kfree(hw_ste_arr); + + return 0; free_rule: dr_rule_clean_rule_members(rule, nic_rule); @@ -1238,12 +1243,12 @@ remove_from_nic_tbl: mlx5dr_matcher_remove_from_tbl_nic(dmn, nic_matcher); free_hw_ste: - mlx5dr_domain_nic_unlock(nic_dmn); - -out: - if (unlikely(!hw_ste_arr_is_opt)) + if (!hw_ste_arr_is_opt) kfree(hw_ste_arr); +err_unlock: + mlx5dr_domain_nic_unlock(nic_dmn); + return ret; } diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c index 5314c064ceae..55b484b10562 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c @@ -608,12 +608,12 @@ allocate_new: lan966x_fdma_rx_reload(rx); } - if (counter < weight && napi_complete_done(napi, counter)) - lan_wr(0xff, lan966x, FDMA_INTR_DB_ENA); - if (redirect) xdp_do_flush(); + if (counter < weight && napi_complete_done(napi, counter)) + lan_wr(0xff, lan966x, FDMA_INTR_DB_ENA); + return counter; } diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_ptp.c b/drivers/net/ethernet/microchip/sparx5/sparx5_ptp.c index 0ed1ea7727c5..69e76634f9aa 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_ptp.c +++ 
b/drivers/net/ethernet/microchip/sparx5/sparx5_ptp.c @@ -633,7 +633,7 @@ int sparx5_ptp_init(struct sparx5 *sparx5) /* Enable master counters */ spx5_wr(PTP_PTP_DOM_CFG_PTP_ENA_SET(0x7), sparx5, PTP_PTP_DOM_CFG); - for (i = 0; i < sparx5->port_count; i++) { + for (i = 0; i < SPX5_PORTS; i++) { port = sparx5->ports[i]; if (!port) continue; @@ -649,7 +649,7 @@ void sparx5_ptp_deinit(struct sparx5 *sparx5) struct sparx5_port *port; int i; - for (i = 0; i < sparx5->port_count; i++) { + for (i = 0; i < SPX5_PORTS; i++) { port = sparx5->ports[i]; if (!port) continue; diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index b144f2237748..f9b8f372ec8a 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -1217,9 +1217,7 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev) unsigned int max_queues_per_port = num_online_cpus(); struct gdma_context *gc = pci_get_drvdata(pdev); struct gdma_irq_context *gic; - unsigned int max_irqs; - u16 *cpus; - cpumask_var_t req_mask; + unsigned int max_irqs, cpu; int nvec, irq; int err, i = 0, j; @@ -1240,21 +1238,7 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev) goto free_irq_vector; } - if (!zalloc_cpumask_var(&req_mask, GFP_KERNEL)) { - err = -ENOMEM; - goto free_irq; - } - - cpus = kcalloc(nvec, sizeof(*cpus), GFP_KERNEL); - if (!cpus) { - err = -ENOMEM; - goto free_mask; - } - for (i = 0; i < nvec; i++) - cpus[i] = cpumask_local_spread(i, gc->numa_node); - for (i = 0; i < nvec; i++) { - cpumask_set_cpu(cpus[i], req_mask); gic = &gc->irq_contexts[i]; gic->handler = NULL; gic->arg = NULL; @@ -1269,17 +1253,16 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev) irq = pci_irq_vector(pdev, i); if (irq < 0) { err = irq; - goto free_mask; + goto free_irq; } err = request_irq(irq, mana_gd_intr, 0, gic->name, gic); if (err) - goto free_mask; - irq_set_affinity_and_hint(irq, req_mask); - cpumask_clear(req_mask); 
+ goto free_irq; + + cpu = cpumask_local_spread(i, gc->numa_node); + irq_set_affinity_and_hint(irq, cpumask_of(cpu)); } - free_cpumask_var(req_mask); - kfree(cpus); err = mana_gd_alloc_res_map(nvec, &gc->msix_resource); if (err) @@ -1290,13 +1273,12 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev) return 0; -free_mask: - free_cpumask_var(req_mask); - kfree(cpus); free_irq: for (j = i - 1; j >= 0; j--) { irq = pci_irq_vector(pdev, j); gic = &gc->irq_contexts[j]; + + irq_update_affinity_hint(irq, NULL); free_irq(irq, gic); } @@ -1324,6 +1306,9 @@ static void mana_gd_remove_irqs(struct pci_dev *pdev) continue; gic = &gc->irq_contexts[i]; + + /* Need to clear the hint before free_irq */ + irq_update_affinity_hint(irq, NULL); free_irq(irq, gic); } diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c index 7c0897e779dc..ee052404eb55 100644 --- a/drivers/net/ethernet/mscc/ocelot_flower.c +++ b/drivers/net/ethernet/mscc/ocelot_flower.c @@ -605,6 +605,18 @@ ocelot_flower_parse_key(struct ocelot *ocelot, int port, bool ingress, flow_rule_match_control(rule, &match); } + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_VLAN)) { + struct flow_match_vlan match; + + flow_rule_match_vlan(rule, &match); + filter->key_type = OCELOT_VCAP_KEY_ANY; + filter->vlan.vid.value = match.key->vlan_id; + filter->vlan.vid.mask = match.mask->vlan_id; + filter->vlan.pcp.value[0] = match.key->vlan_priority; + filter->vlan.pcp.mask[0] = match.mask->vlan_priority; + match_protocol = false; + } + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct flow_match_eth_addrs match; @@ -737,18 +749,6 @@ ocelot_flower_parse_key(struct ocelot *ocelot, int port, bool ingress, match_protocol = false; } - if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_VLAN)) { - struct flow_match_vlan match; - - flow_rule_match_vlan(rule, &match); - filter->key_type = OCELOT_VCAP_KEY_ANY; - filter->vlan.vid.value = match.key->vlan_id; - filter->vlan.vid.mask 
= match.mask->vlan_id; - filter->vlan.pcp.value[0] = match.key->vlan_priority; - filter->vlan.pcp.mask[0] = match.mask->vlan_priority; - match_protocol = false; - } - finished_key_parsing: if (match_protocol && proto != ETH_P_ALL) { if (filter->block_id == VCAP_ES0) { diff --git a/drivers/net/ethernet/mscc/ocelot_ptp.c b/drivers/net/ethernet/mscc/ocelot_ptp.c index 1a82f10c8853..2180ae94c744 100644 --- a/drivers/net/ethernet/mscc/ocelot_ptp.c +++ b/drivers/net/ethernet/mscc/ocelot_ptp.c @@ -335,8 +335,8 @@ static void ocelot_populate_ipv6_ptp_event_trap_key(struct ocelot_vcap_filter *trap) { trap->key_type = OCELOT_VCAP_KEY_IPV6; - trap->key.ipv4.proto.value[0] = IPPROTO_UDP; - trap->key.ipv4.proto.mask[0] = 0xff; + trap->key.ipv6.proto.value[0] = IPPROTO_UDP; + trap->key.ipv6.proto.mask[0] = 0xff; trap->key.ipv6.dport.value = PTP_EV_PORT; trap->key.ipv6.dport.mask = 0xffff; } @@ -355,8 +355,8 @@ static void ocelot_populate_ipv6_ptp_general_trap_key(struct ocelot_vcap_filter *trap) { trap->key_type = OCELOT_VCAP_KEY_IPV6; - trap->key.ipv4.proto.value[0] = IPPROTO_UDP; - trap->key.ipv4.proto.mask[0] = 0xff; + trap->key.ipv6.proto.value[0] = IPPROTO_UDP; + trap->key.ipv6.proto.mask[0] = 0xff; trap->key.ipv6.dport.value = PTP_GEN_PORT; trap->key.ipv6.dport.mask = 0xffff; } diff --git a/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c b/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c index 4632268695cb..063cd371033a 100644 --- a/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c +++ b/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c @@ -129,26 +129,31 @@ struct nfp_ipsec_cfg_mssg { }; }; -static int nfp_ipsec_cfg_cmd_issue(struct nfp_net *nn, int type, int saidx, - struct nfp_ipsec_cfg_mssg *msg) +static int nfp_net_ipsec_cfg(struct nfp_net *nn, struct nfp_mbox_amsg_entry *entry) { + unsigned int offset = nn->tlv_caps.mbox_off + NFP_NET_CFG_MBOX_SIMPLE_VAL; + struct nfp_ipsec_cfg_mssg *msg = (struct nfp_ipsec_cfg_mssg *)entry->msg; int i, msg_size, ret; - msg->cmd = 
type; - msg->sa_idx = saidx; - msg->rsp = 0; - msg_size = ARRAY_SIZE(msg->raw); + ret = nfp_net_mbox_lock(nn, sizeof(*msg)); + if (ret) + return ret; + msg_size = ARRAY_SIZE(msg->raw); for (i = 0; i < msg_size; i++) - nn_writel(nn, NFP_NET_CFG_MBOX_VAL + 4 * i, msg->raw[i]); + nn_writel(nn, offset + 4 * i, msg->raw[i]); - ret = nfp_net_mbox_reconfig(nn, NFP_NET_CFG_MBOX_CMD_IPSEC); - if (ret < 0) + ret = nfp_net_mbox_reconfig(nn, entry->cmd); + if (ret < 0) { + nn_ctrl_bar_unlock(nn); return ret; + } /* For now we always read the whole message response back */ for (i = 0; i < msg_size; i++) - msg->raw[i] = nn_readl(nn, NFP_NET_CFG_MBOX_VAL + 4 * i); + msg->raw[i] = nn_readl(nn, offset + 4 * i); + + nn_ctrl_bar_unlock(nn); switch (msg->rsp) { case NFP_IPSEC_CFG_MSSG_OK: @@ -477,7 +482,10 @@ static int nfp_net_xfrm_add_state(struct xfrm_state *x) } /* Allocate saidx and commit the SA */ - err = nfp_ipsec_cfg_cmd_issue(nn, NFP_IPSEC_CFG_MSSG_ADD_SA, saidx, &msg); + msg.cmd = NFP_IPSEC_CFG_MSSG_ADD_SA; + msg.sa_idx = saidx; + err = nfp_net_sched_mbox_amsg_work(nn, NFP_NET_CFG_MBOX_CMD_IPSEC, &msg, + sizeof(msg), nfp_net_ipsec_cfg); if (err) { xa_erase(&nn->xa_ipsec, saidx); nn_err(nn, "Failed to issue IPsec command err ret=%d\n", err); @@ -491,14 +499,17 @@ static int nfp_net_xfrm_add_state(struct xfrm_state *x) static void nfp_net_xfrm_del_state(struct xfrm_state *x) { + struct nfp_ipsec_cfg_mssg msg = { + .cmd = NFP_IPSEC_CFG_MSSG_INV_SA, + .sa_idx = x->xso.offload_handle - 1, + }; struct net_device *netdev = x->xso.dev; - struct nfp_ipsec_cfg_mssg msg; struct nfp_net *nn; int err; nn = netdev_priv(netdev); - err = nfp_ipsec_cfg_cmd_issue(nn, NFP_IPSEC_CFG_MSSG_INV_SA, - x->xso.offload_handle - 1, &msg); + err = nfp_net_sched_mbox_amsg_work(nn, NFP_NET_CFG_MBOX_CMD_IPSEC, &msg, + sizeof(msg), nfp_net_ipsec_cfg); if (err) nn_warn(nn, "Failed to invalidate SA in hardware\n"); diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c 
b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c index a8678d5612ee..060a77f2265d 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c +++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c @@ -460,6 +460,7 @@ nfp_tun_write_neigh(struct net_device *netdev, struct nfp_app *app, sizeof(struct nfp_tun_neigh_v4); unsigned long cookie = (unsigned long)neigh; struct nfp_flower_priv *priv = app->priv; + struct nfp_tun_neigh_lag lag_info; struct nfp_neigh_entry *nn_entry; u32 port_id; u8 mtype; @@ -468,6 +469,11 @@ nfp_tun_write_neigh(struct net_device *netdev, struct nfp_app *app, if (!port_id) return; + if ((port_id & NFP_FL_LAG_OUT) == NFP_FL_LAG_OUT) { + memset(&lag_info, 0, sizeof(struct nfp_tun_neigh_lag)); + nfp_flower_lag_get_info_from_netdev(app, netdev, &lag_info); + } + spin_lock_bh(&priv->predt_lock); nn_entry = rhashtable_lookup_fast(&priv->neigh_table, &cookie, neigh_table_params); @@ -515,7 +521,7 @@ nfp_tun_write_neigh(struct net_device *netdev, struct nfp_app *app, neigh_ha_snapshot(common->dst_addr, neigh, netdev); if ((port_id & NFP_FL_LAG_OUT) == NFP_FL_LAG_OUT) - nfp_flower_lag_get_info_from_netdev(app, netdev, lag); + memcpy(lag, &lag_info, sizeof(struct nfp_tun_neigh_lag)); common->port_id = cpu_to_be32(port_id); if (rhashtable_insert_fast(&priv->neigh_table, diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h index 432d79d691c2..939cfce15830 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h @@ -617,9 +617,10 @@ struct nfp_net_dp { * @vnic_no_name: For non-port PF vNIC make ndo_get_phys_port_name return * -EOPNOTSUPP to keep backwards compatibility (set by app) * @port: Pointer to nfp_port structure if vNIC is a port - * @mc_lock: Protect mc_addrs list - * @mc_addrs: List of mc addrs to add/del to HW - * @mc_work: Work to update mc addrs + * @mbox_amsg: Asynchronously processed message via mailbox + * 
@mbox_amsg.lock: Protect message list + * @mbox_amsg.list: List of message to process + * @mbox_amsg.work: Work to process message asynchronously * @app_priv: APP private data for this vNIC */ struct nfp_net { @@ -721,13 +722,25 @@ struct nfp_net { struct nfp_port *port; - spinlock_t mc_lock; - struct list_head mc_addrs; - struct work_struct mc_work; + struct { + spinlock_t lock; + struct list_head list; + struct work_struct work; + } mbox_amsg; void *app_priv; }; +struct nfp_mbox_amsg_entry { + struct list_head list; + int (*cfg)(struct nfp_net *nn, struct nfp_mbox_amsg_entry *entry); + u32 cmd; + char msg[]; +}; + +int nfp_net_sched_mbox_amsg_work(struct nfp_net *nn, u32 cmd, const void *data, size_t len, + int (*cb)(struct nfp_net *, struct nfp_mbox_amsg_entry *)); + /* Functions to read/write from/to a BAR * Performs any endian conversion necessary. */ diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 18fc9971f1c8..70d7484c82af 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1334,14 +1334,54 @@ err_unlock: return err; } -struct nfp_mc_addr_entry { - u8 addr[ETH_ALEN]; - u32 cmd; - struct list_head list; -}; +int nfp_net_sched_mbox_amsg_work(struct nfp_net *nn, u32 cmd, const void *data, size_t len, + int (*cb)(struct nfp_net *, struct nfp_mbox_amsg_entry *)) +{ + struct nfp_mbox_amsg_entry *entry; + + entry = kmalloc(sizeof(*entry) + len, GFP_ATOMIC); + if (!entry) + return -ENOMEM; + + memcpy(entry->msg, data, len); + entry->cmd = cmd; + entry->cfg = cb; + + spin_lock_bh(&nn->mbox_amsg.lock); + list_add_tail(&entry->list, &nn->mbox_amsg.list); + spin_unlock_bh(&nn->mbox_amsg.lock); + + schedule_work(&nn->mbox_amsg.work); + + return 0; +} + +static void nfp_net_mbox_amsg_work(struct work_struct *work) +{ + struct nfp_net *nn = container_of(work, struct nfp_net, mbox_amsg.work); + struct nfp_mbox_amsg_entry 
*entry, *tmp; + struct list_head tmp_list; + + INIT_LIST_HEAD(&tmp_list); + + spin_lock_bh(&nn->mbox_amsg.lock); + list_splice_init(&nn->mbox_amsg.list, &tmp_list); + spin_unlock_bh(&nn->mbox_amsg.lock); + + list_for_each_entry_safe(entry, tmp, &tmp_list, list) { + int err = entry->cfg(nn, entry); + + if (err) + nn_err(nn, "Config cmd %d to HW failed %d.\n", entry->cmd, err); + + list_del(&entry->list); + kfree(entry); + } +} -static int nfp_net_mc_cfg(struct nfp_net *nn, const unsigned char *addr, const u32 cmd) +static int nfp_net_mc_cfg(struct nfp_net *nn, struct nfp_mbox_amsg_entry *entry) { + unsigned char *addr = entry->msg; int ret; ret = nfp_net_mbox_lock(nn, NFP_NET_CFG_MULTICAST_SZ); @@ -1353,26 +1393,7 @@ static int nfp_net_mc_cfg(struct nfp_net *nn, const unsigned char *addr, const u nn_writew(nn, nn->tlv_caps.mbox_off + NFP_NET_CFG_MULTICAST_MAC_LO, get_unaligned_be16(addr + 4)); - return nfp_net_mbox_reconfig_and_unlock(nn, cmd); -} - -static int nfp_net_mc_prep(struct nfp_net *nn, const unsigned char *addr, const u32 cmd) -{ - struct nfp_mc_addr_entry *entry; - - entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - if (!entry) - return -ENOMEM; - - ether_addr_copy(entry->addr, addr); - entry->cmd = cmd; - spin_lock_bh(&nn->mc_lock); - list_add_tail(&entry->list, &nn->mc_addrs); - spin_unlock_bh(&nn->mc_lock); - - schedule_work(&nn->mc_work); - - return 0; + return nfp_net_mbox_reconfig_and_unlock(nn, entry->cmd); } static int nfp_net_mc_sync(struct net_device *netdev, const unsigned char *addr) @@ -1385,35 +1406,16 @@ static int nfp_net_mc_sync(struct net_device *netdev, const unsigned char *addr) return -EINVAL; } - return nfp_net_mc_prep(nn, addr, NFP_NET_CFG_MBOX_CMD_MULTICAST_ADD); + return nfp_net_sched_mbox_amsg_work(nn, NFP_NET_CFG_MBOX_CMD_MULTICAST_ADD, addr, + NFP_NET_CFG_MULTICAST_SZ, nfp_net_mc_cfg); } static int nfp_net_mc_unsync(struct net_device *netdev, const unsigned char *addr) { struct nfp_net *nn = netdev_priv(netdev); - return 
nfp_net_mc_prep(nn, addr, NFP_NET_CFG_MBOX_CMD_MULTICAST_DEL); -} - -static void nfp_net_mc_addr_config(struct work_struct *work) -{ - struct nfp_net *nn = container_of(work, struct nfp_net, mc_work); - struct nfp_mc_addr_entry *entry, *tmp; - struct list_head tmp_list; - - INIT_LIST_HEAD(&tmp_list); - - spin_lock_bh(&nn->mc_lock); - list_splice_init(&nn->mc_addrs, &tmp_list); - spin_unlock_bh(&nn->mc_lock); - - list_for_each_entry_safe(entry, tmp, &tmp_list, list) { - if (nfp_net_mc_cfg(nn, entry->addr, entry->cmd)) - nn_err(nn, "Config mc address to HW failed.\n"); - - list_del(&entry->list); - kfree(entry); - } + return nfp_net_sched_mbox_amsg_work(nn, NFP_NET_CFG_MBOX_CMD_MULTICAST_DEL, addr, + NFP_NET_CFG_MULTICAST_SZ, nfp_net_mc_cfg); } static void nfp_net_set_rx_mode(struct net_device *netdev) @@ -2681,9 +2683,9 @@ int nfp_net_init(struct nfp_net *nn) if (!nn->dp.netdev) return 0; - spin_lock_init(&nn->mc_lock); - INIT_LIST_HEAD(&nn->mc_addrs); - INIT_WORK(&nn->mc_work, nfp_net_mc_addr_config); + spin_lock_init(&nn->mbox_amsg.lock); + INIT_LIST_HEAD(&nn->mbox_amsg.list); + INIT_WORK(&nn->mbox_amsg.work, nfp_net_mbox_amsg_work); return register_netdev(nn->dp.netdev); @@ -2704,6 +2706,6 @@ void nfp_net_clean(struct nfp_net *nn) unregister_netdev(nn->dp.netdev); nfp_net_ipsec_clean(nn); nfp_ccm_mbox_clean(nn); - flush_work(&nn->mc_work); + flush_work(&nn->mbox_amsg.work); nfp_net_reconfig_wait_posted(nn); } diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h index 51124309ae1f..f03dcadff738 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h @@ -403,7 +403,6 @@ */ #define NFP_NET_CFG_MBOX_BASE 0x1800 #define NFP_NET_CFG_MBOX_VAL_MAX_SZ 0x1F8 -#define NFP_NET_CFG_MBOX_VAL 0x1808 #define NFP_NET_CFG_MBOX_SIMPLE_CMD 0x0 #define NFP_NET_CFG_MBOX_SIMPLE_RET 0x4 #define NFP_NET_CFG_MBOX_SIMPLE_VAL 0x8 diff --git 
a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c index a4a89ef3f18b..cc97b3d00414 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c @@ -293,35 +293,131 @@ nfp_net_set_fec_link_mode(struct nfp_eth_table_port *eth_port, } } -static const u16 nfp_eth_media_table[] = { - [NFP_MEDIA_1000BASE_CX] = ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, - [NFP_MEDIA_1000BASE_KX] = ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, - [NFP_MEDIA_10GBASE_KX4] = ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT, - [NFP_MEDIA_10GBASE_KR] = ETHTOOL_LINK_MODE_10000baseKR_Full_BIT, - [NFP_MEDIA_10GBASE_CX4] = ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT, - [NFP_MEDIA_10GBASE_CR] = ETHTOOL_LINK_MODE_10000baseCR_Full_BIT, - [NFP_MEDIA_10GBASE_SR] = ETHTOOL_LINK_MODE_10000baseSR_Full_BIT, - [NFP_MEDIA_10GBASE_ER] = ETHTOOL_LINK_MODE_10000baseER_Full_BIT, - [NFP_MEDIA_25GBASE_KR] = ETHTOOL_LINK_MODE_25000baseKR_Full_BIT, - [NFP_MEDIA_25GBASE_KR_S] = ETHTOOL_LINK_MODE_25000baseKR_Full_BIT, - [NFP_MEDIA_25GBASE_CR] = ETHTOOL_LINK_MODE_25000baseCR_Full_BIT, - [NFP_MEDIA_25GBASE_CR_S] = ETHTOOL_LINK_MODE_25000baseCR_Full_BIT, - [NFP_MEDIA_25GBASE_SR] = ETHTOOL_LINK_MODE_25000baseSR_Full_BIT, - [NFP_MEDIA_40GBASE_CR4] = ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT, - [NFP_MEDIA_40GBASE_KR4] = ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT, - [NFP_MEDIA_40GBASE_SR4] = ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT, - [NFP_MEDIA_40GBASE_LR4] = ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT, - [NFP_MEDIA_50GBASE_KR] = ETHTOOL_LINK_MODE_50000baseKR_Full_BIT, - [NFP_MEDIA_50GBASE_SR] = ETHTOOL_LINK_MODE_50000baseSR_Full_BIT, - [NFP_MEDIA_50GBASE_CR] = ETHTOOL_LINK_MODE_50000baseCR_Full_BIT, - [NFP_MEDIA_50GBASE_LR] = ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT, - [NFP_MEDIA_50GBASE_ER] = ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT, - [NFP_MEDIA_50GBASE_FR] = ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT, - 
[NFP_MEDIA_100GBASE_KR4] = ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT, - [NFP_MEDIA_100GBASE_SR4] = ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT, - [NFP_MEDIA_100GBASE_CR4] = ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT, - [NFP_MEDIA_100GBASE_KP4] = ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT, - [NFP_MEDIA_100GBASE_CR10] = ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT, +static const struct nfp_eth_media_link_mode { + u16 ethtool_link_mode; + u16 speed; +} nfp_eth_media_table[NFP_MEDIA_LINK_MODES_NUMBER] = { + [NFP_MEDIA_1000BASE_CX] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, + .speed = NFP_SPEED_1G, + }, + [NFP_MEDIA_1000BASE_KX] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, + .speed = NFP_SPEED_1G, + }, + [NFP_MEDIA_10GBASE_KX4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT, + .speed = NFP_SPEED_10G, + }, + [NFP_MEDIA_10GBASE_KR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_10000baseKR_Full_BIT, + .speed = NFP_SPEED_10G, + }, + [NFP_MEDIA_10GBASE_CX4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT, + .speed = NFP_SPEED_10G, + }, + [NFP_MEDIA_10GBASE_CR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_10000baseCR_Full_BIT, + .speed = NFP_SPEED_10G, + }, + [NFP_MEDIA_10GBASE_SR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_10000baseSR_Full_BIT, + .speed = NFP_SPEED_10G, + }, + [NFP_MEDIA_10GBASE_ER] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_10000baseER_Full_BIT, + .speed = NFP_SPEED_10G, + }, + [NFP_MEDIA_25GBASE_KR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_25000baseKR_Full_BIT, + .speed = NFP_SPEED_25G, + }, + [NFP_MEDIA_25GBASE_KR_S] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_25000baseKR_Full_BIT, + .speed = NFP_SPEED_25G, + }, + [NFP_MEDIA_25GBASE_CR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_25000baseCR_Full_BIT, + .speed = NFP_SPEED_25G, + }, + [NFP_MEDIA_25GBASE_CR_S] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_25000baseCR_Full_BIT, + .speed = NFP_SPEED_25G, + }, + 
[NFP_MEDIA_25GBASE_SR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_25000baseSR_Full_BIT, + .speed = NFP_SPEED_25G, + }, + [NFP_MEDIA_40GBASE_CR4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT, + .speed = NFP_SPEED_40G, + }, + [NFP_MEDIA_40GBASE_KR4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT, + .speed = NFP_SPEED_40G, + }, + [NFP_MEDIA_40GBASE_SR4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT, + .speed = NFP_SPEED_40G, + }, + [NFP_MEDIA_40GBASE_LR4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT, + .speed = NFP_SPEED_40G, + }, + [NFP_MEDIA_50GBASE_KR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_50000baseKR_Full_BIT, + .speed = NFP_SPEED_50G, + }, + [NFP_MEDIA_50GBASE_SR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_50000baseSR_Full_BIT, + .speed = NFP_SPEED_50G, + }, + [NFP_MEDIA_50GBASE_CR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_50000baseCR_Full_BIT, + .speed = NFP_SPEED_50G, + }, + [NFP_MEDIA_50GBASE_LR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT, + .speed = NFP_SPEED_50G, + }, + [NFP_MEDIA_50GBASE_ER] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT, + .speed = NFP_SPEED_50G, + }, + [NFP_MEDIA_50GBASE_FR] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT, + .speed = NFP_SPEED_50G, + }, + [NFP_MEDIA_100GBASE_KR4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT, + .speed = NFP_SPEED_100G, + }, + [NFP_MEDIA_100GBASE_SR4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT, + .speed = NFP_SPEED_100G, + }, + [NFP_MEDIA_100GBASE_CR4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT, + .speed = NFP_SPEED_100G, + }, + [NFP_MEDIA_100GBASE_KP4] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT, + .speed = NFP_SPEED_100G, + }, + [NFP_MEDIA_100GBASE_CR10] = { + .ethtool_link_mode = ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT, + .speed = 
NFP_SPEED_100G, + }, +}; + +static const unsigned int nfp_eth_speed_map[NFP_SUP_SPEED_NUMBER] = { + [NFP_SPEED_1G] = SPEED_1000, + [NFP_SPEED_10G] = SPEED_10000, + [NFP_SPEED_25G] = SPEED_25000, + [NFP_SPEED_40G] = SPEED_40000, + [NFP_SPEED_50G] = SPEED_50000, + [NFP_SPEED_100G] = SPEED_100000, }; static void nfp_add_media_link_mode(struct nfp_port *port, @@ -334,8 +430,12 @@ static void nfp_add_media_link_mode(struct nfp_port *port, }; struct nfp_cpp *cpp = port->app->cpp; - if (nfp_eth_read_media(cpp, &ethm)) + if (nfp_eth_read_media(cpp, &ethm)) { + bitmap_fill(port->speed_bitmap, NFP_SUP_SPEED_NUMBER); return; + } + + bitmap_zero(port->speed_bitmap, NFP_SUP_SPEED_NUMBER); for (u32 i = 0; i < 2; i++) { supported_modes[i] = le64_to_cpu(ethm.supported_modes[i]); @@ -344,20 +444,26 @@ static void nfp_add_media_link_mode(struct nfp_port *port, for (u32 i = 0; i < NFP_MEDIA_LINK_MODES_NUMBER; i++) { if (i < 64) { - if (supported_modes[0] & BIT_ULL(i)) - __set_bit(nfp_eth_media_table[i], + if (supported_modes[0] & BIT_ULL(i)) { + __set_bit(nfp_eth_media_table[i].ethtool_link_mode, cmd->link_modes.supported); + __set_bit(nfp_eth_media_table[i].speed, + port->speed_bitmap); + } if (advertised_modes[0] & BIT_ULL(i)) - __set_bit(nfp_eth_media_table[i], + __set_bit(nfp_eth_media_table[i].ethtool_link_mode, cmd->link_modes.advertising); } else { - if (supported_modes[1] & BIT_ULL(i - 64)) - __set_bit(nfp_eth_media_table[i], + if (supported_modes[1] & BIT_ULL(i - 64)) { + __set_bit(nfp_eth_media_table[i].ethtool_link_mode, cmd->link_modes.supported); + __set_bit(nfp_eth_media_table[i].speed, + port->speed_bitmap); + } if (advertised_modes[1] & BIT_ULL(i - 64)) - __set_bit(nfp_eth_media_table[i], + __set_bit(nfp_eth_media_table[i].ethtool_link_mode, cmd->link_modes.advertising); } } @@ -468,6 +574,22 @@ nfp_net_set_link_ksettings(struct net_device *netdev, if (cmd->base.speed != SPEED_UNKNOWN) { u32 speed = cmd->base.speed / eth_port->lanes; + bool is_supported = false; + + for
(u32 i = 0; i < NFP_SUP_SPEED_NUMBER; i++) { + if (cmd->base.speed == nfp_eth_speed_map[i] && + test_bit(i, port->speed_bitmap)) { + is_supported = true; + break; + } + } + + if (!is_supported) { + netdev_err(netdev, "Speed %u is not supported.\n", + cmd->base.speed); + err = -EINVAL; + goto err_bad_set; + } if (req_aneg) { netdev_err(netdev, "Speed changing is not allowed when working on autoneg mode.\n"); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.h b/drivers/net/ethernet/netronome/nfp/nfp_port.h index f8cd157ca1d7..9c04f9f0e2c9 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_port.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_port.h @@ -38,6 +38,16 @@ enum nfp_port_flags { NFP_PORT_CHANGED = 0, }; +enum { + NFP_SPEED_1G, + NFP_SPEED_10G, + NFP_SPEED_25G, + NFP_SPEED_40G, + NFP_SPEED_50G, + NFP_SPEED_100G, + NFP_SUP_SPEED_NUMBER +}; + /** * struct nfp_port - structure representing NFP port * @netdev: backpointer to associated netdev @@ -52,6 +62,7 @@ enum nfp_port_flags { * @eth_forced: for %NFP_PORT_PHYS_PORT port is forced UP or DOWN, don't change * @eth_port: for %NFP_PORT_PHYS_PORT translated ETH Table port entry * @eth_stats: for %NFP_PORT_PHYS_PORT MAC stats if available + * @speed_bitmap: for %NFP_PORT_PHYS_PORT supported speed bitmap * @pf_id: for %NFP_PORT_PF_PORT, %NFP_PORT_VF_PORT ID of the PCI PF (0-3) * @vf_id: for %NFP_PORT_VF_PORT ID of the PCI VF within @pf_id * @pf_split: for %NFP_PORT_PF_PORT %true if PCI PF has more than one vNIC @@ -78,6 +89,7 @@ struct nfp_port { bool eth_forced; struct nfp_eth_table_port *eth_port; u8 __iomem *eth_stats; + DECLARE_BITMAP(speed_bitmap, NFP_SUP_SPEED_NUMBER); }; /* NFP_PORT_PF_PORT, NFP_PORT_VF_PORT */ struct { diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.c b/drivers/net/ethernet/pensando/ionic/ionic_dev.c index 626b9113e7c4..d911f4fd9af6 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.c @@ -708,9 +708,16 @@ void 
ionic_q_post(struct ionic_queue *q, bool ring_doorbell, ionic_desc_cb cb, q->lif->index, q->name, q->hw_type, q->hw_index, q->head_idx, ring_doorbell); - if (ring_doorbell) + if (ring_doorbell) { ionic_dbell_ring(lif->kern_dbpage, q->hw_type, q->dbval | q->head_idx); + + q->dbell_jiffies = jiffies; + + if (q_to_qcq(q)->napi_qcq) + mod_timer(&q_to_qcq(q)->napi_qcq->napi_deadline, + jiffies + IONIC_NAPI_DEADLINE); + } } static bool ionic_q_is_posted(struct ionic_queue *q, unsigned int pos) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.h b/drivers/net/ethernet/pensando/ionic/ionic_dev.h index 2a1d7b9c07e7..bce3ca38669b 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.h @@ -25,6 +25,12 @@ #define IONIC_DEV_INFO_REG_COUNT 32 #define IONIC_DEV_CMD_REG_COUNT 32 +#define IONIC_NAPI_DEADLINE (HZ / 200) /* 5ms */ +#define IONIC_ADMIN_DOORBELL_DEADLINE (HZ / 2) /* 500ms */ +#define IONIC_TX_DOORBELL_DEADLINE (HZ / 100) /* 10ms */ +#define IONIC_RX_MIN_DOORBELL_DEADLINE (HZ / 100) /* 10ms */ +#define IONIC_RX_MAX_DOORBELL_DEADLINE (HZ * 5) /* 5s */ + struct ionic_dev_bar { void __iomem *vaddr; phys_addr_t bus_addr; @@ -216,6 +222,8 @@ struct ionic_queue { struct ionic_lif *lif; struct ionic_desc_info *info; u64 dbval; + unsigned long dbell_deadline; + unsigned long dbell_jiffies; u16 head_idx; u16 tail_idx; unsigned int index; @@ -361,4 +369,8 @@ void ionic_q_service(struct ionic_queue *q, struct ionic_cq_info *cq_info, int ionic_heartbeat_check(struct ionic *ionic); bool ionic_is_fw_running(struct ionic_dev *idev); +bool ionic_adminq_poke_doorbell(struct ionic_queue *q); +bool ionic_txq_poke_doorbell(struct ionic_queue *q); +bool ionic_rxq_poke_doorbell(struct ionic_queue *q); + #endif /* _IONIC_DEV_H_ */ diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 4dd16c487f2b..63a78a9ac241 100644 --- 
a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -16,6 +16,7 @@ #include "ionic.h" #include "ionic_bus.h" +#include "ionic_dev.h" #include "ionic_lif.h" #include "ionic_txrx.h" #include "ionic_ethtool.h" @@ -200,6 +201,13 @@ void ionic_link_status_check_request(struct ionic_lif *lif, bool can_sleep) } } +static void ionic_napi_deadline(struct timer_list *timer) +{ + struct ionic_qcq *qcq = container_of(timer, struct ionic_qcq, napi_deadline); + + napi_schedule(&qcq->napi); +} + static irqreturn_t ionic_isr(int irq, void *data) { struct napi_struct *napi = data; @@ -269,6 +277,7 @@ static int ionic_qcq_enable(struct ionic_qcq *qcq) .oper = IONIC_Q_ENABLE, }, }; + int ret; idev = &lif->ionic->idev; dev = lif->ionic->dev; @@ -276,16 +285,24 @@ static int ionic_qcq_enable(struct ionic_qcq *qcq) dev_dbg(dev, "q_enable.index %d q_enable.qtype %d\n", ctx.cmd.q_control.index, ctx.cmd.q_control.type); + if (qcq->flags & IONIC_QCQ_F_INTR) + ionic_intr_clean(idev->intr_ctrl, qcq->intr.index); + + ret = ionic_adminq_post_wait(lif, &ctx); + if (ret) + return ret; + + if (qcq->napi.poll) + napi_enable(&qcq->napi); + if (qcq->flags & IONIC_QCQ_F_INTR) { irq_set_affinity_hint(qcq->intr.vector, &qcq->intr.affinity_mask); - napi_enable(&qcq->napi); - ionic_intr_clean(idev->intr_ctrl, qcq->intr.index); ionic_intr_mask(idev->intr_ctrl, qcq->intr.index, IONIC_INTR_MASK_CLEAR); } - return ionic_adminq_post_wait(lif, &ctx); + return 0; } static int ionic_qcq_disable(struct ionic_lif *lif, struct ionic_qcq *qcq, int fw_err) @@ -316,6 +333,7 @@ static int ionic_qcq_disable(struct ionic_lif *lif, struct ionic_qcq *qcq, int f synchronize_irq(qcq->intr.vector); irq_set_affinity_hint(qcq->intr.vector, NULL); napi_disable(&qcq->napi); + del_timer_sync(&qcq->napi_deadline); } /* If there was a previous fw communcation error, don't bother with @@ -451,6 +469,7 @@ static void ionic_link_qcq_interrupts(struct ionic_qcq *src_qcq, 
n_qcq->intr.vector = src_qcq->intr.vector; n_qcq->intr.index = src_qcq->intr.index; + n_qcq->napi_qcq = src_qcq->napi_qcq; } static int ionic_alloc_qcq_interrupt(struct ionic_lif *lif, struct ionic_qcq *qcq) @@ -564,13 +583,15 @@ static int ionic_qcq_alloc(struct ionic_lif *lif, unsigned int type, } if (flags & IONIC_QCQ_F_NOTIFYQ) { - int q_size, cq_size; + int q_size; - /* q & cq need to be contiguous in case of notifyq */ + /* q & cq need to be contiguous in NotifyQ, so alloc it all in q + * and don't alloc qc. We leave new->qc_size and new->qc_base + * as 0 to be sure we don't try to free it later. + */ q_size = ALIGN(num_descs * desc_size, PAGE_SIZE); - cq_size = ALIGN(num_descs * cq_desc_size, PAGE_SIZE); - - new->q_size = PAGE_SIZE + q_size + cq_size; + new->q_size = PAGE_SIZE + q_size + + ALIGN(num_descs * cq_desc_size, PAGE_SIZE); new->q_base = dma_alloc_coherent(dev, new->q_size, &new->q_base_pa, GFP_KERNEL); if (!new->q_base) { @@ -773,8 +794,14 @@ static int ionic_lif_txq_init(struct ionic_lif *lif, struct ionic_qcq *qcq) dev_dbg(dev, "txq->hw_type %d\n", q->hw_type); dev_dbg(dev, "txq->hw_index %d\n", q->hw_index); - if (test_bit(IONIC_LIF_F_SPLIT_INTR, lif->state)) + q->dbell_deadline = IONIC_TX_DOORBELL_DEADLINE; + q->dbell_jiffies = jiffies; + + if (test_bit(IONIC_LIF_F_SPLIT_INTR, lif->state)) { netif_napi_add(lif->netdev, &qcq->napi, ionic_tx_napi); + qcq->napi_qcq = qcq; + timer_setup(&qcq->napi_deadline, ionic_napi_deadline, 0); + } qcq->flags |= IONIC_QCQ_F_INITED; @@ -828,11 +855,17 @@ static int ionic_lif_rxq_init(struct ionic_lif *lif, struct ionic_qcq *qcq) dev_dbg(dev, "rxq->hw_type %d\n", q->hw_type); dev_dbg(dev, "rxq->hw_index %d\n", q->hw_index); + q->dbell_deadline = IONIC_RX_MIN_DOORBELL_DEADLINE; + q->dbell_jiffies = jiffies; + if (test_bit(IONIC_LIF_F_SPLIT_INTR, lif->state)) netif_napi_add(lif->netdev, &qcq->napi, ionic_rx_napi); else netif_napi_add(lif->netdev, &qcq->napi, ionic_txrx_napi); + qcq->napi_qcq = qcq; + 
timer_setup(&qcq->napi_deadline, ionic_napi_deadline, 0); + qcq->flags |= IONIC_QCQ_F_INITED; return 0; @@ -1150,6 +1183,7 @@ static int ionic_adminq_napi(struct napi_struct *napi, int budget) struct ionic_dev *idev = &lif->ionic->idev; unsigned long irqflags; unsigned int flags = 0; + bool resched = false; int rx_work = 0; int tx_work = 0; int n_work = 0; @@ -1187,6 +1221,16 @@ static int ionic_adminq_napi(struct napi_struct *napi, int budget) ionic_intr_credits(idev->intr_ctrl, intr->index, credits, flags); } + if (!a_work && ionic_adminq_poke_doorbell(&lif->adminqcq->q)) + resched = true; + if (lif->hwstamp_rxq && !rx_work && ionic_rxq_poke_doorbell(&lif->hwstamp_rxq->q)) + resched = true; + if (lif->hwstamp_txq && !tx_work && ionic_txq_poke_doorbell(&lif->hwstamp_txq->q)) + resched = true; + if (resched) + mod_timer(&lif->adminqcq->napi_deadline, + jiffies + IONIC_NAPI_DEADLINE); + return work_done; } @@ -3245,8 +3289,14 @@ static int ionic_lif_adminq_init(struct ionic_lif *lif) dev_dbg(dev, "adminq->hw_type %d\n", q->hw_type); dev_dbg(dev, "adminq->hw_index %d\n", q->hw_index); + q->dbell_deadline = IONIC_ADMIN_DOORBELL_DEADLINE; + q->dbell_jiffies = jiffies; + netif_napi_add(lif->netdev, &qcq->napi, ionic_adminq_napi); + qcq->napi_qcq = qcq; + timer_setup(&qcq->napi_deadline, ionic_napi_deadline, 0); + napi_enable(&qcq->napi); if (qcq->flags & IONIC_QCQ_F_INTR) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.h b/drivers/net/ethernet/pensando/ionic/ionic_lif.h index a53984bf3544..734519895614 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.h @@ -74,8 +74,10 @@ struct ionic_qcq { struct ionic_queue q; struct ionic_cq cq; struct ionic_intr_info intr; + struct timer_list napi_deadline; struct napi_struct napi; unsigned int flags; + struct ionic_qcq *napi_qcq; struct dentry *dentry; }; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c 
b/drivers/net/ethernet/pensando/ionic/ionic_main.c index a13530ec4dd8..08c42b039d92 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_main.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c @@ -289,6 +289,35 @@ static void ionic_adminq_cb(struct ionic_queue *q, complete_all(&ctx->work); } +bool ionic_adminq_poke_doorbell(struct ionic_queue *q) +{ + struct ionic_lif *lif = q->lif; + unsigned long now, then, dif; + unsigned long irqflags; + + spin_lock_irqsave(&lif->adminq_lock, irqflags); + + if (q->tail_idx == q->head_idx) { + spin_unlock_irqrestore(&lif->adminq_lock, irqflags); + return false; + } + + now = READ_ONCE(jiffies); + then = q->dbell_jiffies; + dif = now - then; + + if (dif > q->dbell_deadline) { + ionic_dbell_ring(q->lif->kern_dbpage, q->hw_type, + q->dbval | q->head_idx); + + q->dbell_jiffies = now; + } + + spin_unlock_irqrestore(&lif->adminq_lock, irqflags); + + return true; +} + int ionic_adminq_post(struct ionic_lif *lif, struct ionic_admin_ctx *ctx) { struct ionic_desc_info *desc_info; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c index 0c3977416cd1..f761780f0162 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c @@ -22,6 +22,67 @@ static inline void ionic_rxq_post(struct ionic_queue *q, bool ring_dbell, ionic_q_post(q, ring_dbell, cb_func, cb_arg); } +bool ionic_txq_poke_doorbell(struct ionic_queue *q) +{ + unsigned long now, then, dif; + struct netdev_queue *netdev_txq; + struct net_device *netdev; + + netdev = q->lif->netdev; + netdev_txq = netdev_get_tx_queue(netdev, q->index); + + HARD_TX_LOCK(netdev, netdev_txq, smp_processor_id()); + + if (q->tail_idx == q->head_idx) { + HARD_TX_UNLOCK(netdev, netdev_txq); + return false; + } + + now = READ_ONCE(jiffies); + then = q->dbell_jiffies; + dif = now - then; + + if (dif > q->dbell_deadline) { + ionic_dbell_ring(q->lif->kern_dbpage, q->hw_type, + q->dbval | 
q->head_idx); + + q->dbell_jiffies = now; + } + + HARD_TX_UNLOCK(netdev, netdev_txq); + + return true; +} + +bool ionic_rxq_poke_doorbell(struct ionic_queue *q) +{ + unsigned long now, then, dif; + + /* no lock, called from rx napi or txrx napi, nothing else can fill */ + + if (q->tail_idx == q->head_idx) + return false; + + now = READ_ONCE(jiffies); + then = q->dbell_jiffies; + dif = now - then; + + if (dif > q->dbell_deadline) { + ionic_dbell_ring(q->lif->kern_dbpage, q->hw_type, + q->dbval | q->head_idx); + + q->dbell_jiffies = now; + + dif = 2 * q->dbell_deadline; + if (dif > IONIC_RX_MAX_DOORBELL_DEADLINE) + dif = IONIC_RX_MAX_DOORBELL_DEADLINE; + + q->dbell_deadline = dif; + } + + return true; +} + static inline struct netdev_queue *q_to_ndq(struct ionic_queue *q) { return netdev_get_tx_queue(q->lif->netdev, q->index); @@ -424,6 +485,12 @@ void ionic_rx_fill(struct ionic_queue *q) ionic_dbell_ring(q->lif->kern_dbpage, q->hw_type, q->dbval | q->head_idx); + + q->dbell_deadline = IONIC_RX_MIN_DOORBELL_DEADLINE; + q->dbell_jiffies = jiffies; + + mod_timer(&q_to_qcq(q)->napi_qcq->napi_deadline, + jiffies + IONIC_NAPI_DEADLINE); } void ionic_rx_empty(struct ionic_queue *q) @@ -511,6 +578,9 @@ int ionic_tx_napi(struct napi_struct *napi, int budget) work_done, flags); } + if (!work_done && ionic_txq_poke_doorbell(&qcq->q)) + mod_timer(&qcq->napi_deadline, jiffies + IONIC_NAPI_DEADLINE); + return work_done; } @@ -544,23 +614,29 @@ int ionic_rx_napi(struct napi_struct *napi, int budget) work_done, flags); } + if (!work_done && ionic_rxq_poke_doorbell(&qcq->q)) + mod_timer(&qcq->napi_deadline, jiffies + IONIC_NAPI_DEADLINE); + return work_done; } int ionic_txrx_napi(struct napi_struct *napi, int budget) { - struct ionic_qcq *qcq = napi_to_qcq(napi); + struct ionic_qcq *rxqcq = napi_to_qcq(napi); struct ionic_cq *rxcq = napi_to_cq(napi); unsigned int qi = rxcq->bound_q->index; + struct ionic_qcq *txqcq; struct ionic_dev *idev; struct ionic_lif *lif; struct ionic_cq 
*txcq; + bool resched = false; u32 rx_work_done = 0; u32 tx_work_done = 0; u32 flags = 0; lif = rxcq->bound_q->lif; idev = &lif->ionic->idev; + txqcq = lif->txqcqs[qi]; txcq = &lif->txqcqs[qi]->cq; tx_work_done = ionic_cq_service(txcq, IONIC_TX_BUDGET_DEFAULT, @@ -572,7 +648,7 @@ int ionic_txrx_napi(struct napi_struct *napi, int budget) ionic_rx_fill(rxcq->bound_q); if (rx_work_done < budget && napi_complete_done(napi, rx_work_done)) { - ionic_dim_update(qcq, 0); + ionic_dim_update(rxqcq, 0); flags |= IONIC_INTR_CRED_UNMASK; rxcq->bound_intr->rearm_count++; } @@ -583,6 +659,13 @@ int ionic_txrx_napi(struct napi_struct *napi, int budget) tx_work_done + rx_work_done, flags); } + if (!rx_work_done && ionic_rxq_poke_doorbell(&rxqcq->q)) + resched = true; + if (!tx_work_done && ionic_txq_poke_doorbell(&txqcq->q)) + resched = true; + if (resched) + mod_timer(&rxqcq->napi_deadline, jiffies + IONIC_NAPI_DEADLINE); + return rx_work_done; } diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 7c2af482192d..cb1746bc0e0c 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1438,6 +1438,10 @@ int qede_poll(struct napi_struct *napi, int budget) rx_work_done = (likely(fp->type & QEDE_FASTPATH_RX) && qede_has_rx_work(fp->rxq)) ? 
qede_rx_int(fp, budget) : 0; + + if (fp->xdp_xmit & QEDE_XDP_REDIRECT) + xdp_do_flush(); + /* Handle case where we are called by netpoll with a budget of 0 */ if (rx_work_done < budget || !budget) { if (!qede_poll_is_more_work(fp)) { @@ -1457,9 +1461,6 @@ int qede_poll(struct napi_struct *napi, int budget) qede_update_tx_producer(fp->xdp_tx); } - if (fp->xdp_xmit & QEDE_XDP_REDIRECT) - xdp_do_flush_map(); - return rx_work_done; } diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 0556542d7a6b..3a86f1213a05 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -1003,8 +1003,11 @@ static int efx_pci_probe_post_io(struct efx_nic *efx) /* Determine netdevice features */ net_dev->features |= (efx->type->offload_features | NETIF_F_SG | NETIF_F_TSO | NETIF_F_RXCSUM | NETIF_F_RXALL); - if (efx->type->offload_features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) + if (efx->type->offload_features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) { net_dev->features |= NETIF_F_TSO6; + if (efx_has_cap(efx, TX_TSO_V2_ENCAP)) + net_dev->hw_enc_features |= NETIF_F_TSO6; + } /* Check whether device supports TSO */ if (!efx->type->tso_versions || !efx->type->tso_versions(efx)) net_dev->features &= ~NETIF_F_ALL_TSO; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index 835caa15d55f..732774645c1a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -560,6 +560,8 @@ static int qcom_ethqos_probe(struct platform_device *pdev) plat_dat->has_gmac4 = 1; plat_dat->pmt = 1; plat_dat->tso_en = of_property_read_bool(np, "snps,tso"); + if (of_device_is_compatible(np, "qcom,qcs404-ethqos")) + plat_dat->rx_clk_runs_in_lpi = 1; ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); if (ret) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c 
b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c index 413f66017219..e95d35f1e5a0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c @@ -541,9 +541,9 @@ int dwmac5_flex_pps_config(void __iomem *ioaddr, int index, return 0; } - val |= PPSCMDx(index, 0x2); val |= TRGTMODSELx(index, 0x2); val |= PPSEN0; + writel(val, ioaddr + MAC_PPS_CONTROL); writel(cfg->start.tv_sec, ioaddr + MAC_PPSx_TARGET_TIME_SEC(index)); @@ -568,6 +568,7 @@ int dwmac5_flex_pps_config(void __iomem *ioaddr, int index, writel(period - 1, ioaddr + MAC_PPSx_WIDTH(index)); /* Finally, activate it */ + val |= PPSCMDx(index, 0x2); writel(val, ioaddr + MAC_PPS_CONTROL); return 0; } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index b7e5af58ab75..1a5b8dab5e9b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1080,7 +1080,8 @@ static void stmmac_mac_link_up(struct phylink_config *config, stmmac_mac_set(priv, priv->ioaddr, true); if (phy && priv->dma_cap.eee) { - priv->eee_active = phy_init_eee(phy, 1) >= 0; + priv->eee_active = + phy_init_eee(phy, !priv->plat->rx_clk_runs_in_lpi) >= 0; priv->eee_enabled = stmmac_eee_init(priv); priv->tx_lpi_enabled = priv->eee_enabled; stmmac_set_eee_pls(priv, priv->hw, true); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index eb6d9cd8e93f..0046a4ee6e64 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -559,7 +559,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) dma_cfg->mixed_burst = of_property_read_bool(np, "snps,mixed-burst"); plat->force_thresh_dma_mode = of_property_read_bool(np, "snps,force_thresh_dma_mode"); - if (plat->force_thresh_dma_mode) { + if (plat->force_thresh_dma_mode && 
plat->force_sf_dma_mode) { plat->force_sf_dma_mode = 0; dev_warn(&pdev->dev, "force_sf_dma_mode is ignored if force_thresh_dma_mode is set.\n"); diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index ecbde83b5243..6cda4b7c10cb 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -501,7 +501,15 @@ static int am65_cpsw_nuss_common_stop(struct am65_cpsw_common *common) k3_udma_glue_disable_tx_chn(common->tx_chns[i].tx_chn); } + reinit_completion(&common->tdown_complete); k3_udma_glue_tdown_rx_chn(common->rx_chns.rx_chn, true); + + if (common->pdata.quirks & AM64_CPSW_QUIRK_DMA_RX_TDOWN_IRQ) { + i = wait_for_completion_timeout(&common->tdown_complete, msecs_to_jiffies(1000)); + if (!i) + dev_err(common->dev, "rx teardown timeout\n"); + } + napi_disable(&common->napi_rx); for (i = 0; i < AM65_CPSW_MAX_RX_FLOWS; i++) @@ -721,6 +729,8 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_common *common, if (cppi5_desc_is_tdcm(desc_dma)) { dev_dbg(dev, "%s RX tdown flow: %u\n", __func__, flow_idx); + if (common->pdata.quirks & AM64_CPSW_QUIRK_DMA_RX_TDOWN_IRQ) + complete(&common->tdown_complete); return 0; } @@ -2672,7 +2682,7 @@ static const struct am65_cpsw_pdata j721e_pdata = { }; static const struct am65_cpsw_pdata am64x_cpswxg_pdata = { - .quirks = 0, + .quirks = AM64_CPSW_QUIRK_DMA_RX_TDOWN_IRQ, .ale_dev_id = "am64-cpswxg", .fdqring_mode = K3_RINGACC_RING_MODE_RING, }; diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.h b/drivers/net/ethernet/ti/am65-cpsw-nuss.h index 4b75620f8d28..e5f1c44788c1 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.h +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.h @@ -90,6 +90,7 @@ struct am65_cpsw_rx_chn { }; #define AM65_CPSW_QUIRK_I2027_NO_TX_CSUM BIT(0) +#define AM64_CPSW_QUIRK_DMA_RX_TDOWN_IRQ BIT(1) struct am65_cpsw_pdata { u32 quirks; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 
9352dad58996..79f4e13620a4 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -987,9 +987,6 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device, void netvsc_dma_unmap(struct hv_device *hv_dev, struct hv_netvsc_packet *packet) { - u32 page_count = packet->cp_partial ? - packet->page_buf_cnt - packet->rmsg_pgcnt : - packet->page_buf_cnt; int i; if (!hv_is_isolation_supported()) @@ -998,7 +995,7 @@ void netvsc_dma_unmap(struct hv_device *hv_dev, if (!packet->dma_range) return; - for (i = 0; i < page_count; i++) + for (i = 0; i < packet->page_buf_cnt; i++) dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma, packet->dma_range[i].mapping_size, DMA_TO_DEVICE); @@ -1028,9 +1025,7 @@ static int netvsc_dma_map(struct hv_device *hv_dev, struct hv_netvsc_packet *packet, struct hv_page_buffer *pb) { - u32 page_count = packet->cp_partial ? - packet->page_buf_cnt - packet->rmsg_pgcnt : - packet->page_buf_cnt; + u32 page_count = packet->page_buf_cnt; dma_addr_t dma; int i; @@ -1039,7 +1034,7 @@ static int netvsc_dma_map(struct hv_device *hv_dev, packet->dma_range = kcalloc(page_count, sizeof(*packet->dma_range), - GFP_KERNEL); + GFP_ATOMIC); if (!packet->dma_range) return -ENOMEM; diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index a6f05e35d91f..b7cb71817780 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -233,7 +233,8 @@ static int dp83822_config_intr(struct phy_device *phydev) DP83822_ENERGY_DET_INT_EN | DP83822_LINK_QUAL_INT_EN); - if (!dp83822->fx_enabled) + /* Private data pointer is NULL on DP83825/26 */ + if (!dp83822 || !dp83822->fx_enabled) misr_status |= DP83822_ANEG_COMPLETE_INT_EN | DP83822_DUP_MODE_CHANGE_INT_EN | DP83822_SPEED_CHANGED_INT_EN; @@ -253,7 +254,8 @@ static int dp83822_config_intr(struct phy_device *phydev) DP83822_PAGE_RX_INT_EN | DP83822_EEE_ERROR_CHANGE_INT_EN); - if (!dp83822->fx_enabled) + /* Private data pointer is NULL on DP83825/26 */ + if 
(!dp83822 || !dp83822->fx_enabled) misr_status |= DP83822_ANEG_ERR_INT_EN | DP83822_WOL_PKT_INT_EN; diff --git a/drivers/net/phy/meson-gxl.c b/drivers/net/phy/meson-gxl.c index c49062ad72c6..a6015cd03bff 100644 --- a/drivers/net/phy/meson-gxl.c +++ b/drivers/net/phy/meson-gxl.c @@ -261,6 +261,8 @@ static struct phy_driver meson_gxl_phy[] = { .handle_interrupt = meson_gxl_handle_interrupt, .suspend = genphy_suspend, .resume = genphy_resume, + .read_mmd = genphy_read_mmd_unsupported, + .write_mmd = genphy_write_mmd_unsupported, }, { PHY_ID_MATCH_EXACT(0x01803301), .name = "Meson G12A Internal PHY", @@ -271,6 +273,8 @@ static struct phy_driver meson_gxl_phy[] = { .handle_interrupt = meson_gxl_handle_interrupt, .suspend = genphy_suspend, .resume = genphy_resume, + .read_mmd = genphy_read_mmd_unsupported, + .write_mmd = genphy_write_mmd_unsupported, }, }; diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 716870a4499c..607aa786c8cb 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1517,7 +1517,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, * another mac interface, so we should create a device link between * phy dev and mac dev. 
*/ - if (phydev->mdio.bus->parent && dev->dev.parent != phydev->mdio.bus->parent) + if (dev && phydev->mdio.bus->parent && dev->dev.parent != phydev->mdio.bus->parent) phydev->devlink = device_link_add(dev->dev.parent, &phydev->mdio.dev, DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS); diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 09cc65c0da93..4d2519cdb801 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1812,10 +1812,9 @@ int phylink_fwnode_phy_connect(struct phylink *pl, ret = phy_attach_direct(pl->netdev, phy_dev, flags, pl->link_interface); - if (ret) { - phy_device_free(phy_dev); + phy_device_free(phy_dev); + if (ret) return ret; - } ret = phylink_bringup_phy(pl, phy_dev, pl->link_config.interface); if (ret) diff --git a/drivers/net/usb/kalmia.c b/drivers/net/usb/kalmia.c index 9f2b70ef39aa..613fc6910f14 100644 --- a/drivers/net/usb/kalmia.c +++ b/drivers/net/usb/kalmia.c @@ -65,8 +65,8 @@ kalmia_send_init_packet(struct usbnet *dev, u8 *init_msg, u8 init_msg_len, init_msg, init_msg_len, &act_len, KALMIA_USB_TIMEOUT); if (status != 0) { netdev_err(dev->net, - "Error sending init packet. Status %i, length %i\n", - status, act_len); + "Error sending init packet. Status %i\n", + status); return status; } else if (act_len != init_msg_len) { @@ -83,8 +83,8 @@ kalmia_send_init_packet(struct usbnet *dev, u8 *init_msg, u8 init_msg_len, if (status != 0) netdev_err(dev->net, - "Error receiving init result. Status %i, length %i\n", - status, act_len); + "Error receiving init result. 
Status %i\n", + status); else if (act_len != expected_len) netdev_err(dev->net, "Unexpected init result length: %i\n", act_len); diff --git a/drivers/net/usb/plusb.c b/drivers/net/usb/plusb.c index 2c82fbcaab22..7a2b0094de51 100644 --- a/drivers/net/usb/plusb.c +++ b/drivers/net/usb/plusb.c @@ -57,9 +57,7 @@ static inline int pl_vendor_req(struct usbnet *dev, u8 req, u8 val, u8 index) { - return usbnet_read_cmd(dev, req, - USB_DIR_IN | USB_TYPE_VENDOR | - USB_RECIP_DEVICE, + return usbnet_write_cmd(dev, req, USB_TYPE_VENDOR | USB_RECIP_DEVICE, val, index, NULL, 0); } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 18b3de854aeb..61e33e4dd0cd 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1677,13 +1677,13 @@ static int virtnet_poll(struct napi_struct *napi, int budget) received = virtnet_receive(rq, budget, &xdp_xmit); + if (xdp_xmit & VIRTIO_XDP_REDIR) + xdp_do_flush(); + /* Out of packets? */ if (received < budget) virtqueue_napi_complete(napi, rq->vq, received); - if (xdp_xmit & VIRTIO_XDP_REDIR) - xdp_do_flush(); - if (xdp_xmit & VIRTIO_XDP_TX) { sq = virtnet_xdp_get_sq(vi); if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { @@ -2158,8 +2158,8 @@ static int virtnet_close(struct net_device *dev) cancel_delayed_work_sync(&vi->refill); for (i = 0; i < vi->max_queue_pairs; i++) { - xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq); napi_disable(&vi->rq[i].napi); + xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq); virtnet_napi_tx_disable(&vi->sq[i].napi); } diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 56267c327f0b..682987040ea8 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -1546,31 +1546,6 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq, rxd->len = rbi->len; } -#ifdef VMXNET3_RSS - if (rcd->rssType != VMXNET3_RCD_RSS_TYPE_NONE && - (adapter->netdev->features & NETIF_F_RXHASH)) { - enum pkt_hash_types hash_type; - - switch 
(rcd->rssType) { - case VMXNET3_RCD_RSS_TYPE_IPV4: - case VMXNET3_RCD_RSS_TYPE_IPV6: - hash_type = PKT_HASH_TYPE_L3; - break; - case VMXNET3_RCD_RSS_TYPE_TCPIPV4: - case VMXNET3_RCD_RSS_TYPE_TCPIPV6: - case VMXNET3_RCD_RSS_TYPE_UDPIPV4: - case VMXNET3_RCD_RSS_TYPE_UDPIPV6: - hash_type = PKT_HASH_TYPE_L4; - break; - default: - hash_type = PKT_HASH_TYPE_L3; - break; - } - skb_set_hash(ctx->skb, - le32_to_cpu(rcd->rssHash), - hash_type); - } -#endif skb_record_rx_queue(ctx->skb, rq->qid); skb_put(ctx->skb, rcd->len); @@ -1653,6 +1628,31 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq, u32 mtu = adapter->netdev->mtu; skb->len += skb->data_len; +#ifdef VMXNET3_RSS + if (rcd->rssType != VMXNET3_RCD_RSS_TYPE_NONE && + (adapter->netdev->features & NETIF_F_RXHASH)) { + enum pkt_hash_types hash_type; + + switch (rcd->rssType) { + case VMXNET3_RCD_RSS_TYPE_IPV4: + case VMXNET3_RCD_RSS_TYPE_IPV6: + hash_type = PKT_HASH_TYPE_L3; + break; + case VMXNET3_RCD_RSS_TYPE_TCPIPV4: + case VMXNET3_RCD_RSS_TYPE_TCPIPV6: + case VMXNET3_RCD_RSS_TYPE_UDPIPV4: + case VMXNET3_RCD_RSS_TYPE_UDPIPV6: + hash_type = PKT_HASH_TYPE_L4; + break; + default: + hash_type = PKT_HASH_TYPE_L3; + break; + } + skb_set_hash(skb, + le32_to_cpu(rcd->rssHash), + hash_type); + } +#endif vmxnet3_rx_csum(adapter, skb, (union Vmxnet3_GenericDesc *)rcd); skb->protocol = eth_type_trans(skb, adapter->netdev); diff --git a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c index 7eff3531b9a5..7ff33c1d6ac7 100644 --- a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c +++ b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c @@ -152,6 +152,15 @@ static irqreturn_t t7xx_dpmaif_isr_handler(int irq, void *data) } t7xx_pcie_mac_clear_int(dpmaif_ctrl->t7xx_dev, isr_para->pcie_int); + + return IRQ_WAKE_THREAD; +} + +static irqreturn_t t7xx_dpmaif_isr_thread(int irq, void *data) +{ + struct dpmaif_isr_para *isr_para = data; + struct dpmaif_ctrl *dpmaif_ctrl = isr_para->dpmaif_ctrl; + t7xx_dpmaif_irq_cb(isr_para); 
t7xx_pcie_mac_set_int(dpmaif_ctrl->t7xx_dev, isr_para->pcie_int); return IRQ_HANDLED; @@ -188,7 +197,7 @@ static void t7xx_dpmaif_register_pcie_irq(struct dpmaif_ctrl *dpmaif_ctrl) t7xx_pcie_mac_clear_int(t7xx_dev, int_type); t7xx_dev->intr_handler[int_type] = t7xx_dpmaif_isr_handler; - t7xx_dev->intr_thread[int_type] = NULL; + t7xx_dev->intr_thread[int_type] = t7xx_dpmaif_isr_thread; t7xx_dev->callback_param[int_type] = isr_para; t7xx_pcie_mac_clear_int_status(t7xx_dev, int_type); diff --git a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c index aa2174a10437..f4ff2198b5ef 100644 --- a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c +++ b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c @@ -840,14 +840,13 @@ int t7xx_dpmaif_napi_rx_poll(struct napi_struct *napi, const int budget) if (!rxq->que_started) { atomic_set(&rxq->rx_processing, 0); + pm_runtime_put_autosuspend(rxq->dpmaif_ctrl->dev); dev_err(rxq->dpmaif_ctrl->dev, "Work RXQ: %d has not been started\n", rxq->index); return work_done; } - if (!rxq->sleep_lock_pending) { - pm_runtime_get_noresume(rxq->dpmaif_ctrl->dev); + if (!rxq->sleep_lock_pending) t7xx_pci_disable_sleep(t7xx_dev); - } ret = try_wait_for_completion(&t7xx_dev->sleep_lock_acquire); if (!ret) { @@ -876,22 +875,22 @@ int t7xx_dpmaif_napi_rx_poll(struct napi_struct *napi, const int budget) napi_complete_done(napi, work_done); t7xx_dpmaif_clr_ip_busy_sts(&rxq->dpmaif_ctrl->hw_info); t7xx_dpmaif_dlq_unmask_rx_done(&rxq->dpmaif_ctrl->hw_info, rxq->index); + t7xx_pci_enable_sleep(rxq->dpmaif_ctrl->t7xx_dev); + pm_runtime_mark_last_busy(rxq->dpmaif_ctrl->dev); + pm_runtime_put_autosuspend(rxq->dpmaif_ctrl->dev); + atomic_set(&rxq->rx_processing, 0); } else { t7xx_dpmaif_clr_ip_busy_sts(&rxq->dpmaif_ctrl->hw_info); } - t7xx_pci_enable_sleep(rxq->dpmaif_ctrl->t7xx_dev); - pm_runtime_mark_last_busy(rxq->dpmaif_ctrl->dev); - pm_runtime_put_noidle(rxq->dpmaif_ctrl->dev); - atomic_set(&rxq->rx_processing, 0); - return 
work_done; } void t7xx_dpmaif_irq_rx_done(struct dpmaif_ctrl *dpmaif_ctrl, const unsigned int que_mask) { struct dpmaif_rx_queue *rxq; - int qno; + struct dpmaif_ctrl *ctrl; + int qno, ret; qno = ffs(que_mask) - 1; if (qno < 0 || qno > DPMAIF_RXQ_NUM - 1) { @@ -900,6 +899,18 @@ void t7xx_dpmaif_irq_rx_done(struct dpmaif_ctrl *dpmaif_ctrl, const unsigned int } rxq = &dpmaif_ctrl->rxq[qno]; + ctrl = rxq->dpmaif_ctrl; + /* We need to make sure that the modem has been resumed before + * calling napi. This can't be done inside the polling function + * as we could be blocked waiting for device to be resumed, + * which can't be done from softirq context the poll function + * is running in. + */ + ret = pm_runtime_resume_and_get(ctrl->dev); + if (ret < 0 && ret != -EACCES) { + dev_err(ctrl->dev, "Failed to resume device: %d\n", ret); + return; + } napi_schedule(&rxq->napi); } diff --git a/drivers/net/wwan/t7xx/t7xx_netdev.c b/drivers/net/wwan/t7xx/t7xx_netdev.c index 494a28e386a3..3ef4a8a4f8fd 100644 --- a/drivers/net/wwan/t7xx/t7xx_netdev.c +++ b/drivers/net/wwan/t7xx/t7xx_netdev.c @@ -27,6 +27,7 @@ #include <linux/list.h> #include <linux/netdev_features.h> #include <linux/netdevice.h> +#include <linux/pm_runtime.h> #include <linux/skbuff.h> #include <linux/types.h> #include <linux/wwan.h> @@ -45,12 +46,25 @@ static void t7xx_ccmni_enable_napi(struct t7xx_ccmni_ctrl *ctlb) { - int i; + struct dpmaif_ctrl *ctrl; + int i, ret; + + ctrl = ctlb->hif_ctrl; if (ctlb->is_napi_en) return; for (i = 0; i < RXQ_NUM; i++) { + /* The usage count has to be bumped every time before calling + * napi_schedule. It will be decreased in the poll routine, + * right after napi_complete_done is called.
+ */ + ret = pm_runtime_resume_and_get(ctrl->dev); + if (ret < 0) { + dev_err(ctrl->dev, "Failed to resume device: %d\n", + ret); + return; + } napi_enable(ctlb->napi[i]); napi_schedule(ctlb->napi[i]); } diff --git a/drivers/net/wwan/t7xx/t7xx_pci.c b/drivers/net/wwan/t7xx/t7xx_pci.c index 871f2a27a398..226fc1703e90 100644 --- a/drivers/net/wwan/t7xx/t7xx_pci.c +++ b/drivers/net/wwan/t7xx/t7xx_pci.c @@ -121,6 +121,8 @@ void t7xx_pci_pm_init_late(struct t7xx_pci_dev *t7xx_dev) iowrite32(T7XX_L1_BIT(0), IREG_BASE(t7xx_dev) + ENABLE_ASPM_LOWPWR); atomic_set(&t7xx_dev->md_pm_state, MTK_PM_RESUMED); + pm_runtime_mark_last_busy(&t7xx_dev->pdev->dev); + pm_runtime_allow(&t7xx_dev->pdev->dev); pm_runtime_put_noidle(&t7xx_dev->pdev->dev); } diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 79d93126453d..77b06d54cc62 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -102,6 +102,25 @@ config NVDIMM_KEYS depends on ENCRYPTED_KEYS depends on (LIBNVDIMM=ENCRYPTED_KEYS) || LIBNVDIMM=m +config NVDIMM_KMSAN + bool + depends on KMSAN + help + KMSAN, and other memory debug facilities, increase the size of + 'struct page' to contain extra metadata. This collides with + the NVDIMM capability to store a potentially + larger-than-"System RAM" size 'struct page' array in a + reservation of persistent memory rather than limited / + precious DRAM. However, that reservation needs to persist for + the life of the given NVDIMM namespace. If you are using KMSAN + to debug an issue unrelated to NVDIMMs or DAX then say N to this + option. Otherwise, say Y but understand that any namespaces + (with the page array stored in pmem) created with this build of + the kernel will permanently reserve and strand excess + capacity compared to the CONFIG_KMSAN=n case. + + Select N if unsure.
+ config NVDIMM_TEST_BUILD tristate "Build the unit test core" depends on m diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 85ca5b4da3cf..ec5219680092 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -652,7 +652,7 @@ void devm_namespace_disable(struct device *dev, struct nd_namespace_common *ndns); #if IS_ENABLED(CONFIG_ND_CLAIM) /* max struct page size independent of kernel config */ -#define MAX_STRUCT_PAGE_SIZE 128 +#define MAX_STRUCT_PAGE_SIZE 64 int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap); #else static inline int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 61af072ac98f..af7d9301520c 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -13,6 +13,8 @@ #include "pfn.h" #include "nd.h" +static const bool page_struct_override = IS_ENABLED(CONFIG_NVDIMM_KMSAN); + static void nd_pfn_release(struct device *dev) { struct nd_region *nd_region = to_nd_region(dev->parent); @@ -758,12 +760,6 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) return -ENXIO; } - /* - * Note, we use 64 here for the standard size of struct page, - * debugging options may cause it to be larger in which case the - * implementation will limit the pfns advertised through - * ->direct_access() to those that are included in the memmap. - */ start = nsio->res.start; size = resource_size(&nsio->res); npfns = PHYS_PFN(size - SZ_8K); @@ -782,20 +778,33 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) } end_trunc = start + size - ALIGN_DOWN(start + size, align); if (nd_pfn->mode == PFN_MODE_PMEM) { + unsigned long page_map_size = MAX_STRUCT_PAGE_SIZE * npfns; + /* * The altmap should be padded out to the block size used * when populating the vmemmap. This *should* be equal to * PMD_SIZE for most architectures. * - * Also make sure size of struct page is less than 128. 
We - * want to make sure we use large enough size here so that - * we don't have a dynamic reserve space depending on - * struct page size. But we also want to make sure we notice - * when we end up adding new elements to struct page. + * Also make sure size of struct page is less than + * MAX_STRUCT_PAGE_SIZE. The goal here is compatibility in the + * face of production kernel configurations that reduce the + * 'struct page' size below MAX_STRUCT_PAGE_SIZE. For debug + * kernel configurations that increase the 'struct page' size + * above MAX_STRUCT_PAGE_SIZE, the page_struct_override allows + * for continuing with the capacity that will be wasted when + * reverting to a production kernel configuration. Otherwise, + * those configurations are blocked by default. */ - BUILD_BUG_ON(sizeof(struct page) > MAX_STRUCT_PAGE_SIZE); - offset = ALIGN(start + SZ_8K + MAX_STRUCT_PAGE_SIZE * npfns, align) - - start; + if (sizeof(struct page) > MAX_STRUCT_PAGE_SIZE) { + if (page_struct_override) + page_map_size = sizeof(struct page) * npfns; + else { + dev_err(&nd_pfn->dev, + "Memory debug options prevent using pmem for the page map\n"); + return -EINVAL; + } + } + offset = ALIGN(start + SZ_8K + page_map_size, align) - start; } else if (nd_pfn->mode == PFN_MODE_RAM) offset = ALIGN(start + SZ_8K, align) - start; else @@ -818,7 +827,10 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) pfn_sb->version_minor = cpu_to_le16(4); pfn_sb->end_trunc = cpu_to_le32(end_trunc); pfn_sb->align = cpu_to_le32(nd_pfn->align); - pfn_sb->page_struct_size = cpu_to_le16(MAX_STRUCT_PAGE_SIZE); + if (sizeof(struct page) > MAX_STRUCT_PAGE_SIZE && page_struct_override) + pfn_sb->page_struct_size = cpu_to_le16(sizeof(struct page)); + else + pfn_sb->page_struct_size = cpu_to_le16(MAX_STRUCT_PAGE_SIZE); pfn_sb->page_size = cpu_to_le32(PAGE_SIZE); checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb); pfn_sb->checksum = cpu_to_le64(checksum); diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c 
index 4424f53a8a0a..bdb97496ba2d 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -45,6 +45,8 @@ struct nvme_dhchap_queue_context { int sess_key_len; }; +static struct workqueue_struct *nvme_auth_wq; + #define nvme_auth_flags_from_qid(qid) \ (qid == 0) ? 0 : BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED #define nvme_auth_queue_from_qid(ctrl, qid) \ @@ -866,7 +868,7 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) chap = &ctrl->dhchap_ctxs[qid]; cancel_work_sync(&chap->auth_work); - queue_work(nvme_wq, &chap->auth_work); + queue_work(nvme_auth_wq, &chap->auth_work); return 0; } EXPORT_SYMBOL_GPL(nvme_auth_negotiate); @@ -1008,10 +1010,15 @@ EXPORT_SYMBOL_GPL(nvme_auth_free); int __init nvme_init_auth(void) { + nvme_auth_wq = alloc_workqueue("nvme-auth-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_auth_wq) + return -ENOMEM; + nvme_chap_buf_cache = kmem_cache_create("nvme-chap-buf-cache", CHAP_BUF_SIZE, 0, SLAB_HWCACHE_ALIGN, NULL); if (!nvme_chap_buf_cache) - return -ENOMEM; + goto err_destroy_workqueue; nvme_chap_buf_pool = mempool_create(16, mempool_alloc_slab, mempool_free_slab, nvme_chap_buf_cache); @@ -1021,6 +1028,8 @@ int __init nvme_init_auth(void) return 0; err_destroy_chap_buf_cache: kmem_cache_destroy(nvme_chap_buf_cache); +err_destroy_workqueue: + destroy_workqueue(nvme_auth_wq); return -ENOMEM; } @@ -1028,4 +1037,5 @@ void __exit nvme_exit_auth(void) { mempool_destroy(nvme_chap_buf_pool); kmem_cache_destroy(nvme_chap_buf_cache); + destroy_workqueue(nvme_auth_wq); } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 505e16f20e57..8b6421141162 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4921,7 +4921,9 @@ out_cleanup_admin_q: blk_mq_destroy_queue(ctrl->admin_q); blk_put_queue(ctrl->admin_q); out_free_tagset: - blk_mq_free_tag_set(ctrl->admin_tagset); + blk_mq_free_tag_set(set); + ctrl->admin_q = NULL; + ctrl->fabrics_q = NULL; return ret; } 
EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set); @@ -4983,6 +4985,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, out_free_tag_set: blk_mq_free_tag_set(set); + ctrl->connect_q = NULL; return ret; } EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c734934c407c..c11e0cfeef0f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -110,6 +110,7 @@ struct nvme_queue; static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); static void nvme_delete_io_queues(struct nvme_dev *dev); +static void nvme_update_attrs(struct nvme_dev *dev); /* * Represents an NVM Express device. Each nvme_dev is a PCI function. @@ -1923,6 +1924,8 @@ static void nvme_map_cmb(struct nvme_dev *dev) if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) == (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) pci_p2pmem_publish(pdev, true); + + nvme_update_attrs(dev); } static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) @@ -2209,6 +2212,11 @@ static const struct attribute_group *nvme_pci_dev_attr_groups[] = { NULL, }; +static void nvme_update_attrs(struct nvme_dev *dev) +{ + sysfs_update_group(&dev->ctrl.device->kobj, &nvme_pci_dev_attrs_group); +} + /* * nirqs is the number of interrupts available for write and read * queues. The core already reserved an interrupt for the admin queue. 
@@ -2509,18 +2517,12 @@ static int nvme_pci_enable(struct nvme_dev *dev) { int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); - int dma_address_bits = 64; if (pci_enable_device_mem(pdev)) return result; pci_set_master(pdev); - if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48) - dma_address_bits = 48; - if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits))) - goto disable; - if (readl(dev->bar + NVME_REG_CSTS) == -1) { result = -ENODEV; goto disable; @@ -2970,7 +2972,7 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) - return NULL; + return ERR_PTR(-ENOMEM); INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); mutex_init(&dev->shutdown_lock); @@ -2998,7 +3000,11 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, quirks); if (ret) goto out_put_device; - + + if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48) + dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(48)); + else + dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); dma_set_min_align_mask(&pdev->dev, NVME_CTRL_PAGE_SIZE - 1); dma_set_max_seg_size(&pdev->dev, 0xffffffff); @@ -3031,8 +3037,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) int result = -ENOMEM; dev = nvme_pci_alloc_dev(pdev, id); - if (!dev) - return -ENOMEM; + if (IS_ERR(dev)) + return PTR_ERR(dev); result = nvme_dev_map(dev); if (result) @@ -3423,6 +3429,8 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN | NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x10ec, 0x5763), /* ADATA SX6000PNP */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS | NVME_QUIRK_IGNORE_DEV_SUBNQN, }, diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index ab2627e17bb9..1ab6601fdd5c 100644 --- 
a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1685,8 +1685,10 @@ nvmet_fc_ls_create_association(struct nvmet_fc_tgtport *tgtport, else { queue = nvmet_fc_alloc_target_queue(iod->assoc, 0, be16_to_cpu(rqst->assoc_cmd.sqsize)); - if (!queue) + if (!queue) { ret = VERR_QUEUE_ALLOC_FAIL; + nvmet_fc_tgt_a_put(iod->assoc); + } } } diff --git a/drivers/nvmem/brcm_nvram.c b/drivers/nvmem/brcm_nvram.c index 34130449f2d2..39aa27942f28 100644 --- a/drivers/nvmem/brcm_nvram.c +++ b/drivers/nvmem/brcm_nvram.c @@ -98,6 +98,9 @@ static int brcm_nvram_parse(struct brcm_nvram *priv) len = le32_to_cpu(header.len); data = kzalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + memcpy_fromio(data, priv->base, len); data[len - 1] = '\0'; diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index 321d7d63e068..34ee9d36ee7b 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -770,31 +770,32 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) return ERR_PTR(rval); } - if (config->wp_gpio) - nvmem->wp_gpio = config->wp_gpio; - else if (!config->ignore_wp) + nvmem->id = rval; + + nvmem->dev.type = &nvmem_provider_type; + nvmem->dev.bus = &nvmem_bus_type; + nvmem->dev.parent = config->dev; + + device_initialize(&nvmem->dev); + + if (!config->ignore_wp) nvmem->wp_gpio = gpiod_get_optional(config->dev, "wp", GPIOD_OUT_HIGH); if (IS_ERR(nvmem->wp_gpio)) { - ida_free(&nvmem_ida, nvmem->id); rval = PTR_ERR(nvmem->wp_gpio); - kfree(nvmem); - return ERR_PTR(rval); + nvmem->wp_gpio = NULL; + goto err_put_device; } kref_init(&nvmem->refcnt); INIT_LIST_HEAD(&nvmem->cells); - nvmem->id = rval; nvmem->owner = config->owner; if (!nvmem->owner && config->dev->driver) nvmem->owner = config->dev->driver->owner; nvmem->stride = config->stride ?: 1; nvmem->word_size = config->word_size ?: 1; nvmem->size = config->size; - nvmem->dev.type = &nvmem_provider_type; - nvmem->dev.bus = &nvmem_bus_type; - nvmem->dev.parent = config->dev; nvmem->root_only = 
config->root_only; nvmem->priv = config->priv; nvmem->type = config->type; @@ -822,11 +823,8 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) break; } - if (rval) { - ida_free(&nvmem_ida, nvmem->id); - kfree(nvmem); - return ERR_PTR(rval); - } + if (rval) + goto err_put_device; nvmem->read_only = device_property_present(config->dev, "read-only") || config->read_only || !nvmem->reg_write; @@ -835,28 +833,22 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) nvmem->dev.groups = nvmem_dev_groups; #endif - dev_dbg(&nvmem->dev, "Registering nvmem device %s\n", config->name); - - rval = device_register(&nvmem->dev); - if (rval) - goto err_put_device; - if (nvmem->nkeepout) { rval = nvmem_validate_keepouts(nvmem); if (rval) - goto err_device_del; + goto err_put_device; } if (config->compat) { rval = nvmem_sysfs_setup_compat(nvmem, config); if (rval) - goto err_device_del; + goto err_put_device; } if (config->cells) { rval = nvmem_add_cells(nvmem, config->cells, config->ncells); if (rval) - goto err_teardown_compat; + goto err_remove_cells; } rval = nvmem_add_cells_from_table(nvmem); @@ -867,17 +859,20 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) if (rval) goto err_remove_cells; + dev_dbg(&nvmem->dev, "Registering nvmem device %s\n", config->name); + + rval = device_add(&nvmem->dev); + if (rval) + goto err_remove_cells; + blocking_notifier_call_chain(&nvmem_notifier, NVMEM_ADD, nvmem); return nvmem; err_remove_cells: nvmem_device_remove_all_cells(nvmem); -err_teardown_compat: if (config->compat) nvmem_sysfs_remove_compat(nvmem, config); -err_device_del: - device_del(&nvmem->dev); err_put_device: put_device(&nvmem->dev); @@ -1242,16 +1237,21 @@ struct nvmem_cell *of_nvmem_cell_get(struct device_node *np, const char *id) if (!cell_np) return ERR_PTR(-ENOENT); - nvmem_np = of_get_next_parent(cell_np); - if (!nvmem_np) + nvmem_np = of_get_parent(cell_np); + if (!nvmem_np) { + of_node_put(cell_np); 
return ERR_PTR(-EINVAL); + } nvmem = __nvmem_device_get(nvmem_np, device_match_of_node); of_node_put(nvmem_np); - if (IS_ERR(nvmem)) + if (IS_ERR(nvmem)) { + of_node_put(cell_np); return ERR_CAST(nvmem); + } cell_entry = nvmem_find_cell_entry_by_node(nvmem, cell_np); + of_node_put(cell_np); if (!cell_entry) { __nvmem_device_put(nvmem); return ERR_PTR(-ENOENT); diff --git a/drivers/nvmem/qcom-spmi-sdam.c b/drivers/nvmem/qcom-spmi-sdam.c index 4fcb63507ecd..8499892044b7 100644 --- a/drivers/nvmem/qcom-spmi-sdam.c +++ b/drivers/nvmem/qcom-spmi-sdam.c @@ -166,6 +166,7 @@ static const struct of_device_id sdam_match_table[] = { { .compatible = "qcom,spmi-sdam" }, {}, }; +MODULE_DEVICE_TABLE(of, sdam_match_table); static struct platform_driver sdam_driver = { .driver = { diff --git a/drivers/nvmem/sunxi_sid.c b/drivers/nvmem/sunxi_sid.c index 5750e1f4bcdb..92dfe4cb10e3 100644 --- a/drivers/nvmem/sunxi_sid.c +++ b/drivers/nvmem/sunxi_sid.c @@ -41,8 +41,21 @@ static int sunxi_sid_read(void *context, unsigned int offset, void *val, size_t bytes) { struct sunxi_sid *sid = context; + u32 word; + + /* .stride = 4 so offset is guaranteed to be aligned */ + __ioread32_copy(val, sid->base + sid->value_offset + offset, bytes / 4); - memcpy_fromio(val, sid->base + sid->value_offset + offset, bytes); + val += round_down(bytes, 4); + offset += round_down(bytes, 4); + bytes = bytes % 4; + + if (!bytes) + return 0; + + /* Handle any trailing bytes */ + word = readl_relaxed(sid->base + sid->value_offset + offset); + memcpy(val, &word, bytes); return 0; } diff --git a/drivers/of/address.c b/drivers/of/address.c index c34ac33b7338..67763e5b8c0e 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -965,8 +965,19 @@ int of_dma_get_range(struct device_node *np, const struct bus_dma_region **map) } of_dma_range_parser_init(&parser, node); - for_each_of_range(&parser, &range) + for_each_of_range(&parser, &range) { + if (range.cpu_addr == OF_BAD_ADDR) { + pr_err("translation of DMA 
address(%llx) to CPU address failed node(%pOF)\n", + range.bus_addr, node); + continue; + } num_ranges++; + } + + if (!num_ranges) { + ret = -EINVAL; + goto out; + } r = kcalloc(num_ranges + 1, sizeof(*r), GFP_KERNEL); if (!r) { @@ -975,18 +986,16 @@ int of_dma_get_range(struct device_node *np, const struct bus_dma_region **map) } /* - * Record all info in the generic DMA ranges array for struct device. + * Record all info in the generic DMA ranges array for struct device, + * returning an error if we don't find any parsable ranges. */ *map = r; of_dma_range_parser_init(&parser, node); for_each_of_range(&parser, &range) { pr_debug("dma_addr(%llx) cpu_addr(%llx) size(%llx)\n", range.bus_addr, range.cpu_addr, range.size); - if (range.cpu_addr == OF_BAD_ADDR) { - pr_err("translation of DMA address(%llx) to CPU address failed node(%pOF)\n", - range.bus_addr, node); + if (range.cpu_addr == OF_BAD_ADDR) continue; - } r->cpu_start = range.cpu_addr; r->dma_start = range.bus_addr; r->size = range.size; diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index f08b25195ae7..d1a68b6d03b3 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -26,7 +26,6 @@ #include <linux/serial_core.h> #include <linux/sysfs.h> #include <linux/random.h> -#include <linux/kmemleak.h> #include <asm/setup.h> /* for COMMAND_LINE_SIZE */ #include <asm/page.h> @@ -525,12 +524,9 @@ static int __init __reserved_mem_reserve_reg(unsigned long node, size = dt_mem_next_cell(dt_root_size_cells, &prop); if (size && - early_init_dt_reserve_memory(base, size, nomap) == 0) { + early_init_dt_reserve_memory(base, size, nomap) == 0) pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); - if (!nomap) - kmemleak_alloc_phys(base, size, 0); - } else pr_err("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); diff --git a/drivers/of/of_reserved_mem.c 
b/drivers/of/of_reserved_mem.c index 65f3b02a0e4e..f90975e00446 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -48,9 +48,10 @@ static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size, err = memblock_mark_nomap(base, size); if (err) memblock_phys_free(base, size); - kmemleak_ignore_phys(base); } + kmemleak_ignore_phys(base); + return err; } diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 81c8c227ab6b..b3878a98d27f 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -525,6 +525,7 @@ static int __init of_platform_default_populate_init(void) if (IS_ENABLED(CONFIG_PPC)) { struct device_node *boot_display = NULL; struct platform_device *dev; + int display_number = 0; int ret; /* Check if we have a MacOS display without a node spec */ @@ -555,16 +556,23 @@ static int __init of_platform_default_populate_init(void) if (!of_get_property(node, "linux,opened", NULL) || !of_get_property(node, "linux,boot-display", NULL)) continue; - dev = of_platform_device_create(node, "of-display", NULL); + dev = of_platform_device_create(node, "of-display.0", NULL); + of_node_put(node); if (WARN_ON(!dev)) return -ENOMEM; boot_display = node; + display_number++; break; } for_each_node_by_type(node, "display") { + char buf[14]; + const char *of_display_format = "of-display.%d"; + if (!of_get_property(node, "linux,opened", NULL) || node == boot_display) continue; - of_platform_device_create(node, "of-display", NULL); + ret = snprintf(buf, sizeof(buf), of_display_format, display_number++); + if (ret < sizeof(buf)) + of_platform_device_create(node, buf, NULL); } } else { diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c index d6af5726ddf3..2a18f7ba2398 100644 --- a/drivers/parisc/pdc_stable.c +++ b/drivers/parisc/pdc_stable.c @@ -274,8 +274,7 @@ pdcspath_hwpath_write(struct pdcspath_entry *entry, const char *buf, size_t coun /* We'll use a local copy of buf */ count = min_t(size_t, count, 
sizeof(in)-1); - strncpy(in, buf, count); - in[count] = '\0'; + strscpy(in, buf, count + 1); /* Let's clean up the target. 0xff is a blank pattern */ memset(&hwpath, 0xff, sizeof(hwpath)); @@ -388,8 +387,7 @@ pdcspath_layer_write(struct pdcspath_entry *entry, const char *buf, size_t count /* We'll use a local copy of buf */ count = min_t(size_t, count, sizeof(in)-1); - strncpy(in, buf, count); - in[count] = '\0'; + strscpy(in, buf, count + 1); /* Let's clean up the target. 0 is a blank pattern */ memset(&layers, 0, sizeof(layers)); @@ -756,8 +754,7 @@ static ssize_t pdcs_auto_write(struct kobject *kobj, /* We'll use a local copy of buf */ count = min_t(size_t, count, sizeof(in)-1); - strncpy(in, buf, count); - in[count] = '\0'; + strscpy(in, buf, count + 1); /* Current flags are stored in primary boot path entry */ pathentry = &pdcspath_entry_primary; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index fba95486caaf..5641786bd020 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1665,7 +1665,6 @@ int pci_save_state(struct pci_dev *dev) return i; pci_save_ltr_state(dev); - pci_save_aspm_l1ss_state(dev); pci_save_dpc_state(dev); pci_save_aer_state(dev); pci_save_ptm_state(dev); @@ -1772,7 +1771,6 @@ void pci_restore_state(struct pci_dev *dev) * LTR itself (in the PCIe capability). 
*/ pci_restore_ltr_state(dev); - pci_restore_aspm_l1ss_state(dev); pci_restore_pcie_state(dev); pci_restore_pasid_state(dev); @@ -3465,11 +3463,6 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev) if (error) pci_err(dev, "unable to allocate suspend buffer for LTR\n"); - error = pci_add_ext_cap_save_buffer(dev, PCI_EXT_CAP_ID_L1SS, - 2 * sizeof(u32)); - if (error) - pci_err(dev, "unable to allocate suspend buffer for ASPM-L1SS\n"); - pci_allocate_vc_save_buffers(dev); } diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 9ed3b5550043..9049d07d3aae 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -566,14 +566,10 @@ bool pcie_wait_for_link(struct pci_dev *pdev, bool active); void pcie_aspm_init_link_state(struct pci_dev *pdev); void pcie_aspm_exit_link_state(struct pci_dev *pdev); void pcie_aspm_powersave_config_link(struct pci_dev *pdev); -void pci_save_aspm_l1ss_state(struct pci_dev *dev); -void pci_restore_aspm_l1ss_state(struct pci_dev *dev); #else static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_exit_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_powersave_config_link(struct pci_dev *pdev) { } -static inline void pci_save_aspm_l1ss_state(struct pci_dev *dev) { } -static inline void pci_restore_aspm_l1ss_state(struct pci_dev *dev) { } #endif #ifdef CONFIG_PCIE_ECRC diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 53a1fa306e1e..4b4184563a92 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -470,31 +470,6 @@ static void pci_clear_and_set_dword(struct pci_dev *pdev, int pos, pci_write_config_dword(pdev, pos, val); } -static void aspm_program_l1ss(struct pci_dev *dev, u32 ctl1, u32 ctl2) -{ - u16 l1ss = dev->l1ss; - u32 l1_2_enable; - - /* - * Per PCIe r6.0, sec 5.5.4, T_POWER_ON in PCI_L1SS_CTL2 must be - * programmed prior to setting the L1.2 enable bits in PCI_L1SS_CTL1. 
- */ - pci_write_config_dword(dev, l1ss + PCI_L1SS_CTL2, ctl2); - - /* - * In addition, Common_Mode_Restore_Time and LTR_L1.2_THRESHOLD in - * PCI_L1SS_CTL1 must be programmed *before* setting the L1.2 - * enable bits, even though they're all in PCI_L1SS_CTL1. - */ - l1_2_enable = ctl1 & PCI_L1SS_CTL1_L1_2_MASK; - ctl1 &= ~PCI_L1SS_CTL1_L1_2_MASK; - - pci_write_config_dword(dev, l1ss + PCI_L1SS_CTL1, ctl1); - if (l1_2_enable) - pci_write_config_dword(dev, l1ss + PCI_L1SS_CTL1, - ctl1 | l1_2_enable); -} - /* Calculate L1.2 PM substate timing parameters */ static void aspm_calc_l1ss_info(struct pcie_link_state *link, u32 parent_l1ss_cap, u32 child_l1ss_cap) @@ -504,6 +479,7 @@ static void aspm_calc_l1ss_info(struct pcie_link_state *link, u32 t_common_mode, t_power_on, l1_2_threshold, scale, value; u32 ctl1 = 0, ctl2 = 0; u32 pctl1, pctl2, cctl1, cctl2; + u32 pl1_2_enables, cl1_2_enables; if (!(link->aspm_support & ASPM_STATE_L1_2_MASK)) return; @@ -552,21 +528,39 @@ static void aspm_calc_l1ss_info(struct pcie_link_state *link, ctl2 == pctl2 && ctl2 == cctl2) return; - pctl1 &= ~(PCI_L1SS_CTL1_CM_RESTORE_TIME | - PCI_L1SS_CTL1_LTR_L12_TH_VALUE | - PCI_L1SS_CTL1_LTR_L12_TH_SCALE); - pctl1 |= (ctl1 & (PCI_L1SS_CTL1_CM_RESTORE_TIME | - PCI_L1SS_CTL1_LTR_L12_TH_VALUE | - PCI_L1SS_CTL1_LTR_L12_TH_SCALE)); - aspm_program_l1ss(parent, pctl1, ctl2); - - cctl1 &= ~(PCI_L1SS_CTL1_CM_RESTORE_TIME | - PCI_L1SS_CTL1_LTR_L12_TH_VALUE | - PCI_L1SS_CTL1_LTR_L12_TH_SCALE); - cctl1 |= (ctl1 & (PCI_L1SS_CTL1_CM_RESTORE_TIME | - PCI_L1SS_CTL1_LTR_L12_TH_VALUE | - PCI_L1SS_CTL1_LTR_L12_TH_SCALE)); - aspm_program_l1ss(child, cctl1, ctl2); + /* Disable L1.2 while updating. 
See PCIe r5.0, sec 5.5.4, 7.8.3.3 */ + pl1_2_enables = pctl1 & PCI_L1SS_CTL1_L1_2_MASK; + cl1_2_enables = cctl1 & PCI_L1SS_CTL1_L1_2_MASK; + + if (pl1_2_enables || cl1_2_enables) { + pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_L1_2_MASK, 0); + pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_L1_2_MASK, 0); + } + + /* Program T_POWER_ON times in both ports */ + pci_write_config_dword(parent, parent->l1ss + PCI_L1SS_CTL2, ctl2); + pci_write_config_dword(child, child->l1ss + PCI_L1SS_CTL2, ctl2); + + /* Program Common_Mode_Restore_Time in upstream device */ + pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_CM_RESTORE_TIME, ctl1); + + /* Program LTR_L1.2_THRESHOLD time in both ports */ + pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_LTR_L12_TH_VALUE | + PCI_L1SS_CTL1_LTR_L12_TH_SCALE, ctl1); + pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_LTR_L12_TH_VALUE | + PCI_L1SS_CTL1_LTR_L12_TH_SCALE, ctl1); + + if (pl1_2_enables || cl1_2_enables) { + pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, 0, + pl1_2_enables); + pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, 0, + cl1_2_enables); + } } static void aspm_l1ss_init(struct pcie_link_state *link) @@ -757,43 +751,6 @@ static void pcie_config_aspm_l1ss(struct pcie_link_state *link, u32 state) PCI_L1SS_CTL1_L1SS_MASK, val); } -void pci_save_aspm_l1ss_state(struct pci_dev *dev) -{ - struct pci_cap_saved_state *save_state; - u16 l1ss = dev->l1ss; - u32 *cap; - - if (!l1ss) - return; - - save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_L1SS); - if (!save_state) - return; - - cap = (u32 *)&save_state->cap.data[0]; - pci_read_config_dword(dev, l1ss + PCI_L1SS_CTL2, cap++); - pci_read_config_dword(dev, l1ss + PCI_L1SS_CTL1, cap++); -} - -void pci_restore_aspm_l1ss_state(struct pci_dev *dev) -{ - struct pci_cap_saved_state *save_state; - u32 *cap, ctl1, 
ctl2; - u16 l1ss = dev->l1ss; - - if (!l1ss) - return; - - save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_L1SS); - if (!save_state) - return; - - cap = (u32 *)&save_state->cap.data[0]; - ctl2 = *cap++; - ctl1 = *cap; - aspm_program_l1ss(dev, ctl1, ctl2); -} - static void pcie_config_aspm_dev(struct pci_dev *pdev, u32 val) { pcie_capability_clear_and_set_word(pdev, PCI_EXP_LNKCTL, diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 9b593f985805..40f70f83daba 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -550,13 +550,7 @@ static void armpmu_disable(struct pmu *pmu) static bool armpmu_filter(struct pmu *pmu, int cpu) { struct arm_pmu *armpmu = to_arm_pmu(pmu); - bool ret; - - ret = cpumask_test_cpu(cpu, &armpmu->supported_cpus); - if (ret && armpmu->filter) - return armpmu->filter(pmu, cpu); - - return ret; + return !cpumask_test_cpu(cpu, &armpmu->supported_cpus); } static ssize_t cpus_show(struct device *dev, diff --git a/drivers/pinctrl/aspeed/pinctrl-aspeed.c b/drivers/pinctrl/aspeed/pinctrl-aspeed.c index 3945612900e6..9c6ee46ac7a0 100644 --- a/drivers/pinctrl/aspeed/pinctrl-aspeed.c +++ b/drivers/pinctrl/aspeed/pinctrl-aspeed.c @@ -93,10 +93,19 @@ static int aspeed_sig_expr_enable(struct aspeed_pinmux_data *ctx, static int aspeed_sig_expr_disable(struct aspeed_pinmux_data *ctx, const struct aspeed_sig_expr *expr) { + int ret; + pr_debug("Disabling signal %s for %s\n", expr->signal, expr->function); - return aspeed_sig_expr_set(ctx, expr, false); + ret = aspeed_sig_expr_eval(ctx, expr, true); + if (ret < 0) + return ret; + + if (ret) + return aspeed_sig_expr_set(ctx, expr, false); + + return 0; } /** @@ -114,7 +123,7 @@ static int aspeed_disable_sig(struct aspeed_pinmux_data *ctx, int ret = 0; if (!exprs) - return true; + return -EINVAL; while (*exprs && !ret) { ret = aspeed_sig_expr_disable(ctx, *exprs); diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c index 
cc3aaba24188..e49f271de936 100644 --- a/drivers/pinctrl/intel/pinctrl-intel.c +++ b/drivers/pinctrl/intel/pinctrl-intel.c @@ -1709,6 +1709,12 @@ const struct intel_pinctrl_soc_data *intel_pinctrl_get_soc_data(struct platform_ EXPORT_SYMBOL_GPL(intel_pinctrl_get_soc_data); #ifdef CONFIG_PM_SLEEP +static bool __intel_gpio_is_direct_irq(u32 value) +{ + return (value & PADCFG0_GPIROUTIOXAPIC) && (value & PADCFG0_GPIOTXDIS) && + (__intel_gpio_get_gpio_mode(value) == PADCFG0_PMODE_GPIO); +} + static bool intel_pinctrl_should_save(struct intel_pinctrl *pctrl, unsigned int pin) { const struct pin_desc *pd = pin_desc_get(pctrl->pctldev, pin); @@ -1742,8 +1748,7 @@ static bool intel_pinctrl_should_save(struct intel_pinctrl *pctrl, unsigned int * See https://bugzilla.kernel.org/show_bug.cgi?id=214749. */ value = readl(intel_get_padcfg(pctrl, pin, PADCFG0)); - if ((value & PADCFG0_GPIROUTIOXAPIC) && (value & PADCFG0_GPIOTXDIS) && - (__intel_gpio_get_gpio_mode(value) == PADCFG0_PMODE_GPIO)) + if (__intel_gpio_is_direct_irq(value)) return true; return false; @@ -1873,7 +1878,12 @@ int intel_pinctrl_resume_noirq(struct device *dev) for (i = 0; i < pctrl->soc->npins; i++) { const struct pinctrl_pin_desc *desc = &pctrl->soc->pins[i]; - if (!intel_pinctrl_should_save(pctrl, desc->number)) + if (!(intel_pinctrl_should_save(pctrl, desc->number) || + /* + * If the firmware mangled the register contents too much, + * check the saved value for the Direct IRQ mode. 
+ */ + __intel_gpio_is_direct_irq(pads[i].padcfg0))) continue; intel_restore_padcfg(pctrl, desc->number, PADCFG0, pads[i].padcfg0); diff --git a/drivers/pinctrl/mediatek/pinctrl-mt8195.c b/drivers/pinctrl/mediatek/pinctrl-mt8195.c index 89557c7ed2ab..09c4dcef9338 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt8195.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt8195.c @@ -659,7 +659,7 @@ static const struct mtk_pin_field_calc mt8195_pin_drv_range[] = { PIN_FIELD_BASE(10, 10, 4, 0x010, 0x10, 9, 3), PIN_FIELD_BASE(11, 11, 4, 0x000, 0x10, 24, 3), PIN_FIELD_BASE(12, 12, 4, 0x010, 0x10, 12, 3), - PIN_FIELD_BASE(13, 13, 4, 0x010, 0x10, 27, 3), + PIN_FIELD_BASE(13, 13, 4, 0x000, 0x10, 27, 3), PIN_FIELD_BASE(14, 14, 4, 0x010, 0x10, 15, 3), PIN_FIELD_BASE(15, 15, 4, 0x010, 0x10, 0, 3), PIN_FIELD_BASE(16, 16, 4, 0x010, 0x10, 18, 3), @@ -708,7 +708,7 @@ static const struct mtk_pin_field_calc mt8195_pin_drv_range[] = { PIN_FIELD_BASE(78, 78, 3, 0x000, 0x10, 15, 3), PIN_FIELD_BASE(79, 79, 3, 0x000, 0x10, 18, 3), PIN_FIELD_BASE(80, 80, 3, 0x000, 0x10, 21, 3), - PIN_FIELD_BASE(81, 81, 3, 0x000, 0x10, 28, 3), + PIN_FIELD_BASE(81, 81, 3, 0x000, 0x10, 24, 3), PIN_FIELD_BASE(82, 82, 3, 0x000, 0x10, 27, 3), PIN_FIELD_BASE(83, 83, 3, 0x010, 0x10, 0, 3), PIN_FIELD_BASE(84, 84, 3, 0x010, 0x10, 3, 3), diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 9bc6e3922e78..32c3edaf9038 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -365,6 +365,7 @@ static void amd_gpio_dbg_show(struct seq_file *s, struct gpio_chip *gc) } else { debounce_enable = " ∅"; + time = 0; } snprintf(debounce_value, sizeof(debounce_value), "%u", time * unit); seq_printf(s, "debounce %s (🕑 %sus)| ", debounce_enable, debounce_value); diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c index 99c3745da456..190923757cda 100644 --- a/drivers/pinctrl/pinctrl-single.c +++ b/drivers/pinctrl/pinctrl-single.c @@ -372,6 +372,8 @@ static int 
pcs_set_mux(struct pinctrl_dev *pctldev, unsigned fselector, if (!pcs->fmask) return 0; function = pinmux_generic_get_function(pctldev, fselector); + if (!function) + return -EINVAL; func = function->data; if (!func) return -EINVAL; diff --git a/drivers/pinctrl/qcom/pinctrl-sm8450-lpass-lpi.c b/drivers/pinctrl/qcom/pinctrl-sm8450-lpass-lpi.c index c3c8c34148f1..e22d03ce292e 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8450-lpass-lpi.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8450-lpass-lpi.c @@ -105,7 +105,7 @@ static const struct pinctrl_pin_desc sm8450_lpi_pins[] = { static const char * const swr_tx_clk_groups[] = { "gpio0" }; static const char * const swr_tx_data_groups[] = { "gpio1", "gpio2", "gpio14" }; static const char * const swr_rx_clk_groups[] = { "gpio3" }; -static const char * const swr_rx_data_groups[] = { "gpio4", "gpio5", "gpio15" }; +static const char * const swr_rx_data_groups[] = { "gpio4", "gpio5" }; static const char * const dmic1_clk_groups[] = { "gpio6" }; static const char * const dmic1_data_groups[] = { "gpio7" }; static const char * const dmic2_clk_groups[] = { "gpio8" }; diff --git a/drivers/platform/x86/amd/Kconfig b/drivers/platform/x86/amd/Kconfig index a825af8126c8..2ce8cb2170df 100644 --- a/drivers/platform/x86/amd/Kconfig +++ b/drivers/platform/x86/amd/Kconfig @@ -8,6 +8,7 @@ source "drivers/platform/x86/amd/pmf/Kconfig" config AMD_PMC tristate "AMD SoC PMC driver" depends on ACPI && PCI && RTC_CLASS + select SERIO help The driver provides support for AMD Power Management Controller primarily responsible for S2Idle transactions that are driven from diff --git a/drivers/platform/x86/amd/pmf/auto-mode.c b/drivers/platform/x86/amd/pmf/auto-mode.c index 644af42e07cf..96a8e1832c05 100644 --- a/drivers/platform/x86/amd/pmf/auto-mode.c +++ b/drivers/platform/x86/amd/pmf/auto-mode.c @@ -275,13 +275,8 @@ int amd_pmf_reset_amt(struct amd_pmf_dev *dev) */ if (is_apmf_func_supported(dev, APMF_FUNC_STATIC_SLIDER_GRANULAR)) { - int mode = 
amd_pmf_get_pprof_modes(dev); - - if (mode < 0) - return mode; - dev_dbg(dev->dev, "resetting AMT thermals\n"); - amd_pmf_update_slider(dev, SLIDER_OP_SET, mode, NULL); + amd_pmf_set_sps_power_limits(dev); } return 0; } @@ -299,7 +294,5 @@ void amd_pmf_deinit_auto_mode(struct amd_pmf_dev *dev) void amd_pmf_init_auto_mode(struct amd_pmf_dev *dev) { amd_pmf_load_defaults_auto_mode(dev); - /* update the thermal limits for Automode */ - amd_pmf_set_automode(dev, config_store.current_mode, NULL); amd_pmf_init_metrics_table(dev); } diff --git a/drivers/platform/x86/amd/pmf/cnqf.c b/drivers/platform/x86/amd/pmf/cnqf.c index 3f9731a2ac28..4beb22a19466 100644 --- a/drivers/platform/x86/amd/pmf/cnqf.c +++ b/drivers/platform/x86/amd/pmf/cnqf.c @@ -103,7 +103,7 @@ int amd_pmf_trans_cnqf(struct amd_pmf_dev *dev, int socket_power, ktime_t time_l src = amd_pmf_cnqf_get_power_source(dev); - if (dev->current_profile == PLATFORM_PROFILE_BALANCED) { + if (is_pprof_balanced(dev)) { amd_pmf_set_cnqf(dev, src, config_store.current_mode, NULL); } else { /* @@ -307,13 +307,9 @@ static ssize_t cnqf_enable_store(struct device *dev, const char *buf, size_t count) { struct amd_pmf_dev *pdev = dev_get_drvdata(dev); - int mode, result, src; + int result, src; bool input; - mode = amd_pmf_get_pprof_modes(pdev); - if (mode < 0) - return mode; - result = kstrtobool(buf, &input); if (result) return result; @@ -321,11 +317,11 @@ static ssize_t cnqf_enable_store(struct device *dev, src = amd_pmf_cnqf_get_power_source(pdev); pdev->cnqf_enabled = input; - if (pdev->cnqf_enabled && pdev->current_profile == PLATFORM_PROFILE_BALANCED) { + if (pdev->cnqf_enabled && is_pprof_balanced(pdev)) { amd_pmf_set_cnqf(pdev, src, config_store.current_mode, NULL); } else { if (is_apmf_func_supported(pdev, APMF_FUNC_STATIC_SLIDER_GRANULAR)) - amd_pmf_update_slider(pdev, SLIDER_OP_SET, mode, NULL); + amd_pmf_set_sps_power_limits(pdev); } dev_dbg(pdev->dev, "Received CnQF %s\n", input ? 
"on" : "off"); @@ -386,7 +382,7 @@ int amd_pmf_init_cnqf(struct amd_pmf_dev *dev) dev->cnqf_enabled = amd_pmf_check_flags(dev); /* update the thermal for CnQF */ - if (dev->cnqf_enabled && dev->current_profile == PLATFORM_PROFILE_BALANCED) { + if (dev->cnqf_enabled && is_pprof_balanced(dev)) { src = amd_pmf_cnqf_get_power_source(dev); amd_pmf_set_cnqf(dev, src, config_store.current_mode, NULL); } diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index a5f5a4bcff6d..da23639071d7 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -58,6 +58,25 @@ static bool force_load; module_param(force_load, bool, 0444); MODULE_PARM_DESC(force_load, "Force load this driver on supported older platforms (experimental)"); +static int amd_pmf_pwr_src_notify_call(struct notifier_block *nb, unsigned long event, void *data) +{ + struct amd_pmf_dev *pmf = container_of(nb, struct amd_pmf_dev, pwr_src_notifier); + + if (event != PSY_EVENT_PROP_CHANGED) + return NOTIFY_OK; + + if (is_apmf_func_supported(pmf, APMF_FUNC_AUTO_MODE) || + is_apmf_func_supported(pmf, APMF_FUNC_DYN_SLIDER_DC) || + is_apmf_func_supported(pmf, APMF_FUNC_DYN_SLIDER_AC)) { + if ((pmf->amt_enabled || pmf->cnqf_enabled) && is_pprof_balanced(pmf)) + return NOTIFY_DONE; + } + + amd_pmf_set_sps_power_limits(pmf); + + return NOTIFY_OK; +} + static int current_power_limits_show(struct seq_file *seq, void *unused) { struct amd_pmf_dev *dev = seq->private; @@ -366,14 +385,18 @@ static int amd_pmf_probe(struct platform_device *pdev) if (!dev->regbase) return -ENOMEM; + mutex_init(&dev->lock); + mutex_init(&dev->update_mutex); + apmf_acpi_init(dev); platform_set_drvdata(pdev, dev); amd_pmf_init_features(dev); apmf_install_handler(dev); amd_pmf_dbgfs_register(dev); - mutex_init(&dev->lock); - mutex_init(&dev->update_mutex); + dev->pwr_src_notifier.notifier_call = amd_pmf_pwr_src_notify_call; + power_supply_reg_notifier(&dev->pwr_src_notifier); + 
dev_info(dev->dev, "registered PMF device successfully\n"); return 0; @@ -383,11 +406,12 @@ static int amd_pmf_remove(struct platform_device *pdev) { struct amd_pmf_dev *dev = platform_get_drvdata(pdev); - mutex_destroy(&dev->lock); - mutex_destroy(&dev->update_mutex); + power_supply_unreg_notifier(&dev->pwr_src_notifier); amd_pmf_deinit_features(dev); apmf_acpi_deinit(dev); amd_pmf_dbgfs_unregister(dev); + mutex_destroy(&dev->lock); + mutex_destroy(&dev->update_mutex); kfree(dev->buf); return 0; } diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index 84bbe2c6ea61..06c30cdc0573 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -169,6 +169,7 @@ struct amd_pmf_dev { struct mutex update_mutex; /* protects race between ACPI handler and metrics thread */ bool cnqf_enabled; bool cnqf_supported; + struct notifier_block pwr_src_notifier; }; struct apmf_sps_prop_granular { @@ -391,9 +392,11 @@ int amd_pmf_init_sps(struct amd_pmf_dev *dev); void amd_pmf_deinit_sps(struct amd_pmf_dev *dev); int apmf_get_static_slider_granular(struct amd_pmf_dev *pdev, struct apmf_static_slider_granular_output *output); +bool is_pprof_balanced(struct amd_pmf_dev *pmf); int apmf_update_fan_idx(struct amd_pmf_dev *pdev, bool manual, u32 idx); +int amd_pmf_set_sps_power_limits(struct amd_pmf_dev *pmf); /* Auto Mode Layer */ int apmf_get_auto_mode_def(struct amd_pmf_dev *pdev, struct apmf_auto_mode *data); diff --git a/drivers/platform/x86/amd/pmf/sps.c b/drivers/platform/x86/amd/pmf/sps.c index dba7e36962dc..bed762d47a14 100644 --- a/drivers/platform/x86/amd/pmf/sps.c +++ b/drivers/platform/x86/amd/pmf/sps.c @@ -70,6 +70,24 @@ void amd_pmf_update_slider(struct amd_pmf_dev *dev, bool op, int idx, } } +int amd_pmf_set_sps_power_limits(struct amd_pmf_dev *pmf) +{ + int mode; + + mode = amd_pmf_get_pprof_modes(pmf); + if (mode < 0) + return mode; + + amd_pmf_update_slider(pmf, SLIDER_OP_SET, mode, NULL); + + return 0; +} + 
+bool is_pprof_balanced(struct amd_pmf_dev *pmf) +{ + return (pmf->current_profile == PLATFORM_PROFILE_BALANCED) ? true : false; +} + static int amd_pmf_profile_get(struct platform_profile_handler *pprof, enum platform_profile_option *profile) { @@ -105,15 +123,10 @@ static int amd_pmf_profile_set(struct platform_profile_handler *pprof, enum platform_profile_option profile) { struct amd_pmf_dev *pmf = container_of(pprof, struct amd_pmf_dev, pprof); - int mode; pmf->current_profile = profile; - mode = amd_pmf_get_pprof_modes(pmf); - if (mode < 0) - return mode; - amd_pmf_update_slider(pmf, SLIDER_OP_SET, mode, NULL); - return 0; + return amd_pmf_set_sps_power_limits(pmf); } int amd_pmf_init_sps(struct amd_pmf_dev *dev) @@ -123,6 +136,9 @@ int amd_pmf_init_sps(struct amd_pmf_dev *dev) dev->current_profile = PLATFORM_PROFILE_BALANCED; amd_pmf_load_defaults_sps(dev); + /* update SPS balanced power mode thermals */ + amd_pmf_set_sps_power_limits(dev); + dev->pprof.profile_get = amd_pmf_profile_get; dev->pprof.profile_set = amd_pmf_profile_set; diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index bb81b8b1f7e9..89c5374e33b3 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -408,14 +408,23 @@ static const struct intel_vsec_platform_info dg1_info = { .quirks = VSEC_QUIRK_NO_DVSEC | VSEC_QUIRK_EARLY_HW, }; +/* MTL info */ +static const struct intel_vsec_platform_info mtl_info = { + .quirks = VSEC_QUIRK_NO_WATCHER | VSEC_QUIRK_NO_CRASHLOG, +}; + #define PCI_DEVICE_ID_INTEL_VSEC_ADL 0x467d #define PCI_DEVICE_ID_INTEL_VSEC_DG1 0x490e +#define PCI_DEVICE_ID_INTEL_VSEC_MTL_M 0x7d0d +#define PCI_DEVICE_ID_INTEL_VSEC_MTL_S 0xad0d #define PCI_DEVICE_ID_INTEL_VSEC_OOBMSM 0x09a7 #define PCI_DEVICE_ID_INTEL_VSEC_RPL 0xa77d #define PCI_DEVICE_ID_INTEL_VSEC_TGL 0x9a0d static const struct pci_device_id intel_vsec_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, VSEC_ADL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_DG1, 
&dg1_info) }, + { PCI_DEVICE_DATA(INTEL, VSEC_MTL_M, &mtl_info) }, + { PCI_DEVICE_DATA(INTEL, VSEC_MTL_S, &mtl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_OOBMSM, &(struct intel_vsec_platform_info) {}) }, { PCI_DEVICE_DATA(INTEL, VSEC_RPL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_TGL, &tgl_info) }, diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 02860c32625e..32c10457399e 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -5563,7 +5563,7 @@ static int light_sysfs_set(struct led_classdev *led_cdev, static enum led_brightness light_sysfs_get(struct led_classdev *led_cdev) { - return (light_get_status() == 1) ? LED_FULL : LED_OFF; + return (light_get_status() == 1) ? LED_ON : LED_OFF; } static struct tpacpi_led_classdev tpacpi_led_thinklight = { diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index f00995390fdf..13802a3c3591 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -1098,6 +1098,15 @@ const struct dmi_system_id touchscreen_dmi_table[] = { }, }, { + /* Chuwi Vi8 (CWI501) */ + .driver_data = (void *)&chuwi_vi8_data, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Insyde"), + DMI_MATCH(DMI_PRODUCT_NAME, "i86"), + DMI_MATCH(DMI_BIOS_VERSION, "CHUWI.W86JLBNR01"), + }, + }, + { /* Chuwi Vi8 (CWI506) */ .driver_data = (void *)&chuwi_vi8_data, .matches = { diff --git a/drivers/rtc/rtc-efi.c b/drivers/rtc/rtc-efi.c index e991cccdb6e9..1e8bc6cc1e12 100644 --- a/drivers/rtc/rtc-efi.c +++ b/drivers/rtc/rtc-efi.c @@ -188,9 +188,10 @@ static int efi_set_time(struct device *dev, struct rtc_time *tm) static int efi_procfs(struct device *dev, struct seq_file *seq) { - efi_time_t eft, alm; - efi_time_cap_t cap; - efi_bool_t enabled, pending; + efi_time_t eft, alm; + efi_time_cap_t cap; + efi_bool_t enabled, pending; + struct rtc_device *rtc = dev_get_drvdata(dev); memset(&eft, 0, sizeof(eft)); 
memset(&alm, 0, sizeof(alm)); @@ -213,23 +214,25 @@ static int efi_procfs(struct device *dev, struct seq_file *seq) /* XXX fixme: convert to string? */ seq_printf(seq, "Timezone\t: %u\n", eft.timezone); - seq_printf(seq, - "Alarm Time\t: %u:%u:%u.%09u\n" - "Alarm Date\t: %u-%u-%u\n" - "Alarm Daylight\t: %u\n" - "Enabled\t\t: %s\n" - "Pending\t\t: %s\n", - alm.hour, alm.minute, alm.second, alm.nanosecond, - alm.year, alm.month, alm.day, - alm.daylight, - enabled == 1 ? "yes" : "no", - pending == 1 ? "yes" : "no"); - - if (eft.timezone == EFI_UNSPECIFIED_TIMEZONE) - seq_puts(seq, "Timezone\t: unspecified\n"); - else - /* XXX fixme: convert to string? */ - seq_printf(seq, "Timezone\t: %u\n", alm.timezone); + if (test_bit(RTC_FEATURE_ALARM, rtc->features)) { + seq_printf(seq, + "Alarm Time\t: %u:%u:%u.%09u\n" + "Alarm Date\t: %u-%u-%u\n" + "Alarm Daylight\t: %u\n" + "Enabled\t\t: %s\n" + "Pending\t\t: %s\n", + alm.hour, alm.minute, alm.second, alm.nanosecond, + alm.year, alm.month, alm.day, + alm.daylight, + enabled == 1 ? "yes" : "no", + pending == 1 ? "yes" : "no"); + + if (eft.timezone == EFI_UNSPECIFIED_TIMEZONE) + seq_puts(seq, "Timezone\t: unspecified\n"); + else + /* XXX fixme: convert to string? 
*/ + seq_printf(seq, "Timezone\t: %u\n", alm.timezone); + } /* * now prints the capabilities @@ -269,7 +272,10 @@ static int __init efi_rtc_probe(struct platform_device *dev) rtc->ops = &efi_rtc_ops; clear_bit(RTC_FEATURE_UPDATE_INTERRUPT, rtc->features); - set_bit(RTC_FEATURE_ALARM_WAKEUP_ONLY, rtc->features); + if (efi_rt_services_supported(EFI_RT_SUPPORTED_WAKEUP_SERVICES)) + set_bit(RTC_FEATURE_ALARM_WAKEUP_ONLY, rtc->features); + else + clear_bit(RTC_FEATURE_ALARM, rtc->features); device_init_wakeup(&dev->dev, true); diff --git a/drivers/rtc/rtc-sunplus.c b/drivers/rtc/rtc-sunplus.c index e8e2ab1103fc..4b578e4d44f6 100644 --- a/drivers/rtc/rtc-sunplus.c +++ b/drivers/rtc/rtc-sunplus.c @@ -240,8 +240,8 @@ static int sp_rtc_probe(struct platform_device *plat_dev) if (IS_ERR(sp_rtc->reg_base)) return dev_err_probe(&plat_dev->dev, PTR_ERR(sp_rtc->reg_base), "%s devm_ioremap_resource fail\n", RTC_REG_NAME); - dev_dbg(&plat_dev->dev, "res = 0x%x, reg_base = 0x%lx\n", - sp_rtc->res->start, (unsigned long)sp_rtc->reg_base); + dev_dbg(&plat_dev->dev, "res = %pR, reg_base = %p\n", + sp_rtc->res, sp_rtc->reg_base); sp_rtc->irq = platform_get_irq(plat_dev, 0); if (sp_rtc->irq < 0) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 1426b9b03612..9feb0323bc44 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -588,8 +588,6 @@ void scsi_device_put(struct scsi_device *sdev) { struct module *mod = sdev->host->hostt->module; - might_sleep(); - put_device(&sdev->sdev_gendev); module_put(mod); } diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 7a6904a3928e..f9b18fdc7b3c 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -1232,8 +1232,7 @@ static int scsi_probe_and_add_lun(struct scsi_target *starget, * that no LUN is present, so don't add sdev in these cases. 
* Two specific examples are: * 1) NetApp targets: return PQ=1, PDT=0x1f - * 2) IBM/2145 targets: return PQ=1, PDT=0 - * 3) USB UFI: returns PDT=0x1f, with the PQ bits being "reserved" + * 2) USB UFI: returns PDT=0x1f, with the PQ bits being "reserved" * in the UFI 1.0 spec (we cannot rely on reserved bits). * * References: @@ -1247,8 +1246,8 @@ static int scsi_probe_and_add_lun(struct scsi_target *starget, * PDT=00h Direct-access device (floppy) * PDT=1Fh none (no FDD connected to the requested logical unit) */ - if (((result[0] >> 5) == 1 || - (starget->pdt_1f_for_no_lun && (result[0] & 0x1f) == 0x1f)) && + if (((result[0] >> 5) == 1 || starget->pdt_1f_for_no_lun) && + (result[0] & 0x1f) == 0x1f && !scsi_is_wlun(lun)) { SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev, "scsi scan: peripheral device type" diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 981d1bab2120..8ef9a5494340 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -451,6 +451,8 @@ static void scsi_device_dev_release(struct device *dev) struct scsi_vpd *vpd_pgb0 = NULL, *vpd_pgb1 = NULL, *vpd_pgb2 = NULL; unsigned long flags; + might_sleep(); + scsi_dh_release_device(sdev); parent = sdev->sdev_gendev.parent; diff --git a/drivers/spi/spi-dw-core.c b/drivers/spi/spi-dw-core.c index 99edddf9958b..c3bfb6c84cab 100644 --- a/drivers/spi/spi-dw-core.c +++ b/drivers/spi/spi-dw-core.c @@ -366,7 +366,7 @@ static void dw_spi_irq_setup(struct dw_spi *dws) * will be adjusted at the final stage of the IRQ-based SPI transfer * execution so not to lose the leftover of the incoming data. 
*/ - level = min_t(u16, dws->fifo_len / 2, dws->tx_len); + level = min_t(unsigned int, dws->fifo_len / 2, dws->tx_len); dw_writel(dws, DW_SPI_TXFTLR, level); dw_writel(dws, DW_SPI_RXFTLR, level - 1); diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 15f174f4e056..3f33934f5429 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2220,11 +2220,26 @@ void spi_flush_queue(struct spi_controller *ctlr) /*-------------------------------------------------------------------------*/ #if defined(CONFIG_OF) +static void of_spi_parse_dt_cs_delay(struct device_node *nc, + struct spi_delay *delay, const char *prop) +{ + u32 value; + + if (!of_property_read_u32(nc, prop, &value)) { + if (value > U16_MAX) { + delay->value = DIV_ROUND_UP(value, 1000); + delay->unit = SPI_DELAY_UNIT_USECS; + } else { + delay->value = value; + delay->unit = SPI_DELAY_UNIT_NSECS; + } + } +} + static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, struct device_node *nc) { u32 value; - u16 cs_setup; int rc; /* Mode (clock phase/polarity/etc.) 
*/ @@ -2310,10 +2325,8 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, if (!of_property_read_u32(nc, "spi-max-frequency", &value)) spi->max_speed_hz = value; - if (!of_property_read_u16(nc, "spi-cs-setup-delay-ns", &cs_setup)) { - spi->cs_setup.value = cs_setup; - spi->cs_setup.unit = SPI_DELAY_UNIT_NSECS; - } + /* Device CS delays */ + of_spi_parse_dt_cs_delay(nc, &spi->cs_setup, "spi-cs-setup-delay-ns"); return 0; } diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c index 1935ca613447..a1ea093795cf 100644 --- a/drivers/spi/spidev.c +++ b/drivers/spi/spidev.c @@ -90,9 +90,21 @@ MODULE_PARM_DESC(bufsiz, "data bytes in biggest supported SPI message"); /*-------------------------------------------------------------------------*/ static ssize_t +spidev_sync_unlocked(struct spi_device *spi, struct spi_message *message) +{ + ssize_t status; + + status = spi_sync(spi, message); + if (status == 0) + status = message->actual_length; + + return status; +} + +static ssize_t spidev_sync(struct spidev_data *spidev, struct spi_message *message) { - int status; + ssize_t status; struct spi_device *spi; mutex_lock(&spidev->spi_lock); @@ -101,12 +113,10 @@ spidev_sync(struct spidev_data *spidev, struct spi_message *message) if (spi == NULL) status = -ESHUTDOWN; else - status = spi_sync(spi, message); - - if (status == 0) - status = message->actual_length; + status = spidev_sync_unlocked(spi, message); mutex_unlock(&spidev->spi_lock); + return status; } @@ -294,7 +304,7 @@ static int spidev_message(struct spidev_data *spidev, spi_message_add_tail(k_tmp, &msg); } - status = spidev_sync(spidev, &msg); + status = spidev_sync_unlocked(spidev->spi, &msg); if (status < 0) goto done; diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index 27295bda3e0b..b1c6231defad 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -11,6 +11,7 @@ #include <linux/tee_drv.h> #include <linux/uaccess.h> #include <linux/uio.h> +#include 
<linux/highmem.h> #include "tee_private.h" static void shm_put_kernel_pages(struct page **pages, size_t page_count) @@ -24,38 +25,20 @@ static void shm_put_kernel_pages(struct page **pages, size_t page_count) static int shm_get_kernel_pages(unsigned long start, size_t page_count, struct page **pages) { + struct page *page; size_t n; - int rc; - - if (is_vmalloc_addr((void *)start)) { - struct page *page; - - for (n = 0; n < page_count; n++) { - page = vmalloc_to_page((void *)(start + PAGE_SIZE * n)); - if (!page) - return -ENOMEM; - get_page(page); - pages[n] = page; - } - rc = page_count; - } else { - struct kvec *kiov; - - kiov = kcalloc(page_count, sizeof(*kiov), GFP_KERNEL); - if (!kiov) - return -ENOMEM; - - for (n = 0; n < page_count; n++) { - kiov[n].iov_base = (void *)(start + n * PAGE_SIZE); - kiov[n].iov_len = PAGE_SIZE; - } + if (WARN_ON_ONCE(is_vmalloc_addr((void *)start) || + is_kmap_addr((void *)start))) + return -EINVAL; - rc = get_kernel_pages(kiov, page_count, 0, pages); - kfree(kiov); + page = virt_to_page(start); + for (n = 0; n < page_count; n++) { + pages[n] = page + n; + get_page(pages[n]); } - return rc; + return page_count; } static void release_registered_pages(struct tee_shm *shm) diff --git a/drivers/tty/serial/8250/8250_dma.c b/drivers/tty/serial/8250/8250_dma.c index 37d6af2ec427..7fa66501792d 100644 --- a/drivers/tty/serial/8250/8250_dma.c +++ b/drivers/tty/serial/8250/8250_dma.c @@ -43,15 +43,23 @@ static void __dma_rx_complete(struct uart_8250_port *p) struct uart_8250_dma *dma = p->dma; struct tty_port *tty_port = &p->port.state->port; struct dma_tx_state state; + enum dma_status dma_status; int count; - dma->rx_running = 0; - dmaengine_tx_status(dma->rxchan, dma->rx_cookie, &state); + /* + * New DMA Rx can be started during the completion handler before it + * could acquire port's lock and it might still be ongoing. Don't to + * anything in such case. 
+ */ + dma_status = dmaengine_tx_status(dma->rxchan, dma->rx_cookie, &state); + if (dma_status == DMA_IN_PROGRESS) + return; count = dma->rx_size - state.residue; tty_insert_flip_string(tty_port, dma->rx_buf, count); p->port.icount.rx += count; + dma->rx_running = 0; tty_flip_buffer_push(tty_port); } @@ -62,9 +70,14 @@ static void dma_rx_complete(void *param) struct uart_8250_dma *dma = p->dma; unsigned long flags; - __dma_rx_complete(p); - spin_lock_irqsave(&p->port.lock, flags); + if (dma->rx_running) + __dma_rx_complete(p); + + /* + * Cannot be combined with the previous check because __dma_rx_complete() + * changes dma->rx_running. + */ if (!dma->rx_running && (serial_lsr_in(p) & UART_LSR_DR)) p->dma->rx_dma(p); spin_unlock_irqrestore(&p->port.lock, flags); diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c index a1490033aa16..409e91d6829a 100644 --- a/drivers/tty/serial/stm32-usart.c +++ b/drivers/tty/serial/stm32-usart.c @@ -797,25 +797,11 @@ static irqreturn_t stm32_usart_interrupt(int irq, void *ptr) spin_unlock(&port->lock); } - if (stm32_usart_rx_dma_enabled(port)) - return IRQ_WAKE_THREAD; - else - return IRQ_HANDLED; -} - -static irqreturn_t stm32_usart_threaded_interrupt(int irq, void *ptr) -{ - struct uart_port *port = ptr; - struct tty_port *tport = &port->state->port; - struct stm32_port *stm32_port = to_stm32_port(port); - unsigned int size; - unsigned long flags; - /* Receiver timeout irq for DMA RX */ - if (!stm32_port->throttled) { - spin_lock_irqsave(&port->lock, flags); + if (stm32_usart_rx_dma_enabled(port) && !stm32_port->throttled) { + spin_lock(&port->lock); size = stm32_usart_receive_chars(port, false); - uart_unlock_and_check_sysrq_irqrestore(port, flags); + uart_unlock_and_check_sysrq(port); if (size) tty_flip_buffer_push(tport); } @@ -1015,10 +1001,8 @@ static int stm32_usart_startup(struct uart_port *port) u32 val; int ret; - ret = request_threaded_irq(port->irq, stm32_usart_interrupt, - 
stm32_usart_threaded_interrupt, - IRQF_ONESHOT | IRQF_NO_SUSPEND, - name, port); + ret = request_irq(port->irq, stm32_usart_interrupt, + IRQF_NO_SUSPEND, name, port); if (ret) return ret; @@ -1601,13 +1585,6 @@ static int stm32_usart_of_dma_rx_probe(struct stm32_port *stm32port, struct dma_slave_config config; int ret; - /* - * Using DMA and threaded handler for the console could lead to - * deadlocks. - */ - if (uart_console(port)) - return -ENODEV; - stm32port->rx_buf = dma_alloc_coherent(dev, RX_BUF_L, &stm32port->rx_dma_buf, GFP_KERNEL); diff --git a/drivers/tty/vt/vc_screen.c b/drivers/tty/vt/vc_screen.c index 1850bacdb5b0..f566eb1839dc 100644 --- a/drivers/tty/vt/vc_screen.c +++ b/drivers/tty/vt/vc_screen.c @@ -386,10 +386,6 @@ vcs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) uni_mode = use_unicode(inode); attr = use_attributes(inode); - ret = -ENXIO; - vc = vcs_vc(inode, &viewed); - if (!vc) - goto unlock_out; ret = -EINVAL; if (pos < 0) @@ -407,6 +403,11 @@ vcs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) unsigned int this_round, skip = 0; int size; + ret = -ENXIO; + vc = vcs_vc(inode, &viewed); + if (!vc) + goto unlock_out; + /* Check whether we are above size each round, * as copy_to_user at the end of this loop * could sleep. 
diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 079e183cf3bf..934b3d997702 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -526,6 +526,9 @@ static const struct usb_device_id usb_quirk_list[] = { /* DJI CineSSD */ { USB_DEVICE(0x2ca3, 0x0031), .driver_info = USB_QUIRK_NO_LPM }, + /* Alcor Link AK9563 SC Reader used in 2022 Lenovo ThinkPads */ + { USB_DEVICE(0x2ce3, 0x9563), .driver_info = USB_QUIRK_NO_LPM }, + /* DELL USB GEN2 */ { USB_DEVICE(0x413c, 0xb062), .driver_info = USB_QUIRK_NO_LPM | USB_QUIRK_RESET_RESUME }, diff --git a/drivers/usb/dwc3/dwc3-qcom.c b/drivers/usb/dwc3/dwc3-qcom.c index b0a0351d2d8b..959fc925ca7c 100644 --- a/drivers/usb/dwc3/dwc3-qcom.c +++ b/drivers/usb/dwc3/dwc3-qcom.c @@ -901,7 +901,7 @@ static int dwc3_qcom_probe(struct platform_device *pdev) qcom->mode = usb_get_dr_mode(&qcom->dwc3->dev); /* enable vbus override for device mode */ - if (qcom->mode == USB_DR_MODE_PERIPHERAL) + if (qcom->mode != USB_DR_MODE_HOST) dwc3_qcom_vbus_override_enable(qcom, true); /* register extcon to override sw_vbus on Vbus change later */ diff --git a/drivers/usb/fotg210/fotg210-udc.c b/drivers/usb/fotg210/fotg210-udc.c index 87cca81bf4ac..eb076746f032 100644 --- a/drivers/usb/fotg210/fotg210-udc.c +++ b/drivers/usb/fotg210/fotg210-udc.c @@ -1014,7 +1014,6 @@ static int fotg210_udc_start(struct usb_gadget *g, int ret; /* hook up the driver */ - driver->driver.bus = NULL; fotg210->driver = driver; if (!IS_ERR_OR_NULL(fotg210->phy)) { diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 523a961b910b..8ad354741380 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -279,8 +279,10 @@ static int __ffs_ep0_queue_wait(struct ffs_data *ffs, char *data, size_t len) struct usb_request *req = ffs->ep0req; int ret; - if (!req) + if (!req) { + spin_unlock_irq(&ffs->ev.waitq.lock); return -EINVAL; + } req->zero = len < 
le16_to_cpu(ffs->ev.setup.wLength); diff --git a/drivers/usb/gadget/function/f_uac2.c b/drivers/usb/gadget/function/f_uac2.c index 08726e4c68a5..0219cd79493a 100644 --- a/drivers/usb/gadget/function/f_uac2.c +++ b/drivers/usb/gadget/function/f_uac2.c @@ -1142,6 +1142,7 @@ afunc_bind(struct usb_configuration *cfg, struct usb_function *fn) } std_as_out_if0_desc.bInterfaceNumber = ret; std_as_out_if1_desc.bInterfaceNumber = ret; + std_as_out_if1_desc.bNumEndpoints = 1; uac2->as_out_intf = ret; uac2->as_out_alt = 0; diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index 8f12f3f8f6ee..e06022873df1 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -798,6 +798,7 @@ struct eth_dev *gether_setup_name(struct usb_gadget *g, net->max_mtu = GETHER_MAX_MTU_SIZE; dev->gadget = g; + SET_NETDEV_DEV(net, &g->dev); SET_NETDEV_DEVTYPE(net, &gadget_type); status = register_netdev(net); @@ -872,6 +873,8 @@ int gether_register_netdev(struct net_device *net) struct usb_gadget *g; int status; + if (!net->dev.parent) + return -EINVAL; dev = netdev_priv(net); g = dev->gadget; @@ -902,6 +905,7 @@ void gether_set_gadget(struct net_device *net, struct usb_gadget *g) dev = netdev_priv(net); dev->gadget = g; + SET_NETDEV_DEV(net, &g->dev); } EXPORT_SYMBOL_GPL(gether_set_gadget); diff --git a/drivers/usb/gadget/udc/bcm63xx_udc.c b/drivers/usb/gadget/udc/bcm63xx_udc.c index 2cdb07905bde..d04d72f5816e 100644 --- a/drivers/usb/gadget/udc/bcm63xx_udc.c +++ b/drivers/usb/gadget/udc/bcm63xx_udc.c @@ -1830,7 +1830,6 @@ static int bcm63xx_udc_start(struct usb_gadget *gadget, bcm63xx_select_phy_mode(udc, true); udc->driver = driver; - driver->driver.bus = NULL; udc->gadget.dev.of_node = udc->dev->of_node; spin_unlock_irqrestore(&udc->lock, flags); diff --git a/drivers/usb/gadget/udc/fsl_qe_udc.c b/drivers/usb/gadget/udc/fsl_qe_udc.c index bf745358e28e..3b1cc8fa30c8 100644 --- a/drivers/usb/gadget/udc/fsl_qe_udc.c +++ 
b/drivers/usb/gadget/udc/fsl_qe_udc.c @@ -2285,7 +2285,6 @@ static int fsl_qe_start(struct usb_gadget *gadget, /* lock is needed but whether should use this lock or another */ spin_lock_irqsave(&udc->lock, flags); - driver->driver.bus = NULL; /* hook up the driver */ udc->driver = driver; udc->gadget.speed = driver->max_speed; diff --git a/drivers/usb/gadget/udc/fsl_udc_core.c b/drivers/usb/gadget/udc/fsl_udc_core.c index 50435e804118..a67873a074b7 100644 --- a/drivers/usb/gadget/udc/fsl_udc_core.c +++ b/drivers/usb/gadget/udc/fsl_udc_core.c @@ -1943,7 +1943,6 @@ static int fsl_udc_start(struct usb_gadget *g, /* lock is needed but whether should use this lock or another */ spin_lock_irqsave(&udc_controller->lock, flags); - driver->driver.bus = NULL; /* hook up the driver */ udc_controller->driver = driver; spin_unlock_irqrestore(&udc_controller->lock, flags); diff --git a/drivers/usb/gadget/udc/fusb300_udc.c b/drivers/usb/gadget/udc/fusb300_udc.c index 9af8b415f303..5954800d652c 100644 --- a/drivers/usb/gadget/udc/fusb300_udc.c +++ b/drivers/usb/gadget/udc/fusb300_udc.c @@ -1311,7 +1311,6 @@ static int fusb300_udc_start(struct usb_gadget *g, struct fusb300 *fusb300 = to_fusb300(g); /* hook up the driver */ - driver->driver.bus = NULL; fusb300->driver = driver; return 0; diff --git a/drivers/usb/gadget/udc/goku_udc.c b/drivers/usb/gadget/udc/goku_udc.c index bdc56b24b5c9..5ffb3d5c635b 100644 --- a/drivers/usb/gadget/udc/goku_udc.c +++ b/drivers/usb/gadget/udc/goku_udc.c @@ -1375,7 +1375,6 @@ static int goku_udc_start(struct usb_gadget *g, struct goku_udc *dev = to_goku_udc(g); /* hook up the driver */ - driver->driver.bus = NULL; dev->driver = driver; /* diff --git a/drivers/usb/gadget/udc/gr_udc.c b/drivers/usb/gadget/udc/gr_udc.c index 22096f8505de..85cdc0af3bf9 100644 --- a/drivers/usb/gadget/udc/gr_udc.c +++ b/drivers/usb/gadget/udc/gr_udc.c @@ -1906,7 +1906,6 @@ static int gr_udc_start(struct usb_gadget *gadget, spin_lock(&dev->lock); /* Hook up the driver */ - 
driver->driver.bus = NULL; dev->driver = driver; /* Get ready for host detection */ diff --git a/drivers/usb/gadget/udc/m66592-udc.c b/drivers/usb/gadget/udc/m66592-udc.c index c7e421b449f3..06e21cee431b 100644 --- a/drivers/usb/gadget/udc/m66592-udc.c +++ b/drivers/usb/gadget/udc/m66592-udc.c @@ -1454,7 +1454,6 @@ static int m66592_udc_start(struct usb_gadget *g, struct m66592 *m66592 = to_m66592(g); /* hook up the driver */ - driver->driver.bus = NULL; m66592->driver = driver; m66592_bset(m66592, M66592_VBSE | M66592_URST, M66592_INTENB0); diff --git a/drivers/usb/gadget/udc/max3420_udc.c b/drivers/usb/gadget/udc/max3420_udc.c index 3074da00c3df..ddf0ed3eb4f2 100644 --- a/drivers/usb/gadget/udc/max3420_udc.c +++ b/drivers/usb/gadget/udc/max3420_udc.c @@ -1108,7 +1108,6 @@ static int max3420_udc_start(struct usb_gadget *gadget, spin_lock_irqsave(&udc->lock, flags); /* hook up the driver */ - driver->driver.bus = NULL; udc->driver = driver; udc->gadget.speed = USB_SPEED_FULL; diff --git a/drivers/usb/gadget/udc/mv_u3d_core.c b/drivers/usb/gadget/udc/mv_u3d_core.c index 598654a3cb41..411b6179782c 100644 --- a/drivers/usb/gadget/udc/mv_u3d_core.c +++ b/drivers/usb/gadget/udc/mv_u3d_core.c @@ -1243,7 +1243,6 @@ static int mv_u3d_start(struct usb_gadget *g, } /* hook up the driver ... */ - driver->driver.bus = NULL; u3d->driver = driver; u3d->ep0_dir = USB_DIR_OUT; diff --git a/drivers/usb/gadget/udc/mv_udc_core.c b/drivers/usb/gadget/udc/mv_udc_core.c index fdb17d86cd65..b397f3a848cf 100644 --- a/drivers/usb/gadget/udc/mv_udc_core.c +++ b/drivers/usb/gadget/udc/mv_udc_core.c @@ -1359,7 +1359,6 @@ static int mv_udc_start(struct usb_gadget *gadget, spin_lock_irqsave(&udc->lock, flags); /* hook up the driver ... 
*/ - driver->driver.bus = NULL; udc->driver = driver; udc->usb_state = USB_STATE_ATTACHED; diff --git a/drivers/usb/gadget/udc/net2272.c b/drivers/usb/gadget/udc/net2272.c index 84605a4d0715..538c1b9a2883 100644 --- a/drivers/usb/gadget/udc/net2272.c +++ b/drivers/usb/gadget/udc/net2272.c @@ -1451,7 +1451,6 @@ static int net2272_start(struct usb_gadget *_gadget, dev->ep[i].irqs = 0; /* hook up the driver ... */ dev->softconnect = 1; - driver->driver.bus = NULL; dev->driver = driver; /* ... then enable host detection and ep0; and we're ready diff --git a/drivers/usb/gadget/udc/net2280.c b/drivers/usb/gadget/udc/net2280.c index d6a68631354a..1b929c519cd7 100644 --- a/drivers/usb/gadget/udc/net2280.c +++ b/drivers/usb/gadget/udc/net2280.c @@ -2423,7 +2423,6 @@ static int net2280_start(struct usb_gadget *_gadget, dev->ep[i].irqs = 0; /* hook up the driver ... */ - driver->driver.bus = NULL; dev->driver = driver; retval = device_create_file(&dev->pdev->dev, &dev_attr_function); diff --git a/drivers/usb/gadget/udc/omap_udc.c b/drivers/usb/gadget/udc/omap_udc.c index bea346e362b2..f660ebfa1379 100644 --- a/drivers/usb/gadget/udc/omap_udc.c +++ b/drivers/usb/gadget/udc/omap_udc.c @@ -2066,7 +2066,6 @@ static int omap_udc_start(struct usb_gadget *g, udc->softconnect = 1; /* hook up the driver */ - driver->driver.bus = NULL; udc->driver = driver; spin_unlock_irqrestore(&udc->lock, flags); diff --git a/drivers/usb/gadget/udc/pch_udc.c b/drivers/usb/gadget/udc/pch_udc.c index 9bb7a9d7a2fb..4f8617210d85 100644 --- a/drivers/usb/gadget/udc/pch_udc.c +++ b/drivers/usb/gadget/udc/pch_udc.c @@ -2908,7 +2908,6 @@ static int pch_udc_start(struct usb_gadget *g, { struct pch_udc_dev *dev = to_pch_udc(g); - driver->driver.bus = NULL; dev->driver = driver; /* get ready for ep0 traffic */ diff --git a/drivers/usb/gadget/udc/snps_udc_core.c b/drivers/usb/gadget/udc/snps_udc_core.c index 52ea4dcf6a92..2fc5d4d277bc 100644 --- a/drivers/usb/gadget/udc/snps_udc_core.c +++ 
b/drivers/usb/gadget/udc/snps_udc_core.c @@ -1933,7 +1933,6 @@ static int amd5536_udc_start(struct usb_gadget *g, struct udc *dev = to_amd5536_udc(g); u32 tmp; - driver->driver.bus = NULL; dev->driver = driver; /* Some gadget drivers use both ep0 directions. diff --git a/drivers/usb/typec/altmodes/displayport.c b/drivers/usb/typec/altmodes/displayport.c index 9a6860285fbe..50b24096eb7f 100644 --- a/drivers/usb/typec/altmodes/displayport.c +++ b/drivers/usb/typec/altmodes/displayport.c @@ -535,10 +535,10 @@ int dp_altmode_probe(struct typec_altmode *alt) /* FIXME: Port can only be DFP_U. */ /* Make sure we have compatiple pin configurations */ - if (!(DP_CAP_DFP_D_PIN_ASSIGN(port->vdo) & - DP_CAP_UFP_D_PIN_ASSIGN(alt->vdo)) && - !(DP_CAP_UFP_D_PIN_ASSIGN(port->vdo) & - DP_CAP_DFP_D_PIN_ASSIGN(alt->vdo))) + if (!(DP_CAP_PIN_ASSIGN_DFP_D(port->vdo) & + DP_CAP_PIN_ASSIGN_UFP_D(alt->vdo)) && + !(DP_CAP_PIN_ASSIGN_UFP_D(port->vdo) & + DP_CAP_PIN_ASSIGN_DFP_D(alt->vdo))) return -ENODEV; ret = sysfs_create_group(&alt->dev.kobj, &dp_altmode_group); diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index 1292241d581a..1cf8947c6d66 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -1269,6 +1269,9 @@ err_unregister: con->port = NULL; } + kfree(ucsi->connector); + ucsi->connector = NULL; + err_reset: memset(&ucsi->cap, 0, sizeof(ucsi->cap)); ucsi_reset_ppm(ucsi); @@ -1300,7 +1303,8 @@ static void ucsi_resume_work(struct work_struct *work) int ucsi_resume(struct ucsi *ucsi) { - queue_work(system_long_wq, &ucsi->resume_work); + if (ucsi->connector) + queue_work(system_long_wq, &ucsi->resume_work); return 0; } EXPORT_SYMBOL_GPL(ucsi_resume); @@ -1420,6 +1424,9 @@ void ucsi_unregister(struct ucsi *ucsi) /* Disable notifications */ ucsi->ops->async_write(ucsi, UCSI_CONTROL, &cmd, sizeof(cmd)); + if (!ucsi->connector) + return; + for (i = 0; i < ucsi->cap.num_connectors; i++) { cancel_work_sync(&ucsi->connector[i].work); 
ucsi_unregister_partner(&ucsi->connector[i]); diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index f9c0044c6442..44b29289aa19 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -849,7 +849,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) ret = ifcvf_init_hw(vf, pdev); if (ret) { IFCVF_ERR(pdev, "Failed to init IFCVF hw\n"); - return ret; + goto err; } for (i = 0; i < vf->nr_vring; i++) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 9af19b0cf3b7..4c538b30fd76 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1511,6 +1511,9 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) nvq = &n->vqs[index]; mutex_lock(&vq->mutex); + if (fd == -1) + vhost_clear_msg(&n->dev); + /* Verify that ring has been setup correctly. */ if (!vhost_vq_access_ok(vq)) { r = -EFAULT; diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index dca6346d75b3..d5ecb8876fc9 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -80,7 +80,7 @@ struct vhost_scsi_cmd { struct scatterlist *tvc_prot_sgl; struct page **tvc_upages; /* Pointer to response header iovec */ - struct iovec tvc_resp_iov; + struct iovec *tvc_resp_iov; /* Pointer to vhost_scsi for our device */ struct vhost_scsi *tvc_vhost; /* Pointer to vhost_virtqueue for the cmd */ @@ -563,7 +563,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) memcpy(v_rsp.sense, cmd->tvc_sense_buf, se_cmd->scsi_sense_length); - iov_iter_init(&iov_iter, ITER_DEST, &cmd->tvc_resp_iov, + iov_iter_init(&iov_iter, ITER_DEST, cmd->tvc_resp_iov, cmd->tvc_in_iovs, sizeof(v_rsp)); ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter); if (likely(ret == sizeof(v_rsp))) { @@ -594,6 +594,7 @@ vhost_scsi_get_cmd(struct vhost_virtqueue *vq, struct vhost_scsi_tpg *tpg, struct vhost_scsi_cmd *cmd; struct vhost_scsi_nexus *tv_nexus; struct scatterlist *sg, *prot_sg; + struct iovec 
*tvc_resp_iov; struct page **pages; int tag; @@ -613,6 +614,7 @@ vhost_scsi_get_cmd(struct vhost_virtqueue *vq, struct vhost_scsi_tpg *tpg, sg = cmd->tvc_sgl; prot_sg = cmd->tvc_prot_sgl; pages = cmd->tvc_upages; + tvc_resp_iov = cmd->tvc_resp_iov; memset(cmd, 0, sizeof(*cmd)); cmd->tvc_sgl = sg; cmd->tvc_prot_sgl = prot_sg; @@ -625,6 +627,7 @@ vhost_scsi_get_cmd(struct vhost_virtqueue *vq, struct vhost_scsi_tpg *tpg, cmd->tvc_data_direction = data_direction; cmd->tvc_nexus = tv_nexus; cmd->inflight = vhost_scsi_get_inflight(vq); + cmd->tvc_resp_iov = tvc_resp_iov; memcpy(cmd->tvc_cdb, cdb, VHOST_SCSI_MAX_CDB_SIZE); @@ -935,7 +938,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) struct iov_iter in_iter, prot_iter, data_iter; u64 tag; u32 exp_data_len, data_direction; - int ret, prot_bytes, c = 0; + int ret, prot_bytes, i, c = 0; u16 lun; u8 task_attr; bool t10_pi = vhost_has_feature(vq, VIRTIO_SCSI_F_T10_PI); @@ -1092,7 +1095,8 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) } cmd->tvc_vhost = vs; cmd->tvc_vq = vq; - cmd->tvc_resp_iov = vq->iov[vc.out]; + for (i = 0; i < vc.in ; i++) + cmd->tvc_resp_iov[i] = vq->iov[vc.out + i]; cmd->tvc_in_iovs = vc.in; pr_debug("vhost_scsi got command opcode: %#02x, lun: %d\n", @@ -1461,6 +1465,7 @@ static void vhost_scsi_destroy_vq_cmds(struct vhost_virtqueue *vq) kfree(tv_cmd->tvc_sgl); kfree(tv_cmd->tvc_prot_sgl); kfree(tv_cmd->tvc_upages); + kfree(tv_cmd->tvc_resp_iov); } sbitmap_free(&svq->scsi_tags); @@ -1508,6 +1513,14 @@ static int vhost_scsi_setup_vq_cmds(struct vhost_virtqueue *vq, int max_cmds) goto out; } + tv_cmd->tvc_resp_iov = kcalloc(UIO_MAXIOV, + sizeof(struct iovec), + GFP_KERNEL); + if (!tv_cmd->tvc_resp_iov) { + pr_err("Unable to allocate tv_cmd->tvc_resp_iov\n"); + goto out; + } + tv_cmd->tvc_prot_sgl = kcalloc(VHOST_SCSI_PREALLOC_PROT_SGLS, sizeof(struct scatterlist), GFP_KERNEL); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 
cbe72bfd2f1f..43c9770b86e5 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -661,7 +661,7 @@ void vhost_dev_stop(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_dev_stop); -static void vhost_clear_msg(struct vhost_dev *dev) +void vhost_clear_msg(struct vhost_dev *dev) { struct vhost_msg_node *node, *n; @@ -679,6 +679,7 @@ static void vhost_clear_msg(struct vhost_dev *dev) spin_unlock(&dev->iotlb_lock); } +EXPORT_SYMBOL_GPL(vhost_clear_msg); void vhost_dev_cleanup(struct vhost_dev *dev) { diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index d9109107af08..790b296271f1 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -181,6 +181,7 @@ long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp); long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); bool vhost_vq_access_ok(struct vhost_virtqueue *vq); bool vhost_log_access_ok(struct vhost_dev *); +void vhost_clear_msg(struct vhost_dev *dev); int vhost_get_vq_desc(struct vhost_virtqueue *, struct iovec iov[], unsigned int iov_count, diff --git a/drivers/video/fbdev/atmel_lcdfb.c b/drivers/video/fbdev/atmel_lcdfb.c index 1fc8de4ecbeb..8187a7c4f910 100644 --- a/drivers/video/fbdev/atmel_lcdfb.c +++ b/drivers/video/fbdev/atmel_lcdfb.c @@ -49,7 +49,6 @@ struct atmel_lcdfb_info { struct clk *lcdc_clk; struct backlight_device *backlight; - u8 bl_power; u8 saved_lcdcon; u32 pseudo_palette[16]; @@ -109,22 +108,7 @@ static u32 contrast_ctr = ATMEL_LCDC_PS_DIV8 static int atmel_bl_update_status(struct backlight_device *bl) { struct atmel_lcdfb_info *sinfo = bl_get_data(bl); - int power = sinfo->bl_power; - int brightness = bl->props.brightness; - - /* REVISIT there may be a meaningful difference between - * fb_blank and power ... there seem to be some cases - * this doesn't handle correctly. 
- */ - if (bl->props.fb_blank != sinfo->bl_power) - power = bl->props.fb_blank; - else if (bl->props.power != sinfo->bl_power) - power = bl->props.power; - - if (brightness < 0 && power == FB_BLANK_UNBLANK) - brightness = lcdc_readl(sinfo, ATMEL_LCDC_CONTRAST_VAL); - else if (power != FB_BLANK_UNBLANK) - brightness = 0; + int brightness = backlight_get_brightness(bl); lcdc_writel(sinfo, ATMEL_LCDC_CONTRAST_VAL, brightness); if (contrast_ctr & ATMEL_LCDC_POL_POSITIVE) @@ -133,8 +117,6 @@ static int atmel_bl_update_status(struct backlight_device *bl) else lcdc_writel(sinfo, ATMEL_LCDC_CONTRAST_CTR, contrast_ctr); - bl->props.fb_blank = bl->props.power = sinfo->bl_power = power; - return 0; } @@ -155,8 +137,6 @@ static void init_backlight(struct atmel_lcdfb_info *sinfo) struct backlight_properties props; struct backlight_device *bl; - sinfo->bl_power = FB_BLANK_UNBLANK; - if (sinfo->backlight) return; diff --git a/drivers/video/fbdev/aty/aty128fb.c b/drivers/video/fbdev/aty/aty128fb.c index dd31b9d7d337..36a9ac05a340 100644 --- a/drivers/video/fbdev/aty/aty128fb.c +++ b/drivers/video/fbdev/aty/aty128fb.c @@ -1766,12 +1766,10 @@ static int aty128_bl_update_status(struct backlight_device *bd) unsigned int reg = aty_ld_le32(LVDS_GEN_CNTL); int level; - if (bd->props.power != FB_BLANK_UNBLANK || - bd->props.fb_blank != FB_BLANK_UNBLANK || - !par->lcd_on) + if (!par->lcd_on) level = 0; else - level = bd->props.brightness; + level = backlight_get_brightness(bd); reg |= LVDS_BL_MOD_EN | LVDS_BLON; if (level > 0) { diff --git a/drivers/video/fbdev/aty/atyfb_base.c b/drivers/video/fbdev/aty/atyfb_base.c index d59215a4992e..b02e4e645035 100644 --- a/drivers/video/fbdev/aty/atyfb_base.c +++ b/drivers/video/fbdev/aty/atyfb_base.c @@ -2219,13 +2219,7 @@ static int aty_bl_update_status(struct backlight_device *bd) { struct atyfb_par *par = bl_get_data(bd); unsigned int reg = aty_ld_lcd(LCD_MISC_CNTL, par); - int level; - - if (bd->props.power != FB_BLANK_UNBLANK || - 
bd->props.fb_blank != FB_BLANK_UNBLANK) - level = 0; - else - level = bd->props.brightness; + int level = backlight_get_brightness(bd); reg |= (BLMOD_EN | BIASMOD_EN); if (level > 0) { diff --git a/drivers/video/fbdev/aty/radeon_backlight.c b/drivers/video/fbdev/aty/radeon_backlight.c index d2c1263ad260..427adc838f77 100644 --- a/drivers/video/fbdev/aty/radeon_backlight.c +++ b/drivers/video/fbdev/aty/radeon_backlight.c @@ -57,11 +57,7 @@ static int radeon_bl_update_status(struct backlight_device *bd) * backlight. This provides some greater power saving and the display * is useless without backlight anyway. */ - if (bd->props.power != FB_BLANK_UNBLANK || - bd->props.fb_blank != FB_BLANK_UNBLANK) - level = 0; - else - level = bd->props.brightness; + level = backlight_get_brightness(bd); del_timer_sync(&rinfo->lvds_timer); radeon_engine_idle(); diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index c730253ab85c..583cbcf09446 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -313,7 +313,7 @@ void fb_deferred_io_open(struct fb_info *info, } EXPORT_SYMBOL_GPL(fb_deferred_io_open); -void fb_deferred_io_cleanup(struct fb_info *info) +void fb_deferred_io_release(struct fb_info *info) { struct fb_deferred_io *fbdefio = info->fbdefio; struct page *page; @@ -327,6 +327,14 @@ void fb_deferred_io_cleanup(struct fb_info *info) page = fb_deferred_io_page(info, i); page->mapping = NULL; } +} +EXPORT_SYMBOL_GPL(fb_deferred_io_release); + +void fb_deferred_io_cleanup(struct fb_info *info) +{ + struct fb_deferred_io *fbdefio = info->fbdefio; + + fb_deferred_io_release(info); kvfree(info->pagerefs); mutex_destroy(&fbdefio->lock); diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 14a7d404062c..1b14c21af2b7 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2495,9 +2495,12 @@ static int fbcon_set_font(struct vc_data *vc, struct 
console_font *font, h > FBCON_SWAP(info->var.rotate, info->var.yres, info->var.xres)) return -EINVAL; + if (font->width > 32 || font->height > 32) + return -EINVAL; + /* Make sure drawing engine can handle the font */ - if (!(info->pixmap.blit_x & (1 << (font->width - 1))) || - !(info->pixmap.blit_y & (1 << (font->height - 1)))) + if (!(info->pixmap.blit_x & BIT(font->width - 1)) || + !(info->pixmap.blit_y & BIT(font->height - 1))) return -EINVAL; /* Make sure driver can handle the font length */ diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 3a6c8458eb8d..ab3545a00abc 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1454,6 +1454,10 @@ __releases(&info->lock) struct fb_info * const info = file->private_data; lock_fb_info(info); +#if IS_ENABLED(CONFIG_FB_DEFERRED_IO) + if (info->fbdefio) + fb_deferred_io_release(info); +#endif if (info->fbops->fb_release) info->fbops->fb_release(info,1); module_put(info->fbops->owner); diff --git a/drivers/video/fbdev/core/fbmon.c b/drivers/video/fbdev/core/fbmon.c index b0e690f41025..79e5bfbdd34c 100644 --- a/drivers/video/fbdev/core/fbmon.c +++ b/drivers/video/fbdev/core/fbmon.c @@ -1050,7 +1050,7 @@ static u32 fb_get_vblank(u32 hfreq) } /** - * fb_get_hblank_by_freq - get horizontal blank time given hfreq + * fb_get_hblank_by_hfreq - get horizontal blank time given hfreq * @hfreq: horizontal freq * @xres: horizontal resolution in pixels * diff --git a/drivers/video/fbdev/mx3fb.c b/drivers/video/fbdev/mx3fb.c index b945b68984b9..76771e126d0a 100644 --- a/drivers/video/fbdev/mx3fb.c +++ b/drivers/video/fbdev/mx3fb.c @@ -283,12 +283,7 @@ static int mx3fb_bl_get_brightness(struct backlight_device *bl) static int mx3fb_bl_update_status(struct backlight_device *bl) { struct mx3fb_data *fbd = bl_get_data(bl); - int brightness = bl->props.brightness; - - if (bl->props.power != FB_BLANK_UNBLANK) - brightness = 0; - if (bl->props.fb_blank != FB_BLANK_UNBLANK) - 
brightness = 0; + int brightness = backlight_get_brightness(bl); fbd->backlight_level = (fbd->backlight_level & ~0xFF) | brightness; diff --git a/drivers/video/fbdev/nvidia/nv_backlight.c b/drivers/video/fbdev/nvidia/nv_backlight.c index 2ce53529f636..503a7a683855 100644 --- a/drivers/video/fbdev/nvidia/nv_backlight.c +++ b/drivers/video/fbdev/nvidia/nv_backlight.c @@ -49,17 +49,11 @@ static int nvidia_bl_update_status(struct backlight_device *bd) { struct nvidia_par *par = bl_get_data(bd); u32 tmp_pcrt, tmp_pmc, fpcontrol; - int level; + int level = backlight_get_brightness(bd); if (!par->FlatPanel) return 0; - if (bd->props.power != FB_BLANK_UNBLANK || - bd->props.fb_blank != FB_BLANK_UNBLANK) - level = 0; - else - level = bd->props.brightness; - tmp_pmc = NV_RD32(par->PMC, 0x10F0) & 0x0000FFFF; tmp_pcrt = NV_RD32(par->PCRTC0, 0x081C) & 0xFFFFFFFC; fpcontrol = NV_RD32(par->PRAMDAC, 0x0848) & 0xCFFFFFCC; diff --git a/drivers/video/fbdev/nvidia/nvidia.c b/drivers/video/fbdev/nvidia/nvidia.c index 1960916098d4..e60a276b4855 100644 --- a/drivers/video/fbdev/nvidia/nvidia.c +++ b/drivers/video/fbdev/nvidia/nvidia.c @@ -1197,17 +1197,17 @@ static int nvidia_set_fbinfo(struct fb_info *info) return nvidiafb_check_var(&info->var, info); } -static u32 nvidia_get_chipset(struct fb_info *info) +static u32 nvidia_get_chipset(struct pci_dev *pci_dev, + volatile u32 __iomem *REGS) { - struct nvidia_par *par = info->par; - u32 id = (par->pci_dev->vendor << 16) | par->pci_dev->device; + u32 id = (pci_dev->vendor << 16) | pci_dev->device; printk(KERN_INFO PFX "Device ID: %x \n", id); if ((id & 0xfff0) == 0x00f0 || (id & 0xfff0) == 0x02e0) { /* pci-e */ - id = NV_RD32(par->REGS, 0x1800); + id = NV_RD32(REGS, 0x1800); if ((id & 0x0000ffff) == 0x000010DE) id = 0x10DE0000 | (id >> 16); @@ -1220,12 +1220,11 @@ static u32 nvidia_get_chipset(struct fb_info *info) return id; } -static u32 nvidia_get_arch(struct fb_info *info) +static u32 nvidia_get_arch(u32 Chipset) { - struct nvidia_par 
*par = info->par; u32 arch = 0; - switch (par->Chipset & 0x0ff0) { + switch (Chipset & 0x0ff0) { case 0x0100: /* GeForce 256 */ case 0x0110: /* GeForce2 MX */ case 0x0150: /* GeForce2 */ @@ -1278,16 +1277,44 @@ static int nvidiafb_probe(struct pci_dev *pd, const struct pci_device_id *ent) struct fb_info *info; unsigned short cmd; int ret; + volatile u32 __iomem *REGS; + int Chipset; + u32 Architecture; NVTRACE_ENTER(); assert(pd != NULL); + if (pci_enable_device(pd)) { + printk(KERN_ERR PFX "cannot enable PCI device\n"); + return -ENODEV; + } + + /* enable IO and mem if not already done */ + pci_read_config_word(pd, PCI_COMMAND, &cmd); + cmd |= (PCI_COMMAND_IO | PCI_COMMAND_MEMORY); + pci_write_config_word(pd, PCI_COMMAND, cmd); + + nvidiafb_fix.mmio_start = pci_resource_start(pd, 0); + nvidiafb_fix.mmio_len = pci_resource_len(pd, 0); + + REGS = ioremap(nvidiafb_fix.mmio_start, nvidiafb_fix.mmio_len); + if (!REGS) { + printk(KERN_ERR PFX "cannot ioremap MMIO base\n"); + return -ENODEV; + } + + Chipset = nvidia_get_chipset(pd, REGS); + Architecture = nvidia_get_arch(Chipset); + if (Architecture == 0) { + printk(KERN_ERR PFX "unknown NV_ARCH\n"); + goto err_out; + } + ret = aperture_remove_conflicting_pci_devices(pd, "nvidiafb"); if (ret) - return ret; + goto err_out; info = framebuffer_alloc(sizeof(struct nvidia_par), &pd->dev); - if (!info) goto err_out; @@ -1298,11 +1325,6 @@ static int nvidiafb_probe(struct pci_dev *pd, const struct pci_device_id *ent) if (info->pixmap.addr == NULL) goto err_out_kfree; - if (pci_enable_device(pd)) { - printk(KERN_ERR PFX "cannot enable PCI device\n"); - goto err_out_enable; - } - if (pci_request_regions(pd, "nvidiafb")) { printk(KERN_ERR PFX "cannot request PCI regions\n"); goto err_out_enable; @@ -1318,34 +1340,17 @@ static int nvidiafb_probe(struct pci_dev *pd, const struct pci_device_id *ent) par->paneltweak = paneltweak; par->reverse_i2c = reverse_i2c; - /* enable IO and mem if not already done */ - pci_read_config_word(pd, 
PCI_COMMAND, &cmd); - cmd |= (PCI_COMMAND_IO | PCI_COMMAND_MEMORY); - pci_write_config_word(pd, PCI_COMMAND, cmd); - - nvidiafb_fix.mmio_start = pci_resource_start(pd, 0); nvidiafb_fix.smem_start = pci_resource_start(pd, 1); - nvidiafb_fix.mmio_len = pci_resource_len(pd, 0); - - par->REGS = ioremap(nvidiafb_fix.mmio_start, nvidiafb_fix.mmio_len); - if (!par->REGS) { - printk(KERN_ERR PFX "cannot ioremap MMIO base\n"); - goto err_out_free_base0; - } + par->REGS = REGS; - par->Chipset = nvidia_get_chipset(info); - par->Architecture = nvidia_get_arch(info); - - if (par->Architecture == 0) { - printk(KERN_ERR PFX "unknown NV_ARCH\n"); - goto err_out_arch; - } + par->Chipset = Chipset; + par->Architecture = Architecture; sprintf(nvidiafb_fix.id, "NV%x", (pd->device & 0x0ff0) >> 4); if (NVCommonSetup(info)) - goto err_out_arch; + goto err_out_free_base0; par->FbAddress = nvidiafb_fix.smem_start; par->FbMapSize = par->RamAmountKBytes * 1024; @@ -1401,7 +1406,6 @@ static int nvidiafb_probe(struct pci_dev *pd, const struct pci_device_id *ent) goto err_out_iounmap_fb; } - printk(KERN_INFO PFX "PCI nVidia %s framebuffer (%dMB @ 0x%lX)\n", info->fix.id, @@ -1415,15 +1419,14 @@ err_out_iounmap_fb: err_out_free_base1: fb_destroy_modedb(info->monspecs.modedb); nvidia_delete_i2c_busses(par); -err_out_arch: - iounmap(par->REGS); - err_out_free_base0: +err_out_free_base0: pci_release_regions(pd); err_out_enable: kfree(info->pixmap.addr); err_out_kfree: framebuffer_release(info); err_out: + iounmap(REGS); return -ENODEV; } diff --git a/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c b/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c index 4fc4b26a8d30..ba94a0a7bd4f 100644 --- a/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c +++ b/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c @@ -331,13 +331,7 @@ static int dsicm_bl_update_status(struct backlight_device *dev) struct panel_drv_data *ddata = dev_get_drvdata(&dev->dev); struct omap_dss_device *in = 
ddata->in; int r; - int level; - - if (dev->props.fb_blank == FB_BLANK_UNBLANK && - dev->props.power == FB_BLANK_UNBLANK) - level = dev->props.brightness; - else - level = 0; + int level = backlight_get_brightness(dev); dev_dbg(&ddata->pdev->dev, "update brightness to %d\n", level); diff --git a/drivers/video/fbdev/omap2/omapfb/dss/display-sysfs.c b/drivers/video/fbdev/omap2/omapfb/dss/display-sysfs.c index bc5a44c2a144..ae937854403b 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/display-sysfs.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/display-sysfs.c @@ -10,6 +10,7 @@ #define DSS_SUBSYS_NAME "DISPLAY" #include <linux/kernel.h> +#include <linux/kstrtox.h> #include <linux/module.h> #include <linux/platform_device.h> #include <linux/sysfs.h> @@ -36,7 +37,7 @@ static ssize_t display_enabled_store(struct omap_dss_device *dssdev, int r; bool enable; - r = strtobool(buf, &enable); + r = kstrtobool(buf, &enable); if (r) return r; @@ -73,7 +74,7 @@ static ssize_t display_tear_store(struct omap_dss_device *dssdev, if (!dssdev->driver->enable_te || !dssdev->driver->get_te) return -ENOENT; - r = strtobool(buf, &te); + r = kstrtobool(buf, &te); if (r) return r; @@ -183,7 +184,7 @@ static ssize_t display_mirror_store(struct omap_dss_device *dssdev, if (!dssdev->driver->set_mirror || !dssdev->driver->get_mirror) return -ENOENT; - r = strtobool(buf, &mirror); + r = kstrtobool(buf, &mirror); if (r) return r; diff --git a/drivers/video/fbdev/omap2/omapfb/dss/manager-sysfs.c b/drivers/video/fbdev/omap2/omapfb/dss/manager-sysfs.c index ba21c4a2633d..1b644be5fe2e 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/manager-sysfs.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/manager-sysfs.c @@ -10,6 +10,7 @@ #define DSS_SUBSYS_NAME "MANAGER" #include <linux/kernel.h> +#include <linux/kstrtox.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/platform_device.h> @@ -246,7 +247,7 @@ static ssize_t manager_trans_key_enabled_store(struct omap_overlay_manager *mgr, bool 
enable; int r; - r = strtobool(buf, &enable); + r = kstrtobool(buf, &enable); if (r) return r; @@ -290,7 +291,7 @@ static ssize_t manager_alpha_blending_enabled_store( if(!dss_has_feature(FEAT_ALPHA_FIXED_ZORDER)) return -ENODEV; - r = strtobool(buf, &enable); + r = kstrtobool(buf, &enable); if (r) return r; @@ -329,7 +330,7 @@ static ssize_t manager_cpr_enable_store(struct omap_overlay_manager *mgr, if (!dss_has_feature(FEAT_CPR)) return -ENODEV; - r = strtobool(buf, &enable); + r = kstrtobool(buf, &enable); if (r) return r; diff --git a/drivers/video/fbdev/omap2/omapfb/dss/overlay-sysfs.c b/drivers/video/fbdev/omap2/omapfb/dss/overlay-sysfs.c index 601c0beb6de9..1da4fb1c77b4 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/overlay-sysfs.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/overlay-sysfs.c @@ -13,6 +13,7 @@ #include <linux/err.h> #include <linux/sysfs.h> #include <linux/kobject.h> +#include <linux/kstrtox.h> #include <linux/platform_device.h> #include <video/omapfb_dss.h> @@ -210,7 +211,7 @@ static ssize_t overlay_enabled_store(struct omap_overlay *ovl, const char *buf, int r; bool enable; - r = strtobool(buf, &enable); + r = kstrtobool(buf, &enable); if (r) return r; diff --git a/drivers/video/fbdev/omap2/omapfb/omapfb-sysfs.c b/drivers/video/fbdev/omap2/omapfb/omapfb-sysfs.c index 06dc41aa0354..831b2c2fbdf9 100644 --- a/drivers/video/fbdev/omap2/omapfb/omapfb-sysfs.c +++ b/drivers/video/fbdev/omap2/omapfb/omapfb-sysfs.c @@ -15,6 +15,7 @@ #include <linux/uaccess.h> #include <linux/platform_device.h> #include <linux/kernel.h> +#include <linux/kstrtox.h> #include <linux/mm.h> #include <linux/omapfb.h> @@ -96,7 +97,7 @@ static ssize_t store_mirror(struct device *dev, int r; struct fb_var_screeninfo new_var; - r = strtobool(buf, &mirror); + r = kstrtobool(buf, &mirror); if (r) return r; diff --git a/drivers/video/fbdev/riva/fbdev.c b/drivers/video/fbdev/riva/fbdev.c index 644278146d3b..41edc6e79460 100644 --- a/drivers/video/fbdev/riva/fbdev.c +++ 
b/drivers/video/fbdev/riva/fbdev.c @@ -293,13 +293,7 @@ static int riva_bl_update_status(struct backlight_device *bd) { struct riva_par *par = bl_get_data(bd); U032 tmp_pcrt, tmp_pmc; - int level; - - if (bd->props.power != FB_BLANK_UNBLANK || - bd->props.fb_blank != FB_BLANK_UNBLANK) - level = 0; - else - level = bd->props.brightness; + int level = backlight_get_brightness(bd); tmp_pmc = NV_RD32(par->riva.PMC, 0x10F0) & 0x0000FFFF; tmp_pcrt = NV_RD32(par->riva.PCRTC0, 0x081C) & 0xFFFFFFFC; diff --git a/drivers/watchdog/diag288_wdt.c b/drivers/watchdog/diag288_wdt.c index 4cb10877017c..6ca5d9515d85 100644 --- a/drivers/watchdog/diag288_wdt.c +++ b/drivers/watchdog/diag288_wdt.c @@ -86,7 +86,7 @@ static int __diag288(unsigned int func, unsigned int timeout, "1:\n" EX_TABLE(0b, 1b) : "+d" (err) : "d"(__func), "d"(__timeout), - "d"(__action), "d"(__len) : "1", "cc"); + "d"(__action), "d"(__len) : "1", "cc", "memory"); return err; } @@ -268,12 +268,21 @@ static int __init diag288_init(void) char ebc_begin[] = { 194, 197, 199, 201, 213 }; + char *ebc_cmd; watchdog_set_nowayout(&wdt_dev, nowayout_info); if (MACHINE_IS_VM) { - if (__diag288_vm(WDT_FUNC_INIT, 15, - ebc_begin, sizeof(ebc_begin)) != 0) { + ebc_cmd = kmalloc(sizeof(ebc_begin), GFP_KERNEL); + if (!ebc_cmd) { + pr_err("The watchdog cannot be initialized\n"); + return -ENOMEM; + } + memcpy(ebc_cmd, ebc_begin, sizeof(ebc_begin)); + ret = __diag288_vm(WDT_FUNC_INIT, 15, + ebc_cmd, sizeof(ebc_begin)); + kfree(ebc_cmd); + if (ret != 0) { pr_err("The watchdog cannot be initialized\n"); return -EINVAL; } diff --git a/fs/9p/acl.c b/fs/9p/acl.c index c397c51f80d9..eed551d8555f 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -139,7 +139,7 @@ struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu } -struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *v9fs_iop_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { struct v9fs_session_info *v9ses; @@ 
-151,7 +151,7 @@ struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, return v9fs_get_cached_acl(d_inode(dentry), type); } -int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int v9fs_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int retval; @@ -195,7 +195,7 @@ int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, goto err_out; } - if (!inode_owner_or_capable(&init_user_ns, inode)) { + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) { retval = -EPERM; goto err_out; } @@ -206,7 +206,7 @@ int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr iattr = {}; struct posix_acl *acl_mode = acl; - retval = posix_acl_update_mode(&init_user_ns, inode, + retval = posix_acl_update_mode(&nop_mnt_idmap, inode, &iattr.ia_mode, &acl_mode); if (retval) @@ -225,7 +225,7 @@ int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, * FIXME should we update ctime ? * What is the following setxattr update the mode ? 
*/ - v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr); + v9fs_vfs_setattr_dotl(&nop_mnt_idmap, dentry, &iattr); } break; case ACL_TYPE_DEFAULT: diff --git a/fs/9p/acl.h b/fs/9p/acl.h index 4c60a2bce5de..333cfcc281da 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -10,9 +10,9 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid); struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu); -struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *v9fs_iop_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type); -int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int v9fs_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid); int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid, diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 6acabc2e7dc9..f3f74d197b5d 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -151,7 +151,7 @@ extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); -extern int v9fs_vfs_rename(struct user_namespace *mnt_userns, +extern int v9fs_vfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index bc417da7e9c1..75106b9f293d 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -60,7 +60,7 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); int v9fs_uflags2omode(int uflags, int extended); void v9fs_blank_wstat(struct p9_wstat *wstat); -int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, +int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); int 
v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, int datasync); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index b740017634ef..b6ba22975781 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/sched.h> #include <linux/file.h> #include <linux/stat.h> diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 27a04a226d97..4344e7a7865f 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -260,7 +260,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, { int err = 0; - inode_init_owner(&init_user_ns, inode, NULL, mode); + inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_blocks = 0; inode->i_rdev = rdev; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); @@ -672,7 +672,7 @@ error: /** * v9fs_vfs_create - VFS hook to create a regular file - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dir: The parent directory * @dentry: The name of file to be created * @mode: The UNIX file mode to set @@ -684,7 +684,7 @@ error: */ static int -v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); @@ -704,14 +704,14 @@ v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir, /** * v9fs_vfs_mkdir - VFS mkdir hook to create a directory - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @mode: mode for new directory * */ -static int v9fs_vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int err; @@ -908,7 +908,7 @@ int 
v9fs_vfs_rmdir(struct inode *i, struct dentry *d) /** * v9fs_vfs_rename - VFS hook to rename an inode - * @mnt_userns: The user namespace of the mount + * @idmap: The idmap of the mount * @old_dir: old dir inode * @old_dentry: old dentry * @new_dir: new dir inode @@ -918,7 +918,7 @@ int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) */ int -v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +v9fs_vfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -1018,7 +1018,7 @@ error: /** * v9fs_vfs_getattr - retrieve file metadata - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @path: Object to query * @stat: metadata structure to populate * @request_mask: Mask of STATX_xxx flags indicating the caller's interests @@ -1027,7 +1027,7 @@ error: */ static int -v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, +v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; @@ -1038,7 +1038,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); @@ -1051,7 +1051,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, return PTR_ERR(st); v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0); - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); p9stat_free(st); kfree(st); @@ -1060,13 +1060,13 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, 
/** * v9fs_vfs_setattr - set file metadata - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dentry: file whose metadata to set * @iattr: metadata assignment structure * */ -static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, +static int v9fs_vfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; @@ -1077,7 +1077,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, struct p9_wstat wstat; p9_debug(P9_DEBUG_VFS, "\n"); - retval = setattr_prepare(&init_user_ns, dentry, iattr); + retval = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (retval) return retval; @@ -1135,7 +1135,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, v9fs_invalidate_inode_attr(inode); - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); return 0; } @@ -1300,7 +1300,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, /** * v9fs_vfs_symlink - helper function to create symlinks - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dir: directory inode containing symlink * @dentry: dentry for symlink * @symname: symlink data @@ -1310,7 +1310,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, */ static int -v9fs_vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { p9_debug(P9_DEBUG_VFS, " %lu,%pd,%s\n", @@ -1356,7 +1356,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, /** * v9fs_vfs_mknod - create a special file - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dir: inode destination for new link * @dentry: dentry for file * @mode: mode for creation @@ -1365,7 +1365,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, */ static int 
-v9fs_vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index f806b3f11649..3bed3eb3a0e2 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -30,7 +30,7 @@ #include "acl.h" static int -v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev); /** @@ -211,7 +211,7 @@ int v9fs_open_to_dotl_flags(int flags) /** * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. - * @mnt_userns: The user namespace of the mount + * @idmap: The idmap of the mount * @dir: directory inode that is being created * @dentry: dentry that is being deleted * @omode: create permissions @@ -219,10 +219,10 @@ int v9fs_open_to_dotl_flags(int flags) * */ static int -v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_create_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode, bool excl) { - return v9fs_vfs_mknod_dotl(mnt_userns, dir, dentry, omode, 0); + return v9fs_vfs_mknod_dotl(idmap, dir, dentry, omode, 0); } static int @@ -356,14 +356,14 @@ out: /** * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory - * @mnt_userns: The user namespace of the mount + * @idmap: The idmap of the mount * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @omode: mode for new directory * */ -static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns, +static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode) { @@ -450,7 +450,7 @@ error: } static int -v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns, +v9fs_vfs_getattr_dotl(struct
mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -462,7 +462,7 @@ v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); @@ -479,7 +479,7 @@ v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns, return PTR_ERR(st); v9fs_stat2inode_dotl(st, d_inode(dentry), 0); - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -529,13 +529,13 @@ static int v9fs_mapped_iattr_valid(int iattr_valid) /** * v9fs_vfs_setattr_dotl - set file metadata - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dentry: file whose metadata to set * @iattr: metadata assignment structure * */ -int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, +int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; @@ -548,7 +548,7 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, p9_debug(P9_DEBUG_VFS, "\n"); - retval = setattr_prepare(&init_user_ns, dentry, iattr); + retval = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (retval) return retval; @@ -597,7 +597,7 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, truncate_setsize(inode, iattr->ia_size); v9fs_invalidate_inode_attr(inode); - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) { /* We also want to update ACL when we update mode bits */ @@ -687,7 +687,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, 
struct inode *inode, } static int -v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int err; @@ -817,7 +817,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, /** * v9fs_vfs_mknod_dotl - create a special file - * @mnt_userns: The user namespace of the mount + * @idmap: The idmap of the mount * @dir: inode destination for new link * @dentry: dentry for file * @omode: mode for creation @@ -825,7 +825,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, * */ static int -v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev) { int err; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index b6984311e00a..50f7f3f6b55e 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -150,7 +150,7 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler, } static int v9fs_xattr_handler_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/Makefile b/fs/Makefile index 4dea17840761..76abc9e055bd 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -16,7 +16,7 @@ obj-y := open.o read_write.o file_table.o super.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ - kernel_read_file.o remap_range.o + kernel_read_file.o mnt_idmapping.o remap_range.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o direct-io.o mpage.o diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 06b7c92343ad..223f0283d20f 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -144,7 +144,7 @@ struct adfs_discmap { /* Inode stuff */ struct inode 
*adfs_iget(struct super_block *sb, struct object_info *obj); int adfs_write_inode(struct inode *inode, struct writeback_control *wbc); -int adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, +int adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); /* map.c */ diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index ee22278b0cfc..c3ac613d0975 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -294,7 +294,7 @@ out: * later. */ int -adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, +adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -302,7 +302,7 @@ adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, unsigned int ia_valid = attr->ia_valid; int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); /* * we can't change the UID or GID of any file - diff --git a/fs/affs/affs.h b/fs/affs/affs.h index bfa89e131ead..60685ec76d98 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -167,17 +167,17 @@ extern const struct export_operations affs_export_ops; extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int); extern int affs_unlink(struct inode *dir, struct dentry *dentry); -extern int affs_create(struct user_namespace *mnt_userns, struct inode *dir, +extern int affs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool); -extern int affs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +extern int affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode); extern int affs_rmdir(struct inode *dir, struct dentry *dentry); extern int affs_link(struct dentry *olddentry, struct inode *dir, struct dentry *dentry); -extern int 
affs_symlink(struct user_namespace *mnt_userns, +extern int affs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname); -extern int affs_rename2(struct user_namespace *mnt_userns, +extern int affs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); @@ -185,7 +185,7 @@ extern int affs_rename2(struct user_namespace *mnt_userns, /* inode.c */ extern struct inode *affs_new_inode(struct inode *dir); -extern int affs_notify_change(struct user_namespace *mnt_userns, +extern int affs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void affs_evict_inode(struct inode *inode); extern struct inode *affs_iget(struct super_block *sb, diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 2352a75bd9d6..27f77a52c5c8 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -216,7 +216,7 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) } int -affs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, +affs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -224,7 +224,7 @@ affs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid); - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) goto out; @@ -250,7 +250,7 @@ affs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, affs_truncate(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); if (attr->ia_valid & ATTR_MODE) diff --git a/fs/affs/namei.c b/fs/affs/namei.c index bcab18956b4f..d12ccfd2a83d 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -242,7 +242,7 @@ affs_unlink(struct inode *dir, 
struct dentry *dentry) } int -affs_create(struct user_namespace *mnt_userns, struct inode *dir, +affs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; @@ -274,7 +274,7 @@ affs_create(struct user_namespace *mnt_userns, struct inode *dir, } int -affs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -313,7 +313,7 @@ affs_rmdir(struct inode *dir, struct dentry *dentry) } int -affs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +affs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct super_block *sb = dir->i_sb; @@ -503,7 +503,7 @@ done: return retval; } -int affs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, +int affs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/afs/dir.c b/fs/afs/dir.c index b7c1f8c84b38..82690d1dd49a 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -28,17 +28,17 @@ static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, in loff_t fpos, u64 ino, unsigned dtype); static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); -static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl); -static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode); static int afs_rmdir(struct inode *dir, struct dentry *dentry); static int afs_unlink(struct inode *dir, struct dentry *dentry); static int afs_link(struct 
dentry *from, struct inode *dir, struct dentry *dentry); -static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *content); -static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags); @@ -1332,7 +1332,7 @@ static const struct afs_operation_ops afs_mkdir_operation = { /* * create a directory on an AFS filesystem */ -static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct afs_operation *op; @@ -1630,7 +1630,7 @@ static const struct afs_operation_ops afs_create_operation = { /* * create a regular file on an AFS filesystem */ -static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct afs_operation *op; @@ -1760,7 +1760,7 @@ static const struct afs_operation_ops afs_symlink_operation = { /* * create a symlink in an AFS filesystem */ -static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *content) { struct afs_operation *op; @@ -1897,7 +1897,7 @@ static const struct afs_operation_ops afs_rename_operation = { /* * rename a file in an AFS filesystem and/or move it between directories */ -static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry 
*new_dentry, unsigned int flags) { diff --git a/fs/afs/flock.c b/fs/afs/flock.c index bbcc5afd1576..9c6dea3139f5 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -451,7 +451,7 @@ static int afs_do_setlk_check(struct afs_vnode *vnode, struct key *key, */ static int afs_do_setlk(struct file *file, struct file_lock *fl) { - struct inode *inode = locks_inode(file); + struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); enum afs_flock_mode mode = AFS_FS_S(inode->i_sb)->flock_mode; afs_lock_type_t type; @@ -701,7 +701,7 @@ error: */ static int afs_do_unlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); int ret; _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); @@ -721,7 +721,7 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl) */ static int afs_do_getlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct key *key = afs_file_key(file); int ret, lock_count; @@ -763,7 +763,7 @@ error: */ int afs_lock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); enum afs_flock_operation op; int ret; @@ -798,7 +798,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) */ int afs_flock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); enum afs_flock_operation op; int ret; @@ -843,7 +843,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) */ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file)); 
_enter(""); @@ -861,7 +861,7 @@ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) */ static void afs_fl_release_private(struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file)); _enter(""); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 6d3a3dbe4928..0167e96e5198 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -737,7 +737,7 @@ error_unlock: /* * read the attributes of an inode */ -int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int afs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -761,7 +761,7 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && stat->nlink > 0) stat->nlink -= 1; @@ -870,7 +870,7 @@ static const struct afs_operation_ops afs_setattr_operation = { /* * set the attributes of an inode */ -int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { const unsigned int supported = diff --git a/fs/afs/internal.h b/fs/afs/internal.h index fd8567b98e2b..ad8523d0d038 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/ktime.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/pagemap.h> #include <linux/rxrpc.h> #include <linux/key.h> @@ -1170,9 +1171,9 @@ extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); extern bool afs_check_validity(struct afs_vnode *); extern int 
afs_validate(struct afs_vnode *, struct key *); -extern int afs_getattr(struct user_namespace *mnt_userns, const struct path *, +extern int afs_getattr(struct mnt_idmap *idmap, const struct path *, struct kstat *, u32, unsigned int); -extern int afs_setattr(struct user_namespace *mnt_userns, struct dentry *, struct iattr *); +extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *); extern void afs_evict_inode(struct inode *); extern int afs_drop_inode(struct inode *); @@ -1387,7 +1388,7 @@ extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int, extern struct key *afs_request_key(struct afs_cell *); extern struct key *afs_request_key_rcu(struct afs_cell *); extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *); -extern int afs_permission(struct user_namespace *, struct inode *, int); +extern int afs_permission(struct mnt_idmap *, struct inode *, int); extern void __exit afs_clean_up_permit_cache(void); /* diff --git a/fs/afs/security.c b/fs/afs/security.c index 7c6a63a30394..6a7744c9e2a2 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -395,7 +395,7 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key, * - AFS ACLs are attached to directories only, and a file is controlled by its * parent directory's ACL */ -int afs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int afs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct afs_vnode *vnode = AFS_FS_I(inode); diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 7751b0b3f81d..9048d8ccc715 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -97,7 +97,7 @@ static const struct afs_operation_ops afs_store_acl_operation = { * Set a file's AFS3 ACL. 
*/ static int afs_xattr_set_acl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) @@ -228,7 +228,7 @@ static const struct afs_operation_ops yfs_store_opaque_acl2_operation = { * Set a file's YFS ACL. */ static int afs_xattr_set_yfs(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) @@ -361,6 +361,9 @@ static int aio_ring_mremap(struct vm_area_struct *vma) spin_lock(&mm->ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); + if (!table) + goto out_unlock; + for (i = 0; i < table->nr; i++) { struct kioctx *ctx; @@ -374,6 +377,7 @@ static int aio_ring_mremap(struct vm_area_struct *vma) } } +out_unlock: rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); return res; diff --git a/fs/attr.c b/fs/attr.c index b45f30e516fa..aca9ff7aed33 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -14,6 +14,7 @@ #include <linux/capability.h> #include <linux/fsnotify.h> #include <linux/fcntl.h> +#include <linux/filelock.h> #include <linux/security.h> #include <linux/evm.h> #include <linux/ima.h> @@ -23,7 +24,7 @@ /** * setattr_should_drop_sgid - determine whether the setgid bit needs to be * removed - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: inode to check * * This function determines whether the setgid bit needs to be removed. @@ -33,7 +34,7 @@ * * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise. 
*/ -int setattr_should_drop_sgid(struct user_namespace *mnt_userns, +int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode) { umode_t mode = inode->i_mode; @@ -42,8 +43,7 @@ int setattr_should_drop_sgid(struct user_namespace *mnt_userns, return 0; if (mode & S_IXGRP) return ATTR_KILL_SGID; - if (!in_group_or_capable(mnt_userns, inode, - i_gid_into_vfsgid(mnt_userns, inode))) + if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) return ATTR_KILL_SGID; return 0; } @@ -51,7 +51,7 @@ int setattr_should_drop_sgid(struct user_namespace *mnt_userns, /** * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to * be dropped - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: inode to check * * This function determines whether the set{g,u}id bits need to be removed. @@ -63,7 +63,7 @@ int setattr_should_drop_sgid(struct user_namespace *mnt_userns, * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits * to remove, 0 otherwise. 
*/ -int setattr_should_drop_suidgid(struct user_namespace *mnt_userns, +int setattr_should_drop_suidgid(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; @@ -73,7 +73,7 @@ int setattr_should_drop_suidgid(struct user_namespace *mnt_userns, if (unlikely(mode & S_ISUID)) kill = ATTR_KILL_SUID; - kill |= setattr_should_drop_sgid(mnt_userns, inode); + kill |= setattr_should_drop_sgid(idmap, inode); if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) return kill; @@ -84,24 +84,24 @@ EXPORT_SYMBOL(setattr_should_drop_suidgid); /** * chown_ok - verify permissions to chown inode - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: inode to check permissions on * @ia_vfsuid: uid to chown @inode to * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. 
*/ -static bool chown_ok(struct user_namespace *mnt_userns, +static bool chown_ok(struct mnt_idmap *idmap, const struct inode *inode, vfsuid_t ia_vfsuid) { - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid()) && vfsuid_eq(ia_vfsuid, vfsuid)) return true; - if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) + if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN)) return true; if (!vfsuid_valid(vfsuid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) @@ -111,28 +111,28 @@ static bool chown_ok(struct user_namespace *mnt_userns, /** * chgrp_ok - verify permissions to chgrp inode - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: inode to check permissions on * @ia_vfsgid: gid to chown @inode to * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. 
*/ -static bool chgrp_ok(struct user_namespace *mnt_userns, +static bool chgrp_ok(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t ia_vfsgid) { - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) { if (vfsgid_eq(ia_vfsgid, vfsgid)) return true; if (vfsgid_in_group_p(ia_vfsgid)) return true; } - if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) + if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN)) return true; if (!vfsgid_valid(vfsgid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) @@ -142,7 +142,7 @@ static bool chgrp_ok(struct user_namespace *mnt_userns, /** * setattr_prepare - check if attribute changes to a dentry are allowed - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: dentry to check * @attr: attributes to change * @@ -152,16 +152,16 @@ static bool chgrp_ok(struct user_namespace *mnt_userns, * SGID bit from mode if user is not allowed to set it. Also file capabilities * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply passs @nop_mnt_idmap. 
* * Should be called as the first thing in ->setattr implementations, * possibly after taking additional locks. */ -int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, +int setattr_prepare(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -183,34 +183,34 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && - !chown_ok(mnt_userns, inode, attr->ia_vfsuid)) + !chown_ok(idmap, inode, attr->ia_vfsuid)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && - !chgrp_ok(mnt_userns, inode, attr->ia_vfsgid)) + !chgrp_ok(idmap, inode, attr->ia_vfsgid)) return -EPERM; /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { vfsgid_t vfsgid; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (ia_valid & ATTR_GID) vfsgid = attr->ia_vfsgid; else - vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(idmap, inode); /* Also check the setgid bit! */ - if (!in_group_or_capable(mnt_userns, inode, vfsgid)) + if (!in_group_or_capable(idmap, inode, vfsgid)) attr->ia_mode &= ~S_ISGID; } /* Check for setting the inode time. 
*/ if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; } @@ -219,7 +219,7 @@ kill_priv: if (ia_valid & ATTR_KILL_PRIV) { int error; - error = security_inode_killpriv(mnt_userns, dentry); + error = security_inode_killpriv(idmap, dentry); if (error) return error; } @@ -276,7 +276,7 @@ EXPORT_SYMBOL(inode_newsize_ok); /** * setattr_copy - copy simple metadata updates into the generic inode - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: the inode to be updated * @attr: the new attributes * @@ -289,23 +289,23 @@ EXPORT_SYMBOL(inode_newsize_ok); * Noticeably missing is inode size update, which is more complex * as it requires pagecache updates. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. * * The inode is not marked as dirty after this operation. The rationale is * that for "simple" filesystems, the struct inode is the inode storage. * The caller is free to mark the inode dirty afterwards if needed. 
*/ -void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, +void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; - i_uid_update(mnt_userns, attr, inode); - i_gid_update(mnt_userns, attr, inode); + i_uid_update(idmap, attr, inode); + i_gid_update(idmap, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -314,15 +314,15 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - if (!in_group_or_capable(mnt_userns, inode, - i_gid_into_vfsgid(mnt_userns, inode))) + if (!in_group_or_capable(idmap, inode, + i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; inode->i_mode = mode; } } EXPORT_SYMBOL(setattr_copy); -int may_setattr(struct user_namespace *mnt_userns, struct inode *inode, +int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid) { int error; @@ -340,8 +340,8 @@ int may_setattr(struct user_namespace *mnt_userns, struct inode *inode, if (IS_IMMUTABLE(inode)) return -EPERM; - if (!inode_owner_or_capable(mnt_userns, inode)) { - error = inode_permission(mnt_userns, inode, MAY_WRITE); + if (!inode_owner_or_capable(idmap, inode)) { + error = inode_permission(idmap, inode, MAY_WRITE); if (error) return error; } @@ -352,7 +352,7 @@ EXPORT_SYMBOL(may_setattr); /** * notify_change - modify attributes of a filesytem object - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: object affected * @attr: new attributes * @delegated_inode: returns inode, if the inode is delegated @@ -371,13 +371,13 @@ EXPORT_SYMBOL(may_setattr); * the file open for write, as there can be no conflicting delegation in * that case. 
* - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. */ -int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, +int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; @@ -388,7 +388,7 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, WARN_ON_ONCE(!inode_is_locked(inode)); - error = may_setattr(mnt_userns, inode, ia_valid); + error = may_setattr(idmap, inode, ia_valid); if (error) return error; @@ -453,11 +453,11 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, * namespace of the superblock. */ if (ia_valid & ATTR_UID && - !vfsuid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns, + !vfsuid_has_fsmapping(idmap, inode->i_sb->s_user_ns, attr->ia_vfsuid)) return -EOVERFLOW; if (ia_valid & ATTR_GID && - !vfsgid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns, + !vfsgid_has_fsmapping(idmap, inode->i_sb->s_user_ns, attr->ia_vfsgid)) return -EOVERFLOW; @@ -465,13 +465,13 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, * gids unless those uids & gids are being made valid. 
*/ if (!(ia_valid & ATTR_UID) && - !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode))) + !vfsuid_valid(i_uid_into_vfsuid(idmap, inode))) return -EOVERFLOW; if (!(ia_valid & ATTR_GID) && - !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) + !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; - error = security_inode_setattr(mnt_userns, dentry, attr); + error = security_inode_setattr(idmap, dentry, attr); if (error) return error; error = try_break_deleg(inode, delegated_inode); @@ -479,13 +479,13 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, return error; if (inode->i_op->setattr) - error = inode->i_op->setattr(mnt_userns, dentry, attr); + error = inode->i_op->setattr(idmap, dentry, attr); else - error = simple_setattr(mnt_userns, dentry, attr); + error = simple_setattr(idmap, dentry, attr); if (!error) { fsnotify_change(dentry, ia_valid); - ima_inode_post_setattr(mnt_userns, dentry); + ima_inode_post_setattr(idmap, dentry); evm_inode_post_setattr(dentry, ia_valid); } diff --git a/fs/autofs/root.c b/fs/autofs/root.c index ca03c1cae2be..6baf90b08e0e 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -10,12 +10,12 @@ #include "autofs_i.h" -static int autofs_dir_permission(struct user_namespace *, struct inode *, int); -static int autofs_dir_symlink(struct user_namespace *, struct inode *, +static int autofs_dir_permission(struct mnt_idmap *, struct inode *, int); +static int autofs_dir_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); static int autofs_dir_unlink(struct inode *, struct dentry *); static int autofs_dir_rmdir(struct inode *, struct dentry *); -static int autofs_dir_mkdir(struct user_namespace *, struct inode *, +static int autofs_dir_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); static long autofs_root_ioctl(struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT @@ -543,7 +543,7 @@ static struct dentry *autofs_lookup(struct inode *dir, 
return NULL; } -static int autofs_dir_permission(struct user_namespace *mnt_userns, +static int autofs_dir_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (mask & MAY_WRITE) { @@ -560,10 +560,10 @@ static int autofs_dir_permission(struct user_namespace *mnt_userns, return -EACCES; } - return generic_permission(mnt_userns, inode, mask); + return generic_permission(idmap, inode, mask); } -static int autofs_dir_symlink(struct user_namespace *mnt_userns, +static int autofs_dir_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { @@ -720,7 +720,7 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs_dir_mkdir(struct user_namespace *mnt_userns, +static int autofs_dir_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 92737166203f..db649487d58c 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -27,7 +27,7 @@ static const struct file_operations bad_file_ops = .open = bad_file_open, }; -static int bad_inode_create(struct user_namespace *mnt_userns, +static int bad_inode_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -51,14 +51,14 @@ static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) return -EIO; } -static int bad_inode_symlink(struct user_namespace *mnt_userns, +static int bad_inode_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { return -EIO; } -static int bad_inode_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { return -EIO; @@ -69,13 +69,13 @@ static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) return -EIO; } -static int bad_inode_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int 
bad_inode_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { return -EIO; } -static int bad_inode_rename2(struct user_namespace *mnt_userns, +static int bad_inode_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -89,20 +89,20 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, return -EIO; } -static int bad_inode_permission(struct user_namespace *mnt_userns, +static int bad_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return -EIO; } -static int bad_inode_getattr(struct user_namespace *mnt_userns, +static int bad_inode_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { return -EIO; } -static int bad_inode_setattr(struct user_namespace *mnt_userns, +static int bad_inode_setattr(struct mnt_idmap *idmap, struct dentry *direntry, struct iattr *attrs) { return -EIO; @@ -146,14 +146,14 @@ static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, return -EIO; } -static int bad_inode_tmpfile(struct user_namespace *mnt_userns, +static int bad_inode_tmpfile(struct mnt_idmap *idmap, struct inode *inode, struct file *file, umode_t mode) { return -EIO; } -static int bad_inode_set_acl(struct user_namespace *mnt_userns, +static int bad_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 34d4f68f786b..040d5140e426 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -75,7 +75,7 @@ const struct file_operations bfs_dir_operations = { .llseek = generic_file_llseek, }; -static int bfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int bfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { int err; @@ -96,7 +96,7 @@ static int 
bfs_create(struct user_namespace *mnt_userns, struct inode *dir, } set_bit(ino, info->si_imap); info->si_freei--; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; inode->i_op = &bfs_file_inops; @@ -199,7 +199,7 @@ out_brelse: return error; } -static int bfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int bfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 555c962fdad6..90d53209755b 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,8 @@ condflags := \ $(call cc-option, -Wunused-but-set-variable) \ $(call cc-option, -Wunused-const-variable) \ $(call cc-option, -Wpacked-not-aligned) \ - $(call cc-option, -Wstringop-truncation) + $(call cc-option, -Wstringop-truncation) \ + $(call cc-option, -Wmaybe-uninitialized) subdir-ccflags-y += $(condflags) # The following turn off the warnings enabled by -Wextra subdir-ccflags-y += -Wno-missing-field-initializers @@ -31,7 +32,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \ + lru_cache.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 3da1779e8b79..7427449a04a3 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -110,7 +110,7 @@ out: return ret; } -int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry 
*dentry, +int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int ret; @@ -118,7 +118,7 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t old_mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - ret = posix_acl_update_mode(mnt_userns, inode, + ret = posix_acl_update_mode(idmap, inode, &inode->i_mode, &acl); if (ret) return ret; diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h index 39bd36e6eeb7..a270e71ec05f 100644 --- a/fs/btrfs/acl.h +++ b/fs/btrfs/acl.h @@ -6,7 +6,7 @@ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); -int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct posix_acl *acl, int type); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 46851511b661..90e40d5ceccd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1252,8 +1252,12 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct struct btrfs_root *root, u64 bytenr, int level, bool *is_shared) { + const struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_backref_shared_cache_entry *entry; + if (!current->journal_info) + lockdep_assert_held(&fs_info->commit_root_sem); + if (!ctx->use_path_cache) return false; @@ -1288,7 +1292,7 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct * could be a snapshot sharing this extent buffer. 
*/ if (entry->is_shared && - entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) + entry->gen != btrfs_get_last_root_drop_gen(fs_info)) return false; *is_shared = entry->is_shared; @@ -1318,9 +1322,13 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx struct btrfs_root *root, u64 bytenr, int level, bool is_shared) { + const struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_backref_shared_cache_entry *entry; u64 gen; + if (!current->journal_info) + lockdep_assert_held(&fs_info->commit_root_sem); + if (!ctx->use_path_cache) return; @@ -1336,7 +1344,7 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx ASSERT(level >= 0); if (is_shared) - gen = btrfs_get_last_root_drop_gen(root->fs_info); + gen = btrfs_get_last_root_drop_gen(fs_info); else gen = btrfs_root_last_snapshot(&root->root_item); @@ -1864,6 +1872,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, .have_delayed_delete_refs = false, }; int level; + bool leaf_cached; + bool leaf_is_shared; for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) { if (ctx->prev_extents_cache[i].bytenr == bytenr) @@ -1885,6 +1895,23 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, walk_ctx.time_seq = elem.seq; } + ctx->use_path_cache = true; + + /* + * We may have previously determined that the current leaf is shared. + * If it is, then we have a data extent that is shared due to a shared + * subtree (caused by snapshotting) and we don't need to check for data + * backrefs. If the leaf is not shared, then we must do backref walking + * to determine if the data extent is shared through reflinks. 
+ */ + leaf_cached = lookup_backref_shared_cache(ctx, root, + ctx->curr_leaf_bytenr, 0, + &leaf_is_shared); + if (leaf_cached && leaf_is_shared) { + ret = 1; + goto out_trans; + } + walk_ctx.ignore_extent_item_pos = true; walk_ctx.trans = trans; walk_ctx.fs_info = fs_info; @@ -1893,7 +1920,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, /* -1 means we are in the bytenr of the data extent. */ level = -1; ULIST_ITER_INIT(&uiter); - ctx->use_path_cache = true; while (1) { bool is_shared; bool cached; @@ -1964,6 +1990,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, ctx->prev_extents_cache_slot = slot; } +out_trans: if (trans) { btrfs_put_tree_mod_seq(fs_info, &elem); btrfs_end_transaction(trans); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 8affc88b0e0a..d8b90f95b157 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -14,19 +14,31 @@ #include "dev-replace.h" #include "rcu-string.h" #include "zoned.h" +#include "file-item.h" static struct bio_set btrfs_bioset; +static struct bio_set btrfs_clone_bioset; +static struct bio_set btrfs_repair_bioset; +static mempool_t btrfs_failed_bio_pool; + +struct btrfs_failed_bio { + struct btrfs_bio *bbio; + int num_copies; + atomic_t repair_count; +}; /* * Initialize a btrfs_bio structure. This skips the embedded bio itself as it * is already initialized by the block layer. */ -static inline void btrfs_bio_init(struct btrfs_bio *bbio, - btrfs_bio_end_io_t end_io, void *private) +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, + btrfs_bio_end_io_t end_io, void *private) { memset(bbio, 0, offsetof(struct btrfs_bio, bio)); + bbio->inode = inode; bbio->end_io = end_io; bbio->private = private; + atomic_set(&bbio->pending_ios, 1); } /* @@ -37,32 +49,235 @@ static inline void btrfs_bio_init(struct btrfs_bio *bbio, * a mempool. 
*/ struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_inode *inode, btrfs_bio_end_io_t end_io, void *private) { struct bio *bio; bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio), end_io, private); + btrfs_bio_init(btrfs_bio(bio), inode, end_io, private); return bio; } -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private) +static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, + struct bio *orig, u64 map_length, + bool use_append) { + struct btrfs_bio *orig_bbio = btrfs_bio(orig); struct bio *bio; - struct btrfs_bio *bbio; - ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + if (use_append) { + unsigned int nr_segs; + + bio = bio_split_rw(orig, &fs_info->limits, &nr_segs, + &btrfs_clone_bioset, map_length); + } else { + bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, + &btrfs_clone_bioset); + } + btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio); - bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); - bbio = btrfs_bio(bio); - btrfs_bio_init(bbio, end_io, private); + btrfs_bio(bio)->file_offset = orig_bbio->file_offset; + if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED)) + orig_bbio->file_offset += map_length; - bio_trim(bio, offset >> 9, size >> 9); - bbio->iter = bio->bi_iter; + atomic_inc(&orig_bbio->pending_ios); return bio; } +static void btrfs_orig_write_end_io(struct bio *bio); + +static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, + struct btrfs_bio *orig_bbio) +{ + /* + * For writes we tolerate nr_mirrors - 1 write failures, so we can't + * just blindly propagate a write failure here. Instead increment the + * error count in the original I/O context so that it is guaranteed to + * be larger than the error tolerance. 
+ */ + if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { + struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; + struct btrfs_io_context *orig_bioc = orig_stripe->bioc; + + atomic_add(orig_bioc->max_errors, &orig_bioc->error); + } else { + orig_bbio->bio.bi_status = bbio->bio.bi_status; + } +} + +static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) +{ + if (bbio->bio.bi_pool == &btrfs_clone_bioset) { + struct btrfs_bio *orig_bbio = bbio->private; + + if (bbio->bio.bi_status) + btrfs_bbio_propagate_error(bbio, orig_bbio); + bio_put(&bbio->bio); + bbio = orig_bbio; + } + + if (atomic_dec_and_test(&bbio->pending_ios)) + bbio->end_io(bbio); +} + +static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +{ + if (cur_mirror == fbio->num_copies) + return cur_mirror + 1 - fbio->num_copies; + return cur_mirror + 1; +} + +static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +{ + if (cur_mirror == 1) + return fbio->num_copies; + return cur_mirror - 1; +} + +static void btrfs_repair_done(struct btrfs_failed_bio *fbio) +{ + if (atomic_dec_and_test(&fbio->repair_count)) { + btrfs_orig_bbio_end_io(fbio->bbio); + mempool_free(fbio, &btrfs_failed_bio_pool); + } +} + +static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, + struct btrfs_device *dev) +{ + struct btrfs_failed_bio *fbio = repair_bbio->private; + struct btrfs_inode *inode = repair_bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); + int mirror = repair_bbio->mirror_num; + + if (repair_bbio->bio.bi_status || + !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { + bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); + repair_bbio->bio.bi_iter = repair_bbio->saved_iter; + + mirror = next_repair_mirror(fbio, mirror); + if (mirror == fbio->bbio->mirror_num) { + btrfs_debug(fs_info, "no mirror left"); + fbio->bbio->bio.bi_status = BLK_STS_IOERR; + goto done; + } + + 
btrfs_submit_bio(&repair_bbio->bio, mirror); + return; + } + + do { + mirror = prev_repair_mirror(fbio, mirror); + btrfs_repair_io_failure(fs_info, btrfs_ino(inode), + repair_bbio->file_offset, fs_info->sectorsize, + repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, + bv->bv_page, bv->bv_offset, mirror); + } while (mirror != fbio->bbio->mirror_num); + +done: + btrfs_repair_done(fbio); + bio_put(&repair_bbio->bio); +} + +/* + * Try to kick off a repair read to the next available mirror for a bad sector. + * + * This primarily tries to recover good data to serve the actual read request, + * but also tries to write the good data back to the bad mirror(s) when a + * read succeeded to restore the redundancy. + */ +static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, + u32 bio_offset, + struct bio_vec *bv, + struct btrfs_failed_bio *fbio) +{ + struct btrfs_inode *inode = failed_bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; + const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); + struct btrfs_bio *repair_bbio; + struct bio *repair_bio; + int num_copies; + int mirror; + + btrfs_debug(fs_info, "repair read error: read error at %llu", + failed_bbio->file_offset + bio_offset); + + num_copies = btrfs_num_copies(fs_info, logical, sectorsize); + if (num_copies == 1) { + btrfs_debug(fs_info, "no copy to repair from"); + failed_bbio->bio.bi_status = BLK_STS_IOERR; + return fbio; + } + + if (!fbio) { + fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS); + fbio->bbio = failed_bbio; + fbio->num_copies = num_copies; + atomic_set(&fbio->repair_count, 1); + } + + atomic_inc(&fbio->repair_count); + + repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, + &btrfs_repair_bioset); + repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; + bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); + + repair_bbio = btrfs_bio(repair_bio); + 
btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio); + repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; + + mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); + btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); + btrfs_submit_bio(repair_bio, mirror); + return fbio; +} + +static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev) +{ + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u32 sectorsize = fs_info->sectorsize; + struct bvec_iter *iter = &bbio->saved_iter; + blk_status_t status = bbio->bio.bi_status; + struct btrfs_failed_bio *fbio = NULL; + u32 offset = 0; + + /* + * Hand off repair bios to the repair code as there is no upper level + * submitter for them. + */ + if (bbio->bio.bi_pool == &btrfs_repair_bioset) { + btrfs_end_repair_bio(bbio, dev); + return; + } + + /* Clear the I/O error. A failed repair will reset it. */ + bbio->bio.bi_status = BLK_STS_OK; + + while (iter->bi_size) { + struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); + + bv.bv_len = min(bv.bv_len, sectorsize); + if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv)) + fbio = repair_one_sector(bbio, offset, &bv, fbio); + + bio_advance_iter_single(&bbio->bio, iter, sectorsize); + offset += sectorsize; + } + + if (bbio->csum != bbio->csum_inline) + kfree(bbio->csum); + + if (fbio) + btrfs_repair_done(fbio); + else + btrfs_orig_bbio_end_io(bbio); +} + static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) { if (!dev || !dev->bdev) @@ -90,24 +305,31 @@ static void btrfs_end_bio_work(struct work_struct *work) { struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); - bbio->end_io(bbio); + /* Metadata reads are checked and repaired by the submitter. 
*/ + if (bbio->bio.bi_opf & REQ_META) + bbio->end_io(bbio); + else + btrfs_check_read_bio(bbio, bbio->bio.bi_private); } static void btrfs_simple_end_io(struct bio *bio) { - struct btrfs_fs_info *fs_info = bio->bi_private; struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_device *dev = bio->bi_private; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; btrfs_bio_counter_dec(fs_info); if (bio->bi_status) - btrfs_log_dev_io_error(bio, bbio->device); + btrfs_log_dev_io_error(bio, dev); if (bio_op(bio) == REQ_OP_READ) { INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - bbio->end_io(bbio); + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + btrfs_record_physical_zoned(bbio); + btrfs_orig_bbio_end_io(bbio); } } @@ -118,7 +340,10 @@ static void btrfs_raid56_end_io(struct bio *bio) btrfs_bio_counter_dec(bioc->fs_info); bbio->mirror_num = bioc->mirror_num; - bbio->end_io(bbio); + if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META)) + btrfs_check_read_bio(bbio, NULL); + else + btrfs_orig_bbio_end_io(bbio); btrfs_put_bioc(bioc); } @@ -145,7 +370,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - bbio->end_io(bbio); + btrfs_orig_bbio_end_io(bbio); btrfs_put_bioc(bioc); } @@ -181,16 +406,10 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) */ if (bio_op(bio) == REQ_OP_ZONE_APPEND) { u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 zone_start = round_down(physical, dev->fs_info->zone_size); - if (btrfs_dev_is_sequential(dev, physical)) { - u64 zone_start = round_down(physical, - dev->fs_info->zone_size); - - bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; - } else { - bio->bi_opf &= ~REQ_OP_ZONE_APPEND; - bio->bi_opf |= REQ_OP_WRITE; - } + ASSERT(btrfs_dev_is_sequential(dev, physical)); + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; } btrfs_debug_in_rcu(dev->fs_info, "%s: rw %d 0x%x, 
sector=%llu, dev=%lu (%s id %llu), size=%u", @@ -224,41 +443,21 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) +static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) { - u64 logical = bio->bi_iter.bi_sector << 9; - u64 length = bio->bi_iter.bi_size; - u64 map_length = length; - struct btrfs_io_context *bioc = NULL; - struct btrfs_io_stripe smap; - int ret; - - btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); - if (ret) { - btrfs_bio_counter_dec(fs_info); - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); - return; - } - - if (map_length < length) { - btrfs_crit(fs_info, - "mapping failed logical %llu bio len %llu len %llu", - logical, length, map_length); - BUG(); - } + /* Do not leak our private flag into the block layer. */ + bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; if (!bioc) { - /* Single mirror read/write fast path */ + /* Single mirror read/write fast path. */ btrfs_bio(bio)->mirror_num = mirror_num; - btrfs_bio(bio)->device = smap.dev; - bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - bio->bi_private = fs_info; + bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; + bio->bi_private = smap->dev; bio->bi_end_io = btrfs_simple_end_io; - btrfs_submit_dev_bio(smap.dev, bio); + btrfs_submit_dev_bio(smap->dev, bio); } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - /* Parity RAID write or read recovery */ + /* Parity RAID write or read recovery. 
*/ bio->bi_private = bioc; bio->bi_end_io = btrfs_raid56_end_io; if (bio_op(bio) == REQ_OP_READ) @@ -266,16 +465,233 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror else raid56_parity_write(bio, bioc); } else { - /* Write to multiple mirrors */ + /* Write to multiple mirrors. */ int total_devs = bioc->num_stripes; - int dev_nr; bioc->orig_bio = bio; - for (dev_nr = 0; dev_nr < total_devs; dev_nr++) + for (int dev_nr = 0; dev_nr < total_devs; dev_nr++) btrfs_submit_mirrored_bio(bioc, dev_nr); } } +static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) +{ + if (bbio->bio.bi_opf & REQ_META) + return btree_csum_one_bio(bbio); + return btrfs_csum_one_bio(bbio); +} + +/* + * Async submit bios are used to offload expensive checksumming onto the worker + * threads. + */ +struct async_submit_bio { + struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe smap; + int mirror_num; + struct btrfs_work work; +}; + +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the btree. + */ +static void run_one_async_start(struct btrfs_work *work) +{ + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + blk_status_t ret; + + ret = btrfs_bio_csum(async->bbio); + if (ret) + async->bbio->bio.bi_status = ret; +} + +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the tree. 
+ */ +static void run_one_async_done(struct btrfs_work *work) +{ + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + struct bio *bio = &async->bbio->bio; + + /* If an error occurred we just want to clean up the bio and move on. */ + if (bio->bi_status) { + btrfs_orig_bbio_end_io(async->bbio); + return; + } + + /* + * All of the bios that pass through here are from async helpers. + * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. + * This changes nothing when cgroups aren't in use. + */ + bio->bi_opf |= REQ_CGROUP_PUNT; + __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); +} + +static void run_one_async_free(struct btrfs_work *work) +{ + kfree(container_of(work, struct async_submit_bio, work)); +} + +static bool should_async_write(struct btrfs_bio *bbio) +{ + /* + * If the I/O is not issued by fsync and friends, (->sync_writers != 0), + * then try to defer the submission to a workqueue to parallelize the + * checksum calculation. + */ + if (atomic_read(&bbio->inode->sync_writers)) + return false; + + /* + * Submit metadata writes synchronously if the checksum implementation + * is fast, or we are on a zoned device that wants I/O to be submitted + * in order. + */ + if (bbio->bio.bi_opf & REQ_META) { + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + + if (btrfs_is_zoned(fs_info)) + return false; + if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) + return false; + } + + return true; +} + +/* + * Submit bio to an async queue. + * + * Return true if the work has been succesfuly submitted, else false. 
+ */ +static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, + struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) +{ + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return false; + + async->bbio = bbio; + async->bioc = bioc; + async->smap = *smap; + async->mirror_num = mirror_num; + + btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, + run_one_async_free); + if (op_is_sync(bbio->bio.bi_opf)) + btrfs_queue_work(fs_info->hipri_workers, &async->work); + else + btrfs_queue_work(fs_info->workers, &async->work); + return true; +} + +static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) +{ + struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_bio *orig_bbio = bbio; + u64 logical = bio->bi_iter.bi_sector << 9; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; + bool use_append = btrfs_use_zone_append(bbio); + struct btrfs_io_context *bioc = NULL; + struct btrfs_io_stripe smap; + blk_status_t ret; + int error; + + btrfs_bio_counter_inc_blocked(fs_info); + error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num, 1); + if (error) { + ret = errno_to_blk_status(error); + goto fail; + } + + map_length = min(map_length, length); + if (use_append) + map_length = min(map_length, fs_info->max_zone_append_size); + + if (map_length < length) { + bio = btrfs_split_bio(fs_info, bio, map_length, use_append); + bbio = btrfs_bio(bio); + } + + /* + * Save the iter for the end_io handler and preload the checksums for + * data reads. 
+ */ + if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) { + bbio->saved_iter = bio->bi_iter; + ret = btrfs_lookup_bio_sums(bbio); + if (ret) + goto fail_put_bio; + } + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + if (use_append) { + bio->bi_opf &= ~REQ_OP_WRITE; + bio->bi_opf |= REQ_OP_ZONE_APPEND; + ret = btrfs_extract_ordered_extent(btrfs_bio(bio)); + if (ret) + goto fail_put_bio; + } + + /* + * Csum items for reloc roots have already been cloned at this + * point, so they are handled as part of the no-checksum case. + */ + if (!(inode->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !btrfs_is_data_reloc_root(inode->root)) { + if (should_async_write(bbio) && + btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) + goto done; + + ret = btrfs_bio_csum(bbio); + if (ret) + goto fail_put_bio; + } + } + + __btrfs_submit_bio(bio, bioc, &smap, mirror_num); +done: + return map_length == length; + +fail_put_bio: + if (map_length < length) + bio_put(bio); +fail: + btrfs_bio_counter_dec(fs_info); + btrfs_bio_end_io(orig_bbio, ret); + /* Do not submit another chunk */ + return true; +} + +void btrfs_submit_bio(struct bio *bio, int mirror_num) +{ + while (!btrfs_submit_chunk(bio, mirror_num)) + ; +} + /* * Submit a repair write. * @@ -283,7 +699,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror * RAID setup. Here we only want to write the one bad copy, so we do the * mapping ourselves and submit the bio directly. * - * The I/O is issued sychronously to block the repair read completion from + * The I/O is issued synchronously to block the repair read completion from * freeing the bio. 
*/ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, @@ -381,10 +797,31 @@ int __init btrfs_bioset_init(void) offsetof(struct btrfs_bio, bio), BIOSET_NEED_BVECS)) return -ENOMEM; + if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), 0)) + goto out_free_bioset; + if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), + BIOSET_NEED_BVECS)) + goto out_free_clone_bioset; + if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, + sizeof(struct btrfs_failed_bio))) + goto out_free_repair_bioset; return 0; + +out_free_repair_bioset: + bioset_exit(&btrfs_repair_bioset); +out_free_clone_bioset: + bioset_exit(&btrfs_clone_bioset); +out_free_bioset: + bioset_exit(&btrfs_bioset); + return -ENOMEM; } void __cold btrfs_bioset_exit(void) { + mempool_exit(&btrfs_failed_bio_pool); + bioset_exit(&btrfs_repair_bioset); + bioset_exit(&btrfs_clone_bioset); bioset_exit(&btrfs_bioset); } diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index b12f84b3b341..873ff85817f0 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -26,32 +26,23 @@ struct btrfs_fs_info; typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); /* - * Additional info to pass along bio. - * - * Mostly for btrfs specific features like csum and mirror_num. + * Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and + * passed to btrfs_submit_bio for mapping to the physical devices. */ struct btrfs_bio { - unsigned int mirror_num:7; - - /* - * Extra indicator for metadata bios. - * For some btrfs bios they use pages without a mapping, thus - * we can not rely on page->mapping->host to determine if - * it's a metadata bio. - */ - unsigned int is_metadata:1; - struct bvec_iter iter; - - /* for direct I/O */ + /* Inode and offset into it that this I/O operates on. */ + struct btrfs_inode *inode; u64 file_offset; - /* @device is for stripe IO submission. 
*/ - struct btrfs_device *device; union { - /* For data checksum verification. */ + /* + * Data checksumming and original I/O information for internal + * use in the btrfs_submit_bio machinery. + */ struct { u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + struct bvec_iter saved_iter; }; /* For metadata parentness verification. */ @@ -62,7 +53,9 @@ struct btrfs_bio { btrfs_bio_end_io_t end_io; void *private; - /* For read end I/O handling */ + /* For internal use in read end I/O handling */ + unsigned int mirror_num; + atomic_t pending_ios; struct work_struct end_io_work; /* @@ -80,11 +73,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) int __init btrfs_bioset_init(void); void __cold btrfs_bioset_exit(void); +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, + btrfs_bio_end_io_t end_io, void *private); struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_inode *inode, btrfs_bio_end_io_t end_io, void *private); -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private); - static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { @@ -92,34 +85,10 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) bbio->end_io(bbio); } -static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) -{ - if (bbio->is_metadata) - return; - if (bbio->csum != bbio->csum_inline) { - kfree(bbio->csum); - bbio->csum = NULL; - } -} +/* Bio only refers to one ordered extent. */ +#define REQ_BTRFS_ONE_ORDERED REQ_DRV -/* - * Iterate through a btrfs_bio (@bbio) on a per-sector basis. 
- * - * bvl - struct bio_vec - * bbio - struct btrfs_bio - * iters - struct bvec_iter - * bio_offset - unsigned int - */ -#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ - for ((iter) = (bbio)->iter, (bio_offset) = 0; \ - (iter).bi_size && \ - (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ - (bio_offset) += fs_info->sectorsize, \ - bio_advance_iter_single(&(bbio)->bio, &(iter), \ - (fs_info)->sectorsize)) - -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num); +void btrfs_submit_bio(struct bio *bio, int mirror_num); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 708d843daa72..5b10401d803b 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/sizes.h> #include <linux/list_sort.h> #include "misc.h" #include "ctree.h" @@ -539,6 +540,153 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end return total_added; } +/* + * Get an arbitrary extent item index / max_index through the block group + * + * @block_group the block group to sample from + * @index: the integral step through the block group to grab from + * @max_index: the granularity of the sampling + * @key: return value parameter for the item we find + * + * Pre-conditions on indices: + * 0 <= index <= max_index + * 0 < max_index + * + * Returns: 0 on success, 1 if the search didn't yield a useful item, negative + * error code on error. 
+ */ +static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, + struct btrfs_block_group *block_group, + int index, int max_index, + struct btrfs_key *key) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *extent_root; + int ret = 0; + u64 search_offset; + u64 search_end = block_group->start + block_group->length; + struct btrfs_path *path; + + ASSERT(index >= 0); + ASSERT(index <= max_index); + ASSERT(max_index > 0); + lockdep_assert_held(&caching_ctl->mutex); + lockdep_assert_held_read(&fs_info->commit_root_sem); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, + BTRFS_SUPER_INFO_OFFSET)); + + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = READA_FORWARD; + + search_offset = index * div_u64(block_group->length, max_index); + key->objectid = block_group->start + search_offset; + key->type = BTRFS_EXTENT_ITEM_KEY; + key->offset = 0; + + while (1) { + ret = btrfs_search_forward(extent_root, key, path, 0); + if (ret != 0) + goto out; + /* Success; sampled an extent item in the block group */ + if (key->type == BTRFS_EXTENT_ITEM_KEY && + key->objectid >= block_group->start && + key->objectid + key->offset <= search_end) + goto out; + + /* We can't possibly find a valid extent item anymore */ + if (key->objectid >= search_end) { + ret = 1; + break; + } + if (key->type < BTRFS_EXTENT_ITEM_KEY) + key->type = BTRFS_EXTENT_ITEM_KEY; + else + key->objectid++; + btrfs_release_path(path); + up_read(&fs_info->commit_root_sem); + mutex_unlock(&caching_ctl->mutex); + cond_resched(); + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + } +out: + lockdep_assert_held(&caching_ctl->mutex); + lockdep_assert_held_read(&fs_info->commit_root_sem); + btrfs_free_path(path); + return ret; +} + +/* + * Best effort attempt to compute a block group's size class while caching it. 
+ * + * @block_group: the block group we are caching + * + * We cannot infer the size class while adding free space extents, because that + * logic doesn't care about contiguous file extents (it doesn't differentiate + * between a 100M extent and 100 contiguous 1M extents). So we need to read the + * file extent items. Reading all of them is quite wasteful, because usually + * only a handful are enough to give a good answer. Therefore, we just grab 5 of + * them at even steps through the block group and pick the smallest size class + * we see. Since size class is best effort, and not guaranteed in general, + * inaccuracy is acceptable. + * + * To be more explicit about why this algorithm makes sense: + * + * If we are caching in a block group from disk, then there are three major cases + * to consider: + * 1. the block group is well behaved and all extents in it are the same size + * class. + * 2. the block group is mostly one size class with rare exceptions for last + * ditch allocations + * 3. the block group was populated before size classes and can have a totally + * arbitrary mix of size classes. + * + * In case 1, looking at any extent in the block group will yield the correct + * result. For the mixed cases, taking the minimum size class seems like a good + * approximation, since gaps from frees will be usable to the size class. For + * 2., a small handful of file extents is likely to yield the right answer. For + * 3, we can either read every file extent, or admit that this is best effort + * anyway and try to stay fast. + * + * Returns: 0 on success, negative error code on error. 
+ */ +static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, + struct btrfs_block_group *block_group) +{ + struct btrfs_key key; + int i; + u64 min_size = block_group->length; + enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; + int ret; + + if (!btrfs_block_group_should_use_size_class(block_group)) + return 0; + + for (i = 0; i < 5; ++i) { + ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); + if (ret < 0) + goto out; + if (ret > 0) + continue; + min_size = min_t(u64, min_size, key.offset); + size_class = btrfs_calc_block_group_size_class(min_size); + } + if (size_class != BTRFS_BG_SZ_NONE) { + spin_lock(&block_group->lock); + block_group->size_class = size_class; + spin_unlock(&block_group->lock); + } + +out: + return ret; +} + static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group = caching_ctl->block_group; @@ -683,6 +831,7 @@ static noinline void caching_thread(struct btrfs_work *work) mutex_lock(&caching_ctl->mutex); down_read(&fs_info->commit_root_sem); + load_block_group_size_class(caching_ctl, block_group); if (btrfs_test_opt(fs_info, SPACE_CACHE)) { ret = load_free_space_cache(block_group); if (ret == 1) { @@ -1816,7 +1965,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * * @fs_info: the filesystem * @chunk_start: logical address of block group - * @bdev: physical device to resolve, can be NULL to indicate any device * @physical: physical address to map to logical addresses * @logical: return array of logical addresses which map to @physical * @naddrs: length of @logical @@ -1827,8 +1975,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * block copies. 
*/ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - struct block_device *bdev, u64 physical, u64 **logical, - int *naddrs, int *stripe_len) + u64 physical, u64 **logical, int *naddrs, int *stripe_len) { struct extent_map *em; struct map_lookup *map; @@ -1868,9 +2015,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, data_stripe_length)) continue; - if (bdev && map->stripes[i].dev->bdev != bdev) - continue; - stripe_nr = physical - map->stripes[i].physical; stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); @@ -1927,7 +2071,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(fs_info, cache->start, NULL, + ret = btrfs_rmap_block(fs_info, cache->start, bytenr, &logical, &nr, &stripe_len); if (ret) return ret; @@ -3330,7 +3474,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->delalloc_root_lock); while (total) { - bool reclaim; + bool reclaim = false; cache = btrfs_lookup_block_group(info, bytenr); if (!cache) { @@ -3379,6 +3523,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, cache->space_info->disk_used -= num_bytes * factor; reclaim = should_reclaim_block_group(cache, num_bytes); + spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -3433,32 +3578,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * reservation and return -EAGAIN, otherwise this function always succeeds. 
*/ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc) + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class) { struct btrfs_space_info *space_info = cache->space_info; + enum btrfs_block_group_size_class size_class; int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); if (cache->ro) { ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - trace_btrfs_space_reservation(cache->fs_info, "space_info", - space_info->flags, num_bytes, 1); - btrfs_space_info_update_bytes_may_use(cache->fs_info, - space_info, -ram_bytes); - if (delalloc) - cache->delalloc_bytes += num_bytes; + goto out; + } - /* - * Compression can use less space than we reserved, so wake - * tickets if that happens - */ - if (num_bytes < ram_bytes) - btrfs_try_granting_tickets(cache->fs_info, space_info); + if (btrfs_block_group_should_use_size_class(cache)) { + size_class = btrfs_calc_block_group_size_class(num_bytes); + ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); + if (ret) + goto out; } + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, "space_info", + space_info->flags, num_bytes, 1); + btrfs_space_info_update_bytes_may_use(cache->fs_info, + space_info, -ram_bytes); + if (delalloc) + cache->delalloc_bytes += num_bytes; + + /* + * Compression can use less space than we reserved, so wake tickets if + * that happens. 
+ */ + if (num_bytes < ram_bytes) + btrfs_try_granting_tickets(cache->fs_info, space_info); +out: spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; @@ -4218,3 +4373,73 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount bg->swap_extents -= amount; spin_unlock(&bg->lock); } + +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) +{ + if (size <= SZ_128K) + return BTRFS_BG_SZ_SMALL; + if (size <= SZ_8M) + return BTRFS_BG_SZ_MEDIUM; + return BTRFS_BG_SZ_LARGE; +} + +/* + * Handle a block group allocating an extent in a size class + * + * @bg: The block group we allocated in. + * @size_class: The size class of the allocation. + * @force_wrong_size_class: Whether we are desperate enough to allow + * mismatched size classes. + * + * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the + * case of a race that leads to the wrong size class without + * force_wrong_size_class set. + * + * find_free_extent will skip block groups with a mismatched size class until + * it really needs to avoid ENOSPC. In that case it will set + * force_wrong_size_class. However, if a block group is newly allocated and + * doesn't yet have a size class, then it is possible for two allocations of + * different sizes to race and both try to use it. The loser is caught here and + * has to retry. + */ +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class) +{ + ASSERT(size_class != BTRFS_BG_SZ_NONE); + + /* The new allocation is in the right size class, do nothing */ + if (bg->size_class == size_class) + return 0; + /* + * The new allocation is in a mismatched size class. + * This means one of two things: + * + * 1. Two tasks in find_free_extent for different size_classes raced + * and hit the same empty block_group. Make the loser try again. + * 2. 
A call to find_free_extent got desperate enough to set + * 'force_wrong_slab'. Don't change the size_class, but allow the + * allocation. + */ + if (bg->size_class != BTRFS_BG_SZ_NONE) { + if (force_wrong_size_class) + return 0; + return -EAGAIN; + } + /* + * The happy new block group case: the new allocation is the first + * one in the block_group so we set size_class. + */ + bg->size_class = size_class; + + return 0; +} + +bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) +{ + if (btrfs_is_zoned(bg->fs_info)) + return false; + if (!btrfs_is_block_group_data_only(bg)) + return false; + return true; +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index a02ea76fd6cf..6e4a0b429ac3 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -12,6 +12,17 @@ enum btrfs_disk_cache_state { BTRFS_DC_SETUP, }; +enum btrfs_block_group_size_class { + /* Unset */ + BTRFS_BG_SZ_NONE, + /* 0 < size <= 128K */ + BTRFS_BG_SZ_SMALL, + /* 128K < size <= 8M */ + BTRFS_BG_SZ_MEDIUM, + /* 8M < size < BG_LENGTH */ + BTRFS_BG_SZ_LARGE, +}; + /* * This describes the state of the block_group for async discard. 
This is due * to the two pass nature of it where extent discarding is prioritized over @@ -233,6 +244,7 @@ struct btrfs_block_group { struct list_head active_bg_list; struct work_struct zone_finish_work; struct extent_buffer *last_eb; + enum btrfs_block_group_size_class size_class; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) @@ -302,7 +314,8 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc); int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc); + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class); void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, int delalloc); int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, @@ -315,8 +328,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - struct block_device *bdev, u64 physical, u64 **logical, - int *naddrs, int *stripe_len); + u64 physical, u64 **logical, int *naddrs, int *stripe_len); static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) { @@ -346,4 +358,10 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg); void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount); +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class); +bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); + #endif /* BTRFS_BLOCK_GROUP_H */ diff 
--git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 195c09e20609..9dc21622806e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -93,12 +93,6 @@ struct btrfs_inode { /* the io_tree does range state (DIRTY, LOCKED etc) */ struct extent_io_tree io_tree; - /* special utility tree used to record which mirrors have already been - * tried when checksums fail for a given block - */ - struct rb_root io_failure_tree; - spinlock_t io_failure_lock; - /* * Keep track of where the inode has extent items mapped in order to * make sure the i_size adjustments are accurate @@ -411,21 +405,11 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); -void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type); -void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); -blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); -blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, - struct bio *bio, - u64 dio_file_offset); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); -int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); -unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, - u64 start, u64 end); +blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio); +bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, + u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict); @@ 
-469,7 +453,7 @@ int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args); void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); -struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, +struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap, struct inode *dir); void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5122ca79f7ea..f42f31f22d13 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -141,12 +141,15 @@ static int compression_decompress(int type, struct list_head *ws, static int btrfs_decompress_bio(struct compressed_bio *cb); -static void finish_compressed_bio_read(struct compressed_bio *cb) +static void end_compressed_bio_read(struct btrfs_bio *bbio) { + struct compressed_bio *cb = bbio->private; unsigned int index; struct page *page; - if (cb->status == BLK_STS_OK) + if (bbio->bio.bi_status) + cb->status = bbio->bio.bi_status; + else cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); /* Release the compressed pages */ @@ -162,54 +165,6 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) /* Finally free the cb struct */ kfree(cb->compressed_pages); kfree(cb); -} - -/* - * Verify the checksums and kick off repair if needed on the uncompressed data - * before decompressing it into the original bio and freeing the uncompressed - * pages. 
- */ -static void end_compressed_bio_read(struct btrfs_bio *bbio) -{ - struct compressed_bio *cb = bbio->private; - struct inode *inode = cb->inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *bi = BTRFS_I(inode); - bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) && - !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); - blk_status_t status = bbio->bio.bi_status; - struct bvec_iter iter; - struct bio_vec bv; - u32 offset; - - btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { - u64 start = bbio->file_offset + offset; - - if (!status && - (!csum || !btrfs_check_data_csum(bi, bbio, offset, - bv.bv_page, bv.bv_offset))) { - btrfs_clean_io_failure(bi, start, bv.bv_page, - bv.bv_offset); - } else { - int ret; - - refcount_inc(&cb->pending_ios); - ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, - bv.bv_page, bv.bv_offset, - true); - if (ret) { - refcount_dec(&cb->pending_ios); - status = errno_to_blk_status(ret); - } - } - } - - if (status) - cb->status = status; - - if (refcount_dec_and_test(&cb->pending_ios)) - finish_compressed_bio_read(cb); - btrfs_bio_free_csum(bbio); bio_put(&bbio->bio); } @@ -303,68 +258,12 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) static void end_compressed_bio_write(struct btrfs_bio *bbio) { struct compressed_bio *cb = bbio->private; - - if (bbio->bio.bi_status) - cb->status = bbio->bio.bi_status; - - if (refcount_dec_and_test(&cb->pending_ios)) { - struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - - btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio); - queue_work(fs_info->compressed_write_workers, &cb->write_end_work); - } - bio_put(&bbio->bio); -} - -/* - * Allocate a compressed_bio, which will be used to read/write on-disk - * (aka, compressed) * data. - * - * @cb: The compressed_bio structure, which records all the needed - * information to bind the compressed data to the uncompressed - * page cache. 
- * @disk_byten: The logical bytenr where the compressed data will be read - * from or written to. - * @endio_func: The endio function to call after the IO for compressed data - * is finished. - * @next_stripe_start: Return value of logical bytenr of where next stripe starts. - * Let the caller know to only fill the bio up to the stripe - * boundary. - */ - - -static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, - blk_opf_t opf, - btrfs_bio_end_io_t endio_func, - u64 *next_stripe_start) -{ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - struct btrfs_io_geometry geom; - struct extent_map *em; - struct bio *bio; - int ret; - bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb); - bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + cb->status = bbio->bio.bi_status; + queue_work(fs_info->compressed_write_workers, &cb->write_end_work); - em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize); - if (IS_ERR(em)) { - bio_put(bio); - return ERR_CAST(em); - } - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev); - - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom); - free_extent_map(em); - if (ret < 0) { - bio_put(bio); - return ERR_PTR(ret); - } - *next_stripe_start = disk_bytenr + geom.len; - refcount_inc(&cb->pending_ios); - return bio; + bio_put(&bbio->bio); } /* @@ -389,18 +288,13 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct bio *bio = NULL; struct compressed_bio *cb; u64 cur_disk_bytenr = disk_start; - u64 next_stripe_start; blk_status_t ret = BLK_STS_OK; - int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; - const bool use_append = btrfs_use_zone_append(inode, disk_start); - const enum req_op bio_op = use_append ? 
REQ_OP_ZONE_APPEND : REQ_OP_WRITE; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; - refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = &inode->vfs_inode; cb->start = start; @@ -411,8 +305,16 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_pages = nr_pages; - if (blkcg_css) + if (blkcg_css) { kthread_associate_blkcg(blkcg_css); + write_flags |= REQ_CGROUP_PUNT; + } + + write_flags |= REQ_BTRFS_ONE_ORDERED; + bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags, + BTRFS_I(cb->inode), end_compressed_bio_write, cb); + bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT; + btrfs_bio(bio)->file_offset = start; while (cur_disk_bytenr < disk_start + compressed_len) { u64 offset = cur_disk_bytenr - disk_start; @@ -420,77 +322,30 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int real_size; unsigned int added; struct page *page = compressed_pages[index]; - bool submit = false; - - /* Allocate new bio if submitted or not yet allocated */ - if (!bio) { - bio = alloc_compressed_bio(cb, cur_disk_bytenr, - bio_op | write_flags, end_compressed_bio_write, - &next_stripe_start); - if (IS_ERR(bio)) { - ret = errno_to_blk_status(PTR_ERR(bio)); - break; - } - if (blkcg_css) - bio->bi_opf |= REQ_CGROUP_PUNT; - } - /* - * We should never reach next_stripe_start start as we will - * submit comp_bio when reach the boundary immediately. 
- */ - ASSERT(cur_disk_bytenr != next_stripe_start); /* * We have various limits on the real read size: - * - stripe boundary * - page boundary * - compressed length boundary */ - real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr); - real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); real_size = min_t(u64, real_size, compressed_len - offset); ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); - if (use_append) - added = bio_add_zone_append_page(bio, page, real_size, - offset_in_page(offset)); - else - added = bio_add_page(bio, page, real_size, - offset_in_page(offset)); - /* Reached zoned boundary */ - if (added == 0) - submit = true; - + added = bio_add_page(bio, page, real_size, offset_in_page(offset)); + /* + * Maximum compressed extent is smaller than bio size limit, + * thus bio_add_page() should always success. + */ + ASSERT(added == real_size); cur_disk_bytenr += added; - /* Reached stripe boundary */ - if (cur_disk_bytenr == next_stripe_start) - submit = true; - - /* Finished the range */ - if (cur_disk_bytenr == disk_start + compressed_len) - submit = true; - - if (submit) { - if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, start, true); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - break; - } - } - - ASSERT(bio->bi_iter.bi_size); - btrfs_submit_bio(fs_info, bio, 0); - bio = NULL; - } - cond_resched(); } + /* Finished the range. 
*/ + ASSERT(bio->bi_iter.bi_size); + btrfs_submit_bio(bio, 0); if (blkcg_css) kthread_associate_blkcg(NULL); - - if (refcount_dec_and_test(&cb->pending_ios)) - finish_compressed_bio_write(cb); return ret; } @@ -667,10 +522,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct extent_map_tree *em_tree; struct compressed_bio *cb; unsigned int compressed_len; - struct bio *comp_bio = NULL; + struct bio *comp_bio; const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 cur_disk_byte = disk_bytenr; - u64 next_stripe_start; u64 file_offset; u64 em_len; u64 em_start; @@ -703,7 +557,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, goto out; } - refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = inode; @@ -737,37 +590,23 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; + comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode), + end_compressed_bio_read, cb); + comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT); + while (cur_disk_byte < disk_bytenr + compressed_len) { u64 offset = cur_disk_byte - disk_bytenr; unsigned int index = offset >> PAGE_SHIFT; unsigned int real_size; unsigned int added; struct page *page = cb->compressed_pages[index]; - bool submit = false; - - /* Allocate new bio if submitted or not yet allocated */ - if (!comp_bio) { - comp_bio = alloc_compressed_bio(cb, cur_disk_byte, - REQ_OP_READ, end_compressed_bio_read, - &next_stripe_start); - if (IS_ERR(comp_bio)) { - cb->status = errno_to_blk_status(PTR_ERR(comp_bio)); - break; - } - } - /* - * We should never reach next_stripe_start start as we will - * submit comp_bio when reach the boundary immediately. 
- */ - ASSERT(cur_disk_byte != next_stripe_start); + /* * We have various limit on the real read size: - * - stripe boundary * - page boundary * - compressed length boundary */ - real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte); - real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); real_size = min_t(u64, real_size, compressed_len - offset); ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); @@ -778,45 +617,20 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, */ ASSERT(added == real_size); cur_disk_byte += added; - - /* Reached stripe boundary, need to submit */ - if (cur_disk_byte == next_stripe_start) - submit = true; - - /* Has finished the range, need to submit */ - if (cur_disk_byte == disk_bytenr + compressed_len) - submit = true; - - if (submit) { - /* Save the original iter for read repair */ - if (bio_op(comp_bio) == REQ_OP_READ) - btrfs_bio(comp_bio)->iter = comp_bio->bi_iter; - - /* - * Save the initial offset of this chunk, as there - * is no direct correlation between compressed pages and - * the original file offset. The field is only used for - * priting error messages. - */ - btrfs_bio(comp_bio)->file_offset = file_offset; - - ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL); - if (ret) { - btrfs_bio_end_io(btrfs_bio(comp_bio), ret); - break; - } - - ASSERT(comp_bio->bi_iter.bi_size); - btrfs_submit_bio(fs_info, comp_bio, mirror_num); - comp_bio = NULL; - } } if (memstall) psi_memstall_leave(&pflags); - if (refcount_dec_and_test(&cb->pending_ios)) - finish_compressed_bio_read(cb); + /* + * Stash the initial offset of this chunk, as there is no direct + * correlation between compressed pages and the original file offset. + * The field is only used for printing error messages anyway. 
+ */ + btrfs_bio(comp_bio)->file_offset = file_offset; + + ASSERT(comp_bio->bi_iter.bi_size); + btrfs_submit_bio(comp_bio, mirror_num); return; fail: @@ -1609,7 +1423,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, index_end = end >> PAGE_SHIFT; /* Don't miss unaligned end */ - if (!IS_ALIGNED(end, PAGE_SIZE)) + if (!PAGE_ALIGNED(end)) index_end++; curr_sample_pos = 0; @@ -1642,7 +1456,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, * * For now is's a naive and optimistic 'return true', we'll extend the logic to * quickly (compared to direct compression) detect data characteristics - * (compressible/uncompressible) to avoid wasting CPU time on uncompressible + * (compressible/incompressible) to avoid wasting CPU time on incompressible * data. * * The following types of analysis can be performed: diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 6209d40a1e08..a5e3377db9ad 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -31,9 +31,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* Number of outstanding bios */ - refcount_t pending_ios; - /* Number of compressed pages in the array */ unsigned int nr_pages; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4754c9101a4c..a5b6bb54545f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -484,7 +484,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (ret) return ret; } - btrfs_clean_tree_block(buf); + btrfs_clear_buffer_dirty(trans, buf); *last_ref = 1; } return 0; @@ -853,8 +853,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, /* * Search for a key in the given extent_buffer. * - * The lower boundary for the search is specified by the slot number @low. Use a - * value of 0 to search over the whole extent buffer. + * The lower boundary for the search is specified by the slot number @first_slot. 
+ * Use a value of 0 to search over the whole extent buffer. * * The slot in the extent buffer is returned via @slot. If the key exists in the * extent buffer, then @slot will point to the slot where the key is, otherwise @@ -863,18 +863,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, * Slot may point to the total number of items (i.e. one position beyond the last * key) if the key is bigger than the last key in the extent buffer. */ -static noinline int generic_bin_search(struct extent_buffer *eb, int low, - const struct btrfs_key *key, int *slot) +int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot) { unsigned long p; int item_size; - int high = btrfs_header_nritems(eb); + /* + * Use unsigned types for the low and high slots, so that we get a more + * efficient division in the search loop below. + */ + u32 low = first_slot; + u32 high = btrfs_header_nritems(eb); int ret; const int key_size = sizeof(struct btrfs_disk_key); - if (low > high) { + if (unlikely(low > high)) { btrfs_err(eb->fs_info, - "%s: low (%d) > high (%d) eb %llu owner %llu level %d", + "%s: low (%u) > high (%u) eb %llu owner %llu level %d", __func__, low, high, eb->start, btrfs_header_owner(eb), btrfs_header_level(eb)); return -EINVAL; @@ -925,16 +930,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb, int low, return 1; } -/* - * Simple binary search on an extent buffer. Works for both leaves and nodes, and - * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). 
- */ -int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int *slot) -{ - return generic_bin_search(eb, 0, key, slot); -} - static void root_add_used(struct btrfs_root *root, u32 size) { spin_lock(&root->accounting_lock); @@ -1054,7 +1049,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, path->locks[level] = 0; path->nodes[level] = NULL; - btrfs_clean_tree_block(mid); + btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); @@ -1115,7 +1110,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - btrfs_clean_tree_block(right); + btrfs_clear_buffer_dirty(trans, right); btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); @@ -1161,7 +1156,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - btrfs_clean_tree_block(mid); + btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); @@ -1869,7 +1864,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb, return 0; } - return generic_bin_search(eb, search_low_slot, key, slot); + return btrfs_generic_bin_search(eb, search_low_slot, key, slot); } static int search_leaf(struct btrfs_trans_handle *trans, @@ -3041,7 +3036,8 @@ noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) * min slot controls the lowest index we're willing to push to the * right. 
We'll push up to and including min_slot, but no lower */ -static noinline int __push_leaf_right(struct btrfs_path *path, +static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int data_size, int empty, struct extent_buffer *right, int free_space, u32 left_nritems, @@ -3139,7 +3135,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, if (left_nritems) btrfs_mark_buffer_dirty(left); else - btrfs_clean_tree_block(left); + btrfs_clear_buffer_dirty(trans, left); btrfs_mark_buffer_dirty(right); @@ -3151,7 +3147,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; if (btrfs_header_nritems(path->nodes[0]) == 0) - btrfs_clean_tree_block(path->nodes[0]); + btrfs_clear_buffer_dirty(trans, path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3243,8 +3239,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 0; } - return __push_leaf_right(path, min_data_size, empty, - right, free_space, left_nritems, min_slot); + return __push_leaf_right(trans, path, min_data_size, empty, right, + free_space, left_nritems, min_slot); out_unlock: btrfs_tree_unlock(right); free_extent_buffer(right); @@ -3259,7 +3255,8 @@ out_unlock: * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the * items */ -static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, +static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int data_size, int empty, struct extent_buffer *left, int free_space, u32 right_nritems, u32 max_slot) @@ -3363,7 +3360,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, if (right_nritems) btrfs_mark_buffer_dirty(right); else - btrfs_clean_tree_block(right); + btrfs_clear_buffer_dirty(trans, right); btrfs_item_key(right, &disk_key, 0); fixup_low_keys(path, &disk_key, 1); @@ -3449,9 +3446,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root ret = -EUCLEAN; goto out; } - return __push_leaf_left(path, min_data_size, - empty, left, free_space, right_nritems, - max_slot); + return __push_leaf_left(trans, path, min_data_size, empty, left, + free_space, right_nritems, max_slot); out: btrfs_tree_unlock(left); free_extent_buffer(left); @@ -4400,7 +4396,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { - btrfs_clean_tree_block(leaf); + btrfs_clear_buffer_dirty(trans, leaf); btrfs_del_leaf(trans, root, path, leaf); } } else { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6965703a81b6..97897107fab5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -507,6 +507,21 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); /* ctree.c */ int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); + +int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot); + +/* + * Simple binary search on an extent buffer. Works for both leaves and nodes, and + * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). 
+ */ +static inline int btrfs_bin_search(struct extent_buffer *eb, + const struct btrfs_key *key, + int *slot) +{ + return btrfs_generic_bin_search(eb, 0, key, slot); +} + int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index d81b764a7644..8065341d831a 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -765,7 +765,7 @@ again: break; unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); lock_page(page); /* @@ -999,7 +999,7 @@ next: } #define CLUSTER_SIZE (SZ_256K) -static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); +static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); /* * Defrag one contiguous target range. diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 573ebab886e2..886ffb232eac 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -437,8 +437,7 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, return 0; } -static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, +static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { @@ -452,8 +451,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, atomic_dec(&delayed_refs->num_entries); } -static bool merge_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, +static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) @@ -482,10 +480,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, mod = -next->ref_mod; } - drop_delayed_ref(trans, delayed_refs, head, next); + drop_delayed_ref(delayed_refs, head, next); 
ref->ref_mod += mod; if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, head, ref); + drop_delayed_ref(delayed_refs, head, ref); done = true; } else { /* @@ -499,11 +497,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, return done; } -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, +void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_node *ref; struct rb_node *node; u64 seq = 0; @@ -524,7 +521,7 @@ again: ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; - if (merge_ref(trans, delayed_refs, head, ref, seq)) + if (merge_ref(delayed_refs, head, ref, seq)) goto again; } } @@ -601,8 +598,7 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, * Return 0 for insert. * Return >0 for merge. */ -static int insert_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *root, +static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *ref) { @@ -641,7 +637,7 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans, /* remove existing tail if its ref_mod is zero */ if (exist->ref_mod == 0) - drop_delayed_ref(trans, root, href, exist); + drop_delayed_ref(root, href, exist); spin_unlock(&href->lock); return ret; inserted: @@ -978,7 +974,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); + ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -1070,7 +1066,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, 
record, action, &qrecord_inserted); - ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); + ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d6304b690ec4..2eb34abf700f 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -357,7 +357,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, +void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index ff2e524d9937..317aeff6c1da 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -78,6 +78,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { + lockdep_assert_held(&discard_ctl->lock); if (!btrfs_run_discard_work(discard_ctl)) return; @@ -89,6 +90,8 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, BTRFS_DISCARD_DELAY); block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; } + if (list_empty(&block_group->discard_list)) + btrfs_get_block_group(block_group); list_move_tail(&block_group->discard_list, get_discard_list(discard_ctl, block_group)); @@ -108,8 +111,12 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { + bool queued; + spin_lock(&discard_ctl->lock); + queued = !list_empty(&block_group->discard_list); + if (!btrfs_run_discard_work(discard_ctl)) { spin_unlock(&discard_ctl->lock); return; @@ -121,6 
+128,8 @@ static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, block_group->discard_eligible_time = (ktime_get_ns() + BTRFS_DISCARD_UNUSED_DELAY); block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; + if (!queued) + btrfs_get_block_group(block_group); list_add_tail(&block_group->discard_list, &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); @@ -131,6 +140,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { bool running = false; + bool queued = false; spin_lock(&discard_ctl->lock); @@ -140,7 +150,16 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, } block_group->discard_eligible_time = 0; + queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); + /* + * If the block group is currently running in the discard workfn, we + * don't want to deref it, since it's still being used by the workfn. + * The workfn will notice this case and deref the block group when it is + * finished. 
+ */ + if (queued && !running) + btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); @@ -214,10 +233,12 @@ again: if (block_group && now >= block_group->discard_eligible_time) { if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && block_group->used != 0) { - if (btrfs_is_block_group_data_only(block_group)) + if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); - else + } else { list_del_init(&block_group->discard_list); + btrfs_put_block_group(block_group); + } goto again; } if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { @@ -511,6 +532,15 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; + /* + * If the block group was removed from the discard list while it was + * running in this workfn, then we didn't deref it, since this function + * still owned that reference. But we set the discard_ctl->block_group + * back to NULL, so we can use that condition to know that now we need + * to deref the block_group. + */ + if (discard_ctl->block_group == NULL) + btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); @@ -651,8 +681,12 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, bg_list) { list_del_init(&block_group->bg_list); - btrfs_put_block_group(block_group); btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); + /* + * This put is for the get done by btrfs_mark_bg_unused. + * Queueing discard incremented it for discard's reference. 
+ */ + btrfs_put_block_group(block_group); } spin_unlock(&fs_info->unused_bgs_lock); } @@ -683,6 +717,7 @@ static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) if (block_group->used == 0) btrfs_mark_bg_unused(block_group); spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); } } spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3aa04224315e..b53f0e30ce2b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -79,23 +79,6 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) } /* - * async submit bios are used to offload expensive checksumming - * onto the worker threads. They checksum file and metadata bios - * just before they are sent down the IO stack. - */ -struct async_submit_bio { - struct btrfs_inode *inode; - struct bio *bio; - enum btrfs_wq_submit_cmd submit_cmd; - int mirror_num; - - /* Optional parameter for used by direct io */ - u64 dio_file_offset; - struct btrfs_work work; - blk_status_t status; -}; - -/* * Compute the csum of a btree block and store the result to provided buffer. 
*/ static void csum_tree_block(struct extent_buffer *buf, u8 *result) @@ -455,6 +438,22 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec return csum_one_extent_buffer(eb); } +blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) +{ + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct bvec_iter iter; + struct bio_vec bv; + int ret = 0; + + bio_for_each_segment(bv, &bbio->bio, iter) { + ret = csum_dirty_buffer(fs_info, &bv); + if (ret) + break; + } + + return errno_to_blk_status(ret); +} + static int check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; @@ -700,172 +699,6 @@ err: return ret; } -static void run_one_async_start(struct btrfs_work *work) -{ - struct async_submit_bio *async; - blk_status_t ret; - - async = container_of(work, struct async_submit_bio, work); - switch (async->submit_cmd) { - case WQ_SUBMIT_METADATA: - ret = btree_submit_bio_start(async->bio); - break; - case WQ_SUBMIT_DATA: - ret = btrfs_submit_bio_start(async->inode, async->bio); - break; - case WQ_SUBMIT_DATA_DIO: - ret = btrfs_submit_bio_start_direct_io(async->inode, - async->bio, async->dio_file_offset); - break; - } - if (ret) - async->status = ret; -} - -/* - * In order to insert checksums into the metadata in large chunks, we wait - * until bio submission time. All the pages in the bio are checksummed and - * sums are attached onto the ordered extent record. - * - * At IO completion time the csums attached on the ordered extent record are - * inserted into the tree. 
- */ -static void run_one_async_done(struct btrfs_work *work) -{ - struct async_submit_bio *async = - container_of(work, struct async_submit_bio, work); - struct btrfs_inode *inode = async->inode; - struct btrfs_bio *bbio = btrfs_bio(async->bio); - - /* If an error occurred we just want to clean up the bio and move on */ - if (async->status) { - btrfs_bio_end_io(bbio, async->status); - return; - } - - /* - * All of the bios that pass through here are from async helpers. - * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. - * This changes nothing when cgroups aren't in use. - */ - async->bio->bi_opf |= REQ_CGROUP_PUNT; - btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num); -} - -static void run_one_async_free(struct btrfs_work *work) -{ - struct async_submit_bio *async; - - async = container_of(work, struct async_submit_bio, work); - kfree(async); -} - -/* - * Submit bio to an async queue. - * - * Retrun: - * - true if the work has been succesfuly submitted - * - false in case of error - */ -bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct async_submit_bio *async; - - async = kmalloc(sizeof(*async), GFP_NOFS); - if (!async) - return false; - - async->inode = inode; - async->bio = bio; - async->mirror_num = mirror_num; - async->submit_cmd = cmd; - - btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, - run_one_async_free); - - async->dio_file_offset = dio_file_offset; - - async->status = 0; - - if (op_is_sync(bio->bi_opf)) - btrfs_queue_work(fs_info->hipri_workers, &async->work); - else - btrfs_queue_work(fs_info->workers, &async->work); - return true; -} - -static blk_status_t btree_csum_one_bio(struct bio *bio) -{ - struct bio_vec *bvec; - struct btrfs_root *root; - int ret = 0; - struct bvec_iter_all iter_all; - - ASSERT(!bio_flagged(bio, BIO_CLONED)); - 
bio_for_each_segment_all(bvec, bio, iter_all) { - root = BTRFS_I(bvec->bv_page->mapping->host)->root; - ret = csum_dirty_buffer(root->fs_info, bvec); - if (ret) - break; - } - - return errno_to_blk_status(ret); -} - -blk_status_t btree_submit_bio_start(struct bio *bio) -{ - /* - * when we're called for a write, we're already in the async - * submission context. Just jump into btrfs_submit_bio. - */ - return btree_csum_one_bio(bio); -} - -static bool should_async_write(struct btrfs_fs_info *fs_info, - struct btrfs_inode *bi) -{ - if (btrfs_is_zoned(fs_info)) - return false; - if (atomic_read(&bi->sync_writers)) - return false; - if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) - return false; - return true; -} - -void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_bio *bbio = btrfs_bio(bio); - blk_status_t ret; - - bio->bi_opf |= REQ_META; - bbio->is_metadata = 1; - - if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - btrfs_submit_bio(fs_info, bio, mirror_num); - return; - } - - /* - * Kthread helpers are used to submit writes so that checksumming can - * happen in parallel across all CPUs. 
- */ - if (should_async_write(fs_info, inode) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) - return; - - ret = btree_csum_one_bio(bio); - if (ret) { - btrfs_bio_end_io(bbio, ret); - return; - } - - btrfs_submit_bio(fs_info, bio, mirror_num); -} - #ifdef CONFIG_MIGRATION static int btree_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -1035,22 +868,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, } -void btrfs_clean_tree_block(struct extent_buffer *buf) -{ - struct btrfs_fs_info *fs_info = buf->fs_info; - if (btrfs_header_generation(buf) == - fs_info->running_transaction->transid) { - btrfs_assert_tree_write_locked(buf); - - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, - -buf->len, - fs_info->dirty_metadata_batch); - clear_extent_buffer_dirty(buf); - } - } -} - static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { @@ -1910,6 +1727,9 @@ static int cleaner_kthread(void *arg) goto sleep; } + if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags)) + btrfs_sysfs_feature_update(fs_info); + btrfs_run_delayed_iputs(fs_info); again = btrfs_clean_one_deleted_snapshot(fs_info); @@ -5159,11 +4979,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, start += fs_info->nodesize; if (!eb) continue; + + btrfs_tree_lock(eb); wait_on_extent_buffer_writeback(eb); + btrfs_clear_buffer_dirty(NULL, eb); + btrfs_tree_unlock(eb); - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, - &eb->bflags)) - clear_extent_buffer_dirty(eb); free_extent_buffer_stale(eb); } } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index f2f295eb6103..4d5772330110 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -39,7 +39,8 @@ struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 
owner_root, int level); -void btrfs_clean_tree_block(struct extent_buffer *buf); +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf); void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, @@ -86,7 +87,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); -void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -114,15 +114,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int btrfs_read_extent_buffer(struct extent_buffer *buf, struct btrfs_tree_parent_check *check); -enum btrfs_wq_submit_cmd { - WQ_SUBMIT_METADATA, - WQ_SUBMIT_DATA, - WQ_SUBMIT_DATA_DIO, -}; - -bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); -blk_status_t btree_submit_bio_start(struct bio *bio); +blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 3c7766dfaa69..29a225836e28 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -972,8 +972,8 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, { struct extent_state *state; struct extent_state *prealloc = NULL; - struct rb_node **p; - struct rb_node *parent; + struct rb_node **p = NULL; + struct rb_node *parent = NULL; int err = 0; u64 last_start; u64 last_end; @@ -1218,8 +1218,8 @@ int convert_extent_bit(struct 
extent_io_tree *tree, u64 start, u64 end, { struct extent_state *state; struct extent_state *prealloc = NULL; - struct rb_node **p; - struct rb_node *parent; + struct rb_node **p = NULL; + struct rb_node *parent = NULL; int err = 0; u64 last_start; u64 last_end; @@ -1625,7 +1625,7 @@ search: } /* - * Searche a range in the state tree for a given mask. If 'filled' == 1, this + * Search a range in the state tree for a given mask. If 'filled' == 1, this * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 * is returned if any bit in the range is found set. */ diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index e3eeec380844..21766e49ec02 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -6,7 +6,6 @@ #include "misc.h" struct extent_changeset; -struct io_failure_record; /* Bits for the extent state */ enum { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 72ba13b027a9..824c657f59e8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -16,7 +16,8 @@ #include <linux/percpu_counter.h> #include <linux/lockdep.h> #include <linux/crc32c.h> -#include "misc.h" +#include "ctree.h" +#include "extent-tree.h" #include "tree-log.h" #include "disk-io.h" #include "print-tree.h" @@ -31,14 +32,12 @@ #include "space-info.h" #include "block-rsv.h" #include "delalloc-space.h" -#include "block-group.h" #include "discard.h" #include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" #include "fs.h" #include "accessors.h" -#include "extent-tree.h" #include "root-tree.h" #include "file-item.h" #include "orphan.h" @@ -1966,7 +1965,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, cond_resched(); spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); } return 0; @@ -2013,7 +2012,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle 
*trans, * insert_inline_extent_backref()). */ spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &actual_count); @@ -3385,7 +3384,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) enum btrfs_loop_type { LOOP_CACHING_NOWAIT, LOOP_CACHING_WAIT, + LOOP_UNSET_SIZE_CLASS, LOOP_ALLOC_CHUNK, + LOOP_WRONG_SIZE_CLASS, LOOP_NO_EMPTY_SIZE, }; @@ -3453,81 +3454,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, btrfs_put_block_group(cache); } -enum btrfs_extent_allocation_policy { - BTRFS_EXTENT_ALLOC_CLUSTERED, - BTRFS_EXTENT_ALLOC_ZONED, -}; - -/* - * Structure used internally for find_free_extent() function. Wraps needed - * parameters. - */ -struct find_free_extent_ctl { - /* Basic allocation info */ - u64 ram_bytes; - u64 num_bytes; - u64 min_alloc_size; - u64 empty_size; - u64 flags; - int delalloc; - - /* Where to start the search inside the bg */ - u64 search_start; - - /* For clustered allocation */ - u64 empty_cluster; - struct btrfs_free_cluster *last_ptr; - bool use_cluster; - - bool have_caching_bg; - bool orig_have_caching_bg; - - /* Allocation is called for tree-log */ - bool for_treelog; - - /* Allocation is called for data relocation */ - bool for_data_reloc; - - /* RAID index, converted from flags */ - int index; - - /* - * Current loop number, check find_free_extent_update_loop() for details - */ - int loop; - - /* - * Whether we're refilling a cluster, if true we need to re-search - * current block group but don't try to refill the cluster again. - */ - bool retry_clustered; - - /* - * Whether we're updating free space cache, if true we need to re-search - * current block group but don't try updating free space cache again. 
- */ - bool retry_unclustered; - - /* If current block group is cached */ - int cached; - - /* Max contiguous hole found */ - u64 max_extent_size; - - /* Total free space from free space cache, not always contiguous */ - u64 total_free_space; - - /* Found result */ - u64 found_offset; - - /* Hint where to start looking for an empty space */ - u64 hint_byte; - - /* Allocation policy */ - enum btrfs_extent_allocation_policy policy; -}; - - /* * Helper function for find_free_extent(). * @@ -3559,8 +3485,7 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, if (offset) { /* We have a block, we're done */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(cluster_bg, - ffe_ctl->search_start, ffe_ctl->num_bytes); + trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); *cluster_bg_ret = cluster_bg; ffe_ctl->found_offset = offset; return 0; @@ -3610,10 +3535,8 @@ refill_cluster: if (offset) { /* We found one, proceed */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(bg, - ffe_ctl->search_start, - ffe_ctl->num_bytes); ffe_ctl->found_offset = offset; + trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); return 0; } } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && @@ -4028,24 +3951,6 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, } } -static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) -{ - switch (ffe_ctl->policy) { - case BTRFS_EXTENT_ALLOC_CLUSTERED: - /* - * If we can't allocate a new chunk we've already looped through - * at least once, move on to the NO_EMPTY_SIZE case. - */ - ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; - return 0; - case BTRFS_EXTENT_ALLOC_ZONED: - /* Give up here */ - return -ENOSPC; - default: - BUG(); - } -} - /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. 
@@ -4079,31 +3984,28 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_UNSET_SIZE_CLASS, allow unset size class * LOOP_ALLOC_CHUNK, force a chunk allocation and try again * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try * again */ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { ffe_ctl->index = 0; - if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { - /* - * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any uncached bgs and we've already done a - * full search through. - */ - if (ffe_ctl->orig_have_caching_bg || !full_search) - ffe_ctl->loop = LOOP_CACHING_WAIT; - else - ffe_ctl->loop = LOOP_ALLOC_CHUNK; - } else { + /* + * We want to skip the LOOP_CACHING_WAIT step if we don't have + * any uncached bgs and we've already done a full search + * through. + */ + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && + (!ffe_ctl->orig_have_caching_bg && full_search)) ffe_ctl->loop++; - } + ffe_ctl->loop++; if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; - /*Check if allocation policy allows to create a new chunk */ + /* Check if allocation policy allows to create a new chunk */ ret = can_allocate_chunk(fs_info, ffe_ctl); if (ret) return ret; @@ -4123,8 +4025,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. 
*/ - if (ret == -ENOSPC) - ret = chunk_allocation_failed(ffe_ctl); + if (ret == -ENOSPC) { + ret = 0; + ffe_ctl->loop++; + } else if (ret < 0) btrfs_abort_transaction(trans, ret); else @@ -4154,6 +4058,21 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return -ENOSPC; } +static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group *bg) +{ + if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) + return true; + if (!btrfs_block_group_should_use_size_class(bg)) + return true; + if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) + return true; + if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && + bg->size_class == BTRFS_BG_SZ_NONE) + return true; + return ffe_ctl->size_class == bg->size_class; +} + static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4288,6 +4207,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl->total_free_space = 0; ffe_ctl->found_offset = 0; ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); if (btrfs_is_zoned(fs_info)) ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; @@ -4296,8 +4216,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size, - ffe_ctl->flags); + trace_find_free_extent(root, ffe_ctl); space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); if (!space_info) { @@ -4340,6 +4259,7 @@ static noinline int find_free_extent(struct btrfs_root *root, block_group->flags); btrfs_lock_block_group(block_group, ffe_ctl->delalloc); + ffe_ctl->hinted = true; goto have_block_group; } } else if (block_group) { @@ -4347,6 +4267,7 @@ static noinline int find_free_extent(struct btrfs_root *root, } } search: + trace_find_free_extent_search_loop(root, ffe_ctl); ffe_ctl->have_caching_bg = 
false; if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || ffe_ctl->index == 0) @@ -4356,6 +4277,7 @@ search: &space_info->block_groups[ffe_ctl->index], list) { struct btrfs_block_group *bg_ret; + ffe_ctl->hinted = false; /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) { if (ffe_ctl->for_treelog) @@ -4397,6 +4319,7 @@ search: } have_block_group: + trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; @@ -4421,6 +4344,9 @@ have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) goto loop; + if (!find_free_extent_check_size_class(ffe_ctl, block_group)) + goto loop; + bg_ret = NULL; ret = do_allocation(block_group, ffe_ctl, &bg_ret); if (ret == 0) { @@ -4455,7 +4381,8 @@ have_block_group: ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, ffe_ctl->num_bytes, - ffe_ctl->delalloc); + ffe_ctl->delalloc, + ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); if (ret == -EAGAIN) { btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, @@ -4468,8 +4395,7 @@ have_block_group: ins->objectid = ffe_ctl->search_start; ins->offset = ffe_ctl->num_bytes; - trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start, - ffe_ctl->num_bytes); + trace_btrfs_reserve_extent(block_group, ffe_ctl); btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: @@ -4912,7 +4838,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); __btrfs_tree_lock(buf, nest); - btrfs_clean_tree_block(buf); + btrfs_clear_buffer_dirty(trans, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); @@ -5542,13 +5468,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, } } } - /* make block locked assertion in 
btrfs_clean_tree_block happy */ - if (!path->locks[level] && - btrfs_header_generation(eb) == trans->transid) { + /* Make block locked assertion in btrfs_clear_buffer_dirty happy. */ + if (!path->locks[level]) { btrfs_tree_lock(eb); path->locks[level] = BTRFS_WRITE_LOCK; } - btrfs_clean_tree_block(eb); + btrfs_clear_buffer_dirty(trans, eb); } if (eb == root->node) { diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index ae5425253603..0c958fc1b3b8 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -3,6 +3,87 @@ #ifndef BTRFS_EXTENT_TREE_H #define BTRFS_EXTENT_TREE_H +#include "misc.h" +#include "block-group.h" + +struct btrfs_free_cluster; + +enum btrfs_extent_allocation_policy { + BTRFS_EXTENT_ALLOC_CLUSTERED, + BTRFS_EXTENT_ALLOC_ZONED, +}; + +struct find_free_extent_ctl { + /* Basic allocation info */ + u64 ram_bytes; + u64 num_bytes; + u64 min_alloc_size; + u64 empty_size; + u64 flags; + int delalloc; + + /* Where to start the search inside the bg */ + u64 search_start; + + /* For clustered allocation */ + u64 empty_cluster; + struct btrfs_free_cluster *last_ptr; + bool use_cluster; + + bool have_caching_bg; + bool orig_have_caching_bg; + + /* Allocation is called for tree-log */ + bool for_treelog; + + /* Allocation is called for data relocation */ + bool for_data_reloc; + + /* RAID index, converted from flags */ + int index; + + /* + * Current loop number, check find_free_extent_update_loop() for details + */ + int loop; + + /* + * Whether we're refilling a cluster, if true we need to re-search + * current block group but don't try to refill the cluster again. + */ + bool retry_clustered; + + /* + * Whether we're updating free space cache, if true we need to re-search + * current block group but don't try updating free space cache again. 
+ */ + bool retry_unclustered; + + /* If current block group is cached */ + int cached; + + /* Max contiguous hole found */ + u64 max_extent_size; + + /* Total free space from free space cache, not always contiguous */ + u64 total_free_space; + + /* Found result */ + u64 found_offset; + + /* Hint where to start looking for an empty space */ + u64 hint_byte; + + /* Allocation policy */ + enum btrfs_extent_allocation_policy policy; + + /* Whether or not the allocator is currently following a hint */ + bool hinted; + + /* Size class of block groups to prefer in early loops */ + enum btrfs_block_group_size_class size_class; +}; + enum btrfs_inline_ref_type { BTRFS_REF_TYPE_INVALID, BTRFS_REF_TYPE_BLOCK, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9bd32daa9b9a..c25fa74d7615 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -36,6 +36,7 @@ #include "file.h" #include "dev-replace.h" #include "super.h" +#include "transaction.h" static struct kmem_cache *extent_buffer_cache; @@ -99,7 +100,6 @@ struct btrfs_bio_ctrl { struct bio *bio; int mirror_num; enum btrfs_compression_type compress_type; - u32 len_to_stripe_boundary; u32 len_to_oe_boundary; btrfs_bio_end_io_t end_io_func; @@ -126,7 +126,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct bio *bio; struct bio_vec *bv; - struct btrfs_inode *inode; + struct inode *inode; int mirror_num; if (!bio_ctrl->bio) @@ -134,15 +134,13 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio = bio_ctrl->bio; bv = bio_first_bvec_all(bio); - inode = BTRFS_I(bv->bv_page->mapping->host); + inode = bv->bv_page->mapping->host; mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); - btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; - - if (!is_data_inode(&inode->vfs_inode)) { + if (!is_data_inode(inode)) { if (btrfs_op(bio) != BTRFS_MAP_WRITE) { /* * For metadata read, we should 
have the parent_check, @@ -153,14 +151,15 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio_ctrl->parent_check, sizeof(struct btrfs_tree_parent_check)); } - btrfs_submit_metadata_bio(inode, bio, mirror_num); - } else if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - btrfs_submit_data_write_bio(inode, bio, mirror_num); - } else { - btrfs_submit_data_read_bio(inode, bio, mirror_num, - bio_ctrl->compress_type); + bio->bi_opf |= REQ_META; } + if (btrfs_op(bio) == BTRFS_MAP_READ && + bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) + btrfs_submit_compressed_read(inode, bio, mirror_num); + else + btrfs_submit_bio(bio, mirror_num); + /* The bio is owned by the end_io handler now */ bio_ctrl->bio = NULL; } @@ -515,266 +514,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, start, end, page_ops, NULL); } -static int insert_failrec(struct btrfs_inode *inode, - struct io_failure_record *failrec) -{ - struct rb_node *exist; - - spin_lock(&inode->io_failure_lock); - exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr, - &failrec->rb_node); - spin_unlock(&inode->io_failure_lock); - - return (exist == NULL) ? 
0 : -EEXIST; -} - -static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start) -{ - struct rb_node *node; - struct io_failure_record *failrec = ERR_PTR(-ENOENT); - - spin_lock(&inode->io_failure_lock); - node = rb_simple_search(&inode->io_failure_tree, start); - if (node) - failrec = rb_entry(node, struct io_failure_record, rb_node); - spin_unlock(&inode->io_failure_lock); - return failrec; -} - -static void free_io_failure(struct btrfs_inode *inode, - struct io_failure_record *rec) -{ - spin_lock(&inode->io_failure_lock); - rb_erase(&rec->rb_node, &inode->io_failure_tree); - spin_unlock(&inode->io_failure_lock); - - kfree(rec); -} - -static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) -{ - if (cur_mirror == failrec->num_copies) - return cur_mirror + 1 - failrec->num_copies; - return cur_mirror + 1; -} - -static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) -{ - if (cur_mirror == 1) - return failrec->num_copies; - return cur_mirror - 1; -} - -/* - * each time an IO finishes, we do a fast check in the IO failure tree - * to see if we need to process or clean up an io_failure_record - */ -int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, - struct page *page, unsigned int pg_offset) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_io_tree *io_tree = &inode->io_tree; - u64 ino = btrfs_ino(inode); - u64 locked_start, locked_end; - struct io_failure_record *failrec; - int mirror; - int ret; - - failrec = get_failrec(inode, start); - if (IS_ERR(failrec)) - return 0; - - BUG_ON(!failrec->this_mirror); - - if (sb_rdonly(fs_info->sb)) - goto out; - - ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start, - &locked_end, EXTENT_LOCKED, NULL); - if (ret || locked_start > failrec->bytenr || - locked_end < failrec->bytenr + failrec->len - 1) - goto out; - - mirror = failrec->this_mirror; - do { - mirror = prev_mirror(failrec, mirror); - 
btrfs_repair_io_failure(fs_info, ino, start, failrec->len, - failrec->logical, page, pg_offset, mirror); - } while (mirror != failrec->failed_mirror); - -out: - free_io_failure(inode, failrec); - return 0; -} - -/* - * Can be called when - * - hold extent lock - * - under ordered extent - * - the inode is freeing - */ -void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) -{ - struct io_failure_record *failrec; - struct rb_node *node, *next; - - if (RB_EMPTY_ROOT(&inode->io_failure_tree)) - return; - - spin_lock(&inode->io_failure_lock); - node = rb_simple_search_first(&inode->io_failure_tree, start); - while (node) { - failrec = rb_entry(node, struct io_failure_record, rb_node); - if (failrec->bytenr > end) - break; - - next = rb_next(node); - rb_erase(&failrec->rb_node, &inode->io_failure_tree); - kfree(failrec); - - node = next; - } - spin_unlock(&inode->io_failure_lock); -} - -static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, - struct btrfs_bio *bbio, - unsigned int bio_offset) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - u64 start = bbio->file_offset + bio_offset; - struct io_failure_record *failrec; - const u32 sectorsize = fs_info->sectorsize; - int ret; - - failrec = get_failrec(BTRFS_I(inode), start); - if (!IS_ERR(failrec)) { - btrfs_debug(fs_info, - "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", - failrec->logical, failrec->bytenr, failrec->len); - /* - * when data can be on disk more than twice, add to failrec here - * (e.g. with a list for failed_mirror) to make - * clean_io_failure() clean all those errors at once. 
- */ - ASSERT(failrec->this_mirror == bbio->mirror_num); - ASSERT(failrec->len == fs_info->sectorsize); - return failrec; - } - - failrec = kzalloc(sizeof(*failrec), GFP_NOFS); - if (!failrec) - return ERR_PTR(-ENOMEM); - - RB_CLEAR_NODE(&failrec->rb_node); - failrec->bytenr = start; - failrec->len = sectorsize; - failrec->failed_mirror = bbio->mirror_num; - failrec->this_mirror = bbio->mirror_num; - failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; - - btrfs_debug(fs_info, - "new io failure record logical %llu start %llu", - failrec->logical, start); - - failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); - if (failrec->num_copies == 1) { - /* - * We only have a single copy of the data, so don't bother with - * all the retry and error correction code that follows. No - * matter what the error is, it is very likely to persist. - */ - btrfs_debug(fs_info, - "cannot repair logical %llu num_copies %d", - failrec->logical, failrec->num_copies); - kfree(failrec); - return ERR_PTR(-EIO); - } - - /* Set the bits in the private failure tree */ - ret = insert_failrec(BTRFS_I(inode), failrec); - if (ret) { - kfree(failrec); - return ERR_PTR(ret); - } - - return failrec; -} - -int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, - u32 bio_offset, struct page *page, unsigned int pgoff, - bool submit_buffered) -{ - u64 start = failed_bbio->file_offset + bio_offset; - struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct bio *failed_bio = &failed_bbio->bio; - const int icsum = bio_offset >> fs_info->sectorsize_bits; - struct bio *repair_bio; - struct btrfs_bio *repair_bbio; - - btrfs_debug(fs_info, - "repair read error: read error at %llu", start); - - BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - - failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset); - if (IS_ERR(failrec)) - return PTR_ERR(failrec); - - /* - * There are 
two premises: - * a) deliver good data to the caller - * b) correct the bad sectors on disk - * - * Since we're only doing repair for one sector, we only need to get - * a good copy of the failed sector and if we succeed, we have setup - * everything for btrfs_repair_io_failure to do the rest for us. - */ - failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); - if (failrec->this_mirror == failrec->failed_mirror) { - btrfs_debug(fs_info, - "failed to repair num_copies %d this_mirror %d failed_mirror %d", - failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); - free_io_failure(inode, failrec); - return -EIO; - } - - repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io, - failed_bbio->private); - repair_bbio = btrfs_bio(repair_bio); - repair_bbio->file_offset = start; - repair_bio->bi_iter.bi_sector = failrec->logical >> 9; - - if (failed_bbio->csum) { - const u32 csum_size = fs_info->csum_size; - - repair_bbio->csum = repair_bbio->csum_inline; - memcpy(repair_bbio->csum, - failed_bbio->csum + csum_size * icsum, csum_size); - } - - bio_add_page(repair_bio, page, failrec->len, pgoff); - repair_bbio->iter = repair_bio->bi_iter; - - btrfs_debug(fs_info, - "repair read error: submitting new read to mirror %d", - failrec->this_mirror); - - /* - * At this point we have a bio, so any errors from bio submission will - * be handled by the endio on the repair_bio, so we can't return an - * error here. 
- */ - if (submit_buffered) - btrfs_submit_data_read_bio(inode, repair_bio, - failrec->this_mirror, 0); - else - btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); - - return BLK_STS_OK; -} - static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); @@ -803,79 +542,6 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_subpage_end_reader(fs_info, page, start, len); } -static void end_sector_io(struct page *page, u64 offset, bool uptodate) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - const u32 sectorsize = inode->root->fs_info->sectorsize; - - end_page_read(page, uptodate, offset, sectorsize); - unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL); -} - -static void submit_data_read_repair(struct inode *inode, - struct btrfs_bio *failed_bbio, - u32 bio_offset, const struct bio_vec *bvec, - unsigned int error_bitmap) -{ - const unsigned int pgoff = bvec->bv_offset; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct page *page = bvec->bv_page; - const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; - const u64 end = start + bvec->bv_len - 1; - const u32 sectorsize = fs_info->sectorsize; - const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; - int i; - - BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); - - /* This repair is only for data */ - ASSERT(is_data_inode(inode)); - - /* We're here because we had some read errors or csum mismatch */ - ASSERT(error_bitmap); - - /* - * We only get called on buffered IO, thus page must be mapped and bio - * must not be cloned. 
- */ - ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); - - /* Iterate through all the sectors in the range */ - for (i = 0; i < nr_bits; i++) { - const unsigned int offset = i * sectorsize; - bool uptodate = false; - int ret; - - if (!(error_bitmap & (1U << i))) { - /* - * This sector has no error, just end the page read - * and unlock the range. - */ - uptodate = true; - goto next; - } - - ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio, - bio_offset + offset, page, pgoff + offset, - true); - if (!ret) { - /* - * We have submitted the read repair, the page release - * will be handled by the endio function of the - * submitted repair bio. - * Thus we don't need to do any thing here. - */ - continue; - } - /* - * Continue on failed repair, otherwise the remaining sectors - * will not be properly unlocked. - */ -next: - end_sector_io(page, start + offset, uptodate); - } -} - /* lots and lots of room for performance fixes in the end_bio funcs */ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) @@ -919,7 +585,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) u64 start; u64 end; struct bvec_iter_all iter_all; - bool first_bvec = true; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { @@ -941,11 +606,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) start = page_offset(page) + bvec->bv_offset; end = start + bvec->bv_len - 1; - if (first_bvec) { - btrfs_record_physical_zoned(inode, start, bio); - first_bvec = false; - } - end_extent_writepage(page, error, start, end); btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); @@ -1093,8 +753,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; - unsigned int error_bitmap = (unsigned int)-1; - bool repair = false; u64 start; u64 end; u32 len; 
@@ -1126,25 +784,14 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) len = bvec->bv_len; mirror = bbio->mirror_num; - if (likely(uptodate)) { - if (is_data_inode(inode)) { - error_bitmap = btrfs_verify_data_csum(bbio, - bio_offset, page, start, end); - if (error_bitmap) - uptodate = false; - } else { - if (btrfs_validate_metadata_buffer(bbio, - page, start, end, mirror)) - uptodate = false; - } - } + if (uptodate && !is_data_inode(inode) && + btrfs_validate_metadata_buffer(bbio, page, start, end, mirror)) + uptodate = false; if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; - btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0); - /* * Zero out the remaining part if this range straddles * i_size. @@ -1161,19 +808,7 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) zero_user_segment(page, zero_start, offset_in_page(end) + 1); } - } else if (is_data_inode(inode)) { - /* - * Only try to repair bios that actually made it to a - * device. If the bio failed to be submitted mirror - * is 0 and we need to fail it without retrying. - * - * This also includes the high level bios for compressed - * extents - these never make it to a device and repair - * is already handled on the lower compressed bio. - */ - if (mirror > 0) - repair = true; - } else { + } else if (!is_data_inode(inode)) { struct extent_buffer *eb; eb = find_extent_buffer_readpage(fs_info, page, start); @@ -1182,19 +817,10 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) atomic_dec(&eb->io_pages); } - if (repair) { - /* - * submit_data_read_repair() will handle all the good - * and bad sectors, we just continue to the next bvec. 
- */ - submit_data_read_repair(inode, bbio, bio_offset, bvec, - error_bitmap); - } else { - /* Update page status and unlock */ - end_page_read(page, uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, PageUptodate(page)); - } + /* Update page status and unlock. */ + end_page_read(page, uptodate, start, len); + endio_readpage_release_extent(&processed, BTRFS_I(inode), + start, end, PageUptodate(page)); ASSERT(bio_offset + len > bio_offset); bio_offset += len; @@ -1202,7 +828,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); - btrfs_bio_free_csum(bbio); bio_put(bio); } @@ -1270,11 +895,10 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, u32 real_size; const sector_t sector = disk_bytenr >> SECTOR_SHIFT; bool contig = false; - int ret; ASSERT(bio); /* The limit should be calculated when bio_ctrl->bio is allocated */ - ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); + ASSERT(bio_ctrl->len_to_oe_boundary); if (bio_ctrl->compress_type != compress_type) return 0; @@ -1310,9 +934,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, if (!contig) return 0; - real_size = min(bio_ctrl->len_to_oe_boundary, - bio_ctrl->len_to_stripe_boundary) - bio_size; - real_size = min(real_size, size); + real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size); /* * If real_size is 0, never call bio_add_*_page(), as even size is 0, @@ -1321,82 +943,45 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, if (real_size == 0) return 0; - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - ret = bio_add_zone_append_page(bio, page, real_size, pg_offset); - else - ret = bio_add_page(bio, page, real_size, pg_offset); - - return ret; + return bio_add_page(bio, page, real_size, pg_offset); } -static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, - struct btrfs_inode *inode, 
u64 file_offset) +static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, + struct btrfs_inode *inode, u64 file_offset) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_io_geometry geom; struct btrfs_ordered_extent *ordered; - struct extent_map *em; - u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT); - int ret; /* - * Pages for compressed extent are never submitted to disk directly, - * thus it has no real boundary, just set them to U32_MAX. - * - * The split happens for real compressed bio, which happens in - * btrfs_submit_compressed_read/write(). + * Limit the extent to the ordered boundary for Zone Append. + * Compressed bios aren't submitted directly, so it doesn't apply to + * them. */ - if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { - bio_ctrl->len_to_oe_boundary = U32_MAX; - bio_ctrl->len_to_stripe_boundary = U32_MAX; - return 0; - } - em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); - if (IS_ERR(em)) - return PTR_ERR(em); - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio), - logical, &geom); - free_extent_map(em); - if (ret < 0) { - return ret; - } - if (geom.len > U32_MAX) - bio_ctrl->len_to_stripe_boundary = U32_MAX; - else - bio_ctrl->len_to_stripe_boundary = (u32)geom.len; - - if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { - bio_ctrl->len_to_oe_boundary = U32_MAX; - return 0; - } - - /* Ordered extent not yet created, so we're good */ - ordered = btrfs_lookup_ordered_extent(inode, file_offset); - if (!ordered) { - bio_ctrl->len_to_oe_boundary = U32_MAX; - return 0; + if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && + btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) { + ordered = btrfs_lookup_ordered_extent(inode, file_offset); + if (ordered) { + bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, + ordered->file_offset + + ordered->disk_num_bytes - file_offset); + btrfs_put_ordered_extent(ordered); + return; + } } - bio_ctrl->len_to_oe_boundary = min_t(u32, 
U32_MAX, - ordered->disk_bytenr + ordered->disk_num_bytes - logical); - btrfs_put_ordered_extent(ordered); - return 0; + bio_ctrl->len_to_oe_boundary = U32_MAX; } -static int alloc_new_bio(struct btrfs_inode *inode, - struct btrfs_bio_ctrl *bio_ctrl, - struct writeback_control *wbc, - blk_opf_t opf, - u64 disk_bytenr, u32 offset, u64 file_offset, - enum btrfs_compression_type compress_type) +static void alloc_new_bio(struct btrfs_inode *inode, + struct btrfs_bio_ctrl *bio_ctrl, + struct writeback_control *wbc, blk_opf_t opf, + u64 disk_bytenr, u32 offset, u64 file_offset, + enum btrfs_compression_type compress_type) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio; - int ret; - ASSERT(bio_ctrl->end_io_func); - - bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL); + bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func, + NULL); /* * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. @@ -1405,48 +990,21 @@ static int alloc_new_bio(struct btrfs_inode *inode, bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; else bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; + btrfs_bio(bio)->file_offset = file_offset; bio_ctrl->bio = bio; bio_ctrl->compress_type = compress_type; - ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); - if (ret < 0) - goto error; + calc_bio_boundaries(bio_ctrl, inode, file_offset); if (wbc) { /* - * For Zone append we need the correct block_device that we are - * going to write to set in the bio to be able to respect the - * hardware limitation. Look it up here: + * Pick the last added device to support cgroup writeback. For + * multi-device file systems this means blk-cgroup policies have + * to always be set on the last added/replaced device. + * This is a bit odd but has been like that for a long time. 
*/ - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_device *dev; - - dev = btrfs_zoned_get_device(fs_info, disk_bytenr, - fs_info->sectorsize); - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - goto error; - } - - bio_set_dev(bio, dev->bdev); - } else { - /* - * Otherwise pick the last added device to support - * cgroup writeback. For multi-device file systems this - * means blk-cgroup policies have to always be set on the - * last added/replaced device. This is a bit odd but has - * been like that for a long time. - */ - bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); - } + bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); wbc_init_bio(wbc, bio); - } else { - ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); } - return 0; -error: - bio_ctrl->bio = NULL; - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); - return ret; } /* @@ -1472,7 +1030,6 @@ static int submit_extent_page(blk_opf_t opf, enum btrfs_compression_type compress_type, bool force_bio_submit) { - int ret = 0; struct btrfs_inode *inode = BTRFS_I(page->mapping->host); unsigned int cur = pg_offset; @@ -1492,12 +1049,9 @@ static int submit_extent_page(blk_opf_t opf, /* Allocate new bio if needed */ if (!bio_ctrl->bio) { - ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, - disk_bytenr, offset, - page_offset(page) + cur, - compress_type); - if (ret < 0) - return ret; + alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr, + offset, page_offset(page) + cur, + compress_type); } /* * We must go through btrfs_bio_add_page() to ensure each @@ -2054,10 +1608,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * find_next_dirty_byte() are all exclusive */ iosize = min(min(em_end, end + 1), dirty_range_end) - cur; - - if (btrfs_use_zone_append(inode, em->block_start)) - op = REQ_OP_ZONE_APPEND; - free_extent_map(em); em = NULL; @@ -2361,13 +1911,6 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) mapping_set_error(page->mapping, -EIO); /* 
- * If we error out, we should add back the dirty_metadata_bytes - * to make it consistent. - */ - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, - eb->len, fs_info->dirty_metadata_batch); - - /* * If writeback for a btree extent that doesn't belong to a log tree * failed, increment the counter transaction->eb_write_errors. * We do this because while the transaction is running and before it's @@ -3826,6 +3369,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, lockend = round_up(start + len, inode->root->fs_info->sectorsize); prev_extent_end = lockstart; + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); @@ -4019,6 +3563,7 @@ check_eof_delalloc: out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); out: free_extent_state(delalloc_cached_state); btrfs_free_backref_share_ctx(backref_ctx); @@ -4722,12 +4267,25 @@ static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); } -void clear_extent_buffer_dirty(const struct extent_buffer *eb) +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; int i; int num_pages; struct page *page; + btrfs_assert_tree_write_locked(eb); + + if (trans && btrfs_header_generation(eb) != trans->transid) + return; + + if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) + return; + + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, + fs_info->dirty_metadata_batch); + if (eb->fs_info->nodesize < PAGE_SIZE) return clear_subpage_extent_buffer_dirty(eb); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a2c82448b2e0..4341ad978fb8 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -11,6 +11,8 @@ #include "ulist.h" 
#include "misc.h" +struct btrfs_trans_handle; + enum { EXTENT_BUFFER_UPTODATE, EXTENT_BUFFER_DIRTY, @@ -60,11 +62,9 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) -struct btrfs_bio; struct btrfs_root; struct btrfs_inode; struct btrfs_fs_info; -struct io_failure_record; struct extent_io_tree; struct btrfs_tree_parent_check; @@ -262,7 +262,6 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len); -void clear_extent_buffer_dirty(const struct extent_buffer *eb); bool set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); @@ -274,40 +273,13 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, u32 bits_to_clear, unsigned long page_ops); int extent_invalidate_folio(struct extent_io_tree *tree, struct folio *folio, size_t offset); +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf); int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); -/* - * When IO fails, either with EIO or csum verification fails, we - * try other mirrors that might have a good copy of the data. This - * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the sector is set up to date - * and things continue. If a good mirror can't be found, the original - * bio end_io callback is called to indicate things have failed. 
- */ -struct io_failure_record { - /* Use rb_simple_node for search/insert */ - struct { - struct rb_node rb_node; - u64 bytenr; - }; - struct page *page; - u64 len; - u64 logical; - int this_mirror; - int failed_mirror; - int num_copies; -}; - -int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, - u32 bio_offset, struct page *page, unsigned int pgoff, - bool submit_buffered); -void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); -int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, - struct page *page, unsigned int pg_offset); - #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 5de73466b2ca..41c77a100853 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -380,32 +380,25 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, /* * Lookup the checksum for the read bio in csum tree. * - * @inode: inode that the bio is for. - * @bio: bio to look up. - * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return - * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If - * NULL, the checksum buffer is allocated and returned in - * btrfs_bio(bio)->csum instead. - * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. 
*/ -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst) +blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_bio *bbio = NULL; + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct bio *bio = &bbio->bio; struct btrfs_path *path; const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; u32 orig_len = bio->bi_iter.bi_size; u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 cur_disk_bytenr; - u8 *csum; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; int count = 0; blk_status_t ret = BLK_STS_OK; - if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || + if ((inode->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) return BLK_STS_OK; @@ -426,21 +419,14 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst if (!path) return BLK_STS_RESOURCE; - if (!dst) { - bbio = btrfs_bio(bio); - - if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); - if (!bbio->csum) { - btrfs_free_path(path); - return BLK_STS_RESOURCE; - } - } else { - bbio->csum = bbio->csum_inline; + if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { + bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); + if (!bbio->csum) { + btrfs_free_path(path); + return BLK_STS_RESOURCE; } - csum = bbio->csum; } else { - csum = dst; + bbio->csum = bbio->csum_inline; } /* @@ -456,7 +442,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst * read from the commit root and sidestep a nasty deadlock * between reading the free space cache and updating the csum tree. 
*/ - if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (btrfs_is_free_space_inode(inode)) { path->search_commit_root = 1; path->skip_locking = 1; } @@ -479,14 +465,15 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX); sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >> fs_info->sectorsize_bits; - csum_dst = csum + sector_offset * csum_size; + csum_dst = bbio->csum + sector_offset * csum_size; count = search_csum_tree(fs_info, path, cur_disk_bytenr, search_len, csum_dst); if (count < 0) { ret = errno_to_blk_status(count); - if (bbio) - btrfs_bio_free_csum(bbio); + if (bbio->csum != bbio->csum_inline) + kfree(bbio->csum); + bbio->csum = NULL; break; } @@ -504,12 +491,13 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst memset(csum_dst, 0, csum_size); count = 1; - if (BTRFS_I(inode)->root->root_key.objectid == + if (inode->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset; int ret; - ret = search_file_offset_in_bio(bio, inode, + ret = search_file_offset_in_bio(bio, + &inode->vfs_inode, cur_disk_bytenr, &file_offset); if (ret) set_extent_bits(io_tree, file_offset, @@ -784,23 +772,16 @@ fail: /* * Calculate checksums of the data contained inside a bio. - * - * @inode: Owner of the data inside the bio - * @bio: Contains the data to be checksummed - * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the - * file offsets are determined from the page offsets in the bio. - * Otherwise, this is the starting file offset of the bio vecs in - * @bio, which must be contiguous. - * @one_ordered: If true, @bio only refers to one ordered extent. 
*/ -blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 offset, bool one_ordered) +blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) { + struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + struct bio *bio = &bbio->bio; + u64 offset = bbio->file_offset; struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; - const bool use_page_offsets = (offset == (u64)-1); char *data; struct bvec_iter iter; struct bio_vec bvec; @@ -828,9 +809,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, shash->tfm = fs_info->csum_shash; bio_for_each_segment(bvec, bio, iter) { - if (use_page_offsets) - offset = page_offset(bvec.bv_page) + bvec.bv_offset; - if (!ordered) { ordered = btrfs_lookup_ordered_extent(inode, offset); /* @@ -852,7 +830,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - 1); for (i = 0; i < blockcount; i++) { - if (!one_ordered && + if (!(bio->bi_opf & REQ_BTRFS_ONE_ORDERED) && !in_range(offset, ordered->file_offset, ordered->num_bytes)) { unsigned long bytes_left; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 031225668434..cd7f2ae515c0 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -38,7 +38,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); +blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio); int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, u64 num_bytes); @@ -49,8 +49,10 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); 
-blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 offset, bool one_ordered); +blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index af046d22300e..5cc5a1faaef5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1017,7 +1017,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, unlock_page(pages[i]); put_page(pages[i]); } - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); return -EAGAIN; } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index c667e878ef1a..4d155a48ec59 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1283,7 +1283,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) list_del(&free_space_root->dirty_list); btrfs_tree_lock(free_space_root->node); - btrfs_clean_tree_block(free_space_root->node); + btrfs_clear_buffer_dirty(trans, free_space_root->node); btrfs_tree_unlock(free_space_root->node); btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), free_space_root->node, 0, 1); diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 5553e1f8afe8..31c1648bc0b4 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -24,6 +24,7 @@ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, name, flag); } spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); } } @@ -46,6 +47,7 @@ void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, name, flag); } spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); } } @@ -68,6 +70,7 @@ void __btrfs_set_fs_compat_ro(struct 
btrfs_fs_info *fs_info, u64 flag, name, flag); } spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); } } @@ -90,5 +93,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, name, flag); } spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); } } diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 37b86acfcbcf..4c477eae6891 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,7 @@ #ifndef BTRFS_FS_H #define BTRFS_FS_H +#include <linux/blkdev.h> #include <linux/fs.h> #include <linux/btrfs_tree.h> #include <linux/sizes.h> @@ -125,6 +126,12 @@ enum { */ BTRFS_FS_NO_OVERCOMMIT, + /* + * Indicate if we have some features changed, this is mostly for + * cleaner thread to update the sysfs interface. + */ + BTRFS_FS_FEATURE_CHANGED, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -742,8 +749,10 @@ struct btrfs_fs_info { */ u64 zone_size; - /* Max size to emit ZONE_APPEND write command */ + /* Constraints for ZONE_APPEND commands: */ + struct queue_limits limits; u64 max_zone_append_size; + struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; u64 treelog_bg; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98a800b8bd43..6c18dc9a1831 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -84,27 +84,12 @@ struct btrfs_dio_data { }; struct btrfs_dio_private { - struct btrfs_inode *inode; - - /* - * Since DIO can use anonymous page, we cannot use page_offset() to - * grab the file offset, thus need a dedicated member for file offset. - */ + /* Range of I/O */ u64 file_offset; - /* Used for bio::bi_size */ u32 bytes; - /* - * References to this structure. There is one reference per in-flight - * bio plus one while we're still setting up. 
- */ - refcount_t refs; - - /* Array of checksums */ - u8 *csums; - /* This must be last */ - struct bio bio; + struct btrfs_bio bbio; }; static struct bio_set btrfs_dio_bioset; @@ -228,7 +213,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start, page_end; + u64 page_start = 0, page_end = 0; struct page *page; if (locked_page) { @@ -2536,19 +2521,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } /* - * in order to insert checksums into the metadata in large chunks, - * we wait until bio submission time. All the pages in the bio are - * checksummed and sums are attached onto the ordered extent record. - * - * At IO completion time the cums attached on the ordered extent record - * are inserted into the btree - */ -blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio) -{ - return btrfs_csum_one_bio(inode, bio, (u64)-1, false); -} - -/* * Split an extent_map at [start, start + len] * * This function is intended to be used only for extract_ordered_extent(). 
@@ -2663,19 +2635,19 @@ out: return ret; } -static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, - struct bio *bio, loff_t file_offset) +blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio) { + u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + u64 len = bbio->bio.bi_iter.bi_size; + struct btrfs_inode *inode = bbio->inode; struct btrfs_ordered_extent *ordered; - u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 file_len; - u64 len = bio->bi_iter.bi_size; u64 end = start + len; u64 ordered_end; u64 pre, post; int ret = 0; - ordered = btrfs_lookup_ordered_extent(inode, file_offset); + ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset); if (WARN_ON_ONCE(!ordered)) return BLK_STS_IOERR; @@ -2715,7 +2687,7 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, ret = btrfs_split_ordered_extent(ordered, pre, post); if (ret) goto out; - ret = split_zoned_em(inode, file_offset, file_len, pre, post); + ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post); out: btrfs_put_ordered_extent(ordered); @@ -2723,75 +2695,6 @@ out: return errno_to_blk_status(ret); } -void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - blk_status_t ret; - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - ret = extract_ordered_extent(inode, bio, - page_offset(bio_first_bvec_all(bio)->bv_page)); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } - - /* - * If we need to checksum, and the I/O is not issued by fsync and - * friends, that is ->sync_writers != 0, defer the submission to a - * workqueue to parallelize it. - * - * Csum items for reloc roots have already been cloned at this point, - * so they are handled as part of the no-checksum case. 
- */ - if (!(inode->flags & BTRFS_INODE_NODATASUM) && - !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && - !btrfs_is_data_reloc_root(inode->root)) { - if (!atomic_read(&inode->sync_writers) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) - return; - - ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } - btrfs_submit_bio(fs_info, bio, mirror_num); -} - -void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - blk_status_t ret; - - if (compress_type != BTRFS_COMPRESS_NONE) { - /* - * btrfs_submit_compressed_read will handle completing the bio - * if there were any errors, so just return here. - */ - btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num); - return; - } - - /* Save the original iter for read repair */ - btrfs_bio(bio)->iter = bio->bi_iter; - - /* - * Lookup bio sums does extra checks around whether we need to csum or - * not, which is why we ignore skip_sum here. - */ - ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - - btrfs_submit_bio(fs_info, bio, mirror_num); -} - /* * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. @@ -2969,7 +2872,7 @@ again: unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -3259,15 +3162,13 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - /* A valid bdev implies a write on a sequential zone */ - if (ordered_extent->bdev) { + /* A valid ->physical implies a write on a sequential zone. 
*/ + if (ordered_extent->physical != (u64)-1) { btrfs_rewrite_logical_zoned(ordered_extent); btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); } - btrfs_free_io_failure_record(inode, start, end); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; logical_len = ordered_extent->truncated_len; @@ -3474,109 +3375,55 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of } /* - * check_data_csum - verify checksum of one sector of uncompressed data - * @inode: inode - * @bbio: btrfs_bio which contains the csum + * Verify the checksum of a single data sector. + * + * @bbio: btrfs_io_bio which contains the csum + * @dev: device the sector is on * @bio_offset: offset to the beginning of the bio (in bytes) - * @page: page where is the data to be verified - * @pgoff: offset inside the page + * @bv: bio_vec to check * - * The length of such check is always one sector size. + * Check if the checksum on a data block is valid. When a checksum mismatch is + * detected, report the error and fill the corrupted range with zero. * - * When csum mismatch is detected, we will also report the error and fill the - * corrupted range with zero. (Thus it needs the extra parameters) + * Return %true if the sector is ok or had no checksum to start with, else %false. 
*/ -int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff) +bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, + u32 bio_offset, struct bio_vec *bv) { + struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 len = fs_info->sectorsize; + u64 file_offset = bbio->file_offset + bio_offset; + u64 end = file_offset + bv->bv_len - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; - ASSERT(pgoff + len <= PAGE_SIZE); + ASSERT(bv->bv_len == fs_info->sectorsize); - csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); + if (!bbio->csum) + return true; - if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) + if (btrfs_is_data_reloc_root(inode->root) && + test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, + 1, NULL)) { + /* Skip the range without csum for data reloc inode */ + clear_extent_bits(&inode->io_tree, file_offset, end, + EXTENT_NODATASUM); + return true; + } + + csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); + if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, + csum_expected)) goto zeroit; - return 0; + return true; zeroit: - btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset, - csum, csum_expected, bbio->mirror_num); - if (bbio->device) - btrfs_dev_stat_inc_and_print(bbio->device, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - memzero_page(page, pgoff, len); - return -EIO; -} - -/* - * When reads are done, we need to check csums to verify the data is correct. - * if there's a match, we allow the bio to finish. If not, the code in - * extent_io.c will try to find good copies for us. - * - * @bio_offset: offset to the beginning of the bio (in bytes) - * @start: file offset of the range start - * @end: file offset of the range end (inclusive) - * - * Return a bitmap where bit set means a csum mismatch, and bit not set means - * csum match. 
- */ -unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, - u64 start, u64 end) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_io_tree *io_tree = &inode->io_tree; - const u32 sectorsize = root->fs_info->sectorsize; - u32 pg_off; - unsigned int result = 0; - - /* - * This only happens for NODATASUM or compressed read. - * Normally this should be covered by above check for compressed read - * or the next check for NODATASUM. Just do a quicker exit here. - */ - if (bbio->csum == NULL) - return 0; - - if (inode->flags & BTRFS_INODE_NODATASUM) - return 0; - - if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) - return 0; - - ASSERT(page_offset(page) <= start && - end <= page_offset(page) + PAGE_SIZE - 1); - for (pg_off = offset_in_page(start); - pg_off < offset_in_page(end); - pg_off += sectorsize, bio_offset += sectorsize) { - u64 file_offset = pg_off + page_offset(page); - int ret; - - if (btrfs_is_data_reloc_root(root) && - test_range_bit(io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM, 1, NULL)) { - /* Skip the range without csum for data reloc inode */ - clear_extent_bits(io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM); - continue; - } - ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); - if (ret < 0) { - const int nr_bit = (pg_off - offset_in_page(start)) >> - root->fs_info->sectorsize_bits; - - result |= (1U << nr_bit); - } - } - return result; + btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected, + bbio->mirror_num); + if (dev) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); + memzero_bvec(bv); + return false; } /* @@ -4987,7 +4834,7 @@ again: unlock_extent(io_tree, block_start, block_end, &cached_state); unlock_page(page); put_page(page); - 
btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -5281,7 +5128,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) return ret; } -static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -5291,7 +5138,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr if (btrfs_root_readonly(root)) return -EROFS; - err = setattr_prepare(mnt_userns, dentry, attr); + err = setattr_prepare(idmap, dentry, attr); if (err) return err; @@ -5302,12 +5149,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr } if (attr->ia_valid) { - setattr_copy(mnt_userns, inode, attr); + setattr_copy(idmap, inode, attr); inode_inc_iversion(inode); err = btrfs_dirty_inode(BTRFS_I(inode)); if (!err && attr->ia_valid & ATTR_MODE) - err = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); + err = posix_acl_chmod(idmap, dentry, inode->i_mode); } return err; @@ -5466,8 +5313,6 @@ void btrfs_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; - btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); - if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) goto no_delete; @@ -6724,7 +6569,7 @@ out_inode: return err; } -static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -6732,13 +6577,13 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, 
inode->i_mode, rdev); return btrfs_create_common(dir, dentry, inode); } -static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; @@ -6746,7 +6591,7 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; @@ -6837,7 +6682,7 @@ fail: return err; } -static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -6845,7 +6690,7 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; - inode_init_owner(mnt_userns, inode, dir, S_IFDIR | mode); + inode_init_owner(idmap, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; return btrfs_create_common(dir, dentry, inode); @@ -7392,7 +7237,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); else ret = nowait ? 
-EAGAIN : -ENOTBLK; btrfs_put_ordered_extent(ordered); @@ -7833,10 +7678,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->offset = start; iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; - - if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) - iomap->flags |= IOMAP_F_ZONE_APPEND; - free_extent_map(em); return 0; @@ -7888,267 +7729,47 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, return ret; } -static void btrfs_dio_private_put(struct btrfs_dio_private *dip) -{ - /* - * This implies a barrier so that stores to dio_bio->bi_status before - * this and loads of dio_bio->bi_status after this are fully ordered. - */ - if (!refcount_dec_and_test(&dip->refs)) - return; - - if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { - btrfs_mark_ordered_io_finished(dip->inode, NULL, - dip->file_offset, dip->bytes, - !dip->bio.bi_status); - } else { - unlock_extent(&dip->inode->io_tree, - dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); - } - - kfree(dip->csums); - bio_endio(&dip->bio); -} - -void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -{ - struct btrfs_dio_private *dip = btrfs_bio(bio)->private; - - BUG_ON(bio_op(bio) == REQ_OP_WRITE); - - refcount_inc(&dip->refs); - btrfs_submit_bio(inode->root->fs_info, bio, mirror_num); -} - -static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, - struct btrfs_bio *bbio, - const bool uptodate) -{ - struct inode *inode = &dip->inode->vfs_inode; - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); - blk_status_t err = BLK_STS_OK; - struct bvec_iter iter; - struct bio_vec bv; - u32 offset; - - btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { - u64 start = bbio->file_offset + offset; - - if (uptodate && - (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset, - 
bv.bv_page, bv.bv_offset))) { - btrfs_clean_io_failure(BTRFS_I(inode), start, - bv.bv_page, bv.bv_offset); - } else { - int ret; - - ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, - bv.bv_page, bv.bv_offset, false); - if (ret) - err = errno_to_blk_status(ret); - } - } - - return err; -} - -blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, - struct bio *bio, - u64 dio_file_offset) +static void btrfs_dio_end_io(struct btrfs_bio *bbio) { - return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); -} - -static void btrfs_end_dio_bio(struct btrfs_bio *bbio) -{ - struct btrfs_dio_private *dip = bbio->private; + struct btrfs_dio_private *dip = + container_of(bbio, struct btrfs_dio_private, bbio); + struct btrfs_inode *inode = bbio->inode; struct bio *bio = &bbio->bio; - blk_status_t err = bio->bi_status; - - if (err) - btrfs_warn(dip->inode->root->fs_info, - "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", - btrfs_ino(dip->inode), bio_op(bio), - bio->bi_opf, bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, err); - - if (bio_op(bio) == REQ_OP_READ) - err = btrfs_check_read_dio_bio(dip, bbio, !err); - - if (err) - dip->bio.bi_status = err; - - btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio); - - bio_put(bio); - btrfs_dio_private_put(dip); -} -static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, - u64 file_offset, int async_submit) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_dio_private *dip = btrfs_bio(bio)->private; - blk_status_t ret; - - /* Save the original iter for read repair */ - if (btrfs_op(bio) == BTRFS_MAP_READ) - btrfs_bio(bio)->iter = bio->bi_iter; - - if (inode->flags & BTRFS_INODE_NODATASUM) - goto map; + if (bio->bi_status) { + btrfs_warn(inode->root->fs_info, + "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", + btrfs_ino(inode), bio->bi_opf, + dip->file_offset, dip->bytes, bio->bi_status); + } - if 
(btrfs_op(bio) == BTRFS_MAP_WRITE) { - /* Check btrfs_submit_data_write_bio() for async submit rules */ - if (async_submit && !atomic_read(&inode->sync_writers) && - btrfs_wq_submit_bio(inode, bio, 0, file_offset, - WQ_SUBMIT_DATA_DIO)) - return; + if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_mark_ordered_io_finished(inode, NULL, dip->file_offset, + dip->bytes, !bio->bi_status); + else + unlock_extent(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, NULL); - /* - * If we aren't doing async submit, calculate the csum of the - * bio now. - */ - ret = btrfs_csum_one_bio(inode, bio, file_offset, false); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } else { - btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, - file_offset - dip->file_offset); - } -map: - btrfs_submit_bio(fs_info, bio, 0); + bbio->bio.bi_private = bbio->private; + iomap_dio_bio_end_io(bio); } -static void btrfs_submit_direct(const struct iomap_iter *iter, - struct bio *dio_bio, loff_t file_offset) +static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, + loff_t file_offset) { + struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_dio_private *dip = - container_of(dio_bio, struct btrfs_dio_private, bio); - struct inode *inode = iter->inode; - const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const bool raid56 = (btrfs_data_alloc_profile(fs_info) & - BTRFS_BLOCK_GROUP_RAID56_MASK); - struct bio *bio; - u64 start_sector; - int async_submit = 0; - u64 submit_len; - u64 clone_offset = 0; - u64 clone_len; - u64 logical; - int ret; - blk_status_t status; - struct btrfs_io_geometry geom; + container_of(bbio, struct btrfs_dio_private, bbio); struct btrfs_dio_data *dio_data = iter->private; - struct extent_map *em = NULL; - - dip->inode = BTRFS_I(inode); - dip->file_offset = file_offset; - dip->bytes = dio_bio->bi_iter.bi_size; - refcount_set(&dip->refs, 1); - 
dip->csums = NULL; - - if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - unsigned int nr_sectors = - (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits); - - /* - * Load the csums up front to reduce csum tree searches and - * contention when submitting bios. - */ - status = BLK_STS_RESOURCE; - dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); - if (!dip->csums) - goto out_err; - - status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); - if (status != BLK_STS_OK) - goto out_err; - } - - start_sector = dio_bio->bi_iter.bi_sector; - submit_len = dio_bio->bi_iter.bi_size; - - do { - logical = start_sector << 9; - em = btrfs_get_chunk_map(fs_info, logical, submit_len); - if (IS_ERR(em)) { - status = errno_to_blk_status(PTR_ERR(em)); - em = NULL; - goto out_err_em; - } - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), - logical, &geom); - if (ret) { - status = errno_to_blk_status(ret); - goto out_err_em; - } - clone_len = min(submit_len, geom.len); - ASSERT(clone_len <= UINT_MAX); + btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); + bbio->file_offset = file_offset; - /* - * This will never fail as it's passing GPF_NOFS and - * the allocation is backed by btrfs_bioset. - */ - bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len, - btrfs_end_dio_bio, dip); - btrfs_bio(bio)->file_offset = file_offset; - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - status = extract_ordered_extent(BTRFS_I(inode), bio, - file_offset); - if (status) { - bio_put(bio); - goto out_err; - } - } - - ASSERT(submit_len >= clone_len); - submit_len -= clone_len; - - /* - * Increase the count before we submit the bio so we know - * the end IO handler won't happen before we increase the - * count. Otherwise, the dip might get freed before we're - * done setting it up. - * - * We transfer the initial reference to the last bio, so we - * don't need to increment the reference count for the last one. 
- */ - if (submit_len > 0) { - refcount_inc(&dip->refs); - /* - * If we are submitting more than one bio, submit them - * all asynchronously. The exception is RAID 5 or 6, as - * asynchronous checksums make it difficult to collect - * full stripe writes. - */ - if (!raid56) - async_submit = 1; - } - - btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); - - dio_data->submitted += clone_len; - clone_offset += clone_len; - start_sector += clone_len >> 9; - file_offset += clone_len; - - free_extent_map(em); - } while (submit_len > 0); - return; + dip->file_offset = file_offset; + dip->bytes = bio->bi_iter.bi_size; -out_err_em: - free_extent_map(em); -out_err: - dio_bio->bi_status = status; - btrfs_dio_private_put(dip); + dio_data->submitted += bio->bi_iter.bi_size; + btrfs_submit_bio(bio, 0); } static const struct iomap_ops btrfs_dio_iomap_ops = { @@ -8157,7 +7778,7 @@ static const struct iomap_ops btrfs_dio_iomap_ops = { }; static const struct iomap_dio_ops btrfs_dio_ops = { - .submit_io = btrfs_submit_direct, + .submit_io = btrfs_dio_submit_io, .bio_set = &btrfs_dio_bioset, }; @@ -8552,7 +8173,7 @@ again: unlock_extent(io_tree, page_start, page_end, &cached_state); unlock_page(page); up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -8802,7 +8423,7 @@ out: return ret; } -struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, +struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap, struct inode *dir) { struct inode *inode; @@ -8813,7 +8434,7 @@ struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, * Subvolumes don't inherit the sgid bit or the parent's gid if * the parent's sgid bit is set. This is probably a bug. 
*/ - inode_init_owner(mnt_userns, inode, NULL, + inode_init_owner(idmap, inode, NULL, S_IFDIR | (~current_umask() & S_IRWXUGO)); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; @@ -8850,7 +8471,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_log_commit = 0; spin_lock_init(&ei->lock); - spin_lock_init(&ei->io_failure_lock); ei->outstanding_extents = 0; if (sb->s_magic != BTRFS_TEST_MAGIC) btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, @@ -8870,7 +8490,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); - ei->io_failure_tree = RB_ROOT; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); @@ -8994,7 +8613,7 @@ int __init btrfs_init_cachep(void) goto fail; if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_dio_private, bio), + offsetof(struct btrfs_dio_private, bbio.bio), BIOSET_NEED_BVECS)) goto fail; @@ -9004,7 +8623,7 @@ fail: return -ENOMEM; } -static int btrfs_getattr(struct user_namespace *mnt_userns, +static int btrfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -9034,7 +8653,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); @@ -9289,14 +8908,14 @@ out_notrans: return ret; } -static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns, +static struct inode *new_whiteout_inode(struct mnt_idmap *idmap, struct inode *dir) { struct inode *inode; inode = new_inode(dir->i_sb); if (inode) { - inode_init_owner(mnt_userns, inode, dir, + inode_init_owner(idmap, inode, dir, S_IFCHR | WHITEOUT_MODE); 
inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); @@ -9304,7 +8923,7 @@ static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns, return inode; } -static int btrfs_rename(struct user_namespace *mnt_userns, +static int btrfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -9376,7 +8995,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, filemap_flush(old_inode->i_mapping); if (flags & RENAME_WHITEOUT) { - whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir); + whiteout_args.inode = new_whiteout_inode(idmap, old_dir); if (!whiteout_args.inode) { ret = -ENOMEM; goto out_fscrypt_names; @@ -9545,7 +9164,7 @@ out_fscrypt_names: return ret; } -static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, +static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -9558,7 +9177,7 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); else - ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, + ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info); @@ -9758,7 +9377,7 @@ out: return ret; } -static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -9786,7 +9405,7 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; - inode_init_owner(mnt_userns, inode, dir, S_IFLNK | S_IRWXUGO); + 
inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO); inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_aops; @@ -10075,7 +9694,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } -static int btrfs_permission(struct user_namespace *mnt_userns, +static int btrfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -10088,10 +9707,10 @@ static int btrfs_permission(struct user_namespace *mnt_userns, if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) return -EACCES; } - return generic_permission(mnt_userns, inode, mask); + return generic_permission(idmap, inode, mask); } -static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -10109,7 +9728,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; @@ -10289,65 +9908,13 @@ struct btrfs_encoded_read_private { wait_queue_head_t wait; atomic_t pending; blk_status_t status; - bool skip_csum; }; -static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, - struct bio *bio, int mirror_num) -{ - struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - blk_status_t ret; - - if (!priv->skip_csum) { - ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); - if (ret) - return ret; - } - - atomic_inc(&priv->pending); - btrfs_submit_bio(fs_info, bio, mirror_num); - return BLK_STS_OK; -} - -static 
blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) -{ - const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); - struct btrfs_encoded_read_private *priv = bbio->private; - struct btrfs_inode *inode = priv->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 sectorsize = fs_info->sectorsize; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - u32 bio_offset = 0; - - if (priv->skip_csum || !uptodate) - return bbio->bio.bi_status; - - bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { - unsigned int i, nr_sectors, pgoff; - - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); - pgoff = bvec->bv_offset; - for (i = 0; i < nr_sectors; i++) { - ASSERT(pgoff < PAGE_SIZE); - if (btrfs_check_data_csum(inode, bbio, bio_offset, - bvec->bv_page, pgoff)) - return BLK_STS_IOERR; - bio_offset += sectorsize; - pgoff += sectorsize; - } - } - return BLK_STS_OK; -} - static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) { struct btrfs_encoded_read_private *priv = bbio->private; - blk_status_t status; - status = btrfs_encoded_read_verify_csum(bbio); - if (status) { + if (bbio->bio.bi_status) { /* * The memory barrier implied by the atomic_dec_return() here * pairs with the memory barrier implied by the @@ -10356,11 +9923,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) * write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). 
*/ - WRITE_ONCE(priv->status, status); + WRITE_ONCE(priv->status, bbio->bio.bi_status); } if (!atomic_dec_return(&priv->pending)) wake_up(&priv->wait); - btrfs_bio_free_csum(bbio); bio_put(&bbio->bio); } @@ -10368,47 +9934,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 disk_io_size, struct page **pages) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private priv = { .inode = inode, .file_offset = file_offset, .pending = ATOMIC_INIT(1), - .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM), }; unsigned long i = 0; u64 cur = 0; - int ret; init_waitqueue_head(&priv.wait); - /* - * Submit bios for the extent, splitting due to bio or stripe limits as - * necessary. - */ + /* Submit bios for the extent, splitting due to bio limits as necessary. */ while (cur < disk_io_size) { - struct extent_map *em; - struct btrfs_io_geometry geom; struct bio *bio = NULL; - u64 remaining; + u64 remaining = disk_io_size - cur; - em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur, - disk_io_size - cur); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - } else { - ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ, - disk_bytenr + cur, &geom); - free_extent_map(em); - } - if (ret) { - WRITE_ONCE(priv.status, errno_to_blk_status(ret)); - break; - } - remaining = min(geom.len, disk_io_size - cur); while (bio || remaining) { size_t bytes = min_t(u64, remaining, PAGE_SIZE); if (!bio) { bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, + inode, btrfs_encoded_read_endio, &priv); bio->bi_iter.bi_sector = @@ -10417,14 +9962,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (!bytes || bio_add_page(bio, pages[i], bytes, 0) < bytes) { - blk_status_t status; - - status = submit_encoded_read_bio(inode, bio, 0); - if (status) { - WRITE_ONCE(priv.status, status); - bio_put(bio); - goto out; - } + atomic_inc(&priv.pending); + btrfs_submit_bio(bio, 0); bio = NULL; continue; } @@ 
-10435,7 +9974,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, } } -out: if (atomic_dec_return(&priv.pending)) io_wait_event(priv.wait, !atomic_read(&priv.pending)); /* See btrfs_encoded_read_endio() for ordering. */ @@ -10995,9 +10533,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, return 0; max_pages = sis->max - bsi->nr_pages; - first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; - next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, - PAGE_SIZE) >> PAGE_SHIFT; + first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; + next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; if (first_ppage >= next_ppage) return 0; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7e348bd2ccde..84626c8ad5bf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -243,7 +243,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int btrfs_fileattr_set(struct user_namespace *mnt_userns, +int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -578,7 +578,7 @@ static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit return num_items; } -static noinline int create_subvol(struct user_namespace *mnt_userns, +static noinline int create_subvol(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct btrfs_qgroup_inherit *inherit) { @@ -623,7 +623,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, if (ret < 0) goto out_root_item; - new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir); + new_inode_args.inode = btrfs_new_subvol_inode(idmap, dir); if (!new_inode_args.inode) { ret = -ENOMEM; goto out_anon_dev; @@ -707,7 +707,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, * exists). 
*/ btrfs_tree_lock(leaf); - btrfs_clean_tree_block(leaf); + btrfs_clear_buffer_dirty(trans, leaf); btrfs_tree_unlock(leaf); btrfs_free_tree_block(trans, objectid, leaf, 0, 1); free_extent_buffer(leaf); @@ -898,7 +898,7 @@ free_pending: * nfs_async_unlink(). */ -static int btrfs_may_delete(struct user_namespace *mnt_userns, +static int btrfs_may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, int isdir) { int error; @@ -909,12 +909,12 @@ static int btrfs_may_delete(struct user_namespace *mnt_userns, BUG_ON(d_inode(victim->d_parent) != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(mnt_userns, dir, d_inode(victim)) || + if (check_sticky(idmap, dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim))) return -EPERM; @@ -933,16 +933,16 @@ static int btrfs_may_delete(struct user_namespace *mnt_userns, } /* copy of may_create in fs/namei.c() */ -static inline int btrfs_may_create(struct user_namespace *mnt_userns, +static inline int btrfs_may_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child) { if (d_really_is_positive(child)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns)) + if (!fsuidgid_has_mapping(dir->i_sb, idmap)) return -EOVERFLOW; - return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } /* @@ -951,7 +951,7 @@ static inline int btrfs_may_create(struct user_namespace *mnt_userns, * inside this filesystem so it's quite a bit simpler. 
*/ static noinline int btrfs_mksubvol(const struct path *parent, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const char *name, int namelen, struct btrfs_root *snap_src, bool readonly, @@ -967,12 +967,12 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (error == -EINTR) return error; - dentry = lookup_one(mnt_userns, name, parent->dentry, namelen); + dentry = lookup_one(idmap, name, parent->dentry, namelen); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_unlock; - error = btrfs_may_create(mnt_userns, dir, dentry); + error = btrfs_may_create(idmap, dir, dentry); if (error) goto out_dput; @@ -993,7 +993,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (snap_src) error = create_snapshot(snap_src, dir, dentry, readonly, inherit); else - error = create_subvol(mnt_userns, dir, dentry, inherit); + error = create_subvol(idmap, dir, dentry, inherit); if (!error) fsnotify_mkdir(dir, dentry); @@ -1007,7 +1007,7 @@ out_unlock: } static noinline int btrfs_mksnapshot(const struct path *parent, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const char *name, int namelen, struct btrfs_root *root, bool readonly, @@ -1037,7 +1037,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent, btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); - ret = btrfs_mksubvol(parent, mnt_userns, name, namelen, + ret = btrfs_mksubvol(parent, idmap, name, namelen, root, readonly, inherit); out: if (snapshot_force_cow) @@ -1240,7 +1240,7 @@ out_drop: } static noinline int __btrfs_ioctl_snap_create(struct file *file, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const char *name, unsigned long fd, int subvol, bool readonly, struct btrfs_qgroup_inherit *inherit) @@ -1268,7 +1268,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, } if (subvol) { - ret = btrfs_mksubvol(&file->f_path, mnt_userns, name, + ret = btrfs_mksubvol(&file->f_path, idmap, name, namelen, NULL, 
readonly, inherit); } else { struct fd src = fdget(fd); @@ -1283,14 +1283,14 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); ret = -EXDEV; - } else if (!inode_owner_or_capable(mnt_userns, src_inode)) { + } else if (!inode_owner_or_capable(idmap, src_inode)) { /* * Subvolume creation is not restricted, but snapshots * are limited to own subvolumes only */ ret = -EPERM; } else { - ret = btrfs_mksnapshot(&file->f_path, mnt_userns, + ret = btrfs_mksnapshot(&file->f_path, idmap, name, namelen, BTRFS_I(src_inode)->root, readonly, inherit); @@ -1317,7 +1317,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file), + ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), vol_args->name, vol_args->fd, subvol, false, NULL); @@ -1377,7 +1377,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, } } - ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file), + ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), vol_args->name, vol_args->fd, subvol, readonly, inherit); if (ret) @@ -1422,7 +1422,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, u64 flags; int ret = 0; - if (!inode_owner_or_capable(file_mnt_user_ns(file), inode)) + if (!inode_owner_or_capable(file_mnt_idmap(file), inode)) return -EPERM; ret = mnt_want_write_file(file); @@ -1870,7 +1870,7 @@ out: return ret; } -static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns, +static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct inode *inode, struct btrfs_ioctl_ino_lookup_user_args *args) { @@ -1962,7 +1962,7 @@ static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns, ret = PTR_ERR(temp_inode); goto out_put; } - ret = inode_permission(mnt_userns, temp_inode, + 
ret = inode_permission(idmap, temp_inode, MAY_READ | MAY_EXEC); iput(temp_inode); if (ret) { @@ -2101,7 +2101,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) return -EACCES; } - ret = btrfs_search_path_in_tree_user(file_mnt_user_ns(file), inode, args); + ret = btrfs_search_path_in_tree_user(file_mnt_idmap(file), inode, args); if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) ret = -EFAULT; @@ -2335,7 +2335,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_root *dest = NULL; struct btrfs_ioctl_vol_args *vol_args = NULL; struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; - struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct mnt_idmap *idmap = file_mnt_idmap(file); char *subvol_name, *subvol_name_ptr = NULL; int subvol_namelen; int err = 0; @@ -2428,7 +2428,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * anywhere in the filesystem the user wouldn't be able * to delete without an idmapped mount. 
*/ - if (old_dir != dir && mnt_userns != &init_user_ns) { + if (old_dir != dir && idmap != &nop_mnt_idmap) { err = -EOPNOTSUPP; goto free_parent; } @@ -2471,7 +2471,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (err == -EINTR) goto free_subvol_name; - dentry = lookup_one(mnt_userns, subvol_name, parent, subvol_namelen); + dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_unlock_dir; @@ -2513,13 +2513,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (root == dest) goto out_dput; - err = inode_permission(mnt_userns, inode, MAY_WRITE | MAY_EXEC); + err = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); if (err) goto out_dput; } /* check if subvolume may be deleted by a user */ - err = btrfs_may_delete(mnt_userns, dir, dentry, 1); + err = btrfs_may_delete(idmap, dir, dentry, 1); if (err) goto out_dput; @@ -2582,7 +2582,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) * running and allows defrag on files open in read-only mode. 
*/ if (!capable(CAP_SYS_ADMIN) && - inode_permission(&init_user_ns, inode, MAY_WRITE)) { + inode_permission(&nop_mnt_idmap, inode, MAY_WRITE)) { ret = -EPERM; goto out; } @@ -3907,7 +3907,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, } static long _btrfs_ioctl_set_received_subvol(struct file *file, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct btrfs_ioctl_received_subvol_args *sa) { struct inode *inode = file_inode(file); @@ -3919,7 +3919,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, int ret = 0; int received_uuid_changed; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; ret = mnt_want_write_file(file); @@ -4024,7 +4024,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file, args64->rtime.nsec = args32->rtime.nsec; args64->flags = args32->flags; - ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), args64); + ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), args64); if (ret) goto out; @@ -4058,7 +4058,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, if (IS_ERR(sa)) return PTR_ERR(sa); - ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), sa); + ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), sa); if (ret) goto out; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 8a855d5ac2fa..d51b9a2f2f6e 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -6,7 +6,7 @@ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int btrfs_fileattr_set(struct user_namespace *mnt_userns, +int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); void 
btrfs_sync_inode_flags_to_i_flags(struct inode *inode); diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c new file mode 100644 index 000000000000..0fe0ae54ac67 --- /dev/null +++ b/fs/btrfs/lru_cache.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/mm.h> +#include "lru_cache.h" +#include "messages.h" + +/* + * Initialize a cache object. + * + * @cache: The cache. + * @max_size: Maximum size (number of entries) for the cache. + * Use 0 for unlimited size, it's the user's responsability to + * trim the cache in that case. + */ +void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size) +{ + INIT_LIST_HEAD(&cache->lru_list); + mt_init(&cache->entries); + cache->size = 0; + cache->max_size = max_size; +} + +static struct btrfs_lru_cache_entry *match_entry(struct list_head *head, u64 key, + u64 gen) +{ + struct btrfs_lru_cache_entry *entry; + + list_for_each_entry(entry, head, list) { + if (entry->key == key && entry->gen == gen) + return entry; + } + + return NULL; +} + +/* + * Lookup for an entry in the cache. + * + * @cache: The cache. + * @key: The key of the entry we are looking for. + * @gen: Generation associated to the key. + * + * Returns the entry associated with the key or NULL if none found. + */ +struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, + u64 key, u64 gen) +{ + struct list_head *head; + struct btrfs_lru_cache_entry *entry; + + head = mtree_load(&cache->entries, key); + if (!head) + return NULL; + + entry = match_entry(head, key, gen); + if (entry) + list_move_tail(&entry->lru_list, &cache->lru_list); + + return entry; +} + +/* + * Remove an entry from the cache. + * + * @cache: The cache to remove from. + * @entry: The entry to remove from the cache. + * + * Note: this also frees the memory used by the entry. 
+ */ +void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *entry) +{ + struct list_head *prev = entry->list.prev; + + ASSERT(cache->size > 0); + ASSERT(!mtree_empty(&cache->entries)); + + list_del(&entry->list); + list_del(&entry->lru_list); + + if (list_empty(prev)) { + struct list_head *head; + + /* + * If previous element in the list entry->list is now empty, it + * means it's a head entry not pointing to any cached entries, + * so remove it from the maple tree and free it. + */ + head = mtree_erase(&cache->entries, entry->key); + ASSERT(head == prev); + kfree(head); + } + + kfree(entry); + cache->size--; +} + +/* + * Store an entry in the cache. + * + * @cache: The cache. + * @entry: The entry to store. + * + * Returns 0 on success and < 0 on error. + */ +int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *new_entry, + gfp_t gfp) +{ + const u64 key = new_entry->key; + struct list_head *head; + int ret; + + head = kmalloc(sizeof(*head), gfp); + if (!head) + return -ENOMEM; + + ret = mtree_insert(&cache->entries, key, head, gfp); + if (ret == 0) { + INIT_LIST_HEAD(head); + list_add_tail(&new_entry->list, head); + } else if (ret == -EEXIST) { + kfree(head); + head = mtree_load(&cache->entries, key); + ASSERT(head != NULL); + if (match_entry(head, key, new_entry->gen) != NULL) + return -EEXIST; + list_add_tail(&new_entry->list, head); + } else if (ret < 0) { + kfree(head); + return ret; + } + + if (cache->max_size > 0 && cache->size == cache->max_size) { + struct btrfs_lru_cache_entry *lru_entry; + + lru_entry = list_first_entry(&cache->lru_list, + struct btrfs_lru_cache_entry, + lru_list); + btrfs_lru_cache_remove(cache, lru_entry); + } + + list_add_tail(&new_entry->lru_list, &cache->lru_list); + cache->size++; + + return 0; +} + +/* + * Empty a cache. + * + * @cache: The cache to empty. + * + * Removes all entries from the cache. 
+ */ +void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache) +{ + struct btrfs_lru_cache_entry *entry; + struct btrfs_lru_cache_entry *tmp; + + list_for_each_entry_safe(entry, tmp, &cache->lru_list, lru_list) + btrfs_lru_cache_remove(cache, entry); + + ASSERT(cache->size == 0); + ASSERT(mtree_empty(&cache->entries)); +} diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h new file mode 100644 index 000000000000..de3e18bce24a --- /dev/null +++ b/fs/btrfs/lru_cache.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_LRU_CACHE_H +#define BTRFS_LRU_CACHE_H + +#include <linux/maple_tree.h> +#include <linux/list.h> + +/* + * A cache entry. This is meant to be embedded in a structure of a user of + * this module. Similar to how struct list_head and struct rb_node are used. + * + * Note: it should be embedded as the first element in a struct (offset 0), and + * this module assumes it was allocated with kmalloc(), so it calls kfree() when + * it needs to free an entry. + */ +struct btrfs_lru_cache_entry { + struct list_head lru_list; + u64 key; + /* + * Optional generation associated to a key. Use 0 if not needed/used. + * Entries with the same key and different generations are stored in a + * linked list, so use this only for cases where there's a small number + * of different generations. + */ + u64 gen; + /* + * The maple tree uses unsigned long type for the keys, which is 32 bits + * on 32 bits systems, and 64 bits on 64 bits systems. So if we want to + * use something like inode numbers as keys, which are always a u64, we + * have to deal with this in a special way - we store the key in the + * entry itself, as a u64, and the values inserted into the maple tree + * are linked lists of entries - so in case we are on a 64 bits system, + * that list always has a single entry, while on 32 bits systems it + * may have more than one, with each entry having the same value for + * their lower 32 bits of the u64 key. 
+ */ + struct list_head list; +}; + +struct btrfs_lru_cache { + struct list_head lru_list; + struct maple_tree entries; + /* Number of entries stored in the cache. */ + unsigned int size; + /* Maximum number of entries the cache can have. */ + unsigned int max_size; +}; + +#define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \ + list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list) + +static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache) +{ + return cache->size; +} + +static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache) +{ + return cache->size >= cache->max_size; +} + +static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( + struct btrfs_lru_cache *cache) +{ + return list_first_entry_or_null(&cache->lru_list, + struct btrfs_lru_cache_entry, lru_list); +} + +void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size); +struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, + u64 key, u64 gen); +int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *new_entry, + gfp_t gfp); +void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *entry); +void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache); + +#endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d5e78cbc8fbc..71f6d8302d50 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -280,7 +280,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } /* Check if we have reached page boundary */ - if (IS_ALIGNED(cur_in, PAGE_SIZE)) { + if (PAGE_ALIGNED(cur_in)) { put_page(page_in); page_in = NULL; } diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 625bbbbb2608..fde5aaa6e7c9 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -293,36 +293,6 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) #endif /* - * We only mark the 
transaction aborted and then set the file system read-only. - * This will prevent new transactions from starting or trying to join this - * one. - * - * This means that error recovery at the call site is limited to freeing - * any local memory allocations and passing the error code up without - * further cleanup. The transaction should complete as it normally would - * in the call path but will return -EIO. - * - * We'll complete the cleanup in btrfs_end_transaction and - * btrfs_commit_transaction. - */ -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - WRITE_ONCE(trans->aborted, errno); - WRITE_ONCE(trans->transaction->aborted, errno); - if (first_hit && errno == -ENOSPC) - btrfs_dump_space_info_for_trans_abort(fs_info); - /* Wake up anybody who may be waiting on this transaction */ - wake_up(&fs_info->transaction_wait); - wake_up(&fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); -} - -/* * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an * alert, and either panics or BUGs, depending on mount options. */ diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 190af1f698d9..8c516ee58ff9 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -6,7 +6,6 @@ #include <linux/types.h> struct btrfs_fs_info; -struct btrfs_trans_handle; static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 
@@ -178,39 +177,6 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function const char * __attribute_const__ btrfs_decode_error(int errno); -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit); - -bool __cold abort_should_print_stack(int errno); - -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact stack trace is reported for some errors. - */ -#define btrfs_abort_transaction(trans, errno) \ -do { \ - bool first = false; \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((trans)->fs_info->fs_state))) { \ - first = true; \ - if (WARN(abort_should_print_stack(errno), \ - KERN_ERR \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno))) { \ - /* Stack trace printed. */ \ - } else { \ - btrfs_err((trans)->fs_info, \ - "Transaction aborted (error %d)", \ - (errno)); \ - } \ - } \ - __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno), first); \ -} while (0) - #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 57d8c72737e1..6c24b69e2d0a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -616,7 +616,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; ordered = container_of(work, struct btrfs_ordered_extent, flush_work); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); complete(&ordered->completion); } @@ -716,13 +716,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, } /* - * Used to start IO or wait for a given ordered extent to finish. + * Start IO and wait for a given ordered extent to finish. 
* - * If wait is one, this effectively waits on page writeback for all the pages - * in the extent, and it waits on the io completion code to insert - * metadata into the btree corresponding to the extent + * Wait on page writeback for all the pages in the extent and the IO completion + * code to insert metadata into the btree corresponding to the extent. */ -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; @@ -744,12 +743,10 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) */ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); - if (wait) { - if (!freespace_inode) - btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); - wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, - &entry->flags)); - } + + if (!freespace_inode) + btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); + wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); } /* @@ -800,7 +797,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); end = ordered->file_offset; /* * If the ordered extent had an error save the error but don't @@ -1061,7 +1058,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, break; } unlock_extent(&inode->io_tree, start, end, cachedp); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 89f82b78f590..eb40cb39f842 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -157,7 +157,6 @@ struct btrfs_ordered_extent { * command in a 
workqueue context */ u64 physical; - struct block_device *bdev; }; static inline void @@ -187,7 +186,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index af97413abcf4..52a7d2fa2284 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1304,7 +1304,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) list_del(&quota_root->dirty_list); btrfs_tree_lock(quota_root->node); - btrfs_clean_tree_block(quota_root->node); + btrfs_clear_buffer_dirty(trans, quota_root->node); btrfs_tree_unlock(quota_root->node); btrfs_free_tree_block(trans, btrfs_root_id(quota_root), quota_root->node, 0, 1); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 6a2cf754912d..642828c1b299 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -998,7 +998,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) } /* - * Return the total numer of errors found in the vertical stripe of @sector_nr. + * Return the total number of errors found in the vertical stripe of @sector_nr. * * @faila and @failb will also be updated to the first and second stripe * number of the errors. @@ -1183,7 +1183,15 @@ not_found: trace_info->stripe_nr = -1; } -/* Generate PQ for one veritical stripe. */ +static inline void bio_list_put(struct bio_list *bio_list) +{ + struct bio *bio; + + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); +} + +/* Generate PQ for one vertical stripe. 
*/ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { void **pointers = rbio->finish_pointers; @@ -1228,7 +1236,6 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { - struct bio *bio; /* The total sector number inside the full stripe. */ int total_sector_nr; int sectornr; @@ -1317,8 +1324,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, return 0; error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); + bio_list_put(bio_list); return -EIO; } @@ -1357,7 +1363,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) } /* - * For subpage case, we can no longer set page Uptodate directly for + * For subpage case, we can no longer set page Up-to-date directly for * stripe_pages[], thus we need to locate the sector. */ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, @@ -1425,13 +1431,20 @@ static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bi int total_sector_nr = get_bio_sector_nr(rbio, bio); u32 bio_size = 0; struct bio_vec *bvec; - struct bvec_iter_all iter_all; + int i; - bio_for_each_segment_all(bvec, bio, iter_all) + bio_for_each_bvec_all(bvec, bio, i) bio_size += bvec->bv_len; - bitmap_set(rbio->error_bitmap, total_sector_nr, - bio_size >> rbio->bioc->fs_info->sectorsize_bits); + /* + * Since we can have multiple bios touching the error_bitmap, we cannot + * call bitmap_set() without protection. + * + * Instead use set_bit() for each bit, as set_bit() itself is atomic. + */ + for (i = total_sector_nr; i < total_sector_nr + + (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++) + set_bit(i, rbio->error_bitmap); } /* Verify the data sectors at read time. 
*/ @@ -1490,7 +1503,7 @@ static void raid_wait_read_end_io(struct bio *bio) wake_up(&rbio->io_wait); } -static void submit_read_bios(struct btrfs_raid_bio *rbio, +static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { struct bio *bio; @@ -1507,41 +1520,8 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio, } submit_bio(bio); } -} - -static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) -{ - struct bio *bio; - int total_sector_nr; - int ret = 0; - ASSERT(bio_list_size(bio_list) == 0); - - /* - * Build a list of bios to read all sectors (including data and P/Q). - * - * This behaviro is to compensate the later csum verification and - * recovery. - */ - for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; - total_sector_nr++) { - struct sector_ptr *sector; - int stripe = total_sector_nr / rbio->stripe_nsectors; - int sectornr = total_sector_nr % rbio->stripe_nsectors; - - sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, bio_list, sector, - stripe, sectornr, REQ_OP_READ); - if (ret) - goto cleanup; - } - return 0; - -cleanup: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - return ret; + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); } static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) @@ -1660,12 +1640,12 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; - int ret = 0; rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { - ret = PTR_ERR(rbio); - goto fail; + bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); + bio_endio(bio); + return; } rbio->operation = BTRFS_RBIO_WRITE; rbio_add_bio(rbio, bio); @@ -1674,31 +1654,24 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) * Don't plug on full rbios, just get them out the door * as quickly as we can */ - if 
(rbio_is_full(rbio)) - goto queue_rbio; - - cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); - if (cb) { - plug = container_of(cb, struct btrfs_plug_cb, cb); - if (!plug->info) { - plug->info = fs_info; - INIT_LIST_HEAD(&plug->rbio_list); + if (!rbio_is_full(rbio)) { + cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); + if (cb) { + plug = container_of(cb, struct btrfs_plug_cb, cb); + if (!plug->info) { + plug->info = fs_info; + INIT_LIST_HEAD(&plug->rbio_list); + } + list_add_tail(&rbio->plug_list, &plug->rbio_list); + return; } - list_add_tail(&rbio->plug_list, &plug->rbio_list); - return; } -queue_rbio: + /* * Either we don't have any existing plug, or we're doing a full stripe, - * can queue the rmw work now. + * queue the rmw work now. */ start_async_work(rbio, rmw_rbio_work); - - return; - -fail: - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); } static int verify_one_sector(struct btrfs_raid_bio *rbio, @@ -1765,7 +1738,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, &failb); /* - * No errors in the veritical stripe, skip it. Can happen for recovery + * No errors in the vertical stripe, skip it. Can happen for recovery * which only part of a stripe failed csum check. 
*/ if (!found_errors) @@ -1886,7 +1859,7 @@ pstripe: sector->uptodate = 1; } if (failb >= 0) { - ret = verify_one_sector(rbio, faila, sector_nr); + ret = verify_one_sector(rbio, failb, sector_nr); if (ret < 0) goto cleanup; @@ -1941,14 +1914,25 @@ out: return ret; } -static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) +static void recover_rbio(struct btrfs_raid_bio *rbio) { - struct bio *bio; + struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; - ASSERT(bio_list_size(bio_list) == 0); + /* + * Either we're doing recover for a read failure or degraded write, + * caller should have set error bitmap correctly. + */ + ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); + + /* For recovery, we need to read all sectors including P/Q. */ + ret = alloc_rbio_pages(rbio); + if (ret < 0) + goto out; + + index_rbio_pages(rbio); + /* * Read everything that hasn't failed. However this time we will * not trust any cached sector. @@ -1979,78 +1963,32 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, } sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, REQ_OP_READ); - if (ret < 0) - goto error; + if (ret < 0) { + bio_list_put(&bio_list); + goto out; + } } - return 0; -error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - - return -EIO; -} - -static int recover_rbio(struct btrfs_raid_bio *rbio) -{ - struct bio_list bio_list; - struct bio *bio; - int ret; - - /* - * Either we're doing recover for a read failure or degraded write, - * caller should have set error bitmap correctly. - */ - ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); - bio_list_init(&bio_list); - - /* For recovery, we need to read all sectors including P/Q. 
*/ - ret = alloc_rbio_pages(rbio); - if (ret < 0) - goto out; - - index_rbio_pages(rbio); - - ret = recover_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto out; - - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + submit_read_wait_bio_list(rbio, &bio_list); ret = recover_sectors(rbio); - out: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void recover_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; - int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = lock_stripe_add(rbio); - if (ret == 0) { - ret = recover_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } + if (!lock_stripe_add(rbio)) + recover_rbio(rbio); } static void recover_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = recover_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + recover_rbio(container_of(work, struct btrfs_raid_bio, work)); } static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) @@ -2196,11 +2134,9 @@ no_csum: static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) { - struct bio_list bio_list; - struct bio *bio; - int ret; - - bio_list_init(&bio_list); + struct bio_list bio_list = BIO_EMPTY_LIST; + int total_sector_nr; + int ret = 0; /* * Fill the data csums we need for data verification. We need to fill @@ -2209,24 +2145,32 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) */ fill_data_csums(rbio); - ret = rmw_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto out; + /* + * Build a list of bios to read all sectors (including data and P/Q). + * + * This behavior is to compensate the later csum verification and recovery. 
+ */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + sector = rbio_stripe_sector(rbio, stripe, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, REQ_OP_READ); + if (ret) { + bio_list_put(&bio_list); + return ret; + } + } /* * We may or may not have any corrupted sectors (including missing dev * and csum mismatch), just let recover_sectors() to handle them all. */ - ret = recover_sectors(rbio); - return ret; -out: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; + submit_read_wait_bio_list(rbio, &bio_list); + return recover_sectors(rbio); } static void raid_wait_write_end_io(struct bio *bio) @@ -2282,7 +2226,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) return false; } -static int rmw_rbio(struct btrfs_raid_bio *rbio) +static void rmw_rbio(struct btrfs_raid_bio *rbio) { struct bio_list bio_list; int sectornr; @@ -2294,30 +2238,28 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) */ ret = alloc_rbio_parity_pages(rbio); if (ret < 0) - return ret; + goto out; /* * Either full stripe write, or we have every data sector already * cached, can go to write path immediately. */ - if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio)) - goto write; - - /* - * Now we're doing sub-stripe write, also need all data stripes to do - * the full RMW. - */ - ret = alloc_rbio_data_pages(rbio); - if (ret < 0) - return ret; + if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { + /* + * Now we're doing sub-stripe write, also need all data stripes + * to do the full RMW. 
+ */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + goto out; - index_rbio_pages(rbio); + index_rbio_pages(rbio); - ret = rmw_read_wait_recover(rbio); - if (ret < 0) - return ret; + ret = rmw_read_wait_recover(rbio); + if (ret < 0) + goto out; + } -write: /* * At this stage we're not allowed to add any new bios to the * bio list any more, anyone else that wants to change this stripe @@ -2348,7 +2290,7 @@ write: bio_list_init(&bio_list); ret = rmw_assemble_write_bios(rbio, &bio_list); if (ret < 0) - return ret; + goto out; /* We should have at least one bio assembled. */ ASSERT(bio_list_size(&bio_list)); @@ -2365,32 +2307,22 @@ write: break; } } - return ret; +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void rmw_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; - int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = lock_stripe_add(rbio); - if (ret == 0) { - ret = rmw_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } + if (lock_stripe_add(rbio) == 0) + rmw_rbio(rbio); } static void rmw_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = rmw_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); } /* @@ -2498,7 +2430,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) struct sector_ptr p_sector = { 0 }; struct sector_ptr q_sector = { 0 }; struct bio_list bio_list; - struct bio *bio; int is_replace = 0; int ret; @@ -2629,8 +2560,7 @@ submit_write: return 0; cleanup: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); + bio_list_put(&bio_list); return ret; } @@ -2725,15 +2655,12 @@ out: return ret; } -static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) +static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) { - struct bio *bio; 
+ struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; - ASSERT(bio_list_size(bio_list) == 0); - /* Build a list of bios to read all the missing parts. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { @@ -2762,45 +2689,38 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, if (sector->uptodate) continue; - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, REQ_OP_READ); - if (ret) - goto error; + if (ret) { + bio_list_put(&bio_list); + return ret; + } } + + submit_read_wait_bio_list(rbio, &bio_list); return 0; -error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - return ret; } -static int scrub_rbio(struct btrfs_raid_bio *rbio) +static void scrub_rbio(struct btrfs_raid_bio *rbio) { bool need_check = false; - struct bio_list bio_list; int sector_nr; int ret; - struct bio *bio; - - bio_list_init(&bio_list); ret = alloc_rbio_essential_pages(rbio); if (ret) - goto cleanup; + goto out; bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); - ret = scrub_assemble_read_bios(rbio, &bio_list); + ret = scrub_assemble_read_bios(rbio); if (ret < 0) - goto cleanup; - - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + goto out; /* We may have some failures, recover the failed sectors first. */ ret = recover_scrub_rbio(rbio); if (ret < 0) - goto cleanup; + goto out; /* * We have every sector properly prepared. 
Can finish the scrub @@ -2817,23 +2737,13 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) break; } } - return ret; - -cleanup: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void scrub_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - ret = scrub_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); } void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 7c73a443939e..df0e0abdeb1f 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -65,7 +65,7 @@ struct btrfs_raid_bio { /* Number of data stripes (no p/q) */ u8 nr_data; - /* Numer of all stripes (including P/Q) */ + /* Number of all stripes (including P/Q) */ u8 real_stripes; /* How many pages there are for each stripe */ @@ -132,7 +132,7 @@ struct btrfs_raid_bio { /* * Checksum buffer if the rbio is for data. The buffer should cover - * all data sectors (exlcuding P/Q sectors). + * all data sectors (excluding P/Q sectors). */ u8 *csum_buf; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 31ec4a7658ce..ef13a9d4e370 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2825,7 +2825,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * * Here we have to manually invalidate the range (i_size, PAGE_END + 1). 
*/ - if (!IS_ALIGNED(i_size, PAGE_SIZE)) { + if (!PAGE_ALIGNED(i_size)) { struct address_space *mapping = inode->vfs_inode.i_mapping; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 52b346795f66..69c93ae333f6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -229,7 +229,7 @@ struct full_stripe_lock { }; #ifndef CONFIG_64BIT -/* This structure is for archtectures whose (void *) is smaller than u64 */ +/* This structure is for architectures whose (void *) is smaller than u64 */ struct scrub_page_private { u64 logical; }; @@ -2053,20 +2053,33 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * a) don't have an extent buffer and * b) the page is already kmapped */ - if (sblock->logical != btrfs_stack_header_bytenr(h)) + if (sblock->logical != btrfs_stack_header_bytenr(h)) { sblock->header_error = 1; - - if (sector->generation != btrfs_stack_header_generation(h)) { - sblock->header_error = 1; - sblock->generation_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad bytenr, has %llu want %llu", + sblock->logical, sblock->mirror_num, + btrfs_stack_header_bytenr(h), + sblock->logical); + goto out; } - if (!scrub_check_fsid(h->fsid, sector)) + if (!scrub_check_fsid(h->fsid, sector)) { sblock->header_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad fsid, has %pU want %pU", + sblock->logical, sblock->mirror_num, + h->fsid, sblock->dev->fs_devices->fsid); + goto out; + } - if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, - BTRFS_UUID_SIZE)) + if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { sblock->header_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", + sblock->logical, sblock->mirror_num, + h->chunk_tree_uuid, fs_info->chunk_tree_uuid); + goto out; + } shash->tfm = fs_info->csum_shash; 
crypto_shash_init(shash); @@ -2079,9 +2092,27 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) } crypto_shash_final(shash, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) + if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) { sblock->checksum_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, + sblock->logical, sblock->mirror_num, + CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), + CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); + goto out; + } + + if (sector->generation != btrfs_stack_header_generation(h)) { + sblock->header_error = 1; + sblock->generation_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad generation, has %llu want %llu", + sblock->logical, sblock->mirror_num, + btrfs_stack_header_generation(h), + sector->generation); + } +out: return sblock->header_error || sblock->checksum_error; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e65e6b6600a7..e5c963bb873d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -32,6 +32,7 @@ #include "file-item.h" #include "ioctl.h" #include "verity.h" +#include "lru_cache.h" /* * Maximum number of references an extent can have in order for us to attempt to @@ -80,23 +81,23 @@ struct clone_root { bool found_ref; }; -#define SEND_CTX_MAX_NAME_CACHE_SIZE 128 -#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) +#define SEND_MAX_NAME_CACHE_SIZE 256 /* - * Limit the root_ids array of struct backref_cache_entry to 12 elements. - * This makes the size of a cache entry to be exactly 128 bytes on x86_64. + * Limit the root_ids array of struct backref_cache_entry to 17 elements. + * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which + * can be satisfied from the kmalloc-192 slab, without wasting any space. 
* The most common case is to have a single root for cloning, which corresponds - * to the send root. Having the user specify more than 11 clone roots is not + * to the send root. Having the user specify more than 16 clone roots is not * common, and in such rare cases we simply don't use caching if the number of - * cloning roots that lead down to a leaf is more than 12. + * cloning roots that lead down to a leaf is more than 17. */ -#define SEND_MAX_BACKREF_CACHE_ROOTS 12 +#define SEND_MAX_BACKREF_CACHE_ROOTS 17 /* * Max number of entries in the cache. - * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding - * maple tree's internal nodes, is 16K. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding + * maple tree's internal nodes, is 24K. */ #define SEND_MAX_BACKREF_CACHE_SIZE 128 @@ -107,15 +108,31 @@ struct clone_root { * x86_64). */ struct backref_cache_entry { - /* List to link to the cache's lru list. */ - struct list_head list; - /* The key for this entry in the cache. */ - u64 key; + struct btrfs_lru_cache_entry entry; u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; /* Number of valid elements in the root_ids array. */ int num_roots; }; +/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ +static_assert(offsetof(struct backref_cache_entry, entry) == 0); + +/* + * Max number of entries in the cache that stores directories that were already + * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses + * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but + * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). + */ +#define SEND_MAX_DIR_CREATED_CACHE_SIZE 64 + +/* + * Max number of entries in the cache that stores directories that were already + * created. 
The cache uses raw struct btrfs_lru_cache_entry entries, so it uses + * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but + * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). + */ +#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64 + struct send_ctx { struct file *send_filp; loff_t send_off; @@ -174,9 +191,7 @@ struct send_ctx { struct list_head new_refs; struct list_head deleted_refs; - struct radix_tree_root name_cache; - struct list_head name_cache_list; - int name_cache_size; + struct btrfs_lru_cache name_cache; /* * The inode we are currently processing. It's not NULL only when we @@ -285,13 +300,11 @@ struct send_ctx { struct rb_root rbtree_new_refs; struct rb_root rbtree_deleted_refs; - struct { - u64 last_reloc_trans; - struct list_head lru_list; - struct maple_tree entries; - /* Number of entries stored in the cache. */ - int size; - } backref_cache; + struct btrfs_lru_cache backref_cache; + u64 backref_cache_last_reloc_trans; + + struct btrfs_lru_cache dir_created_cache; + struct btrfs_lru_cache dir_utimes_cache; }; struct pending_dir_move { @@ -321,21 +334,15 @@ struct orphan_dir_info { u64 ino; u64 gen; u64 last_dir_index_offset; + u64 dir_high_seq_ino; }; struct name_cache_entry { - struct list_head list; /* - * radix_tree has only 32bit entries but we need to handle 64bit inums. - * We use the lower 32bit of the 64bit inum to store it in the tree. If - * more then one inum would fall into the same entry, we use radix_list - * to store the additional entries. radix_list is also used to store - * entries where two entries have the same inum but different - * generations. + * The key in the entry is an inode number, and the generation matches + * the inode's generation. 
*/ - struct list_head radix_list; - u64 ino; - u64 gen; + struct btrfs_lru_cache_entry entry; u64 parent_ino; u64 parent_gen; int ret; @@ -344,6 +351,9 @@ struct name_cache_entry { char name[]; }; +/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ +static_assert(offsetof(struct name_cache_entry, entry) == 0); + #define ADVANCE 1 #define ADVANCE_ONLY_NEXT -1 @@ -956,14 +966,12 @@ out: static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) { int ret; - struct btrfs_inode_info info; + struct btrfs_inode_info info = { 0 }; - if (!gen) - return -EPERM; + ASSERT(gen); ret = get_inode_info(root, ino, &info); - if (!ret) - *gen = info.gen; + *gen = info.gen; return ret; } @@ -1388,19 +1396,6 @@ static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, return 0; } -static void empty_backref_cache(struct send_ctx *sctx) -{ - struct backref_cache_entry *entry; - struct backref_cache_entry *tmp; - - list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list) - kfree(entry); - - INIT_LIST_HEAD(&sctx->backref_cache.lru_list); - mtree_destroy(&sctx->backref_cache.entries); - sctx->backref_cache.size = 0; -} - static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, const u64 **root_ids_ret, int *root_count_ret) { @@ -1408,9 +1403,10 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; + struct btrfs_lru_cache_entry *raw_entry; struct backref_cache_entry *entry; - if (sctx->backref_cache.size == 0) + if (btrfs_lru_cache_size(&sctx->backref_cache) == 0) return false; /* @@ -1424,18 +1420,18 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. 
*/ - if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) { - empty_backref_cache(sctx); + if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) { + btrfs_lru_cache_clear(&sctx->backref_cache); return false; } - entry = mtree_load(&sctx->backref_cache.entries, key); - if (!entry) + raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0); + if (!raw_entry) return false; + entry = container_of(raw_entry, struct backref_cache_entry, entry); *root_ids_ret = entry->root_ids; *root_count_ret = entry->num_roots; - list_move_tail(&entry->list, &sctx->backref_cache.lru_list); return true; } @@ -1461,7 +1457,8 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, if (!new_entry) return; - new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.gen = 0; new_entry->num_roots = 0; ULIST_ITER_INIT(&uiter); while ((node = ulist_next(root_ids, &uiter)) != NULL) { @@ -1489,23 +1486,12 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, * none of the roots is part of the list of roots from which we are * allowed to clone. Cache the new entry as it's still useful to avoid * backref walking to determine which roots have a path to the leaf. + * + * Also use GFP_NOFS because we're called while holding a transaction + * handle or while holding fs_info->commit_root_sem. 
*/ - - if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) { - struct backref_cache_entry *lru_entry; - struct backref_cache_entry *mt_entry; - - lru_entry = list_first_entry(&sctx->backref_cache.lru_list, - struct backref_cache_entry, list); - mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key); - ASSERT(mt_entry == lru_entry); - list_del(&mt_entry->list); - kfree(mt_entry); - sctx->backref_cache.size--; - } - - ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key, - new_entry, GFP_NOFS); + ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry, + GFP_NOFS); ASSERT(ret == 0 || ret == -ENOMEM); if (ret) { /* Caching is optional, no worries. */ @@ -1513,17 +1499,13 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, return; } - list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list); - /* * We are called from iterate_extent_inodes() while either holding a * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. */ - if (sctx->backref_cache.size == 0) - sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans; - - sctx->backref_cache.size++; + if (btrfs_lru_cache_size(&sctx->backref_cache) == 1) + sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; } static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, @@ -1886,7 +1868,8 @@ enum inode_state { inode_state_did_delete, }; -static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) +static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, + u64 *send_gen, u64 *parent_gen) { int ret; int left_ret; @@ -1900,6 +1883,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) goto out; left_ret = (info.nlink == 0) ? -ENOENT : ret; left_gen = info.gen; + if (send_gen) + *send_gen = ((left_ret == -ENOENT) ? 
0 : info.gen); if (!sctx->parent_root) { right_ret = -ENOENT; @@ -1909,6 +1894,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) goto out; right_ret = (info.nlink == 0) ? -ENOENT : ret; right_gen = info.gen; + if (parent_gen) + *parent_gen = ((right_ret == -ENOENT) ? 0 : info.gen); } if (!left_ret && !right_ret) { @@ -1953,14 +1940,15 @@ out: return ret; } -static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) +static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, + u64 *send_gen, u64 *parent_gen) { int ret; if (ino == BTRFS_FIRST_FREE_OBJECTID) return 1; - ret = get_cur_inode_state(sctx, ino, gen); + ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); if (ret < 0) goto out; @@ -2121,43 +2109,36 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, const char *name, int name_len, u64 *who_ino, u64 *who_gen, u64 *who_mode) { - int ret = 0; - u64 gen; + int ret; + u64 parent_root_dir_gen; u64 other_inode = 0; struct btrfs_inode_info info; if (!sctx->parent_root) - goto out; + return 0; - ret = is_inode_existent(sctx, dir, dir_gen); + ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen); if (ret <= 0) - goto out; + return 0; /* * If we have a parent root we need to verify that the parent dir was * not deleted and then re-created, if it was then we have no overwrite * and we can just unlink this entry. + * + * @parent_root_dir_gen was set to 0 if the inode does not exist in the + * parent root. 
*/ - if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_gen(sctx->parent_root, dir, &gen); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } - if (gen != dir_gen) - goto out; - } + if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID && + parent_root_dir_gen != dir_gen) + return 0; ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, &other_inode); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } + if (ret == -ENOENT) + return 0; + else if (ret < 0) + return ret; /* * Check if the overwritten ref was already processed. If yes, the ref @@ -2168,18 +2149,15 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, is_waiting_for_move(sctx, other_inode)) { ret = get_inode_info(sctx->parent_root, other_inode, &info); if (ret < 0) - goto out; + return ret; - ret = 1; *who_ino = other_inode; *who_gen = info.gen; *who_mode = info.mode; - } else { - ret = 0; + return 1; } -out: - return ret; + return 0; } /* @@ -2194,47 +2172,43 @@ static int did_overwrite_ref(struct send_ctx *sctx, u64 ino, u64 ino_gen, const char *name, int name_len) { - int ret = 0; - u64 gen; + int ret; u64 ow_inode; + u64 ow_gen = 0; + u64 send_root_dir_gen; if (!sctx->parent_root) - goto out; + return 0; - ret = is_inode_existent(sctx, dir, dir_gen); + ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL); if (ret <= 0) - goto out; + return ret; - if (dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_gen(sctx->send_root, dir, &gen); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } - if (gen != dir_gen) - goto out; - } + /* + * @send_root_dir_gen was set to 0 if the inode does not exist in the + * send root. 
+ */ + if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen) + return 0; /* check if the ref was overwritten by another ref */ ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, &ow_inode); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { + if (ret == -ENOENT) { /* was never and will never be overwritten */ - ret = 0; - goto out; + return 0; + } else if (ret < 0) { + return ret; } - ret = get_inode_gen(sctx->send_root, ow_inode, &gen); - if (ret < 0) - goto out; + if (ow_inode == ino) { + ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); + if (ret < 0) + return ret; - if (ow_inode == ino && gen == ino_gen) { - ret = 0; - goto out; + /* It's the same inode, so no overwrite happened. */ + if (ow_gen == ino_gen) + return 0; } /* @@ -2243,15 +2217,20 @@ static int did_overwrite_ref(struct send_ctx *sctx, * inode 'ino' to be orphanized, therefore check if ow_inode matches * the current inode being processed. */ - if ((ow_inode < sctx->send_progress) || - (ino != sctx->cur_ino && ow_inode == sctx->cur_ino && - gen == sctx->cur_inode_gen)) - ret = 1; - else - ret = 0; + if (ow_inode < sctx->send_progress) + return 1; -out: - return ret; + if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) { + if (ow_gen == 0) { + ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); + if (ret < 0) + return ret; + } + if (ow_gen == sctx->cur_inode_gen) + return 1; + } + + return 0; } /* @@ -2285,113 +2264,16 @@ out: return ret; } -/* - * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, - * so we need to do some special handling in case we have clashes. This function - * takes care of this with the help of name_cache_entry::radix_list. - * In case of error, nce is kfreed. 
- */ -static int name_cache_insert(struct send_ctx *sctx, - struct name_cache_entry *nce) +static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx, + u64 ino, u64 gen) { - int ret = 0; - struct list_head *nce_head; - - nce_head = radix_tree_lookup(&sctx->name_cache, - (unsigned long)nce->ino); - if (!nce_head) { - nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); - if (!nce_head) { - kfree(nce); - return -ENOMEM; - } - INIT_LIST_HEAD(nce_head); - - ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); - if (ret < 0) { - kfree(nce_head); - kfree(nce); - return ret; - } - } - list_add_tail(&nce->radix_list, nce_head); - list_add_tail(&nce->list, &sctx->name_cache_list); - sctx->name_cache_size++; - - return ret; -} + struct btrfs_lru_cache_entry *entry; -static void name_cache_delete(struct send_ctx *sctx, - struct name_cache_entry *nce) -{ - struct list_head *nce_head; - - nce_head = radix_tree_lookup(&sctx->name_cache, - (unsigned long)nce->ino); - if (!nce_head) { - btrfs_err(sctx->send_root->fs_info, - "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", - nce->ino, sctx->name_cache_size); - } - - list_del(&nce->radix_list); - list_del(&nce->list); - sctx->name_cache_size--; - - /* - * We may not get to the final release of nce_head if the lookup fails - */ - if (nce_head && list_empty(nce_head)) { - radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); - kfree(nce_head); - } -} - -static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, - u64 ino, u64 gen) -{ - struct list_head *nce_head; - struct name_cache_entry *cur; - - nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); - if (!nce_head) + entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen); + if (!entry) return NULL; - list_for_each_entry(cur, nce_head, radix_list) { - if (cur->ino == ino && cur->gen == gen) - return cur; - } - return NULL; -} - -/* - * Remove some entries from the beginning of 
name_cache_list. - */ -static void name_cache_clean_unused(struct send_ctx *sctx) -{ - struct name_cache_entry *nce; - - if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE) - return; - - while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) { - nce = list_entry(sctx->name_cache_list.next, - struct name_cache_entry, list); - name_cache_delete(sctx, nce); - kfree(nce); - } -} - -static void name_cache_free(struct send_ctx *sctx) -{ - struct name_cache_entry *nce; - - while (!list_empty(&sctx->name_cache_list)) { - nce = list_entry(sctx->name_cache_list.next, - struct name_cache_entry, list); - name_cache_delete(sctx, nce); - kfree(nce); - } + return container_of(entry, struct name_cache_entry, entry); } /* @@ -2410,7 +2292,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, { int ret; int nce_ret; - struct name_cache_entry *nce = NULL; + struct name_cache_entry *nce; /* * First check if we already did a call to this function with the same @@ -2420,17 +2302,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, nce = name_cache_search(sctx, ino, gen); if (nce) { if (ino < sctx->send_progress && nce->need_later_update) { - name_cache_delete(sctx, nce); - kfree(nce); + btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry); nce = NULL; } else { - /* - * Removes the entry from the list and adds it back to - * the end. This marks the entry as recently used so - * that name_cache_clean_unused does not remove it. - */ - list_move_tail(&nce->list, &sctx->name_cache_list); - *parent_ino = nce->parent_ino; *parent_gen = nce->parent_gen; ret = fs_path_add(dest, nce->name, nce->name_len); @@ -2446,7 +2320,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, * This should only happen for the parent dir that we determine in * record_new_ref_if_needed(). 
*/ - ret = is_inode_existent(sctx, ino, gen); + ret = is_inode_existent(sctx, ino, gen, NULL, NULL); if (ret < 0) goto out; @@ -2497,8 +2371,8 @@ out_cache: goto out; } - nce->ino = ino; - nce->gen = gen; + nce->entry.key = ino; + nce->entry.gen = gen; nce->parent_ino = *parent_ino; nce->parent_gen = *parent_gen; nce->name_len = fs_path_len(dest); @@ -2510,10 +2384,11 @@ out_cache: else nce->need_later_update = 1; - nce_ret = name_cache_insert(sctx, nce); - if (nce_ret < 0) + nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); + if (nce_ret < 0) { + kfree(nce); ret = nce_ret; - name_cache_clean_unused(sctx); + } out: return ret; @@ -2884,6 +2759,63 @@ out: } /* + * If the cache is full, we can't remove entries from it and do a call to + * send_utimes() for each respective inode, because we might be finishing + * processing an inode that is a directory and it just got renamed, and existing + * entries in the cache may refer to inodes that have the directory in their + * full path - in which case we would generate outdated paths (pre-rename) + * for the inodes that the cache entries point to. Instead of prunning the + * cache when inserting, do it after we finish processing each inode at + * finish_inode_if_needed(). + */ +static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) +{ + struct btrfs_lru_cache_entry *entry; + int ret; + + entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen); + if (entry != NULL) + return 0; + + /* Caching is optional, don't fail if we can't allocate memory. 
*/ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return send_utimes(sctx, dir, gen); + + entry->key = dir; + entry->gen = gen; + + ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL); + ASSERT(ret != -EEXIST); + if (ret) { + kfree(entry); + return send_utimes(sctx, dir, gen); + } + + return 0; +} + +static int trim_dir_utimes_cache(struct send_ctx *sctx) +{ + while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > + SEND_MAX_DIR_UTIMES_CACHE_SIZE) { + struct btrfs_lru_cache_entry *lru; + int ret; + + lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache); + ASSERT(lru != NULL); + + ret = send_utimes(sctx, lru->key, lru->gen); + if (ret) + return ret; + + btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru); + } + + return 0; +} + +/* * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have * a valid path yet because we did not process the refs yet. So, the inode * is created as orphan. @@ -2971,6 +2903,23 @@ out: return ret; } +static void cache_dir_created(struct send_ctx *sctx, u64 dir) +{ + struct btrfs_lru_cache_entry *entry; + int ret; + + /* Caching is optional, ignore any failures. */ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return; + + entry->key = dir; + entry->gen = 0; + ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL); + if (ret < 0) + kfree(entry); +} + /* * We need some special handling for inodes that get processed before the parent * directory got created. See process_recorded_refs for details. 
@@ -2986,6 +2935,9 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) struct btrfs_key di_key; struct btrfs_dir_item *di; + if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0)) + return 1; + path = alloc_path_for_send(); if (!path) return -ENOMEM; @@ -3009,6 +2961,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) if (di_key.type != BTRFS_ROOT_ITEM_KEY && di_key.objectid < sctx->send_progress) { ret = 1; + cache_dir_created(sctx, dir); break; } } @@ -3038,7 +2991,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx) return 0; } - return send_create_inode(sctx, sctx->cur_ino); + ret = send_create_inode(sctx, sctx->cur_ino); + + if (ret == 0 && S_ISDIR(sctx->cur_inode_mode)) + cache_dir_created(sctx, sctx->cur_ino); + + return ret; } struct recorded_ref { @@ -3166,6 +3124,7 @@ static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx, odi->ino = dir_ino; odi->gen = dir_gen; odi->last_dir_index_offset = 0; + odi->dir_high_seq_ino = 0; rb_link_node(&odi->node, parent, p); rb_insert_color(&odi->node, &sctx->orphan_dirs); @@ -3215,8 +3174,7 @@ static void free_orphan_dir_info(struct send_ctx *sctx, * We check this by iterating all dir items and checking if the inode behind * the dir item was already processed. */ -static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, - u64 send_progress) +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen) { int ret = 0; int iter_ret = 0; @@ -3227,6 +3185,8 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, struct btrfs_key loc; struct btrfs_dir_item *di; struct orphan_dir_info *odi = NULL; + u64 dir_high_seq_ino = 0; + u64 last_dir_index_offset = 0; /* * Don't try to rmdir the top/root subvolume dir. 
@@ -3234,17 +3194,62 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (dir == BTRFS_FIRST_FREE_OBJECTID) return 0; + odi = get_orphan_dir_info(sctx, dir, dir_gen); + if (odi && sctx->cur_ino < odi->dir_high_seq_ino) + return 0; + path = alloc_path_for_send(); if (!path) return -ENOMEM; + if (!odi) { + /* + * Find the inode number associated with the last dir index + * entry. This is very likely the inode with the highest number + * of all inodes that have an entry in the directory. We can + * then use it to avoid future calls to can_rmdir(), when + * processing inodes with a lower number, from having to search + * the parent root b+tree for dir index keys. + */ + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + /* Can't happen, the root is never empty. */ + ASSERT(path->slots[0] > 0); + if (WARN_ON(path->slots[0] == 0)) { + ret = -EUCLEAN; + goto out; + } + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) { + /* No index keys, dir can be removed. */ + ret = 1; + goto out; + } + + di = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dir_high_seq_ino = loc.objectid; + if (sctx->cur_ino < dir_high_seq_ino) { + ret = 0; + goto out; + } + + btrfs_release_path(path); + } + key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; - key.offset = 0; - - odi = get_orphan_dir_info(sctx, dir, dir_gen); - if (odi) - key.offset = odi->last_dir_index_offset; + key.offset = (odi ? 
odi->last_dir_index_offset : 0); btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct waiting_dir_move *dm; @@ -3257,29 +3262,18 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid); + last_dir_index_offset = found_key.offset; + dm = get_waiting_dir_move(sctx, loc.objectid); if (dm) { - odi = add_orphan_dir_info(sctx, dir, dir_gen); - if (IS_ERR(odi)) { - ret = PTR_ERR(odi); - goto out; - } - odi->gen = dir_gen; - odi->last_dir_index_offset = found_key.offset; dm->rmdir_ino = dir; dm->rmdir_gen = dir_gen; ret = 0; goto out; } - if (loc.objectid > send_progress) { - odi = add_orphan_dir_info(sctx, dir, dir_gen); - if (IS_ERR(odi)) { - ret = PTR_ERR(odi); - goto out; - } - odi->gen = dir_gen; - odi->last_dir_index_offset = found_key.offset; + if (loc.objectid > sctx->cur_ino) { ret = 0; goto out; } @@ -3294,7 +3288,22 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, out: btrfs_free_path(path); - return ret; + + if (ret) + return ret; + + if (!odi) { + odi = add_orphan_dir_info(sctx, dir, dir_gen); + if (IS_ERR(odi)) + return PTR_ERR(odi); + + odi->gen = dir_gen; + } + + odi->last_dir_index_offset = last_dir_index_offset; + odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino); + + return 0; } static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) @@ -3579,7 +3588,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } gen = odi->gen; - ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino); + ret = can_rmdir(sctx, rmdir_ino, gen); if (ret < 0) goto out; if (!ret) @@ -3599,7 +3608,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } finish: - ret = send_utimes(sctx, pm->ino, pm->gen); + ret = cache_dir_utimes(sctx, pm->ino, pm->gen); if (ret < 0) goto out; @@ -3619,7 +3628,7 @@ finish: if (ret < 0) goto 
out; - ret = send_utimes(sctx, cur->dir, cur->dir_gen); + ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } @@ -4242,7 +4251,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * "testdir_2". */ list_for_each_entry(cur, &sctx->new_refs, list) { - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_will_create) @@ -4288,12 +4297,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * the source path when performing its rename * operation. */ - if (is_waiting_for_move(sctx, ow_inode)) { - wdm = get_waiting_dir_move(sctx, - ow_inode); - ASSERT(wdm); + wdm = get_waiting_dir_move(sctx, ow_inode); + if (wdm) wdm->orphanized = true; - } /* * Make sure we clear our orphanized inode's @@ -4306,10 +4312,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * and get instead the orphan name. */ nce = name_cache_search(sctx, ow_inode, ow_gen); - if (nce) { - name_cache_delete(sctx, nce); - kfree(nce); - } + if (nce) + btrfs_lru_cache_remove(&sctx->name_cache, + &nce->entry); /* * ow_inode might currently be an ancestor of @@ -4358,7 +4363,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * parent directory out of order. But we need to check if this * did already happen before due to other refs in the same dir. 
*/ - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_will_create) { @@ -4388,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) ret = send_create_inode(sctx, cur->dir); if (ret < 0) goto out; + cache_dir_created(sctx, cur->dir); } } @@ -4470,8 +4476,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * later, we do this check again and rmdir it then if possible. * See the use of check_dirs for more details. */ - ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, - sctx->cur_ino); + ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen); if (ret < 0) goto out; if (ret) { @@ -4564,20 +4569,18 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (cur->dir > sctx->cur_ino) continue; - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_did_create || ret == inode_state_no_change) { - /* TODO delayed utimes */ - ret = send_utimes(sctx, cur->dir, cur->dir_gen); + ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } else if (ret == inode_state_did_delete && cur->dir != last_dir_ino_rm) { - ret = can_rmdir(sctx, cur->dir, cur->dir_gen, - sctx->cur_ino); + ret = can_rmdir(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; if (ret) { @@ -5635,7 +5638,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * boundary in the send buffer. This means that there may be a gap * between the beginning of the command and the file data. 
*/ - data_offset = ALIGN(sctx->send_size, PAGE_SIZE); + data_offset = PAGE_ALIGN(sctx->send_size); if (data_offset > sctx->send_max_size || sctx->send_max_size - data_offset < disk_num_bytes) { ret = -EOVERFLOW; @@ -5759,7 +5762,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, sent += size; } - if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { + if (sctx->clean_page_cache && PAGE_ALIGNED(end)) { /* * Always operate only on ranges that are a multiple of the page * size. This is not only to prevent zeroing parts of a page in @@ -6754,12 +6757,26 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) * it's moved/renamed, therefore we don't need to do it here. */ sctx->send_progress = sctx->cur_ino + 1; - ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + + /* + * If the current inode is a non-empty directory, delay issuing + * the utimes command for it, as it's very likely we have inodes + * with an higher number inside it. We want to issue the utimes + * command only after adding all dentries to it. + */ + if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0) + ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + else + ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) goto out; } out: + if (!ret) + ret = trim_dir_utimes_cache(sctx); + return ret; } @@ -8044,6 +8061,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) int clone_sources_to_rollback = 0; size_t alloc_size; int sort_clone_roots = 0; + struct btrfs_lru_cache_entry *entry; + struct btrfs_lru_cache_entry *tmp; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -8073,10 +8092,10 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) /* * Check that we don't overflow at later allocations, we request * clone_sources_count + 1 items, and compare to unsigned long inside - * access_ok. + * access_ok. 
Also set an upper limit for allocation size so this can't + * easily exhaust memory. Max number of clone sources is about 200K. */ - if (arg->clone_sources_count > - ULONG_MAX / sizeof(struct clone_root) - 1) { + if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) { ret = -EINVAL; goto out; } @@ -8094,11 +8113,22 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); - INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); - INIT_LIST_HEAD(&sctx->name_cache_list); - INIT_LIST_HEAD(&sctx->backref_cache.lru_list); - mt_init(&sctx->backref_cache.entries); + btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE); + btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE); + btrfs_lru_cache_init(&sctx->dir_created_cache, + SEND_MAX_DIR_CREATED_CACHE_SIZE); + /* + * This cache is periodically trimmed to a fixed size elsewhere, see + * cache_dir_utimes() and trim_dir_utimes_cache(). 
+ */ + btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0); + + sctx->pending_dir_moves = RB_ROOT; + sctx->waiting_dir_moves = RB_ROOT; + sctx->orphan_dirs = RB_ROOT; + sctx->rbtree_new_refs = RB_ROOT; + sctx->rbtree_deleted_refs = RB_ROOT; sctx->flags = arg->flags; @@ -8165,12 +8195,6 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->pending_dir_moves = RB_ROOT; - sctx->waiting_dir_moves = RB_ROOT; - sctx->orphan_dirs = RB_ROOT; - sctx->rbtree_new_refs = RB_ROOT; - sctx->rbtree_deleted_refs = RB_ROOT; - sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), arg->clone_sources_count + 1, GFP_KERNEL); @@ -8279,6 +8303,13 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) if (ret < 0) goto out; + btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) { + ret = send_utimes(sctx, entry->key, entry->gen); + if (ret < 0) + goto out; + btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry); + } + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { ret = begin_cmd(sctx, BTRFS_SEND_C_END); if (ret < 0) @@ -8358,11 +8389,12 @@ out: kvfree(sctx->send_buf); kvfree(sctx->verity_descriptor); - name_cache_free(sctx); - close_current_inode(sctx); - empty_backref_cache(sctx); + btrfs_lru_cache_clear(&sctx->name_cache); + btrfs_lru_cache_clear(&sctx->backref_cache); + btrfs_lru_cache_clear(&sctx->dir_created_cache); + btrfs_lru_cache_clear(&sctx->dir_utimes_cache); kfree(sctx); } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 433ce221dc5c..581845bc206a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -58,6 +58,7 @@ #include "scrub.h" #include "verity.h" #include "super.h" +#include "extent-tree.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -2049,7 +2050,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) } /* - * Metadata in mixed block goup profiles are accounted in data + * Metadata in mixed block group profiles are accounted in 
data */ if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 45615ce36498..8c5efa5813b3 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -702,7 +702,7 @@ static void release_raid_kobj(struct kobject *kobj) kfree(to_raid_kobj(kobj)); } -static struct kobj_type btrfs_raid_ktype = { +static const struct kobj_type btrfs_raid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = release_raid_kobj, .default_groups = raid_groups, @@ -900,7 +900,7 @@ static void space_info_release(struct kobject *kobj) kfree(sinfo); } -static struct kobj_type space_info_ktype = { +static const struct kobj_type space_info_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = space_info_release, .default_groups = space_info_groups, @@ -1259,7 +1259,7 @@ static void btrfs_release_fsid_kobj(struct kobject *kobj) complete(&fs_devs->kobj_unregister); } -static struct kobj_type btrfs_ktype = { +static const struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = btrfs_release_fsid_kobj, }; @@ -1789,7 +1789,7 @@ static void btrfs_release_devid_kobj(struct kobject *kobj) complete(&device->kobj_unregister); } -static struct kobj_type devid_ktype = { +static const struct kobj_type devid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = devid_groups, .release = btrfs_release_devid_kobj, @@ -2103,7 +2103,7 @@ static void qgroups_release(struct kobject *kobj) kfree(kobj); } -static struct kobj_type qgroups_ktype = { +static const struct kobj_type qgroups_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = qgroups_groups, .release = qgroups_release, @@ -2173,7 +2173,7 @@ static void qgroup_release(struct kobject *kobj) memset(&qgroup->kobj, 0, sizeof(*kobj)); } -static struct kobj_type qgroup_ktype = { +static const struct kobj_type qgroup_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = qgroup_release, .default_groups = qgroup_groups, @@ -2272,36 +2272,23 @@ void 
btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, * Change per-fs features in /sys/fs/btrfs/UUID/features to match current * values in superblock. Call after any changes to incompat/compat_ro flags */ -void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, - u64 bit, enum btrfs_feature_set set) +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devs; struct kobject *fsid_kobj; - u64 __maybe_unused features; - int __maybe_unused ret; + int ret; if (!fs_info) return; - /* - * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not - * safe when called from some contexts (eg. balance) - */ - features = get_features(fs_info, set); - ASSERT(bit & supported_feature_masks[set]); - - fs_devs = fs_info->fs_devices; - fsid_kobj = &fs_devs->fsid_kobj; - + fsid_kobj = &fs_info->fs_devices->fsid_kobj; if (!fsid_kobj->state_initialized) return; - /* - * FIXME: this is too heavy to update just one value, ideally we'd like - * to use sysfs_update_group but some refactoring is needed first. 
- */ - sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); - ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); + ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group); + if (ret < 0) + btrfs_warn(fs_info, + "failed to update /sys/fs/btrfs/%pU/features: %d", + fs_info->fs_devices->fsid, ret); } int __init btrfs_init_sysfs(void) diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index bacef43f7267..86c7eef12873 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -19,8 +19,7 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); -void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, - u64 bit, enum btrfs_feature_set set); +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info); void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); int __init btrfs_init_sysfs(void); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 181469fc0bb3..ca09cf9afce8 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -64,7 +64,7 @@ struct inode *btrfs_new_test_inode(void) BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - inode_init_owner(&init_user_ns, inode, NULL, S_IFREG); + inode_init_owner(&nop_mnt_idmap, inode, NULL, S_IFREG); return inode; } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index c5b3a631bf4f..f2f2e11dac4c 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -509,7 +509,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, goto out_free; } - ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1), + ret = btrfs_rmap_block(fs_info, em->start, 
btrfs_sb_offset(1), &logical, &out_ndaddrs, &out_stripe_len); if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { test_err("didn't rmap anything but expected %d", diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b8c52e89688c..18329ebcb1cb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2464,6 +2464,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&fs_info->transaction_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); + /* If we have features changed, wake up the cleaner to update sysfs. */ + if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) && + fs_info->cleaner_kthread) + wake_up_process(fs_info->cleaner_kthread); + ret = btrfs_write_and_wait_transaction(trans); if (ret) { btrfs_handle_fs_error(fs_info, ret, @@ -2604,6 +2609,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) return (ret < 0) ? 0 : 1; } +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. + * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. 
+ */ +void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + WRITE_ONCE(trans->aborted, errno); + WRITE_ONCE(trans->transaction->aborted, errno); + if (first_hit && errno == -ENOSPC) + btrfs_dump_space_info_for_trans_abort(fs_info); + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); +} + int __init btrfs_transaction_init(void) { btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 97f6c39f59c8..fa728ab80826 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -202,6 +202,34 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } +bool __cold abort_should_print_stack(int errno); + +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact stack trace is reported for some errors. + */ +#define btrfs_abort_transaction(trans, errno) \ +do { \ + bool first = false; \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((trans)->fs_info->fs_state))) { \ + first = true; \ + if (WARN(abort_should_print_stack(errno), \ + KERN_ERR \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno))) { \ + /* Stack trace printed. 
*/ \ + } else { \ + btrfs_debug((trans)->fs_info, \ + "Transaction aborted (error %d)", \ + (errno)); \ + } \ + } \ + __btrfs_abort_transaction((trans), __func__, \ + __LINE__, (errno), first); \ +} while (0) + int btrfs_end_transaction(struct btrfs_trans_handle *trans); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, unsigned int num_items); @@ -236,6 +264,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); +void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit); int __init btrfs_transaction_init(void); void __cold btrfs_transaction_exit(void); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d43261545264..200cea6e49e5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -279,12 +279,6 @@ void btrfs_end_log_trans(struct btrfs_root *root) } } -static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) -{ - filemap_fdatawait_range(buf->pages[0]->mapping, - buf->start, buf->start + buf->len - 1); -} - /* * the walk control struct is used to pass state down the chain when * processing the log tree. 
The stage field tells us which part @@ -2623,11 +2617,12 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return ret; } + btrfs_tree_lock(next); + btrfs_clear_buffer_dirty(trans, next); + wait_on_extent_buffer_writeback(next); + btrfs_tree_unlock(next); + if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); ret = btrfs_pin_reserved_extent(trans, bytenr, blocksize); if (ret) { @@ -2637,8 +2632,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_redirty_list_add( trans->transaction, next); } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); unaccount_log_buffer(fs_info, bytenr); } } @@ -2693,11 +2686,12 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[*level]; + btrfs_tree_lock(next); + btrfs_clear_buffer_dirty(trans, next); + wait_on_extent_buffer_writeback(next); + btrfs_tree_unlock(next); + if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); ret = btrfs_pin_reserved_extent(trans, path->nodes[*level]->start, path->nodes[*level]->len); @@ -2706,9 +2700,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_redirty_list_add(trans->transaction, next); } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); - unaccount_log_buffer(fs_info, path->nodes[*level]->start); } @@ -2776,19 +2767,18 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[orig_level]; + btrfs_tree_lock(next); + btrfs_clear_buffer_dirty(trans, next); + wait_on_extent_buffer_writeback(next); + btrfs_tree_unlock(next); + if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); ret = 
btrfs_pin_reserved_extent(trans, next->start, next->len); if (ret) goto out; btrfs_redirty_list_add(trans->transaction, next); } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); unaccount_log_buffer(fs_info, next->start); } } @@ -3576,17 +3566,19 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, } static int flush_dir_items_batch(struct btrfs_trans_handle *trans, - struct btrfs_root *log, + struct btrfs_inode *inode, struct extent_buffer *src, struct btrfs_path *dst_path, int start_slot, int count) { + struct btrfs_root *log = inode->root->log_root; char *ins_data = NULL; struct btrfs_item_batch batch; struct extent_buffer *dst; unsigned long src_offset; unsigned long dst_offset; + u64 last_index; struct btrfs_key key; u32 item_size; int ret; @@ -3644,6 +3636,18 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); btrfs_release_path(dst_path); + + last_index = batch.keys[count - 1].offset; + ASSERT(last_index > inode->last_dir_index_offset); + + /* + * If for some unexpected reason the last item's index is not greater + * than the last index we logged, warn and force a transaction commit. 
+ */ + if (WARN_ON(last_index <= inode->last_dir_index_offset)) + ret = BTRFS_LOG_FORCE_COMMIT; + else + inode->last_dir_index_offset = last_index; out: kfree(ins_data); @@ -3693,7 +3697,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, } di = btrfs_item_ptr(src, i, struct btrfs_dir_item); - ctx->last_dir_item_offset = key.offset; /* * Skip ranges of items that consist only of dir item keys created @@ -3756,7 +3759,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, if (batch_size > 0) { int ret; - ret = flush_dir_items_batch(trans, log, src, dst_path, + ret = flush_dir_items_batch(trans, inode, src, dst_path, batch_start, batch_size); if (ret < 0) return ret; @@ -3780,7 +3783,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_root *root = inode->root; struct btrfs_root *log = root->log_root; - int err = 0; int ret; u64 last_old_dentry_offset = min_offset - 1; u64 last_offset = (u64)-1; @@ -3821,8 +3823,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, path->slots[0]); if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; - } else if (ret < 0) { - err = ret; + } else if (ret > 0) { + ret = 0; } goto done; @@ -3845,7 +3847,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; } else if (ret < 0) { - err = ret; goto done; } @@ -3867,12 +3868,15 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (ret > 0) + if (ret > 0) { ret = btrfs_next_item(root, path); + if (ret > 0) { + /* There are no more keys in the inode's root. */ + ret = 0; + goto done; + } + } if (ret < 0) - err = ret; - /* If ret is 1, there are no more keys in the inode's root. 
*/ - if (ret != 0) goto done; /* @@ -3883,8 +3887,8 @@ search: ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, &last_old_dentry_offset); if (ret != 0) { - if (ret < 0) - err = ret; + if (ret > 0) + ret = 0; goto done; } path->slots[0] = btrfs_header_nritems(path->nodes[0]); @@ -3895,10 +3899,10 @@ search: */ ret = btrfs_next_leaf(root, path); if (ret) { - if (ret == 1) + if (ret == 1) { last_offset = (u64)-1; - else - err = ret; + ret = 0; + } goto done; } btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); @@ -3929,7 +3933,7 @@ done: btrfs_release_path(path); btrfs_release_path(dst_path); - if (err == 0) { + if (ret == 0) { *last_offset_ret = last_offset; /* * In case the leaf was changed in the current transaction but @@ -3940,15 +3944,13 @@ done: * a range, last_old_dentry_offset is == to last_offset. */ ASSERT(last_old_dentry_offset <= last_offset); - if (last_old_dentry_offset < last_offset) { + if (last_old_dentry_offset < last_offset) ret = insert_dir_log_key(trans, log, path, ino, last_old_dentry_offset + 1, last_offset); - if (ret) - err = ret; - } } - return err; + + return ret; } /* @@ -4044,7 +4046,6 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, min_key = BTRFS_DIR_START_INDEX; max_key = 0; - ctx->last_dir_item_offset = inode->last_dir_index_offset; while (1) { ret = log_dir_items(trans, inode, path, dst_path, @@ -4056,8 +4057,6 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, min_key = max_key + 1; } - inode->last_dir_index_offset = ctx->last_dir_item_offset; - return 0; } @@ -5593,10 +5592,8 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction * commits. 
*/ - if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) { - btrfs_set_log_full_commit(trans); + if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) return BTRFS_LOG_FORCE_COMMIT; - } inode = btrfs_iget(root->fs_info->sb, ino, root); /* @@ -6455,7 +6452,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * result in losing the file after a log replay. */ if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { - btrfs_set_log_full_commit(trans); ret = BTRFS_LOG_FORCE_COMMIT; goto out_unlock; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 85b43075ac58..bdeb5216718f 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -13,8 +13,13 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 -/* We can't use the tree log for whatever reason, force a transaction commit */ -#define BTRFS_LOG_FORCE_COMMIT (1) +/* + * We can't use the tree log for whatever reason, force a transaction commit. + * We use a negative value because there are functions through the logging code + * that need to return an error (< 0 value), false (0) or true (1). Any negative + * value will do, as it will cause the log to be marked for a full sync. + */ +#define BTRFS_LOG_FORCE_COMMIT (-(MAX_ERRNO + 1)) struct btrfs_log_ctx { int log_ret; @@ -24,8 +29,6 @@ struct btrfs_log_ctx { bool logging_new_delayed_dentries; /* Indicate if the inode being logged was logged before. */ bool logged_before; - /* Tracks the last logged dir item/index key offset. */ - u64 last_dir_item_offset; struct inode *inode; struct list_head list; /* Only used for fast fsyncs. */ diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index bf9eb693a6a7..c5ff16f9e9fa 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -783,30 +783,25 @@ again: /* * fsverity op that writes a Merkle tree block into the btree. 
* - * @inode: inode to write a Merkle tree block for - * @buf: Merkle tree data block to write - * @index: index of the block in the Merkle tree - * @log_blocksize: log base 2 of the Merkle tree block size - * - * Note that the block size could be different from the page size, so it is not - * safe to assume that index is a page index. + * @inode: inode to write a Merkle tree block for + * @buf: Merkle tree block to write + * @pos: the position of the block in the Merkle tree (in bytes) + * @size: the Merkle tree block size (in bytes) * * Returns 0 on success or negative error code on failure */ static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf, - u64 index, int log_blocksize) + u64 pos, unsigned int size) { - u64 off = index << log_blocksize; - u64 len = 1ULL << log_blocksize; loff_t merkle_pos = merkle_file_pos(inode); if (merkle_pos < 0) return merkle_pos; - if (merkle_pos > inode->i_sb->s_maxbytes - off - len) + if (merkle_pos > inode->i_sb->s_maxbytes - pos - size) return -EFBIG; return write_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, - off, buf, len); + pos, buf, size); } const struct fsverity_operations btrfs_verityops = { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bcfef75b97da..7823168c08a6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -403,6 +403,7 @@ void btrfs_free_device(struct btrfs_device *device) static void free_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; + WARN_ON(fs_devices->opened); while (!list_empty(&fs_devices->devices)) { device = list_entry(fs_devices->devices.next, @@ -727,7 +728,7 @@ static struct btrfs_fs_devices *find_fsid_reverted_metadata( /* * Handle the case where the scanned device is part of an fs whose last * metadata UUID change reverted it to the original FSID. 
At the same - * time * fs_devices was first created by another constitutent device + * time fs_devices was first created by another constituent device * which didn't fully observe the operation. This results in an * btrfs_fs_devices created with metadata/fsid different AND * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the @@ -1181,9 +1182,22 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&uuid_mutex); close_fs_devices(fs_devices); - if (!fs_devices->opened) + if (!fs_devices->opened) { list_splice_init(&fs_devices->seed_list, &list); + /* + * If the struct btrfs_fs_devices is not assembled with any + * other device, it can be re-initialized during the next mount + * without the needing device-scan step. Therefore, it can be + * fully freed. + */ + if (fs_devices->num_devices == 1) { + list_del(&fs_devices->fs_list); + free_fs_devices(fs_devices); + } + } + + list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { close_fs_devices(fs_devices); list_del(&fs_devices->seed_list); @@ -1600,7 +1614,7 @@ again: if (ret < 0) goto out; - while (1) { + while (search_start < search_end) { l = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(l)) { @@ -1623,6 +1637,9 @@ again: if (key.type != BTRFS_DEV_EXTENT_KEY) goto next; + if (key.offset > search_end) + break; + if (key.offset > search_start) { hole_size = key.offset - search_start; dev_extent_hole_check(device, &search_start, &hole_size, @@ -1683,6 +1700,7 @@ next: else ret = 0; + ASSERT(max_hole_start + max_hole_size <= search_end); out: btrfs_free_path(path); *start = max_hole_start; @@ -6266,91 +6284,42 @@ static bool need_full_stripe(enum btrfs_map_op op) return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); } -/* - * Calculate the geometry of a particular (address, len) tuple. This - * information is used to calculate how big a particular bio can get before it - * straddles a stripe. 
- * - * @fs_info: the filesystem - * @em: mapping containing the logical extent - * @op: type of operation - write or read - * @logical: address that we want to figure out the geometry of - * @io_geom: pointer used to return values - * - * Returns < 0 in case a chunk for the given logical address cannot be found, - * usually shouldn't happen unless @logical is corrupted, 0 otherwise. - */ -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, - enum btrfs_map_op op, u64 logical, - struct btrfs_io_geometry *io_geom) +static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, + u64 offset, u64 *stripe_nr, u64 *stripe_offset, + u64 *full_stripe_start) { - struct map_lookup *map; - u64 len; - u64 offset; - u64 stripe_offset; - u64 stripe_nr; - u32 stripe_len; - u64 raid56_full_stripe_start = (u64)-1; - int data_stripes; + u32 stripe_len = map->stripe_len; ASSERT(op != BTRFS_MAP_DISCARD); - map = em->map_lookup; - /* Offset of this logical address in the chunk */ - offset = logical - em->start; - /* Len of a stripe in a chunk */ - stripe_len = map->stripe_len; /* - * Stripe_nr is where this block falls in - * stripe_offset is the offset of this block in its stripe. + * Stripe_nr is the stripe where this block falls. stripe_offset is + * the offset of this block in its stripe. */ - stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset); - ASSERT(stripe_offset < U32_MAX); + *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset); + ASSERT(*stripe_offset < U32_MAX); - data_stripes = nr_data_stripes(map); + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); - /* Only stripe based profiles needs to check against stripe length. 
*/ - if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { - u64 max_len = stripe_len - stripe_offset; + *full_stripe_start = + div64_u64(offset, full_stripe_len) * full_stripe_len; /* - * In case of raid56, we need to know the stripe aligned start + * For writes to RAID56, allow to write a full stripe set, but + * no straddling of stripe sets. */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = stripe_len * data_stripes; - raid56_full_stripe_start = offset; - - /* - * Allow a write of a full stripe, but make sure we - * don't allow straddling of stripes - */ - raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, - full_stripe_len); - raid56_full_stripe_start *= full_stripe_len; - - /* - * For writes to RAID[56], allow a full stripeset across - * all disks. For other RAID types and for RAID[56] - * reads, just allow a single stripe (on a single disk). - */ - if (op == BTRFS_MAP_WRITE) { - max_len = stripe_len * data_stripes - - (offset - raid56_full_stripe_start); - } - } - len = min_t(u64, em->len - offset, max_len); - } else { - len = em->len - offset; + if (op == BTRFS_MAP_WRITE) + return full_stripe_len - (offset - *full_stripe_start); } - io_geom->len = len; - io_geom->offset = offset; - io_geom->stripe_len = stripe_len; - io_geom->stripe_nr = stripe_nr; - io_geom->stripe_offset = stripe_offset; - io_geom->raid56_stripe_offset = raid56_full_stripe_start; - - return 0; + /* + * For other RAID types and for RAID56 reads, allow a single stripe (on + * a single disk). 
+ */ + if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) + return stripe_len - *stripe_offset; + return U64_MAX; } static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, @@ -6369,6 +6338,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, { struct extent_map *em; struct map_lookup *map; + u64 map_offset; u64 stripe_offset; u64 stripe_nr; u64 stripe_len; @@ -6387,7 +6357,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int patch_the_first_stripe_for_dev_replace = 0; u64 physical_to_patch_in_first_stripe = 0; u64 raid56_full_stripe_start = (u64)-1; - struct btrfs_io_geometry geom; + u64 max_len; ASSERT(bioc_ret); ASSERT(op != BTRFS_MAP_DISCARD); @@ -6395,18 +6365,14 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, em = btrfs_get_chunk_map(fs_info, logical, *length); ASSERT(!IS_ERR(em)); - ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); - if (ret < 0) - return ret; - map = em->map_lookup; - - *length = geom.len; - stripe_len = geom.stripe_len; - stripe_nr = geom.stripe_nr; - stripe_offset = geom.stripe_offset; - raid56_full_stripe_start = geom.raid56_stripe_offset; data_stripes = nr_data_stripes(map); + stripe_len = map->stripe_len; + + map_offset = logical - em->start; + max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, + &stripe_offset, &raid56_full_stripe_start); + *length = min_t(u64, em->len - map_offset, max_len); down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6b7a05f6cf82..7e51f2238f72 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -53,21 +53,6 @@ enum btrfs_raid_types { BTRFS_NR_RAID_TYPES }; -struct btrfs_io_geometry { - /* remaining bytes before crossing a stripe */ - u64 len; - /* offset of logical address in chunk */ - u64 offset; - /* length of single IO stripe */ - u32 stripe_len; - /* offset of 
address in stripe */ - u32 stripe_offset; - /* number of stripe where address falls */ - u64 stripe_nr; - /* offset of raid56 stripe into the chunk */ - u64 raid56_stripe_offset; -}; - /* * Use sequence counter to get consistent device stat data on * 32-bit processors. @@ -545,9 +530,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes); -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, - enum btrfs_map_op op, u64 logical, - struct btrfs_io_geometry *io_geom); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 0ed4b119a7ca..0ebeaf4e81f9 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -370,7 +370,7 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler, } static int btrfs_xattr_handler_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) @@ -383,7 +383,7 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 01a13de11832..da7bb9187b68 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -63,7 +63,7 @@ struct list_head *zlib_alloc_workspace(unsigned int level) workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); - workspace->strm.workspace = 
kvmalloc(workspacesize, GFP_KERNEL); + workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL); workspace->level = level; workspace->buf = NULL; /* diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1f503e8e42d4..f95b2c94d619 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -17,6 +17,7 @@ #include "space-info.h" #include "fs.h" #include "accessors.h" +#include "bio.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -160,7 +161,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, */ static inline u32 sb_zone_number(int shift, int mirror) { - u64 zone; + u64 zone = U64_MAX; ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); switch (mirror) { @@ -220,7 +221,6 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { struct btrfs_zoned_device_info *zinfo = device->zone_info; - u32 zno; int ret; if (!*nr_zones) @@ -235,6 +235,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, /* Check cache */ if (zinfo->zone_cache) { unsigned int i; + u32 zno; ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); zno = pos >> zinfo->zone_size_shift; @@ -274,9 +275,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return -EIO; /* Populate cache */ - if (zinfo->zone_cache) + if (zinfo->zone_cache) { + u32 zno = pos >> zinfo->zone_size_shift; + memcpy(zinfo->zone_cache + zno, zones, sizeof(*zinfo->zone_cache) * *nr_zones); + } return 0; } @@ -417,25 +421,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); - /* - * We limit max_zone_append_size also by max_segments * - * PAGE_SIZE. Technically, we can have multiple pages per segment. 
But, - * since btrfs adds the pages one by one to a bio, and btrfs cannot - * increase the metadata reservation even if it increases the number of - * extents, it is safe to stick with the limit. - * - * With the zoned emulation, we can have non-zoned device on the zoned - * mode. In this case, we don't have a valid max zone append size. So, - * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size. - */ - if (bdev_is_zoned(bdev)) { - zone_info->max_zone_append_size = min_t(u64, - (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, - (u64)bdev_max_segments(bdev) << PAGE_SHIFT); - } else { - zone_info->max_zone_append_size = - (u64)bdev_max_segments(bdev) << PAGE_SHIFT; - } if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; @@ -715,9 +700,9 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) { + struct queue_limits *lim = &fs_info->limits; struct btrfs_device *device; u64 zone_size = 0; - u64 max_zone_append_size = 0; int ret; /* @@ -727,6 +712,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) if (!btrfs_fs_incompat(fs_info, ZONED)) return btrfs_check_for_zoned_device(fs_info); + blk_set_stacking_limits(lim); + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { struct btrfs_zoned_device_info *zone_info = device->zone_info; @@ -741,10 +728,17 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) zone_info->zone_size, zone_size); return -EINVAL; } - if (!max_zone_append_size || - (zone_info->max_zone_append_size && - zone_info->max_zone_append_size < max_zone_append_size)) - max_zone_append_size = zone_info->max_zone_append_size; + + /* + * With the zoned emulation, we can have non-zoned device on the + * zoned mode. In this case, we don't have a valid max zone + * append size. 
+ */ + if (bdev_is_zoned(device->bdev)) { + blk_stack_limits(lim, + &bdev_get_queue(device->bdev)->limits, + 0); + } } /* @@ -765,8 +759,18 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) } fs_info->zone_size = zone_size; - fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, - fs_info->sectorsize); + /* + * Also limit max_zone_append_size by max_segments * PAGE_SIZE. + * Technically, we can have multiple pages per segment. But, since + * we add the pages one by one to a bio, and cannot increase the + * metadata reservation even if it increases the number of extents, it + * is safe to stick with the limit. + */ + fs_info->max_zone_append_size = ALIGN_DOWN( + min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT, + (u64)lim->max_sectors << SECTOR_SHIFT, + (u64)lim->max_segments << PAGE_SHIFT), + fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; if (fs_info->max_zone_append_size < fs_info->max_extent_size) fs_info->max_extent_size = fs_info->max_zone_append_size; @@ -1623,8 +1627,10 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans) spin_unlock(&trans->releasing_ebs_lock); } -bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) +bool btrfs_use_zone_append(struct btrfs_bio *bbio) { + u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); + struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_group *cache; bool ret = false; @@ -1635,6 +1641,9 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) if (!is_data_inode(&inode->vfs_inode)) return false; + if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) + return false; + /* * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the * extent layout the relocation code has. 
@@ -1657,22 +1666,16 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) return ret; } -void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, - struct bio *bio) +void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { + const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; struct btrfs_ordered_extent *ordered; - const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - if (bio_op(bio) != REQ_OP_ZONE_APPEND) - return; - - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset); + ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); if (WARN_ON(!ordered)) return; ordered->physical = physical; - ordered->bdev = bio->bi_bdev; - btrfs_put_ordered_extent(ordered); } @@ -1684,43 +1687,46 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) struct extent_map *em; struct btrfs_ordered_sum *sum; u64 orig_logical = ordered->disk_bytenr; - u64 *logical = NULL; - int nr, stripe_len; + struct map_lookup *map; + u64 physical = ordered->physical; + u64 chunk_start_phys; + u64 logical; - /* Zoned devices should not have partitions. 
So, we can assume it is 0 */ - ASSERT(!bdev_is_partition(ordered->bdev)); - if (WARN_ON(!ordered->bdev)) + em = btrfs_get_chunk_map(fs_info, orig_logical, 1); + if (IS_ERR(em)) return; + map = em->map_lookup; + chunk_start_phys = map->stripes[0].physical; - if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev, - ordered->physical, &logical, &nr, - &stripe_len))) - goto out; - - WARN_ON(nr != 1); + if (WARN_ON_ONCE(map->num_stripes > 1) || + WARN_ON_ONCE((map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) || + WARN_ON_ONCE(physical < chunk_start_phys) || + WARN_ON_ONCE(physical > chunk_start_phys + em->orig_block_len)) { + free_extent_map(em); + return; + } + logical = em->start + (physical - map->stripes[0].physical); + free_extent_map(em); - if (orig_logical == *logical) - goto out; + if (orig_logical == logical) + return; - ordered->disk_bytenr = *logical; + ordered->disk_bytenr = logical; em_tree = &inode->extent_tree; write_lock(&em_tree->lock); em = search_extent_mapping(em_tree, ordered->file_offset, ordered->num_bytes); - em->block_start = *logical; + em->block_start = logical; free_extent_map(em); write_unlock(&em_tree->lock); list_for_each_entry(sum, &ordered->list, list) { - if (*logical < orig_logical) - sum->bytenr -= orig_logical - *logical; + if (logical < orig_logical) + sum->bytenr -= orig_logical - logical; else - sum->bytenr += *logical - orig_logical; + sum->bytenr += logical - orig_logical; } - -out: - kfree(logical); } bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, @@ -1845,26 +1851,6 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); } -struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) -{ - struct btrfs_device *device; - struct extent_map *em; - struct map_lookup *map; - - em = btrfs_get_chunk_map(fs_info, logical, length); - if (IS_ERR(em)) - return ERR_CAST(em); - - 
map = em->map_lookup; - /* We only support single profile for now */ - device = map->stripes[0].dev; - - free_extent_map(em); - - return device; -} - /* * Activate block group and underlying device zones * diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index f43990985d80..c0570d35fea2 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -20,7 +20,6 @@ struct btrfs_zoned_device_info { */ u64 zone_size; u8 zone_size_shift; - u64 max_zone_append_size; u32 nr_zones; unsigned int max_active_zones; atomic_t active_zones_left; @@ -56,9 +55,8 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); void btrfs_free_redirty_list(struct btrfs_transaction *trans); -bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start); -void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, - struct bio *bio); +bool btrfs_use_zone_append(struct btrfs_bio *bbio); +void btrfs_record_physical_zoned(struct btrfs_bio *bbio); void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, @@ -68,8 +66,6 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); -struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, - u64 logical, u64 length); bool btrfs_zone_activate(struct btrfs_block_group *block_group); int btrfs_zone_finish(struct btrfs_block_group *block_group); bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); @@ -185,13 +181,12 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { } static inline void btrfs_free_redirty_list(struct btrfs_transaction 
*trans) { } -static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) +static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio) { return false; } -static inline void btrfs_record_physical_zoned(struct inode *inode, - u64 file_offset, struct bio *bio) +static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { } @@ -224,13 +219,6 @@ static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, return -EOPNOTSUPP; } -static inline struct btrfs_device *btrfs_zoned_get_device( - struct btrfs_fs_info *fs_info, - u64 logical, u64 length) -{ - return ERR_PTR(-EOPNOTSUPP); -} - static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group) { return true; diff --git a/fs/buffer.c b/fs/buffer.c index d9c6d1fbb6dd..623e77d6ef77 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -48,6 +48,7 @@ #include <linux/sched/mm.h> #include <trace/events/block.h> #include <linux/fscrypt.h> +#include <linux/fsverity.h> #include "internal.h" @@ -295,20 +296,53 @@ still_busy: return; } -struct decrypt_bh_ctx { +struct postprocess_bh_ctx { struct work_struct work; struct buffer_head *bh; }; +static void verify_bh(struct work_struct *work) +{ + struct postprocess_bh_ctx *ctx = + container_of(work, struct postprocess_bh_ctx, work); + struct buffer_head *bh = ctx->bh; + bool valid; + + valid = fsverity_verify_blocks(page_folio(bh->b_page), bh->b_size, + bh_offset(bh)); + end_buffer_async_read(bh, valid); + kfree(ctx); +} + +static bool need_fsverity(struct buffer_head *bh) +{ + struct page *page = bh->b_page; + struct inode *inode = page->mapping->host; + + return fsverity_active(inode) && + /* needed by ext4 */ + page->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + static void decrypt_bh(struct work_struct *work) { - struct decrypt_bh_ctx *ctx = - container_of(work, struct decrypt_bh_ctx, work); + struct postprocess_bh_ctx *ctx = + container_of(work, struct postprocess_bh_ctx, work); struct buffer_head *bh = ctx->bh; 
int err; - err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size, - bh_offset(bh)); + err = fscrypt_decrypt_pagecache_blocks(page_folio(bh->b_page), + bh->b_size, bh_offset(bh)); + if (err == 0 && need_fsverity(bh)) { + /* + * We use different work queues for decryption and for verity + * because verity may require reading metadata pages that need + * decryption, and we shouldn't recurse to the same workqueue. + */ + INIT_WORK(&ctx->work, verify_bh); + fsverity_enqueue_verify_work(&ctx->work); + return; + } end_buffer_async_read(bh, err == 0); kfree(ctx); } @@ -319,15 +353,24 @@ static void decrypt_bh(struct work_struct *work) */ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) { - /* Decrypt if needed */ - if (uptodate && - fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) { - struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC); + struct inode *inode = bh->b_page->mapping->host; + bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode); + bool verify = need_fsverity(bh); + + /* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */ + if (uptodate && (decrypt || verify)) { + struct postprocess_bh_ctx *ctx = + kmalloc(sizeof(*ctx), GFP_ATOMIC); if (ctx) { - INIT_WORK(&ctx->work, decrypt_bh); ctx->bh = bh; - fscrypt_enqueue_decrypt_work(&ctx->work); + if (decrypt) { + INIT_WORK(&ctx->work, decrypt_bh); + fscrypt_enqueue_decrypt_work(&ctx->work); + } else { + INIT_WORK(&ctx->work, verify_bh); + fsverity_enqueue_verify_work(&ctx->work); + } return; } uptodate = 0; @@ -2245,6 +2288,11 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) int nr, i; int fully_mapped = 1; bool page_error = false; + loff_t limit = i_size_read(inode); + + /* This is needed for ext4. 
*/ + if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) + limit = inode->i_sb->s_maxbytes; VM_BUG_ON_FOLIO(folio_test_large(folio), folio); @@ -2253,7 +2301,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) bbits = block_size_bits(blocksize); iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits); - lblock = (i_size_read(inode)+blocksize-1) >> bbits; + lblock = (limit+blocksize-1) >> bbits; bh = head; nr = 0; i = 0; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index a69073a1d3f0..40052bdb3365 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -138,7 +138,7 @@ static int cachefiles_adjust_size(struct cachefiles_object *object) newattrs.ia_size = oi_size & PAGE_MASK; ret = cachefiles_inject_remove_error(); if (ret == 0) - ret = notify_change(&init_user_ns, file->f_path.dentry, + ret = notify_change(&nop_mnt_idmap, file->f_path.dentry, &newattrs, NULL); if (ret < 0) goto truncate_failed; @@ -148,7 +148,7 @@ static int cachefiles_adjust_size(struct cachefiles_object *object) newattrs.ia_size = ni_size; ret = cachefiles_inject_write_error(); if (ret == 0) - ret = notify_change(&init_user_ns, file->f_path.dentry, + ret = notify_change(&nop_mnt_idmap, file->f_path.dentry, &newattrs, NULL); truncate_failed: diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 03ca8f2f657a..82219a8f6084 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -130,7 +130,7 @@ retry: goto mkdir_error; ret = cachefiles_inject_write_error(); if (ret == 0) - ret = vfs_mkdir(&init_user_ns, d_inode(dir), subdir, 0700); + ret = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); if (ret < 0) { trace_cachefiles_vfs_error(NULL, d_inode(dir), ret, cachefiles_trace_mkdir_error); @@ -245,7 +245,7 @@ static int cachefiles_unlink(struct cachefiles_cache *cache, ret = cachefiles_inject_remove_error(); if (ret == 0) { - ret = vfs_unlink(&init_user_ns, d_backing_inode(dir), dentry, NULL); + ret = 
vfs_unlink(&nop_mnt_idmap, d_backing_inode(dir), dentry, NULL); if (ret == -EIO) cachefiles_io_error(cache, "Unlink failed"); } @@ -382,10 +382,10 @@ try_again: cachefiles_io_error(cache, "Rename security error %d", ret); } else { struct renamedata rd = { - .old_mnt_userns = &init_user_ns, + .old_mnt_idmap = &nop_mnt_idmap, .old_dir = d_inode(dir), .old_dentry = rep, - .new_mnt_userns = &init_user_ns, + .new_mnt_idmap = &nop_mnt_idmap, .new_dir = d_inode(cache->graveyard), .new_dentry = grave, }; @@ -451,7 +451,7 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) ret = cachefiles_inject_write_error(); if (ret == 0) { - file = vfs_tmpfile_open(&init_user_ns, &parentpath, S_IFREG, + file = vfs_tmpfile_open(&nop_mnt_idmap, &parentpath, S_IFREG, O_RDWR | O_LARGEFILE | O_DIRECT, cache->cache_cred); ret = PTR_ERR_OR_ZERO(file); @@ -714,7 +714,7 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, ret = cachefiles_inject_read_error(); if (ret == 0) - ret = vfs_link(object->file->f_path.dentry, &init_user_ns, + ret = vfs_link(object->file->f_path.dentry, &nop_mnt_idmap, d_inode(fan), dentry, NULL); if (ret < 0) { trace_cachefiles_vfs_error(object, d_inode(fan), ret, diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 00b087c14995..bcb6173943ee 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -65,7 +65,7 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object) ret = cachefiles_inject_write_error(); if (ret == 0) - ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, + ret = vfs_setxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, sizeof(struct cachefiles_xattr) + len, 0); if (ret < 0) { trace_cachefiles_vfs_error(object, file_inode(file), ret, @@ -108,7 +108,7 @@ int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file xlen = cachefiles_inject_read_error(); if (xlen == 0) - xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, buf, 
tlen); + xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, tlen); if (xlen != tlen) { if (xlen < 0) trace_cachefiles_vfs_error(object, file_inode(file), xlen, @@ -150,7 +150,7 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, ret = cachefiles_inject_remove_error(); if (ret == 0) - ret = vfs_removexattr(&init_user_ns, dentry, cachefiles_xattr_cache); + ret = vfs_removexattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache); if (ret < 0) { trace_cachefiles_vfs_error(object, d_inode(dentry), ret, cachefiles_trace_remxattr_error); @@ -207,7 +207,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) ret = cachefiles_inject_write_error(); if (ret == 0) - ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, + ret = vfs_setxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, len, 0); if (ret < 0) { trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret, @@ -249,7 +249,7 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) xlen = cachefiles_inject_read_error(); if (xlen == 0) - xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, buf, len); + xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, len); if (xlen != len) { if (xlen < 0) { trace_cachefiles_vfs_error(NULL, d_inode(dentry), xlen, diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index c7e8dd5b58d4..6945a938d396 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -85,7 +85,7 @@ retry: return acl; } -int ceph_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int ret = 0, size = 0; @@ -105,7 +105,7 @@ int ceph_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - ret = posix_acl_update_mode(&init_user_ns, inode, + ret = posix_acl_update_mode(&nop_mnt_idmap, inode, &new_mode, &acl); if (ret) goto out; 
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8c74871e37c9..cac4083e387a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -305,7 +305,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_osd_request *req; + struct ceph_osd_request *req = NULL; struct ceph_vino vino = ceph_vino(inode); struct iov_iter iter; struct page **pages; @@ -313,6 +313,11 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) int err = 0; u64 len = subreq->len; + if (ceph_inode_is_shutdown(inode)) { + err = -EIO; + goto out; + } + if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) return; @@ -563,6 +568,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p idx %lu\n", page, page->index); + if (ceph_inode_is_shutdown(inode)) + return -EIO; + /* verify this is a writeable snap context */ snapc = page_snap_context(page); if (!snapc) { @@ -1643,7 +1651,7 @@ int ceph_uninline_data(struct file *file) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req = NULL; - struct ceph_cap_flush *prealloc_cf; + struct ceph_cap_flush *prealloc_cf = NULL; struct folio *folio = NULL; u64 inline_version = CEPH_INLINE_NONE; struct page *pages[1]; @@ -1657,6 +1665,11 @@ int ceph_uninline_data(struct file *file) dout("uninline_data %p %llx.%llx inline_version %llu\n", inode, ceph_vinop(inode), inline_version); + if (ceph_inode_is_shutdown(inode)) { + err = -EIO; + goto out; + } + if (inline_version == CEPH_INLINE_NONE) return 0; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f75ad432f375..7cc20772eac9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -9,6 +9,7 @@ #include <linux/wait.h> #include <linux/writeback.h> #include <linux/iversion.h> +#include 
<linux/filelock.h> #include "super.h" #include "mds_client.h" @@ -4078,6 +4079,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, void *p, *end; struct cap_extra_info extra_info = {}; bool queue_trunc; + bool close_sessions = false; dout("handle_caps from mds%d\n", session->s_mds); @@ -4215,9 +4217,13 @@ void ceph_handle_caps(struct ceph_mds_session *session, realm = NULL; if (snaptrace_len) { down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, snaptrace, - snaptrace + snaptrace_len, - false, &realm); + if (ceph_update_snap_trace(mdsc, snaptrace, + snaptrace + snaptrace_len, + false, &realm)) { + up_write(&mdsc->snap_rwsem); + close_sessions = true; + goto done; + } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); @@ -4277,6 +4283,11 @@ done_unlocked: iput(inode); out: ceph_put_string(extra_info.pool_ns); + + /* Defer closing the sessions after s_mutex lock being released */ + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); + return; flush_cap_releases: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 6c7026cc8988..0ced8b570e42 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -845,7 +845,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) return PTR_ERR(result); } -static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); @@ -905,13 +905,13 @@ out: return err; } -static int ceph_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ceph_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return ceph_mknod(mnt_userns, dir, dentry, mode, 0); + return ceph_mknod(idmap, dir, dentry, mode, 0); } -static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ceph_symlink(struct mnt_idmap *idmap, struct 
inode *dir, struct dentry *dentry, const char *dest) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); @@ -970,7 +970,7 @@ out: return err; } -static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); @@ -1269,7 +1269,7 @@ out: return err; } -static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 764598e1efd9..b5cff85925a1 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -2011,6 +2011,9 @@ static int ceph_zero_partial_object(struct inode *inode, loff_t zero = 0; int op; + if (ceph_inode_is_shutdown(inode)) + return -EIO; + if (!length) { op = offset ? 
CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; length = &zero; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 23d05ec87fcc..8e5f41d45283 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2227,7 +2227,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) /* * setattr */ -int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -2240,7 +2240,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (ceph_inode_is_shutdown(inode)) return -ESTALE; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err != 0) return err; @@ -2255,7 +2255,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, err = __ceph_setattr(inode, attr); if (err >= 0 && (attr->ia_valid & ATTR_MODE)) - err = posix_acl_chmod(&init_user_ns, dentry, attr->ia_mode); + err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode); return err; } @@ -2397,7 +2397,7 @@ out: * Check inode permissions. We verify we have a valid value for * the AUTH cap, then call the generic handler. 
*/ -int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ceph_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int err; @@ -2408,7 +2408,7 @@ int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode, err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false); if (!err) - err = generic_permission(&init_user_ns, inode, mask); + err = generic_permission(&nop_mnt_idmap, inode, mask); return err; } @@ -2417,10 +2417,10 @@ static int statx_to_caps(u32 want, umode_t mode) { int mask = 0; - if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME)) + if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_AUTH_SHARED; - if (want & (STATX_NLINK|STATX_CTIME)) { + if (want & (STATX_NLINK|STATX_CTIME|STATX_CHANGE_COOKIE)) { /* * The link count for directories depends on inode->i_subdirs, * and that is only updated when Fs caps are held. @@ -2431,11 +2431,10 @@ static int statx_to_caps(u32 want, umode_t mode) mask |= CEPH_CAP_LINK_SHARED; } - if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE| - STATX_BLOCKS)) + if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_FILE_SHARED; - if (want & (STATX_CTIME)) + if (want & (STATX_CTIME|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_XATTR_SHARED; return mask; @@ -2445,7 +2444,7 @@ static int statx_to_caps(u32 want, umode_t mode) * Get all the attributes. If we have sufficient caps for the requested attrs, * then we can avoid talking to the MDS at all. 
*/ -int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); @@ -2466,7 +2465,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, return err; } - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->ino = ceph_present_inode(inode); /* @@ -2478,6 +2477,11 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, valid_mask |= STATX_BTIME; } + if (request_mask & STATX_CHANGE_COOKIE) { + stat->change_cookie = inode_peek_iversion_raw(inode); + valid_mask |= STATX_CHANGE_COOKIE; + } + if (ceph_snap(inode) == CEPH_NOSNAP) stat->dev = sb->s_dev; else @@ -2519,6 +2523,8 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->nlink = 1 + 1 + ci->i_subdirs; } + stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; stat->result_mask = request_mask & valid_mask; return err; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 9c8dc8a55e7e..cb51c7e9c8e2 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -7,6 +7,7 @@ #include "super.h" #include "mds_client.h" +#include <linux/filelock.h> #include <linux/ceph/pagelist.h> static u64 lock_secret; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 26a0a8b9975e..27a245d959c0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -806,6 +806,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s; + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) + return ERR_PTR(-EIO); + if (mds >= mdsc->mdsmap->possible_max_rank) return ERR_PTR(-EINVAL); @@ -1478,6 +1481,9 @@ static int __open_session(struct ceph_mds_client *mdsc, int mstate; int mds = session->s_mds; + if 
(READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) + return -EIO; + /* wait for mds to go active? */ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); dout("open_session to mds%d (%s)\n", mds, @@ -2860,6 +2866,11 @@ static void __do_request(struct ceph_mds_client *mdsc, return; } + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) { + dout("do_request metadata corrupted\n"); + err = -EIO; + goto finish; + } if (req->r_timeout && time_after_eq(jiffies, req->r_started + req->r_timeout)) { dout("do_request timed out\n"); @@ -3245,6 +3256,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) u64 tid; int err, result; int mds = session->s_mds; + bool close_sessions = false; if (msg->front.iov_len < sizeof(*head)) { pr_err("mdsc_handle_reply got corrupt (short) reply\n"); @@ -3351,10 +3363,17 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) realm = NULL; if (rinfo->snapblob_len) { down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, rinfo->snapblob, + err = ceph_update_snap_trace(mdsc, rinfo->snapblob, rinfo->snapblob + rinfo->snapblob_len, le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP, &realm); + if (err) { + up_write(&mdsc->snap_rwsem); + close_sessions = true; + if (err == -EIO) + ceph_msg_dump(msg); + goto out_err; + } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); @@ -3412,6 +3431,10 @@ out_err: req->r_end_latency, err); out: ceph_mdsc_put_request(req); + + /* Defer closing the sessions after s_mutex lock being released */ + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); return; } @@ -3662,6 +3685,12 @@ static void handle_session(struct ceph_mds_session *session, break; case CEPH_SESSION_FLUSHMSG: + /* flush cap releases */ + spin_lock(&session->s_cap_lock); + if (session->s_num_cap_releases) + ceph_flush_cap_releases(mdsc, session); + spin_unlock(&session->s_cap_lock); + send_flushmsg_ack(mdsc, session, seq); break; @@ -5011,7 +5040,7 @@ 
static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) } /* - * called after sb is ro. + * called after sb is ro or when metadata corrupted. */ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { @@ -5301,7 +5330,8 @@ static void mds_peer_reset(struct ceph_connection *con) struct ceph_mds_client *mdsc = s->s_mdsc; pr_warn("mds%d closed our session\n", s->s_mds); - send_mds_reconnect(mdsc, s); + if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO) + send_mds_reconnect(mdsc, s); } static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index e4151852184e..87007203f130 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> +#include <linux/fs.h> #include <linux/sort.h> #include <linux/slab.h> #include <linux/iversion.h> @@ -766,8 +767,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm; struct ceph_snap_realm *first_realm = NULL; struct ceph_snap_realm *realm_to_rebuild = NULL; + struct ceph_client *client = mdsc->fsc->client; int rebuild_snapcs; int err = -ENOMEM; + int ret; LIST_HEAD(dirty_realms); lockdep_assert_held_write(&mdsc->snap_rwsem); @@ -884,6 +887,27 @@ fail: if (first_realm) ceph_put_snap_realm(mdsc, first_realm); pr_err("%s error %d\n", __func__, err); + + /* + * When receiving a corrupted snap trace we don't know what + * exactly has happened in MDS side. And we shouldn't continue + * writing to OSD, which may corrupt the snapshot contents. + * + * Just try to blocklist this kclient and then this kclient + * must be remounted to continue after the corrupted metadata + * fixed in the MDS side. 
+ */ + WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO); + ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); + if (ret) + pr_err("%s failed to blocklist %s: %d\n", __func__, + ceph_pr_addr(&client->msgr.inst.addr), ret); + + WARN(1, "%s: %s%sdo remount to continue%s", + __func__, ret ? "" : ceph_pr_addr(&client->msgr.inst.addr), + ret ? "" : " was blocklisted, ", + err == -EIO ? " after corrupted snaptrace is fixed" : ""); + return err; } @@ -984,6 +1008,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, __le64 *split_inos = NULL, *split_realms = NULL; int i; int locked_rwsem = 0; + bool close_sessions = false; /* decode */ if (msg->front.iov_len < sizeof(*h)) @@ -1092,8 +1117,12 @@ skip_inode: * update using the provided snap trace. if we are deleting a * snap, we can avoid queueing cap_snaps. */ - ceph_update_snap_trace(mdsc, p, e, - op == CEPH_SNAP_OP_DESTROY, NULL); + if (ceph_update_snap_trace(mdsc, p, e, + op == CEPH_SNAP_OP_DESTROY, + NULL)) { + close_sessions = true; + goto bad; + } if (op == CEPH_SNAP_OP_SPLIT) /* we took a reference when we created the realm, above */ @@ -1112,6 +1141,9 @@ bad: out: if (locked_rwsem) up_write(&mdsc->snap_rwsem); + + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); return; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0ed3be75bb9a..6ecca2c6d137 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -100,6 +100,17 @@ struct ceph_mount_options { char *mon_addr; }; +/* mount state */ +enum { + CEPH_MOUNT_MOUNTING, + CEPH_MOUNT_MOUNTED, + CEPH_MOUNT_UNMOUNTING, + CEPH_MOUNT_UNMOUNTED, + CEPH_MOUNT_SHUTDOWN, + CEPH_MOUNT_RECOVER, + CEPH_MOUNT_FENCE_IO, +}; + #define CEPH_ASYNC_CREATE_CONFLICT_BITS 8 struct ceph_fs_client { @@ -1039,12 +1050,12 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) { return __ceph_do_getattr(inode, NULL, mask, force); } -extern int ceph_permission(struct user_namespace *mnt_userns, +extern int ceph_permission(struct 
mnt_idmap *idmap, struct inode *inode, int mask); extern int __ceph_setattr(struct inode *inode, struct iattr *attr); -extern int ceph_setattr(struct user_namespace *mnt_userns, +extern int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -extern int ceph_getattr(struct user_namespace *mnt_userns, +extern int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); void ceph_inode_shutdown(struct inode *inode); @@ -1117,7 +1128,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx); #ifdef CONFIG_CEPH_FS_POSIX_ACL struct posix_acl *ceph_get_acl(struct inode *, int, bool); -int ceph_set_acl(struct user_namespace *mnt_userns, +int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int ceph_pre_init_acls(struct inode *dir, umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f31350cda960..f65b07cc33a2 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1285,7 +1285,7 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler, } static int ceph_set_xattr_handler(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index bbf58c2439da..9a2d390bd06f 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1674,7 +1674,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, return rc; } -struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *cifs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { #if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX) @@ -1738,7 +1738,7 @@ out: #endif } -int cifs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, 
+int cifs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { #if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 10e00c624922..cb7c5460a80b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/mount.h> #include <linux/slab.h> #include <linux/init.h> @@ -345,7 +346,7 @@ static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len) return -EOPNOTSUPP; } -static int cifs_permission(struct user_namespace *mnt_userns, +static int cifs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct cifs_sb_info *cifs_sb; @@ -361,7 +362,7 @@ static int cifs_permission(struct user_namespace *mnt_userns, on the client (above and beyond ACL on servers) for servers which do not support setting and viewing mode bits, so allowing client to check permissions is useful */ - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } static struct kmem_cache *cifs_inode_cachep; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 63a0ac2b9355..b58cd737b21e 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -49,7 +49,7 @@ extern void cifs_sb_deactive(struct super_block *sb); /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; extern struct inode *cifs_root_iget(struct super_block *); -extern int cifs_create(struct user_namespace *, struct inode *, +extern int cifs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool excl); extern int cifs_atomic_open(struct inode *, struct dentry *, struct file *, unsigned, umode_t); @@ -57,12 +57,12 @@ extern struct dentry *cifs_lookup(struct inode *, struct dentry *, unsigned int); extern int cifs_unlink(struct inode *dir, struct dentry *dentry); extern int 
cifs_hardlink(struct dentry *, struct inode *, struct dentry *); -extern int cifs_mknod(struct user_namespace *, struct inode *, struct dentry *, +extern int cifs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); -extern int cifs_mkdir(struct user_namespace *, struct inode *, struct dentry *, +extern int cifs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); extern int cifs_rmdir(struct inode *, struct dentry *); -extern int cifs_rename2(struct user_namespace *, struct inode *, +extern int cifs_rename2(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); extern int cifs_revalidate_file_attr(struct file *filp); @@ -72,9 +72,9 @@ extern int cifs_revalidate_dentry(struct dentry *); extern int cifs_invalidate_mapping(struct inode *inode); extern int cifs_revalidate_mapping(struct inode *inode); extern int cifs_zap_mapping(struct inode *inode); -extern int cifs_getattr(struct user_namespace *, const struct path *, +extern int cifs_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); -extern int cifs_setattr(struct user_namespace *, struct dentry *, +extern int cifs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern int cifs_fiemap(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); @@ -124,7 +124,7 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path); /* Functions related to symlinks */ extern const char *cifs_get_link(struct dentry *, struct inode *, struct delayed_call *); -extern int cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, +extern int cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, const char *symname); #ifdef CONFIG_CIFS_XATTR diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index cfdd5bf701a1..cd8171a1c9a0 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -26,6 +26,7 @@ #include <uapi/linux/cifs/cifs_mount.h> #include 
"../smbfs_common/smb2pdu.h" #include "smb2pdu.h" +#include <linux/filelock.h> #define SMB_PATH_MAX 260 #define CIFS_PORT 445 diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 1207b39686fb..b8a47704a6ef 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -225,9 +225,9 @@ extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *, u32); extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *, const struct cifs_fid *, u32 *, u32); -extern struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns, +extern struct posix_acl *cifs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type); -extern int cifs_set_acl(struct user_namespace *mnt_userns, +extern int cifs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, const char *, int); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 23f10e0d6e7e..60dd4e37030a 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -15,6 +15,7 @@ /* want to reuse a stale file handle and only the caller knows the file info */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/kernel.h> #include <linux/vfs.h> #include <linux/slab.h> diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index ad4208bf1e32..2b6076324ffc 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -529,7 +529,7 @@ out_free_xid: return rc; } -int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, +int cifs_create(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, umode_t mode, bool excl) { int rc; @@ -579,7 +579,7 @@ out_free_xid: return rc; } -int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode, +int cifs_mknod(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, umode_t mode, dev_t device_number) { int rc = -EPERM; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 22dfc1f8b4f1..2870e3b6ffe8 100644 
--- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -9,6 +9,7 @@ * */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/backing-dev.h> #include <linux/stat.h> #include <linux/fcntl.h> @@ -3889,7 +3890,7 @@ uncached_fill_pages(struct TCP_Server_Info *server, rdata->got_bytes += result; } - return rdata->got_bytes > 0 && result != -ECONNABORTED ? + return result != -ECONNABORTED && rdata->got_bytes > 0 ? rdata->got_bytes : result; } @@ -4665,7 +4666,7 @@ readpages_fill_pages(struct TCP_Server_Info *server, rdata->got_bytes += result; } - return rdata->got_bytes > 0 && result != -ECONNABORTED ? + return result != -ECONNABORTED && rdata->got_bytes > 0 ? rdata->got_bytes : result; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index f145a59af89b..11cdc7cfe0ba 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1910,7 +1910,7 @@ posix_mkdir_get_info: } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ -int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode, +int cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, umode_t mode) { int rc = 0; @@ -2138,7 +2138,7 @@ do_rename_exit: } int -cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, +cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, struct dentry *source_dentry, struct inode *target_dir, struct dentry *target_dentry, unsigned int flags) { @@ -2496,7 +2496,7 @@ int cifs_revalidate_dentry(struct dentry *dentry) return cifs_revalidate_mapping(inode); } -int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int cifs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; @@ -2537,7 +2537,7 @@ int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path, return rc; } - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->blksize = 
cifs_sb->ctx->bsize; stat->ino = CIFS_I(inode)->uniqueid; @@ -2752,7 +2752,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; - rc = setattr_prepare(&init_user_ns, direntry, attrs); + rc = setattr_prepare(&nop_mnt_idmap, direntry, attrs); if (rc < 0) goto out; @@ -2859,7 +2859,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); } - setattr_copy(&init_user_ns, inode, attrs); + setattr_copy(&nop_mnt_idmap, inode, attrs); mark_inode_dirty(inode); /* force revalidate when any of these times are set since some @@ -2903,7 +2903,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; - rc = setattr_prepare(&init_user_ns, direntry, attrs); + rc = setattr_prepare(&nop_mnt_idmap, direntry, attrs); if (rc < 0) goto cifs_setattr_exit; @@ -3058,7 +3058,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); } - setattr_copy(&init_user_ns, inode, attrs); + setattr_copy(&nop_mnt_idmap, inode, attrs); mark_inode_dirty(inode); cifs_setattr_exit: @@ -3068,7 +3068,7 @@ cifs_setattr_exit: } int -cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry, +cifs_setattr(struct mnt_idmap *idmap, struct dentry *direntry, struct iattr *attrs) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index a5a097a69983..4510dea77be3 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -569,7 +569,7 @@ cifs_hl_exit: } int -cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, +cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, const char *symname) { int rc = -EOPNOTSUPP; diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 
ba6cc50af390..9f1dd04b555a 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -7,6 +7,7 @@ * */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/stat.h> #include <linux/slab.h> #include <linux/pagemap.h> diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 5f2fb2fd2e37..50e762fa1a14 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -89,7 +89,7 @@ static int cifs_creation_time_set(unsigned int xid, struct cifs_tcon *pTcon, } static int cifs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index 9be281bbcc06..dd6277d87afb 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -46,12 +46,12 @@ extern const struct file_operations coda_ioctl_operations; /* operations shared over more than one file */ int coda_open(struct inode *i, struct file *f); int coda_release(struct inode *i, struct file *f); -int coda_permission(struct user_namespace *mnt_userns, struct inode *inode, +int coda_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int coda_revalidate_inode(struct inode *); -int coda_getattr(struct user_namespace *, const struct path *, struct kstat *, +int coda_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); -int coda_setattr(struct user_namespace *, struct dentry *, struct iattr *); +int coda_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); /* this file: helpers */ char *coda_f2s(struct CodaFid *f); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 328d7a684b63..8450b1bd354b 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -73,7 +73,7 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsig } -int coda_permission(struct user_namespace *mnt_userns, struct inode *inode, +int coda_permission(struct mnt_idmap *idmap, 
struct inode *inode, int mask) { int error; @@ -133,7 +133,7 @@ static inline void coda_dir_drop_nlink(struct inode *dir) } /* creation routines: create, mknod, mkdir, link, symlink */ -static int coda_create(struct user_namespace *mnt_userns, struct inode *dir, +static int coda_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *de, umode_t mode, bool excl) { int error; @@ -166,7 +166,7 @@ err_out: return error; } -static int coda_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *de, umode_t mode) { struct inode *inode; @@ -228,7 +228,7 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode, } -static int coda_symlink(struct user_namespace *mnt_userns, +static int coda_symlink(struct mnt_idmap *idmap, struct inode *dir_inode, struct dentry *de, const char *symname) { @@ -295,7 +295,7 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) } /* rename */ -static int coda_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int coda_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 2185328b65c7..d661e6cf17ac 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -251,16 +251,16 @@ static void coda_evict_inode(struct inode *inode) coda_cache_clear_inode(inode); } -int coda_getattr(struct user_namespace *mnt_userns, const struct path *path, +int coda_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { int err = coda_revalidate_inode(d_inode(path->dentry)); if (!err) - generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); return err; } -int coda_setattr(struct user_namespace *mnt_userns, struct dentry *de, +int coda_setattr(struct 
mnt_idmap *idmap, struct dentry *de, struct iattr *iattr) { struct inode *inode = d_inode(de); diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index cb9fd59a688c..36e35c15561a 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -24,7 +24,7 @@ #include "coda_linux.h" /* pioctl ops */ -static int coda_ioctl_permission(struct user_namespace *mnt_userns, +static int coda_ioctl_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); static long coda_pioctl(struct file *filp, unsigned int cmd, unsigned long user_data); @@ -41,7 +41,7 @@ const struct file_operations coda_ioctl_operations = { }; /* the coda pioctl inode ops */ -static int coda_ioctl_permission(struct user_namespace *mnt_userns, +static int coda_ioctl_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return (mask & MAY_EXEC) ? -EACCES : 0; diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index c0395363eab9..e710a1782382 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -77,7 +77,7 @@ extern void configfs_hash_and_remove(struct dentry * dir, const char * name); extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); -extern int configfs_setattr(struct user_namespace *mnt_userns, +extern int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); extern struct dentry *configfs_pin_fs(void); @@ -91,7 +91,7 @@ extern const struct inode_operations configfs_root_inode_operations; extern const struct inode_operations configfs_symlink_inode_operations; extern const struct dentry_operations configfs_dentry_ops; -extern int configfs_symlink(struct user_namespace *mnt_userns, +extern int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname); extern int configfs_unlink(struct inode *dir, struct dentry *dentry); diff --git 
a/fs/configfs/dir.c b/fs/configfs/dir.c index ec6519e1ca3b..4afcbbe63e68 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1251,7 +1251,7 @@ out_root_unlock: } EXPORT_SYMBOL(configfs_depend_item_unlocked); -static int configfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int configfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int ret = 0; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index b601610e9907..1c15edbe70ff 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -32,7 +32,7 @@ static const struct inode_operations configfs_inode_operations ={ .setattr = configfs_setattr, }; -int configfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode * inode = d_inode(dentry); @@ -60,7 +60,7 @@ int configfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } /* attributes were changed atleast once in past */ - error = simple_setattr(mnt_userns, dentry, iattr); + error = simple_setattr(idmap, dentry, iattr); if (error) return error; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 0623c3edcfb9..69133ec1fac2 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -137,7 +137,7 @@ static int get_target(const char *symname, struct path *path, } -int configfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int ret; @@ -196,7 +196,7 @@ int configfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (dentry->d_inode || d_unhashed(dentry)) ret = -EEXIST; else - ret = inode_permission(&init_user_ns, dir, + ret = inode_permission(&nop_mnt_idmap, dir, MAY_WRITE | MAY_EXEC); if (!ret) ret = type->ct_item_ops->allow_link(parent_item, target_item); diff --git a/fs/coredump.c b/fs/coredump.c 
index de78bde2991b..68619329ec65 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -644,7 +644,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) goto close_fail; } } else { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct inode *inode; int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW | O_LARGEFILE | O_EXCL; @@ -722,8 +722,8 @@ void do_coredump(const kernel_siginfo_t *siginfo) * a process dumps core while its cwd is e.g. on a vfat * filesystem. */ - mnt_userns = file_mnt_user_ns(cprm.file); - if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), + idmap = file_mnt_idmap(cprm.file); + if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) { pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n", cn.corename); @@ -736,7 +736,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) } if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) goto close_fail; - if (do_truncate(mnt_userns, cprm.file->f_path.dentry, + if (do_truncate(idmap, cprm.file->f_path.dentry, 0, 0, cprm.file)) goto close_fail; } @@ -838,6 +838,30 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr) } } +int dump_emit(struct coredump_params *cprm, const void *addr, int nr) +{ + if (cprm->to_skip) { + if (!__dump_skip(cprm, cprm->to_skip)) + return 0; + cprm->to_skip = 0; + } + return __dump_emit(cprm, addr, nr); +} +EXPORT_SYMBOL(dump_emit); + +void dump_skip_to(struct coredump_params *cprm, unsigned long pos) +{ + cprm->to_skip = pos - cprm->pos; +} +EXPORT_SYMBOL(dump_skip_to); + +void dump_skip(struct coredump_params *cprm, size_t nr) +{ + cprm->to_skip += nr; +} +EXPORT_SYMBOL(dump_skip); + +#ifdef CONFIG_ELF_CORE static int dump_emit_page(struct coredump_params *cprm, struct page *page) { struct bio_vec bvec = { @@ -871,30 +895,6 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) return 1; } -int dump_emit(struct coredump_params *cprm, const void *addr, int nr) -{ - if (cprm->to_skip) { - if 
(!__dump_skip(cprm, cprm->to_skip)) - return 0; - cprm->to_skip = 0; - } - return __dump_emit(cprm, addr, nr); -} -EXPORT_SYMBOL(dump_emit); - -void dump_skip_to(struct coredump_params *cprm, unsigned long pos) -{ - cprm->to_skip = pos - cprm->pos; -} -EXPORT_SYMBOL(dump_skip_to); - -void dump_skip(struct coredump_params *cprm, size_t nr) -{ - cprm->to_skip += nr; -} -EXPORT_SYMBOL(dump_skip); - -#ifdef CONFIG_ELF_CORE int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len) { diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 1b4403136d05..d57d0a020f71 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -30,13 +30,11 @@ */ bool fscrypt_decrypt_bio(struct bio *bio) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; - int err = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len, - bv->bv_offset); + bio_for_each_folio_all(fi, bio) { + int err = fscrypt_decrypt_pagecache_blocks(fi.folio, fi.length, + fi.offset); if (err) { bio->bi_status = errno_to_blk_status(err); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index e78be66bbf01..bf642479269a 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -237,41 +237,43 @@ EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); /** * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a - * pagecache page - * @page: The locked pagecache page containing the block(s) to decrypt + * pagecache folio + * @folio: The locked pagecache folio containing the block(s) to decrypt * @len: Total size of the block(s) to decrypt. Must be a nonzero * multiple of the filesystem's block size. - * @offs: Byte offset within @page of the first block to decrypt. Must be + * @offs: Byte offset within @folio of the first block to decrypt. Must be * a multiple of the filesystem's block size. 
* - * The specified block(s) are decrypted in-place within the pagecache page, - * which must still be locked and not uptodate. Normally, blocksize == - * PAGE_SIZE and the whole page is decrypted at once. + * The specified block(s) are decrypted in-place within the pagecache folio, + * which must still be locked and not uptodate. * * This is for use by the filesystem's ->readahead() method. * * Return: 0 on success; -errno on failure */ -int fscrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len, - unsigned int offs) +int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, + size_t offs) { - const struct inode *inode = page->mapping->host; + const struct inode *inode = folio->mapping->host; const unsigned int blockbits = inode->i_blkbits; const unsigned int blocksize = 1 << blockbits; - u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) + + u64 lblk_num = ((u64)folio->index << (PAGE_SHIFT - blockbits)) + (offs >> blockbits); - unsigned int i; + size_t i; int err; - if (WARN_ON_ONCE(!PageLocked(page))) + if (WARN_ON_ONCE(!folio_test_locked(folio))) return -EINVAL; if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize))) return -EINVAL; for (i = offs; i < offs + len; i += blocksize, lblk_num++) { + struct page *page = folio_page(folio, i >> PAGE_SHIFT); + err = fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, - page, blocksize, i, GFP_NOFS); + page, blocksize, i & ~PAGE_MASK, + GFP_NOFS); if (err) return err; } diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 316a778cec0f..0fec2dfc36eb 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -573,6 +573,9 @@ fscrypt_find_master_key(struct super_block *sb, int fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); +int fscrypt_add_test_dummy_key(struct super_block *sb, + struct fscrypt_key_specifier *key_spec); + int fscrypt_verify_key_added(struct super_block *sb, const u8 
identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); @@ -651,6 +654,7 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, const union fscrypt_policy *policy2); int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, struct fscrypt_key_specifier *key_spec); +const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb); bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, const struct inode *inode); int fscrypt_policy_from_context(union fscrypt_policy *policy_u, diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 78dd2ff306bd..78086f8dbda5 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -211,10 +211,6 @@ static int allocate_filesystem_keyring(struct super_block *sb) * are still available at this time; this is important because after user file * accesses have been allowed, this function may need to evict keys from the * keyslots of an inline crypto engine, which requires the block device(s). - * - * This is also called when the super_block is being freed. This is needed to - * avoid a memory leak if mounting fails after the "test_dummy_encryption" - * option was processed, as in that case the unmount-time call isn't made. */ void fscrypt_destroy_keyring(struct super_block *sb) { @@ -778,34 +774,26 @@ out: /** * fscrypt_add_test_dummy_key() - add the test dummy encryption key * @sb: the filesystem instance to add the key to - * @dummy_policy: the encryption policy for test_dummy_encryption + * @key_spec: the key specifier of the test dummy encryption key * - * If needed, add the key for the test_dummy_encryption mount option to the - * filesystem. To prevent misuse of this mount option, a per-boot random key is - * used instead of a hardcoded one. This makes it so that any encrypted files - * created using this option won't be accessible after a reboot. + * Add the key for the test_dummy_encryption mount option to the filesystem. 
To + * prevent misuse of this mount option, a per-boot random key is used instead of + * a hardcoded one. This makes it so that any encrypted files created using + * this option won't be accessible after a reboot. * * Return: 0 on success, -errno on failure */ int fscrypt_add_test_dummy_key(struct super_block *sb, - const struct fscrypt_dummy_policy *dummy_policy) + struct fscrypt_key_specifier *key_spec) { - const union fscrypt_policy *policy = dummy_policy->policy; - struct fscrypt_key_specifier key_spec; struct fscrypt_master_key_secret secret; int err; - if (!policy) - return 0; - err = fscrypt_policy_to_key_spec(policy, &key_spec); - if (err) - return err; fscrypt_get_test_dummy_secret(&secret); - err = add_master_key(sb, &secret, &key_spec); + err = add_master_key(sb, &secret, key_spec); wipe_master_key_secret(&secret); return err; } -EXPORT_SYMBOL_GPL(fscrypt_add_test_dummy_key); /* * Verify that the current user has added a master key with the given identifier diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 94757ccd3056..aa94fba9d17e 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -438,6 +438,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, bool need_dirhash_key, struct fscrypt_master_key **mk_ret) { + struct super_block *sb = ci->ci_inode->i_sb; struct fscrypt_key_specifier mk_spec; struct fscrypt_master_key *mk; int err; @@ -450,8 +451,26 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, if (err) return err; - mk = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec); - if (!mk) { + mk = fscrypt_find_master_key(sb, &mk_spec); + if (unlikely(!mk)) { + const union fscrypt_policy *dummy_policy = + fscrypt_get_dummy_policy(sb); + + /* + * Add the test_dummy_encryption key on-demand. In principle, + * it should be added at mount time. Do it here instead so that + * the individual filesystems don't need to worry about adding + * this key at mount time and cleaning up on mount failure. 
+ */ + if (dummy_policy && + fscrypt_policies_equal(dummy_policy, &ci->ci_policy)) { + err = fscrypt_add_test_dummy_key(sb, &mk_spec); + if (err) + return err; + mk = fscrypt_find_master_key(sb, &mk_spec); + } + } + if (unlikely(!mk)) { if (ci->ci_policy.version != FSCRYPT_POLICY_V1) return -ENOKEY; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 893661b52376..3b5fcb6402ea 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -53,8 +53,7 @@ int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, } } -static const union fscrypt_policy * -fscrypt_get_dummy_policy(struct super_block *sb) +const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb) { if (!sb->s_cop->get_dummy_policy) return NULL; @@ -506,7 +505,7 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) return -EFAULT; policy.version = version; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -1271,8 +1271,9 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) if (ret < 0) goto out_unlock; - ret = copy_mc_to_kernel(daddr, saddr, length); - if (ret) + if (copy_mc_to_kernel(daddr, saddr, length) == 0) + ret = length; + else ret = -EIO; out_unlock: diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 2e8e112b1993..bf397f6a6a33 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -42,7 +42,7 @@ static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS; * so that we can use the file mode as part of a heuristic to determine whether * to lock down individual files. 
*/ -static int debugfs_setattr(struct user_namespace *mnt_userns, +static int debugfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *ia) { int ret; @@ -52,7 +52,7 @@ static int debugfs_setattr(struct user_namespace *mnt_userns, if (ret) return ret; } - return simple_setattr(&init_user_ns, dentry, ia); + return simple_setattr(&nop_mnt_idmap, dentry, ia); } static const struct inode_operations debugfs_file_inode_operations = { @@ -837,7 +837,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, take_dentry_name_snapshot(&old_name, old_dentry); - error = simple_rename(&init_user_ns, d_inode(old_dir), old_dentry, + error = simple_rename(&nop_mnt_idmap, d_inode(old_dir), old_dentry, d_inode(new_dir), dentry, 0); if (error) { release_dentry_name_snapshot(&old_name); diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index 1105ce3c80cb..b3b86dbdc187 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -4,7 +4,6 @@ menuconfig DLM depends on INET depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) select IP_SCTP - select SRCU help A general purpose distributed lock manager for kernel or userspace applications. 
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index d0b4e2181a5f..9f344d76afa3 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -381,23 +381,23 @@ static int threads_start(void) { int error; - error = dlm_scand_start(); + /* Thread for sending/receiving messages for all lockspace's */ + error = dlm_midcomms_start(); if (error) { - log_print("cannot start dlm_scand thread %d", error); + log_print("cannot start dlm midcomms %d", error); goto fail; } - /* Thread for sending/receiving messages for all lockspace's */ - error = dlm_midcomms_start(); + error = dlm_scand_start(); if (error) { - log_print("cannot start dlm midcomms %d", error); - goto scand_fail; + log_print("cannot start dlm_scand thread %d", error); + goto midcomms_fail; } return 0; - scand_fail: - dlm_scand_stop(); + midcomms_fail: + dlm_midcomms_stop(); fail: return error; } @@ -572,7 +572,7 @@ static int new_lockspace(const char *name, const char *cluster, spin_lock_init(&ls->ls_rcom_spin); get_random_bytes(&ls->ls_rcom_seq, sizeof(uint64_t)); ls->ls_recover_status = 0; - ls->ls_recover_seq = 0; + ls->ls_recover_seq = get_random_u64(); ls->ls_recover_args = NULL; init_rwsem(&ls->ls_in_recovery); init_rwsem(&ls->ls_recv_active); @@ -820,6 +820,9 @@ static int release_lockspace(struct dlm_ls *ls, int force) return rv; } + if (ls_count == 1) + dlm_midcomms_version_wait(); + dlm_device_deregister(ls); if (force < 3 && dlm_user_daemon_available()) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 4450721ec83c..61cd6c2628fa 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -61,6 +61,7 @@ #include "memory.h" #include "config.h" +#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(5000) #define NEEDED_RMEM (4*1024*1024) struct connection { @@ -99,6 +100,7 @@ struct connection { struct connection *othercon; struct work_struct rwork; /* receive worker */ struct work_struct swork; /* send worker */ + wait_queue_head_t shutdown_wait; unsigned char 
rx_leftover_buf[DLM_MAX_SOCKET_BUFSIZE]; int rx_leftover; int mark; @@ -282,6 +284,7 @@ static void dlm_con_init(struct connection *con, int nodeid) INIT_WORK(&con->swork, process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); spin_lock_init(&con->addrs_lock); + init_waitqueue_head(&con->shutdown_wait); } /* @@ -790,6 +793,43 @@ static void close_connection(struct connection *con, bool and_other) up_write(&con->sock_lock); } +static void shutdown_connection(struct connection *con, bool and_other) +{ + int ret; + + if (con->othercon && and_other) + shutdown_connection(con->othercon, false); + + flush_workqueue(io_workqueue); + down_read(&con->sock_lock); + /* nothing to shutdown */ + if (!con->sock) { + up_read(&con->sock_lock); + return; + } + + ret = kernel_sock_shutdown(con->sock, SHUT_WR); + up_read(&con->sock_lock); + if (ret) { + log_print("Connection %p failed to shutdown: %d will force close", + con, ret); + goto force_close; + } else { + ret = wait_event_timeout(con->shutdown_wait, !con->sock, + DLM_SHUTDOWN_WAIT_TIMEOUT); + if (ret == 0) { + log_print("Connection %p shutdown timed out, will force close", + con); + goto force_close; + } + } + + return; + +force_close: + close_connection(con, false); +} + static struct processqueue_entry *new_processqueue_entry(int nodeid, int buflen) { @@ -1488,6 +1528,7 @@ static void process_recv_sockets(struct work_struct *work) break; case DLM_IO_EOF: close_connection(con, false); + wake_up(&con->shutdown_wait); /* CF_RECV_PENDING cleared */ break; case DLM_IO_RESCHED: @@ -1695,6 +1736,9 @@ static int work_start(void) void dlm_lowcomms_shutdown(void) { + struct connection *con; + int i, idx; + /* stop lowcomms_listen_data_ready calls */ lock_sock(listen_con.sock->sk); listen_con.sock->sk->sk_data_ready = listen_sock.sk_data_ready; @@ -1703,29 +1747,20 @@ void dlm_lowcomms_shutdown(void) cancel_work_sync(&listen_con.rwork); dlm_close_sock(&listen_con.sock); - flush_workqueue(process_workqueue); -} - -void 
dlm_lowcomms_shutdown_node(int nodeid, bool force) -{ - struct connection *con; - int idx; - idx = srcu_read_lock(&connections_srcu); - con = nodeid2con(nodeid, 0); - if (WARN_ON_ONCE(!con)) { - srcu_read_unlock(&connections_srcu, idx); - return; - } + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(con, &connection_hash[i], list) { + shutdown_connection(con, true); + stop_connection_io(con); + flush_workqueue(process_workqueue); + close_connection(con, true); - flush_work(&con->swork); - stop_connection_io(con); - WARN_ON_ONCE(!force && !list_empty(&con->writequeue)); - close_connection(con, true); - clean_one_writequeue(con); - if (con->othercon) - clean_one_writequeue(con->othercon); - allow_connection_io(con); + clean_one_writequeue(con); + if (con->othercon) + clean_one_writequeue(con->othercon); + allow_connection_io(con); + } + } srcu_read_unlock(&connections_srcu, idx); } diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index eb7a08641fcf..cdbaa452fc05 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -51,7 +51,7 @@ int __init dlm_memory_init(void) cb_cache = kmem_cache_create("dlm_cb", sizeof(struct dlm_callback), __alignof__(struct dlm_callback), 0, NULL); - if (!rsb_cache) + if (!cb_cache) goto cb; return 0; diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index fc015a6abe17..c02c43e4980a 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -146,8 +146,8 @@ /* init value for sequence numbers for testing purpose only e.g. 
overflows */ #define DLM_SEQ_INIT 0 -/* 3 minutes wait to sync ending of dlm */ -#define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(3 * 60 * 1000) +/* 5 seconds wait to sync ending of dlm */ +#define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(5000) #define DLM_VERSION_NOT_SET 0 struct midcomms_node { @@ -375,7 +375,7 @@ static int dlm_send_ack(int nodeid, uint32_t seq) struct dlm_msg *msg; char *ppc; - msg = dlm_lowcomms_new_msg(nodeid, mb_len, GFP_NOFS, &ppc, + msg = dlm_lowcomms_new_msg(nodeid, mb_len, GFP_ATOMIC, &ppc, NULL, NULL); if (!msg) return -ENOMEM; @@ -402,10 +402,11 @@ static int dlm_send_fin(struct midcomms_node *node, struct dlm_mhandle *mh; char *ppc; - mh = dlm_midcomms_get_mhandle(node->nodeid, mb_len, GFP_NOFS, &ppc); + mh = dlm_midcomms_get_mhandle(node->nodeid, mb_len, GFP_ATOMIC, &ppc); if (!mh) return -ENOMEM; + set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); mh->ack_rcv = ack_rcv; m_header = (struct dlm_header *)ppc; @@ -417,7 +418,6 @@ static int dlm_send_fin(struct midcomms_node *node, pr_debug("sending fin msg to node %d\n", node->nodeid); dlm_midcomms_commit_mhandle(mh, NULL, 0); - set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); return 0; } @@ -467,7 +467,7 @@ static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) break; default: spin_unlock(&node->state_lock); - log_print("%s: unexpected state: %d\n", + log_print("%s: unexpected state: %d", __func__, node->state); WARN_ON_ONCE(1); return; @@ -498,18 +498,14 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, switch (p->header.h_cmd) { case DLM_FIN: - /* send ack before fin */ - dlm_send_ack(node->nodeid, node->seq_next); - spin_lock(&node->state_lock); pr_debug("receive fin msg from node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); switch (node->state) { case DLM_ESTABLISHED: - node->state = DLM_CLOSE_WAIT; - pr_debug("switch node %d to state %s\n", - node->nodeid, dlm_state_str(node->state)); + dlm_send_ack(node->nodeid, node->seq_next); + /* passive shutdown 
DLM_LAST_ACK case 1 * additional we check if the node is used by * cluster manager events at all. @@ -518,34 +514,38 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, node->state = DLM_LAST_ACK; pr_debug("switch node %d to state %s case 1\n", node->nodeid, dlm_state_str(node->state)); - spin_unlock(&node->state_lock); - goto send_fin; + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); + } else { + node->state = DLM_CLOSE_WAIT; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); } break; case DLM_FIN_WAIT1: + dlm_send_ack(node->nodeid, node->seq_next); node->state = DLM_CLOSING; + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); break; case DLM_FIN_WAIT2: + dlm_send_ack(node->nodeid, node->seq_next); midcomms_node_reset(node); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); - wake_up(&node->shutdown_wait); break; case DLM_LAST_ACK: /* probably remove_member caught it, do nothing */ break; default: spin_unlock(&node->state_lock); - log_print("%s: unexpected state: %d\n", + log_print("%s: unexpected state: %d", __func__, node->state); WARN_ON_ONCE(1); return; } spin_unlock(&node->state_lock); - - set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); break; default: WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); @@ -564,12 +564,6 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, log_print_ratelimited("ignore dlm msg because seq mismatch, seq: %u, expected: %u, nodeid: %d", seq, node->seq_next, node->nodeid); } - - return; - -send_fin: - set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); - dlm_send_fin(node, dlm_pas_fin_ack_rcv); } static struct midcomms_node * @@ -612,16 +606,8 @@ dlm_midcomms_recv_node_lookup(int nodeid, const union dlm_packet *p, case DLM_ESTABLISHED: break; default: - /* some invalid state passive shutdown - * was failed, we try to 
reset and - * hope it will go on. - */ - log_print("reset node %d because shutdown stuck", - node->nodeid); - - midcomms_node_reset(node); - node->state = DLM_ESTABLISHED; - break; + spin_unlock(&node->state_lock); + return NULL; } spin_unlock(&node->state_lock); } @@ -671,6 +657,7 @@ static int dlm_midcomms_version_check_3_2(struct midcomms_node *node) switch (node->version) { case DLM_VERSION_NOT_SET: node->version = DLM_VERSION_3_2; + wake_up(&node->shutdown_wait); log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2, node->nodeid); break; @@ -840,6 +827,7 @@ static int dlm_midcomms_version_check_3_1(struct midcomms_node *node) switch (node->version) { case DLM_VERSION_NOT_SET: node->version = DLM_VERSION_3_1; + wake_up(&node->shutdown_wait); log_print("version 0x%08x for node %d detected", DLM_VERSION_3_1, node->nodeid); break; @@ -1214,8 +1202,15 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, dlm_free_mhandle(mh); break; case DLM_VERSION_3_2: + /* held rcu read lock here, because we sending the + * dlm message out, when we do that we could receive + * an ack back which releases the mhandle and we + * get a use after free. 
+ */ + rcu_read_lock(); dlm_midcomms_commit_msg_3_2(mh, name, namelen); srcu_read_unlock(&nodes_srcu, mh->idx); + rcu_read_unlock(); break; default: srcu_read_unlock(&nodes_srcu, mh->idx); @@ -1266,7 +1261,6 @@ static void dlm_act_fin_ack_rcv(struct midcomms_node *node) midcomms_node_reset(node); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); - wake_up(&node->shutdown_wait); break; case DLM_CLOSED: /* not valid but somehow we got what we want */ @@ -1274,7 +1268,7 @@ static void dlm_act_fin_ack_rcv(struct midcomms_node *node) break; default: spin_unlock(&node->state_lock); - log_print("%s: unexpected state: %d\n", + log_print("%s: unexpected state: %d", __func__, node->state); WARN_ON_ONCE(1); return; @@ -1362,11 +1356,11 @@ void dlm_midcomms_remove_member(int nodeid) case DLM_CLOSE_WAIT: /* passive shutdown DLM_LAST_ACK case 2 */ node->state = DLM_LAST_ACK; - spin_unlock(&node->state_lock); - pr_debug("switch node %d to state %s case 2\n", node->nodeid, dlm_state_str(node->state)); - goto send_fin; + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); + break; case DLM_LAST_ACK: /* probably receive fin caught it, do nothing */ break; @@ -1374,7 +1368,7 @@ void dlm_midcomms_remove_member(int nodeid) /* already gone, do nothing */ break; default: - log_print("%s: unexpected state: %d\n", + log_print("%s: unexpected state: %d", __func__, node->state); break; } @@ -1382,12 +1376,6 @@ void dlm_midcomms_remove_member(int nodeid) spin_unlock(&node->state_lock); srcu_read_unlock(&nodes_srcu, idx); - return; - -send_fin: - set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); - dlm_send_fin(node, dlm_pas_fin_ack_rcv); - srcu_read_unlock(&nodes_srcu, idx); } static void midcomms_node_release(struct rcu_head *rcu) @@ -1395,9 +1383,31 @@ static void midcomms_node_release(struct rcu_head *rcu) struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); 
WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); + dlm_send_queue_flush(node); kfree(node); } +void dlm_midcomms_version_wait(void) +{ + struct midcomms_node *node; + int i, idx, ret; + + idx = srcu_read_lock(&nodes_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { + ret = wait_event_timeout(node->shutdown_wait, + node->version != DLM_VERSION_NOT_SET || + node->state == DLM_CLOSED || + test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), + DLM_SHUTDOWN_TIMEOUT); + if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) + pr_debug("version wait timed out for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + } + } + srcu_read_unlock(&nodes_srcu, idx); +} + static void midcomms_shutdown(struct midcomms_node *node) { int ret; @@ -1418,11 +1428,11 @@ static void midcomms_shutdown(struct midcomms_node *node) node->state = DLM_FIN_WAIT1; pr_debug("switch node %d to state %s case 2\n", node->nodeid, dlm_state_str(node->state)); + dlm_send_fin(node, dlm_act_fin_ack_rcv); break; case DLM_CLOSED: /* we have what we want */ - spin_unlock(&node->state_lock); - return; + break; default: /* busy to enter DLM_FIN_WAIT1, wait until passive * done in shutdown_wait to enter DLM_CLOSED. 
@@ -1431,29 +1441,20 @@ static void midcomms_shutdown(struct midcomms_node *node) } spin_unlock(&node->state_lock); - if (node->state == DLM_FIN_WAIT1) { - dlm_send_fin(node, dlm_act_fin_ack_rcv); - - if (DLM_DEBUG_FENCE_TERMINATION) - msleep(5000); - } + if (DLM_DEBUG_FENCE_TERMINATION) + msleep(5000); /* wait for other side dlm + fin */ ret = wait_event_timeout(node->shutdown_wait, node->state == DLM_CLOSED || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), DLM_SHUTDOWN_TIMEOUT); - if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) { + if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) pr_debug("active shutdown timed out for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); - midcomms_node_reset(node); - dlm_lowcomms_shutdown_node(node->nodeid, true); - return; - } - - pr_debug("active shutdown done for node %d with state %s\n", - node->nodeid, dlm_state_str(node->state)); - dlm_lowcomms_shutdown_node(node->nodeid, false); + else + pr_debug("active shutdown done for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); } void dlm_midcomms_shutdown(void) @@ -1461,8 +1462,6 @@ void dlm_midcomms_shutdown(void) struct midcomms_node *node; int i, idx; - dlm_lowcomms_shutdown(); - mutex_lock(&close_lock); idx = srcu_read_lock(&nodes_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { @@ -1480,6 +1479,8 @@ void dlm_midcomms_shutdown(void) } srcu_read_unlock(&nodes_srcu, idx); mutex_unlock(&close_lock); + + dlm_lowcomms_shutdown(); } int dlm_midcomms_close(int nodeid) diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index bea1cee4279c..9f8c9605013d 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -20,6 +20,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, gfp_t allocation, char **ppc); void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name, int namelen); +void dlm_midcomms_version_wait(void); int dlm_midcomms_close(int nodeid); int dlm_midcomms_start(void); void 
dlm_midcomms_stop(void); diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 737f185aad8d..ed4357e62f35 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -4,6 +4,7 @@ */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/miscdevice.h> #include <linux/poll.h> #include <linux/dlm.h> diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index e3f5d7f3c8a0..bd3f3c755b24 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1105,7 +1105,7 @@ ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, } inode_lock(lower_inode); - rc = __vfs_setxattr(&init_user_ns, lower_dentry, lower_inode, + rc = __vfs_setxattr(&nop_mnt_idmap, lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME, page_virt, size, 0); if (!rc && ecryptfs_inode) fsstack_copy_attr_all(ecryptfs_inode, lower_inode); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index f3cd00fac9c3..144ace9e0dd9 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -139,7 +139,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, if (d_unhashed(lower_dentry)) rc = -EINVAL; else - rc = vfs_unlink(&init_user_ns, lower_dir, lower_dentry, + rc = vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry, NULL); } if (rc) { @@ -180,7 +180,7 @@ ecryptfs_do_create(struct inode *directory_inode, rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir); if (!rc) - rc = vfs_create(&init_user_ns, lower_dir, + rc = vfs_create(&nop_mnt_idmap, lower_dir, lower_dentry, mode, true); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " @@ -191,7 +191,7 @@ ecryptfs_do_create(struct inode *directory_inode, inode = __ecryptfs_get_inode(d_inode(lower_dentry), directory_inode->i_sb); if (IS_ERR(inode)) { - vfs_unlink(&init_user_ns, lower_dir, lower_dentry, NULL); + vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry, NULL); goto out_lock; } fsstack_copy_attr_times(directory_inode, lower_dir); @@ -253,7 +253,7 @@ out: * Returns zero on success; non-zero on error 
condition */ static int -ecryptfs_create(struct user_namespace *mnt_userns, +ecryptfs_create(struct mnt_idmap *idmap, struct inode *directory_inode, struct dentry *ecryptfs_dentry, umode_t mode, bool excl) { @@ -434,7 +434,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); rc = lock_parent(new_dentry, &lower_new_dentry, &lower_dir); if (!rc) - rc = vfs_link(lower_old_dentry, &init_user_ns, lower_dir, + rc = vfs_link(lower_old_dentry, &nop_mnt_idmap, lower_dir, lower_new_dentry, NULL); if (rc || d_really_is_negative(lower_new_dentry)) goto out_lock; @@ -456,7 +456,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) return ecryptfs_do_unlink(dir, dentry, d_inode(dentry)); } -static int ecryptfs_symlink(struct user_namespace *mnt_userns, +static int ecryptfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { @@ -478,7 +478,7 @@ static int ecryptfs_symlink(struct user_namespace *mnt_userns, strlen(symname)); if (rc) goto out_lock; - rc = vfs_symlink(&init_user_ns, lower_dir, lower_dentry, + rc = vfs_symlink(&nop_mnt_idmap, lower_dir, lower_dentry, encoded_symname); kfree(encoded_symname); if (rc || d_really_is_negative(lower_dentry)) @@ -495,7 +495,7 @@ out_lock: return rc; } -static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int rc; @@ -504,7 +504,7 @@ static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, rc = lock_parent(dentry, &lower_dentry, &lower_dir); if (!rc) - rc = vfs_mkdir(&init_user_ns, lower_dir, + rc = vfs_mkdir(&nop_mnt_idmap, lower_dir, lower_dentry, mode); if (rc || d_really_is_negative(lower_dentry)) goto out; @@ -533,7 +533,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) if (d_unhashed(lower_dentry)) rc = -EINVAL; 
else - rc = vfs_rmdir(&init_user_ns, lower_dir, lower_dentry); + rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry); } if (!rc) { clear_nlink(d_inode(dentry)); @@ -548,7 +548,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) } static int -ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { int rc; @@ -557,7 +557,7 @@ ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, rc = lock_parent(dentry, &lower_dentry, &lower_dir); if (!rc) - rc = vfs_mknod(&init_user_ns, lower_dir, + rc = vfs_mknod(&nop_mnt_idmap, lower_dir, lower_dentry, mode, dev); if (rc || d_really_is_negative(lower_dentry)) goto out; @@ -574,7 +574,7 @@ out: } static int -ecryptfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -616,10 +616,10 @@ ecryptfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, goto out_lock; } - rd.old_mnt_userns = &init_user_ns; + rd.old_mnt_idmap = &nop_mnt_idmap; rd.old_dir = d_inode(lower_old_dir_dentry); rd.old_dentry = lower_old_dentry; - rd.new_mnt_userns = &init_user_ns; + rd.new_mnt_idmap = &nop_mnt_idmap; rd.new_dir = d_inode(lower_new_dir_dentry); rd.new_dentry = lower_new_dentry; rc = vfs_rename(&rd); @@ -856,7 +856,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); inode_lock(d_inode(lower_dentry)); - rc = notify_change(&init_user_ns, lower_dentry, + rc = notify_change(&nop_mnt_idmap, lower_dentry, &lower_ia, NULL); inode_unlock(d_inode(lower_dentry)); } @@ -864,16 +864,16 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) } static int -ecryptfs_permission(struct user_namespace *mnt_userns, struct inode 
*inode, +ecryptfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { - return inode_permission(&init_user_ns, + return inode_permission(&nop_mnt_idmap, ecryptfs_inode_to_lower(inode), mask); } /** * ecryptfs_setattr - * @mnt_userns: user namespace of the target mount + * @idmap: idmap of the target mount * @dentry: dentry handle to the inode to modify * @ia: Structure with flags of what to change and values * @@ -884,7 +884,7 @@ ecryptfs_permission(struct user_namespace *mnt_userns, struct inode *inode, * All other metadata changes will be passed right to the lower filesystem, * and we will just update our inode to look like the lower. */ -static int ecryptfs_setattr(struct user_namespace *mnt_userns, +static int ecryptfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *ia) { int rc = 0; @@ -939,7 +939,7 @@ static int ecryptfs_setattr(struct user_namespace *mnt_userns, } mutex_unlock(&crypt_stat->cs_mutex); - rc = setattr_prepare(&init_user_ns, dentry, ia); + rc = setattr_prepare(&nop_mnt_idmap, dentry, ia); if (rc) goto out; if (ia->ia_valid & ATTR_SIZE) { @@ -965,14 +965,14 @@ static int ecryptfs_setattr(struct user_namespace *mnt_userns, lower_ia.ia_valid &= ~ATTR_MODE; inode_lock(d_inode(lower_dentry)); - rc = notify_change(&init_user_ns, lower_dentry, &lower_ia, NULL); + rc = notify_change(&nop_mnt_idmap, lower_dentry, &lower_ia, NULL); inode_unlock(d_inode(lower_dentry)); out: fsstack_copy_attr_all(inode, lower_inode); return rc; } -static int ecryptfs_getattr_link(struct user_namespace *mnt_userns, +static int ecryptfs_getattr_link(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -982,7 +982,7 @@ static int ecryptfs_getattr_link(struct user_namespace *mnt_userns, mount_crypt_stat = &ecryptfs_superblock_to_private( dentry->d_sb)->mount_crypt_stat; - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), 
stat); if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { char *target; size_t targetsiz; @@ -998,7 +998,7 @@ static int ecryptfs_getattr_link(struct user_namespace *mnt_userns, return rc; } -static int ecryptfs_getattr(struct user_namespace *mnt_userns, +static int ecryptfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -1011,7 +1011,7 @@ static int ecryptfs_getattr(struct user_namespace *mnt_userns, if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); stat->blocks = lower_stat.blocks; } return rc; @@ -1033,7 +1033,7 @@ ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, goto out; } inode_lock(lower_inode); - rc = __vfs_setxattr_locked(&init_user_ns, lower_dentry, name, value, size, flags, NULL); + rc = __vfs_setxattr_locked(&nop_mnt_idmap, lower_dentry, name, value, size, flags, NULL); inode_unlock(lower_inode); if (!rc && inode) fsstack_copy_attr_all(inode, lower_inode); @@ -1099,7 +1099,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, struct inode *inode, goto out; } inode_lock(lower_inode); - rc = __vfs_removexattr(&init_user_ns, lower_dentry, name); + rc = __vfs_removexattr(&nop_mnt_idmap, lower_dentry, name); inode_unlock(lower_inode); out: return rc; @@ -1110,26 +1110,26 @@ static int ecryptfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return vfs_fileattr_get(ecryptfs_dentry_to_lower(dentry), fa); } -static int ecryptfs_fileattr_set(struct user_namespace *mnt_userns, +static int ecryptfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); int rc; - rc = vfs_fileattr_set(&init_user_ns, lower_dentry, fa); + rc = vfs_fileattr_set(&nop_mnt_idmap, lower_dentry, fa); 
fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry)); return rc; } -static struct posix_acl *ecryptfs_get_acl(struct user_namespace *mnt_userns, +static struct posix_acl *ecryptfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { - return vfs_get_acl(mnt_userns, ecryptfs_dentry_to_lower(dentry), + return vfs_get_acl(idmap, ecryptfs_dentry_to_lower(dentry), posix_acl_xattr_name(type)); } -static int ecryptfs_set_acl(struct user_namespace *mnt_userns, +static int ecryptfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { @@ -1137,7 +1137,7 @@ static int ecryptfs_set_acl(struct user_namespace *mnt_userns, struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); struct inode *lower_inode = d_inode(lower_dentry); - rc = vfs_set_acl(&init_user_ns, lower_dentry, + rc = vfs_set_acl(&nop_mnt_idmap, lower_dentry, posix_acl_xattr_name(type), acl); if (!rc) fsstack_copy_attr_all(d_inode(dentry), lower_inode); @@ -1190,7 +1190,7 @@ static int ecryptfs_xattr_get(const struct xattr_handler *handler, } static int ecryptfs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 19af229eb7ca..373c3e5747e6 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -428,7 +428,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) if (size < 0) size = 8; put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); - rc = __vfs_setxattr(&init_user_ns, lower_dentry, lower_inode, + rc = __vfs_setxattr(&nop_mnt_idmap, lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); inode_unlock(lower_inode); if (rc) diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 617f3ad2485e..b973a2c03dde 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -70,7 
+70,7 @@ bool efivarfs_valid_name(const char *str, int len) return uuid_is_valid(s); } -static int efivarfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode = NULL; @@ -163,7 +163,7 @@ efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) } static int -efivarfs_fileattr_set(struct user_namespace *mnt_userns, +efivarfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { unsigned int i_flags = 0; diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 85490370e0ca..704fb59577e0 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -108,3 +108,21 @@ config EROFS_FS_ONDEMAND read support. If unsure, say N. + +config EROFS_FS_PCPU_KTHREAD + bool "EROFS per-cpu decompression kthread workers" + depends on EROFS_FS_ZIP + help + Saying Y here enables per-CPU kthread workers pool to carry out + async decompression for low latencies on some architectures. + + If unsure, say N. + +config EROFS_FS_PCPU_KTHREAD_HIPRI + bool "EROFS high priority per-CPU kthread workers" + depends on EROFS_FS_ZIP && EROFS_FS_PCPU_KTHREAD + help + This permits EROFS to configure per-CPU kthread workers to run + at higher priority. + + If unsure, say N. 
diff --git a/fs/erofs/data.c b/fs/erofs/data.c index f57f921683d7..032e12dccb84 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -74,8 +74,7 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, } static int erofs_map_blocks_flatmode(struct inode *inode, - struct erofs_map_blocks *map, - int flags) + struct erofs_map_blocks *map) { erofs_blk_t nblocks, lastblk; u64 offset = map->m_la; @@ -91,11 +90,8 @@ static int erofs_map_blocks_flatmode(struct inode *inode, map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; map->m_plen = blknr_to_addr(lastblk) - offset; } else if (tailendpacking) { - /* 2 - inode inline B: inode, [xattrs], inline last blk... */ - struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); - - map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize + - vi->xattr_isize + erofs_blkoff(map->m_la); + map->m_pa = erofs_iloc(inode) + vi->inode_isize + + vi->xattr_isize + erofs_blkoff(offset); map->m_plen = inode->i_size - offset; /* inline data should be located in the same meta block */ @@ -117,8 +113,7 @@ static int erofs_map_blocks_flatmode(struct inode *inode, return 0; } -int erofs_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, int flags) +int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) { struct super_block *sb = inode->i_sb; struct erofs_inode *vi = EROFS_I(inode); @@ -130,7 +125,7 @@ int erofs_map_blocks(struct inode *inode, void *kaddr; int err = 0; - trace_erofs_map_blocks_enter(inode, map, flags); + trace_erofs_map_blocks_enter(inode, map, 0); map->m_deviceid = 0; if (map->m_la >= inode->i_size) { /* leave out-of-bound access unmapped */ @@ -140,7 +135,7 @@ int erofs_map_blocks(struct inode *inode, } if (vi->datalayout != EROFS_INODE_CHUNK_BASED) { - err = erofs_map_blocks_flatmode(inode, map, flags); + err = erofs_map_blocks_flatmode(inode, map); goto out; } @@ -150,7 +145,7 @@ int erofs_map_blocks(struct inode *inode, unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */ chunknr = 
map->m_la >> vi->chunkbits; - pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + + pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); @@ -192,7 +187,7 @@ out_unlock: out: if (!err) map->m_llen = map->m_plen; - trace_erofs_map_blocks_exit(inode, map, flags, 0); + trace_erofs_map_blocks_exit(inode, map, 0, err); return err; } @@ -255,7 +250,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_la = offset; map.m_llen = length; - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + ret = erofs_map_blocks(inode, &map); if (ret < 0) return ret; diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index ecf28f66b97d..6970b09b8307 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -6,21 +6,6 @@ */ #include "internal.h" -static void debug_one_dentry(unsigned char d_type, const char *de_name, - unsigned int de_namelen) -{ -#ifdef CONFIG_EROFS_FS_DEBUG - /* since the on-disk name could not have the trailing '\0' */ - unsigned char dbg_namebuf[EROFS_NAME_LEN + 1]; - - memcpy(dbg_namebuf, de_name, de_namelen); - dbg_namebuf[de_namelen] = '\0'; - - erofs_dbg("found dirent %s de_len %u d_type %d", dbg_namebuf, - de_namelen, d_type); -#endif -} - static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, void *dentry_blk, struct erofs_dirent *de, unsigned int nameoff, unsigned int maxsize) @@ -52,10 +37,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, return -EFSCORRUPTED; } - debug_one_dentry(d_type, de_name, de_namelen); if (!dir_emit(ctx, de_name, de_namelen, le64_to_cpu(de->nid), d_type)) - /* stopped by some reason */ return 1; ++de; ctx->pos += sizeof(struct erofs_dirent); diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 014e20962376..96a87c023128 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -9,6 +9,7 @@ static DEFINE_MUTEX(erofs_domain_list_lock); 
static DEFINE_MUTEX(erofs_domain_cookies_lock); static LIST_HEAD(erofs_domain_list); +static LIST_HEAD(erofs_domain_cookies_list); static struct vfsmount *erofs_pseudo_mnt; struct erofs_fscache_request { @@ -164,18 +165,8 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) { int ret; - struct super_block *sb = folio_mapping(folio)->host->i_sb; + struct erofs_fscache *ctx = folio_mapping(folio)->host->i_private; struct erofs_fscache_request *req; - struct erofs_map_dev mdev = { - .m_deviceid = 0, - .m_pa = folio_pos(folio), - }; - - ret = erofs_map_dev(sb, &mdev); - if (ret) { - folio_unlock(folio); - return ret; - } req = erofs_fscache_req_alloc(folio_mapping(folio), folio_pos(folio), folio_size(folio)); @@ -184,8 +175,8 @@ static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) return PTR_ERR(req); } - ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - req, mdev.m_pa, folio_size(folio)); + ret = erofs_fscache_read_folios_async(ctx->cookie, req, + folio_pos(folio), folio_size(folio)); if (ret) req->error = ret; @@ -207,7 +198,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) int ret; map.m_la = pos; - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + ret = erofs_map_blocks(inode, &map); if (ret) return ret; @@ -328,8 +319,6 @@ const struct address_space_operations erofs_fscache_access_aops = { static void erofs_fscache_domain_put(struct erofs_domain *domain) { - if (!domain) - return; mutex_lock(&erofs_domain_list_lock); if (refcount_dec_and_test(&domain->ref)) { list_del(&domain->list); @@ -337,8 +326,8 @@ static void erofs_fscache_domain_put(struct erofs_domain *domain) kern_unmount(erofs_pseudo_mnt); erofs_pseudo_mnt = NULL; } - mutex_unlock(&erofs_domain_list_lock); fscache_relinquish_volume(domain->volume, NULL, false); + mutex_unlock(&erofs_domain_list_lock); 
kfree(domain->domain_id); kfree(domain); return; @@ -431,19 +420,21 @@ static int erofs_fscache_register_domain(struct super_block *sb) return err; } -static -struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, - char *name, - unsigned int flags) +static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, + char *name, unsigned int flags) { struct fscache_volume *volume = EROFS_SB(sb)->volume; struct erofs_fscache *ctx; struct fscache_cookie *cookie; + struct super_block *isb; + struct inode *inode; int ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&ctx->node); + refcount_set(&ctx->ref, 1); cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE, name, strlen(name), NULL, 0, 0); @@ -452,32 +443,32 @@ struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, ret = -EINVAL; goto err; } - fscache_use_cookie(cookie, false); - ctx->cookie = cookie; - - if (flags & EROFS_REG_COOKIE_NEED_INODE) { - struct inode *const inode = new_inode(sb); - - if (!inode) { - erofs_err(sb, "failed to get anon inode for %s", name); - ret = -ENOMEM; - goto err_cookie; - } - set_nlink(inode, 1); - inode->i_size = OFFSET_MAX; - inode->i_mapping->a_ops = &erofs_fscache_meta_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - - ctx->inode = inode; + /* + * Allocate anonymous inode in global pseudo mount for shareable blobs, + * so that they are accessible among erofs fs instances. + */ + isb = flags & EROFS_REG_COOKIE_SHARE ? 
erofs_pseudo_mnt->mnt_sb : sb; + inode = new_inode(isb); + if (!inode) { + erofs_err(sb, "failed to get anon inode for %s", name); + ret = -ENOMEM; + goto err_cookie; } + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &erofs_fscache_meta_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + inode->i_private = ctx; + + ctx->cookie = cookie; + ctx->inode = inode; return ctx; err_cookie: - fscache_unuse_cookie(ctx->cookie, NULL, NULL); - fscache_relinquish_cookie(ctx->cookie, false); + fscache_unuse_cookie(cookie, NULL, NULL); + fscache_relinquish_cookie(cookie, false); err: kfree(ctx); return ERR_PTR(ret); @@ -492,13 +483,9 @@ static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx) kfree(ctx); } -static -struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, - char *name, - unsigned int flags) +static struct erofs_fscache *erofs_domain_init_cookie(struct super_block *sb, + char *name, unsigned int flags) { - int err; - struct inode *inode; struct erofs_fscache *ctx; struct erofs_domain *domain = EROFS_SB(sb)->domain; @@ -508,55 +495,38 @@ struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, ctx->name = kstrdup(name, GFP_KERNEL); if (!ctx->name) { - err = -ENOMEM; - goto out; - } - - inode = new_inode(erofs_pseudo_mnt->mnt_sb); - if (!inode) { - err = -ENOMEM; - goto out; + erofs_fscache_relinquish_cookie(ctx); + return ERR_PTR(-ENOMEM); } - ctx->domain = domain; - ctx->anon_inode = inode; - inode->i_private = ctx; refcount_inc(&domain->ref); + ctx->domain = domain; + list_add(&ctx->node, &erofs_domain_cookies_list); return ctx; -out: - erofs_fscache_relinquish_cookie(ctx); - return ERR_PTR(err); } -static -struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, - char *name, - unsigned int flags) +static struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, + char *name, unsigned int flags) { - struct inode *inode; struct erofs_fscache *ctx; struct 
erofs_domain *domain = EROFS_SB(sb)->domain; - struct super_block *psb = erofs_pseudo_mnt->mnt_sb; + flags |= EROFS_REG_COOKIE_SHARE; mutex_lock(&erofs_domain_cookies_lock); - spin_lock(&psb->s_inode_list_lock); - list_for_each_entry(inode, &psb->s_inodes, i_sb_list) { - ctx = inode->i_private; - if (!ctx || ctx->domain != domain || strcmp(ctx->name, name)) + list_for_each_entry(ctx, &erofs_domain_cookies_list, node) { + if (ctx->domain != domain || strcmp(ctx->name, name)) continue; if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) { - igrab(inode); + refcount_inc(&ctx->ref); } else { erofs_err(sb, "%s already exists in domain %s", name, domain->domain_id); ctx = ERR_PTR(-EEXIST); } - spin_unlock(&psb->s_inode_list_lock); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } - spin_unlock(&psb->s_inode_list_lock); - ctx = erofs_fscache_domain_init_cookie(sb, name, flags); + ctx = erofs_domain_init_cookie(sb, name, flags); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } @@ -572,23 +542,22 @@ struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) { - bool drop; - struct erofs_domain *domain; + struct erofs_domain *domain = NULL; if (!ctx) return; - domain = ctx->domain; - if (domain) { - mutex_lock(&erofs_domain_cookies_lock); - drop = atomic_read(&ctx->anon_inode->i_count) == 1; - iput(ctx->anon_inode); - mutex_unlock(&erofs_domain_cookies_lock); - if (!drop) - return; - } + if (!ctx->domain) + return erofs_fscache_relinquish_cookie(ctx); - erofs_fscache_relinquish_cookie(ctx); - erofs_fscache_domain_put(domain); + mutex_lock(&erofs_domain_cookies_lock); + if (refcount_dec_and_test(&ctx->ref)) { + domain = ctx->domain; + list_del(&ctx->node); + erofs_fscache_relinquish_cookie(ctx); + } + mutex_unlock(&erofs_domain_cookies_lock); + if (domain) + erofs_fscache_domain_put(domain); } int erofs_fscache_register_fs(struct super_block *sb) @@ -596,7 +565,7 @@ int 
erofs_fscache_register_fs(struct super_block *sb) int ret; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; - unsigned int flags; + unsigned int flags = 0; if (sbi->domain_id) ret = erofs_fscache_register_domain(sb); @@ -615,7 +584,6 @@ int erofs_fscache_register_fs(struct super_block *sb) * * Acquired domain/volume will be relinquished in kill_sb() on error. */ - flags = EROFS_REG_COOKIE_NEED_INODE; if (sbi->domain_id) flags |= EROFS_REG_COOKIE_NEED_NOEXIST; fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index d3b8736fa124..4be7dda3cd24 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -14,7 +14,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, struct super_block *sb = inode->i_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_inode *vi = EROFS_I(inode); - const erofs_off_t inode_loc = iloc(sbi, vi->nid); + const erofs_off_t inode_loc = erofs_iloc(inode); erofs_blk_t blkaddr, nblks = 0; void *kaddr; @@ -308,52 +308,54 @@ out_unlock: } /* - * erofs nid is 64bits, but i_ino is 'unsigned long', therefore - * we should do more for 32-bit platform to find the right inode. + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. 
*/ -static int erofs_ilookup_test_actor(struct inode *inode, void *opaque) +static ino_t erofs_squash_ino(erofs_nid_t nid) { - const erofs_nid_t nid = *(erofs_nid_t *)opaque; + ino_t ino = (ino_t)nid; + + if (sizeof(ino_t) < sizeof(erofs_nid_t)) + ino ^= nid >> (sizeof(erofs_nid_t) - sizeof(ino_t)) * 8; + return ino; +} - return EROFS_I(inode)->nid == nid; +static int erofs_iget5_eq(struct inode *inode, void *opaque) +{ + return EROFS_I(inode)->nid == *(erofs_nid_t *)opaque; } -static int erofs_iget_set_actor(struct inode *inode, void *opaque) +static int erofs_iget5_set(struct inode *inode, void *opaque) { const erofs_nid_t nid = *(erofs_nid_t *)opaque; - inode->i_ino = erofs_inode_hash(nid); + inode->i_ino = erofs_squash_ino(nid); + EROFS_I(inode)->nid = nid; return 0; } struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid) { - const unsigned long hashval = erofs_inode_hash(nid); struct inode *inode; - inode = iget5_locked(sb, hashval, erofs_ilookup_test_actor, - erofs_iget_set_actor, &nid); + inode = iget5_locked(sb, erofs_squash_ino(nid), erofs_iget5_eq, + erofs_iget5_set, &nid); if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - int err; - struct erofs_inode *vi = EROFS_I(inode); - - vi->nid = nid; + int err = erofs_fill_inode(inode); - err = erofs_fill_inode(inode); - if (!err) { - unlock_new_inode(inode); - } else { + if (err) { iget_failed(inode); - inode = ERR_PTR(err); + return ERR_PTR(err); } + unlock_new_inode(inode); } return inode; } -int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -366,7 +368,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->attributes_mask |= (STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); return 0; } diff --git 
a/fs/erofs/internal.h b/fs/erofs/internal.h index bb8501c0ff5b..3f3561d37d1b 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -12,7 +12,6 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/bio.h> -#include <linux/buffer_head.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -108,9 +107,12 @@ struct erofs_domain { struct erofs_fscache { struct fscache_cookie *cookie; - struct inode *inode; - struct inode *anon_inode; + struct inode *inode; /* anonymous inode for the blob */ + + /* used for share domain mode */ struct erofs_domain *domain; + struct list_head node; + refcount_t ref; char *name; }; @@ -271,11 +273,6 @@ struct erofs_buf { #define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ) #define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ) -static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid) -{ - return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits); -} - #define EROFS_FEATURE_FUNCS(name, compat, feature) \ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ { \ @@ -340,13 +337,14 @@ struct erofs_inode { struct inode vfs_inode; }; -#define EROFS_I(ptr) \ - container_of(ptr, struct erofs_inode, vfs_inode) +#define EROFS_I(ptr) container_of(ptr, struct erofs_inode, vfs_inode) -static inline unsigned long erofs_inode_datablocks(struct inode *inode) +static inline erofs_off_t erofs_iloc(struct inode *inode) { - /* since i_size cannot be changed */ - return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + + return blknr_to_addr(sbi->meta_blkaddr) + + (EROFS_I(inode)->nid << sbi->islotbits); } static inline unsigned int erofs_bitrange(unsigned int value, unsigned int bit, @@ -382,31 +380,18 @@ struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, readahead_gfp_mask(mapping) & ~__GFP_RECLAIM); } -extern const struct super_operations erofs_sops; -extern struct file_system_type erofs_fs_type; - 
-extern const struct address_space_operations erofs_raw_access_aops; -extern const struct address_space_operations z_erofs_aops; - -enum { - BH_Encoded = BH_PrivateStart, - BH_FullMapped, - BH_Fragment, - BH_Partialref, -}; - /* Has a disk mapping */ -#define EROFS_MAP_MAPPED (1 << BH_Mapped) +#define EROFS_MAP_MAPPED 0x0001 /* Located in metadata (could be copied from bd_inode) */ -#define EROFS_MAP_META (1 << BH_Meta) +#define EROFS_MAP_META 0x0002 /* The extent is encoded */ -#define EROFS_MAP_ENCODED (1 << BH_Encoded) +#define EROFS_MAP_ENCODED 0x0004 /* The length of extent is full */ -#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) +#define EROFS_MAP_FULL_MAPPED 0x0008 /* Located in the special packed inode */ -#define EROFS_MAP_FRAGMENT (1 << BH_Fragment) +#define EROFS_MAP_FRAGMENT 0x0010 /* The extent refers to partial decompressed data */ -#define EROFS_MAP_PARTIAL_REF (1 << BH_Partialref) +#define EROFS_MAP_PARTIAL_REF 0x0020 struct erofs_map_blocks { struct erofs_buf buf; @@ -419,17 +404,15 @@ struct erofs_map_blocks { unsigned int m_flags; }; -/* Flags used by erofs_map_blocks_flatmode() */ -#define EROFS_GET_BLOCKS_RAW 0x0001 /* * Used to get the exact decompressed length, e.g. fiemap (consider lookback * approach instead if possible since it's more metadata lightweight.) 
*/ -#define EROFS_GET_BLOCKS_FIEMAP 0x0002 +#define EROFS_GET_BLOCKS_FIEMAP 0x0001 /* Used to map the whole extent if non-negligible data is requested for LZMA */ -#define EROFS_GET_BLOCKS_READMORE 0x0004 +#define EROFS_GET_BLOCKS_READMORE 0x0002 /* Used to map tail extent for tailpacking inline or fragment pcluster */ -#define EROFS_GET_BLOCKS_FINDTAIL 0x0008 +#define EROFS_GET_BLOCKS_FINDTAIL 0x0004 enum { Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, @@ -437,24 +420,6 @@ enum { Z_EROFS_COMPRESSION_RUNTIME_MAX }; -/* zmap.c */ -extern const struct iomap_ops z_erofs_iomap_report_ops; - -#ifdef CONFIG_EROFS_FS_ZIP -int z_erofs_fill_inode(struct inode *inode); -int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, - int flags); -#else -static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; } -static inline int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, - int flags) -{ - return -EOPNOTSUPP; -} -#endif /* !CONFIG_EROFS_FS_ZIP */ - struct erofs_map_dev { struct erofs_fscache *m_fscache; struct block_device *m_bdev; @@ -465,8 +430,27 @@ struct erofs_map_dev { unsigned int m_deviceid; }; -/* data.c */ +extern struct file_system_type erofs_fs_type; +extern const struct super_operations erofs_sops; + +extern const struct address_space_operations erofs_raw_access_aops; +extern const struct address_space_operations z_erofs_aops; +extern const struct address_space_operations erofs_fscache_access_aops; + +extern const struct inode_operations erofs_generic_iops; +extern const struct inode_operations erofs_symlink_iops; +extern const struct inode_operations erofs_fast_symlink_iops; +extern const struct inode_operations erofs_dir_iops; + extern const struct file_operations erofs_file_fops; +extern const struct file_operations erofs_dir_fops; + +extern const struct iomap_ops z_erofs_iomap_report_ops; + +/* flags for erofs_fscache_register_cookie() */ +#define EROFS_REG_COOKIE_SHARE 
0x0001 +#define EROFS_REG_COOKIE_NEED_NOEXIST 0x0002 + void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); void *erofs_bread(struct erofs_buf *buf, struct inode *inode, @@ -476,37 +460,14 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); -int erofs_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, int flags); - -/* inode.c */ -static inline unsigned long erofs_inode_hash(erofs_nid_t nid) -{ -#if BITS_PER_LONG == 32 - return (nid >> 32) ^ (nid & 0xffffffff); -#else - return nid; -#endif -} - -extern const struct inode_operations erofs_generic_iops; -extern const struct inode_operations erofs_symlink_iops; -extern const struct inode_operations erofs_fast_symlink_iops; - +int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map); struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); -int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); - -/* namei.c */ -extern const struct inode_operations erofs_dir_iops; - int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, unsigned int *d_type); -/* dir.c */ -extern const struct file_operations erofs_dir_fops; - static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count) { int retried = 0; @@ -522,23 +483,19 @@ static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count) return NULL; } -/* pcpubuf.c */ void *erofs_get_pcpubuf(unsigned int requiredpages); void erofs_put_pcpubuf(void *ptr); int erofs_pcpubuf_growsize(unsigned int nrpages); void erofs_pcpubuf_init(void); void erofs_pcpubuf_exit(void); -/* sysfs.c */ int erofs_register_sysfs(struct 
super_block *sb); void erofs_unregister_sysfs(struct super_block *sb); int __init erofs_init_sysfs(void); void erofs_exit_sysfs(void); -/* utils.c / zdata.c */ struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp); -static inline void erofs_pagepool_add(struct page **pagepool, - struct page *page) +static inline void erofs_pagepool_add(struct page **pagepool, struct page *page) { set_page_private(page, (unsigned long)*pagepool); *pagepool = page; @@ -564,6 +521,9 @@ int erofs_try_to_free_cached_page(struct page *page); int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len); +int z_erofs_fill_inode(struct inode *inode); +int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, + int flags); #else static inline void erofs_shrinker_register(struct super_block *sb) {} static inline void erofs_shrinker_unregister(struct super_block *sb) {} @@ -581,6 +541,7 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } return 0; } +static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_ZIP_LZMA @@ -601,23 +562,15 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb, } return 0; } -#endif /* !CONFIG_EROFS_FS_ZIP */ +#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */ -/* flags for erofs_fscache_register_cookie() */ -#define EROFS_REG_COOKIE_NEED_INODE 1 -#define EROFS_REG_COOKIE_NEED_NOEXIST 2 - -/* fscache.c */ #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, - unsigned int flags); + char *name, unsigned int flags); void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); - -extern const struct address_space_operations erofs_fscache_access_aops; #else static inline int 
erofs_fscache_register_fs(struct super_block *sb) { @@ -627,8 +580,7 @@ static inline void erofs_fscache_unregister_fs(struct super_block *sb) {} static inline struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, - unsigned int flags) + char *name, unsigned int flags) { return ERR_PTR(-EOPNOTSUPP); } diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index b64a108fac92..966eabc61c13 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -5,7 +5,6 @@ * Copyright (C) 2022, Alibaba Cloud */ #include "xattr.h" - #include <trace/events/erofs.h> struct erofs_qstr { @@ -87,19 +86,13 @@ static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name, return ERR_PTR(-ENOENT); } -static void *find_target_block_classic(struct erofs_buf *target, - struct inode *dir, - struct erofs_qstr *name, - int *_ndirents) +static void *erofs_find_target_block(struct erofs_buf *target, + struct inode *dir, struct erofs_qstr *name, int *_ndirents) { - unsigned int startprfx, endprfx; - int head, back; + int head = 0, back = DIV_ROUND_UP(dir->i_size, EROFS_BLKSIZ) - 1; + unsigned int startprfx = 0, endprfx = 0; void *candidate = ERR_PTR(-ENOENT); - startprfx = endprfx = 0; - head = 0; - back = erofs_inode_datablocks(dir) - 1; - while (head <= back) { const int mid = head + (back - head) / 2; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; @@ -180,8 +173,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, qn.end = name->name + name->len; ndirents = 0; - - de = find_target_block_classic(&buf, dir, &qn, &ndirents); + de = erofs_find_target_block(&buf, dir, &qn, &ndirents); if (IS_ERR(de)) return PTR_ERR(de); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 626a615dafc2..19b1ae79cec4 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -5,7 +5,6 @@ * Copyright (C) 2021, Alibaba Cloud */ #include <linux/module.h> -#include <linux/buffer_head.h> #include <linux/statfs.h> #include <linux/parser.h> #include 
<linux/seq_file.h> @@ -969,6 +968,8 @@ static void erofs_put_super(struct super_block *sb) iput(sbi->packed_inode); sbi->packed_inode = NULL; #endif + erofs_free_dev_context(sbi->devs); + sbi->devs = NULL; erofs_fscache_unregister_fs(sb); } diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index fd476961f742..435e515c0792 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -179,13 +179,13 @@ static const struct sysfs_ops erofs_attr_ops = { .store = erofs_attr_store, }; -static struct kobj_type erofs_sb_ktype = { +static const struct kobj_type erofs_sb_ktype = { .default_groups = erofs_groups, .sysfs_ops = &erofs_attr_ops, .release = erofs_sb_release, }; -static struct kobj_type erofs_ktype = { +static const struct kobj_type erofs_ktype = { .sysfs_ops = &erofs_attr_ops, }; @@ -193,7 +193,7 @@ static struct kset erofs_root = { .kobj = {.ktype = &erofs_ktype}, }; -static struct kobj_type erofs_feat_ktype = { +static const struct kobj_type erofs_feat_ktype = { .default_groups = erofs_feat_groups, .sysfs_ops = &erofs_attr_ops, }; diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h deleted file mode 100644 index 64ceb7270b5c..000000000000 --- a/fs/erofs/tagptr.h +++ /dev/null @@ -1,107 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * A tagged pointer implementation - */ -#ifndef __EROFS_FS_TAGPTR_H -#define __EROFS_FS_TAGPTR_H - -#include <linux/types.h> -#include <linux/build_bug.h> - -/* - * the name of tagged pointer types are tagptr{1, 2, 3...}_t - * avoid directly using the internal structs __tagptr{1, 2, 3...} - */ -#define __MAKE_TAGPTR(n) \ -typedef struct __tagptr##n { \ - uintptr_t v; \ -} tagptr##n##_t; - -__MAKE_TAGPTR(1) -__MAKE_TAGPTR(2) -__MAKE_TAGPTR(3) -__MAKE_TAGPTR(4) - -#undef __MAKE_TAGPTR - -extern void __compiletime_error("bad tagptr tags") - __bad_tagptr_tags(void); - -extern void __compiletime_error("bad tagptr type") - __bad_tagptr_type(void); - -/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */ -#define 
__tagptr_mask_1(ptr, n) \ - __builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \ - (1UL << (n)) - 1 : - -#define __tagptr_mask(ptr) (\ - __tagptr_mask_1(ptr, 1) ( \ - __tagptr_mask_1(ptr, 2) ( \ - __tagptr_mask_1(ptr, 3) ( \ - __tagptr_mask_1(ptr, 4) ( \ - __bad_tagptr_type(), 0))))) - -/* generate a tagged pointer from a raw value */ -#define tagptr_init(type, val) \ - ((typeof(type)){ .v = (uintptr_t)(val) }) - -/* - * directly cast a tagged pointer to the native pointer type, which - * could be used for backward compatibility of existing code. - */ -#define tagptr_cast_ptr(tptr) ((void *)(tptr).v) - -/* encode tagged pointers */ -#define tagptr_fold(type, ptr, _tags) ({ \ - const typeof(_tags) tags = (_tags); \ - if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \ - __bad_tagptr_tags(); \ -tagptr_init(type, (uintptr_t)(ptr) | tags); }) - -/* decode tagged pointers */ -#define tagptr_unfold_ptr(tptr) \ - ((void *)((tptr).v & ~__tagptr_mask(tptr))) - -#define tagptr_unfold_tags(tptr) \ - ((tptr).v & __tagptr_mask(tptr)) - -/* operations for the tagger pointer */ -#define tagptr_eq(_tptr1, _tptr2) ({ \ - typeof(_tptr1) tptr1 = (_tptr1); \ - typeof(_tptr2) tptr2 = (_tptr2); \ - (void)(&tptr1 == &tptr2); \ -(tptr1).v == (tptr2).v; }) - -/* lock-free CAS operation */ -#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - typeof(_o) o = (_o); \ - typeof(_n) n = (_n); \ - (void)(&o == &n); \ - (void)(&o == ptptr); \ -tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); }) - -/* wrap WRITE_ONCE if atomic update is needed */ -#define tagptr_replace_tags(_ptptr, tags) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - *ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \ -*ptptr; }) - -#define tagptr_set_tags(_ptptr, _tags) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - const typeof(_tags) tags = (_tags); \ - if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ - __bad_tagptr_tags(); \ - ptptr->v |= tags; \ 
-*ptptr; }) - -#define tagptr_clear_tags(_ptptr, _tags) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - const typeof(_tags) tags = (_tags); \ - if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ - __bad_tagptr_tags(); \ - ptptr->v &= ~tags; \ -*ptptr; }) - -#endif /* __EROFS_FS_TAGPTR_H */ diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index a62fb8a3318a..60729b1220b6 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -22,8 +22,7 @@ static int init_inode_xattrs(struct inode *inode) struct xattr_iter it; unsigned int i; struct erofs_xattr_ibody_header *ih; - struct super_block *sb; - struct erofs_sb_info *sbi; + struct super_block *sb = inode->i_sb; int ret = 0; /* the most case is that xattrs of this inode are initialized. */ @@ -52,15 +51,14 @@ static int init_inode_xattrs(struct inode *inode) * undefined right now (maybe use later with some new sb feature). */ if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) { - erofs_err(inode->i_sb, + erofs_err(sb, "xattr_isize %d of nid %llu is not supported yet", vi->xattr_isize, vi->nid); ret = -EOPNOTSUPP; goto out_unlock; } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) { if (vi->xattr_isize) { - erofs_err(inode->i_sb, - "bogus xattr ibody @ nid %llu", vi->nid); + erofs_err(sb, "bogus xattr ibody @ nid %llu", vi->nid); DBG_BUGON(1); ret = -EFSCORRUPTED; goto out_unlock; /* xattr ondisk layout error */ @@ -69,11 +67,9 @@ static int init_inode_xattrs(struct inode *inode) goto out_unlock; } - sb = inode->i_sb; - sbi = EROFS_SB(sb); it.buf = __EROFS_BUF_INITIALIZER; - it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize); - it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize); + it.blkaddr = erofs_blknr(erofs_iloc(inode) + vi->inode_isize); + it.ofs = erofs_blkoff(erofs_iloc(inode) + vi->inode_isize); /* read in shared xattr array (non-atomic, see kmalloc below) */ it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP); @@ -159,7 +155,6 @@ 
static int inline_xattr_iter_begin(struct xattr_iter *it, struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); - struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb); unsigned int xattr_header_sz, inline_xattr_ofs; xattr_header_sz = inlinexattr_header_size(inode); @@ -170,9 +165,8 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, inline_xattr_ofs = vi->inode_isize + xattr_header_sz; - it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs); - it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); - + it->blkaddr = erofs_blknr(erofs_iloc(inode) + inline_xattr_ofs); + it->ofs = erofs_blkoff(erofs_iloc(inode) + inline_xattr_ofs); it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, EROFS_KMAP); if (IS_ERR(it->kaddr)) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5200bb86e264..3247d2422bea 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -4,13 +4,178 @@ * https://www.huawei.com/ * Copyright (C) 2022 Alibaba Cloud */ -#include "zdata.h" #include "compress.h" #include <linux/prefetch.h> #include <linux/psi.h> - +#include <linux/cpuhotplug.h> #include <trace/events/erofs.h> +#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) +#define Z_EROFS_INLINE_BVECS 2 + +/* + * let's leave a type here in case of introducing + * another tagged pointer later. + */ +typedef void *z_erofs_next_pcluster_t; + +struct z_erofs_bvec { + struct page *page; + int offset; + unsigned int end; +}; + +#define __Z_EROFS_BVSET(name, total) \ +struct name { \ + /* point to the next page which contains the following bvecs */ \ + struct page *nextpage; \ + struct z_erofs_bvec bvec[total]; \ +} +__Z_EROFS_BVSET(z_erofs_bvset,); +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); + +/* + * Structure fields follow one of the following exclusion rules. 
+ * + * I: Modifiable by initialization/destruction paths and read-only + * for everyone else; + * + * L: Field should be protected by the pcluster lock; + * + * A: Field should be accessed / updated in atomic for parallelized code. + */ +struct z_erofs_pcluster { + struct erofs_workgroup obj; + struct mutex lock; + + /* A: point to next chained pcluster or TAILs */ + z_erofs_next_pcluster_t next; + + /* L: the maximum decompression size of this round */ + unsigned int length; + + /* L: total number of bvecs */ + unsigned int vcnt; + + /* I: page offset of start position of decompression */ + unsigned short pageofs_out; + + /* I: page offset of inline compressed data */ + unsigned short pageofs_in; + + union { + /* L: inline a certain number of bvec for bootstrap */ + struct z_erofs_bvset_inline bvset; + + /* I: can be used to free the pcluster by RCU. */ + struct rcu_head rcu; + }; + + union { + /* I: physical cluster size in pages */ + unsigned short pclusterpages; + + /* I: tailpacking inline compressed size */ + unsigned short tailpacking_size; + }; + + /* I: compression algorithm format */ + unsigned char algorithmformat; + + /* L: whether partial decompression or not */ + bool partial; + + /* L: indicate several pageofs_outs or not */ + bool multibases; + + /* A: compressed bvecs (can be cached or inplaced pages) */ + struct z_erofs_bvec compressed_bvecs[]; +}; + +/* let's avoid the valid 32-bit kernel addresses */ + +/* the chained workgroup has't submitted io (still open) */ +#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) +/* the chained workgroup has already submitted io */ +#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) + +#define Z_EROFS_PCLUSTER_NIL (NULL) + +struct z_erofs_decompressqueue { + struct super_block *sb; + atomic_t pending_bios; + z_erofs_next_pcluster_t head; + + union { + struct completion done; + struct work_struct work; + struct kthread_work kthread_work; + } u; + bool eio, sync; +}; + +static inline bool 
z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) +{ + return !pcl->obj.index; +} + +static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) +{ + if (z_erofs_is_inline_pcluster(pcl)) + return 1; + return pcl->pclusterpages; +} + +/* + * bit 30: I/O error occurred on this page + * bit 0 - 29: remaining parts to complete this page + */ +#define Z_EROFS_PAGE_EIO (1 << 30) + +static inline void z_erofs_onlinepage_init(struct page *page) +{ + union { + atomic_t o; + unsigned long v; + } u = { .o = ATOMIC_INIT(1) }; + + set_page_private(page, u.v); + smp_wmb(); + SetPagePrivate(page); +} + +static inline void z_erofs_onlinepage_split(struct page *page) +{ + atomic_inc((atomic_t *)&page->private); +} + +static inline void z_erofs_page_mark_eio(struct page *page) +{ + int orig; + + do { + orig = atomic_read((atomic_t *)&page->private); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, + orig | Z_EROFS_PAGE_EIO) != orig); +} + +static inline void z_erofs_onlinepage_endio(struct page *page) +{ + unsigned int v; + + DBG_BUGON(!PagePrivate(page)); + v = atomic_dec_return((atomic_t *)&page->private); + if (!(v & ~Z_EROFS_PAGE_EIO)) { + set_page_private(page, 0); + ClearPagePrivate(page); + if (!(v & Z_EROFS_PAGE_EIO)) + SetPageUptodate(page); + unlock_page(page); + } +} + +#define Z_EROFS_ONSTACK_PAGES 32 + /* * since pclustersize is variable for big pcluster feature, introduce slab * pools implementation for different pcluster sizes. 
@@ -175,35 +340,130 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) DBG_BUGON(1); } -/* - * tagged pointer with 1-bit tag for all compressed pages - * tag 0 - the page is just found with an extra page reference - */ -typedef tagptr1_t compressed_page_t; +static struct workqueue_struct *z_erofs_workqueue __read_mostly; -#define tag_compressed_page_justfound(page) \ - tagptr_fold(compressed_page_t, page, 1) +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD +static struct kthread_worker __rcu **z_erofs_pcpu_workers; -static struct workqueue_struct *z_erofs_workqueue __read_mostly; +static void erofs_destroy_percpu_workers(void) +{ + struct kthread_worker *worker; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + worker = rcu_dereference_protected( + z_erofs_pcpu_workers[cpu], 1); + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); + if (worker) + kthread_destroy_worker(worker); + } + kfree(z_erofs_pcpu_workers); +} -void z_erofs_exit_zip_subsystem(void) +static struct kthread_worker *erofs_init_percpu_worker(int cpu) { - destroy_workqueue(z_erofs_workqueue); - z_erofs_destroy_pcluster_pool(); + struct kthread_worker *worker = + kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu); + + if (IS_ERR(worker)) + return worker; + if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI)) + sched_set_fifo_low(worker->task); + else + sched_set_normal(worker->task, 0); + return worker; } -static inline int z_erofs_init_workqueue(void) +static int erofs_init_percpu_workers(void) { - const unsigned int onlinecpus = num_possible_cpus(); + struct kthread_worker *worker; + unsigned int cpu; - /* - * no need to spawn too many threads, limiting threads could minimum - * scheduling overhead, perhaps per-CPU threads should be better? - */ - z_erofs_workqueue = alloc_workqueue("erofs_unzipd", - WQ_UNBOUND | WQ_HIGHPRI, - onlinecpus + onlinecpus / 4); - return z_erofs_workqueue ? 
0 : -ENOMEM; + z_erofs_pcpu_workers = kcalloc(num_possible_cpus(), + sizeof(struct kthread_worker *), GFP_ATOMIC); + if (!z_erofs_pcpu_workers) + return -ENOMEM; + + for_each_online_cpu(cpu) { /* could miss cpu{off,on}line? */ + worker = erofs_init_percpu_worker(cpu); + if (!IS_ERR(worker)) + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); + } + return 0; +} +#else +static inline void erofs_destroy_percpu_workers(void) {} +static inline int erofs_init_percpu_workers(void) { return 0; } +#endif + +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD) +static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock); +static enum cpuhp_state erofs_cpuhp_state; + +static int erofs_cpu_online(unsigned int cpu) +{ + struct kthread_worker *worker, *old; + + worker = erofs_init_percpu_worker(cpu); + if (IS_ERR(worker)) + return PTR_ERR(worker); + + spin_lock(&z_erofs_pcpu_worker_lock); + old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], + lockdep_is_held(&z_erofs_pcpu_worker_lock)); + if (!old) + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); + spin_unlock(&z_erofs_pcpu_worker_lock); + if (old) + kthread_destroy_worker(worker); + return 0; +} + +static int erofs_cpu_offline(unsigned int cpu) +{ + struct kthread_worker *worker; + + spin_lock(&z_erofs_pcpu_worker_lock); + worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], + lockdep_is_held(&z_erofs_pcpu_worker_lock)); + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); + spin_unlock(&z_erofs_pcpu_worker_lock); + + synchronize_rcu(); + if (worker) + kthread_destroy_worker(worker); + return 0; +} + +static int erofs_cpu_hotplug_init(void) +{ + int state; + + state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline); + if (state < 0) + return state; + + erofs_cpuhp_state = state; + return 0; +} + +static void erofs_cpu_hotplug_destroy(void) +{ + if (erofs_cpuhp_state) + cpuhp_remove_state_nocalls(erofs_cpuhp_state); +} +#else /* 
!CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */ +static inline int erofs_cpu_hotplug_init(void) { return 0; } +static inline void erofs_cpu_hotplug_destroy(void) {} +#endif + +void z_erofs_exit_zip_subsystem(void) +{ + erofs_cpu_hotplug_destroy(); + erofs_destroy_percpu_workers(); + destroy_workqueue(z_erofs_workqueue); + z_erofs_destroy_pcluster_pool(); } int __init z_erofs_init_zip_subsystem(void) @@ -211,10 +471,31 @@ int __init z_erofs_init_zip_subsystem(void) int err = z_erofs_create_pcluster_pool(); if (err) - return err; - err = z_erofs_init_workqueue(); + goto out_error_pcluster_pool; + + z_erofs_workqueue = alloc_workqueue("erofs_worker", + WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus()); + if (!z_erofs_workqueue) { + err = -ENOMEM; + goto out_error_workqueue_init; + } + + err = erofs_init_percpu_workers(); if (err) - z_erofs_destroy_pcluster_pool(); + goto out_error_pcpu_worker; + + err = erofs_cpu_hotplug_init(); + if (err < 0) + goto out_error_cpuhp_init; + return err; + +out_error_cpuhp_init: + erofs_destroy_percpu_workers(); +out_error_pcpu_worker: + destroy_workqueue(z_erofs_workqueue); +out_error_workqueue_init: + z_erofs_destroy_pcluster_pool(); +out_error_pcluster_pool: return err; } @@ -319,7 +600,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page; - compressed_page_t t; + void *t; /* mark pages just found for debugging */ struct page *newpage = NULL; /* the compressed page was loaded before */ @@ -329,7 +610,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, page = find_get_page(mc, pcl->obj.index + i); if (page) { - t = tag_compressed_page_justfound(page); + t = (void *)((unsigned long)page | 1); } else { /* I/O is needed, no possible to decompress directly */ standalone = false; @@ -345,11 +626,10 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, if (!newpage) continue; set_page_private(newpage, 
Z_EROFS_PREALLOCATED_PAGE); - t = tag_compressed_page_justfound(newpage); + t = (void *)((unsigned long)newpage | 1); } - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, - tagptr_cast_ptr(t))) + if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t)) continue; if (page) @@ -1151,18 +1431,24 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); z_erofs_decompress_queue(bgq, &pagepool); - erofs_release_pages(&pagepool); kvfree(bgq); } +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD +static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work) +{ + z_erofs_decompressqueue_work((struct work_struct *)work); +} +#endif + static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, - bool sync, int bios) + int bios) { struct erofs_sb_info *const sbi = EROFS_SB(io->sb); /* wake up the caller thread for sync decompression */ - if (sync) { + if (io->sync) { if (!atomic_add_return(bios, &io->pending_bios)) complete(&io->u.done); return; @@ -1170,9 +1456,24 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (atomic_add_return(bios, &io->pending_bios)) return; - /* Use workqueue and sync decompression for atomic contexts only */ + /* Use (kthread_)work and sync decompression for atomic contexts only */ if (in_atomic() || irqs_disabled()) { +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD + struct kthread_worker *worker; + + rcu_read_lock(); + worker = rcu_dereference( + z_erofs_pcpu_workers[raw_smp_processor_id()]); + if (!worker) { + INIT_WORK(&io->u.work, z_erofs_decompressqueue_work); + queue_work(z_erofs_workqueue, &io->u.work); + } else { + kthread_queue_work(worker, &io->u.kthread_work); + } + rcu_read_unlock(); +#else queue_work(z_erofs_workqueue, &io->u.work); +#endif /* enable sync decompression for readahead */ if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; @@ -1192,8 +1493,6 
@@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, struct address_space *mapping; struct page *oldpage, *page; - - compressed_page_t t; int justfound; repeat: @@ -1203,10 +1502,8 @@ repeat: if (!page) goto out_allocpage; - /* process the target tagged pointer */ - t = tagptr_init(compressed_page_t, page); - justfound = tagptr_unfold_tags(t); - page = tagptr_unfold_ptr(t); + justfound = (unsigned long)page & 1UL; + page = (struct page *)((unsigned long)page & ~1UL); /* * preallocated cached pages, which is used to avoid direct reclaim @@ -1294,9 +1591,8 @@ out: /* the only exit (for tracing and debugging) */ return page; } -static struct z_erofs_decompressqueue * -jobqueue_init(struct super_block *sb, - struct z_erofs_decompressqueue *fgq, bool *fg) +static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, + struct z_erofs_decompressqueue *fgq, bool *fg) { struct z_erofs_decompressqueue *q; @@ -1306,13 +1602,19 @@ jobqueue_init(struct super_block *sb, *fg = true; goto fg_out; } +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD + kthread_init_work(&q->u.kthread_work, + z_erofs_decompressqueue_kthread_work); +#else INIT_WORK(&q->u.work, z_erofs_decompressqueue_work); +#endif } else { fg_out: q = fgq; init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); q->eio = false; + q->sync = true; } q->sb = sb; q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; @@ -1326,20 +1628,6 @@ enum { NR_JOBQUEUES, }; -static void *jobqueueset_init(struct super_block *sb, - struct z_erofs_decompressqueue *q[], - struct z_erofs_decompressqueue *fgq, bool *fg) -{ - /* - * if managed cache is enabled, bypass jobqueue is needed, - * no need to read from device for all pclusters in this queue. 
- */ - q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); - q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg); - - return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg)); -} - static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, z_erofs_next_pcluster_t qtail[], z_erofs_next_pcluster_t owned_head) @@ -1361,8 +1649,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, static void z_erofs_decompressqueue_endio(struct bio *bio) { - tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); - struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); + struct z_erofs_decompressqueue *q = bio->bi_private; blk_status_t err = bio->bi_status; struct bio_vec *bvec; struct bvec_iter_all iter_all; @@ -1381,7 +1668,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) } if (err) q->eio = true; - z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); + z_erofs_decompress_kickoff(q, -1); bio_put(bio); } @@ -1394,7 +1681,6 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; - void *bi_private; z_erofs_next_pcluster_t owned_head = f->owned_head; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ pgoff_t last_index; @@ -1404,7 +1690,13 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, unsigned long pflags; int memstall = 0; - bi_private = jobqueueset_init(sb, q, fgq, force_fg); + /* + * if managed cache is enabled, bypass jobqueue is needed, + * no need to read from device for all pclusters in this queue. 
+ */ + q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); + q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); + qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; @@ -1473,7 +1765,7 @@ submit_bio_retry: last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << LOG_SECTORS_PER_BLOCK; - bio->bi_private = bi_private; + bio->bi_private = q[JQ_SUBMIT]; if (f->readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; @@ -1500,13 +1792,13 @@ submit_bio_retry: /* * although background is preferred, no one is pending for submission. - * don't issue workqueue for decompression but drop it directly instead. + * don't issue decompression but drop it directly instead. */ if (!*force_fg && !nr_bios) { kvfree(q[JQ_SUBMIT]); return; } - z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); + z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); } static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h deleted file mode 100644 index d98c95212985..000000000000 --- a/fs/erofs/zdata.h +++ /dev/null @@ -1,178 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2018 HUAWEI, Inc. - * https://www.huawei.com/ - */ -#ifndef __EROFS_FS_ZDATA_H -#define __EROFS_FS_ZDATA_H - -#include "internal.h" -#include "tagptr.h" - -#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) -#define Z_EROFS_INLINE_BVECS 2 - -/* - * let's leave a type here in case of introducing - * another tagged pointer later. 
- */ -typedef void *z_erofs_next_pcluster_t; - -struct z_erofs_bvec { - struct page *page; - int offset; - unsigned int end; -}; - -#define __Z_EROFS_BVSET(name, total) \ -struct name { \ - /* point to the next page which contains the following bvecs */ \ - struct page *nextpage; \ - struct z_erofs_bvec bvec[total]; \ -} -__Z_EROFS_BVSET(z_erofs_bvset,); -__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); - -/* - * Structure fields follow one of the following exclusion rules. - * - * I: Modifiable by initialization/destruction paths and read-only - * for everyone else; - * - * L: Field should be protected by the pcluster lock; - * - * A: Field should be accessed / updated in atomic for parallelized code. - */ -struct z_erofs_pcluster { - struct erofs_workgroup obj; - struct mutex lock; - - /* A: point to next chained pcluster or TAILs */ - z_erofs_next_pcluster_t next; - - /* L: the maximum decompression size of this round */ - unsigned int length; - - /* L: total number of bvecs */ - unsigned int vcnt; - - /* I: page offset of start position of decompression */ - unsigned short pageofs_out; - - /* I: page offset of inline compressed data */ - unsigned short pageofs_in; - - union { - /* L: inline a certain number of bvec for bootstrap */ - struct z_erofs_bvset_inline bvset; - - /* I: can be used to free the pcluster by RCU. 
*/ - struct rcu_head rcu; - }; - - union { - /* I: physical cluster size in pages */ - unsigned short pclusterpages; - - /* I: tailpacking inline compressed size */ - unsigned short tailpacking_size; - }; - - /* I: compression algorithm format */ - unsigned char algorithmformat; - - /* L: whether partial decompression or not */ - bool partial; - - /* L: indicate several pageofs_outs or not */ - bool multibases; - - /* A: compressed bvecs (can be cached or inplaced pages) */ - struct z_erofs_bvec compressed_bvecs[]; -}; - -/* let's avoid the valid 32-bit kernel addresses */ - -/* the chained workgroup has't submitted io (still open) */ -#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) -/* the chained workgroup has already submitted io */ -#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) - -#define Z_EROFS_PCLUSTER_NIL (NULL) - -struct z_erofs_decompressqueue { - struct super_block *sb; - atomic_t pending_bios; - z_erofs_next_pcluster_t head; - - union { - struct completion done; - struct work_struct work; - } u; - - bool eio; -}; - -static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) -{ - return !pcl->obj.index; -} - -static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) -{ - if (z_erofs_is_inline_pcluster(pcl)) - return 1; - return pcl->pclusterpages; -} - -/* - * bit 30: I/O error occurred on this page - * bit 0 - 29: remaining parts to complete this page - */ -#define Z_EROFS_PAGE_EIO (1 << 30) - -static inline void z_erofs_onlinepage_init(struct page *page) -{ - union { - atomic_t o; - unsigned long v; - } u = { .o = ATOMIC_INIT(1) }; - - set_page_private(page, u.v); - smp_wmb(); - SetPagePrivate(page); -} - -static inline void z_erofs_onlinepage_split(struct page *page) -{ - atomic_inc((atomic_t *)&page->private); -} - -static inline void z_erofs_page_mark_eio(struct page *page) -{ - int orig; - - do { - orig = atomic_read((atomic_t *)&page->private); - } while (atomic_cmpxchg((atomic_t 
*)&page->private, orig, - orig | Z_EROFS_PAGE_EIO) != orig); -} - -static inline void z_erofs_onlinepage_endio(struct page *page) -{ - unsigned int v; - - DBG_BUGON(!PagePrivate(page)); - v = atomic_dec_return((atomic_t *)&page->private); - if (!(v & ~Z_EROFS_PAGE_EIO)) { - set_page_private(page, 0); - ClearPagePrivate(page); - if (!(v & Z_EROFS_PAGE_EIO)) - SetPageUptodate(page); - unlock_page(page); - } -} - -#define Z_EROFS_ONSTACK_PAGES 32 - -#endif diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 98fb90b9af71..8bf6d30518b6 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -7,10 +7,6 @@ #include <asm/unaligned.h> #include <trace/events/erofs.h> -static int z_erofs_do_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, - int flags); - int z_erofs_fill_inode(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); @@ -29,126 +25,6 @@ int z_erofs_fill_inode(struct inode *inode) return 0; } -static int z_erofs_fill_inode_lazy(struct inode *inode) -{ - struct erofs_inode *const vi = EROFS_I(inode); - struct super_block *const sb = inode->i_sb; - int err, headnr; - erofs_off_t pos; - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - void *kaddr; - struct z_erofs_map_header *h; - - if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { - /* - * paired with smp_mb() at the end of the function to ensure - * fields will only be observed after the bit is set. - */ - smp_mb(); - return 0; - } - - if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) - return -ERESTARTSYS; - - err = 0; - if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) - goto out_unlock; - - pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + - vi->xattr_isize, 8); - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); - if (IS_ERR(kaddr)) { - err = PTR_ERR(kaddr); - goto out_unlock; - } - - h = kaddr + erofs_blkoff(pos); - /* - * if the highest bit of the 8-byte map header is set, the whole file - * is stored in the packed inode. 
The rest bits keeps z_fragmentoff. - */ - if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) { - vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; - vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63); - vi->z_tailextent_headlcn = 0; - goto done; - } - vi->z_advise = le16_to_cpu(h->h_advise); - vi->z_algorithmtype[0] = h->h_algorithmtype & 15; - vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; - - headnr = 0; - if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || - vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { - erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", - headnr + 1, vi->z_algorithmtype[headnr], vi->nid); - err = -EOPNOTSUPP; - goto out_put_metabuf; - } - - vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); - if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && - vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | - Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { - erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", - vi->nid); - err = -EFSCORRUPTED; - goto out_put_metabuf; - } - if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION && - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { - erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", - vi->nid); - err = -EFSCORRUPTED; - goto out_put_metabuf; - } - - if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { - struct erofs_map_blocks map = { - .buf = __EROFS_BUF_INITIALIZER - }; - - vi->z_idata_size = le16_to_cpu(h->h_idata_size); - err = z_erofs_do_map_blocks(inode, &map, - EROFS_GET_BLOCKS_FINDTAIL); - erofs_put_metabuf(&map.buf); - - if (!map.m_plen || - erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) { - erofs_err(sb, "invalid tail-packing pclustersize %llu", - map.m_plen); - err = -EFSCORRUPTED; - } - if (err < 0) - goto out_put_metabuf; - } - - if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && - !(h->h_clusterbits >> 
Z_EROFS_FRAGMENT_INODE_BIT)) { - struct erofs_map_blocks map = { - .buf = __EROFS_BUF_INITIALIZER - }; - - vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); - err = z_erofs_do_map_blocks(inode, &map, - EROFS_GET_BLOCKS_FINDTAIL); - erofs_put_metabuf(&map.buf); - if (err < 0) - goto out_put_metabuf; - } -done: - /* paired with smp_mb() at the beginning of the function */ - smp_mb(); - set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); -out_put_metabuf: - erofs_put_metabuf(&buf); -out_unlock: - clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); - return err; -} - struct z_erofs_maprecorder { struct inode *inode; struct erofs_map_blocks *map; @@ -169,10 +45,9 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); - const erofs_off_t ibase = iloc(EROFS_I_SB(inode), vi->nid); const erofs_off_t pos = - Z_EROFS_VLE_LEGACY_INDEX_ALIGN(ibase + vi->inode_isize + - vi->xattr_isize) + + Z_EROFS_VLE_LEGACY_INDEX_ALIGN(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize) + lcn * sizeof(struct z_erofs_vle_decompressed_index); struct z_erofs_vle_decompressed_index *di; unsigned int advise, type; @@ -372,9 +247,8 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; - const erofs_off_t ebase = ALIGN(iloc(EROFS_I_SB(inode), vi->nid) + - vi->inode_isize + vi->xattr_isize, 8) + - sizeof(struct z_erofs_map_header); + const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + + ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); unsigned int compacted_4b_initial, compacted_2b; unsigned int amortizedshift; @@ -732,6 +606,125 @@ unmap_out: return err; } +static int z_erofs_fill_inode_lazy(struct inode *inode) +{ + struct 
erofs_inode *const vi = EROFS_I(inode); + struct super_block *const sb = inode->i_sb; + int err, headnr; + erofs_off_t pos; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + void *kaddr; + struct z_erofs_map_header *h; + + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { + /* + * paired with smp_mb() at the end of the function to ensure + * fields will only be observed after the bit is set. + */ + smp_mb(); + return 0; + } + + if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) + return -ERESTARTSYS; + + err = 0; + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) + goto out_unlock; + + pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); + kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); + if (IS_ERR(kaddr)) { + err = PTR_ERR(kaddr); + goto out_unlock; + } + + h = kaddr + erofs_blkoff(pos); + /* + * if the highest bit of the 8-byte map header is set, the whole file + * is stored in the packed inode. The rest bits keeps z_fragmentoff. 
+ */ + if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) { + vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; + vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63); + vi->z_tailextent_headlcn = 0; + goto done; + } + vi->z_advise = le16_to_cpu(h->h_advise); + vi->z_algorithmtype[0] = h->h_algorithmtype & 15; + vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; + + headnr = 0; + if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || + vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { + erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", + headnr + 1, vi->z_algorithmtype[headnr], vi->nid); + err = -EOPNOTSUPP; + goto out_put_metabuf; + } + + vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); + if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && + vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | + Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { + erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto out_put_metabuf; + } + if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { + erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto out_put_metabuf; + } + + if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { + struct erofs_map_blocks map = { + .buf = __EROFS_BUF_INITIALIZER + }; + + vi->z_idata_size = le16_to_cpu(h->h_idata_size); + err = z_erofs_do_map_blocks(inode, &map, + EROFS_GET_BLOCKS_FINDTAIL); + erofs_put_metabuf(&map.buf); + + if (!map.m_plen || + erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) { + erofs_err(sb, "invalid tail-packing pclustersize %llu", + map.m_plen); + err = -EFSCORRUPTED; + } + if (err < 0) + goto out_put_metabuf; + } + + if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && + !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) { + struct 
erofs_map_blocks map = { + .buf = __EROFS_BUF_INITIALIZER + }; + + vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); + err = z_erofs_do_map_blocks(inode, &map, + EROFS_GET_BLOCKS_FINDTAIL); + erofs_put_metabuf(&map.buf); + if (err < 0) + goto out_put_metabuf; + } +done: + /* paired with smp_mb() at the beginning of the function */ + smp_mb(); + set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); +out_put_metabuf: + erofs_put_metabuf(&buf); +out_unlock: + clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); + return err; +} + int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) { diff --git a/fs/exec.c b/fs/exec.c index ab913243a367..3d2b80d8d58e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1414,15 +1414,15 @@ EXPORT_SYMBOL(begin_new_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { struct inode *inode = file_inode(file); - struct user_namespace *mnt_userns = file_mnt_user_ns(file); - if (inode_permission(mnt_userns, inode, MAY_READ) < 0) { + struct mnt_idmap *idmap = file_mnt_idmap(file); + if (inode_permission(idmap, inode, MAY_READ) < 0) { struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; /* Ensure mm->user_ns contains the executable */ user_ns = old = bprm->mm->user_ns; while ((user_ns != &init_user_ns) && - !privileged_wrt_inode_uidgid(user_ns, mnt_userns, inode)) + !privileged_wrt_inode_uidgid(user_ns, idmap, inode)) user_ns = user_ns->parent; if (old != user_ns) { @@ -1596,7 +1596,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct inode *inode = file_inode(file); unsigned int mode; vfsuid_t vfsuid; @@ -1612,15 +1612,15 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) if (!(mode & (S_ISUID|S_ISGID))) return; - mnt_userns = file_mnt_user_ns(file); + idmap = 
file_mnt_idmap(file); /* Be careful if suid/sgid is set */ inode_lock(inode); /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); - vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid = i_gid_into_vfsgid(idmap, inode); inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index bc6d21d7c5ad..1bf16abe3c84 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -450,9 +450,9 @@ int exfat_trim_fs(struct inode *inode, struct fstrim_range *range); extern const struct file_operations exfat_file_operations; int __exfat_truncate(struct inode *inode); void exfat_truncate(struct inode *inode); -int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path, +int exfat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, unsigned int request_mask, unsigned int query_flags); int exfat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/fs/exfat/file.c b/fs/exfat/file.c index f5b29072775d..1fdb0a64b91d 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -226,14 +226,14 @@ write_size: mutex_unlock(&sbi->s_lock); } -int exfat_getattr(struct user_namespace *mnt_uerns, const struct path *path, +int exfat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, unsigned int request_mask, unsigned int query_flags) { struct inode *inode = d_backing_inode(path->dentry); struct exfat_inode_info *ei = EXFAT_I(inode); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); exfat_truncate_atime(&stat->atime); stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = 
ei->i_crtime.tv_sec; @@ -242,7 +242,7 @@ int exfat_getattr(struct user_namespace *mnt_uerns, const struct path *path, return 0; } -int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct exfat_sb_info *sbi = EXFAT_SB(dentry->d_sb); @@ -266,7 +266,7 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ATTR_TIMES_SET); } - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); attr->ia_valid = ia_valid; if (error) goto out; @@ -293,7 +293,7 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (attr->ia_valid & ATTR_SIZE) inode->i_mtime = inode->i_ctime = current_time(inode); - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); exfat_truncate_atime(&inode->i_atime); if (attr->ia_valid & ATTR_SIZE) { diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 5f995eba5dbb..02aab4c3a5f7 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -551,7 +551,7 @@ out: return ret; } -static int exfat_create(struct user_namespace *mnt_userns, struct inode *dir, +static int exfat_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; @@ -834,7 +834,7 @@ unlock: return err; } -static int exfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; @@ -1285,7 +1285,7 @@ out: return ret; } -static int exfat_rename(struct user_namespace *mnt_userns, +static int exfat_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 3204bd33e4e8..ab88d33d106c 
100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -145,7 +145,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); - tmp = lookup_one_unlocked(mnt_user_ns(mnt), nbuf, parent, strlen(nbuf)); + tmp = lookup_one_unlocked(mnt_idmap(mnt), nbuf, parent, strlen(nbuf)); if (IS_ERR(tmp)) { dprintk("lookup failed: %ld\n", PTR_ERR(tmp)); err = PTR_ERR(tmp); @@ -524,7 +524,7 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, } inode_lock(target_dir->d_inode); - nresult = lookup_one(mnt_user_ns(mnt), nbuf, + nresult = lookup_one(mnt_idmap(mnt), nbuf, target_dir, strlen(nbuf)); if (!IS_ERR(nresult)) { if (unlikely(nresult->d_inode != result->d_inode)) { diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 440d5f1e9d47..82b17d7fc93f 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -219,7 +219,7 @@ __ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) * inode->i_mutex: down */ int -ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +ext2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int error; @@ -228,7 +228,7 @@ ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - error = posix_acl_update_mode(&init_user_ns, inode, &mode, + error = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (error) return error; diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 3841becb94ff..4a8443a2b8ec 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -56,7 +56,7 @@ static inline int ext2_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext2_get_acl(struct inode *inode, int type, bool rcu); -extern int ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int ext2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int ext2_init_acl 
(struct inode *, struct inode *); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index e5cbc27ba459..4a6955a0a116 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -461,9 +461,9 @@ static int ext2_handle_dirsync(struct inode *dir) return err; } -void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, - struct page *page, void *page_addr, struct inode *inode, - int update_times) +int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, + struct page *page, void *page_addr, struct inode *inode, + bool update_times) { loff_t pos = page_offset(page) + (char *) de - (char *) page_addr; @@ -472,7 +472,10 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, lock_page(page); err = ext2_prepare_chunk(page, pos, len); - BUG_ON(err); + if (err) { + unlock_page(page); + return err; + } de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type(de, inode); ext2_commit_chunk(page, pos, len); @@ -480,7 +483,7 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, dir->i_mtime = dir->i_ctime = current_time(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); - ext2_handle_dirsync(dir); + return ext2_handle_dirsync(dir); } /* @@ -646,7 +649,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent) unlock_page(page); goto fail; } - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); memset(kaddr, 0, chunk_size); de = (struct ext2_dir_entry_2 *)kaddr; de->name_len = 1; @@ -661,7 +664,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent) de->inode = cpu_to_le32(parent->i_ino); memcpy (de->name, "..\0", 4); ext2_set_de_type (de, inode); - kunmap_atomic(kaddr); + kunmap_local(kaddr); ext2_commit_chunk(page, 0, chunk_size); err = ext2_handle_dirsync(inode); fail: diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 28de11a22e5f..cb78d7dcfb95 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -734,8 +734,9 @@ extern int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page, char 
*kaddr); extern int ext2_empty_dir (struct inode *); extern struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p, void **pa); -extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, void *, - struct inode *, int); +int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, + struct page *page, void *page_addr, struct inode *inode, + bool update_times); static inline void ext2_put_page(struct page *page, void *page_addr) { kunmap_local(page_addr); @@ -753,8 +754,8 @@ extern struct inode *ext2_iget (struct super_block *, unsigned long); extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_evict_inode(struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); -extern int ext2_setattr (struct user_namespace *, struct dentry *, struct iattr *); -extern int ext2_getattr (struct user_namespace *, const struct path *, +extern int ext2_setattr (struct mnt_idmap *, struct dentry *, struct iattr *); +extern int ext2_getattr (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext2_set_inode_flags(struct inode *inode); extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -762,7 +763,7 @@ extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* ioctl.c */ extern int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa); -extern int ext2_fileattr_set(struct user_namespace *mnt_userns, +extern int ext2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); extern long ext2_ioctl(struct file *, unsigned int, unsigned long); extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 78b8686d9a4a..a4e1d7a9c544 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -545,7 +545,7 @@ got: inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; } else - 
inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = ino; inode->i_blocks = 0; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 69aed9e2359e..26f135e7ffce 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1592,7 +1592,7 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } -int ext2_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ext2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -1614,28 +1614,28 @@ int ext2_getattr(struct user_namespace *mnt_userns, const struct path *path, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); return 0; } -int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ext2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(&init_user_ns, dentry, iattr); + error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) return error; - if (is_quota_modification(mnt_userns, inode, iattr)) { + if (is_quota_modification(&nop_mnt_idmap, inode, iattr)) { error = dquot_initialize(inode); if (error) return error; } - if (i_uid_needs_update(mnt_userns, iattr, inode) || - i_gid_needs_update(mnt_userns, iattr, inode)) { - error = dquot_transfer(mnt_userns, inode, iattr); + if (i_uid_needs_update(&nop_mnt_idmap, iattr, inode) || + i_gid_needs_update(&nop_mnt_idmap, iattr, inode)) { + error = dquot_transfer(&nop_mnt_idmap, inode, iattr); if (error) return error; } @@ -1644,9 +1644,9 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; } - setattr_copy(&init_user_ns, inode, 
iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); if (iattr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + error = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); mark_inode_dirty(inode); return error; diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index e8340bf09b10..cc87d413eb43 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -27,7 +27,7 @@ int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int ext2_fileattr_set(struct user_namespace *mnt_userns, +int ext2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -66,7 +66,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case EXT2_IOC_SETVERSION: { __u32 generation; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EPERM; ret = mnt_want_write_file(filp); if (ret) @@ -99,7 +99,7 @@ setversion_out: if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) return -ENOTTY; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EACCES; if (get_user(rsv_window_size, (int __user *)arg)) diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index c056957221a2..7f5dfa87cc95 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -99,7 +99,7 @@ struct dentry *ext2_get_parent(struct dentry *child) * If the create succeeds, we fill in the inode information * with d_instantiate(). 
*/ -static int ext2_create (struct user_namespace * mnt_userns, +static int ext2_create (struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { @@ -119,7 +119,7 @@ static int ext2_create (struct user_namespace * mnt_userns, return ext2_add_nondir(dentry, inode); } -static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int ext2_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode = ext2_new_inode(dir, mode, NULL); @@ -133,7 +133,7 @@ static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return finish_open_simple(file, 0); } -static int ext2_mknod (struct user_namespace * mnt_userns, struct inode * dir, +static int ext2_mknod (struct mnt_idmap * idmap, struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode * inode; @@ -154,7 +154,7 @@ static int ext2_mknod (struct user_namespace * mnt_userns, struct inode * dir, return err; } -static int ext2_symlink (struct user_namespace * mnt_userns, struct inode * dir, +static int ext2_symlink (struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, const char * symname) { struct super_block * sb = dir->i_sb; @@ -225,7 +225,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, return err; } -static int ext2_mkdir(struct user_namespace * mnt_userns, +static int ext2_mkdir(struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, umode_t mode) { struct inode * inode; @@ -315,7 +315,7 @@ static int ext2_rmdir (struct inode * dir, struct dentry *dentry) return err; } -static int ext2_rename (struct user_namespace * mnt_userns, +static int ext2_rename (struct mnt_idmap * idmap, struct inode * old_dir, struct dentry * old_dentry, struct inode * new_dir, struct dentry * new_dentry, unsigned int flags) @@ -370,8 +370,11 @@ static int ext2_rename (struct user_namespace * mnt_userns, err = PTR_ERR(new_de); goto 
out_dir; } - ext2_set_link(new_dir, new_de, new_page, page_addr, old_inode, 1); + err = ext2_set_link(new_dir, new_de, new_page, page_addr, + old_inode, true); ext2_put_page(new_page, page_addr); + if (err) + goto out_dir; new_inode->i_ctime = current_time(new_inode); if (dir_de) drop_nlink(new_inode); @@ -394,24 +397,24 @@ static int ext2_rename (struct user_namespace * mnt_userns, ext2_delete_entry(old_de, old_page, old_page_addr); if (dir_de) { - if (old_dir != new_dir) - ext2_set_link(old_inode, dir_de, dir_page, - dir_page_addr, new_dir, 0); + if (old_dir != new_dir) { + err = ext2_set_link(old_inode, dir_de, dir_page, + dir_page_addr, new_dir, false); + } ext2_put_page(dir_page, dir_page_addr); inode_dec_link_count(old_dir); } +out_old: ext2_put_page(old_page, old_page_addr); - return 0; +out: + return err; out_dir: if (dir_de) ext2_put_page(dir_page, dir_page_addr); -out_old: - ext2_put_page(old_page, old_page_addr); -out: - return err; + goto out_old; } const struct inode_operations ext2_dir_inode_operations = { diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index ebade1f52451..db47b8ab153e 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -19,7 +19,7 @@ ext2_xattr_security_get(const struct xattr_handler *handler, static int ext2_xattr_security_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 18a87d5dd1ab..995f931228ce 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -26,7 +26,7 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler, static int ext2_xattr_trusted_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int 
flags) diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 58092449f8ff..dd1507231081 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -30,7 +30,7 @@ ext2_xattr_user_get(const struct xattr_handler *handler, static int ext2_xattr_user_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index a9f89539aeee..27fcbddfb148 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -225,7 +225,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { handle_t *handle; @@ -249,7 +249,7 @@ retry: return PTR_ERR(handle); if ((type == ACL_TYPE_ACCESS) && acl) { - error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl); + error = posix_acl_update_mode(idmap, inode, &mode, &acl); if (error) goto out_stop; if (mode != inode->i_mode) diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 09c4a8a3b716..0c5a79c3b5d4 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -56,7 +56,7 @@ static inline int ext4_acl_count(size_t size) /* acl.c */ struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu); -int ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 140e1eb300d1..43e26e6f6e42 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2845,7 +2845,7 @@ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, /* ialloc.c */ extern int ext4_mark_inode_used(struct super_block *sb, int ino); -extern struct inode 
*__ext4_new_inode(struct user_namespace *, handle_t *, +extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *, struct inode *, umode_t, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, @@ -2853,11 +2853,11 @@ extern struct inode *__ext4_new_inode(struct user_namespace *, handle_t *, int nblocks); #define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ - __ext4_new_inode(&init_user_ns, (handle), (dir), (mode), (qstr), \ + __ext4_new_inode(&nop_mnt_idmap, (handle), (dir), (mode), (qstr), \ (goal), (owner), i_flags, 0, 0, 0) -#define ext4_new_inode_start_handle(mnt_userns, dir, mode, qstr, goal, owner, \ +#define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \ type, nblocks) \ - __ext4_new_inode((mnt_userns), NULL, (dir), (mode), (qstr), (goal), (owner), \ + __ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \ 0, (type), __LINE__, (nblocks)) @@ -2976,14 +2976,14 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, __ext4_iget((sb), (ino), (flags), __func__, __LINE__) extern int ext4_write_inode(struct inode *, struct writeback_control *); -extern int ext4_setattr(struct user_namespace *, struct dentry *, +extern int ext4_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern u32 ext4_dio_alignment(struct inode *inode); -extern int ext4_getattr(struct user_namespace *, const struct path *, +extern int ext4_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); -extern int ext4_file_getattr(struct user_namespace *, const struct path *, +extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); @@ -3024,7 +3024,7 @@ extern int ext4_ind_remove_space(handle_t *handle, struct 
inode *inode, /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); -int ext4_fileattr_set(struct user_namespace *mnt_userns, +int ext4_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 63f9bb6e8851..157663031f8c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -921,7 +921,7 @@ static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode, * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *__ext4_new_inode(struct user_namespace *mnt_userns, +struct inode *__ext4_new_inode(struct mnt_idmap *idmap, handle_t *handle, struct inode *dir, umode_t mode, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, @@ -972,10 +972,10 @@ struct inode *__ext4_new_inode(struct user_namespace *mnt_userns, i_gid_write(inode, owner[1]); } else if (test_opt(sb, GRPID)) { inode->i_mode = mode; - inode_fsuid_set(inode, mnt_userns); + inode_fsuid_set(inode, idmap); inode->i_gid = dir->i_gid; } else - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); if (ext4_has_feature_project(sb) && ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9d9f414f99fe..b936ee3af51e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1136,7 +1136,8 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, for (i = 0; i < nr_wait; i++) { int err2; - err2 = fscrypt_decrypt_pagecache_blocks(page, blocksize, + err2 = fscrypt_decrypt_pagecache_blocks(page_folio(page), + blocksize, bh_offset(wait[i])); if (err2) { clear_buffer_uptodate(wait[i]); @@ -3858,7 +3859,8 @@ static int 
__ext4_block_zero_page_range(handle_t *handle, if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); - err = fscrypt_decrypt_pagecache_blocks(page, blocksize, + err = fscrypt_decrypt_pagecache_blocks(page_folio(page), + blocksize, bh_offset(bh)); if (err) { clear_buffer_uptodate(bh); @@ -5434,7 +5436,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) * * Called with inode->i_rwsem down. */ -int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -5454,7 +5456,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ATTR_GID | ATTR_TIMES_SET)))) return -EPERM; - error = setattr_prepare(mnt_userns, dentry, attr); + error = setattr_prepare(idmap, dentry, attr); if (error) return error; @@ -5466,14 +5468,14 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; - if (is_quota_modification(mnt_userns, inode, attr)) { + if (is_quota_modification(idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } - if (i_uid_needs_update(mnt_userns, attr, inode) || - i_gid_needs_update(mnt_userns, attr, inode)) { + if (i_uid_needs_update(idmap, attr, inode) || + i_gid_needs_update(idmap, attr, inode)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, @@ -5490,7 +5492,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * counts xattr inode references. 
*/ down_read(&EXT4_I(inode)->xattr_sem); - error = dquot_transfer(mnt_userns, inode, attr); + error = dquot_transfer(idmap, inode, attr); up_read(&EXT4_I(inode)->xattr_sem); if (error) { @@ -5499,8 +5501,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Update corresponding info in inode so that everything is in * one transaction */ - i_uid_update(mnt_userns, attr, inode); - i_gid_update(mnt_userns, attr, inode); + i_uid_update(idmap, attr, inode); + i_gid_update(idmap, attr, inode); error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { @@ -5630,7 +5632,7 @@ out_mmap_sem: if (!error) { if (inc_ivers) inode_inc_iversion(inode); - setattr_copy(mnt_userns, inode, attr); + setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); } @@ -5642,7 +5644,7 @@ out_mmap_sem: ext4_orphan_del(NULL, inode); if (!error && (ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); + rc = posix_acl_chmod(idmap, dentry, inode->i_mode); err_out: if (error) @@ -5668,7 +5670,7 @@ u32 ext4_dio_alignment(struct inode *inode) return 1; /* use the iomap defaults */ } -int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -5725,18 +5727,18 @@ int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); return 0; } -int ext4_file_getattr(struct user_namespace *mnt_userns, +int ext4_file_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); u64 delalloc_blocks; - ext4_getattr(mnt_userns, path, stat, request_mask, query_flags); + 
ext4_getattr(idmap, path, stat, request_mask, query_flags); /* * If there is inline data in the inode, the inode will normally not diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 8067ccda34e4..b0dc7212694e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -358,12 +358,12 @@ void ext4_reset_inode_seed(struct inode *inode) * important fields of the inodes. * * @sb: the super block of the filesystem - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: the inode to swap with EXT4_BOOT_LOADER_INO * */ static long swap_inode_boot_loader(struct super_block *sb, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *inode) { handle_t *handle; @@ -393,7 +393,7 @@ static long swap_inode_boot_loader(struct super_block *sb, } if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || - !inode_owner_or_capable(mnt_userns, inode) || + !inode_owner_or_capable(idmap, inode) || !capable(CAP_SYS_ADMIN)) { err = -EPERM; goto journal_err_out; @@ -979,7 +979,7 @@ int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int ext4_fileattr_set(struct user_namespace *mnt_userns, +int ext4_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -1217,7 +1217,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); + struct mnt_idmap *idmap = file_mnt_idmap(filp); ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); @@ -1234,7 +1234,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) __u32 generation; int err; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (ext4_has_metadata_csum(inode->i_sb)) { @@ -1376,7 +1376,7 @@ 
mext_out: case EXT4_IOC_MIGRATE: { int err; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; err = mnt_want_write_file(filp); @@ -1398,7 +1398,7 @@ mext_out: case EXT4_IOC_ALLOC_DA_BLKS: { int err; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; err = mnt_want_write_file(filp); @@ -1417,7 +1417,7 @@ mext_out: err = mnt_want_write_file(filp); if (err) return err; - err = swap_inode_boot_loader(sb, mnt_userns, inode); + err = swap_inode_boot_loader(sb, idmap, inode); mnt_drop_write_file(filp); return err; } @@ -1542,7 +1542,7 @@ resizefs_out: case EXT4_IOC_CLEAR_ES_CACHE: { - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; ext4_clear_inode_es(inode); return 0; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index dd28453d6ea3..d10a508d95cd 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2792,7 +2792,7 @@ static int ext4_add_nondir(handle_t *handle, * If the create succeeds, we fill in the inode information * with d_instantiate(). 
*/ -static int ext4_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ext4_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { handle_t *handle; @@ -2806,7 +2806,7 @@ static int ext4_create(struct user_namespace *mnt_userns, struct inode *dir, credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, &dentry->d_name, + inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); @@ -2827,7 +2827,7 @@ retry: return err; } -static int ext4_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ext4_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { handle_t *handle; @@ -2841,7 +2841,7 @@ static int ext4_mknod(struct user_namespace *mnt_userns, struct inode *dir, credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, &dentry->d_name, + inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); @@ -2861,7 +2861,7 @@ retry: return err; } -static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int ext4_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { handle_t *handle; @@ -2873,7 +2873,7 @@ static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return err; retry: - inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, + inode = ext4_new_inode_start_handle(idmap, dir, mode, NULL, 0, NULL, EXT4_HT_DIR, EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + @@ -2972,7 +2972,7 @@ out: return err; } -static int ext4_mkdir(struct user_namespace *mnt_userns, struct inode 
*dir, +static int ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; @@ -2989,7 +2989,7 @@ static int ext4_mkdir(struct user_namespace *mnt_userns, struct inode *dir, credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFDIR | mode, + inode = ext4_new_inode_start_handle(idmap, dir, S_IFDIR | mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); @@ -3339,7 +3339,7 @@ out: return err; } -static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { handle_t *handle; @@ -3370,7 +3370,7 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; retry: - inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFLNK|S_IRWXUGO, + inode = ext4_new_inode_start_handle(idmap, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); @@ -3720,7 +3720,7 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) } } -static struct inode *ext4_whiteout_for_rename(struct user_namespace *mnt_userns, +static struct inode *ext4_whiteout_for_rename(struct mnt_idmap *idmap, struct ext4_renament *ent, int credits, handle_t **h) { @@ -3735,7 +3735,7 @@ static struct inode *ext4_whiteout_for_rename(struct user_namespace *mnt_userns, credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) + EXT4_XATTR_TRANS_BLOCKS + 4); retry: - wh = ext4_new_inode_start_handle(mnt_userns, ent->dir, + wh = ext4_new_inode_start_handle(idmap, ent->dir, S_IFCHR | WHITEOUT_MODE, &ent->dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); @@ -3763,7 +3763,7 @@ retry: * while new_{dentry,inode) refers to the destination 
dentry/inode * This comes from rename(const char *oldpath, const char *newpath) */ -static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -3851,7 +3851,7 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, goto release_bh; } } else { - whiteout = ext4_whiteout_for_rename(mnt_userns, &old, credits, &handle); + whiteout = ext4_whiteout_for_rename(idmap, &old, credits, &handle); if (IS_ERR(whiteout)) { retval = PTR_ERR(whiteout); goto release_bh; @@ -4158,7 +4158,7 @@ end_rename: return retval; } -static int ext4_rename2(struct user_namespace *mnt_userns, +static int ext4_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -4181,7 +4181,7 @@ static int ext4_rename2(struct user_namespace *mnt_userns, new_dir, new_dentry); } - return ext4_rename(mnt_userns, old_dir, old_dentry, new_dir, new_dentry, flags); + return ext4_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); } /* diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index d5266932ce6c..c61dc8a7c014 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -211,8 +211,7 @@ static void ext4_set_bio_post_read_ctx(struct bio *bio, static inline loff_t ext4_readpage_limit(struct inode *inode) { - if (IS_ENABLED(CONFIG_FS_VERITY) && - (IS_VERITY(inode) || ext4_verity_in_progress(inode))) + if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) return inode->i_sb->s_maxbytes; return i_size_read(inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 260c1b3e3ef2..2ae46d11aa30 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2635,7 +2635,6 @@ static int ext4_check_test_dummy_encryption(const struct fs_context *fc, { const struct ext4_fs_context *ctx = 
fc->fs_private; const struct ext4_sb_info *sbi = EXT4_SB(sb); - int err; if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy)) return 0; @@ -2668,17 +2667,7 @@ static int ext4_check_test_dummy_encryption(const struct fs_context *fc, "Conflicting test_dummy_encryption options"); return -EINVAL; } - /* - * fscrypt_add_test_dummy_key() technically changes the super_block, so - * technically it should be delayed until ext4_apply_options() like the - * other changes. But since we never get here for remounts (see above), - * and this is the last chance to report errors, we do it here. - */ - err = fscrypt_add_test_dummy_key(sb, &ctx->dummy_enc_policy); - if (err) - ext4_msg(NULL, KERN_WARNING, - "Error adding test dummy encryption key [%d]", err); - return err; + return 0; } static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx, @@ -5336,11 +5325,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) } } - if (ext4_has_feature_verity(sb) && sb->s_blocksize != PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity"); - goto failed_mount_wq; - } - /* * Get the # of file system overhead blocks from the * superblock if present. 
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 3d3ed3c38f56..75bf1f88843c 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -55,12 +55,12 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, return paddr; } -static int ext4_encrypted_symlink_getattr(struct user_namespace *mnt_userns, +static int ext4_encrypted_symlink_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - ext4_getattr(mnt_userns, path, stat, request_mask, query_flags); + ext4_getattr(idmap, path, stat, request_mask, query_flags); return fscrypt_symlink_getattr(path, stat); } diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 30e3b65798b5..e4da1704438e 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -381,11 +381,11 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, } static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, - u64 index, int log_blocksize) + u64 pos, unsigned int size) { - loff_t pos = ext4_verity_metadata_pos(inode) + (index << log_blocksize); + pos += ext4_verity_metadata_pos(inode); - return pagecache_write(inode, buf, 1 << log_blocksize, pos); + return pagecache_write(inode, buf, size, pos); } const struct fsverity_operations ext4_verityops = { diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c index c78df5790377..8a5842e4cd95 100644 --- a/fs/ext4/xattr_hurd.c +++ b/fs/ext4/xattr_hurd.c @@ -32,7 +32,7 @@ ext4_xattr_hurd_get(const struct xattr_handler *handler, static int ext4_xattr_hurd_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 8213f66f7b2d..776cf11d24ca 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -23,7 +23,7 @@ ext4_xattr_security_get(const struct 
xattr_handler *handler, static int ext4_xattr_security_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 7c21ffb26d25..9811eb0ab276 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -30,7 +30,7 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler, static int ext4_xattr_trusted_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 2fe7ff0a479c..4b70bf4e7626 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -31,7 +31,7 @@ ext4_xattr_user_get(const struct xattr_handler *handler, static int ext4_xattr_user_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index c1c74aa658ae..ec2aeccb69a3 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -204,7 +204,7 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu) return __f2fs_get_acl(inode, type, NULL); } -static int f2fs_acl_update_mode(struct user_namespace *mnt_userns, +static int f2fs_acl_update_mode(struct mnt_idmap *idmap, struct inode *inode, umode_t *mode_p, struct posix_acl **acl) { @@ -219,14 +219,14 @@ static int f2fs_acl_update_mode(struct user_namespace *mnt_userns, return error; if (error == 0) *acl = NULL; - if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)) && + 
!capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; return 0; } -static int __f2fs_set_acl(struct user_namespace *mnt_userns, +static int __f2fs_set_acl(struct mnt_idmap *idmap, struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { @@ -240,7 +240,7 @@ static int __f2fs_set_acl(struct user_namespace *mnt_userns, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = f2fs_acl_update_mode(mnt_userns, inode, + error = f2fs_acl_update_mode(idmap, inode, &mode, &acl); if (error) return error; @@ -276,7 +276,7 @@ static int __f2fs_set_acl(struct user_namespace *mnt_userns, return error; } -int f2fs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int f2fs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct inode *inode = d_inode(dentry); @@ -284,7 +284,7 @@ int f2fs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; - return __f2fs_set_acl(mnt_userns, inode, type, acl, NULL); + return __f2fs_set_acl(idmap, inode, type, acl, NULL); } /* diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index ea2bbb3f264b..94ebfbfbdc6f 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -34,7 +34,7 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool); -extern int f2fs_set_acl(struct user_namespace *, struct dentry *, +extern int f2fs_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 97e816590cd9..8630df80fedb 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2053,8 +2053,7 @@ out: static inline loff_t f2fs_readpage_limit(struct inode *inode) { - if (IS_ENABLED(CONFIG_FS_VERITY) && - (IS_VERITY(inode) || 
f2fs_verity_in_progress(inode))) + if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) return inode->i_sb->s_maxbytes; return i_size_read(inode); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e8953c3dc81a..9a3ffa39ad30 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3469,15 +3469,15 @@ void f2fs_truncate_data_blocks(struct dnode_of_data *dn); int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); -int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int f2fs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); -int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); int f2fs_precache_extents(struct inode *inode); int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int f2fs_fileattr_set(struct user_namespace *mnt_userns, +int f2fs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3505,7 +3505,7 @@ void f2fs_handle_failed_inode(struct inode *inode); int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); -int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct inode **new_inode); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ecbc8c135b49..b90617639743 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c 
@@ -837,7 +837,7 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw) return false; } -int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int f2fs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -892,7 +892,7 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); /* we need to show initial sectors used for inline_data/dentries */ if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || @@ -903,13 +903,13 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, } #ifdef CONFIG_F2FS_FS_POSIX_ACL -static void __setattr_copy(struct user_namespace *mnt_userns, +static void __setattr_copy(struct mnt_idmap *idmap, struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; - i_uid_update(mnt_userns, attr, inode); - i_gid_update(mnt_userns, attr, inode); + i_uid_update(idmap, attr, inode); + i_gid_update(idmap, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -918,10 +918,10 @@ static void __setattr_copy(struct user_namespace *mnt_userns, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); if (!vfsgid_in_group_p(vfsgid) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) mode &= ~S_ISGID; set_acl_inode(inode, mode); } @@ -930,7 +930,7 @@ static void __setattr_copy(struct user_namespace *mnt_userns, #define __setattr_copy setattr_copy #endif -int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int f2fs_setattr(struct 
mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -951,7 +951,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, !f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - err = setattr_prepare(mnt_userns, dentry, attr); + err = setattr_prepare(idmap, dentry, attr); if (err) return err; @@ -963,15 +963,15 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (err) return err; - if (is_quota_modification(mnt_userns, inode, attr)) { + if (is_quota_modification(idmap, inode, attr)) { err = f2fs_dquot_initialize(inode); if (err) return err; } - if (i_uid_needs_update(mnt_userns, attr, inode) || - i_gid_needs_update(mnt_userns, attr, inode)) { + if (i_uid_needs_update(idmap, attr, inode) || + i_gid_needs_update(idmap, attr, inode)) { f2fs_lock_op(F2FS_I_SB(inode)); - err = dquot_transfer(mnt_userns, inode, attr); + err = dquot_transfer(idmap, inode, attr); if (err) { set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); @@ -982,8 +982,8 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * update uid/gid under lock_op(), so that dquot and inode can * be updated atomically. 
*/ - i_uid_update(mnt_userns, attr, inode); - i_gid_update(mnt_userns, attr, inode); + i_uid_update(idmap, attr, inode); + i_gid_update(idmap, attr, inode); f2fs_mark_inode_dirty_sync(inode, true); f2fs_unlock_op(F2FS_I_SB(inode)); } @@ -1023,10 +1023,10 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, spin_unlock(&F2FS_I(inode)->i_size_lock); } - __setattr_copy(mnt_userns, inode, attr); + __setattr_copy(idmap, inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = posix_acl_chmod(mnt_userns, dentry, f2fs_get_inode_mode(inode)); + err = posix_acl_chmod(idmap, dentry, f2fs_get_inode_mode(inode)); if (is_inode_flag_set(inode, FI_ACL_MODE)) { if (!err) @@ -2038,14 +2038,14 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) { struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); + struct mnt_idmap *idmap = file_mnt_idmap(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inode *pinode; loff_t isize; int ret; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; if (!S_ISREG(inode->i_mode)) @@ -2095,7 +2095,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) goto out; } - ret = f2fs_get_tmpfile(mnt_userns, pinode, &fi->cow_inode); + ret = f2fs_get_tmpfile(idmap, pinode, &fi->cow_inode); iput(pinode); if (ret) { f2fs_up_write(&fi->i_gc_rwsem[WRITE]); @@ -2135,10 +2135,10 @@ out: static int f2fs_ioc_commit_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); + struct mnt_idmap *idmap = file_mnt_idmap(filp); int ret; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -2167,10 +2167,10 @@ static 
int f2fs_ioc_commit_atomic_write(struct file *filp) static int f2fs_ioc_abort_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); + struct mnt_idmap *idmap = file_mnt_idmap(filp); int ret; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -3090,7 +3090,7 @@ int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int f2fs_fileattr_set(struct user_namespace *mnt_userns, +int f2fs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 6032589099ce..d8e01bbbf27f 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -202,7 +202,7 @@ static void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, file_set_hot(inode); } -static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, +static struct inode *f2fs_new_inode(struct mnt_idmap *idmap, struct inode *dir, umode_t mode, const char *name) { @@ -225,7 +225,7 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, nid_free = true; - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); inode->i_ino = ino; inode->i_blocks = 0; @@ -246,7 +246,7 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL)) F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; else - F2FS_I(inode)->i_projid = make_kprojid(mnt_userns, + F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, F2FS_DEF_PROJID); err = fscrypt_prepare_new_inode(dir, inode, &encrypt); @@ -333,7 +333,7 @@ fail_drop: return ERR_PTR(err); } -static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int f2fs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t 
mode, bool excl) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -350,7 +350,7 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode, dentry->d_name.name); + inode = f2fs_new_inode(idmap, dir, mode, dentry->d_name.name); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -659,7 +659,7 @@ static const char *f2fs_get_link(struct dentry *dentry, return link; } -static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int f2fs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -682,7 +682,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO, NULL); + inode = f2fs_new_inode(idmap, dir, S_IFLNK | S_IRWXUGO, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -739,7 +739,7 @@ out_free_encrypted_link: return err; } -static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -753,7 +753,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode, NULL); + inode = f2fs_new_inode(idmap, dir, S_IFDIR | mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -794,7 +794,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) return -ENOTEMPTY; } -static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int f2fs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -810,7 +810,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (err) 
return err; - inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); + inode = f2fs_new_inode(idmap, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -837,7 +837,7 @@ out: return err; } -static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode, bool is_whiteout, struct inode **new_inode) { @@ -849,7 +849,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); + inode = f2fs_new_inode(idmap, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -907,7 +907,7 @@ out: return err; } -static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -918,28 +918,28 @@ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - err = __f2fs_tmpfile(mnt_userns, dir, file, mode, false, NULL); + err = __f2fs_tmpfile(idmap, dir, file, mode, false, NULL); return finish_open_simple(file, err); } -static int f2fs_create_whiteout(struct user_namespace *mnt_userns, +static int f2fs_create_whiteout(struct mnt_idmap *idmap, struct inode *dir, struct inode **whiteout) { if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) return -EIO; - return __f2fs_tmpfile(mnt_userns, dir, NULL, + return __f2fs_tmpfile(idmap, dir, NULL, S_IFCHR | WHITEOUT_MODE, true, whiteout); } -int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct inode **new_inode) { - return __f2fs_tmpfile(mnt_userns, dir, NULL, S_IFREG, false, new_inode); + return __f2fs_tmpfile(idmap, dir, NULL, S_IFREG, false, new_inode); } -static int f2fs_rename(struct 
user_namespace *mnt_userns, struct inode *old_dir, +static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -979,7 +979,7 @@ static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, } if (flags & RENAME_WHITEOUT) { - err = f2fs_create_whiteout(mnt_userns, old_dir, &whiteout); + err = f2fs_create_whiteout(idmap, old_dir, &whiteout); if (err) return err; } @@ -1295,7 +1295,7 @@ out: return err; } -static int f2fs_rename2(struct user_namespace *mnt_userns, +static int f2fs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -1318,7 +1318,7 @@ static int f2fs_rename2(struct user_namespace *mnt_userns, * VFS has already handled the new dentry existence case, * here, we just deal with "RENAME_NOREPLACE" as regular rename. */ - return f2fs_rename(mnt_userns, old_dir, old_dentry, + return f2fs_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); } @@ -1342,12 +1342,12 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, return target; } -static int f2fs_encrypted_symlink_getattr(struct user_namespace *mnt_userns, +static int f2fs_encrypted_symlink_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - f2fs_getattr(mnt_userns, path, stat, request_mask, query_flags); + f2fs_getattr(idmap, path, stat, request_mask, query_flags); return fscrypt_symlink_getattr(path, stat); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 77fd453949b1..dfd41908b12d 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -258,15 +258,15 @@ static int recover_quota_data(struct inode *inode, struct page *page) attr.ia_vfsuid = VFSUIDT_INIT(make_kuid(inode->i_sb->s_user_ns, i_uid)); attr.ia_vfsgid = VFSGIDT_INIT(make_kgid(inode->i_sb->s_user_ns, 
i_gid)); - if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&init_user_ns, inode))) + if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&nop_mnt_idmap, inode))) attr.ia_valid |= ATTR_UID; - if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&init_user_ns, inode))) + if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&nop_mnt_idmap, inode))) attr.ia_valid |= ATTR_GID; if (!attr.ia_valid) return 0; - err = dquot_transfer(&init_user_ns, inode, &attr); + err = dquot_transfer(&nop_mnt_idmap, inode, &attr); if (err) set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1f812b9ce985..64d3556d61a5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -540,12 +540,6 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, opt, err); return -EINVAL; } - err = fscrypt_add_test_dummy_key(sb, policy); - if (err) { - f2fs_warn(sbi, "Error adding test dummy encryption key [%d]", - err); - return err; - } f2fs_warn(sbi, "Test dummy encryption mode enabled"); return 0; } diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index c352fff88a5e..f320ed8172ec 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -276,11 +276,11 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, } static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf, - u64 index, int log_blocksize) + u64 pos, unsigned int size) { - loff_t pos = f2fs_verity_metadata_pos(inode) + (index << log_blocksize); + pos += f2fs_verity_metadata_pos(inode); - return pagecache_write(inode, buf, 1 << log_blocksize, pos); + return pagecache_write(inode, buf, size, pos); } const struct fsverity_operations f2fs_verityops = { diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index dc2e8637189e..d92edbbdc30e 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -65,7 +65,7 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, } static int f2fs_xattr_generic_set(const struct xattr_handler *handler, - struct 
user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -109,7 +109,7 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler, } static int f2fs_xattr_advise_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -117,7 +117,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, unsigned char old_advise = F2FS_I(inode)->i_advise; unsigned char new_advise; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EPERM; if (value == NULL) return -EINVAL; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index a415c02ede39..e3b690b48e3e 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -398,10 +398,10 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); extern const struct file_operations fat_file_operations; extern const struct inode_operations fat_file_inode_operations; -extern int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int fat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void fat_truncate_blocks(struct inode *inode, loff_t offset); -extern int fat_getattr(struct user_namespace *mnt_userns, +extern int fat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); extern int fat_file_fsync(struct file *file, loff_t start, loff_t end, diff --git a/fs/fat/file.c b/fs/fat/file.c index 8a6b493b5b5f..795a4fad5c40 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -90,13 +90,13 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) * out the RO attribute for checking by the security * module, just because it maps to a file mode. 
*/ - err = security_inode_setattr(file_mnt_user_ns(file), + err = security_inode_setattr(file_mnt_idmap(file), file->f_path.dentry, &ia); if (err) goto out_unlock_inode; /* This MUST be done before doing anything irreversible... */ - err = fat_setattr(file_mnt_user_ns(file), file->f_path.dentry, &ia); + err = fat_setattr(file_mnt_idmap(file), file->f_path.dentry, &ia); if (err) goto out_unlock_inode; @@ -395,13 +395,13 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset) fat_flush_inodes(inode->i_sb, inode, NULL); } -int fat_getattr(struct user_namespace *mnt_userns, const struct path *path, +int fat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); stat->blksize = sbi->cluster_size; if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) { @@ -456,14 +456,14 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi, return 0; } -static int fat_allow_set_time(struct user_namespace *mnt_userns, +static int fat_allow_set_time(struct mnt_idmap *idmap, struct msdos_sb_info *sbi, struct inode *inode) { umode_t allow_utime = sbi->options.allow_utime; - if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), + if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) { - if (vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode))) + if (vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode))) allow_utime >>= 3; if (allow_utime & MAY_WRITE) return 1; @@ -477,7 +477,7 @@ static int fat_allow_set_time(struct user_namespace *mnt_userns, /* valid file mode bits */ #define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) -int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int fat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct msdos_sb_info *sbi = 
MSDOS_SB(dentry->d_sb); @@ -488,11 +488,11 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, /* Check for setting the inode time. */ ia_valid = attr->ia_valid; if (ia_valid & TIMES_SET_FLAGS) { - if (fat_allow_set_time(mnt_userns, sbi, inode)) + if (fat_allow_set_time(idmap, sbi, inode)) attr->ia_valid &= ~TIMES_SET_FLAGS; } - error = setattr_prepare(mnt_userns, dentry, attr); + error = setattr_prepare(idmap, dentry, attr); attr->ia_valid = ia_valid; if (error) { if (sbi->options.quiet) @@ -518,10 +518,10 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } if (((attr->ia_valid & ATTR_UID) && - (!uid_eq(from_vfsuid(mnt_userns, i_user_ns(inode), attr->ia_vfsuid), + (!uid_eq(from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid), sbi->options.fs_uid))) || ((attr->ia_valid & ATTR_GID) && - (!gid_eq(from_vfsgid(mnt_userns, i_user_ns(inode), attr->ia_vfsgid), + (!gid_eq(from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid), sbi->options.fs_gid))) || ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~FAT_VALID_MODE))) @@ -564,7 +564,7 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, fat_truncate_time(inode, &attr->ia_mtime, S_MTIME); attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME); - setattr_copy(mnt_userns, inode, attr); + setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); out: return error; diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index efba301d68ae..2116c486843b 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -261,7 +261,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, } /***** Create a file */ -static int msdos_create(struct user_namespace *mnt_userns, struct inode *dir, +static int msdos_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; @@ -339,7 +339,7 @@ out: } /***** Make a directory */ -static int msdos_mkdir(struct 
user_namespace *mnt_userns, struct inode *dir, +static int msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; @@ -594,7 +594,7 @@ error_inode: } /***** Rename, a wrapper for rename_same_dir & rename_diff_dir */ -static int msdos_rename(struct user_namespace *mnt_userns, +static int msdos_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 21620054e1c4..fceda1de4805 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -756,7 +756,7 @@ error: return ERR_PTR(err); } -static int vfat_create(struct user_namespace *mnt_userns, struct inode *dir, +static int vfat_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; @@ -844,7 +844,7 @@ out: return err; } -static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; @@ -1158,7 +1158,7 @@ error_exchange: goto out; } -static int vfat_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, +static int vfat_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/fcntl.c b/fs/fcntl.c index 146c9ab0cd4b..b622be119706 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/sched/task.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/capability.h> @@ -47,7 +48,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) /* O_NOATIME can only be set by the owner or superuser */ if ((arg & O_NOATIME) && 
!(filp->f_flags & O_NOATIME)) - if (!inode_owner_or_capable(file_mnt_user_ns(filp), inode)) + if (!inode_owner_or_capable(file_mnt_idmap(filp), inode)) return -EPERM; /* required for strict SunOS emulation */ diff --git a/fs/file_table.c b/fs/file_table.c index dd88701e54a9..372653b92617 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -13,6 +13,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/eventpoll.h> diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig index c05c71d57291..0e2fc08f7de4 100644 --- a/fs/freevxfs/Kconfig +++ b/fs/freevxfs/Kconfig @@ -8,7 +8,7 @@ config VXFS_FS of SCO UnixWare (and possibly others) and optionally available for Sunsoft Solaris, HP-UX and many other operating systems. However these particular OS implementations of vxfs may differ in on-disk - data endianess and/or superblock offset. The vxfs module has been + data endianness and/or superblock offset. The vxfs module has been tested with SCO UnixWare and HP-UX B.10.20 (pa-risc 1.1 arch.) Currently only readonly access is supported and VxFX versions 2, 3 and 4. Tests were performed with HP-UX VxFS version 3. 
diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c index ab8ceddf9efa..cdf991bdd9de 100644 --- a/fs/fscache/volume.c +++ b/fs/fscache/volume.c @@ -141,13 +141,14 @@ static bool fscache_is_acquire_pending(struct fscache_volume *volume) static void fscache_wait_on_volume_collision(struct fscache_volume *candidate, unsigned int collidee_debug_id) { - wait_var_event_timeout(&candidate->flags, - !fscache_is_acquire_pending(candidate), 20 * HZ); + wait_on_bit_timeout(&candidate->flags, FSCACHE_VOLUME_ACQUIRE_PENDING, + TASK_UNINTERRUPTIBLE, 20 * HZ); if (fscache_is_acquire_pending(candidate)) { pr_notice("Potential volume collision new=%08x old=%08x", candidate->debug_id, collidee_debug_id); fscache_stat(&fscache_n_volumes_collision); - wait_var_event(&candidate->flags, !fscache_is_acquire_pending(candidate)); + wait_on_bit(&candidate->flags, FSCACHE_VOLUME_ACQUIRE_PENDING, + TASK_UNINTERRUPTIBLE); } } @@ -279,8 +280,7 @@ static void fscache_create_volume_work(struct work_struct *work) fscache_end_cache_access(volume->cache, fscache_access_acquire_volume_end); - clear_bit_unlock(FSCACHE_VOLUME_CREATING, &volume->flags); - wake_up_bit(&volume->flags, FSCACHE_VOLUME_CREATING); + clear_and_wake_up_bit(FSCACHE_VOLUME_CREATING, &volume->flags); fscache_put_volume(volume, fscache_volume_put_create_work); } @@ -347,8 +347,8 @@ static void fscache_wake_pending_volume(struct fscache_volume *volume, hlist_bl_for_each_entry(cursor, p, h, hash_link) { if (fscache_volume_same(cursor, volume)) { fscache_see_volume(cursor, fscache_volume_see_hash_wake); - clear_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &cursor->flags); - wake_up_bit(&cursor->flags, FSCACHE_VOLUME_ACQUIRE_PENDING); + clear_and_wake_up_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, + &cursor->flags); return; } } diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index ad670369955f..3d192b80a561 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -12,7 +12,7 @@ #include <linux/posix_acl_xattr.h> static struct posix_acl *__fuse_get_acl(struct 
fuse_conn *fc, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *inode, int type, bool rcu) { int size; @@ -65,7 +65,7 @@ static inline bool fuse_no_acl(const struct fuse_conn *fc, return !fc->posix_acl && (i_user_ns(inode) != &init_user_ns); } -struct posix_acl *fuse_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *fuse_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { struct inode *inode = d_inode(dentry); @@ -74,7 +74,7 @@ struct posix_acl *fuse_get_acl(struct user_namespace *mnt_userns, if (fuse_no_acl(fc, inode)) return ERR_PTR(-EOPNOTSUPP); - return __fuse_get_acl(fc, mnt_userns, inode, type, false); + return __fuse_get_acl(fc, idmap, inode, type, false); } struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu) @@ -91,10 +91,10 @@ struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu) if (!fc->posix_acl) return NULL; - return __fuse_get_acl(fc, &init_user_ns, inode, type, rcu); + return __fuse_get_acl(fc, &nop_mnt_idmap, inode, type, rcu); } -int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct inode *inode = d_inode(dentry); @@ -146,8 +146,8 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, * be stripped. 
*/ if (fc->posix_acl && - !vfsgid_in_group_p(i_gid_into_vfsgid(&init_user_ns, inode)) && - !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) + !vfsgid_in_group_p(i_gid_into_vfsgid(&nop_mnt_idmap, inode)) && + !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; ret = fuse_setxattr(inode, name, value, size, 0, extra_flags); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2725fb54328e..cd1eae61e84c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -645,7 +645,7 @@ out_err: return err; } -static int fuse_mknod(struct user_namespace *, struct inode *, struct dentry *, +static int fuse_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); static int fuse_atomic_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned flags, @@ -686,7 +686,7 @@ out_dput: return err; mknod: - err = fuse_mknod(&init_user_ns, dir, entry, mode, 0); + err = fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0); if (err) goto out_dput; no_open: @@ -773,7 +773,7 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, return err; } -static int fuse_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) { struct fuse_mknod_in inarg; @@ -796,13 +796,13 @@ static int fuse_mknod(struct user_namespace *mnt_userns, struct inode *dir, return create_new_entry(fm, &args, dir, entry, mode); } -static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir, +static int fuse_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode, bool excl) { - return fuse_mknod(&init_user_ns, dir, entry, mode, 0); + return fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0); } -static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t 
mode) { struct fuse_conn *fc = get_fuse_conn(dir); @@ -819,7 +819,7 @@ static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; @@ -841,7 +841,7 @@ static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, return create_new_entry(fm, &args, dir, entry, S_IFDIR); } -static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, const char *link) { struct fuse_mount *fm = get_fuse_mount(dir); @@ -998,7 +998,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, return err; } -static int fuse_rename2(struct user_namespace *mnt_userns, struct inode *olddir, +static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir, struct dentry *oldent, struct inode *newdir, struct dentry *newent, unsigned int flags) { @@ -1156,7 +1156,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); } else if (stat) { - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; } @@ -1326,7 +1326,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask) * access request is sent. Execute permission is still checked * locally based on file mode. 
*/ -static int fuse_permission(struct user_namespace *mnt_userns, +static int fuse_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -1358,7 +1358,7 @@ static int fuse_permission(struct user_namespace *mnt_userns, } if (fc->default_permissions) { - err = generic_permission(&init_user_ns, inode, mask); + err = generic_permission(&nop_mnt_idmap, inode, mask); /* If permission is denied, try to refresh file attributes. This is also needed, because the root @@ -1366,7 +1366,7 @@ static int fuse_permission(struct user_namespace *mnt_userns, if (err == -EACCES && !refreshed) { err = fuse_perm_getattr(inode, mask); if (!err) - err = generic_permission(&init_user_ns, + err = generic_permission(&nop_mnt_idmap, inode, mask); } @@ -1690,7 +1690,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) return err; @@ -1837,7 +1837,7 @@ error: return err; } -static int fuse_setattr(struct user_namespace *mnt_userns, struct dentry *entry, +static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry, struct iattr *attr) { struct inode *inode = d_inode(entry); @@ -1900,7 +1900,7 @@ static int fuse_setattr(struct user_namespace *mnt_userns, struct dentry *entry, return ret; } -static int fuse_getattr(struct user_namespace *mnt_userns, +static int fuse_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 875314ee6f59..82710d103556 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -18,6 +18,7 @@ #include <linux/falloc.h> #include <linux/uio.h> #include <linux/fs.h> +#include <linux/filelock.h> static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -1313,7 +1314,8 
@@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) return err; if (fc->handle_killpriv_v2 && - setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) { + setattr_should_drop_suidgid(&nop_mnt_idmap, + file_inode(file))) { goto writethrough; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 46797a171a84..9b5058cf5bc3 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1267,9 +1267,9 @@ extern const struct xattr_handler *fuse_xattr_handlers[]; struct posix_acl; struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu); -struct posix_acl *fuse_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *fuse_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type); -int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int fuse_set_acl(struct mnt_idmap *, struct dentry *dentry, struct posix_acl *acl, int type); /* readdir.c */ @@ -1309,7 +1309,7 @@ long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int fuse_fileattr_set(struct user_namespace *mnt_userns, +int fuse_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); /* file.c */ diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index fcce94ace2c2..e50a18ee6cc6 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -467,7 +467,7 @@ cleanup: return err; } -int fuse_fileattr_set(struct user_namespace *mnt_userns, +int fuse_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 9fe571ab569e..49c01559580f 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -189,7 +189,7 @@ static int fuse_xattr_get(const struct xattr_handler *handler, } static int fuse_xattr_set(const struct xattr_handler *handler, - 
struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 3dcde4912413..a392aa0f041d 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -109,7 +109,7 @@ out: return error; } -int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct inode *inode = d_inode(dentry); @@ -135,7 +135,7 @@ int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - ret = posix_acl_update_mode(&init_user_ns, inode, &mode, &acl); + ret = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (ret) goto unlock; } diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index b8de8c148f5c..d4deb2b19959 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -13,7 +13,7 @@ extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu); extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index eea5be4fbf0e..300844f50dcd 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -15,6 +15,7 @@ #include <linux/mm.h> #include <linux/mount.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/gfs2_ondisk.h> #include <linux/falloc.h> #include <linux/swap.h> @@ -235,7 +236,7 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask) goto out; if (!IS_IMMUTABLE(inode)) { - error = gfs2_permission(&init_user_ns, inode, MAY_WRITE); + error = gfs2_permission(&nop_mnt_idmap, inode, MAY_WRITE); if (error) goto out; } @@ -273,7 +274,7 @@ 
out: return error; } -int gfs2_fileattr_set(struct user_namespace *mnt_userns, +int gfs2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 614db3055c02..713efa3bb732 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -320,7 +320,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, } if (!is_root) { - error = gfs2_permission(&init_user_ns, dir, MAY_EXEC); + error = gfs2_permission(&nop_mnt_idmap, dir, MAY_EXEC); if (error) goto out; } @@ -350,7 +350,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, { int error; - error = gfs2_permission(&init_user_ns, &dip->i_inode, + error = gfs2_permission(&nop_mnt_idmap, &dip->i_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -843,7 +843,7 @@ fail: /** * gfs2_create - Create a file - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: The directory in which to create the file * @dentry: The dentry of the new file * @mode: The mode of the new file @@ -852,7 +852,7 @@ fail: * Returns: errno */ -static int gfs2_create(struct user_namespace *mnt_userns, struct inode *dir, +static int gfs2_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl); @@ -960,7 +960,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink == 0) goto out_gunlock; - error = gfs2_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&nop_mnt_idmap, dir, MAY_WRITE | MAY_EXEC); if (error) goto out_gunlock; @@ -1078,7 +1078,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, if (IS_APPEND(&dip->i_inode)) return -EPERM; - error = gfs2_permission(&init_user_ns, &dip->i_inode, + error = 
gfs2_permission(&nop_mnt_idmap, &dip->i_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -1207,7 +1207,7 @@ out_inodes: /** * gfs2_symlink - Create a symlink - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: The directory to create the symlink in * @dentry: The dentry to put the symlink in * @symname: The thing which the link points to @@ -1215,7 +1215,7 @@ out_inodes: * Returns: errno */ -static int gfs2_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int gfs2_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { unsigned int size; @@ -1229,7 +1229,7 @@ static int gfs2_symlink(struct user_namespace *mnt_userns, struct inode *dir, /** * gfs2_mkdir - Make a directory - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: The parent directory of the new one * @dentry: The dentry of the new directory * @mode: The mode of the new directory @@ -1237,7 +1237,7 @@ static int gfs2_symlink(struct user_namespace *mnt_userns, struct inode *dir, * Returns: errno */ -static int gfs2_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int gfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { unsigned dsize = gfs2_max_stuffed_size(GFS2_I(dir)); @@ -1246,7 +1246,7 @@ static int gfs2_mkdir(struct user_namespace *mnt_userns, struct inode *dir, /** * gfs2_mknod - Make a special file - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: The directory in which the special file will reside * @dentry: The dentry of the special file * @mode: The mode of the special file @@ -1254,7 +1254,7 @@ static int gfs2_mkdir(struct user_namespace *mnt_userns, struct inode *dir, * */ -static int gfs2_mknod(struct user_namespace 
*mnt_userns, struct inode *dir, +static int gfs2_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0); @@ -1504,7 +1504,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, } } } else { - error = gfs2_permission(&init_user_ns, ndir, + error = gfs2_permission(&nop_mnt_idmap, ndir, MAY_WRITE | MAY_EXEC); if (error) goto out_gunlock; @@ -1541,7 +1541,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, /* Check out the dir to be renamed */ if (dir_rename) { - error = gfs2_permission(&init_user_ns, d_inode(odentry), + error = gfs2_permission(&nop_mnt_idmap, d_inode(odentry), MAY_WRITE); if (error) goto out_gunlock; @@ -1705,13 +1705,13 @@ static int gfs2_exchange(struct inode *odir, struct dentry *odentry, goto out_gunlock; if (S_ISDIR(old_mode)) { - error = gfs2_permission(&init_user_ns, odentry->d_inode, + error = gfs2_permission(&nop_mnt_idmap, odentry->d_inode, MAY_WRITE); if (error) goto out_gunlock; } if (S_ISDIR(new_mode)) { - error = gfs2_permission(&init_user_ns, ndentry->d_inode, + error = gfs2_permission(&nop_mnt_idmap, ndentry->d_inode, MAY_WRITE); if (error) goto out_gunlock; @@ -1766,7 +1766,7 @@ out: return error; } -static int gfs2_rename2(struct user_namespace *mnt_userns, struct inode *odir, +static int gfs2_rename2(struct mnt_idmap *idmap, struct inode *odir, struct dentry *odentry, struct inode *ndir, struct dentry *ndentry, unsigned int flags) { @@ -1841,7 +1841,7 @@ out: /** * gfs2_permission - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: The inode * @mask: The mask to be tested * @@ -1852,7 +1852,7 @@ out: * Returns: errno */ -int gfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, +int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct gfs2_inode *ip; @@ 
-1872,7 +1872,7 @@ int gfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) error = -EPERM; else - error = generic_permission(&init_user_ns, inode, mask); + error = generic_permission(&nop_mnt_idmap, inode, mask); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); @@ -1881,7 +1881,7 @@ int gfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr) { - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -1966,7 +1966,7 @@ out: /** * gfs2_setattr - Change attributes on an inode - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: The dentry which is changing * @attr: The structure describing the change * @@ -1976,7 +1976,7 @@ out: * Returns: errno */ -static int gfs2_setattr(struct user_namespace *mnt_userns, +static int gfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -1992,11 +1992,11 @@ static int gfs2_setattr(struct user_namespace *mnt_userns, if (error) goto out; - error = may_setattr(&init_user_ns, inode, attr->ia_valid); + error = may_setattr(&nop_mnt_idmap, inode, attr->ia_valid); if (error) goto error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) goto error; @@ -2007,7 +2007,7 @@ static int gfs2_setattr(struct user_namespace *mnt_userns, else { error = gfs2_setattr_simple(inode, attr); if (!error && attr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, dentry, + error = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); } @@ -2022,7 +2022,7 @@ out: /** * gfs2_getattr - Read out an inode's attributes - * @mnt_userns: user namespace of the mount the inode was found 
from + * @idmap: idmap of the mount the inode was found from * @path: Object to query * @stat: The inode's stats * @request_mask: Mask of STATX_xxx flags indicating the caller's interests @@ -2037,7 +2037,7 @@ out: * Returns: errno */ -static int gfs2_getattr(struct user_namespace *mnt_userns, +static int gfs2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -2066,7 +2066,7 @@ static int gfs2_getattr(struct user_namespace *mnt_userns, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 0264d514dda7..c8c5814e7295 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -99,7 +99,7 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip); extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, int is_root); -extern int gfs2_permission(struct user_namespace *mnt_userns, +extern int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); @@ -111,7 +111,7 @@ extern const struct file_operations gfs2_file_fops_nolock; extern const struct file_operations gfs2_dir_fops_nolock; extern int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); -extern int gfs2_fileattr_set(struct user_namespace *mnt_userns, +extern int gfs2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); extern void gfs2_set_inode_flags(struct inode *inode); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 518c0677e12a..adf6d17cf033 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1225,7 +1225,7 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, } static int gfs2_xattr_set(const struct 
xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index 2bd54efaf416..6341bb248247 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -121,7 +121,7 @@ static int hfs_xattr_get(const struct xattr_handler *handler, } static int hfs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 527f6e46cbe8..3e1e3dcf0b48 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -189,7 +189,7 @@ static int hfs_dir_release(struct inode *inode, struct file *file) * a directory and return a corresponding inode, given the inode for * the directory and the name (and its length) of the new file. */ -static int hfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int hfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; @@ -219,7 +219,7 @@ static int hfs_create(struct user_namespace *mnt_userns, struct inode *dir, * in a directory, given the inode for the parent directory and the * name (and its length) of the new directory. */ -static int hfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -280,7 +280,7 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry) * new file/directory. * XXX: how do you handle must_be dir? 
*/ -static int hfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int hfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 68d0305880f7..49d02524e667 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -206,7 +206,7 @@ int hfs_write_begin(struct file *file, struct address_space *mapping, extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); extern int hfs_write_inode(struct inode *, struct writeback_control *); -extern int hfs_inode_setattr(struct user_namespace *, struct dentry *, +extern int hfs_inode_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, __be32 log_size, __be32 phys_size, u32 clump_size); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 3a155c1d810e..1f7bd068acf0 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -611,14 +611,14 @@ static int hfs_file_release(struct inode *inode, struct file *file) * correspond to the same HFS file. 
*/ -int hfs_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); int error; - error = setattr_prepare(&init_user_ns, dentry, + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); /* basic permission checks */ if (error) return error; @@ -658,7 +658,7 @@ int hfs_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, current_time(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 84714bbccc12..56fb5f1312e7 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -434,7 +434,7 @@ out: return res; } -static int hfsplus_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int hfsplus_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); @@ -476,7 +476,7 @@ out: return res; } -static int hfsplus_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int hfsplus_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); @@ -517,19 +517,19 @@ out: return res; } -static int hfsplus_create(struct user_namespace *mnt_userns, struct inode *dir, +static int hfsplus_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return hfsplus_mknod(&init_user_ns, dir, dentry, mode, 0); + return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); } -static int hfsplus_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int hfsplus_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - return hfsplus_mknod(&init_user_ns, 
dir, dentry, mode | S_IFDIR, 0); + return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); } -static int hfsplus_rename(struct user_namespace *mnt_userns, +static int hfsplus_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 6aa919e59483..7ededcb720c1 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -481,13 +481,13 @@ void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork); int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd); int hfsplus_cat_write_inode(struct inode *inode); -int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path, +int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int hfsplus_fileattr_set(struct user_namespace *mnt_userns, +int hfsplus_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); /* ioctl.c */ diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 840577a0c1e7..abb91f5fae92 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -246,13 +246,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) return 0; } -static int hfsplus_setattr(struct user_namespace *mnt_userns, +static int hfsplus_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -270,13 +270,13 @@ static int hfsplus_setattr(struct user_namespace *mnt_userns, inode->i_mtime = inode->i_ctime = 
current_time(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } -int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path, +int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -298,7 +298,7 @@ int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP; - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); return 0; } @@ -390,7 +390,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, return NULL; inode->i_ino = sbi->next_cnid++; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); @@ -655,7 +655,7 @@ int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int hfsplus_fileattr_set(struct user_namespace *mnt_userns, +int hfsplus_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 49891b12c415..5b476f57eb17 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -857,7 +857,7 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler, } static int hfsplus_osx_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index c1c7a16cbf21..90f68ec119cd 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -23,7 +23,7 @@ static int 
hfsplus_security_getxattr(const struct xattr_handler *handler, } static int hfsplus_security_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c index e150372ec564..fdbaebc1c49a 100644 --- a/fs/hfsplus/xattr_trusted.c +++ b/fs/hfsplus/xattr_trusted.c @@ -22,7 +22,7 @@ static int hfsplus_trusted_getxattr(const struct xattr_handler *handler, } static int hfsplus_trusted_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c index a6b60b153916..6464b6c3d58d 100644 --- a/fs/hfsplus/xattr_user.c +++ b/fs/hfsplus/xattr_user.c @@ -22,7 +22,7 @@ static int hfsplus_user_getxattr(const struct xattr_handler *handler, } static int hfsplus_user_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 277468783fee..c18bb50c31b6 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -559,7 +559,7 @@ static int read_name(struct inode *ino, char *name) return 0; } -static int hostfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int hostfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; @@ -658,7 +658,7 @@ static int hostfs_unlink(struct inode *ino, struct dentry *dentry) return err; } -static int hostfs_symlink(struct user_namespace *mnt_userns, struct inode *ino, +static int hostfs_symlink(struct 
mnt_idmap *idmap, struct inode *ino, struct dentry *dentry, const char *to) { char *file; @@ -671,7 +671,7 @@ static int hostfs_symlink(struct user_namespace *mnt_userns, struct inode *ino, return err; } -static int hostfs_mkdir(struct user_namespace *mnt_userns, struct inode *ino, +static int hostfs_mkdir(struct mnt_idmap *idmap, struct inode *ino, struct dentry *dentry, umode_t mode) { char *file; @@ -696,7 +696,7 @@ static int hostfs_rmdir(struct inode *ino, struct dentry *dentry) return err; } -static int hostfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int hostfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; @@ -734,7 +734,7 @@ static int hostfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int hostfs_rename2(struct user_namespace *mnt_userns, +static int hostfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -763,7 +763,7 @@ static int hostfs_rename2(struct user_namespace *mnt_userns, return err; } -static int hostfs_permission(struct user_namespace *mnt_userns, +static int hostfs_permission(struct mnt_idmap *idmap, struct inode *ino, int desired) { char *name; @@ -786,11 +786,11 @@ static int hostfs_permission(struct user_namespace *mnt_userns, err = access_file(name, r, w, x); __putname(name); if (!err) - err = generic_permission(&init_user_ns, ino, desired); + err = generic_permission(&nop_mnt_idmap, ino, desired); return err; } -static int hostfs_setattr(struct user_namespace *mnt_userns, +static int hostfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -800,7 +800,7 @@ static int hostfs_setattr(struct user_namespace *mnt_userns, int fd = HOSTFS_I(inode)->fd; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = 
setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) return err; @@ -857,7 +857,7 @@ static int hostfs_setattr(struct user_namespace *mnt_userns, attr->ia_size != i_size_read(inode)) truncate_setsize(inode, attr->ia_size); - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 167ec6884642..f5a2476c47bf 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -280,7 +280,7 @@ void hpfs_init_inode(struct inode *); void hpfs_read_inode(struct inode *); void hpfs_write_inode(struct inode *); void hpfs_write_inode_nolock(struct inode *); -int hpfs_setattr(struct user_namespace *, struct dentry *, struct iattr *); +int hpfs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); void hpfs_write_if_changed(struct inode *); void hpfs_evict_inode(struct inode *); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 82208cc28ebd..e50e92a42432 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -257,7 +257,7 @@ void hpfs_write_inode_nolock(struct inode *i) brelse(bh); } -int hpfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int hpfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -275,7 +275,7 @@ int hpfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) goto out_unlock; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) goto out_unlock; @@ -289,7 +289,7 @@ int hpfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, hpfs_truncate(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); hpfs_write_inode(inode); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 15fc63276caa..69fb40b2c99a 100644 --- a/fs/hpfs/namei.c +++ 
b/fs/hpfs/namei.c @@ -20,7 +20,7 @@ static void hpfs_update_directory_times(struct inode *dir) hpfs_write_inode_nolock(dir); } -static int hpfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { const unsigned char *name = dentry->d_name.name; @@ -129,7 +129,7 @@ bail: return err; } -static int hpfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { const unsigned char *name = dentry->d_name.name; @@ -217,7 +217,7 @@ bail: return err; } -static int hpfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int hpfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { const unsigned char *name = dentry->d_name.name; @@ -292,7 +292,7 @@ bail: return err; } -static int hpfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int hpfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symlink) { const unsigned char *name = dentry->d_name.name; @@ -512,7 +512,7 @@ const struct address_space_operations hpfs_symlink_aops = { .read_folio = hpfs_symlink_read_folio }; -static int hpfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int hpfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 790d2727141a..0ce1cc4c2add 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -898,7 +898,7 @@ out: return error; } -static int hugetlbfs_setattr(struct user_namespace *mnt_userns, +static int hugetlbfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -907,7 +907,7 
@@ static int hugetlbfs_setattr(struct user_namespace *mnt_userns, unsigned int ia_valid = attr->ia_valid; struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -924,7 +924,7 @@ static int hugetlbfs_setattr(struct user_namespace *mnt_userns, hugetlb_vmtruncate(inode, newsize); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -980,7 +980,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); inode->i_ino = get_next_ino(); - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; @@ -1019,7 +1019,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, /* * File creation. Allocate an inode, and we're done.. 
*/ -static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; @@ -1033,24 +1033,24 @@ static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, return 0; } -static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - int retval = hugetlbfs_mknod(&init_user_ns, dir, dentry, + int retval = hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); return retval; } -static int hugetlbfs_create(struct user_namespace *mnt_userns, +static int hugetlbfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return hugetlbfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); + return hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); } -static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns, +static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { @@ -1064,7 +1064,7 @@ static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns, return finish_open_simple(file, 0); } -static int hugetlbfs_symlink(struct user_namespace *mnt_userns, +static int hugetlbfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { diff --git a/fs/init.c b/fs/init.c index 5c36adaa9b44..9684406a8416 100644 --- a/fs/init.c +++ b/fs/init.c @@ -157,7 +157,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) mode &= ~current_umask(); error = security_path_mknod(&path, dentry, mode, dev); if (!error) - error = vfs_mknod(mnt_user_ns(path.mnt), path.dentry->d_inode, + error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode, 
new_decode_dev(dev)); done_path_create(&path, dentry); return error; @@ -167,7 +167,7 @@ int __init init_link(const char *oldname, const char *newname) { struct dentry *new_dentry; struct path old_path, new_path; - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; int error; error = kern_path(oldname, 0, &old_path); @@ -182,14 +182,14 @@ int __init init_link(const char *oldname, const char *newname) error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; - mnt_userns = mnt_user_ns(new_path.mnt); - error = may_linkat(mnt_userns, &old_path); + idmap = mnt_idmap(new_path.mnt); + error = may_linkat(idmap, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode, + error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, NULL); out_dput: done_path_create(&new_path, new_dentry); @@ -209,7 +209,7 @@ int __init init_symlink(const char *oldname, const char *newname) return PTR_ERR(dentry); error = security_path_symlink(&path, dentry, oldname); if (!error) - error = vfs_symlink(mnt_user_ns(path.mnt), path.dentry->d_inode, + error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, oldname); done_path_create(&path, dentry); return error; @@ -233,7 +233,7 @@ int __init init_mkdir(const char *pathname, umode_t mode) mode &= ~current_umask(); error = security_path_mkdir(&path, dentry, mode); if (!error) - error = vfs_mkdir(mnt_user_ns(path.mnt), path.dentry->d_inode, + error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode); done_path_create(&path, dentry); return error; diff --git a/fs/inode.c b/fs/inode.c index f453eb58fd03..4558dc2f1355 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -5,6 +5,7 @@ */ #include <linux/export.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/mm.h> #include <linux/backing-dev.h> #include <linux/hash.h> 
@@ -1893,7 +1894,7 @@ bool atime_needs_update(const struct path *path, struct inode *inode) /* Atime updates will likely cause i_uid and i_gid to be written * back improprely if their true value is unknown to the vfs. */ - if (HAS_UNMAPPED_ID(mnt_user_ns(mnt), inode)) + if (HAS_UNMAPPED_ID(mnt_idmap(mnt), inode)) return false; if (IS_NOATIME(inode)) @@ -1953,7 +1954,7 @@ EXPORT_SYMBOL(touch_atime); * response to write or truncate. Return 0 if nothing has to be changed. * Negative value on error (change should be denied). */ -int dentry_needs_remove_privs(struct user_namespace *mnt_userns, +int dentry_needs_remove_privs(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_inode(dentry); @@ -1963,7 +1964,7 @@ int dentry_needs_remove_privs(struct user_namespace *mnt_userns, if (IS_NOSEC(inode)) return 0; - mask = setattr_should_drop_suidgid(mnt_userns, inode); + mask = setattr_should_drop_suidgid(idmap, inode); ret = security_inode_need_killpriv(dentry); if (ret < 0) return ret; @@ -1972,7 +1973,7 @@ int dentry_needs_remove_privs(struct user_namespace *mnt_userns, return mask; } -static int __remove_privs(struct user_namespace *mnt_userns, +static int __remove_privs(struct mnt_idmap *idmap, struct dentry *dentry, int kill) { struct iattr newattrs; @@ -1982,7 +1983,7 @@ static int __remove_privs(struct user_namespace *mnt_userns, * Note we call this on write, so notify_change will not * encounter any conflicting delegations: */ - return notify_change(mnt_userns, dentry, &newattrs, NULL); + return notify_change(idmap, dentry, &newattrs, NULL); } static int __file_remove_privs(struct file *file, unsigned int flags) @@ -1995,7 +1996,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags) if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; - kill = dentry_needs_remove_privs(file_mnt_user_ns(file), dentry); + kill = dentry_needs_remove_privs(file_mnt_idmap(file), dentry); if (kill < 0) return kill; @@ -2003,7 +2004,7 @@ static 
int __file_remove_privs(struct file *file, unsigned int flags) if (flags & IOCB_NOWAIT) return -EAGAIN; - error = __remove_privs(file_mnt_user_ns(file), dentry, kill); + error = __remove_privs(file_mnt_idmap(file), dentry, kill); } if (!error) @@ -2279,21 +2280,21 @@ EXPORT_SYMBOL(init_special_inode); /** * inode_init_owner - Init uid,gid,mode for new inode according to posix standards - * @mnt_userns: User namespace of the mount the inode was created from + * @idmap: idmap of the mount the inode was created from * @inode: New inode * @dir: Directory inode * @mode: mode of the new inode * - * If the inode has been created through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions + * If the inode has been created through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions * and initializing i_uid and i_gid. On non-idmapped mounts or if permission - * checking is to be performed on the raw inode simply passs init_user_ns. + * checking is to be performed on the raw inode simply pass @nop_mnt_idmap. 
*/ -void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode, +void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, const struct inode *dir, umode_t mode) { - inode_fsuid_set(inode, mnt_userns); + inode_fsuid_set(inode, idmap); if (dir && dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; @@ -2301,32 +2302,32 @@ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode, if (S_ISDIR(mode)) mode |= S_ISGID; } else - inode_fsgid_set(inode, mnt_userns); + inode_fsgid_set(inode, idmap); inode->i_mode = mode; } EXPORT_SYMBOL(inode_init_owner); /** * inode_owner_or_capable - check current task permissions to inode - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode being checked * * Return true if current either has CAP_FOWNER in a namespace with the * inode owner uid mapped, or owns the file. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap.
*/ -bool inode_owner_or_capable(struct user_namespace *mnt_userns, +bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode) { vfsuid_t vfsuid; struct user_namespace *ns; - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return true; @@ -2458,7 +2459,7 @@ EXPORT_SYMBOL(current_time); /** * in_group_or_capable - check whether caller is CAP_FSETID privileged - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: inode to check * @vfsgid: the new/current vfsgid of @inode * @@ -2468,19 +2469,19 @@ EXPORT_SYMBOL(current_time); * * Return: true if the caller is sufficiently privileged, false if not. */ -bool in_group_or_capable(struct user_namespace *mnt_userns, +bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid) { if (vfsgid_in_group_p(vfsgid)) return true; - if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) return true; return false; } /** * mode_strip_sgid - handle the sgid bit for non-directories - * @mnt_userns: User namespace of the mount the inode was created from + * @idmap: idmap of the mount the inode was created from * @dir: parent directory inode * @mode: mode of the file to be created in @dir * @@ -2492,15 +2493,14 @@ bool in_group_or_capable(struct user_namespace *mnt_userns, * * Return: the new mode to use for the file */ -umode_t mode_strip_sgid(struct user_namespace *mnt_userns, +umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode) { if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP)) return mode; if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) return mode; - if (in_group_or_capable(mnt_userns, dir, - i_gid_into_vfsgid(mnt_userns, dir))) + if (in_group_or_capable(idmap, dir, i_gid_into_vfsgid(idmap, dir))) return 
mode; return mode & ~S_ISGID; } diff --git a/fs/internal.h b/fs/internal.h index a803cc3cf716..766e8a554b2c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -14,9 +14,9 @@ struct path; struct mount; struct shrink_control; struct fs_context; -struct user_namespace; struct pipe_inode_info; struct iov_iter; +struct mnt_idmap; /* * block/bdev.c @@ -63,7 +63,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); int do_rmdir(int dfd, struct filename *name); int do_unlinkat(int dfd, struct filename *name); -int may_linkat(struct user_namespace *mnt_userns, const struct path *link); +int may_linkat(struct mnt_idmap *idmap, const struct path *link); int do_renameat2(int olddfd, struct filename *oldname, int newdfd, struct filename *newname, unsigned int flags); int do_mkdirat(int dfd, struct filename *name, umode_t mode); @@ -150,8 +150,8 @@ extern int vfs_open(const struct path *, struct file *); * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); -int dentry_needs_remove_privs(struct user_namespace *, struct dentry *dentry); -bool in_group_or_capable(struct user_namespace *mnt_userns, +int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry); +bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); /* @@ -234,7 +234,7 @@ ssize_t do_getxattr(struct mnt_idmap *idmap, int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_ctx *ctx); -int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode); +int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode); #ifdef CONFIG_FS_POSIX_ACL int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, @@ -261,5 +261,8 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po /* * fs/attr.c */ -int setattr_should_drop_sgid(struct user_namespace 
*mnt_userns, +int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); +struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); +struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); +void mnt_idmap_put(struct mnt_idmap *idmap); diff --git a/fs/ioctl.c b/fs/ioctl.c index 80ac36aea913..5b2481cd4750 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -651,7 +651,7 @@ static int fileattr_set_prepare(struct inode *inode, /** * vfs_fileattr_set - change miscellaneous file attributes - * @mnt_userns: user namespace of the mount + * @idmap: idmap of the mount * @dentry: the object to change * @fa: fileattr pointer * @@ -665,7 +665,7 @@ static int fileattr_set_prepare(struct inode *inode, * * Return: 0 on success, or a negative error on failure. */ -int vfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, +int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -675,7 +675,7 @@ int vfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, if (!inode->i_op->fileattr_set) return -ENOIOCTLCMD; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; inode_lock(inode); @@ -693,7 +693,7 @@ int vfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, } err = fileattr_set_prepare(inode, &old_ma, fa); if (!err) - err = inode->i_op->fileattr_set(mnt_userns, dentry, fa); + err = inode->i_op->fileattr_set(idmap, dentry, fa); } inode_unlock(inode); @@ -714,7 +714,7 @@ static int ioctl_getflags(struct file *file, unsigned int __user *argp) static int ioctl_setflags(struct file *file, unsigned int __user *argp) { - struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct mnt_idmap *idmap = file_mnt_idmap(file); struct dentry *dentry = file->f_path.dentry; struct fileattr fa; unsigned int flags; @@ -725,7 +725,7 @@ static int ioctl_setflags(struct file 
*file, unsigned int __user *argp) err = mnt_want_write_file(file); if (!err) { fileattr_fill_flags(&fa, flags); - err = vfs_fileattr_set(mnt_userns, dentry, &fa); + err = vfs_fileattr_set(idmap, dentry, &fa); mnt_drop_write_file(file); } } @@ -746,7 +746,7 @@ static int ioctl_fsgetxattr(struct file *file, void __user *argp) static int ioctl_fssetxattr(struct file *file, void __user *argp) { - struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct mnt_idmap *idmap = file_mnt_idmap(file); struct dentry *dentry = file->f_path.dentry; struct fileattr fa; int err; @@ -755,7 +755,7 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) if (!err) { err = mnt_want_write_file(file); if (!err) { - err = vfs_fileattr_set(mnt_userns, dentry, &fa); + err = vfs_fileattr_set(idmap, dentry, &fa); mnt_drop_write_file(file); } } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 9804714b1751..f771001574d0 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -217,16 +217,10 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, { blk_opf_t opflags = REQ_SYNC | REQ_IDLE; - if (!(dio->flags & IOMAP_DIO_WRITE)) { - WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND); + if (!(dio->flags & IOMAP_DIO_WRITE)) return REQ_OP_READ; - } - - if (iomap->flags & IOMAP_F_ZONE_APPEND) - opflags |= REQ_OP_ZONE_APPEND; - else - opflags |= REQ_OP_WRITE; + opflags |= REQ_OP_WRITE; if (use_fua) opflags |= REQ_FUA; else diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 8bb58ce5c06c..888a7ceb6479 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -229,7 +229,7 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a return rc; } -int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int jffs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int rc, xprefix; @@ -241,7 +241,7 @@ int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry 
*dentry, if (acl) { umode_t mode; - rc = posix_acl_update_mode(&init_user_ns, inode, &mode, + rc = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (rc) return rc; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index ca36a6eca594..e976b8cb82cf 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -28,7 +28,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL struct posix_acl *jffs2_get_acl(struct inode *inode, int type, bool rcu); -int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int jffs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); extern int jffs2_init_acl_post(struct inode *); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index f399b390b5f6..5075a0a6d594 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -24,20 +24,20 @@ static int jffs2_readdir (struct file *, struct dir_context *); -static int jffs2_create (struct user_namespace *, struct inode *, +static int jffs2_create (struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); static struct dentry *jffs2_lookup (struct inode *,struct dentry *, unsigned int); static int jffs2_link (struct dentry *,struct inode *,struct dentry *); static int jffs2_unlink (struct inode *,struct dentry *); -static int jffs2_symlink (struct user_namespace *, struct inode *, +static int jffs2_symlink (struct mnt_idmap *, struct inode *, struct dentry *, const char *); -static int jffs2_mkdir (struct user_namespace *, struct inode *,struct dentry *, +static int jffs2_mkdir (struct mnt_idmap *, struct inode *,struct dentry *, umode_t); static int jffs2_rmdir (struct inode *,struct dentry *); -static int jffs2_mknod (struct user_namespace *, struct inode *,struct dentry *, +static int jffs2_mknod (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); -static int jffs2_rename (struct user_namespace *, struct inode *, +static int jffs2_rename 
(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); @@ -160,7 +160,7 @@ static int jffs2_readdir(struct file *file, struct dir_context *ctx) /***********************************************************************/ -static int jffs2_create(struct user_namespace *mnt_userns, struct inode *dir_i, +static int jffs2_create(struct mnt_idmap *idmap, struct inode *dir_i, struct dentry *dentry, umode_t mode, bool excl) { struct jffs2_raw_inode *ri; @@ -279,7 +279,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de /***********************************************************************/ -static int jffs2_symlink (struct user_namespace *mnt_userns, struct inode *dir_i, +static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i, struct dentry *dentry, const char *target) { struct jffs2_inode_info *f, *dir_f; @@ -442,7 +442,7 @@ static int jffs2_symlink (struct user_namespace *mnt_userns, struct inode *dir_i } -static int jffs2_mkdir (struct user_namespace *mnt_userns, struct inode *dir_i, +static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, struct dentry *dentry, umode_t mode) { struct jffs2_inode_info *f, *dir_f; @@ -614,7 +614,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) return ret; } -static int jffs2_mknod (struct user_namespace *mnt_userns, struct inode *dir_i, +static int jffs2_mknod (struct mnt_idmap *idmap, struct inode *dir_i, struct dentry *dentry, umode_t mode, dev_t rdev) { struct jffs2_inode_info *f, *dir_f; @@ -762,7 +762,7 @@ static int jffs2_mknod (struct user_namespace *mnt_userns, struct inode *dir_i, return ret; } -static int jffs2_rename (struct user_namespace *mnt_userns, +static int jffs2_rename (struct mnt_idmap *idmap, struct inode *old_dir_i, struct dentry *old_dentry, struct inode *new_dir_i, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 
66af51c41619..09174898efd0 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -190,19 +190,19 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) return 0; } -int jffs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int jffs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int rc; - rc = setattr_prepare(&init_user_ns, dentry, iattr); + rc = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (rc) return rc; rc = jffs2_do_setattr(inode, iattr); if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + rc = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); return rc; } diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 921d782583d6..8da19766c101 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -164,7 +164,7 @@ long jffs2_ioctl(struct file *, unsigned int, unsigned long); extern const struct inode_operations jffs2_symlink_inode_operations; /* fs.c */ -int jffs2_setattr (struct user_namespace *, struct dentry *, struct iattr *); +int jffs2_setattr (struct mnt_idmap *, struct dentry *, struct iattr *); int jffs2_do_setattr (struct inode *, struct iattr *); struct inode *jffs2_iget(struct super_block *, unsigned long); void jffs2_evict_inode (struct inode *); diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index aef5522551db..437f3a2c1b54 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -57,7 +57,7 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler, } static int jffs2_security_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index cc3f24883e7d..b7c5da2d89bd 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c 
@@ -25,7 +25,7 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler, } static int jffs2_trusted_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index fb945977c013..f64edce4927b 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -25,7 +25,7 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler, } static int jffs2_user_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 3b667eccc73b..fb96f872d207 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -94,7 +94,7 @@ out: return rc; } -int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int jfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int rc; @@ -106,7 +106,7 @@ int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, tid = txBegin(inode->i_sb, 0); mutex_lock(&JFS_IP(inode)->commit_mutex); if (type == ACL_TYPE_ACCESS && acl) { - rc = posix_acl_update_mode(&init_user_ns, inode, &mode, &acl); + rc = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (rc) goto end_tx; if (mode != inode->i_mode) diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 88663465aecd..2ee35be49de1 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -85,24 +85,24 @@ static int jfs_release(struct inode *inode, struct file *file) return 0; } -int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int jfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int rc; - rc = 
setattr_prepare(&init_user_ns, dentry, iattr); + rc = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (rc) return rc; - if (is_quota_modification(mnt_userns, inode, iattr)) { + if (is_quota_modification(&nop_mnt_idmap, inode, iattr)) { rc = dquot_initialize(inode); if (rc) return rc; } if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { - rc = dquot_transfer(mnt_userns, inode, iattr); + rc = dquot_transfer(&nop_mnt_idmap, inode, iattr); if (rc) return rc; } @@ -119,11 +119,11 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, jfs_truncate(inode); } - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) - rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + rc = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); return rc; } diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 1e7b177ece60..ed7989bc2db1 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -70,7 +70,7 @@ int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int jfs_fileattr_set(struct user_namespace *mnt_userns, +int jfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index f0704a25835f..f892e54d0fcd 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -8,7 +8,7 @@ #ifdef CONFIG_JFS_POSIX_ACL struct posix_acl *jfs_get_acl(struct inode *inode, int type, bool rcu); -int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int jfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 59379089e939..9e1f02767201 100644 --- a/fs/jfs/jfs_inode.c +++ 
b/fs/jfs/jfs_inode.c @@ -64,7 +64,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) goto fail_put; } - inode_init_owner(&init_user_ns, inode, parent, mode); + inode_init_owner(&nop_mnt_idmap, inode, parent, mode); /* * New inodes need to save sane values on disk when * uid & gid mount options are used diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 7de961a81862..ea80661597ac 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -10,7 +10,7 @@ struct fid; extern struct inode *ialloc(struct inode *, umode_t); extern int jfs_fsync(struct file *, loff_t, loff_t, int); extern int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -extern int jfs_fileattr_set(struct user_namespace *mnt_userns, +extern int jfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); extern long jfs_ioctl(struct file *, unsigned int, unsigned long); extern struct inode *jfs_iget(struct super_block *, unsigned long); @@ -28,7 +28,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); extern void jfs_set_inode_flags(struct inode *); extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); -extern int jfs_setattr(struct user_namespace *, struct dentry *, struct iattr *); +extern int jfs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern const struct address_space_operations jfs_aops; extern const struct inode_operations jfs_dir_inode_operations; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index a38d14eed047..b29d68b5eec5 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -59,7 +59,7 @@ static inline void free_ea_wmap(struct inode *inode) * RETURN: Errors from subroutines * */ -static int jfs_create(struct user_namespace *mnt_userns, struct inode *dip, +static int jfs_create(struct mnt_idmap *idmap, struct inode *dip, struct dentry *dentry, umode_t mode, bool excl) { int rc = 0; @@ -192,7 +192,7 @@ static int jfs_create(struct 
user_namespace *mnt_userns, struct inode *dip, * note: * EACCES: user needs search+write permission on the parent directory */ -static int jfs_mkdir(struct user_namespace *mnt_userns, struct inode *dip, +static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, struct dentry *dentry, umode_t mode) { int rc = 0; @@ -869,7 +869,7 @@ static int jfs_link(struct dentry *old_dentry, * an intermediate result whose length exceeds PATH_MAX [XPG4.2] */ -static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip, +static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, struct dentry *dentry, const char *name) { int rc; @@ -1059,7 +1059,7 @@ static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip, * * FUNCTION: rename a file or directory */ -static int jfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -1345,7 +1345,7 @@ static int jfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, * * FUNCTION: Create a special file (device) */ -static int jfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int jfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct jfs_inode_info *jfs_ip; diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index f9273f6901c8..f817798fa1eb 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -932,7 +932,7 @@ static int jfs_xattr_get(const struct xattr_handler *handler, } static int jfs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -951,7 +951,7 @@ static int jfs_xattr_get_os2(const struct xattr_handler *handler, } static int jfs_xattr_set_os2(const struct 
xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 935ef8cb02b2..e3181c3e1988 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1200,7 +1200,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, return d_splice_alias(inode, dentry); } -static int kernfs_iop_mkdir(struct user_namespace *mnt_userns, +static int kernfs_iop_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { @@ -1238,7 +1238,7 @@ static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) return ret; } -static int kernfs_iop_rename(struct user_namespace *mnt_userns, +static int kernfs_iop_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index eac0f210299a..30494dcb0df3 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -107,7 +107,7 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) return ret; } -int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); @@ -120,7 +120,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, root = kernfs_root(kn); down_write(&root->kernfs_rwsem); - error = setattr_prepare(&init_user_ns, dentry, iattr); + error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) goto out; @@ -129,7 +129,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, goto out; /* this ignores size changes */ - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); out: up_write(&root->kernfs_rwsem); @@ -181,7 
+181,7 @@ static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) set_nlink(inode, kn->dir.subdirs + 2); } -int kernfs_iop_getattr(struct user_namespace *mnt_userns, +int kernfs_iop_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -191,7 +191,7 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns, down_read(&root->kernfs_rwsem); kernfs_refresh_inode(kn, inode); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); up_read(&root->kernfs_rwsem); return 0; @@ -272,7 +272,7 @@ void kernfs_evict_inode(struct inode *inode) kernfs_put(kn); } -int kernfs_iop_permission(struct user_namespace *mnt_userns, +int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct kernfs_node *kn; @@ -287,7 +287,7 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns, down_read(&root->kernfs_rwsem); kernfs_refresh_inode(kn, inode); - ret = generic_permission(&init_user_ns, inode, mask); + ret = generic_permission(&nop_mnt_idmap, inode, mask); up_read(&root->kernfs_rwsem); return ret; @@ -324,7 +324,7 @@ static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, } static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) @@ -391,7 +391,7 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, } static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 9046d9f39e63..236c3a6113f1 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h 
@@ -127,11 +127,11 @@ extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; */ extern const struct xattr_handler *kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); -int kernfs_iop_permission(struct user_namespace *mnt_userns, +int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); -int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); -int kernfs_iop_getattr(struct user_namespace *mnt_userns, +int kernfs_iop_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c index 4d9e0b54e3db..3507d8f89074 100644 --- a/fs/ksmbd/ndr.c +++ b/fs/ksmbd/ndr.c @@ -338,7 +338,7 @@ static int ndr_encode_posix_acl_entry(struct ndr *n, struct xattr_smb_acl *acl) } int ndr_encode_posix_acl(struct ndr *n, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct inode *inode, struct xattr_smb_acl *acl, struct xattr_smb_acl *def_acl) @@ -374,11 +374,11 @@ int ndr_encode_posix_acl(struct ndr *n, if (ret) return ret; - vfsuid = i_uid_into_vfsuid(user_ns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); ret = ndr_write_int64(n, from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid))); if (ret) return ret; - vfsgid = i_gid_into_vfsgid(user_ns, inode); + vfsgid = i_gid_into_vfsgid(idmap, inode); ret = ndr_write_int64(n, from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid))); if (ret) return ret; diff --git a/fs/ksmbd/ndr.h b/fs/ksmbd/ndr.h index 60ca265d1bb0..f3c108c8cf4d 100644 --- a/fs/ksmbd/ndr.h +++ b/fs/ksmbd/ndr.h @@ -14,7 +14,7 @@ struct ndr { int ndr_encode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da); int ndr_decode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da); -int ndr_encode_posix_acl(struct ndr 
*n, struct user_namespace *user_ns, +int ndr_encode_posix_acl(struct ndr *n, struct mnt_idmap *idmap, struct inode *inode, struct xattr_smb_acl *acl, struct xattr_smb_acl *def_acl); int ndr_encode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl); diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c index d7d47b82451d..2e54ded4d92c 100644 --- a/fs/ksmbd/oplock.c +++ b/fs/ksmbd/oplock.c @@ -1608,9 +1608,9 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp) { struct create_posix_rsp *buf; struct inode *inode = file_inode(fp->filp); - struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); - vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode); - vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode); + struct mnt_idmap *idmap = file_mnt_idmap(fp->filp); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); buf = (struct create_posix_rsp *)cc; memset(buf, 0, sizeof(struct create_posix_rsp)); diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index d681f91947d9..4ef6e1e59a40 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -12,6 +12,7 @@ #include <linux/ethtool.h> #include <linux/falloc.h> #include <linux/mount.h> +#include <linux/filelock.h> #include "glob.h" #include "smbfsctl.h" @@ -2192,7 +2193,7 @@ out: static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, const struct path *path) { - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); char *attr_name = NULL, *value; int rc = 0; unsigned int next = 0; @@ -2228,7 +2229,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, value = (char *)&eabuf->name + eabuf->EaNameLength + 1; if (!eabuf->EaValueLength) { - rc = ksmbd_vfs_casexattr_len(user_ns, + rc = ksmbd_vfs_casexattr_len(idmap, path->dentry, attr_name, XATTR_USER_PREFIX_LEN + @@ -2236,7 +2237,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, /* delete the EA only 
when it exits */ if (rc > 0) { - rc = ksmbd_vfs_remove_xattr(user_ns, + rc = ksmbd_vfs_remove_xattr(idmap, path->dentry, attr_name); @@ -2251,7 +2252,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, /* if the EA doesn't exist, just do nothing. */ rc = 0; } else { - rc = ksmbd_vfs_setxattr(user_ns, + rc = ksmbd_vfs_setxattr(idmap, path->dentry, attr_name, value, le16_to_cpu(eabuf->EaValueLength), 0); if (rc < 0) { @@ -2281,7 +2282,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, struct ksmbd_file *fp, char *stream_name, int s_type) { - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); size_t xattr_stream_size; char *xattr_stream_name; int rc; @@ -2297,7 +2298,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, fp->stream.size = xattr_stream_size; /* Check if there is stream prefix in xattr space */ - rc = ksmbd_vfs_casexattr_len(user_ns, + rc = ksmbd_vfs_casexattr_len(idmap, path->dentry, xattr_stream_name, xattr_stream_size); @@ -2309,7 +2310,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, return -EBADF; } - rc = ksmbd_vfs_setxattr(user_ns, path->dentry, + rc = ksmbd_vfs_setxattr(idmap, path->dentry, xattr_stream_name, NULL, 0, 0); if (rc < 0) pr_err("Failed to store XATTR stream name :%d\n", rc); @@ -2318,7 +2319,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, static int smb2_remove_smb_xattrs(const struct path *path) { - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); char *name, *xattr_list = NULL; ssize_t xattr_list_len; int err = 0; @@ -2338,7 +2339,7 @@ static int smb2_remove_smb_xattrs(const struct path *path) if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && !strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX, STREAM_PREFIX_LEN)) { - err = ksmbd_vfs_remove_xattr(user_ns, path->dentry, + err 
= ksmbd_vfs_remove_xattr(idmap, path->dentry, name); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", @@ -2385,7 +2386,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path * da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | XATTR_DOSINFO_ITIME; - rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_user_ns(path->mnt), + rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), path->dentry, &da); if (rc) ksmbd_debug(SMB, "failed to store file attribute into xattr\n"); @@ -2404,7 +2405,7 @@ static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon, KSMBD_SHARE_FLAG_STORE_DOS_ATTRS)) return; - rc = ksmbd_vfs_get_dos_attrib_xattr(mnt_user_ns(path->mnt), + rc = ksmbd_vfs_get_dos_attrib_xattr(mnt_idmap(path->mnt), path->dentry, &da); if (rc > 0) { fp->f_ci->m_fattr = cpu_to_le32(da.attr); @@ -2479,11 +2480,11 @@ static int smb2_create_sd_buffer(struct ksmbd_work *work, } static void ksmbd_acls_fattr(struct smb_fattr *fattr, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *inode) { - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); fattr->cf_uid = vfsuid_into_kuid(vfsuid); fattr->cf_gid = vfsgid_into_kgid(vfsgid); @@ -2515,7 +2516,7 @@ int smb2_open(struct ksmbd_work *work) struct ksmbd_share_config *share = tcon->share_conf; struct ksmbd_file *fp = NULL; struct file *filp = NULL; - struct user_namespace *user_ns = NULL; + struct mnt_idmap *idmap = NULL; struct kstat stat; struct create_context *context; struct lease_ctx_info *lc = NULL; @@ -2768,7 +2769,7 @@ int smb2_open(struct ksmbd_work *work) rc = 0; } else { file_present = true; - user_ns = mnt_user_ns(path.mnt); + idmap = mnt_idmap(path.mnt); } if (stream_name) { if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) { @@ -2831,7 +2832,7 @@ int smb2_open(struct ksmbd_work *work) if 
(!file_present) { daccess = cpu_to_le32(GENERIC_ALL_FLAGS); } else { - rc = ksmbd_vfs_query_maximal_access(user_ns, + rc = ksmbd_vfs_query_maximal_access(idmap, path.dentry, &daccess); if (rc) @@ -2867,7 +2868,7 @@ int smb2_open(struct ksmbd_work *work) } created = true; - user_ns = mnt_user_ns(path.mnt); + idmap = mnt_idmap(path.mnt); if (ea_buf) { if (le32_to_cpu(ea_buf->ccontext.DataLength) < sizeof(struct smb2_ea_info)) { @@ -2889,7 +2890,7 @@ int smb2_open(struct ksmbd_work *work) * is already granted. */ if (daccess & ~(FILE_READ_ATTRIBUTES_LE | FILE_READ_CONTROL_LE)) { - rc = inode_permission(user_ns, + rc = inode_permission(idmap, d_inode(path.dentry), may_flags); if (rc) @@ -2897,7 +2898,7 @@ int smb2_open(struct ksmbd_work *work) if ((daccess & FILE_DELETE_LE) || (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) { - rc = ksmbd_vfs_may_delete(user_ns, + rc = ksmbd_vfs_may_delete(idmap, path.dentry); if (rc) goto err_out; @@ -2960,7 +2961,7 @@ int smb2_open(struct ksmbd_work *work) int posix_acl_rc; struct inode *inode = d_inode(path.dentry); - posix_acl_rc = ksmbd_vfs_inherit_posix_acl(user_ns, + posix_acl_rc = ksmbd_vfs_inherit_posix_acl(idmap, path.dentry, d_inode(path.dentry->d_parent)); if (posix_acl_rc) @@ -2976,7 +2977,7 @@ int smb2_open(struct ksmbd_work *work) rc = smb2_create_sd_buffer(work, req, &path); if (rc) { if (posix_acl_rc) - ksmbd_vfs_set_init_posix_acl(user_ns, + ksmbd_vfs_set_init_posix_acl(idmap, path.dentry); if (test_share_config_flag(work->tcon->share_conf, @@ -2985,7 +2986,7 @@ int smb2_open(struct ksmbd_work *work) struct smb_ntsd *pntsd; int pntsd_size, ace_num = 0; - ksmbd_acls_fattr(&fattr, user_ns, inode); + ksmbd_acls_fattr(&fattr, idmap, inode); if (fattr.cf_acls) ace_num = fattr.cf_acls->a_count; if (fattr.cf_dacls) @@ -2999,7 +3000,7 @@ int smb2_open(struct ksmbd_work *work) if (!pntsd) goto err_out; - rc = build_sec_desc(user_ns, + rc = build_sec_desc(idmap, pntsd, NULL, 0, OWNER_SECINFO | GROUP_SECINFO | @@ -3013,7 +3014,7 
@@ int smb2_open(struct ksmbd_work *work) } rc = ksmbd_vfs_set_sd_xattr(conn, - user_ns, + idmap, path.dentry, pntsd, pntsd_size); @@ -3209,7 +3210,7 @@ int smb2_open(struct ksmbd_work *work) struct create_context *mxac_ccontext; if (maximal_access == 0) - ksmbd_vfs_query_maximal_access(user_ns, + ksmbd_vfs_query_maximal_access(idmap, path.dentry, &maximal_access); mxac_ccontext = (struct create_context *)(rsp->Buffer + @@ -3634,7 +3635,7 @@ static void unlock_dir(struct ksmbd_file *dir_fp) static int process_query_dir_entries(struct smb2_query_dir_private *priv) { - struct user_namespace *user_ns = file_mnt_user_ns(priv->dir_fp->filp); + struct mnt_idmap *idmap = file_mnt_idmap(priv->dir_fp->filp); struct kstat kstat; struct ksmbd_kstat ksmbd_kstat; int rc; @@ -3647,7 +3648,7 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv) return -EINVAL; lock_dir(priv->dir_fp); - dent = lookup_one(user_ns, priv->d_info->name, + dent = lookup_one(idmap, priv->d_info->name, priv->dir_fp->filp->f_path.dentry, priv->d_info->name_len); unlock_dir(priv->dir_fp); @@ -3668,7 +3669,7 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv) ksmbd_kstat.kstat = &kstat; if (priv->info_level != FILE_NAMES_INFORMATION) ksmbd_vfs_fill_dentry_attrs(priv->work, - user_ns, + idmap, dent, &ksmbd_kstat); @@ -3898,7 +3899,7 @@ int smb2_query_dir(struct ksmbd_work *work) } if (!(dir_fp->daccess & FILE_LIST_DIRECTORY_LE) || - inode_permission(file_mnt_user_ns(dir_fp->filp), + inode_permission(file_mnt_idmap(dir_fp->filp), file_inode(dir_fp->filp), MAY_READ | MAY_EXEC)) { pr_err("no right to enumerate directory (%pD)\n", dir_fp->filp); @@ -4164,7 +4165,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, ssize_t buf_free_len, alignment_bytes, next_offset, rsp_data_cnt = 0; struct smb2_ea_info_req *ea_req = NULL; const struct path *path; - struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); + struct mnt_idmap *idmap = 
file_mnt_idmap(fp->filp); if (!(fp->daccess & FILE_READ_EA_LE)) { pr_err("Not permitted to read ext attr : 0x%x\n", @@ -4244,7 +4245,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, buf_free_len -= (offsetof(struct smb2_ea_info, name) + name_len + 1); /* bailout if xattr can't fit in buf_free_len */ - value_len = ksmbd_vfs_getxattr(user_ns, path->dentry, + value_len = ksmbd_vfs_getxattr(idmap, path->dentry, name, &buf); if (value_len <= 0) { rc = -ENOENT; @@ -4334,7 +4335,7 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp, } basic_info = (struct smb2_file_basic_info *)rsp->Buffer; - generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), + generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), &stat); basic_info->CreationTime = cpu_to_le64(fp->create_time); time = ksmbd_UnixTimeToNT(stat.atime); @@ -4375,7 +4376,7 @@ static void get_file_standard_info(struct smb2_query_info_rsp *rsp, struct kstat stat; inode = file_inode(fp->filp); - generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); sinfo = (struct smb2_file_standard_info *)rsp->Buffer; delete_pending = ksmbd_inode_pending_delete(fp); @@ -4429,7 +4430,7 @@ static int get_file_all_info(struct ksmbd_work *work, return PTR_ERR(filename); inode = file_inode(fp->filp); - generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); ksmbd_debug(SMB, "filename = %s\n", filename); delete_pending = ksmbd_inode_pending_delete(fp); @@ -4506,7 +4507,7 @@ static void get_file_stream_info(struct ksmbd_work *work, int buf_free_len; struct smb2_query_info_req *req = ksmbd_req_buf_next(work); - generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), + generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), &stat); file_info = (struct smb2_file_stream_info *)rsp->Buffer; @@ -4597,7 +4598,7 @@ static void 
get_file_internal_info(struct smb2_query_info_rsp *rsp, struct smb2_file_internal_info *file_info; struct kstat stat; - generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), + generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), &stat); file_info = (struct smb2_file_internal_info *)rsp->Buffer; file_info->IndexNumber = cpu_to_le64(stat.ino); @@ -4623,7 +4624,7 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, file_info = (struct smb2_file_ntwrk_info *)rsp->Buffer; inode = file_inode(fp->filp); - generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); file_info->CreationTime = cpu_to_le64(fp->create_time); time = ksmbd_UnixTimeToNT(stat.atime); @@ -4684,7 +4685,7 @@ static void get_file_compression_info(struct smb2_query_info_rsp *rsp, struct smb2_file_comp_info *file_info; struct kstat stat; - generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), + generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), &stat); file_info = (struct smb2_file_comp_info *)rsp->Buffer; @@ -4725,9 +4726,9 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, { struct smb311_posix_qinfo *file_info; struct inode *inode = file_inode(fp->filp); - struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); - vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode); - vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode); + struct mnt_idmap *idmap = file_mnt_idmap(fp->filp); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); u64 time; int out_buf_len = sizeof(struct smb311_posix_qinfo) + 32; @@ -5127,7 +5128,7 @@ static int smb2_get_info_sec(struct ksmbd_work *work, struct smb2_query_info_rsp *rsp) { struct ksmbd_file *fp; - struct user_namespace *user_ns; + struct mnt_idmap *idmap; struct smb_ntsd *pntsd = (struct smb_ntsd *)rsp->Buffer, *ppntsd = NULL; struct smb_fattr fattr = {{0}}; struct inode 
*inode; @@ -5174,19 +5175,19 @@ static int smb2_get_info_sec(struct ksmbd_work *work, if (!fp) return -ENOENT; - user_ns = file_mnt_user_ns(fp->filp); + idmap = file_mnt_idmap(fp->filp); inode = file_inode(fp->filp); - ksmbd_acls_fattr(&fattr, user_ns, inode); + ksmbd_acls_fattr(&fattr, idmap, inode); if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) - ppntsd_size = ksmbd_vfs_get_sd_xattr(work->conn, user_ns, + ppntsd_size = ksmbd_vfs_get_sd_xattr(work->conn, idmap, fp->filp->f_path.dentry, &ppntsd); /* Check if sd buffer size exceeds response buffer size */ if (smb2_resp_buf_len(work, 8) > ppntsd_size) - rc = build_sec_desc(user_ns, pntsd, ppntsd, ppntsd_size, + rc = build_sec_desc(idmap, pntsd, ppntsd, ppntsd_size, addition_info, &secdesclen, &fattr); posix_acl_release(fattr.cf_acls); posix_acl_release(fattr.cf_dacls); @@ -5416,7 +5417,7 @@ int smb2_echo(struct ksmbd_work *work) static int smb2_rename(struct ksmbd_work *work, struct ksmbd_file *fp, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct smb2_file_rename_info *file_info, struct nls_table *local_nls) { @@ -5479,7 +5480,7 @@ static int smb2_rename(struct ksmbd_work *work, if (rc) goto out; - rc = ksmbd_vfs_setxattr(user_ns, + rc = ksmbd_vfs_setxattr(idmap, fp->filp->f_path.dentry, xattr_stream_name, NULL, 0, 0); @@ -5618,7 +5619,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, struct iattr attrs; struct file *filp; struct inode *inode; - struct user_namespace *user_ns; + struct mnt_idmap *idmap; int rc = 0; if (!(fp->daccess & FILE_WRITE_ATTRIBUTES_LE)) @@ -5627,7 +5628,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, attrs.ia_valid = 0; filp = fp->filp; inode = file_inode(filp); - user_ns = file_mnt_user_ns(filp); + idmap = file_mnt_idmap(filp); if (file_info->CreationTime) fp->create_time = le64_to_cpu(file_info->CreationTime); @@ -5671,7 +5672,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, da.flags = XATTR_DOSINFO_ATTRIB | 
XATTR_DOSINFO_CREATE_TIME | XATTR_DOSINFO_ITIME; - rc = ksmbd_vfs_set_dos_attrib_xattr(user_ns, + rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, filp->f_path.dentry, &da); if (rc) ksmbd_debug(SMB, @@ -5689,7 +5690,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, inode_lock(inode); inode->i_ctime = attrs.ia_ctime; attrs.ia_valid &= ~ATTR_CTIME; - rc = notify_change(user_ns, dentry, &attrs, NULL); + rc = notify_change(idmap, dentry, &attrs, NULL); inode_unlock(inode); } return rc; @@ -5782,7 +5783,7 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, struct smb2_file_rename_info *rename_info, unsigned int buf_len) { - struct user_namespace *user_ns; + struct mnt_idmap *idmap; struct ksmbd_file *parent_fp; struct dentry *parent; struct dentry *dentry = fp->filp->f_path.dentry; @@ -5797,12 +5798,12 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, le32_to_cpu(rename_info->FileNameLength)) return -EINVAL; - user_ns = file_mnt_user_ns(fp->filp); + idmap = file_mnt_idmap(fp->filp); if (ksmbd_stream_fd(fp)) goto next; parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry); + ret = ksmbd_vfs_lock_parent(idmap, parent, dentry); if (ret) { dput(parent); return ret; @@ -5821,7 +5822,7 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, ksmbd_fd_put(work, parent_fp); } next: - return smb2_rename(work, fp, user_ns, rename_info, + return smb2_rename(work, fp, idmap, rename_info, work->conn->local_nls); } @@ -7530,14 +7531,14 @@ static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id, struct file_sparse *sparse) { struct ksmbd_file *fp; - struct user_namespace *user_ns; + struct mnt_idmap *idmap; int ret = 0; __le32 old_fattr; fp = ksmbd_lookup_fd_fast(work, id); if (!fp) return -ENOENT; - user_ns = file_mnt_user_ns(fp->filp); + idmap = file_mnt_idmap(fp->filp); old_fattr = fp->f_ci->m_fattr; if (sparse->SetSparse) @@ -7550,13 +7551,13 @@ static inline int 
fsctl_set_sparse(struct ksmbd_work *work, u64 id, KSMBD_SHARE_FLAG_STORE_DOS_ATTRS)) { struct xattr_dos_attrib da; - ret = ksmbd_vfs_get_dos_attrib_xattr(user_ns, + ret = ksmbd_vfs_get_dos_attrib_xattr(idmap, fp->filp->f_path.dentry, &da); if (ret <= 0) goto out; da.attr = le32_to_cpu(fp->f_ci->m_fattr); - ret = ksmbd_vfs_set_dos_attrib_xattr(user_ns, + ret = ksmbd_vfs_set_dos_attrib_xattr(idmap, fp->filp->f_path.dentry, &da); if (ret) fp->f_ci->m_fattr = old_fattr; diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index 2a4fbbd55b91..fa2b54df6ee6 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -307,7 +307,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, { int i, rc = 0; struct ksmbd_conn *conn = work->conn; - struct user_namespace *user_ns = file_mnt_user_ns(dir->filp); + struct mnt_idmap *idmap = file_mnt_idmap(dir->filp); for (i = 0; i < 2; i++) { struct kstat kstat; @@ -333,7 +333,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, ksmbd_kstat.kstat = &kstat; ksmbd_vfs_fill_dentry_attrs(work, - user_ns, + idmap, dentry, &ksmbd_kstat); rc = fn(conn, info_level, d_info, &ksmbd_kstat); diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index ab5c68cc0e13..6d6cfb6957a9 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -254,7 +254,7 @@ void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid) ssid->num_subauth++; } -static int sid_to_id(struct user_namespace *user_ns, +static int sid_to_id(struct mnt_idmap *idmap, struct smb_sid *psid, uint sidtype, struct smb_fattr *fattr) { @@ -276,7 +276,7 @@ static int sid_to_id(struct user_namespace *user_ns, id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); uid = KUIDT_INIT(id); - uid = from_vfsuid(user_ns, &init_user_ns, VFSUIDT_INIT(uid)); + uid = from_vfsuid(idmap, &init_user_ns, VFSUIDT_INIT(uid)); if (uid_valid(uid)) { fattr->cf_uid = uid; rc = 0; @@ -287,7 +287,7 @@ static int sid_to_id(struct 
user_namespace *user_ns, id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); gid = KGIDT_INIT(id); - gid = from_vfsgid(user_ns, &init_user_ns, VFSGIDT_INIT(gid)); + gid = from_vfsgid(idmap, &init_user_ns, VFSGIDT_INIT(gid)); if (gid_valid(gid)) { fattr->cf_gid = gid; rc = 0; @@ -362,7 +362,7 @@ void free_acl_state(struct posix_acl_state *state) kfree(state->groups); } -static void parse_dacl(struct user_namespace *user_ns, +static void parse_dacl(struct mnt_idmap *idmap, struct smb_acl *pdacl, char *end_of_acl, struct smb_sid *pownersid, struct smb_sid *pgrpsid, struct smb_fattr *fattr) @@ -489,7 +489,7 @@ static void parse_dacl(struct user_namespace *user_ns, acl_mode = access_flags_to_mode(fattr, ppace[i]->access_req, ppace[i]->type); temp_fattr.cf_uid = INVALID_UID; - ret = sid_to_id(user_ns, &ppace[i]->sid, SIDOWNER, &temp_fattr); + ret = sid_to_id(idmap, &ppace[i]->sid, SIDOWNER, &temp_fattr); if (ret || uid_eq(temp_fattr.cf_uid, INVALID_UID)) { pr_err("%s: Error %d mapping Owner SID to uid\n", __func__, ret); @@ -575,7 +575,7 @@ static void parse_dacl(struct user_namespace *user_ns, free_acl_state(&default_acl_state); } -static void set_posix_acl_entries_dacl(struct user_namespace *user_ns, +static void set_posix_acl_entries_dacl(struct mnt_idmap *idmap, struct smb_ace *pndace, struct smb_fattr *fattr, u32 *num_aces, u16 *size, u32 nt_aces_num) @@ -600,14 +600,14 @@ static void set_posix_acl_entries_dacl(struct user_namespace *user_ns, uid_t uid; unsigned int sid_type = SIDOWNER; - uid = posix_acl_uid_translate(user_ns, pace); + uid = posix_acl_uid_translate(idmap, pace); if (!uid) sid_type = SIDUNIX_USER; id_to_sid(uid, sid_type, sid); } else if (pace->e_tag == ACL_GROUP) { gid_t gid; - gid = posix_acl_gid_translate(user_ns, pace); + gid = posix_acl_gid_translate(idmap, pace); id_to_sid(gid, SIDUNIX_GROUP, sid); } else if (pace->e_tag == ACL_OTHER && !nt_aces_num) { smb_copy_sid(sid, &sid_everyone); @@ -666,12 +666,12 @@ posix_default_acl: if 
(pace->e_tag == ACL_USER) { uid_t uid; - uid = posix_acl_uid_translate(user_ns, pace); + uid = posix_acl_uid_translate(idmap, pace); id_to_sid(uid, SIDCREATOR_OWNER, sid); } else if (pace->e_tag == ACL_GROUP) { gid_t gid; - gid = posix_acl_gid_translate(user_ns, pace); + gid = posix_acl_gid_translate(idmap, pace); id_to_sid(gid, SIDCREATOR_GROUP, sid); } else { kfree(sid); @@ -689,7 +689,7 @@ posix_default_acl: } } -static void set_ntacl_dacl(struct user_namespace *user_ns, +static void set_ntacl_dacl(struct mnt_idmap *idmap, struct smb_acl *pndacl, struct smb_acl *nt_dacl, unsigned int aces_size, @@ -723,13 +723,13 @@ static void set_ntacl_dacl(struct user_namespace *user_ns, } } - set_posix_acl_entries_dacl(user_ns, pndace, fattr, + set_posix_acl_entries_dacl(idmap, pndace, fattr, &num_aces, &size, nt_num_aces); pndacl->num_aces = cpu_to_le32(num_aces); pndacl->size = cpu_to_le16(le16_to_cpu(pndacl->size) + size); } -static void set_mode_dacl(struct user_namespace *user_ns, +static void set_mode_dacl(struct mnt_idmap *idmap, struct smb_acl *pndacl, struct smb_fattr *fattr) { struct smb_ace *pace, *pndace; @@ -741,7 +741,7 @@ static void set_mode_dacl(struct user_namespace *user_ns, pace = pndace = (struct smb_ace *)((char *)pndacl + sizeof(struct smb_acl)); if (fattr->cf_acls) { - set_posix_acl_entries_dacl(user_ns, pndace, fattr, + set_posix_acl_entries_dacl(idmap, pndace, fattr, &num_aces, &size, num_aces); goto out; } @@ -808,7 +808,7 @@ static int parse_sid(struct smb_sid *psid, char *end_of_acl) } /* Convert CIFS ACL to POSIX form */ -int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, +int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, int acl_len, struct smb_fattr *fattr) { int rc = 0; @@ -851,7 +851,7 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, return rc; } - rc = sid_to_id(user_ns, owner_sid_ptr, SIDOWNER, fattr); + rc = sid_to_id(idmap, owner_sid_ptr, SIDOWNER, fattr); if (rc) { 
pr_err("%s: Error %d mapping Owner SID to uid\n", __func__, rc); @@ -866,7 +866,7 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, __func__, rc); return rc; } - rc = sid_to_id(user_ns, group_sid_ptr, SIDUNIX_GROUP, fattr); + rc = sid_to_id(idmap, group_sid_ptr, SIDUNIX_GROUP, fattr); if (rc) { pr_err("%s: Error %d mapping Group SID to gid\n", __func__, rc); @@ -881,7 +881,7 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, pntsd->type |= cpu_to_le16(DACL_PROTECTED); if (dacloffset) { - parse_dacl(user_ns, dacl_ptr, end_of_acl, + parse_dacl(idmap, dacl_ptr, end_of_acl, owner_sid_ptr, group_sid_ptr, fattr); } @@ -889,7 +889,7 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, } /* Convert permission bits from mode to equivalent CIFS ACL */ -int build_sec_desc(struct user_namespace *user_ns, +int build_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, struct smb_ntsd *ppntsd, int ppntsd_size, int addition_info, __u32 *secdesclen, struct smb_fattr *fattr) @@ -950,7 +950,7 @@ int build_sec_desc(struct user_namespace *user_ns, dacl_ptr->num_aces = 0; if (!ppntsd) { - set_mode_dacl(user_ns, dacl_ptr, fattr); + set_mode_dacl(idmap, dacl_ptr, fattr); } else { struct smb_acl *ppdacl_ptr; unsigned int dacl_offset = le32_to_cpu(ppntsd->dacloffset); @@ -966,7 +966,7 @@ int build_sec_desc(struct user_namespace *user_ns, ppdacl_size < sizeof(struct smb_acl)) goto out; - set_ntacl_dacl(user_ns, dacl_ptr, ppdacl_ptr, + set_ntacl_dacl(idmap, dacl_ptr, ppdacl_ptr, ntacl_size - sizeof(struct smb_acl), nowner_sid_ptr, ngroup_sid_ptr, fattr); @@ -1002,13 +1002,13 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, struct smb_ntsd *parent_pntsd = NULL; struct smb_sid owner_sid, group_sid; struct dentry *parent = path->dentry->d_parent; - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); int inherited_flags = 0, flags = 0, i, ace_cnt = 0, 
nt_size = 0, pdacl_size; int rc = 0, num_aces, dacloffset, pntsd_type, pntsd_size, acl_len, aces_size; char *aces_base; bool is_dir = S_ISDIR(d_inode(path->dentry)->i_mode); - pntsd_size = ksmbd_vfs_get_sd_xattr(conn, user_ns, + pntsd_size = ksmbd_vfs_get_sd_xattr(conn, idmap, parent, &parent_pntsd); if (pntsd_size <= 0) return -ENOENT; @@ -1162,7 +1162,7 @@ pass: pntsd_size += sizeof(struct smb_acl) + nt_size; } - ksmbd_vfs_set_sd_xattr(conn, user_ns, + ksmbd_vfs_set_sd_xattr(conn, idmap, path->dentry, pntsd, pntsd_size); kfree(pntsd); } @@ -1190,7 +1190,7 @@ bool smb_inherit_flags(int flags, bool is_dir) int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, __le32 *pdaccess, int uid) { - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); struct smb_ntsd *pntsd = NULL; struct smb_acl *pdacl; struct posix_acl *posix_acls; @@ -1206,7 +1206,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, unsigned short ace_size; ksmbd_debug(SMB, "check permission using windows acl\n"); - pntsd_size = ksmbd_vfs_get_sd_xattr(conn, user_ns, + pntsd_size = ksmbd_vfs_get_sd_xattr(conn, idmap, path->dentry, &pntsd); if (pntsd_size <= 0 || !pntsd) goto err_out; @@ -1296,9 +1296,9 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, pa_entry = posix_acls->a_entries; for (i = 0; i < posix_acls->a_count; i++, pa_entry++) { if (pa_entry->e_tag == ACL_USER) - id = posix_acl_uid_translate(user_ns, pa_entry); + id = posix_acl_uid_translate(idmap, pa_entry); else if (pa_entry->e_tag == ACL_GROUP) - id = posix_acl_gid_translate(user_ns, pa_entry); + id = posix_acl_gid_translate(idmap, pa_entry); else continue; @@ -1360,14 +1360,14 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, int rc; struct smb_fattr fattr = {{0}}; struct inode *inode = d_inode(path->dentry); - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap 
*idmap = mnt_idmap(path->mnt); struct iattr newattrs; fattr.cf_uid = INVALID_UID; fattr.cf_gid = INVALID_GID; fattr.cf_mode = inode->i_mode; - rc = parse_sec_desc(user_ns, pntsd, ntsd_len, &fattr); + rc = parse_sec_desc(idmap, pntsd, ntsd_len, &fattr); if (rc) goto out; @@ -1383,17 +1383,17 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, newattrs.ia_valid |= ATTR_MODE; newattrs.ia_mode = (inode->i_mode & ~0777) | (fattr.cf_mode & 0777); - ksmbd_vfs_remove_acl_xattrs(user_ns, path->dentry); + ksmbd_vfs_remove_acl_xattrs(idmap, path->dentry); /* Update posix acls */ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) { - rc = set_posix_acl(user_ns, path->dentry, + rc = set_posix_acl(idmap, path->dentry, ACL_TYPE_ACCESS, fattr.cf_acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); if (S_ISDIR(inode->i_mode) && fattr.cf_dacls) { - rc = set_posix_acl(user_ns, path->dentry, + rc = set_posix_acl(idmap, path->dentry, ACL_TYPE_DEFAULT, fattr.cf_dacls); if (rc) ksmbd_debug(SMB, @@ -1403,7 +1403,7 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, } inode_lock(inode); - rc = notify_change(user_ns, path->dentry, &newattrs, NULL); + rc = notify_change(idmap, path->dentry, &newattrs, NULL); inode_unlock(inode); if (rc) goto out; @@ -1414,8 +1414,8 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, if (test_share_config_flag(tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { /* Update WinACL in xattr */ - ksmbd_vfs_remove_sd_xattrs(user_ns, path->dentry); - ksmbd_vfs_set_sd_xattr(conn, user_ns, + ksmbd_vfs_remove_sd_xattrs(idmap, path->dentry); + ksmbd_vfs_set_sd_xattr(conn, idmap, path->dentry, pntsd, ntsd_len); } diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h index 618f2e0236b3..49a8c292bd2e 100644 --- a/fs/ksmbd/smbacl.h +++ b/fs/ksmbd/smbacl.h @@ -190,9 +190,9 @@ struct posix_acl_state { struct posix_ace_state_array *groups; }; -int 
parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, +int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, int acl_len, struct smb_fattr *fattr); -int build_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, +int build_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, struct smb_ntsd *ppntsd, int ppntsd_size, int addition_info, __u32 *secdesclen, struct smb_fattr *fattr); int init_acl_state(struct posix_acl_state *state, int cnt); @@ -211,25 +211,25 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid); void ksmbd_init_domain(u32 *sub_auth); -static inline uid_t posix_acl_uid_translate(struct user_namespace *mnt_userns, +static inline uid_t posix_acl_uid_translate(struct mnt_idmap *idmap, struct posix_acl_entry *pace) { vfsuid_t vfsuid; /* If this is an idmapped mount, apply the idmapping. */ - vfsuid = make_vfsuid(mnt_userns, &init_user_ns, pace->e_uid); + vfsuid = make_vfsuid(idmap, &init_user_ns, pace->e_uid); /* Translate the kuid into a userspace id ksmbd would see. */ return from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid)); } -static inline gid_t posix_acl_gid_translate(struct user_namespace *mnt_userns, +static inline gid_t posix_acl_gid_translate(struct mnt_idmap *idmap, struct posix_acl_entry *pace) { vfsgid_t vfsgid; /* If this is an idmapped mount, apply the idmapping. */ - vfsgid = make_vfsgid(mnt_userns, &init_user_ns, pace->e_gid); + vfsgid = make_vfsgid(idmap, &init_user_ns, pace->e_gid); /* Translate the kgid into a userspace id ksmbd would see. 
*/ return from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid)); diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index ff0e7a4fcd4d..aa1300b7bfc2 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -6,6 +6,7 @@ #include <linux/kernel.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/uaccess.h> #include <linux/backing-dev.h> #include <linux/writeback.h> @@ -69,14 +70,14 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work, * * the reference count of @parent isn't incremented. */ -int ksmbd_vfs_lock_parent(struct user_namespace *user_ns, struct dentry *parent, +int ksmbd_vfs_lock_parent(struct mnt_idmap *idmap, struct dentry *parent, struct dentry *child) { struct dentry *dentry; int ret = 0; inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); - dentry = lookup_one(user_ns, child->d_name.name, parent, + dentry = lookup_one(idmap, child->d_name.name, parent, child->d_name.len); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); @@ -96,20 +97,20 @@ out_err: return ret; } -int ksmbd_vfs_may_delete(struct user_namespace *user_ns, +int ksmbd_vfs_may_delete(struct mnt_idmap *idmap, struct dentry *dentry) { struct dentry *parent; int ret; parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry); + ret = ksmbd_vfs_lock_parent(idmap, parent, dentry); if (ret) { dput(parent); return ret; } - ret = inode_permission(user_ns, d_inode(parent), + ret = inode_permission(idmap, d_inode(parent), MAY_EXEC | MAY_WRITE); inode_unlock(d_inode(parent)); @@ -117,7 +118,7 @@ int ksmbd_vfs_may_delete(struct user_namespace *user_ns, return ret; } -int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns, +int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, struct dentry *dentry, __le32 *daccess) { struct dentry *parent; @@ -125,26 +126,26 @@ int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns, *daccess = cpu_to_le32(FILE_READ_ATTRIBUTES | READ_CONTROL); - if (!inode_permission(user_ns, d_inode(dentry), MAY_OPEN | 
MAY_WRITE)) + if (!inode_permission(idmap, d_inode(dentry), MAY_OPEN | MAY_WRITE)) *daccess |= cpu_to_le32(WRITE_DAC | WRITE_OWNER | SYNCHRONIZE | FILE_WRITE_DATA | FILE_APPEND_DATA | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES | FILE_DELETE_CHILD); - if (!inode_permission(user_ns, d_inode(dentry), MAY_OPEN | MAY_READ)) + if (!inode_permission(idmap, d_inode(dentry), MAY_OPEN | MAY_READ)) *daccess |= FILE_READ_DATA_LE | FILE_READ_EA_LE; - if (!inode_permission(user_ns, d_inode(dentry), MAY_OPEN | MAY_EXEC)) + if (!inode_permission(idmap, d_inode(dentry), MAY_OPEN | MAY_EXEC)) *daccess |= FILE_EXECUTE_LE; parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry); + ret = ksmbd_vfs_lock_parent(idmap, parent, dentry); if (ret) { dput(parent); return ret; } - if (!inode_permission(user_ns, d_inode(parent), MAY_EXEC | MAY_WRITE)) + if (!inode_permission(idmap, d_inode(parent), MAY_EXEC | MAY_WRITE)) *daccess |= FILE_DELETE_LE; inode_unlock(d_inode(parent)); @@ -177,7 +178,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) } mode |= S_IFREG; - err = vfs_create(mnt_user_ns(path.mnt), d_inode(path.dentry), + err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry), dentry, mode, true); if (!err) { ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), @@ -199,7 +200,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) */ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) { - struct user_namespace *user_ns; + struct mnt_idmap *idmap; struct path path; struct dentry *dentry; int err; @@ -215,15 +216,15 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) return err; } - user_ns = mnt_user_ns(path.mnt); + idmap = mnt_idmap(path.mnt); mode |= S_IFDIR; - err = vfs_mkdir(user_ns, d_inode(path.dentry), dentry, mode); + err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); if (err) { goto out; } else if (d_unhashed(dentry)) { struct dentry 
*d; - d = lookup_one(user_ns, dentry->d_name.name, dentry->d_parent, + d = lookup_one(idmap, dentry->d_name.name, dentry->d_parent, dentry->d_name.len); if (IS_ERR(d)) { err = PTR_ERR(d); @@ -245,7 +246,7 @@ out: return err; } -static ssize_t ksmbd_vfs_getcasexattr(struct user_namespace *user_ns, +static ssize_t ksmbd_vfs_getcasexattr(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name, int attr_name_len, char **attr_value) { @@ -262,7 +263,7 @@ static ssize_t ksmbd_vfs_getcasexattr(struct user_namespace *user_ns, if (strncasecmp(attr_name, name, attr_name_len)) continue; - value_len = ksmbd_vfs_getxattr(user_ns, + value_len = ksmbd_vfs_getxattr(idmap, dentry, name, attr_value); @@ -285,7 +286,7 @@ static int ksmbd_vfs_stream_read(struct ksmbd_file *fp, char *buf, loff_t *pos, ksmbd_debug(VFS, "read stream data pos : %llu, count : %zd\n", *pos, count); - v_len = ksmbd_vfs_getcasexattr(file_mnt_user_ns(fp->filp), + v_len = ksmbd_vfs_getcasexattr(file_mnt_idmap(fp->filp), fp->filp->f_path.dentry, fp->stream.name, fp->stream.size, @@ -409,7 +410,7 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, size_t count) { char *stream_buf = NULL, *wbuf; - struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); + struct mnt_idmap *idmap = file_mnt_idmap(fp->filp); size_t size, v_len; int err = 0; @@ -422,7 +423,7 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, count = (*pos + count) - XATTR_SIZE_MAX; } - v_len = ksmbd_vfs_getcasexattr(user_ns, + v_len = ksmbd_vfs_getcasexattr(idmap, fp->filp->f_path.dentry, fp->stream.name, fp->stream.size, @@ -448,7 +449,7 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, memcpy(&stream_buf[*pos], buf, count); - err = ksmbd_vfs_setxattr(user_ns, + err = ksmbd_vfs_setxattr(idmap, fp->filp->f_path.dentry, fp->stream.name, (void *)stream_buf, @@ -583,7 +584,7 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id) */ 
int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) { - struct user_namespace *user_ns; + struct mnt_idmap *idmap; struct path path; struct dentry *parent; int err; @@ -598,9 +599,9 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) return err; } - user_ns = mnt_user_ns(path.mnt); + idmap = mnt_idmap(path.mnt); parent = dget_parent(path.dentry); - err = ksmbd_vfs_lock_parent(user_ns, parent, path.dentry); + err = ksmbd_vfs_lock_parent(idmap, parent, path.dentry); if (err) { dput(parent); path_put(&path); @@ -614,12 +615,12 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) } if (S_ISDIR(d_inode(path.dentry)->i_mode)) { - err = vfs_rmdir(user_ns, d_inode(parent), path.dentry); + err = vfs_rmdir(idmap, d_inode(parent), path.dentry); if (err && err != -ENOTEMPTY) ksmbd_debug(VFS, "%s: rmdir failed, err %d\n", name, err); } else { - err = vfs_unlink(user_ns, d_inode(parent), path.dentry, NULL); + err = vfs_unlink(idmap, d_inode(parent), path.dentry, NULL); if (err) ksmbd_debug(VFS, "%s: unlink failed, err %d\n", name, err); @@ -672,7 +673,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, goto out3; } - err = vfs_link(oldpath.dentry, mnt_user_ns(newpath.mnt), + err = vfs_link(oldpath.dentry, mnt_idmap(newpath.mnt), d_inode(newpath.dentry), dentry, NULL); if (err) @@ -711,10 +712,10 @@ static int ksmbd_validate_entry_in_use(struct dentry *src_dent) } static int __ksmbd_vfs_rename(struct ksmbd_work *work, - struct user_namespace *src_user_ns, + struct mnt_idmap *src_idmap, struct dentry *src_dent_parent, struct dentry *src_dent, - struct user_namespace *dst_user_ns, + struct mnt_idmap *dst_idmap, struct dentry *dst_dent_parent, struct dentry *trap_dent, char *dst_name) @@ -740,8 +741,8 @@ static int __ksmbd_vfs_rename(struct ksmbd_work *work, if (ksmbd_override_fsids(work)) return -ENOMEM; - dst_dent = lookup_one(dst_user_ns, dst_name, dst_dent_parent, - strlen(dst_name)); + dst_dent = lookup_one(dst_idmap, 
dst_name, + dst_dent_parent, strlen(dst_name)); err = PTR_ERR(dst_dent); if (IS_ERR(dst_dent)) { pr_err("lookup failed %s [%d]\n", dst_name, err); @@ -751,10 +752,10 @@ static int __ksmbd_vfs_rename(struct ksmbd_work *work, err = -ENOTEMPTY; if (dst_dent != trap_dent && !d_really_is_positive(dst_dent)) { struct renamedata rd = { - .old_mnt_userns = src_user_ns, + .old_mnt_idmap = src_idmap, .old_dir = d_inode(src_dent_parent), .old_dentry = src_dent, - .new_mnt_userns = dst_user_ns, + .new_mnt_idmap = dst_idmap, .new_dir = d_inode(dst_dent_parent), .new_dentry = dst_dent, }; @@ -772,7 +773,7 @@ out: int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, char *newname) { - struct user_namespace *user_ns; + struct mnt_idmap *idmap; struct path dst_path; struct dentry *src_dent_parent, *dst_dent_parent; struct dentry *src_dent, *trap_dent, *src_child; @@ -800,8 +801,8 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, trap_dent = lock_rename(src_dent_parent, dst_dent_parent); dget(src_dent); dget(dst_dent_parent); - user_ns = file_mnt_user_ns(fp->filp); - src_child = lookup_one(user_ns, src_dent->d_name.name, src_dent_parent, + idmap = file_mnt_idmap(fp->filp); + src_child = lookup_one(idmap, src_dent->d_name.name, src_dent_parent, src_dent->d_name.len); if (IS_ERR(src_child)) { err = PTR_ERR(src_child); @@ -816,10 +817,10 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, dput(src_child); err = __ksmbd_vfs_rename(work, - user_ns, + idmap, src_dent_parent, src_dent, - mnt_user_ns(dst_path.mnt), + mnt_idmap(dst_path.mnt), dst_dent_parent, trap_dent, dst_name); @@ -907,22 +908,22 @@ ssize_t ksmbd_vfs_listxattr(struct dentry *dentry, char **list) return size; } -static ssize_t ksmbd_vfs_xattr_len(struct user_namespace *user_ns, +static ssize_t ksmbd_vfs_xattr_len(struct mnt_idmap *idmap, struct dentry *dentry, char *xattr_name) { - return vfs_getxattr(user_ns, dentry, xattr_name, NULL, 0); + return 
vfs_getxattr(idmap, dentry, xattr_name, NULL, 0); } /** * ksmbd_vfs_getxattr() - vfs helper for smb get extended attributes value - * @user_ns: user namespace + * @idmap: idmap * @dentry: dentry of file for getting xattrs * @xattr_name: name of xattr name to query * @xattr_buf: destination buffer xattr value * * Return: read xattr value length on success, otherwise error */ -ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, +ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry, char *xattr_name, char **xattr_buf) { @@ -930,7 +931,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, char *buf; *xattr_buf = NULL; - xattr_len = ksmbd_vfs_xattr_len(user_ns, dentry, xattr_name); + xattr_len = ksmbd_vfs_xattr_len(idmap, dentry, xattr_name); if (xattr_len < 0) return xattr_len; @@ -938,7 +939,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, if (!buf) return -ENOMEM; - xattr_len = vfs_getxattr(user_ns, dentry, xattr_name, + xattr_len = vfs_getxattr(idmap, dentry, xattr_name, (void *)buf, xattr_len); if (xattr_len > 0) *xattr_buf = buf; @@ -949,7 +950,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, /** * ksmbd_vfs_setxattr() - vfs helper for smb set extended attributes value - * @user_ns: user namespace + * @idmap: idmap of the relevant mount * @dentry: dentry to set XATTR at * @name: xattr name for setxattr * @value: xattr value to set @@ -958,13 +959,13 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, * * Return: 0 on success, otherwise error */ -int ksmbd_vfs_setxattr(struct user_namespace *user_ns, +int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *attr_name, void *attr_value, size_t attr_size, int flags) { int err; - err = vfs_setxattr(user_ns, + err = vfs_setxattr(idmap, dentry, attr_name, attr_value, @@ -1074,26 +1075,26 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, return ret; } -int ksmbd_vfs_remove_xattr(struct 
user_namespace *user_ns, +int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name) { - return vfs_removexattr(user_ns, dentry, attr_name); + return vfs_removexattr(idmap, dentry, attr_name); } -int ksmbd_vfs_unlink(struct user_namespace *user_ns, +int ksmbd_vfs_unlink(struct mnt_idmap *idmap, struct dentry *dir, struct dentry *dentry) { int err = 0; - err = ksmbd_vfs_lock_parent(user_ns, dir, dentry); + err = ksmbd_vfs_lock_parent(idmap, dir, dentry); if (err) return err; dget(dentry); if (S_ISDIR(d_inode(dentry)->i_mode)) - err = vfs_rmdir(user_ns, d_inode(dir), dentry); + err = vfs_rmdir(idmap, d_inode(dir), dentry); else - err = vfs_unlink(user_ns, d_inode(dir), dentry, NULL); + err = vfs_unlink(idmap, d_inode(dir), dentry, NULL); dput(dentry); inode_unlock(d_inode(dir)); @@ -1298,7 +1299,7 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, return dent; } -int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, +int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, struct dentry *dentry) { char *name, *xattr_list = NULL; @@ -1321,7 +1322,7 @@ int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) || !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) { - err = vfs_remove_acl(user_ns, dentry, name); + err = vfs_remove_acl(idmap, dentry, name); if (err) ksmbd_debug(SMB, "remove acl xattr failed : %s\n", name); @@ -1332,7 +1333,7 @@ out: return err; } -int ksmbd_vfs_remove_sd_xattrs(struct user_namespace *user_ns, +int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, struct dentry *dentry) { char *name, *xattr_list = NULL; @@ -1352,7 +1353,7 @@ int ksmbd_vfs_remove_sd_xattrs(struct user_namespace *user_ns, ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); if (!strncmp(name, XATTR_NAME_SD, XATTR_NAME_SD_LEN)) { - err = ksmbd_vfs_remove_xattr(user_ns, dentry, name); + err = ksmbd_vfs_remove_xattr(idmap, dentry, 
name); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", name); } @@ -1362,7 +1363,7 @@ out: return err; } -static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespace *user_ns, +static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct mnt_idmap *idmap, struct inode *inode, int acl_type) { @@ -1392,14 +1393,14 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespac switch (pa_entry->e_tag) { case ACL_USER: xa_entry->type = SMB_ACL_USER; - xa_entry->uid = posix_acl_uid_translate(user_ns, pa_entry); + xa_entry->uid = posix_acl_uid_translate(idmap, pa_entry); break; case ACL_USER_OBJ: xa_entry->type = SMB_ACL_USER_OBJ; break; case ACL_GROUP: xa_entry->type = SMB_ACL_GROUP; - xa_entry->gid = posix_acl_gid_translate(user_ns, pa_entry); + xa_entry->gid = posix_acl_gid_translate(idmap, pa_entry); break; case ACL_GROUP_OBJ: xa_entry->type = SMB_ACL_GROUP_OBJ; @@ -1428,7 +1429,7 @@ out: } int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd *pntsd, int len) { @@ -1461,13 +1462,13 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, return rc; } - smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode, + smb_acl = ksmbd_vfs_make_xattr_posix_acl(idmap, inode, ACL_TYPE_ACCESS); if (S_ISDIR(inode->i_mode)) - def_smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode, + def_smb_acl = ksmbd_vfs_make_xattr_posix_acl(idmap, inode, ACL_TYPE_DEFAULT); - rc = ndr_encode_posix_acl(&acl_ndr, user_ns, inode, + rc = ndr_encode_posix_acl(&acl_ndr, idmap, inode, smb_acl, def_smb_acl); if (rc) { pr_err("failed to encode ndr to posix acl\n"); @@ -1487,7 +1488,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, goto out; } - rc = ksmbd_vfs_setxattr(user_ns, dentry, + rc = ksmbd_vfs_setxattr(idmap, dentry, XATTR_NAME_SD, sd_ndr.data, sd_ndr.offset, 0); if (rc < 0) @@ -1502,7 +1503,7 @@ out: } int 
ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd **pntsd) { @@ -1514,7 +1515,7 @@ int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, struct xattr_smb_acl *smb_acl = NULL, *def_smb_acl = NULL; __u8 cmp_hash[XATTR_SD_HASH_SIZE] = {0}; - rc = ksmbd_vfs_getxattr(user_ns, dentry, XATTR_NAME_SD, &n.data); + rc = ksmbd_vfs_getxattr(idmap, dentry, XATTR_NAME_SD, &n.data); if (rc <= 0) return rc; @@ -1523,13 +1524,13 @@ int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, if (rc) goto free_n_data; - smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode, + smb_acl = ksmbd_vfs_make_xattr_posix_acl(idmap, inode, ACL_TYPE_ACCESS); if (S_ISDIR(inode->i_mode)) - def_smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode, + def_smb_acl = ksmbd_vfs_make_xattr_posix_acl(idmap, inode, ACL_TYPE_DEFAULT); - rc = ndr_encode_posix_acl(&acl_ndr, user_ns, inode, smb_acl, + rc = ndr_encode_posix_acl(&acl_ndr, idmap, inode, smb_acl, def_smb_acl); if (rc) { pr_err("failed to encode ndr to posix acl\n"); @@ -1576,7 +1577,7 @@ free_n_data: return rc; } -int ksmbd_vfs_set_dos_attrib_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da) { @@ -1587,7 +1588,7 @@ int ksmbd_vfs_set_dos_attrib_xattr(struct user_namespace *user_ns, if (err) return err; - err = ksmbd_vfs_setxattr(user_ns, dentry, XATTR_NAME_DOS_ATTRIBUTE, + err = ksmbd_vfs_setxattr(idmap, dentry, XATTR_NAME_DOS_ATTRIBUTE, (void *)n.data, n.offset, 0); if (err) ksmbd_debug(SMB, "failed to store dos attribute in xattr\n"); @@ -1596,14 +1597,14 @@ int ksmbd_vfs_set_dos_attrib_xattr(struct user_namespace *user_ns, return err; } -int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da) { struct ndr n; int err; - err = 
ksmbd_vfs_getxattr(user_ns, dentry, XATTR_NAME_DOS_ATTRIBUTE, + err = ksmbd_vfs_getxattr(idmap, dentry, XATTR_NAME_DOS_ATTRIBUTE, (char **)&n.data); if (err > 0) { n.length = err; @@ -1650,14 +1651,14 @@ void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat) } int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct ksmbd_kstat *ksmbd_kstat) { u64 time; int rc; - generic_fillattr(user_ns, d_inode(dentry), ksmbd_kstat->kstat); + generic_fillattr(idmap, d_inode(dentry), ksmbd_kstat->kstat); time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->ctime); ksmbd_kstat->create_time = time; @@ -1675,7 +1676,7 @@ int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, KSMBD_SHARE_FLAG_STORE_DOS_ATTRS)) { struct xattr_dos_attrib da; - rc = ksmbd_vfs_get_dos_attrib_xattr(user_ns, dentry, &da); + rc = ksmbd_vfs_get_dos_attrib_xattr(idmap, dentry, &da); if (rc > 0) { ksmbd_kstat->file_attributes = cpu_to_le32(da.attr); ksmbd_kstat->create_time = da.create_time; @@ -1687,7 +1688,7 @@ int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, return 0; } -ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, +ssize_t ksmbd_vfs_casexattr_len(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name, int attr_name_len) { @@ -1704,7 +1705,7 @@ ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, if (strncasecmp(attr_name, name, attr_name_len)) continue; - value_len = ksmbd_vfs_xattr_len(user_ns, dentry, name); + value_len = ksmbd_vfs_xattr_len(idmap, dentry, name); break; } @@ -1823,7 +1824,7 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock) locks_delete_block(flock); } -int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, +int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry) { struct posix_acl_state acl_state; @@ -1857,13 +1858,13 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, return 
-ENOMEM; } posix_state_to_acl(&acl_state, acls->a_entries); - rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls); + rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); else if (S_ISDIR(inode->i_mode)) { posix_state_to_acl(&acl_state, acls->a_entries); - rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, acls); + rc = set_posix_acl(idmap, dentry, ACL_TYPE_DEFAULT, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); @@ -1873,7 +1874,7 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, return rc; } -int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, +int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *parent_inode) { struct posix_acl *acls; @@ -1896,12 +1897,12 @@ int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, } } - rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls); + rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); if (S_ISDIR(inode->i_mode)) { - rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, + rc = set_posix_acl(idmap, dentry, ACL_TYPE_DEFAULT, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 0d73d735cc39..9d676ab0cd25 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -71,10 +71,10 @@ struct ksmbd_kstat { __le32 file_attributes; }; -int ksmbd_vfs_lock_parent(struct user_namespace *user_ns, struct dentry *parent, +int ksmbd_vfs_lock_parent(struct mnt_idmap *idmap, struct dentry *parent, struct dentry *child); -int ksmbd_vfs_may_delete(struct user_namespace *user_ns, struct dentry *dentry); -int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns, +int ksmbd_vfs_may_delete(struct mnt_idmap *idmap, struct dentry *dentry); +int 
ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, struct dentry *dentry, __le32 *daccess); int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode); int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode); @@ -102,19 +102,19 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, unsigned int *chunk_size_written, loff_t *total_size_written); ssize_t ksmbd_vfs_listxattr(struct dentry *dentry, char **list); -ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, +ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry, char *xattr_name, char **xattr_buf); -ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, +ssize_t ksmbd_vfs_casexattr_len(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name, int attr_name_len); -int ksmbd_vfs_setxattr(struct user_namespace *user_ns, +int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *attr_name, void *attr_value, size_t attr_size, int flags); int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); -int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name); int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, unsigned int flags, struct path *path, @@ -131,37 +131,37 @@ struct file_allocated_range_buffer; int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, struct file_allocated_range_buffer *ranges, unsigned int in_count, unsigned int *out_count); -int ksmbd_vfs_unlink(struct user_namespace *user_ns, - struct dentry *dir, struct dentry *dentry); +int ksmbd_vfs_unlink(struct mnt_idmap *idmap, struct dentry *dir, + struct dentry *dentry); void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat); int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, - struct user_namespace *user_ns, + struct mnt_idmap 
*idmap, struct dentry *dentry, struct ksmbd_kstat *ksmbd_kstat); void ksmbd_vfs_posix_lock_wait(struct file_lock *flock); int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout); void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock); -int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, +int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, struct dentry *dentry); -int ksmbd_vfs_remove_sd_xattrs(struct user_namespace *user_ns, +int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, struct dentry *dentry); int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd *pntsd, int len); int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd **pntsd); -int ksmbd_vfs_set_dos_attrib_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da); -int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da); -int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, +int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry); -int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, +int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *parent_inode); #endif /* __KSMBD_VFS_H__ */ diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c index da9163b00350..1d8126443a7f 100644 --- a/fs/ksmbd/vfs_cache.c +++ b/fs/ksmbd/vfs_cache.c @@ -5,6 +5,7 @@ */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -251,7 +252,7 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp) filp = fp->filp; if (ksmbd_stream_fd(fp) && 
(ci->m_flags & S_DEL_ON_CLS_STREAM)) { ci->m_flags &= ~S_DEL_ON_CLS_STREAM; - err = ksmbd_vfs_remove_xattr(file_mnt_user_ns(filp), + err = ksmbd_vfs_remove_xattr(file_mnt_idmap(filp), filp->f_path.dentry, fp->stream.name); if (err) @@ -266,7 +267,7 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp) dir = dentry->d_parent; ci->m_flags &= ~(S_DEL_ON_CLS | S_DEL_PENDING); write_unlock(&ci->m_lock); - ksmbd_vfs_unlink(file_mnt_user_ns(filp), dir, dentry); + ksmbd_vfs_unlink(file_mnt_idmap(filp), dir, dentry); write_lock(&ci->m_lock); } write_unlock(&ci->m_lock); diff --git a/fs/libfs.c b/fs/libfs.c index aada4e7c8713..4eda519c3002 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -28,12 +28,12 @@ #include "internal.h" -int simple_getattr(struct user_namespace *mnt_userns, const struct path *path, +int simple_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); return 0; } @@ -473,7 +473,7 @@ int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, } EXPORT_SYMBOL_GPL(simple_rename_exchange); -int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -509,7 +509,7 @@ EXPORT_SYMBOL(simple_rename); /** * simple_setattr - setattr for simple filesystem - * @mnt_userns: user namespace of the target mount + * @idmap: idmap of the target mount * @dentry: dentry * @iattr: iattr structure * @@ -522,19 +522,19 @@ EXPORT_SYMBOL(simple_rename); * on simple regular filesystems. Anything that needs to change on-disk * or wire state on size changes needs its own setattr method. 
*/ -int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int simple_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(mnt_userns, dentry, iattr); + error = setattr_prepare(idmap, dentry, iattr); if (error) return error; if (iattr->ia_valid & ATTR_SIZE) truncate_setsize(inode, iattr->ia_size); - setattr_copy(mnt_userns, inode, iattr); + setattr_copy(idmap, inode, iattr); mark_inode_dirty(inode); return 0; } @@ -1315,16 +1315,16 @@ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENOENT); } -static int empty_dir_getattr(struct user_namespace *mnt_userns, +static int empty_dir_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); return 0; } -static int empty_dir_setattr(struct user_namespace *mnt_userns, +static int empty_dir_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { return -EPERM; @@ -1582,3 +1582,39 @@ bool inode_maybe_inc_iversion(struct inode *inode, bool force) return true; } EXPORT_SYMBOL(inode_maybe_inc_iversion); + +/** + * inode_query_iversion - read i_version for later use + * @inode: inode from which i_version should be read + * + * Read the inode i_version counter. This should be used by callers that wish + * to store the returned i_version for later comparison. This will guarantee + * that a later query of the i_version will result in a different value if + * anything has changed. + * + * In this implementation, we fetch the current value, set the QUERIED flag and + * then try to swap it into place with a cmpxchg, if it wasn't already set. If + * that fails, we try again with the newly fetched value from the cmpxchg. 
+ */ +u64 inode_query_iversion(struct inode *inode) +{ + u64 cur, new; + + cur = inode_peek_iversion_raw(inode); + do { + /* If flag is already set, then no need to swap */ + if (cur & I_VERSION_QUERIED) { + /* + * This barrier (and the implicit barrier in the + * cmpxchg below) pairs with the barrier in + * inode_maybe_inc_iversion(). + */ + smp_mb(); + break; + } + + new = cur | I_VERSION_QUERIED; + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); + return cur >> I_VERSION_QUERIED_SHIFT; +} +EXPORT_SYMBOL(inode_query_iversion); diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index a5bb3f721a9d..82b19a30e0f0 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -188,7 +188,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) continue; if (!rpc_cmp_addr(nlm_addr(block->b_host), addr)) continue; - if (nfs_compare_fh(NFS_FH(locks_inode(fl_blocked->fl_file)), fh) != 0) + if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)), fh) != 0) continue; /* Alright, we found a lock. 
Set the return status * and wake up the caller diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 99fffc9cb958..16b4de868cd2 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/errno.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/nfs_fs.h> #include <linux/utsname.h> #include <linux/freezer.h> @@ -130,7 +131,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) char *nodename = req->a_host->h_rpcclnt->cl_nodename; nlmclnt_next_cookie(&argp->cookie); - memcpy(&lock->fh, NFS_FH(locks_inode(fl->fl_file)), sizeof(struct nfs_fh)); + memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh)); lock->caller = nodename; lock->oh.data = req->a_owner; lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 5bec78c8e431..17432c445fe6 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -3,6 +3,7 @@ #define __LOCKD_NETNS_H__ #include <linux/fs.h> +#include <linux/filelock.h> #include <net/netns/generic.h> struct lockd_net { diff --git a/fs/locks.c b/fs/locks.c index 8f01bee17715..624c6ac92ede 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -52,6 +52,7 @@ #include <linux/capability.h> #include <linux/file.h> #include <linux/fdtable.h> +#include <linux/filelock.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/security.h> @@ -233,7 +234,7 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list, char *list_type) { struct file_lock *fl; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); list_for_each_entry(fl, list, fl_list) if (fl->fl_file == filp) @@ -887,7 +888,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) { struct file_lock *cfl; struct file_lock_context *ctx; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); void *owner; void (*func)(void); @@ -1330,7 +1331,7 
@@ retry: int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { - return posix_lock_inode(locks_inode(filp), fl, conflock); + return posix_lock_inode(file_inode(filp), fl, conflock); } EXPORT_SYMBOL(posix_lock_file); @@ -1629,7 +1630,7 @@ EXPORT_SYMBOL(lease_get_mtime); int fcntl_getlease(struct file *filp) { struct file_lock *fl; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; int type = F_UNLCK; LIST_HEAD(dispose); @@ -1667,7 +1668,7 @@ int fcntl_getlease(struct file *filp) static int check_conflicting_open(struct file *filp, const long arg, int flags) { - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); int self_wcount = 0, self_rcount = 0; if (flags & FL_LAYOUT) @@ -1703,7 +1704,7 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) { struct file_lock *fl, *my_fl = NULL, *lease; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; @@ -1819,7 +1820,7 @@ static int generic_delete_lease(struct file *filp, void *owner) { int error = -EAGAIN; struct file_lock *fl, *victim = NULL; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; LIST_HEAD(dispose); @@ -1861,7 +1862,7 @@ static int generic_delete_lease(struct file *filp, void *owner) int generic_setlease(struct file *filp, long arg, struct file_lock **flp, void **priv) { - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); int error; if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE)) @@ -2350,7 +2351,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, struct flock *flock) { struct file_lock *file_lock = locks_alloc_lock(); - struct inode *inode = locks_inode(filp); + struct inode 
*inode = file_inode(filp); struct file *f; int error; @@ -2554,7 +2555,7 @@ out: void locks_remove_posix(struct file *filp, fl_owner_t owner) { int error; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock lock; struct file_lock_context *ctx; @@ -2591,7 +2592,7 @@ static void locks_remove_flock(struct file *filp, struct file_lock_context *flctx) { struct file_lock fl; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); if (list_empty(&flctx->flc_flock)) return; @@ -2636,7 +2637,7 @@ void locks_remove_file(struct file *filp) { struct file_lock_context *ctx; - ctx = locks_inode_context(locks_inode(filp)); + ctx = locks_inode_context(file_inode(filp)); if (!ctx) return; @@ -2720,7 +2721,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, */ if (fl->fl_file != NULL) - inode = locks_inode(fl->fl_file); + inode = file_inode(fl->fl_file); seq_printf(f, "%lld: ", id); @@ -2861,7 +2862,7 @@ static void __show_fd_locks(struct seq_file *f, void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) { - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; int id = 0; diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 9115948c624e..724d8191a310 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -252,7 +252,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error) iput(inode); return NULL; } - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = j; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; diff --git a/fs/minix/file.c b/fs/minix/file.c index 6a7bd2d9eec0..0dd05d47724a 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -22,13 +22,13 @@ const struct file_operations minix_file_operations = { .splice_read = generic_file_splice_read, 
}; -static int minix_setattr(struct user_namespace *mnt_userns, +static int minix_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -42,7 +42,7 @@ static int minix_setattr(struct user_namespace *mnt_userns, minix_truncate(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index da8bdd1712a7..e9fbb5303a22 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -654,13 +654,13 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } -int minix_getattr(struct user_namespace *mnt_userns, const struct path *path, +int minix_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct super_block *sb = path->dentry->d_sb; struct inode *inode = d_inode(path->dentry); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (INODE_VERSION(inode) == MINIX_V1) stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); else diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 202173368025..e0b76defa85c 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -51,7 +51,7 @@ extern unsigned long minix_count_free_inodes(struct super_block *sb); extern int minix_new_block(struct inode * inode); extern void minix_free_block(struct inode *inode, unsigned long block); extern unsigned long minix_count_free_blocks(struct super_block *sb); -extern int minix_getattr(struct user_namespace *, const struct path *, +extern int minix_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned 
len); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 8afdc408ca4f..39ebe10d6a8b 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -33,7 +33,7 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, un return d_splice_alias(inode, dentry); } -static int minix_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int minix_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { int error; @@ -52,7 +52,7 @@ static int minix_mknod(struct user_namespace *mnt_userns, struct inode *dir, return error; } -static int minix_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int minix_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { int error; @@ -65,13 +65,13 @@ static int minix_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return finish_open_simple(file, error); } -static int minix_create(struct user_namespace *mnt_userns, struct inode *dir, +static int minix_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return minix_mknod(mnt_userns, dir, dentry, mode, 0); + return minix_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); } -static int minix_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int minix_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int err = -ENAMETOOLONG; @@ -111,7 +111,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int minix_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode * inode; @@ -184,7 +184,7 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry) return err; } -static int minix_rename(struct user_namespace *mnt_userns, +static int minix_rename(struct 
mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c new file mode 100644 index 000000000000..4905665c47d0 --- /dev/null +++ b/fs/mnt_idmapping.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Christian Brauner <brauner@kernel.org> */ + +#include <linux/cred.h> +#include <linux/fs.h> +#include <linux/mnt_idmapping.h> +#include <linux/slab.h> +#include <linux/user_namespace.h> + +#include "internal.h" + +struct mnt_idmap { + struct user_namespace *owner; + refcount_t count; +}; + +/* + * Carries the initial idmapping of 0:0:4294967295 which is an identity + * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is + * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. + */ +struct mnt_idmap nop_mnt_idmap = { + .owner = &init_user_ns, + .count = REFCOUNT_INIT(1), +}; +EXPORT_SYMBOL_GPL(nop_mnt_idmap); + +/** + * check_fsmapping - check whether an mount idmapping is allowed + * @idmap: idmap of the relevent mount + * @sb: super block of the filesystem + * + * Return: true if @idmap is allowed, false if not. + */ +bool check_fsmapping(const struct mnt_idmap *idmap, + const struct super_block *sb) +{ + return idmap->owner != sb->s_user_ns; +} + +/** + * initial_idmapping - check whether this is the initial mapping + * @ns: idmapping to check + * + * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1, + * [...], 1000 to 1000 [...]. + * + * Return: true if this is the initial mapping, false if not. + */ +static inline bool initial_idmapping(const struct user_namespace *ns) +{ + return ns == &init_user_ns; +} + +/** + * no_idmapping - check whether we can skip remapping a kuid/gid + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * + * This function can be used to check whether a remapping between two + * idmappings is required. 
+ * An idmapped mount is a mount that has an idmapping attached to it that + * is different from the filsystem's idmapping and the initial idmapping. + * If the initial mapping is used or the idmapping of the mount and the + * filesystem are identical no remapping is required. + * + * Return: true if remapping can be skipped, false if not. + */ +static inline bool no_idmapping(const struct user_namespace *mnt_userns, + const struct user_namespace *fs_userns) +{ + return initial_idmapping(mnt_userns) || mnt_userns == fs_userns; +} + +/** + * make_vfsuid - map a filesystem kuid according to an idmapping + * @idmap: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @kuid : kuid to be mapped + * + * Take a @kuid and remap it from @fs_userns into @idmap. Use this + * function when preparing a @kuid to be reported to userspace. + * + * If no_idmapping() determines that this is not an idmapped mount we can + * simply return @kuid unchanged. + * If initial_idmapping() tells us that the filesystem is not mounted with an + * idmapping we know the value of @kuid won't change when calling + * from_kuid() so we can simply retrieve the value via __kuid_val() + * directly. + * + * Return: @kuid mapped according to @idmap. + * If @kuid has no mapping in either @idmap or @fs_userns INVALID_UID is + * returned. 
+ */ + +vfsuid_t make_vfsuid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, + kuid_t kuid) +{ + uid_t uid; + struct user_namespace *mnt_userns = idmap->owner; + + if (no_idmapping(mnt_userns, fs_userns)) + return VFSUIDT_INIT(kuid); + if (initial_idmapping(fs_userns)) + uid = __kuid_val(kuid); + else + uid = from_kuid(fs_userns, kuid); + if (uid == (uid_t)-1) + return INVALID_VFSUID; + return VFSUIDT_INIT(make_kuid(mnt_userns, uid)); +} +EXPORT_SYMBOL_GPL(make_vfsuid); + +/** + * make_vfsgid - map a filesystem kgid according to an idmapping + * @idmap: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @kgid : kgid to be mapped + * + * Take a @kgid and remap it from @fs_userns into @idmap. Use this + * function when preparing a @kgid to be reported to userspace. + * + * If no_idmapping() determines that this is not an idmapped mount we can + * simply return @kgid unchanged. + * If initial_idmapping() tells us that the filesystem is not mounted with an + * idmapping we know the value of @kgid won't change when calling + * from_kgid() so we can simply retrieve the value via __kgid_val() + * directly. + * + * Return: @kgid mapped according to @idmap. + * If @kgid has no mapping in either @idmap or @fs_userns INVALID_GID is + * returned. + */ +vfsgid_t make_vfsgid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, kgid_t kgid) +{ + gid_t gid; + struct user_namespace *mnt_userns = idmap->owner; + + if (no_idmapping(mnt_userns, fs_userns)) + return VFSGIDT_INIT(kgid); + if (initial_idmapping(fs_userns)) + gid = __kgid_val(kgid); + else + gid = from_kgid(fs_userns, kgid); + if (gid == (gid_t)-1) + return INVALID_VFSGID; + return VFSGIDT_INIT(make_kgid(mnt_userns, gid)); +} +EXPORT_SYMBOL_GPL(make_vfsgid); + +/** + * from_vfsuid - map a vfsuid into the filesystem idmapping + * @idmap: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @vfsuid : vfsuid to be mapped + * + * Map @vfsuid into the filesystem idmapping. 
This function has to be used in + * order to e.g. write @vfsuid to inode->i_uid. + * + * Return: @vfsuid mapped into the filesystem idmapping + */ +kuid_t from_vfsuid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, vfsuid_t vfsuid) +{ + uid_t uid; + struct user_namespace *mnt_userns = idmap->owner; + + if (no_idmapping(mnt_userns, fs_userns)) + return AS_KUIDT(vfsuid); + uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid)); + if (uid == (uid_t)-1) + return INVALID_UID; + if (initial_idmapping(fs_userns)) + return KUIDT_INIT(uid); + return make_kuid(fs_userns, uid); +} +EXPORT_SYMBOL_GPL(from_vfsuid); + +/** + * from_vfsgid - map a vfsgid into the filesystem idmapping + * @idmap: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @vfsgid : vfsgid to be mapped + * + * Map @vfsgid into the filesystem idmapping. This function has to be used in + * order to e.g. write @vfsgid to inode->i_gid. + * + * Return: @vfsgid mapped into the filesystem idmapping + */ +kgid_t from_vfsgid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, vfsgid_t vfsgid) +{ + gid_t gid; + struct user_namespace *mnt_userns = idmap->owner; + + if (no_idmapping(mnt_userns, fs_userns)) + return AS_KGIDT(vfsgid); + gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid)); + if (gid == (gid_t)-1) + return INVALID_GID; + if (initial_idmapping(fs_userns)) + return KGIDT_INIT(gid); + return make_kgid(fs_userns, gid); +} +EXPORT_SYMBOL_GPL(from_vfsgid); + +#ifdef CONFIG_MULTIUSER +/** + * vfsgid_in_group_p() - check whether a vfsuid matches the caller's groups + * @vfsgid: the mnt gid to match + * + * This function can be used to determine whether @vfsuid matches any of the + * caller's groups. + * + * Return: 1 if vfsuid matches caller's groups, 0 if not. 
+ */ +int vfsgid_in_group_p(vfsgid_t vfsgid) +{ + return in_group_p(AS_KGIDT(vfsgid)); +} +#else +int vfsgid_in_group_p(vfsgid_t vfsgid) +{ + return 1; +} +#endif +EXPORT_SYMBOL_GPL(vfsgid_in_group_p); + +struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) +{ + struct mnt_idmap *idmap; + + idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); + if (!idmap) + return ERR_PTR(-ENOMEM); + + idmap->owner = get_user_ns(mnt_userns); + refcount_set(&idmap->count, 1); + return idmap; +} + +/** + * mnt_idmap_get - get a reference to an idmapping + * @idmap: the idmap to bump the reference on + * + * If @idmap is not the @nop_mnt_idmap bump the reference count. + * + * Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed. + */ +struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) +{ + if (idmap != &nop_mnt_idmap) + refcount_inc(&idmap->count); + + return idmap; +} + +/** + * mnt_idmap_put - put a reference to an idmapping + * @idmap: the idmap to put the reference on + * + * If this is a non-initial idmapping, put the reference count when a mount is + * released and free it if we're the last user. 
+ */ +void mnt_idmap_put(struct mnt_idmap *idmap) +{ + if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { + put_user_ns(idmap->owner); + kfree(idmap); + } +} diff --git a/fs/mpage.c b/fs/mpage.c index 0f8ae954a579..ce53179428db 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -532,6 +532,8 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, map_bh.b_size = 1 << blkbits; if (mpd->get_block(inode, block_in_file, &map_bh, 1)) goto confused; + if (!buffer_mapped(&map_bh)) + goto confused; if (buffer_new(&map_bh)) clean_bdev_bh_alias(&map_bh); if (buffer_boundary(&map_bh)) { diff --git a/fs/namei.c b/fs/namei.c index 309ae6fc8c99..5855dc6edbd5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -20,6 +20,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> @@ -273,7 +274,7 @@ void putname(struct filename *name) /** * check_acl - perform ACL permission checking - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * @@ -281,13 +282,13 @@ void putname(struct filename *name) * retrieve POSIX acls it needs to know whether it is called from a blocking or * non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. 
* On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply passs @nop_mnt_idmap. */ -static int check_acl(struct user_namespace *mnt_userns, +static int check_acl(struct mnt_idmap *idmap, struct inode *inode, int mask) { #ifdef CONFIG_FS_POSIX_ACL @@ -300,14 +301,14 @@ static int check_acl(struct user_namespace *mnt_userns, /* no ->get_inode_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; - return posix_acl_permission(mnt_userns, inode, acl, mask); + return posix_acl_permission(idmap, inode, acl, mask); } acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { - int error = posix_acl_permission(mnt_userns, inode, acl, mask); + int error = posix_acl_permission(idmap, inode, acl, mask); posix_acl_release(acl); return error; } @@ -318,7 +319,7 @@ static int check_acl(struct user_namespace *mnt_userns, /** * acl_permission_check - perform basic UNIX permission checking - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * @@ -326,20 +327,20 @@ static int check_acl(struct user_namespace *mnt_userns, * function may retrieve POSIX acls it needs to know whether it is called from a * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. 
* On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply passs @nop_mnt_idmap. */ -static int acl_permission_check(struct user_namespace *mnt_userns, +static int acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask) { unsigned int mode = inode->i_mode; vfsuid_t vfsuid; /* Are we the owner? If so, ACL's don't matter */ - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { mask &= 7; mode >>= 6; @@ -348,7 +349,7 @@ static int acl_permission_check(struct user_namespace *mnt_userns, /* Do we have ACL's? */ if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { - int error = check_acl(mnt_userns, inode, mask); + int error = check_acl(idmap, inode, mask); if (error != -EAGAIN) return error; } @@ -362,7 +363,7 @@ static int acl_permission_check(struct user_namespace *mnt_userns, * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) mode >>= 3; } @@ -373,7 +374,7 @@ static int acl_permission_check(struct user_namespace *mnt_userns, /** * generic_permission - check for access rights on a Posix-like filesystem - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, * %MAY_NOT_BLOCK ...) @@ -387,13 +388,13 @@ static int acl_permission_check(struct user_namespace *mnt_userns, * request cannot be satisfied (eg. requires blocking or too much complexity). * It would then be called again in ref-walk mode. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. 
This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply passs @nop_mnt_idmap. */ -int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, +int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; @@ -401,17 +402,17 @@ int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, /* * Do the basic permission checks. */ - ret = acl_permission_check(mnt_userns, inode, mask); + ret = acl_permission_check(idmap, inode, mask); if (ret != -EACCES) return ret; if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ if (!(mask & MAY_WRITE)) - if (capable_wrt_inode_uidgid(mnt_userns, inode, + if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; - if (capable_wrt_inode_uidgid(mnt_userns, inode, + if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; @@ -422,7 +423,7 @@ int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) - if (capable_wrt_inode_uidgid(mnt_userns, inode, + if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; /* @@ -431,7 +432,7 @@ int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, * at least one exec bit set. 
*/ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) - if (capable_wrt_inode_uidgid(mnt_userns, inode, + if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; @@ -441,7 +442,7 @@ EXPORT_SYMBOL(generic_permission); /** * do_inode_permission - UNIX permission checking - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * @@ -450,19 +451,19 @@ EXPORT_SYMBOL(generic_permission); * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ -static inline int do_inode_permission(struct user_namespace *mnt_userns, +static inline int do_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) - return inode->i_op->permission(mnt_userns, inode, mask); + return inode->i_op->permission(idmap, inode, mask); /* This gets set once for the inode lifetime */ spin_lock(&inode->i_lock); inode->i_opflags |= IOP_FASTPERM; spin_unlock(&inode->i_lock); } - return generic_permission(mnt_userns, inode, mask); + return generic_permission(idmap, inode, mask); } /** @@ -487,7 +488,7 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) /** * inode_permission - Check for access rights to a given inode - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * @@ -497,7 +498,7 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. 
*/ -int inode_permission(struct user_namespace *mnt_userns, +int inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int retval; @@ -518,11 +519,11 @@ int inode_permission(struct user_namespace *mnt_userns, * written back improperly if their true value is unknown * to the vfs. */ - if (HAS_UNMAPPED_ID(mnt_userns, inode)) + if (HAS_UNMAPPED_ID(idmap, inode)) return -EACCES; } - retval = do_inode_permission(mnt_userns, inode, mask); + retval = do_inode_permission(idmap, inode, mask); if (retval) return retval; @@ -1094,14 +1095,14 @@ fs_initcall(init_fs_namei_sysctls); */ static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; - mnt_userns = mnt_user_ns(nd->path.mnt); - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + idmap = mnt_idmap(nd->path.mnt); + vfsuid = i_uid_into_vfsuid(idmap, inode); /* Allowed if owner and follower match. */ if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; @@ -1124,7 +1125,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod /** * safe_hardlink_source - Check for safe hardlink conditions - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: @@ -1135,7 +1136,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod * * Otherwise returns true. */ -static bool safe_hardlink_source(struct user_namespace *mnt_userns, +static bool safe_hardlink_source(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; @@ -1153,7 +1154,7 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns, return false; /* Hardlinking to unreadable or unwritable sources is dangerous. 
*/ - if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE)) + if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE)) return false; return true; @@ -1161,8 +1162,8 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns, /** * may_linkat - Check permissions for creating a hardlink - * @mnt_userns: user namespace of the mount the inode was found from - * @link: the source to hardlink from + * @idmap: idmap of the mount the inode was found from + * @link: the source to hardlink from * * Block hardlink when all of: * - sysctl_protected_hardlinks enabled @@ -1170,21 +1171,21 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns, * - hardlink source is unsafe (see safe_hardlink_source() above) * - not CAP_FOWNER in a namespace with the inode owner uid mapped * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if successful, -ve on error. */ -int may_linkat(struct user_namespace *mnt_userns, const struct path *link) +int may_linkat(struct mnt_idmap *idmap, const struct path *link) { struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. 
*/ - if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || - !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) + if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) @@ -1193,8 +1194,8 @@ int may_linkat(struct user_namespace *mnt_userns, const struct path *link) /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ - if (safe_hardlink_source(mnt_userns, inode) || - inode_owner_or_capable(mnt_userns, inode)) + if (safe_hardlink_source(idmap, inode) || + inode_owner_or_capable(idmap, inode)) return 0; audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); @@ -1205,7 +1206,7 @@ int may_linkat(struct user_namespace *mnt_userns, const struct path *link) * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory * should be allowed, or not, on files that already * exist. - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @nd: nameidata pathwalk data * @inode: the inode of the file to open * @@ -1220,15 +1221,15 @@ int may_linkat(struct user_namespace *mnt_userns, const struct path *link) * the directory doesn't have to be world writable: being group writable will * be enough. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. 
* * Returns 0 if the open is allowed, -ve on error. */ -static int may_create_in_sticky(struct user_namespace *mnt_userns, +static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, struct inode *const inode) { umode_t dir_mode = nd->dir_mode; @@ -1237,8 +1238,8 @@ static int may_create_in_sticky(struct user_namespace *mnt_userns, if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || likely(!(dir_mode & S_ISVTX)) || - vfsuid_eq(i_uid_into_vfsuid(mnt_userns, inode), dir_vfsuid) || - vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid())) + vfsuid_eq(i_uid_into_vfsuid(idmap, inode), dir_vfsuid) || + vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) return 0; if (likely(dir_mode & 0002) || @@ -1704,15 +1705,15 @@ static struct dentry *lookup_slow(const struct qstr *name, return res; } -static inline int may_lookup(struct user_namespace *mnt_userns, +static inline int may_lookup(struct mnt_idmap *idmap, struct nameidata *nd) { if (nd->flags & LOOKUP_RCU) { - int err = inode_permission(mnt_userns, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); + int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (err != -ECHILD || !try_to_unlazy(nd)) return err; } - return inode_permission(mnt_userns, nd->inode, MAY_EXEC); + return inode_permission(idmap, nd->inode, MAY_EXEC); } static int reserve_stack(struct nameidata *nd, struct path *link) @@ -2253,13 +2254,13 @@ static int link_path_walk(const char *name, struct nameidata *nd) /* At this point we know we have a real path component. 
*/ for(;;) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; const char *link; u64 hash_len; int type; - mnt_userns = mnt_user_ns(nd->path.mnt); - err = may_lookup(mnt_userns, nd); + idmap = mnt_idmap(nd->path.mnt); + err = may_lookup(idmap, nd); if (err) return err; @@ -2307,7 +2308,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) OK: /* pathname or trailing symlink, done */ if (!depth) { - nd->dir_vfsuid = i_uid_into_vfsuid(mnt_userns, nd->inode); + nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; @@ -2622,7 +2623,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, } EXPORT_SYMBOL(vfs_path_lookup); -static int lookup_one_common(struct user_namespace *mnt_userns, +static int lookup_one_common(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len, struct qstr *this) { @@ -2652,7 +2653,7 @@ static int lookup_one_common(struct user_namespace *mnt_userns, return err; } - return inode_permission(mnt_userns, base->d_inode, MAY_EXEC); + return inode_permission(idmap, base->d_inode, MAY_EXEC); } /** @@ -2676,7 +2677,7 @@ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(&init_user_ns, name, base, len, &this); + err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2703,7 +2704,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(&init_user_ns, name, base, len, &this); + err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2714,7 +2715,7 @@ EXPORT_SYMBOL(lookup_one_len); /** * lookup_one - filesystem helper to lookup single pathname component - * @mnt_userns: user namespace of the mount the lookup is performed from + * @idmap: 
idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to @@ -2724,7 +2725,7 @@ EXPORT_SYMBOL(lookup_one_len); * * The caller must hold base->i_mutex. */ -struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name, +struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct dentry *dentry; @@ -2733,7 +2734,7 @@ struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name, WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(mnt_userns, name, base, len, &this); + err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2744,7 +2745,7 @@ EXPORT_SYMBOL(lookup_one); /** * lookup_one_unlocked - filesystem helper to lookup single pathname component - * @mnt_userns: idmapping of the mount the lookup is performed from + * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to @@ -2755,7 +2756,7 @@ EXPORT_SYMBOL(lookup_one); * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. 
*/ -struct dentry *lookup_one_unlocked(struct user_namespace *mnt_userns, +struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { @@ -2763,7 +2764,7 @@ struct dentry *lookup_one_unlocked(struct user_namespace *mnt_userns, int err; struct dentry *ret; - err = lookup_one_common(mnt_userns, name, base, len, &this); + err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2777,7 +2778,7 @@ EXPORT_SYMBOL(lookup_one_unlocked); /** * lookup_one_positive_unlocked - filesystem helper to lookup single * pathname component - * @mnt_userns: idmapping of the mount the lookup is performed from + * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to @@ -2794,11 +2795,11 @@ EXPORT_SYMBOL(lookup_one_unlocked); * * The helper should be called without i_mutex held. */ -struct dentry *lookup_one_positive_unlocked(struct user_namespace *mnt_userns, +struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { - struct dentry *ret = lookup_one_unlocked(mnt_userns, name, base, len); + struct dentry *ret = lookup_one_unlocked(idmap, name, base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { dput(ret); @@ -2823,7 +2824,7 @@ EXPORT_SYMBOL(lookup_one_positive_unlocked); struct dentry *lookup_one_len_unlocked(const char *name, struct dentry *base, int len) { - return lookup_one_unlocked(&init_user_ns, name, base, len); + return lookup_one_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_one_len_unlocked); @@ -2838,7 +2839,7 @@ EXPORT_SYMBOL(lookup_one_len_unlocked); struct dentry *lookup_positive_unlocked(const char *name, struct dentry *base, int len) { - return lookup_one_positive_unlocked(&init_user_ns, name, base, len); + return 
lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_positive_unlocked); @@ -2880,16 +2881,16 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags, } EXPORT_SYMBOL(user_path_at_empty); -int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir, +int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); - if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid)) return 0; - if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, dir), fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid)) return 0; - return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER); + return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER); } EXPORT_SYMBOL(__check_sticky); @@ -2913,7 +2914,7 @@ EXPORT_SYMBOL(__check_sticky); * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ -static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, +static int may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, bool isdir) { struct inode *inode = d_backing_inode(victim); @@ -2926,21 +2927,21 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. 
*/ - if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || - !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) + if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) || + if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || - HAS_UNMAPPED_ID(mnt_userns, inode)) + HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) @@ -2965,7 +2966,7 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, * 4. We should have write and exec permissions on dir * 5. We can't do it if dir is immutable (done in permission()) */ -static inline int may_create(struct user_namespace *mnt_userns, +static inline int may_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child) { audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); @@ -2973,10 +2974,10 @@ static inline int may_create(struct user_namespace *mnt_userns, return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns)) + if (!fsuidgid_has_mapping(dir->i_sb, idmap)) return -EOVERFLOW; - return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } /* @@ -3044,7 +3045,7 @@ static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode) /** * vfs_prepare_mode - prepare the mode to be used for a new inode - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: parent directory of the new inode * @mode: mode of the new inode * @mask_perms: 
allowed permission by the vfs @@ -3065,11 +3066,11 @@ static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode) * * Returns: mode to be passed to the filesystem */ -static inline umode_t vfs_prepare_mode(struct user_namespace *mnt_userns, +static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode, umode_t mask_perms, umode_t type) { - mode = mode_strip_sgid(mnt_userns, dir, mode); + mode = mode_strip_sgid(idmap, dir, mode); mode = mode_strip_umask(dir, mode); /* @@ -3084,7 +3085,7 @@ static inline umode_t vfs_prepare_mode(struct user_namespace *mnt_userns, /** * vfs_create - create new file - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @mode: mode of the new file @@ -3092,27 +3093,29 @@ static inline umode_t vfs_prepare_mode(struct user_namespace *mnt_userns, * * Create a new file. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap.
*/ -int vfs_create(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) { - int error = may_create(mnt_userns, dir, dentry); + int error; + + error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS? */ - mode = vfs_prepare_mode(mnt_userns, dir, mode, S_IALLUGO, S_IFREG); + mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG); error = security_inode_create(dir, dentry, mode); if (error) return error; - error = dir->i_op->create(mnt_userns, dir, dentry, mode, want_excl); + error = dir->i_op->create(idmap, dir, dentry, mode, want_excl); if (!error) fsnotify_create(dir, dentry); return error; @@ -3124,7 +3127,7 @@ int vfs_mkobj(struct dentry *dentry, umode_t mode, void *arg) { struct inode *dir = dentry->d_parent->d_inode; - int error = may_create(&init_user_ns, dir, dentry); + int error = may_create(&nop_mnt_idmap, dir, dentry); if (error) return error; @@ -3146,7 +3149,7 @@ bool may_open_dev(const struct path *path) !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } -static int may_open(struct user_namespace *mnt_userns, const struct path *path, +static int may_open(struct mnt_idmap *idmap, const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; @@ -3182,7 +3185,7 @@ static int may_open(struct user_namespace *mnt_userns, const struct path *path, break; } - error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode); + error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); if (error) return error; @@ -3197,13 +3200,13 @@ static int may_open(struct user_namespace *mnt_userns, const struct path *path, } /* O_NOATIME can only be set by the owner or superuser */ - if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode)) + if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode)) return -EPERM; return 0; } -static int 
handle_truncate(struct user_namespace *mnt_userns, struct file *filp) +static int handle_truncate(struct mnt_idmap *idmap, struct file *filp) { const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; @@ -3213,7 +3216,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) error = security_file_truncate(filp); if (!error) { - error = do_truncate(mnt_userns, path->dentry, 0, + error = do_truncate(idmap, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } @@ -3228,7 +3231,7 @@ static inline int open_to_namei_flags(int flag) return flag; } -static int may_o_create(struct user_namespace *mnt_userns, +static int may_o_create(struct mnt_idmap *idmap, const struct path *dir, struct dentry *dentry, umode_t mode) { @@ -3236,10 +3239,10 @@ static int may_o_create(struct user_namespace *mnt_userns, if (error) return error; - if (!fsuidgid_has_mapping(dir->dentry->d_sb, mnt_userns)) + if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap)) return -EOVERFLOW; - error = inode_permission(mnt_userns, dir->dentry->d_inode, + error = inode_permission(idmap, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -3319,7 +3322,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, const struct open_flags *op, bool got_write) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; int open_flag = op->open_flag; @@ -3367,13 +3370,13 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, */ if (unlikely(!got_write)) open_flag &= ~O_TRUNC; - mnt_userns = mnt_user_ns(nd->path.mnt); + idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if (open_flag & O_EXCL) open_flag &= ~O_TRUNC; - mode = vfs_prepare_mode(mnt_userns, dir->d_inode, mode, mode, mode); + mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode); if (likely(got_write)) - create_error = 
may_o_create(mnt_userns, &nd->path, + create_error = may_o_create(idmap, &nd->path, dentry, mode); else create_error = -EROFS; @@ -3410,7 +3413,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, goto out_dput; } - error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry, + error = dir_inode->i_op->create(idmap, dir_inode, dentry, mode, open_flag & O_EXCL); if (error) goto out_dput; @@ -3513,7 +3516,7 @@ finish_lookup: static int do_open(struct nameidata *nd, struct file *file, const struct open_flags *op) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; int open_flag = op->open_flag; bool do_truncate; int acc_mode; @@ -3526,13 +3529,13 @@ static int do_open(struct nameidata *nd, } if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); - mnt_userns = mnt_user_ns(nd->path.mnt); + idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) return -EEXIST; if (d_is_dir(nd->path.dentry)) return -EISDIR; - error = may_create_in_sticky(mnt_userns, nd, + error = may_create_in_sticky(idmap, nd, d_backing_inode(nd->path.dentry)); if (unlikely(error)) return error; @@ -3552,13 +3555,13 @@ static int do_open(struct nameidata *nd, return error; do_truncate = true; } - error = may_open(mnt_userns, &nd->path, acc_mode, open_flag); + error = may_open(idmap, &nd->path, acc_mode, open_flag); if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file); if (!error) error = ima_file_check(file, op->acc_mode); if (!error && do_truncate) - error = handle_truncate(mnt_userns, file); + error = handle_truncate(idmap, file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; @@ -3570,20 +3573,20 @@ static int do_open(struct nameidata *nd, /** * vfs_tmpfile - create tmpfile - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: pointer to dentry of the 
base directory * @mode: mode of the new tmpfile * @open_flag: flags * * Create a temporary file. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -static int vfs_tmpfile(struct user_namespace *mnt_userns, +static int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode) { @@ -3594,7 +3597,7 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns, int open_flag = file->f_flags; /* we want directory to be writable */ - error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (!dir->i_op->tmpfile) @@ -3604,13 +3607,13 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns, return -ENOMEM; file->f_path.mnt = parentpath->mnt; file->f_path.dentry = child; - mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode); - error = dir->i_op->tmpfile(mnt_userns, dir, file, mode); + mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); + error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); if (error) return error; /* Don't check for other permissions, the inode was just created */ - error = may_open(mnt_userns, &file->f_path, 0, file->f_flags); + error = may_open(idmap, &file->f_path, 0, file->f_flags); if (error) return error; inode = file_inode(file); @@ -3619,13 +3622,13 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns,
inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } - ima_post_create_tmpfile(mnt_userns, inode); + ima_post_create_tmpfile(idmap, inode); return 0; } /** * vfs_tmpfile_open - open a tmpfile for kernel internal use - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @parentpath: path of the base directory * @mode: mode of the new tmpfile * @open_flag: flags @@ -3635,7 +3638,7 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns, * hence this is only for kernel internal use, and must not be installed into * file tables or such. */ -struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, +struct file *vfs_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred) { @@ -3644,7 +3647,7 @@ struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, file = alloc_empty_file_noaccount(open_flag, cred); if (!IS_ERR(file)) { - error = vfs_tmpfile(mnt_userns, parentpath, file, mode); + error = vfs_tmpfile(idmap, parentpath, file, mode); if (error) { fput(file); file = ERR_PTR(error); @@ -3658,7 +3661,6 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { - struct user_namespace *mnt_userns; struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); @@ -3667,8 +3669,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_tmpfile(mnt_userns, &path, file, op->mode); + error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode); if (error) goto out2; audit_inode(nd->name, file->f_path.dentry, 0); @@ -3873,7 +3874,7 @@ EXPORT_SYMBOL(user_path_create); /** * vfs_mknod - create device node or file - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode 
was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @mode: mode of the new device node or file @@ -3881,17 +3882,17 @@ EXPORT_SYMBOL(user_path_create); * * Create a device node or file. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; - int error = may_create(mnt_userns, dir, dentry); + int error = may_create(idmap, dir, dentry); if (error) return error; @@ -3903,7 +3904,7 @@ int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (!dir->i_op->mknod) return -EPERM; - mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode); + mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = devcgroup_inode_mknod(mode, dev); if (error) return error; @@ -3912,7 +3913,7 @@ int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (error) return error; - error = dir->i_op->mknod(mnt_userns, dir, dentry, mode, dev); + error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); return error; @@ -3939,7 +3940,7 @@ static int may_mknod(umode_t mode) static int do_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev) { - struct user_namespace
*mnt_userns; + struct mnt_idmap *idmap; struct dentry *dentry; struct path path; int error; @@ -3959,20 +3960,20 @@ retry: if (error) goto out2; - mnt_userns = mnt_user_ns(path.mnt); + idmap = mnt_idmap(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: - error = vfs_create(mnt_userns, path.dentry->d_inode, + error = vfs_create(idmap, path.dentry->d_inode, dentry, mode, true); if (!error) - ima_post_path_mknod(mnt_userns, dentry); + ima_post_path_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: - error = vfs_mknod(mnt_userns, path.dentry->d_inode, + error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, new_decode_dev(dev)); break; case S_IFIFO: case S_IFSOCK: - error = vfs_mknod(mnt_userns, path.dentry->d_inode, + error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, 0); break; } @@ -4000,32 +4001,33 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d /** * vfs_mkdir - create directory - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @mode: mode of the new directory * * Create a directory. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap.
*/ -int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - int error = may_create(mnt_userns, dir, dentry); + int error; unsigned max_links = dir->i_sb->s_max_links; + error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->mkdir) return -EPERM; - mode = vfs_prepare_mode(mnt_userns, dir, mode, S_IRWXUGO | S_ISVTX, 0); + mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); error = security_inode_mkdir(dir, dentry, mode); if (error) return error; @@ -4033,7 +4035,7 @@ int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (max_links && dir->i_nlink >= max_links) return -EMLINK; - error = dir->i_op->mkdir(mnt_userns, dir, dentry, mode); + error = dir->i_op->mkdir(idmap, dir, dentry, mode); if (!error) fsnotify_mkdir(dir, dentry); return error; @@ -4056,10 +4058,8 @@ retry: error = security_path_mkdir(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { - struct user_namespace *mnt_userns; - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry, - mode); + error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, mode); } done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { @@ -4083,22 +4083,22 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) /** * vfs_rmdir - remove directory - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * * Remove a directory. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. 
+ * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { - int error = may_delete(mnt_userns, dir, dentry, 1); + int error = may_delete(idmap, dir, dentry, 1); if (error) return error; @@ -4138,7 +4138,6 @@ EXPORT_SYMBOL(vfs_rmdir); int do_rmdir(int dfd, struct filename *name) { - struct user_namespace *mnt_userns; int error; struct dentry *dentry; struct path path; @@ -4178,8 +4177,7 @@ retry: error = security_path_rmdir(&path, dentry); if (error) goto exit4; - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry); + error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry); exit4: dput(dentry); exit3: @@ -4203,7 +4201,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) /** * vfs_unlink - unlink a filesystem object - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: parent directory * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. @@ -4220,17 +4218,17 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions.
+ * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { struct inode *target = dentry->d_inode; - int error = may_delete(mnt_userns, dir, dentry, 0); + int error = may_delete(idmap, dir, dentry, 0); if (error) return error; @@ -4304,7 +4302,6 @@ retry_deleg: dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { - struct user_namespace *mnt_userns; /* Why not before? Because we want correct error value */ if (last.name[last.len]) @@ -4316,9 +4313,8 @@ retry_deleg: error = security_path_unlink(&path, dentry); if (error) goto exit3; - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry, - &delegated_inode); + error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, &delegated_inode); exit3: dput(dentry); } @@ -4370,24 +4366,25 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) /** * vfs_symlink - create symlink - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @oldname: name of the file to link to * * Create a symlink. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions.
+ * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *oldname) { - int error = may_create(mnt_userns, dir, dentry); + int error; + error = may_create(idmap, dir, dentry); if (error) return error; @@ -4398,7 +4395,7 @@ int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (error) return error; - error = dir->i_op->symlink(mnt_userns, dir, dentry, oldname); + error = dir->i_op->symlink(idmap, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); return error; @@ -4423,13 +4420,9 @@ retry: goto out_putnames; error = security_path_symlink(&path, dentry, from->name); - if (!error) { - struct user_namespace *mnt_userns; - - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry, - from->name); - } + if (!error) + error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, from->name); done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -4455,7 +4448,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn /** * vfs_link - create a new link * @old_dentry: object to be linked - * @mnt_userns: the user namespace of the mount + * @idmap: idmap of the mount * @dir: new parent * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break @@ -4472,13 +4465,13 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn * be appropriate for callers that
expect the underlying filesystem not * to be NFS exported. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply passs @nop_mnt_idmap. */ -int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns, +int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { @@ -4489,7 +4482,7 @@ int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns, if (!inode) return -ENOENT; - error = may_create(mnt_userns, dir, new_dentry); + error = may_create(idmap, dir, new_dentry); if (error) return error; @@ -4506,7 +4499,7 @@ int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns, * be writen back improperly if their true value is unknown to * the vfs. 
*/ - if (HAS_UNMAPPED_ID(mnt_userns, inode)) + if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (!dir->i_op->link) return -EPERM; @@ -4553,7 +4546,7 @@ EXPORT_SYMBOL(vfs_link); int do_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct dentry *new_dentry; struct path old_path, new_path; struct inode *delegated_inode = NULL; @@ -4590,14 +4583,14 @@ retry: error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; - mnt_userns = mnt_user_ns(new_path.mnt); - error = may_linkat(mnt_userns, &old_path); + idmap = mnt_idmap(new_path.mnt); + error = may_linkat(idmap, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode, + error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); @@ -4697,20 +4690,20 @@ int vfs_rename(struct renamedata *rd) if (source == target) return 0; - error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir); + error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { - error = may_create(rd->new_mnt_userns, new_dir, new_dentry); + error = may_create(rd->new_mnt_idmap, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) - error = may_delete(rd->new_mnt_userns, new_dir, + error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, is_dir); else - error = may_delete(rd->new_mnt_userns, new_dir, + error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, new_is_dir); } if (error) @@ -4725,13 +4718,13 @@ int vfs_rename(struct renamedata *rd) */ if (new_dir != old_dir) { if (is_dir) { - error = inode_permission(rd->old_mnt_userns, source, + error = inode_permission(rd->old_mnt_idmap, 
source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { - error = inode_permission(rd->new_mnt_userns, target, + error = inode_permission(rd->new_mnt_idmap, target, MAY_WRITE); if (error) return error; @@ -4776,7 +4769,7 @@ int vfs_rename(struct renamedata *rd) if (error) goto out; } - error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry, + error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; @@ -4921,10 +4914,10 @@ retry_deleg: rd.old_dir = old_path.dentry->d_inode; rd.old_dentry = old_dentry; - rd.old_mnt_userns = mnt_user_ns(old_path.mnt); + rd.old_mnt_idmap = mnt_idmap(old_path.mnt); rd.new_dir = new_path.dentry->d_inode; rd.new_dentry = new_dentry; - rd.new_mnt_userns = mnt_user_ns(new_path.mnt); + rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; rd.flags = flags; error = vfs_rename(&rd); diff --git a/fs/namespace.c b/fs/namespace.c index ab467ee58341..5927d90e24a0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -75,22 +75,6 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ -struct mnt_idmap { - struct user_namespace *owner; - refcount_t count; -}; - -/* - * Carries the initial idmapping of 0:0:4294967295 which is an identity - * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is - * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. 
- */ -struct mnt_idmap nop_mnt_idmap = { - .owner = &init_user_ns, - .count = REFCOUNT_INIT(1), -}; -EXPORT_SYMBOL_GPL(nop_mnt_idmap); - struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; @@ -210,104 +194,6 @@ int mnt_get_count(struct mount *mnt) #endif } -/** - * mnt_idmap_owner - retrieve owner of the mount's idmapping - * @idmap: mount idmapping - * - * This helper will go away once the conversion to use struct mnt_idmap - * everywhere has finished at which point the helper will be unexported. - * - * Only code that needs to perform permission checks based on the owner of the - * idmapping will get access to it. All other code will solely rely on - * idmappings. This will get us type safety so it's impossible to conflate - * filesystems idmappings with mount idmappings. - * - * Return: The owner of the idmapping. - */ -struct user_namespace *mnt_idmap_owner(const struct mnt_idmap *idmap) -{ - return idmap->owner; -} -EXPORT_SYMBOL_GPL(mnt_idmap_owner); - -/** - * mnt_user_ns - retrieve owner of an idmapped mount - * @mnt: the relevant vfsmount - * - * This helper will go away once the conversion to use struct mnt_idmap - * everywhere has finished at which point the helper will be unexported. - * - * Only code that needs to perform permission checks based on the owner of the - * idmapping will get access to it. All other code will solely rely on - * idmappings. This will get us type safety so it's impossible to conflate - * filesystems idmappings with mount idmappings. - * - * Return: The owner of the idmapped. - */ -struct user_namespace *mnt_user_ns(const struct vfsmount *mnt) -{ - struct mnt_idmap *idmap = mnt_idmap(mnt); - - /* Return the actual owner of the filesystem instead of the nop. 
*/ - if (idmap == &nop_mnt_idmap && - !initial_idmapping(mnt->mnt_sb->s_user_ns)) - return mnt->mnt_sb->s_user_ns; - return mnt_idmap_owner(idmap); -} -EXPORT_SYMBOL_GPL(mnt_user_ns); - -/** - * alloc_mnt_idmap - allocate a new idmapping for the mount - * @mnt_userns: owning userns of the idmapping - * - * Allocate a new struct mnt_idmap which carries the idmapping of the mount. - * - * Return: On success a new idmap, on error an error pointer is returned. - */ -static struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) -{ - struct mnt_idmap *idmap; - - idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); - if (!idmap) - return ERR_PTR(-ENOMEM); - - idmap->owner = get_user_ns(mnt_userns); - refcount_set(&idmap->count, 1); - return idmap; -} - -/** - * mnt_idmap_get - get a reference to an idmapping - * @idmap: the idmap to bump the reference on - * - * If @idmap is not the @nop_mnt_idmap bump the reference count. - * - * Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed. - */ -static inline struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) -{ - if (idmap != &nop_mnt_idmap) - refcount_inc(&idmap->count); - - return idmap; -} - -/** - * mnt_idmap_put - put a reference to an idmapping - * @idmap: the idmap to put the reference on - * - * If this is a non-initial idmapping, put the reference count when a mount is - * released and free it if we're the last user. - */ -static inline void mnt_idmap_put(struct mnt_idmap *idmap) -{ - if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { - put_user_ns(idmap->owner); - kfree(idmap); - } -} - static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); @@ -4094,7 +3980,7 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. 
*/ - if (mnt_idmap_owner(kattr->mnt_idmap) == fs_userns) + if (!check_fsmapping(kattr->mnt_idmap, m->mnt_sb)) return -EINVAL; /* @@ -4340,7 +4226,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, * result. */ mnt_userns = container_of(ns, struct user_namespace, ns); - if (initial_idmapping(mnt_userns)) { + if (mnt_userns == &init_user_ns) { err = -EPERM; goto out_fput; } diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 1ead5bd740c2..14a72224b657 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -209,8 +209,8 @@ config NFS_DISABLE_UDP_SUPPORT config NFS_V4_2_READ_PLUS bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation" depends on NFS_V4_2 - default y + default n help - Choose Y here to enable the use of READ_PLUS over NFS v4.2. READ_PLUS - attempts to improve read performance by compressing out sparse holes - in the file contents. + This is intended for developers only. The READ_PLUS operation has + been shown to have issues under specific conditions and should not + be used in production. diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f7e4a88d5d92..f8e420464b77 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2296,7 +2296,7 @@ EXPORT_SYMBOL_GPL(nfs_instantiate); * that the operation succeeded on the server, but an error in the * reply path made it appear to have failed. */ -int nfs_create(struct user_namespace *mnt_userns, struct inode *dir, +int nfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct iattr attr; @@ -2325,7 +2325,7 @@ EXPORT_SYMBOL_GPL(nfs_create); * See comments for nfs_proc_create regarding failed operations. */ int -nfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +nfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct iattr attr; @@ -2352,7 +2352,7 @@ EXPORT_SYMBOL_GPL(nfs_mknod); /* * See comments for nfs_proc_create regarding failed operations. 
*/ -int nfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct iattr attr; @@ -2524,7 +2524,7 @@ EXPORT_SYMBOL_GPL(nfs_unlink); * now have a new file handle and can instantiate an in-core NFS inode * and move the raw page into its mapping. */ -int nfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct page *page; @@ -2642,7 +2642,7 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) * If these conditions are met, we can drop the dentries before doing * the rename. */ -int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -3262,7 +3262,7 @@ static int nfs_execute_ok(struct inode *inode, int mask) return ret; } -int nfs_permission(struct user_namespace *mnt_userns, +int nfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { @@ -3313,7 +3313,7 @@ out_notsup: res = nfs_revalidate_inode(inode, NFS_INO_INVALID_MODE | NFS_INO_INVALID_OTHER); if (res == 0) - res = generic_permission(&init_user_ns, inode, mask); + res = generic_permission(&nop_mnt_idmap, inode, mask); goto out; } EXPORT_SYMBOL_GPL(nfs_permission); diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 01596f2d0a1e..1a9d5aa51dfb 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -145,17 +145,10 @@ out: return parent; } -static u64 nfs_fetch_iversion(struct inode *inode) -{ - nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); - return inode_peek_iversion_raw(inode); -} - const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, - .fetch_iversion = nfs_fetch_iversion, 
.flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| EXPORT_OP_NOATOMIC_ATTR, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d8ec889a4b3f..b0f3c9339e70 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -31,6 +31,7 @@ #include <linux/swap.h> #include <linux/uaccess.h> +#include <linux/filelock.h> #include "delegation.h" #include "internal.h" diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e98ee7599eeb..222a28320e1c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -606,7 +606,7 @@ EXPORT_SYMBOL_GPL(nfs_fhget); #define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN) int -nfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -825,10 +825,12 @@ static u32 nfs_get_valid_attrmask(struct inode *inode) reply_mask |= STATX_UID | STATX_GID; if (!(cache_validity & NFS_INO_INVALID_BLOCKS)) reply_mask |= STATX_BLOCKS; + if (!(cache_validity & NFS_INO_INVALID_CHANGE)) + reply_mask |= STATX_CHANGE_COOKIE; return reply_mask; } -int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -843,7 +845,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID | STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME | - STATX_INO | STATX_SIZE | STATX_BLOCKS; + STATX_INO | STATX_SIZE | STATX_BLOCKS | STATX_BTIME | + STATX_CHANGE_COOKIE; if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { if (readdirplus_enabled) @@ -851,8 +854,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, goto out_no_revalidate; } - 
/* Flush out writes to the server in order to update c/mtime. */ - if ((request_mask & (STATX_CTIME | STATX_MTIME)) && + /* Flush out writes to the server in order to update c/mtime/version. */ + if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_CHANGE_COOKIE)) && S_ISREG(inode->i_mode)) filemap_write_and_wait(inode->i_mapping); @@ -872,7 +875,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, /* Is the user requesting attributes that might need revalidation? */ if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME| STATX_MTIME|STATX_UID|STATX_GID| - STATX_SIZE|STATX_BLOCKS))) + STATX_SIZE|STATX_BLOCKS| + STATX_CHANGE_COOKIE))) goto out_no_revalidate; /* Check whether the cached attributes are stale */ @@ -908,8 +912,12 @@ out_no_revalidate: /* Only return attributes that were revalidated. */ stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask; - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + stat->change_cookie = inode_peek_iversion_raw(inode); + stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; + if (server->change_attr_type != NFS4_CHANGE_TYPE_IS_UNDEFINED) + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; if (S_ISDIR(inode->i_mode)) stat->blksize = NFS_SERVER(inode)->dtsize; out: diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index ae7d4a8c728c..41468c21291d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -384,18 +384,18 @@ extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc); struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); void nfs_d_prune_case_insensitive_aliases(struct inode *inode); -int nfs_create(struct user_namespace *, struct inode *, struct dentry *, +int nfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); -int nfs_mkdir(struct user_namespace *, struct inode *, struct dentry 
*, +int nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); int nfs_rmdir(struct inode *, struct dentry *); int nfs_unlink(struct inode *, struct dentry *); -int nfs_symlink(struct user_namespace *, struct inode *, struct dentry *, +int nfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); int nfs_link(struct dentry *, struct inode *, struct dentry *); -int nfs_mknod(struct user_namespace *, struct inode *, struct dentry *, umode_t, +int nfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); -int nfs_rename(struct user_namespace *, struct inode *, struct dentry *, +int nfs_rename(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); #ifdef CONFIG_NFS_V4_2 diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index b0ef7e7ddb30..19d51ebf842c 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -208,23 +208,23 @@ out_fc: } static int -nfs_namespace_getattr(struct user_namespace *mnt_userns, +nfs_namespace_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { if (NFS_FH(d_inode(path->dentry))->size != 0) - return nfs_getattr(mnt_userns, path, stat, request_mask, + return nfs_getattr(idmap, path, stat, request_mask, query_flags); - generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); return 0; } static int -nfs_namespace_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +nfs_namespace_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { if (NFS_FH(d_inode(dentry))->size != 0) - return nfs_setattr(mnt_userns, dentry, attr); + return nfs_setattr(idmap, dentry, attr); return -EACCES; } diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index df9ca56db347..4fa37dc038b5 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -12,7 +12,7 @@ */ #ifdef CONFIG_NFS_V3_ACL 
extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu); -extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int nfs3_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 74d11e3c4205..1247f544a440 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -255,7 +255,7 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, } -int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int nfs3_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct posix_acl *orig = acl, *dfacl = NULL, *alloc; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 5edd1704f735..4c9f8bd866ab 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -23,6 +23,7 @@ #define NFS4_MAX_LOOP_ON_RECOVER (10) #include <linux/seqlock.h> +#include <linux/filelock.h> struct idmap; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 40d749f29ed3..d9c332019d06 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7692,7 +7692,7 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -7716,7 +7716,7 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) #define XATTR_NAME_NFSV4_DACL "system.nfs4_dacl" static int nfs4_xattr_set_nfs4_dacl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -7739,7 
+7739,7 @@ static bool nfs4_xattr_list_nfs4_dacl(struct dentry *dentry) #define XATTR_NAME_NFSV4_SACL "system.nfs4_sacl" static int nfs4_xattr_set_nfs4_sacl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -7764,7 +7764,7 @@ static bool nfs4_xattr_list_nfs4_sacl(struct dentry *dentry) #ifdef CONFIG_NFS_V4_SECURITY_LABEL static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -7815,7 +7815,7 @@ nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len) #ifdef CONFIG_NFS_V4_2 static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 16be6dae524f..779bfc37233c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -21,6 +21,7 @@ #include <linux/nfs_page.h> #include <linux/nfs_mount.h> #include <linux/export.h> +#include <linux/filelock.h> #include "internal.h" #include "pnfs.h" diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 80c240e50952..1a80d548253a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -25,6 +25,7 @@ #include <linux/freezer.h> #include <linux/wait.h> #include <linux/iversion.h> +#include <linux/filelock.h> #include <linux/uaccess.h> #include <linux/sched/mm.h> diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 0a9b72685f98..1479583fbb62 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -9,6 +9,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/fs.h> +#include <linux/filelock.h> static 
unsigned int grace_net_id; static DEFINE_SPINLOCK(grace_lock); diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 51a4b7885cae..ec49b200b797 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -10,6 +10,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <linux/filelock.h> #include <linux/percpu_counter.h> #include <linux/siphash.h> diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 1457f59f447a..995cb2c90b1a 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -113,11 +113,11 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp) inode_lock(inode); - error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS, + error = set_posix_acl(&nop_mnt_idmap, fh->fh_dentry, ACL_TYPE_ACCESS, argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT, + error = set_posix_acl(&nop_mnt_idmap, fh->fh_dentry, ACL_TYPE_DEFAULT, argp->acl_default); if (error) goto out_drop_lock; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 647108138e8a..887803735e2a 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -103,11 +103,11 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp) inode_lock(inode); - error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS, + error = set_posix_acl(&nop_mnt_idmap, fh->fh_dentry, ACL_TYPE_ACCESS, argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT, + error = set_posix_acl(&nop_mnt_idmap, fh->fh_dentry, ACL_TYPE_DEFAULT, argp->acl_default); out_drop_lock: diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index d01b29aba662..f41992ecd0d7 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -320,7 +320,7 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, iap->ia_mode &= ~current_umask(); fh_fill_pre_attrs(fhp); - host_err = vfs_create(&init_user_ns, inode, child, iap->ia_mode, true); + host_err = vfs_create(&nop_mnt_idmap, 
inode, child, iap->ia_mode, true); if (host_err < 0) { status = nfserrno(host_err); goto out; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 78b8cd9651d5..3509e73abe1f 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -233,7 +233,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * as well be forgiving and just succeed silently. */ goto out_put; - status = vfs_mkdir(&init_user_ns, d_inode(dir), dentry, S_IRWXU); + status = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); out_put: dput(dentry); out_unlock: @@ -353,7 +353,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) status = -ENOENT; if (d_really_is_negative(dentry)) goto out; - status = vfs_rmdir(&init_user_ns, d_inode(dir), dentry); + status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry); out: dput(dentry); out_unlock: @@ -443,7 +443,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) if (nfs4_has_reclaimed_state(name, nn)) goto out_free; - status = vfs_rmdir(&init_user_ns, d_inode(parent), child); + status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child); if (status) printk("failed to remove client recovery directory %pd\n", child); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4ef529379065..c1684da6c01f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5356,7 +5356,7 @@ static int nfsd4_check_conflicting_opens(struct nfs4_client *clp, { struct nfs4_ol_stateid *st; struct file *f = fp->fi_deleg_file->nf_file; - struct inode *ino = locks_inode(f); + struct inode *ino = file_inode(f); int writes; writes = atomic_read(&ino->i_writecount); @@ -7809,7 +7809,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) return status; } - inode = locks_inode(nf->nf_file); + inode = file_inode(nf->nf_file); flctx = locks_inode_context(inode); if (flctx && !list_empty_careful(&flctx->flc_posix)) { @@ -8182,7 +8182,6 @@ nfs4_state_shutdown_net(struct net *net) 
nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); - rhltable_destroy(&nfs4_file_rhltable); #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_shutdown_umount(nn); #endif @@ -8192,6 +8191,7 @@ void nfs4_state_shutdown(void) { nfsd4_destroy_callback_queue(); + rhltable_destroy(&nfs4_file_rhltable); } static void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 97edb32be77f..e12e5a4ad502 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2965,7 +2965,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, goto out; } - err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); + err = vfs_getattr(&path, &stat, + STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE, + AT_STATX_SYNC_AS_STAT); if (err) goto out_nfserr; if (!(stat.result_mask & STATX_BTIME)) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 8c52b6c9d31a..ccd8485fee04 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -40,7 +40,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) /* make sure parents give x permission to user */ int err; parent = dget_parent(tdentry); - err = inode_permission(&init_user_ns, + err = inode_permission(&nop_mnt_idmap, d_inode(parent), MAY_EXEC); if (err < 0) { dput(parent); @@ -628,6 +628,10 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) stat.mtime = inode->i_mtime; stat.ctime = inode->i_ctime; stat.size = inode->i_size; + if (v4 && IS_I_VERSION(inode)) { + stat.change_cookie = inode_query_iversion(inode); + stat.result_mask |= STATX_CHANGE_COOKIE; + } } if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); @@ -659,6 +663,10 @@ void fh_fill_post_attrs(struct svc_fh *fhp) if (err) { fhp->fh_post_saved = false; fhp->fh_post_attr.ctime = inode->i_ctime; + if (v4 && IS_I_VERSION(inode)) { + fhp->fh_post_attr.change_cookie = inode_query_iversion(inode); + fhp->fh_post_attr.result_mask |= STATX_CHANGE_COOKIE; + } } else fhp->fh_post_saved = true; if (v4) @@ -748,3 +756,37 @@ enum fsid_source 
fsid_source(const struct svc_fh *fhp) return FSIDSOURCE_UUID; return FSIDSOURCE_DEV; } + +/* + * We could use i_version alone as the change attribute. However, i_version + * can go backwards on a regular file after an unclean shutdown. On its own + * that doesn't necessarily cause a problem, but if i_version goes backwards + * and then is incremented again it could reuse a value that was previously + * used before boot, and a client who queried the two values might incorrectly + * assume nothing changed. + * + * By using both ctime and the i_version counter we guarantee that as long as + * time doesn't go backwards we never reuse an old value. If the filesystem + * advertises STATX_ATTR_CHANGE_MONOTONIC, then this mitigation is not + * needed. + * + * We only need to do this for regular files as well. For directories, we + * assume that the new change attr is always logged to stable storage in some + * fashion before the results can be seen. + */ +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode) +{ + u64 chattr; + + if (stat->result_mask & STATX_CHANGE_COOKIE) { + chattr = stat->change_cookie; + if (S_ISREG(inode->i_mode) && + !(stat->attributes & STATX_ATTR_CHANGE_MONOTONIC)) { + chattr += (u64)stat->ctime.tv_sec << 30; + chattr += stat->ctime.tv_nsec; + } + } else { + chattr = time_to_chattr(&stat->ctime); + } + return chattr; +} diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 513e028b0bbe..4e0ecf0ae2cf 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -293,34 +293,7 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) fhp->fh_pre_saved = false; } -/* - * We could use i_version alone as the change attribute. However, - * i_version can go backwards after a reboot. 
On its own that doesn't - * necessarily cause a problem, but if i_version goes backwards and then - * is incremented again it could reuse a value that was previously used - * before boot, and a client who queried the two values might - * incorrectly assume nothing changed. - * - * By using both ctime and the i_version counter we guarantee that as - * long as time doesn't go backwards we never reuse an old value. - */ -static inline u64 nfsd4_change_attribute(struct kstat *stat, - struct inode *inode) -{ - if (inode->i_sb->s_export_op->fetch_iversion) - return inode->i_sb->s_export_op->fetch_iversion(inode); - else if (IS_I_VERSION(inode)) { - u64 chattr; - - chattr = stat->ctime.tv_sec; - chattr <<= 30; - chattr += stat->ctime.tv_nsec; - chattr += inode_query_iversion(inode); - return chattr; - } else - return time_to_chattr(&stat->ctime); -} - +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode); extern void fh_fill_pre_attrs(struct svc_fh *fhp); extern void fh_fill_post_attrs(struct svc_fh *fhp); extern void fh_fill_both_attrs(struct svc_fh *fhp); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 9744443c3965..a82d91afdc9c 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -93,7 +93,7 @@ nfsd_proc_setattr(struct svc_rqst *rqstp) if (delta < 0) delta = -delta; if (delta < MAX_TOUCH_TIME_ERROR && - setattr_prepare(&init_user_ns, fhp->fh_dentry, iap) != 0) { + setattr_prepare(&nop_mnt_idmap, fhp->fh_dentry, iap) != 0) { /* * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. 
* This will cause notify_change to set these times diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 4c3a0d84043c..ab4ee3509ce3 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -426,7 +426,7 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) if (iap->ia_size < 0) return -EFBIG; - host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL); + host_err = notify_change(&nop_mnt_idmap, dentry, &size_attr, NULL); if (host_err) return host_err; iap->ia_valid &= ~ATTR_SIZE; @@ -444,7 +444,7 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) return 0; iap->ia_valid |= ATTR_CTIME; - return notify_change(&init_user_ns, dentry, iap, NULL); + return notify_change(&nop_mnt_idmap, dentry, iap, NULL); } /** @@ -542,12 +542,12 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, attr->na_labelerr = security_inode_setsecctx(dentry, attr->na_seclabel->data, attr->na_seclabel->len); if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_pacl) - attr->na_aclerr = set_posix_acl(&init_user_ns, + attr->na_aclerr = set_posix_acl(&nop_mnt_idmap, dentry, ACL_TYPE_ACCESS, attr->na_pacl); if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && !attr->na_aclerr && attr->na_dpacl && S_ISDIR(inode->i_mode)) - attr->na_aclerr = set_posix_acl(&init_user_ns, + attr->na_aclerr = set_posix_acl(&nop_mnt_idmap, dentry, ACL_TYPE_DEFAULT, attr->na_dpacl); inode_unlock(inode); @@ -583,7 +583,7 @@ int nfsd4_is_junction(struct dentry *dentry) return 0; if (!(inode->i_mode & S_ISVTX)) return 0; - if (vfs_getxattr(&init_user_ns, dentry, NFSD_JUNCTION_XATTR_NAME, + if (vfs_getxattr(&nop_mnt_idmap, dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0) return 0; return 1; @@ -1363,12 +1363,13 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, err = 0; switch (type) { case S_IFREG: - host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true); + host_err = vfs_create(&nop_mnt_idmap, dirp, dchild, + iap->ia_mode, true); if (!host_err) 
nfsd_check_ignore_resizing(iap); break; case S_IFDIR: - host_err = vfs_mkdir(&init_user_ns, dirp, dchild, iap->ia_mode); + host_err = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode); if (!host_err && unlikely(d_unhashed(dchild))) { struct dentry *d; d = lookup_one_len(dchild->d_name.name, @@ -1396,7 +1397,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, case S_IFBLK: case S_IFIFO: case S_IFSOCK: - host_err = vfs_mknod(&init_user_ns, dirp, dchild, + host_err = vfs_mknod(&nop_mnt_idmap, dirp, dchild, iap->ia_mode, rdev); break; default: @@ -1557,7 +1558,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_drop_write; } fh_fill_pre_attrs(fhp); - host_err = vfs_symlink(&init_user_ns, d_inode(dentry), dnew, path); + host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path); err = nfserrno(host_err); cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); if (!err) @@ -1625,7 +1626,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (d_really_is_negative(dold)) goto out_dput; fh_fill_pre_attrs(ffhp); - host_err = vfs_link(dold, &init_user_ns, dirp, dnew, NULL); + host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL); fh_fill_post_attrs(ffhp); inode_unlock(dirp); if (!host_err) { @@ -1745,10 +1746,10 @@ retry: goto out_dput_old; } else { struct renamedata rd = { - .old_mnt_userns = &init_user_ns, + .old_mnt_idmap = &nop_mnt_idmap, .old_dir = fdir, .old_dentry = odentry, - .new_mnt_userns = &init_user_ns, + .new_mnt_idmap = &nop_mnt_idmap, .new_dir = tdir, .new_dentry = ndentry, }; @@ -1850,14 +1851,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, nfsd_close_cached_files(rdentry); for (retries = 1;;) { - host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL); + host_err = vfs_unlink(&nop_mnt_idmap, dirp, rdentry, NULL); if (host_err != -EAGAIN || !retries--) break; if (!nfsd_wait_for_delegreturn(rqstp, rinode)) break; } } else { - host_err = vfs_rmdir(&init_user_ns, dirp, 
rdentry); + host_err = vfs_rmdir(&nop_mnt_idmap, dirp, rdentry); } fh_fill_post_attrs(fhp); @@ -2129,7 +2130,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, inode_lock_shared(inode); - len = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0); + len = vfs_getxattr(&nop_mnt_idmap, dentry, name, NULL, 0); /* * Zero-length attribute, just return. @@ -2156,7 +2157,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, goto out; } - len = vfs_getxattr(&init_user_ns, dentry, name, buf, len); + len = vfs_getxattr(&nop_mnt_idmap, dentry, name, buf, len); if (len <= 0) { kvfree(buf); buf = NULL; @@ -2267,7 +2268,7 @@ nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name) inode_lock(fhp->fh_dentry->d_inode); fh_fill_pre_attrs(fhp); - ret = __vfs_removexattr_locked(&init_user_ns, fhp->fh_dentry, + ret = __vfs_removexattr_locked(&nop_mnt_idmap, fhp->fh_dentry, name, NULL); fh_fill_post_attrs(fhp); @@ -2294,7 +2295,7 @@ nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, inode_lock(fhp->fh_dentry->d_inode); fh_fill_pre_attrs(fhp); - ret = __vfs_setxattr_locked(&init_user_ns, fhp->fh_dentry, name, buf, + ret = __vfs_setxattr_locked(&nop_mnt_idmap, fhp->fh_dentry, name, buf, len, flags, NULL); fh_fill_post_attrs(fhp); inode_unlock(fhp->fh_dentry->d_inode); @@ -2378,14 +2379,14 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, return 0; /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ - err = inode_permission(&init_user_ns, inode, + err = inode_permission(&nop_mnt_idmap, inode, acc & (MAY_READ | MAY_WRITE | MAY_EXEC)); /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) - err = inode_permission(&init_user_ns, inode, MAY_EXEC); + err = inode_permission(&nop_mnt_idmap, inode, MAY_EXEC); return err? 
nfserrno(err) : 0; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index dbdfef7ae85b..43fb57a301d3 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -170,9 +170,14 @@ static inline void fh_drop_write(struct svc_fh *fh) static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) { + u32 request_mask = STATX_BASIC_STATS; struct path p = {.mnt = fh->fh_export->ex_path.mnt, .dentry = fh->fh_dentry}; - return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS, + + if (fh->fh_maxsize == NFS4_FHSIZE) + request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); + + return nfserrno(vfs_getattr(&p, stat, request_mask, AT_STATX_SYNC_AS_STAT)); } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 232dd7b6cca1..1310d2d5feb3 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -364,7 +364,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) ii->i_bh = bh; atomic64_inc(&root->inodes_count); - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = ino; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); @@ -949,7 +949,7 @@ void nilfs_evict_inode(struct inode *inode) */ } -int nilfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int nilfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct nilfs_transaction_info ti; @@ -957,7 +957,7 @@ int nilfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct super_block *sb = inode->i_sb; int err; - err = setattr_prepare(&init_user_ns, dentry, iattr); + err = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (err) return err; @@ -972,7 +972,7 @@ int nilfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, nilfs_truncate(inode); } - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) { @@ -988,7 +988,7 @@ out_err: return err; } -int 
nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int nilfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct nilfs_root *root = NILFS_I(inode)->i_root; @@ -997,7 +997,7 @@ int nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode, root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 87e1004b606d..5ccc638ae92f 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -128,7 +128,7 @@ int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) /** * nilfs_fileattr_set - ioctl to support chattr */ -int nilfs_fileattr_set(struct user_namespace *mnt_userns, +int nilfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -1114,7 +1114,14 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) minseg = range[0] + segbytes - 1; do_div(minseg, segbytes); + + if (range[1] < 4096) + goto out; + maxseg = NILFS_SB2_OFFSET_BYTES(range[1]); + if (maxseg < segbytes) + goto out; + do_div(maxseg, segbytes); maxseg--; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 23899e0ae850..c7024da8f1e2 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -72,7 +72,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) * If the create succeeds, we fill in the inode information * with d_instantiate(). 
*/ -static int nilfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int nilfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; @@ -100,7 +100,7 @@ static int nilfs_create(struct user_namespace *mnt_userns, struct inode *dir, } static int -nilfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +nilfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -125,7 +125,7 @@ nilfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int nilfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int nilfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct nilfs_transaction_info ti; @@ -202,7 +202,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, return err; } -static int nilfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -340,7 +340,7 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) return err; } -static int nilfs_rename(struct user_namespace *mnt_userns, +static int nilfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index aecda4fc95f5..8046490cd7fe 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -242,7 +242,7 @@ extern int nilfs_sync_file(struct file *, loff_t, loff_t, int); /* ioctl.c */ int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *m); -int nilfs_fileattr_set(struct user_namespace *mnt_userns, +int nilfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); long nilfs_ioctl(struct file *, unsigned int, 
unsigned long); long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -271,10 +271,10 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode); extern void nilfs_update_inode(struct inode *, struct buffer_head *, int); extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); -extern int nilfs_setattr(struct user_namespace *, struct dentry *, +extern int nilfs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void nilfs_write_failed(struct address_space *mapping, loff_t to); -int nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int nilfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); extern int nilfs_inode_dirty(struct inode *); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 6edb6e0dd61f..1422b8ba24ed 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -409,6 +409,15 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize) goto out; /* + * Prevent underflow in second superblock position calculation. + * The exact minimum size check is done in nilfs_sufile_resize(). + */ + if (newsize < 4096) { + ret = -ENOSPC; + goto out; + } + + /* * Write lock is required to protect some functions depending * on the number of segments, the number of reserved segments, * and so forth. 
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 2064e6473d30..3a4c9c150cbf 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -544,9 +544,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, { struct nilfs_super_block **sbp = nilfs->ns_sbp; struct buffer_head **sbh = nilfs->ns_sbh; - u64 sb2off = NILFS_SB2_OFFSET_BYTES(bdev_nr_bytes(nilfs->ns_bdev)); + u64 sb2off, devsize = bdev_nr_bytes(nilfs->ns_bdev); int valid[2], swp = 0; + if (devsize < NILFS_SEG_MIN_BLOCKS * NILFS_MIN_BLOCK_SIZE + 4096) { + nilfs_err(sb, "device size too small"); + return -EINVAL; + } + sb2off = NILFS_SB2_OFFSET_BYTES(devsize); + sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, &sbh[0]); sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index a2a15bc4df28..29bdd99b29fa 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -262,7 +262,7 @@ static int fanotify_get_response(struct fsnotify_group *group, } /* userspace responded, convert to something usable */ - switch (event->response & ~FAN_AUDIT) { + switch (event->response & FANOTIFY_RESPONSE_ACCESS) { case FAN_ALLOW: ret = 0; break; @@ -273,7 +273,8 @@ static int fanotify_get_response(struct fsnotify_group *group, /* Check if the response should be audited */ if (event->response & FAN_AUDIT) - audit_fanotify(event->response & ~FAN_AUDIT); + audit_fanotify(event->response & ~FAN_AUDIT, + &event->audit_rule); pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, group, event, ret); @@ -563,6 +564,9 @@ static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path, pevent->fae.type = FANOTIFY_EVENT_TYPE_PATH_PERM; pevent->response = 0; + pevent->hdr.type = FAN_RESPONSE_INFO_NONE; + pevent->hdr.pad = 0; + pevent->hdr.len = 0; pevent->state = FAN_EVENT_INIT; pevent->path = *path; path_get(path); diff --git 
a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 57f51a9a3015..e8a3c28c5d12 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -425,9 +425,13 @@ FANOTIFY_PE(struct fanotify_event *event) struct fanotify_perm_event { struct fanotify_event fae; struct path path; - unsigned short response; /* userspace answer to the event */ + u32 response; /* userspace answer to the event */ unsigned short state; /* state of the event */ int fd; /* fd we passed to userspace for this event */ + union { + struct fanotify_response_info_header hdr; + struct fanotify_response_info_audit_rule audit_rule; + }; }; static inline struct fanotify_perm_event * diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 4546da4a54f9..8f430bfad487 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -283,19 +283,42 @@ static int create_fd(struct fsnotify_group *group, const struct path *path, return client_fd; } +static int process_access_response_info(const char __user *info, + size_t info_len, + struct fanotify_response_info_audit_rule *friar) +{ + if (info_len != sizeof(*friar)) + return -EINVAL; + + if (copy_from_user(friar, info, sizeof(*friar))) + return -EFAULT; + + if (friar->hdr.type != FAN_RESPONSE_INFO_AUDIT_RULE) + return -EINVAL; + if (friar->hdr.pad != 0) + return -EINVAL; + if (friar->hdr.len != sizeof(*friar)) + return -EINVAL; + + return info_len; +} + /* * Finish processing of permission event by setting it to ANSWERED state and * drop group->notification_lock. 
*/ static void finish_permission_event(struct fsnotify_group *group, - struct fanotify_perm_event *event, - unsigned int response) + struct fanotify_perm_event *event, u32 response, + struct fanotify_response_info_audit_rule *friar) __releases(&group->notification_lock) { bool destroy = false; assert_spin_locked(&group->notification_lock); - event->response = response; + event->response = response & ~FAN_INFO; + if (response & FAN_INFO) + memcpy(&event->audit_rule, friar, sizeof(*friar)); + if (event->state == FAN_EVENT_CANCELED) destroy = true; else @@ -306,20 +329,27 @@ static void finish_permission_event(struct fsnotify_group *group, } static int process_access_response(struct fsnotify_group *group, - struct fanotify_response *response_struct) + struct fanotify_response *response_struct, + const char __user *info, + size_t info_len) { struct fanotify_perm_event *event; int fd = response_struct->fd; - int response = response_struct->response; + u32 response = response_struct->response; + int ret = info_len; + struct fanotify_response_info_audit_rule friar; - pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group, - fd, response); + pr_debug("%s: group=%p fd=%d response=%u buf=%p size=%zu\n", __func__, + group, fd, response, info, info_len); /* * make sure the response is valid, if invalid we do nothing and either * userspace can send a valid response or we will clean it up after the * timeout */ - switch (response & ~FAN_AUDIT) { + if (response & ~FANOTIFY_RESPONSE_VALID_MASK) + return -EINVAL; + + switch (response & FANOTIFY_RESPONSE_ACCESS) { case FAN_ALLOW: case FAN_DENY: break; @@ -327,10 +357,20 @@ static int process_access_response(struct fsnotify_group *group, return -EINVAL; } - if (fd < 0) + if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) return -EINVAL; - if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) + if (response & FAN_INFO) { + ret = process_access_response_info(info, info_len, &friar); + if 
(ret < 0) + return ret; + if (fd == FAN_NOFD) + return ret; + } else { + ret = 0; + } + + if (fd < 0) return -EINVAL; spin_lock(&group->notification_lock); @@ -340,9 +380,9 @@ static int process_access_response(struct fsnotify_group *group, continue; list_del_init(&event->fae.fse.list); - finish_permission_event(group, event, response); + finish_permission_event(group, event, response, &friar); wake_up(&group->fanotify_data.access_waitq); - return 0; + return ret; } spin_unlock(&group->notification_lock); @@ -804,7 +844,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, if (ret <= 0) { spin_lock(&group->notification_lock); finish_permission_event(group, - FANOTIFY_PERM(event), FAN_DENY); + FANOTIFY_PERM(event), FAN_DENY, NULL); wake_up(&group->fanotify_data.access_waitq); } else { spin_lock(&group->notification_lock); @@ -827,28 +867,32 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct fanotify_response response = { .fd = -1, .response = -1 }; + struct fanotify_response response; struct fsnotify_group *group; int ret; + const char __user *info_buf = buf + sizeof(struct fanotify_response); + size_t info_len; if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) return -EINVAL; group = file->private_data; + pr_debug("%s: group=%p count=%zu\n", __func__, group, count); + if (count < sizeof(response)) return -EINVAL; - count = sizeof(response); - - pr_debug("%s: group=%p count=%zu\n", __func__, group, count); - - if (copy_from_user(&response, buf, count)) + if (copy_from_user(&response, buf, sizeof(response))) return -EFAULT; - ret = process_access_response(group, &response); + info_len = count - sizeof(response); + + ret = process_access_response(group, &response, info_buf, info_len); if (ret < 0) count = ret; + else + count = sizeof(response) + ret; return count; } @@ -876,7 +920,7 @@ static int fanotify_release(struct 
inode *ignored, struct file *file) event = list_first_entry(&group->fanotify_data.access_list, struct fanotify_perm_event, fae.fse.list); list_del_init(&event->fae.fse.list); - finish_permission_event(group, event, FAN_ALLOW); + finish_permission_event(group, event, FAN_ALLOW, NULL); spin_lock(&group->notification_lock); } @@ -893,7 +937,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) fsnotify_destroy_event(group, fsn_event); } else { finish_permission_event(group, FANOTIFY_PERM(event), - FAN_ALLOW); + FAN_ALLOW, NULL); } spin_lock(&group->notification_lock); } diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 08c659332e26..e6fc5f7cb1d7 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2865,7 +2865,7 @@ void ntfs_truncate_vfs(struct inode *vi) { /** * ntfs_setattr - called from notify_change() when an attribute is being changed - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: dentry whose attributes to change * @attr: structure describing the attributes and the changes * @@ -2878,14 +2878,14 @@ void ntfs_truncate_vfs(struct inode *vi) { * * Called with ->i_mutex held. */ -int ntfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *vi = d_inode(dentry); int err; unsigned int ia_valid = attr->ia_valid; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) goto out; /* We do not support NTFS ACLs yet. 
*/ diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 6f78ee00f57f..147ef4ddb691 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -289,7 +289,7 @@ extern int ntfs_show_options(struct seq_file *sf, struct dentry *root); extern int ntfs_truncate(struct inode *vi); extern void ntfs_truncate_vfs(struct inode *vi); -extern int ntfs_setattr(struct user_namespace *mnt_userns, +extern int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern int __ntfs_write_inode(struct inode *vi, int sync); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index d294cd975688..e9bdc1ff08c9 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -70,7 +70,7 @@ static long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg) /* * ntfs_getattr - inode_operations::getattr */ -int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags) { struct inode *inode = d_inode(path->dentry); @@ -84,7 +84,7 @@ int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->attributes_mask |= STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED; - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); stat->result_mask |= STATX_BTIME; stat->btime = ni->i_crtime; @@ -657,7 +657,7 @@ out: /* * ntfs3_setattr - inode_operations::setattr */ -int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct super_block *sb = dentry->d_sb; @@ -676,7 +676,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ia_valid = attr->ia_valid; } - err = setattr_prepare(mnt_userns, dentry, attr); + err = setattr_prepare(idmap, dentry, attr); if (err) goto out; @@ -704,10 +704,10 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, inode->i_size = 
newsize; } - setattr_copy(mnt_userns, inode, attr); + setattr_copy(idmap, inode, attr); if (mode != inode->i_mode) { - err = ntfs_acl_chmod(mnt_userns, dentry); + err = ntfs_acl_chmod(idmap, dentry); if (err) goto out; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 20b953871574..8ce2616b087f 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -1185,7 +1185,7 @@ out: * * NOTE: if fnd != NULL (ntfs_atomic_open) then @dir is locked */ -struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, +struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const struct cpu_str *uni, umode_t mode, dev_t dev, const char *symname, u32 size, @@ -1307,7 +1307,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, goto out3; } inode = &ni->vfs_inode; - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); mode = inode->i_mode; inode->i_atime = inode->i_mtime = inode->i_ctime = ni->i_crtime = @@ -1614,7 +1614,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, #ifdef CONFIG_NTFS3_FS_POSIX_ACL if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) { - err = ntfs_init_acl(mnt_userns, inode, dir); + err = ntfs_init_acl(idmap, inode, dir); if (err) goto out7; } else diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index c8db35e2ae17..407fe92394e2 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -94,12 +94,12 @@ static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry, /* * ntfs_create - inode_operations::create */ -static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ntfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFREG | mode, + inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, NULL); return IS_ERR(inode) ? 
PTR_ERR(inode) : 0; @@ -110,12 +110,12 @@ static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir, * * inode_operations::mknod */ -static int ntfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ntfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, mode, rdev, + inode = ntfs_create_inode(idmap, dir, dentry, NULL, mode, rdev, NULL, 0, NULL); return IS_ERR(inode) ? PTR_ERR(inode) : 0; @@ -183,13 +183,13 @@ static int ntfs_unlink(struct inode *dir, struct dentry *dentry) /* * ntfs_symlink - inode_operations::symlink */ -static int ntfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { u32 size = strlen(symname); struct inode *inode; - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFLNK | 0777, + inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFLNK | 0777, 0, symname, size, NULL); return IS_ERR(inode) ? PTR_ERR(inode) : 0; @@ -198,12 +198,12 @@ static int ntfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, /* * ntfs_mkdir- inode_operations::mkdir */ -static int ntfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFDIR | mode, + inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0, NULL, 0, NULL); return IS_ERR(inode) ? 
PTR_ERR(inode) : 0; @@ -229,7 +229,7 @@ static int ntfs_rmdir(struct inode *dir, struct dentry *dentry) /* * ntfs_rename - inode_operations::rename */ -static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir, +static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct inode *new_dir, struct dentry *new_dentry, u32 flags) { @@ -415,13 +415,13 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry, /* * Unfortunately I don't know how to get here correct 'struct nameidata *nd' - * or 'struct user_namespace *mnt_userns'. + * or 'struct mnt_idmap *idmap'. * See atomic_open in fs/namei.c. * This is why xfstest/633 failed. - * Looks like ntfs_atomic_open must accept 'struct user_namespace *mnt_userns' as argument. + * Looks like ntfs_atomic_open must accept 'struct mnt_idmap *idmap' as argument. */ - inode = ntfs_create_inode(&init_user_ns, dir, dentry, uni, mode, 0, + inode = ntfs_create_inode(&nop_mnt_idmap, dir, dentry, uni, mode, 0, NULL, 0, fnd); err = IS_ERR(inode) ? 
PTR_ERR(inode) : finish_open(file, dentry, ntfs_file_open); diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 0e051c5595a2..80072e5f96f7 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -492,10 +492,12 @@ bool dir_is_empty(struct inode *dir); extern const struct file_operations ntfs_dir_operations; /* Globals from file.c */ -int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags); -int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); +void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn, + CLST len); int ntfs_file_open(struct inode *inode, struct file *file); int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); @@ -706,7 +708,7 @@ int ntfs_sync_inode(struct inode *inode); int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); int inode_write_data(struct inode *inode, const void *data, size_t bytes); -struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, +struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const struct cpu_str *uni, umode_t mode, dev_t dev, const char *symname, u32 size, @@ -857,17 +859,17 @@ unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase, /* globals from xattr.c */ #ifdef CONFIG_NTFS3_FS_POSIX_ACL struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu); -int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ntfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); -int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, struct inode *dir); 
#else #define ntfs_get_acl NULL #define ntfs_set_acl NULL #endif -int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry); -int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry); +int ntfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); extern const struct xattr_handler *ntfs_xattr_handlers[]; diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 616df209feea..ff64302e87e5 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -578,7 +578,7 @@ struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu) return ntfs_get_acl_ex(inode, type, 0); } -static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, +static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, struct inode *inode, struct posix_acl *acl, int type, bool init_acl) { @@ -597,7 +597,7 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, case ACL_TYPE_ACCESS: /* Do not change i_mode if we are in init_acl */ if (acl && !init_acl) { - err = posix_acl_update_mode(mnt_userns, inode, &mode, + err = posix_acl_update_mode(idmap, inode, &mode, &acl); if (err) return err; @@ -652,10 +652,10 @@ out: /* * ntfs_set_acl - inode_operations::set_acl */ -int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ntfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { - return ntfs_set_acl_ex(mnt_userns, d_inode(dentry), acl, type, false); + return ntfs_set_acl_ex(idmap, d_inode(dentry), acl, type, false); } /* @@ -663,7 +663,7 @@ int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, * * Called from ntfs_create_inode(). 
*/ -int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, struct inode *dir) { struct posix_acl *default_acl, *acl; @@ -674,7 +674,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, return err; if (default_acl) { - err = ntfs_set_acl_ex(mnt_userns, inode, default_acl, + err = ntfs_set_acl_ex(idmap, inode, default_acl, ACL_TYPE_DEFAULT, true); posix_acl_release(default_acl); } else { @@ -683,7 +683,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, if (acl) { if (!err) - err = ntfs_set_acl_ex(mnt_userns, inode, acl, + err = ntfs_set_acl_ex(idmap, inode, acl, ACL_TYPE_ACCESS, true); posix_acl_release(acl); } else { @@ -697,7 +697,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, /* * ntfs_acl_chmod - Helper for ntfs3_setattr(). */ -int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry) +int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; @@ -708,13 +708,13 @@ int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry) if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - return posix_acl_chmod(mnt_userns, dentry, inode->i_mode); + return posix_acl_chmod(idmap, dentry, inode->i_mode); } /* * ntfs_permission - inode_operations::permission */ -int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (ntfs_sb(inode->i_sb)->options->noacsrules) { @@ -722,7 +722,7 @@ int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, return 0; } - return generic_permission(mnt_userns, inode, mask); + return generic_permission(idmap, inode, mask); } /* @@ -835,7 +835,7 @@ out: * ntfs_setxattr - inode_operations::setxattr */ static noinline int ntfs_setxattr(const struct 
xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *de, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 9f19cf9a5a9f..9fd03eaf15f8 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -260,7 +260,7 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } -int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ocfs2_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct buffer_head *bh = NULL; @@ -274,7 +274,7 @@ int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (type == ACL_TYPE_ACCESS && acl) { umode_t mode; - status = posix_acl_update_mode(&init_user_ns, inode, &mode, + status = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (status) goto unlock; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index a897c4e41b26..667c6f03fa60 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -17,7 +17,7 @@ struct ocfs2_acl_entry { }; struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu); -int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ocfs2_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 8b2020f92b5f..ba26c5567cff 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -188,18 +188,18 @@ static int dlmfs_file_release(struct inode *inode, * We do ->setattr() just to override size changes. Our size is the size * of the LVB and nothing else. 
*/ -static int dlmfs_file_setattr(struct user_namespace *mnt_userns, +static int dlmfs_file_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = d_inode(dentry); attr->ia_valid &= ~ATTR_SIZE; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -336,7 +336,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); - inode_init_owner(&init_user_ns, inode, NULL, mode); + inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inc_nlink(inode); @@ -359,7 +359,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, return NULL; inode->i_ino = get_next_ino(); - inode_init_owner(&init_user_ns, inode, parent, mode); + inode_init_owner(&nop_mnt_idmap, inode, parent, mode); inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); ip = DLMFS_I(inode); @@ -402,7 +402,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, * File creation. Allocate an inode, and we're done.. 
*/ /* SMP-safe */ -static int dlmfs_mkdir(struct user_namespace * mnt_userns, +static int dlmfs_mkdir(struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, umode_t mode) @@ -451,7 +451,7 @@ bail: return status; } -static int dlmfs_create(struct user_namespace *mnt_userns, +static int dlmfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5c60b6bc85bf..efb09de4343d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1111,7 +1111,7 @@ out: return ret; } -int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int status = 0, size_change; @@ -1142,11 +1142,11 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) return 0; - status = setattr_prepare(&init_user_ns, dentry, attr); + status = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (status) return status; - if (is_quota_modification(mnt_userns, inode, attr)) { + if (is_quota_modification(&nop_mnt_idmap, inode, attr)) { status = dquot_initialize(inode); if (status) return status; @@ -1265,7 +1265,7 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); status = ocfs2_mark_inode_dirty(handle, inode, bh); @@ -1302,7 +1302,7 @@ bail: return status; } -int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); @@ -1317,7 +1317,7 @@ int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path, goto bail; } - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, 
inode, stat); /* * If there is inline data in the inode, the inode will normally not * have data blocks allocated (it may have an external xattr block). @@ -1334,7 +1334,7 @@ bail: return err; } -int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ocfs2_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret, had_lock; @@ -1360,7 +1360,7 @@ int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, dump_stack(); } - ret = generic_permission(&init_user_ns, inode, mask); + ret = generic_permission(&nop_mnt_idmap, inode, mask); ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); out: @@ -1991,7 +1991,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, } } - if (file && setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) { + if (file && setattr_should_drop_suidgid(&nop_mnt_idmap, file_inode(file))) { ret = __ocfs2_write_remove_suid(inode, di_bh); if (ret) { mlog_errno(ret); @@ -2279,7 +2279,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * inode. There's also the dinode i_size state which * can be lost via setattr during extending writes (we * set inode->i_size at the end of a write. 
*/ - if (setattr_should_drop_suidgid(&init_user_ns, inode)) { + if (setattr_should_drop_suidgid(&nop_mnt_idmap, inode)) { if (meta_level == 0) { ocfs2_inode_unlock_for_extent_tree(inode, &di_bh, diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 71db8f3aa027..8e53e4ac1120 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -49,11 +49,11 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size, u64 zero_to); int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, loff_t zero_to); -int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); -int ocfs2_permission(struct user_namespace *mnt_userns, +int ocfs2_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index afd54ec66103..811a6ea374bb 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -82,7 +82,7 @@ int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) return status; } -int ocfs2_fileattr_set(struct user_namespace *mnt_userns, +int ocfs2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h index 0297c8846945..48a5fdfe87a1 100644 --- a/fs/ocfs2/ioctl.h +++ b/fs/ocfs2/ioctl.h @@ -12,7 +12,7 @@ #define OCFS2_IOCTL_PROTO_H int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int ocfs2_fileattr_set(struct user_namespace *mnt_userns, +int ocfs2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long ocfs2_compat_ioctl(struct file *file, 
unsigned cmd, unsigned long arg); diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index 73a3854b2afb..f37174e79fad 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -8,6 +8,7 @@ */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/fcntl.h> #include <cluster/masklog.h> diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index a8fd51afb794..9175dbc47201 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -197,8 +197,8 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) * callers. */ if (S_ISDIR(mode)) set_nlink(inode, 2); - mode = mode_strip_sgid(&init_user_ns, dir, mode); - inode_init_owner(&init_user_ns, inode, dir, mode); + mode = mode_strip_sgid(&nop_mnt_idmap, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); status = dquot_initialize(inode); if (status) return ERR_PTR(status); @@ -221,7 +221,7 @@ static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb, iput(inode); } -static int ocfs2_mknod(struct user_namespace *mnt_userns, +static int ocfs2_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, @@ -642,7 +642,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fe_blkno, suballoc_loc, suballoc_bit); } -static int ocfs2_mkdir(struct user_namespace *mnt_userns, +static int ocfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) @@ -651,14 +651,14 @@ static int ocfs2_mkdir(struct user_namespace *mnt_userns, trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name, OCFS2_I(dir)->ip_blkno, mode); - ret = ocfs2_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0); + ret = ocfs2_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (ret) mlog_errno(ret); return ret; } -static int ocfs2_create(struct user_namespace *mnt_userns, +static int ocfs2_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, @@ -668,7 +668,7 @@ static int ocfs2_create(struct user_namespace 
*mnt_userns, trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, mode); - ret = ocfs2_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); + ret = ocfs2_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); if (ret) mlog_errno(ret); @@ -1194,7 +1194,7 @@ static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) ocfs2_inode_unlock(inode2, 1); } -static int ocfs2_rename(struct user_namespace *mnt_userns, +static int ocfs2_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, @@ -1784,7 +1784,7 @@ bail: return status; } -static int ocfs2_symlink(struct user_namespace *mnt_userns, +static int ocfs2_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 623db358b1ef..5a656dc683f1 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4316,7 +4316,7 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); + return inode_permission(&nop_mnt_idmap, dir, MAY_WRITE | MAY_EXEC); } /** @@ -4370,7 +4370,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, * file. 
*/ if (!preserve) { - error = inode_permission(&init_user_ns, inode, MAY_READ); + error = inode_permission(&nop_mnt_idmap, inode, MAY_READ); if (error) return error; } diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 64e6ddcfe329..05d4414d0c33 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/slab.h> diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 95d0611c5fc7..389308efe854 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7247,7 +7247,7 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler, } static int ocfs2_xattr_security_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -7320,7 +7320,7 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, } static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -7351,7 +7351,7 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler, } static int ocfs2_xattr_user_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index c219f91f44e9..82cf7e9a665f 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -279,13 +279,13 @@ out_free_inode: return err; } -static int omfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int omfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { 
return omfs_add_node(dir, dentry, mode | S_IFDIR); } -static int omfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int omfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return omfs_add_node(dir, dentry, mode | S_IFREG); @@ -370,7 +370,7 @@ static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx, return true; } -static int omfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int omfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 3a5b4b88a583..0101f1f87b56 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -337,13 +337,13 @@ const struct file_operations omfs_file_operations = { .splice_read = generic_file_splice_read, }; -static int omfs_setattr(struct user_namespace *mnt_userns, +static int omfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -356,7 +356,7 @@ static int omfs_setattr(struct user_namespace *mnt_userns, omfs_truncate(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 2a0e83236c01..c4c79e07efc7 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -48,7 +48,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode) goto fail; inode->i_ino = new_block; - inode_init_owner(&init_user_ns, inode, NULL, mode); + inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_mapping->a_ops = &omfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); diff --git a/fs/open.c b/fs/open.c index 
82c1a28b3308..8038cf652583 100644 --- a/fs/open.c +++ b/fs/open.c @@ -33,10 +33,11 @@ #include <linux/dnotify.h> #include <linux/compat.h> #include <linux/mnt_idmapping.h> +#include <linux/filelock.h> #include "internal.h" -int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, +int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { int ret; @@ -54,7 +55,7 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Remove suid, sgid, and file capabilities on truncate too */ - ret = dentry_needs_remove_privs(mnt_userns, dentry); + ret = dentry_needs_remove_privs(idmap, dentry); if (ret < 0) return ret; if (ret) @@ -62,14 +63,14 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ - ret = notify_change(mnt_userns, dentry, &newattrs, NULL); + ret = notify_change(idmap, dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } long vfs_truncate(const struct path *path, loff_t length) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct inode *inode; long error; @@ -85,8 +86,8 @@ long vfs_truncate(const struct path *path, loff_t length) if (error) goto out; - mnt_userns = mnt_user_ns(path->mnt); - error = inode_permission(mnt_userns, inode, MAY_WRITE); + idmap = mnt_idmap(path->mnt); + error = inode_permission(idmap, inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; @@ -108,7 +109,7 @@ long vfs_truncate(const struct path *path, loff_t length) error = security_path_truncate(path); if (!error) - error = do_truncate(mnt_userns, path->dentry, length, 0, NULL); + error = do_truncate(idmap, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); @@ -190,7 +191,7 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small) sb_start_write(inode->i_sb); error = 
security_file_truncate(f.file); if (!error) - error = do_truncate(file_mnt_user_ns(f.file), dentry, length, + error = do_truncate(file_mnt_idmap(f.file), dentry, length, ATTR_MTIME | ATTR_CTIME, f.file); sb_end_write(inode->i_sb); out_putf: @@ -459,7 +460,7 @@ retry: goto out_path_release; } - res = inode_permission(mnt_user_ns(path.mnt), inode, mode | MAY_ACCESS); + res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; @@ -603,7 +604,7 @@ retry_deleg: goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(mnt_user_ns(path->mnt), path->dentry, + error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); @@ -701,7 +702,8 @@ static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) int chown_common(const struct path *path, uid_t user, gid_t group) { - struct user_namespace *mnt_userns, *fs_userns; + struct mnt_idmap *idmap; + struct user_namespace *fs_userns; struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; int error; @@ -712,7 +714,7 @@ int chown_common(const struct path *path, uid_t user, gid_t group) uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); - mnt_userns = mnt_user_ns(path->mnt); + idmap = mnt_idmap(path->mnt); fs_userns = i_user_ns(inode); retry_deleg: @@ -726,14 +728,14 @@ retry_deleg: inode_lock(inode); if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | - setattr_should_drop_sgid(mnt_userns, inode); + setattr_should_drop_sgid(idmap, inode); /* Continue to send actual fs values, not the mount values. 
*/ error = security_path_chown( path, - from_vfsuid(mnt_userns, fs_userns, newattrs.ia_vfsuid), - from_vfsgid(mnt_userns, fs_userns, newattrs.ia_vfsgid)); + from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid), + from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid)); if (!error) - error = notify_change(mnt_userns, path->dentry, &newattrs, + error = notify_change(idmap, path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (delegated_inode) { @@ -870,7 +872,7 @@ static int do_dentry_open(struct file *f, if (error) goto cleanup_all; - error = break_lease(locks_inode(f), f->f_flags); + error = break_lease(file_inode(f), f->f_flags); if (error) goto cleanup_all; @@ -1064,7 +1066,7 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode, if (IS_ERR(f)) return f; - error = vfs_create(mnt_user_ns(path->mnt), + error = vfs_create(mnt_idmap(path->mnt), d_inode(path->dentry->d_parent), path->dentry, mode, true); if (!error) @@ -1411,8 +1413,9 @@ int filp_close(struct file *filp, fl_owner_t id) { int retval = 0; - if (!file_count(filp)) { - printk(KERN_ERR "VFS: Close: file count is 0\n"); + if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, + "VFS: Close: file count is 0 (f_op=%ps)", + filp->f_op)) { return 0; } diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index c5da2091cefb..5aefb705bcc8 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -118,7 +118,7 @@ out: return error; } -int orangefs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int orangefs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int error; @@ -136,7 +136,7 @@ int orangefs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, * and "mode" to the new desired value. It is up to * us to propagate the new mode back to the server... 
*/ - error = posix_acl_update_mode(&init_user_ns, inode, + error = posix_acl_update_mode(&nop_mnt_idmap, inode, &iattr.ia_mode, &acl); if (error) { gossip_err("%s: posix_acl_update_mode err: %d\n", diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 167fa43b24f9..4ecb91a9bbeb 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -14,6 +14,7 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/pagemap.h> static int flush_racache(struct inode *inode) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 4df560894386..11e21a0e65ce 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -822,7 +822,7 @@ again: ORANGEFS_I(inode)->attr_uid = current_fsuid(); ORANGEFS_I(inode)->attr_gid = current_fsgid(); } - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); spin_unlock(&inode->i_lock); mark_inode_dirty(inode); @@ -839,20 +839,20 @@ int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr) ret = __orangefs_setattr(inode, iattr); /* change mode on a file that has ACLs */ if (!ret && (iattr->ia_valid & ATTR_MODE)) - ret = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + ret = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); return ret; } /* * Change attributes of an object referenced by dentry. 
*/ -int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int orangefs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int ret; gossip_debug(GOSSIP_INODE_DEBUG, "__orangefs_setattr: called on %pd\n", dentry); - ret = setattr_prepare(&init_user_ns, dentry, iattr); + ret = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (ret) goto out; ret = __orangefs_setattr_mode(dentry, iattr); @@ -866,7 +866,7 @@ out: /* * Obtain attributes of an object given a dentry */ -int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { int ret; @@ -879,7 +879,7 @@ int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, ret = orangefs_inode_getattr(inode, request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0); if (ret == 0) { - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); /* override block size reported to stat */ if (!(request_mask & STATX_SIZE)) @@ -890,7 +890,7 @@ int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, return ret; } -int orangefs_permission(struct user_namespace *mnt_userns, +int orangefs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; @@ -905,7 +905,7 @@ int orangefs_permission(struct user_namespace *mnt_userns, if (ret < 0) return ret; - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags) @@ -944,7 +944,7 @@ static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -static int orangefs_fileattr_set(struct user_namespace *mnt_userns, +static int orangefs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { u64 val = 0; diff 
--git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 75c1a3dcf68c..77518e248cf7 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -15,7 +15,7 @@ /* * Get a newly allocated inode to go with a negative dentry. */ -static int orangefs_create(struct user_namespace *mnt_userns, +static int orangefs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, @@ -216,7 +216,7 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry) return ret; } -static int orangefs_symlink(struct user_namespace *mnt_userns, +static int orangefs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) @@ -305,7 +305,7 @@ out: return ret; } -static int orangefs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct orangefs_inode_s *parent = ORANGEFS_I(dir); @@ -375,7 +375,7 @@ out: return ret; } -static int orangefs_rename(struct user_namespace *mnt_userns, +static int orangefs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 6e0cc01b3a14..ce20d3443869 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -106,7 +106,7 @@ enum orangefs_vfs_op_states { extern const struct xattr_handler *orangefs_xattr_handlers[]; extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu); -extern int orangefs_set_acl(struct user_namespace *mnt_userns, +extern int orangefs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type); @@ -362,12 +362,12 @@ struct inode *orangefs_new_inode(struct super_block *sb, int __orangefs_setattr(struct inode *, struct iattr *); int __orangefs_setattr_mode(struct 
dentry *dentry, struct iattr *iattr); -int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *); +int orangefs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); -int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); -int orangefs_permission(struct user_namespace *mnt_userns, +int orangefs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int orangefs_update_time(struct inode *, struct timespec64 *, int); diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 9a5b757fbd2f..6ecad4f94ae6 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -526,7 +526,7 @@ out_unlock: } static int orangefs_xattr_set_default(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index f61e37f4c8ff..fc25fb95d5fc 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -641,7 +641,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, inode->i_state |= I_CREATING; spin_unlock(&inode->i_lock); - inode_init_owner(&init_user_ns, inode, dentry->d_parent->d_inode, mode); + inode_init_owner(&nop_mnt_idmap, inode, dentry->d_parent->d_inode, mode); attr.mode = inode->i_mode; err = ovl_create_or_link(dentry, inode, &attr, false); @@ -655,19 +655,19 @@ out: return err; } -static int ovl_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ovl_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); } -static int ovl_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ovl_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, 
umode_t mode) { return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); } -static int ovl_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ovl_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { /* Don't allow creation of "whiteout" on overlay */ @@ -677,7 +677,7 @@ static int ovl_mknod(struct user_namespace *mnt_userns, struct inode *dir, return ovl_create_object(dentry, mode, rdev, NULL); } -static int ovl_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ovl_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *link) { return ovl_create_object(dentry, S_IFLNK, 0, link); @@ -1075,7 +1075,7 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) return err; } -static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, +static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, struct dentry *old, struct inode *newdir, struct dentry *new, unsigned int flags) { diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index a25bb3453dde..defd4e231ad2 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -392,8 +392,8 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected, */ take_dentry_name_snapshot(&name, real); /* - * No mnt_userns handling here: it's an internal lookup. Could skip - * permission checking altogether, but for now just use non-mnt_userns + * No idmap handling here: it's an internal lookup. Could skip + * permission checking altogether, but for now just use non-idmap * transformed ids. 
*/ this = lookup_one_len(name.name.name, connected, name.name.len); diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index c9d0c362c7ef..7c04f033aadd 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -42,7 +42,7 @@ static struct file *ovl_open_realfile(const struct file *file, { struct inode *realinode = d_inode(realpath->dentry); struct inode *inode = file_inode(file); - struct user_namespace *real_mnt_userns; + struct mnt_idmap *real_idmap; struct file *realfile; const struct cred *old_cred; int flags = file->f_flags | OVL_OPEN_FLAGS; @@ -53,12 +53,12 @@ static struct file *ovl_open_realfile(const struct file *file, acc_mode |= MAY_APPEND; old_cred = ovl_override_creds(inode->i_sb); - real_mnt_userns = mnt_user_ns(realpath->mnt); - err = inode_permission(real_mnt_userns, realinode, MAY_OPEN | acc_mode); + real_idmap = mnt_idmap(realpath->mnt); + err = inode_permission(real_idmap, realinode, MAY_OPEN | acc_mode); if (err) { realfile = ERR_PTR(err); } else { - if (!inode_owner_or_capable(real_mnt_userns, realinode)) + if (!inode_owner_or_capable(real_idmap, realinode)) flags &= ~O_NOATIME; realfile = open_with_fake_path(&file->f_path, flags, realinode, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index ee6dfa577c93..541cf3717fc2 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -19,7 +19,7 @@ #include "overlayfs.h" -int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int err; @@ -28,7 +28,7 @@ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct dentry *upperdentry; const struct cred *old_cred; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) return err; @@ -153,7 +153,7 @@ static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) } } -int ovl_getattr(struct user_namespace *mnt_userns, const 
struct path *path, +int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; @@ -278,7 +278,7 @@ out: return err; } -int ovl_permission(struct user_namespace *mnt_userns, +int ovl_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct inode *upperinode = ovl_inode_upper(inode); @@ -298,7 +298,7 @@ int ovl_permission(struct user_namespace *mnt_userns, * Check overlay inode with the creds of task and underlying inode * with creds of mounter */ - err = generic_permission(&init_user_ns, inode, mask); + err = generic_permission(&nop_mnt_idmap, inode, mask); if (err) return err; @@ -310,7 +310,7 @@ int ovl_permission(struct user_namespace *mnt_userns, /* Make sure mounter can read file for copy up later */ mask |= MAY_READ; } - err = inode_permission(mnt_user_ns(realpath.mnt), realinode, mask); + err = inode_permission(mnt_idmap(realpath.mnt), realinode, mask); revert_creds(old_cred); return err; @@ -361,7 +361,7 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, if (!value && !upperdentry) { ovl_path_lower(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getxattr(mnt_user_ns(realpath.mnt), realdentry, name, NULL, 0); + err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0); revert_creds(old_cred); if (err < 0) goto out_drop_write; @@ -403,7 +403,7 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, ovl_i_path_real(inode, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - res = vfs_getxattr(mnt_user_ns(realpath.mnt), realpath.dentry, name, value, size); + res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size); revert_creds(old_cred); return res; } @@ -463,7 +463,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) * alter the POSIX ACLs for the underlying filesystem. 
*/ static void ovl_idmap_posix_acl(const struct inode *realinode, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct posix_acl *acl) { struct user_namespace *fs_userns = i_user_ns(realinode); @@ -475,11 +475,11 @@ static void ovl_idmap_posix_acl(const struct inode *realinode, struct posix_acl_entry *e = &acl->a_entries[i]; switch (e->e_tag) { case ACL_USER: - vfsuid = make_vfsuid(mnt_userns, fs_userns, e->e_uid); + vfsuid = make_vfsuid(idmap, fs_userns, e->e_uid); e->e_uid = vfsuid_into_kuid(vfsuid); break; case ACL_GROUP: - vfsgid = make_vfsgid(mnt_userns, fs_userns, e->e_gid); + vfsgid = make_vfsgid(idmap, fs_userns, e->e_gid); e->e_gid = vfsgid_into_kgid(vfsgid); break; } @@ -514,15 +514,15 @@ struct posix_acl *ovl_get_acl_path(const struct path *path, const char *acl_name, bool noperm) { struct posix_acl *real_acl, *clone; - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct inode *realinode = d_inode(path->dentry); - mnt_userns = mnt_user_ns(path->mnt); + idmap = mnt_idmap(path->mnt); if (noperm) real_acl = get_inode_acl(realinode, posix_acl_type(acl_name)); else - real_acl = vfs_get_acl(mnt_userns, path->dentry, acl_name); + real_acl = vfs_get_acl(idmap, path->dentry, acl_name); if (IS_ERR_OR_NULL(real_acl)) return real_acl; @@ -540,7 +540,7 @@ struct posix_acl *ovl_get_acl_path(const struct path *path, if (!clone) return ERR_PTR(-ENOMEM); - ovl_idmap_posix_acl(realinode, mnt_userns, clone); + ovl_idmap_posix_acl(realinode, idmap, clone); return clone; } @@ -555,7 +555,7 @@ struct posix_acl *ovl_get_acl_path(const struct path *path, * * This is obviously only relevant when idmapped layers are used. 
*/ -struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap, struct inode *inode, int type, bool rcu, bool noperm) { @@ -618,7 +618,7 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, ovl_path_lower(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - real_acl = vfs_get_acl(mnt_user_ns(realpath.mnt), realdentry, + real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry, acl_name); revert_creds(old_cred); if (IS_ERR(real_acl)) { @@ -651,7 +651,7 @@ out_drop_write: return err; } -int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int err; @@ -665,7 +665,7 @@ int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, return -EOPNOTSUPP; if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EPERM; /* @@ -674,10 +674,10 @@ int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, */ if (unlikely(inode->i_mode & S_ISGID) && type == ACL_TYPE_ACCESS && !in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) { + !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) { struct iattr iattr = { .ia_valid = ATTR_KILL_SGID }; - err = ovl_setattr(&init_user_ns, dentry, &iattr); + err = ovl_setattr(&nop_mnt_idmap, dentry, &iattr); if (err) return err; } @@ -755,10 +755,10 @@ int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa) if (err) return err; - return vfs_fileattr_set(mnt_user_ns(realpath->mnt), realpath->dentry, fa); + return vfs_fileattr_set(mnt_idmap(realpath->mnt), realpath->dentry, fa); } -int ovl_fileattr_set(struct user_namespace *mnt_userns, +int ovl_fileattr_set(struct mnt_idmap *idmap, struct 
dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 46753134533a..cfb3420b7df0 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -204,7 +204,7 @@ static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, struct dentry *base, int len, bool drop_negative) { - struct dentry *ret = lookup_one_unlocked(mnt_user_ns(d->mnt), name, base, len); + struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->mnt), name, base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { if (drop_negative && ret->d_lockref.count == 1) { @@ -711,7 +711,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, if (err) return ERR_PTR(err); - index = lookup_one_positive_unlocked(ovl_upper_mnt_userns(ofs), name.name, + index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), name.name, ofs->indexdir, name.len); if (IS_ERR(index)) { err = PTR_ERR(index); @@ -1182,7 +1182,7 @@ bool ovl_lower_positive(struct dentry *dentry) struct dentry *this; struct dentry *lowerdir = poe->lowerstack[i].dentry; - this = lookup_one_positive_unlocked(mnt_user_ns(poe->lowerstack[i].layer->mnt), + this = lookup_one_positive_unlocked(mnt_idmap(poe->lowerstack[i].layer->mnt), name->name, lowerdir, name->len); if (IS_ERR(this)) { switch (PTR_ERR(this)) { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 1df7f850ff3b..4d0b278f5630 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -141,13 +141,13 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs, struct dentry *upperdentry, struct iattr *attr) { - return notify_change(ovl_upper_mnt_userns(ofs), upperdentry, attr, NULL); + return notify_change(ovl_upper_mnt_idmap(ofs), upperdentry, attr, NULL); } static inline int ovl_do_rmdir(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { - int err = vfs_rmdir(ovl_upper_mnt_userns(ofs), dir, 
dentry); + int err = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry); pr_debug("rmdir(%pd2) = %i\n", dentry, err); return err; @@ -156,7 +156,7 @@ static inline int ovl_do_rmdir(struct ovl_fs *ofs, static inline int ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { - int err = vfs_unlink(ovl_upper_mnt_userns(ofs), dir, dentry, NULL); + int err = vfs_unlink(ovl_upper_mnt_idmap(ofs), dir, dentry, NULL); pr_debug("unlink(%pd2) = %i\n", dentry, err); return err; @@ -165,7 +165,8 @@ static inline int ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir, static inline int ovl_do_link(struct ovl_fs *ofs, struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { - int err = vfs_link(old_dentry, ovl_upper_mnt_userns(ofs), dir, new_dentry, NULL); + int err = vfs_link(old_dentry, ovl_upper_mnt_idmap(ofs), dir, + new_dentry, NULL); pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err); return err; @@ -175,7 +176,7 @@ static inline int ovl_do_create(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_create(ovl_upper_mnt_userns(ofs), dir, dentry, mode, true); + int err = vfs_create(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, true); pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); return err; @@ -185,7 +186,7 @@ static inline int ovl_do_mkdir(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_mkdir(ovl_upper_mnt_userns(ofs), dir, dentry, mode); + int err = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } @@ -194,7 +195,7 @@ static inline int ovl_do_mknod(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - int err = vfs_mknod(ovl_upper_mnt_userns(ofs), dir, dentry, mode, dev); + int err = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev); pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err); 
return err; @@ -204,7 +205,7 @@ static inline int ovl_do_symlink(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, const char *oldname) { - int err = vfs_symlink(ovl_upper_mnt_userns(ofs), dir, dentry, oldname); + int err = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname); pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); return err; @@ -217,7 +218,7 @@ static inline ssize_t ovl_do_getxattr(const struct path *path, const char *name, WARN_ON(path->dentry->d_sb != path->mnt->mnt_sb); - err = vfs_getxattr(mnt_user_ns(path->mnt), path->dentry, + err = vfs_getxattr(mnt_idmap(path->mnt), path->dentry, name, value, size); len = (value && err > 0) ? err : 0; @@ -251,7 +252,7 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, + int err = vfs_setxattr(ovl_upper_mnt_idmap(ofs), dentry, name, value, size, flags); pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n", @@ -269,7 +270,7 @@ static inline int ovl_setxattr(struct ovl_fs *ofs, struct dentry *dentry, static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry, const char *name) { - int err = vfs_removexattr(ovl_upper_mnt_userns(ofs), dentry, name); + int err = vfs_removexattr(ovl_upper_mnt_idmap(ofs), dentry, name); pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); return err; } @@ -283,13 +284,13 @@ static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry, static inline int ovl_do_set_acl(struct ovl_fs *ofs, struct dentry *dentry, const char *acl_name, struct posix_acl *acl) { - return vfs_set_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name, acl); + return vfs_set_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name, acl); } static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry, const char *acl_name) { - return 
vfs_remove_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name); + return vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name); } static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, @@ -298,10 +299,10 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, { int err; struct renamedata rd = { - .old_mnt_userns = ovl_upper_mnt_userns(ofs), + .old_mnt_idmap = ovl_upper_mnt_idmap(ofs), .old_dir = olddir, .old_dentry = olddentry, - .new_mnt_userns = ovl_upper_mnt_userns(ofs), + .new_mnt_idmap = ovl_upper_mnt_idmap(ofs), .new_dir = newdir, .new_dentry = newdentry, .flags = flags, @@ -319,7 +320,7 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, static inline int ovl_do_whiteout(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { - int err = vfs_whiteout(ovl_upper_mnt_userns(ofs), dir, dentry); + int err = vfs_whiteout(ovl_upper_mnt_idmap(ofs), dir, dentry); pr_debug("whiteout(%pd2) = %i\n", dentry, err); return err; } @@ -328,7 +329,7 @@ static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs, struct dentry *dentry, umode_t mode) { struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry }; - struct file *file = vfs_tmpfile_open(ovl_upper_mnt_userns(ofs), &path, mode, + struct file *file = vfs_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path, mode, O_LARGEFILE | O_WRONLY, current_cred()); int err = PTR_ERR_OR_ZERO(file); @@ -340,7 +341,7 @@ static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs, const char *name, struct dentry *base, int len) { - return lookup_one(ovl_upper_mnt_userns(ofs), name, base, len); + return lookup_one(ovl_upper_mnt_idmap(ofs), name, base, len); } static inline bool ovl_open_flags_need_copy_up(int flags) @@ -596,11 +597,11 @@ int ovl_set_nlink_lower(struct dentry *dentry); unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, struct dentry *upperdentry, unsigned int fallback); -int ovl_setattr(struct user_namespace 
*mnt_userns, struct dentry *dentry, +int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); -int ovl_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ovl_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags); @@ -609,20 +610,20 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); #ifdef CONFIG_FS_POSIX_ACL -struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap, struct inode *inode, int type, bool rcu, bool noperm); static inline struct posix_acl *ovl_get_inode_acl(struct inode *inode, int type, bool rcu) { - return do_ovl_get_acl(&init_user_ns, inode, type, rcu, true); + return do_ovl_get_acl(&nop_mnt_idmap, inode, type, rcu, true); } -static inline struct posix_acl *ovl_get_acl(struct user_namespace *mnt_userns, +static inline struct posix_acl *ovl_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { - return do_ovl_get_acl(mnt_userns, d_inode(dentry), type, false, false); + return do_ovl_get_acl(idmap, d_inode(dentry), type, false, false); } -int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); struct posix_acl *ovl_get_acl_path(const struct path *path, const char *acl_name, bool noperm); @@ -717,7 +718,7 @@ void ovl_aio_request_cache_destroy(void); int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa); int ovl_real_fileattr_set(const struct path 
*realpath, struct fileattr *fa); int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int ovl_fileattr_set(struct user_namespace *mnt_userns, +int ovl_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); /* copy_up.c */ diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index e1af8f660698..fd11fe6d6d45 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -90,9 +90,9 @@ static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs) return ofs->layers[0].mnt; } -static inline struct user_namespace *ovl_upper_mnt_userns(struct ovl_fs *ofs) +static inline struct mnt_idmap *ovl_upper_mnt_idmap(struct ovl_fs *ofs) { - return mnt_user_ns(ovl_upper_mnt(ofs)); + return mnt_idmap(ovl_upper_mnt(ofs)); } static inline struct ovl_fs *OVL_FS(struct super_block *sb) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 8cd2b9947de1..b6952b21a7ee 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -278,7 +278,7 @@ static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data while (rdd->first_maybe_whiteout) { p = rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p->next_maybe_whiteout; - dentry = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len); + dentry = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); if (!IS_ERR(dentry)) { p->is_whiteout = ovl_is_whiteout(dentry); dput(dentry); @@ -480,7 +480,7 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry goto get; } } - this = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len); + this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); if (IS_ERR_OR_NULL(this) || !this->d_inode) { /* Mark a stale entry */ p->is_whiteout = true; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 85b891152a2c..f1d9f75f8786 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1012,7 +1012,7 @@ static int ovl_own_xattr_get(const struct 
xattr_handler *handler, } static int ovl_own_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -1028,7 +1028,7 @@ static int ovl_other_xattr_get(const struct xattr_handler *handler, } static int ovl_other_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index bde291623c8c..923d66d131c1 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -491,7 +491,7 @@ bool ovl_is_whiteout(struct dentry *dentry) struct file *ovl_path_open(const struct path *path, int flags) { struct inode *inode = d_inode(path->dentry); - struct user_namespace *real_mnt_userns = mnt_user_ns(path->mnt); + struct mnt_idmap *real_idmap = mnt_idmap(path->mnt); int err, acc_mode; if (flags & ~(O_ACCMODE | O_LARGEFILE)) @@ -508,12 +508,12 @@ struct file *ovl_path_open(const struct path *path, int flags) BUG(); } - err = inode_permission(real_mnt_userns, inode, acc_mode | MAY_OPEN); + err = inode_permission(real_idmap, inode, acc_mode | MAY_OPEN); if (err) return ERR_PTR(err); /* O_NOATIME is an optimization, don't fail if not permitted */ - if (inode_owner_or_capable(real_mnt_userns, inode)) + if (inode_owner_or_capable(real_idmap, inode)) flags |= O_NOATIME; return dentry_open(path, flags, current_cred()); @@ -1101,16 +1101,16 @@ void ovl_copyattr(struct inode *inode) { struct path realpath; struct inode *realinode; - struct user_namespace *real_mnt_userns; + struct mnt_idmap *real_idmap; vfsuid_t vfsuid; vfsgid_t vfsgid; ovl_i_path_real(inode, &realpath); realinode = d_inode(realpath.dentry); - real_mnt_userns = mnt_user_ns(realpath.mnt); + real_idmap = mnt_idmap(realpath.mnt); - vfsuid = 
i_uid_into_vfsuid(real_mnt_userns, realinode); - vfsgid = i_gid_into_vfsgid(real_mnt_userns, realinode); + vfsuid = i_uid_into_vfsuid(real_idmap, realinode); + vfsgid = i_gid_into_vfsgid(real_idmap, realinode); inode->i_uid = vfsuid_into_kuid(vfsuid); inode->i_gid = vfsgid_into_kgid(vfsgid); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index d7bc81fc0840..5a76fb35923a 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -28,6 +28,7 @@ #include <linux/security.h> #include <linux/evm.h> #include <linux/fsnotify.h> +#include <linux/filelock.h> #include "internal.h" @@ -111,7 +112,7 @@ void forget_all_cached_acls(struct inode *inode) } EXPORT_SYMBOL(forget_all_cached_acls); -static struct posix_acl *__get_acl(struct user_namespace *mnt_userns, +static struct posix_acl *__get_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, int type) { @@ -154,7 +155,7 @@ static struct posix_acl *__get_acl(struct user_namespace *mnt_userns, * we'll just create the negative cache entry. */ if (dentry && inode->i_op->get_acl) { - acl = inode->i_op->get_acl(mnt_userns, dentry, type); + acl = inode->i_op->get_acl(idmap, dentry, type); } else if (inode->i_op->get_inode_acl) { acl = inode->i_op->get_inode_acl(inode, type, false); } else { @@ -174,14 +175,14 @@ static struct posix_acl *__get_acl(struct user_namespace *mnt_userns, * Cache the result, but only if our sentinel is still in place. */ posix_acl_dup(acl); - if (unlikely(cmpxchg(p, sentinel, acl) != sentinel)) + if (unlikely(!try_cmpxchg(p, &sentinel, acl))) posix_acl_release(acl); return acl; } struct posix_acl *get_inode_acl(struct inode *inode, int type) { - return __get_acl(&init_user_ns, NULL, inode, type); + return __get_acl(&nop_mnt_idmap, NULL, inode, type); } EXPORT_SYMBOL(get_inode_acl); @@ -372,7 +373,7 @@ EXPORT_SYMBOL(posix_acl_from_mode); * by the acl. Returns -E... otherwise. 
*/ int -posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, +posix_acl_permission(struct mnt_idmap *idmap, struct inode *inode, const struct posix_acl *acl, int want) { const struct posix_acl_entry *pa, *pe, *mask_obj; @@ -387,18 +388,18 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, switch(pa->e_tag) { case ACL_USER_OBJ: /* (May have been checked already) */ - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto check_perm; break; case ACL_USER: - vfsuid = make_vfsuid(mnt_userns, fs_userns, + vfsuid = make_vfsuid(idmap, fs_userns, pa->e_uid); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto mask; break; case ACL_GROUP_OBJ: - vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) @@ -406,7 +407,7 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, } break; case ACL_GROUP: - vfsgid = make_vfsgid(mnt_userns, fs_userns, + vfsgid = make_vfsgid(idmap, fs_userns, pa->e_gid); if (vfsgid_in_group_p(vfsgid)) { found = 1; @@ -591,18 +592,18 @@ EXPORT_SYMBOL(__posix_acl_chmod); /** * posix_acl_chmod - chmod a posix acl * - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @dentry: dentry to check permissions on * @mode: the new mode of @inode * - * If the dentry has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the dentry has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. 
On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply passs @nop_mnt_idmap. */ int - posix_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry, + posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) { struct inode *inode = d_inode(dentry); @@ -624,7 +625,7 @@ int ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) return ret; - ret = inode->i_op->set_acl(mnt_userns, dentry, acl, ACL_TYPE_ACCESS); + ret = inode->i_op->set_acl(idmap, dentry, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); return ret; } @@ -683,7 +684,7 @@ EXPORT_SYMBOL_GPL(posix_acl_create); /** * posix_acl_update_mode - update mode in set_acl - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: target inode * @mode_p: mode (pointer) for update * @acl: acl pointer @@ -695,15 +696,15 @@ EXPORT_SYMBOL_GPL(posix_acl_create); * As with chmod, clear the setgid bit if the caller is not in the owning group * or capable of CAP_FSETID (see inode_change_ok). * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply passs @nop_mnt_idmap. * * Called from set_acl inode operations. 
*/ -int posix_acl_update_mode(struct user_namespace *mnt_userns, +int posix_acl_update_mode(struct mnt_idmap *idmap, struct inode *inode, umode_t *mode_p, struct posix_acl **acl) { @@ -715,8 +716,8 @@ int posix_acl_update_mode(struct user_namespace *mnt_userns, return error; if (error == 0) *acl = NULL; - if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)) && + !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; return 0; @@ -893,7 +894,6 @@ static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap, struct posix_acl_xattr_header *ext_acl = buffer; struct posix_acl_xattr_entry *ext_entry; struct user_namespace *fs_userns, *caller_userns; - struct user_namespace *mnt_userns; ssize_t real_size, n; vfsuid_t vfsuid; vfsgid_t vfsgid; @@ -909,19 +909,18 @@ static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap, fs_userns = i_user_ns(inode); caller_userns = current_user_ns(); - mnt_userns = mnt_idmap_owner(idmap); for (n=0; n < acl->a_count; n++, ext_entry++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); switch(acl_e->e_tag) { case ACL_USER: - vfsuid = make_vfsuid(mnt_userns, fs_userns, acl_e->e_uid); + vfsuid = make_vfsuid(idmap, fs_userns, acl_e->e_uid); ext_entry->e_id = cpu_to_le32(from_kuid( caller_userns, vfsuid_into_kuid(vfsuid))); break; case ACL_GROUP: - vfsgid = make_vfsgid(mnt_userns, fs_userns, acl_e->e_gid); + vfsgid = make_vfsgid(idmap, fs_userns, acl_e->e_gid); ext_entry->e_id = cpu_to_le32(from_kgid( caller_userns, vfsgid_into_kgid(vfsgid))); break; @@ -934,7 +933,7 @@ static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap, } int -set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +set_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry, int 
type, struct posix_acl *acl) { struct inode *inode = d_inode(dentry); @@ -946,7 +945,7 @@ set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (acl) { @@ -954,7 +953,7 @@ set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (ret) return ret; } - return inode->i_op->set_acl(mnt_userns, dentry, acl, type); + return inode->i_op->set_acl(idmap, dentry, acl, type); } EXPORT_SYMBOL(set_posix_acl); @@ -978,14 +977,14 @@ const struct xattr_handler posix_acl_default_xattr_handler = { }; EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); -int simple_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int simple_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int error; struct inode *inode = d_inode(dentry); if (type == ACL_TYPE_ACCESS) { - error = posix_acl_update_mode(mnt_userns, inode, + error = posix_acl_update_mode(idmap, inode, &inode->i_mode, &acl); if (error) return error; @@ -1017,7 +1016,7 @@ int simple_acl_create(struct inode *dir, struct inode *inode) return 0; } -static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns, +static int vfs_set_acl_idmapped_mnt(struct mnt_idmap *idmap, struct user_namespace *fs_userns, struct posix_acl *acl) { @@ -1026,11 +1025,11 @@ static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns, switch (acl_e->e_tag) { case ACL_USER: - acl_e->e_uid = from_vfsuid(mnt_userns, fs_userns, + acl_e->e_uid = from_vfsuid(idmap, fs_userns, VFSUIDT_INIT(acl_e->e_uid)); break; case ACL_GROUP: - acl_e->e_gid = from_vfsgid(mnt_userns, fs_userns, + acl_e->e_gid = from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(acl_e->e_gid)); break; } @@ -1041,7 +1040,7 @@ static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns, /** * 
vfs_set_acl - set posix acls - * @mnt_userns: user namespace of the mount + * @idmap: idmap of the mount * @dentry: the dentry based on which to set the posix acls * @acl_name: the name of the posix acl * @kacl: the posix acls in the appropriate VFS format @@ -1051,7 +1050,7 @@ static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns, * * Return: On success 0, on error negative errno. */ -int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { int acl_type; @@ -1071,7 +1070,7 @@ int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, * if this is a filesystem with a backing store - ultimately * translate them to backing store values. */ - error = vfs_set_acl_idmapped_mnt(mnt_userns, i_user_ns(inode), kacl); + error = vfs_set_acl_idmapped_mnt(idmap, i_user_ns(inode), kacl); if (error) return error; } @@ -1083,11 +1082,11 @@ retry_deleg: * We only care about restrictions the inode struct itself places upon * us otherwise POSIX ACLs aren't subject to any VFS restrictions. 
*/ - error = may_write_xattr(mnt_userns, inode); + error = may_write_xattr(idmap, inode); if (error) goto out_inode_unlock; - error = security_inode_set_acl(mnt_userns, dentry, acl_name, kacl); + error = security_inode_set_acl(idmap, dentry, acl_name, kacl); if (error) goto out_inode_unlock; @@ -1096,7 +1095,7 @@ retry_deleg: goto out_inode_unlock; if (inode->i_opflags & IOP_XATTR) - error = set_posix_acl(mnt_userns, dentry, acl_type, kacl); + error = set_posix_acl(idmap, dentry, acl_type, kacl); else if (unlikely(is_bad_inode(inode))) error = -EIO; else @@ -1121,7 +1120,7 @@ EXPORT_SYMBOL_GPL(vfs_set_acl); /** * vfs_get_acl - get posix acls - * @mnt_userns: user namespace of the mount + * @idmap: idmap of the mount * @dentry: the dentry based on which to retrieve the posix acls * @acl_name: the name of the posix acl * @@ -1130,7 +1129,7 @@ EXPORT_SYMBOL_GPL(vfs_set_acl); * * Return: On success POSIX ACLs in VFS format, on error negative errno. */ -struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { struct inode *inode = d_inode(dentry); @@ -1145,7 +1144,7 @@ struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, * The VFS has no restrictions on reading POSIX ACLs so calling * something like xattr_permission() isn't needed. Only LSMs get a say. 
*/ - error = security_inode_get_acl(mnt_userns, dentry, acl_name); + error = security_inode_get_acl(idmap, dentry, acl_name); if (error) return ERR_PTR(error); @@ -1154,7 +1153,7 @@ struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, if (S_ISLNK(inode->i_mode)) return ERR_PTR(-EOPNOTSUPP); - acl = __get_acl(mnt_userns, dentry, inode, acl_type); + acl = __get_acl(idmap, dentry, inode, acl_type); if (IS_ERR(acl)) return acl; if (!acl) @@ -1166,7 +1165,7 @@ EXPORT_SYMBOL_GPL(vfs_get_acl); /** * vfs_remove_acl - remove posix acls - * @mnt_userns: user namespace of the mount + * @idmap: idmap of the mount * @dentry: the dentry based on which to retrieve the posix acls * @acl_name: the name of the posix acl * @@ -1174,7 +1173,7 @@ EXPORT_SYMBOL_GPL(vfs_get_acl); * * Return: On success 0, on error negative errno. */ -int vfs_remove_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { int acl_type; @@ -1193,11 +1192,11 @@ retry_deleg: * We only care about restrictions the inode struct itself places upon * us otherwise POSIX ACLs aren't subject to any VFS restrictions. 
*/ - error = may_write_xattr(mnt_userns, inode); + error = may_write_xattr(idmap, inode); if (error) goto out_inode_unlock; - error = security_inode_remove_acl(mnt_userns, dentry, acl_name); + error = security_inode_remove_acl(idmap, dentry, acl_name); if (error) goto out_inode_unlock; @@ -1206,14 +1205,14 @@ retry_deleg: goto out_inode_unlock; if (inode->i_opflags & IOP_XATTR) - error = set_posix_acl(mnt_userns, dentry, acl_type, NULL); + error = set_posix_acl(idmap, dentry, acl_type, NULL); else if (unlikely(is_bad_inode(inode))) error = -EIO; else error = -EOPNOTSUPP; if (!error) { fsnotify_xattr(dentry); - evm_inode_post_remove_acl(mnt_userns, dentry, acl_name); + evm_inode_post_remove_acl(idmap, dentry, acl_name); } out_inode_unlock: @@ -1245,7 +1244,7 @@ int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, return PTR_ERR(acl); } - error = vfs_set_acl(mnt_idmap_owner(idmap), dentry, acl_name, acl); + error = vfs_set_acl(idmap, dentry, acl_name, acl); posix_acl_release(acl); return error; } @@ -1256,7 +1255,7 @@ ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, ssize_t error; struct posix_acl *acl; - acl = vfs_get_acl(mnt_idmap_owner(idmap), dentry, acl_name); + acl = vfs_get_acl(idmap, dentry, acl_name); if (IS_ERR(acl)) return PTR_ERR(acl); diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e479d7d202b..5e0e0ccd47aa 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -685,7 +685,7 @@ static bool proc_fd_access_allowed(struct inode *inode) return allowed; } -int proc_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int error; @@ -694,11 +694,11 @@ int proc_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (attr->ia_valid & ATTR_MODE) return -EPERM; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; - 
setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -727,7 +727,7 @@ static bool has_pid_permissions(struct proc_fs_info *fs_info, } -static int proc_pid_permission(struct user_namespace *mnt_userns, +static int proc_pid_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); @@ -753,7 +753,7 @@ static int proc_pid_permission(struct user_namespace *mnt_userns, return -EPERM; } - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } @@ -1959,14 +1959,14 @@ static struct inode *proc_pid_make_base_inode(struct super_block *sb, return inode; } -int pid_getattr(struct user_namespace *mnt_userns, const struct path *path, +int pid_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; @@ -3557,7 +3557,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) * This function makes sure that the node is always accessible for members of * same thread group. 
*/ -static int proc_tid_comm_permission(struct user_namespace *mnt_userns, +static int proc_tid_comm_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { bool is_same_tgroup; @@ -3577,7 +3577,7 @@ static int proc_tid_comm_permission(struct user_namespace *mnt_userns, return 0; } - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } static const struct inode_operations proc_tid_comm_inode_operations = { @@ -3891,13 +3891,13 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) return 0; } -static int proc_task_getattr(struct user_namespace *mnt_userns, +static int proc_task_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct task_struct *p = get_proc_task(inode); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (p) { stat->nlink += get_nr_threads(p); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index fc46d6fe080c..b3140deebbbf 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -12,6 +12,7 @@ #include <linux/file.h> #include <linux/seq_file.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/proc_fs.h> @@ -325,13 +326,13 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). 
*/ -int proc_fd_permission(struct user_namespace *mnt_userns, +int proc_fd_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct task_struct *p; int rv; - rv = generic_permission(&init_user_ns, inode, mask); + rv = generic_permission(&nop_mnt_idmap, inode, mask); if (rv == 0) return rv; @@ -344,14 +345,14 @@ int proc_fd_permission(struct user_namespace *mnt_userns, return rv; } -static int proc_fd_getattr(struct user_namespace *mnt_userns, +static int proc_fd_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); int rv = 0; - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); /* If it's a directory, put the number of open fds there */ if (S_ISDIR(inode->i_mode)) { diff --git a/fs/proc/fd.h b/fs/proc/fd.h index c5a921a06a0b..7e7265f7e06f 100644 --- a/fs/proc/fd.h +++ b/fs/proc/fd.h @@ -10,7 +10,7 @@ extern const struct inode_operations proc_fd_inode_operations; extern const struct file_operations proc_fdinfo_operations; extern const struct inode_operations proc_fdinfo_inode_operations; -extern int proc_fd_permission(struct user_namespace *mnt_userns, +extern int proc_fd_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); static inline unsigned int proc_fd(struct inode *inode) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 587b91d9d998..8379593fa4bb 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -115,18 +115,18 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, return true; } -static int proc_notify_change(struct user_namespace *mnt_userns, +static int proc_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct proc_dir_entry *de = PDE(inode); int error; - error = setattr_prepare(&init_user_ns, dentry, iattr); + error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); 
if (error) return error; - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); proc_set_user(de, inode->i_uid, inode->i_gid); @@ -134,7 +134,7 @@ static int proc_notify_change(struct user_namespace *mnt_userns, return 0; } -static int proc_getattr(struct user_namespace *mnt_userns, +static int proc_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -147,7 +147,7 @@ static int proc_getattr(struct user_namespace *mnt_userns, } } - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); return 0; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index b701d0207edf..9dda7e54b2d0 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -162,9 +162,9 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, * base.c */ extern const struct dentry_operations pid_dentry_operations; -extern int pid_getattr(struct user_namespace *, const struct path *, +extern int pid_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); -extern int proc_setattr(struct user_namespace *, struct dentry *, +extern int proc_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 856839b8ae8b..a0c0419872e3 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -299,7 +299,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, return de; } -static int proc_tgid_net_getattr(struct user_namespace *mnt_userns, +static int proc_tgid_net_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -308,7 +308,7 @@ static int proc_tgid_net_getattr(struct user_namespace *mnt_userns, net 
= get_proc_task_net(inode); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (net != NULL) { stat->nlink = net->proc_net->nlink; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 48f2d60bd78a..e89bd8f1368b 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -798,7 +798,7 @@ out: return 0; } -static int proc_sys_permission(struct user_namespace *mnt_userns, +static int proc_sys_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { /* @@ -827,7 +827,7 @@ static int proc_sys_permission(struct user_namespace *mnt_userns, return error; } -static int proc_sys_setattr(struct user_namespace *mnt_userns, +static int proc_sys_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -836,16 +836,16 @@ static int proc_sys_setattr(struct user_namespace *mnt_userns, if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) return -EPERM; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } -static int proc_sys_getattr(struct user_namespace *mnt_userns, +static int proc_sys_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -856,7 +856,7 @@ static int proc_sys_getattr(struct user_namespace *mnt_userns, if (IS_ERR(head)) return PTR_ERR(head); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (table) stat->mode = (stat->mode & S_IFMT) | table->mode; diff --git a/fs/proc/root.c b/fs/proc/root.c index 3c2ee3eb1138..a86e65a608da 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -310,11 +310,11 @@ void __init proc_root_init(void) register_filesystem(&proc_fs_type); } -static int 
proc_root_getattr(struct user_namespace *mnt_userns, +static int proc_root_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e35a0398db63..af1c49ae11b1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -745,9 +745,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, page = pfn_swap_entry_to_page(swpent); } if (page) { - int mapcount = page_mapcount(page); - - if (mapcount >= 2) + if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte)) mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); else mss->private_hugetlb += huge_page_size(hstate_vma(vma)); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index f27faf5db554..a6357f728034 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2085,7 +2085,7 @@ EXPORT_SYMBOL(__dquot_transfer); /* Wrapper for transferring ownership of an inode for uid/gid only * Called from FSXXX_setattr() */ -int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, +int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr) { struct dquot *transfer_to[MAXQUOTAS] = {}; @@ -2096,8 +2096,8 @@ int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, if (!dquot_active(inode)) return 0; - if (i_uid_needs_update(mnt_userns, iattr, inode)) { - kuid_t kuid = from_vfsuid(mnt_userns, i_user_ns(inode), + if (i_uid_needs_update(idmap, iattr, inode)) { + kuid_t kuid = from_vfsuid(idmap, i_user_ns(inode), iattr->ia_vfsuid); dquot = dqget(sb, make_kqid_uid(kuid)); @@ -2110,8 +2110,8 @@ int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, } transfer_to[USRQUOTA] = dquot; } - if (i_gid_needs_update(mnt_userns, iattr, inode)) { - 
kgid_t kgid = from_vfsgid(mnt_userns, i_user_ns(inode), + if (i_gid_needs_update(idmap, iattr, inode)) { + kgid_t kgid = from_vfsgid(idmap, i_user_ns(inode), iattr->ia_vfsgid); dquot = dqget(sb, make_kqid_gid(kgid)); diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index cb240eac5036..5bf74c2f6042 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -22,7 +22,7 @@ #include <linux/uaccess.h> #include "internal.h" -static int ramfs_nommu_setattr(struct user_namespace *, struct dentry *, struct iattr *); +static int ramfs_nommu_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, @@ -158,7 +158,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) * handle a change of attributes * - we're specifically interested in a change of size */ -static int ramfs_nommu_setattr(struct user_namespace *mnt_userns, +static int ramfs_nommu_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *ia) { struct inode *inode = d_inode(dentry); @@ -166,7 +166,7 @@ static int ramfs_nommu_setattr(struct user_namespace *mnt_userns, int ret = 0; /* POSIX UID/GID verification for setting inode attributes */ - ret = setattr_prepare(&init_user_ns, dentry, ia); + ret = setattr_prepare(&nop_mnt_idmap, dentry, ia); if (ret) return ret; @@ -186,7 +186,7 @@ static int ramfs_nommu_setattr(struct user_namespace *mnt_userns, } } - setattr_copy(&init_user_ns, inode, ia); + setattr_copy(&nop_mnt_idmap, inode, ia); out: ia->ia_valid = old_ia_valid; return ret; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index b3257e852820..5ba580c78835 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -61,7 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, if (inode) { inode->i_ino = get_next_ino(); - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); 
inode->i_mapping->a_ops = &ram_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); @@ -95,7 +95,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, */ /* SMP-safe */ static int -ramfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev); @@ -110,22 +110,22 @@ ramfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, return error; } -static int ramfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - int retval = ramfs_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0); + int retval = ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); return retval; } -static int ramfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ramfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return ramfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); + return ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); } -static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct inode *inode; @@ -145,7 +145,7 @@ static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, return error; } -static int ramfs_tmpfile(struct user_namespace *mnt_userns, +static int ramfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h index 29c503a06db4..2571b1a8be84 100644 --- a/fs/reiserfs/acl.h +++ b/fs/reiserfs/acl.h @@ -49,7 +49,7 @@ static inline int 
reiserfs_acl_count(size_t size) #ifdef CONFIG_REISERFS_FS_POSIX_ACL struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu); -int reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int reiserfs_acl_chmod(struct dentry *dentry); int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index c7d1fa526dea..d54cab854f60 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3262,21 +3262,21 @@ static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return ret; } -int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); unsigned int ia_valid; int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; /* must be turned off for recursive notify_change calls */ ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - if (is_quota_modification(mnt_userns, inode, attr)) { + if (is_quota_modification(&nop_mnt_idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; @@ -3359,7 +3359,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, reiserfs_write_unlock(inode->i_sb); if (error) goto out; - error = dquot_transfer(mnt_userns, inode, attr); + error = dquot_transfer(&nop_mnt_idmap, inode, attr); reiserfs_write_lock(inode->i_sb); if (error) { journal_end(&th); @@ -3398,7 +3398,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } if (!error) { - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); } diff --git a/fs/reiserfs/ioctl.c 
b/fs/reiserfs/ioctl.c index 4b86ecf5817e..6bf9b54e58ca 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -24,7 +24,7 @@ int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int reiserfs_fileattr_set(struct user_namespace *mnt_userns, +int reiserfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -96,7 +96,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = put_user(inode->i_generation, (int __user *)arg); break; case REISERFS_IOC_SETVERSION: - if (!inode_owner_or_capable(&init_user_ns, inode)) { + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) { err = -EPERM; break; } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 0b8aa99749f1..42d2c20e1345 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -616,11 +616,11 @@ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode) * the quota init calls have to know who to charge the quota to, so * we have to set uid and gid here */ - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); return dquot_initialize(inode); } -static int reiserfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int reiserfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { int retval; @@ -700,7 +700,7 @@ out_failed: return retval; } -static int reiserfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int reiserfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { int retval; @@ -784,7 +784,7 @@ out_failed: return retval; } -static int reiserfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int reiserfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int retval; @@ -1099,7 +1099,7 @@ out_unlink: 
return retval; } -static int reiserfs_symlink(struct user_namespace *mnt_userns, +static int reiserfs_symlink(struct mnt_idmap *idmap, struct inode *parent_dir, struct dentry *dentry, const char *symname) { @@ -1311,7 +1311,7 @@ static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de, * one path. If it holds 2 or more, it can get into endless waiting in * get_empty_nodes or its clones */ -static int reiserfs_rename(struct user_namespace *mnt_userns, +static int reiserfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 3aa928ec527a..98e6f53c2fe0 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -3100,7 +3100,7 @@ static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th, } void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); -int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); @@ -3407,7 +3407,7 @@ __u32 r5_hash(const signed char *msg, int len); /* prototypes from ioctl.c */ int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int reiserfs_fileattr_set(struct user_namespace *mnt_userns, +int reiserfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long reiserfs_compat_ioctl(struct file *filp, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8b2d52443f41..06d810c72c52 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -66,14 +66,14 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { BUG_ON(!inode_is_locked(dir)); - return dir->i_op->create(&init_user_ns, dir, dentry, mode, 
true); + return dir->i_op->create(&nop_mnt_idmap, dir, dentry, mode, true); } #endif static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { BUG_ON(!inode_is_locked(dir)); - return dir->i_op->mkdir(&init_user_ns, dir, dentry, mode); + return dir->i_op->mkdir(&nop_mnt_idmap, dir, dentry, mode); } /* @@ -352,7 +352,7 @@ static int chown_one_xattr(struct dentry *dentry, void *data) * ATTR_MODE is set. */ attrs->ia_valid &= (ATTR_UID|ATTR_GID); - err = reiserfs_setattr(&init_user_ns, dentry, attrs); + err = reiserfs_setattr(&nop_mnt_idmap, dentry, attrs); attrs->ia_valid = ia_valid; return err; @@ -597,7 +597,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR); inode_dio_wait(d_inode(dentry)); - err = reiserfs_setattr(&init_user_ns, dentry, &newattrs); + err = reiserfs_setattr(&nop_mnt_idmap, dentry, &newattrs); inode_unlock(d_inode(dentry)); } else update_ctime(inode); @@ -941,7 +941,7 @@ static int xattr_mount_check(struct super_block *s) return 0; } -int reiserfs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int reiserfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { /* @@ -951,7 +951,7 @@ int reiserfs_permission(struct user_namespace *mnt_userns, struct inode *inode, if (IS_PRIVATE(inode)) return 0; - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags) diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h index e47fde1182de..5868a4e990e3 100644 --- a/fs/reiserfs/xattr.h +++ b/fs/reiserfs/xattr.h @@ -16,7 +16,7 @@ int reiserfs_xattr_init(struct super_block *sb, int mount_flags); int reiserfs_lookup_privroot(struct super_block *sb); int reiserfs_delete_xattrs(struct inode *inode); int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); -int reiserfs_permission(struct user_namespace 
*mnt_userns, +int reiserfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); #ifdef CONFIG_REISERFS_FS_XATTR diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 93fe414fed18..138060452678 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -18,7 +18,7 @@ static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, int -reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int error, error2; @@ -42,7 +42,7 @@ reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, reiserfs_write_unlock(inode->i_sb); if (error == 0) { if (type == ACL_TYPE_ACCESS && acl) { - error = posix_acl_update_mode(&init_user_ns, inode, + error = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (error) goto unlock; @@ -407,5 +407,5 @@ int reiserfs_acl_chmod(struct dentry *dentry) !reiserfs_posixacl(inode->i_sb)) return 0; - return posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + return posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); } diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 857a65b05726..41c0ea84fbff 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -22,7 +22,7 @@ security_get(const struct xattr_handler *handler, struct dentry *unused, static int security_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, struct dentry *unused, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index d853cea2afcd..0c0c74d8db0e 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -21,7 +21,7 @@ trusted_get(const struct xattr_handler *handler, struct dentry *unused, static int trusted_set(const struct xattr_handler 
*handler, - struct user_namespace *mnt_userns, struct dentry *unused, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 65d9cd10a5ea..88195181e1d7 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -18,7 +18,7 @@ user_get(const struct xattr_handler *handler, struct dentry *unused, } static int -user_set(const struct xattr_handler *handler, struct user_namespace *mnt_userns, +user_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/remap_range.c b/fs/remap_range.c index 41f60477bb41..1331a890f2f2 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -419,16 +419,16 @@ EXPORT_SYMBOL(vfs_clone_file_range); /* Check whether we are allowed to dedupe the destination file */ static bool allow_file_dedupe(struct file *file) { - struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); if (capable(CAP_SYS_ADMIN)) return true; if (file->f_mode & FMODE_WRITE) return true; - if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid())) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) return true; - if (!inode_permission(mnt_userns, inode, MAY_WRITE)) + if (!inode_permission(idmap, inode, MAY_WRITE)) return true; return false; } diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index b3fdc8212c5f..95f8e8901768 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -183,7 +183,7 @@ static inline int squashfs_block_size(__le32 raw) #define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ sizeof(u64)) /* xattr id lookup table defines */ -#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct 
squashfs_xattr_id)) +#define SQUASHFS_XATTR_BYTES(A) (((u64) (A)) * sizeof(struct squashfs_xattr_id)) #define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \ SQUASHFS_METADATA_SIZE) diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 659082e9e51d..72f6f4b37863 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -63,7 +63,7 @@ struct squashfs_sb_info { long long bytes_used; unsigned int inodes; unsigned int fragments; - int xattr_ids; + unsigned int xattr_ids; unsigned int ids; bool panic_on_errors; const struct squashfs_decompressor_thread_ops *thread_ops; diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h index d8a270d3ac4c..f1a463d8bfa0 100644 --- a/fs/squashfs/xattr.h +++ b/fs/squashfs/xattr.h @@ -10,12 +10,12 @@ #ifdef CONFIG_SQUASHFS_XATTR extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, - u64 *, int *); + u64 *, unsigned int *); extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, unsigned int *, unsigned long long *); #else static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, - u64 start, u64 *xattr_table_start, int *xattr_ids) + u64 start, u64 *xattr_table_start, unsigned int *xattr_ids) { struct squashfs_xattr_id_table *id_table; diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c index 087cab8c78f4..c8469c656e0d 100644 --- a/fs/squashfs/xattr_id.c +++ b/fs/squashfs/xattr_id.c @@ -56,7 +56,7 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, * Read uncompressed xattr id lookup table indexes from disk into memory */ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start, - u64 *xattr_table_start, int *xattr_ids) + u64 *xattr_table_start, unsigned int *xattr_ids) { struct squashfs_sb_info *msblk = sb->s_fs_info; unsigned int len, indexes; diff --git a/fs/stat.c b/fs/stat.c index d6cc74ca8486..7c238da22ef0 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -18,6 +18,7 @@ #include 
<linux/syscalls.h> #include <linux/pagemap.h> #include <linux/compat.h> +#include <linux/iversion.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -27,7 +28,7 @@ /** * generic_fillattr - Fill in the basic attributes from the inode struct - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: Inode to use as the source * @stat: Where to fill in the attributes * @@ -35,17 +36,17 @@ * found on the VFS inode structure. This is the default if no getattr inode * operation is supplied. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before filling in the + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before filling in the * uid and gid filds. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply passs @nop_mnt_idmap. 
*/ -void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode, +void generic_fillattr(struct mnt_idmap *idmap, struct inode *inode, struct kstat *stat) { - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; @@ -97,7 +98,7 @@ EXPORT_SYMBOL(generic_fill_statx_attr); int vfs_getattr_nosec(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct inode *inode = d_backing_inode(path->dentry); memset(stat, 0, sizeof(*stat)); @@ -122,12 +123,17 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | STATX_ATTR_DAX); - mnt_userns = mnt_user_ns(path->mnt); + if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { + stat->result_mask |= STATX_CHANGE_COOKIE; + stat->change_cookie = inode_query_iversion(inode); + } + + idmap = mnt_idmap(path->mnt); if (inode->i_op->getattr) - return inode->i_op->getattr(mnt_userns, path, stat, + return inode->i_op->getattr(idmap, path, stat, request_mask, query_flags); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); return 0; } EXPORT_SYMBOL(vfs_getattr_nosec); @@ -602,9 +608,11 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) memset(&tmp, 0, sizeof(tmp)); - tmp.stx_mask = stat->result_mask; + /* STATX_CHANGE_COOKIE is kernel-only for now */ + tmp.stx_mask = stat->result_mask & ~STATX_CHANGE_COOKIE; tmp.stx_blksize = stat->blksize; - tmp.stx_attributes = stat->attributes; + /* STATX_ATTR_CHANGE_MONOTONIC is kernel-only for now */ + tmp.stx_attributes = stat->attributes & ~STATX_ATTR_CHANGE_MONOTONIC; tmp.stx_nlink = stat->nlink; tmp.stx_uid = 
from_kuid_munged(current_user_ns(), stat->uid); tmp.stx_gid = from_kgid_munged(current_user_ns(), stat->gid); @@ -643,6 +651,11 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags, if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) return -EINVAL; + /* STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests + * from userland. + */ + mask &= ~STATX_CHANGE_COOKIE; + error = vfs_statx(dfd, filename, flags, &stat, mask); if (error) return error; diff --git a/fs/super.c b/fs/super.c index 12c08cb20405..8e531174e7c2 100644 --- a/fs/super.c +++ b/fs/super.c @@ -291,7 +291,6 @@ static void __put_super(struct super_block *s) WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); security_sb_free(s); - fscrypt_destroy_keyring(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); @@ -491,10 +490,23 @@ void generic_shutdown_super(struct super_block *sb) if (sop->put_super) sop->put_super(sb); - if (!list_empty(&sb->s_inodes)) { - printk("VFS: Busy inodes after unmount of %s. " - "Self-destruct in 5 seconds. Have a nice day...\n", - sb->s_id); + if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), + "VFS: Busy inodes after unmount of %s (%s)", + sb->s_id, sb->s_type->name)) { + /* + * Adding a proper bailout path here would be hard, but + * we can at least make it more likely that a later + * iput_final() or such crashes cleanly. 
+ */ + struct inode *inode; + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + inode->i_op = VFS_PTR_POISON; + inode->i_sb = VFS_PTR_POISON; + inode->i_mapping = VFS_PTR_POISON; + } + spin_unlock(&sb->s_inode_list_lock); } } spin_lock(&sb_lock); diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 90e00124ea07..50eb92557a0f 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -29,13 +29,13 @@ const struct file_operations sysv_file_operations = { .splice_read = generic_file_splice_read, }; -static int sysv_setattr(struct user_namespace *mnt_userns, +static int sysv_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -48,7 +48,7 @@ static int sysv_setattr(struct user_namespace *mnt_userns, sysv_truncate(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index 50df794a3c1f..e732879036ab 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -163,7 +163,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); dirty_sb(sb); - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = fs16_to_cpu(sbi, ino); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 3b8567564e7e..b22764fe669c 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -441,11 +441,11 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size) return res; } -int sysv_getattr(struct user_namespace *mnt_userns, const struct path *path, +int 
sysv_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct super_block *s = path->dentry->d_sb; - generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); stat->blksize = s->s_blocksize; return 0; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index b2e6abc06a2d..ecd424461511 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -41,7 +41,7 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, un return d_splice_alias(inode, dentry); } -static int sysv_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int sysv_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode * inode; @@ -61,13 +61,13 @@ static int sysv_mknod(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int sysv_create(struct user_namespace *mnt_userns, struct inode *dir, +static int sysv_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return sysv_mknod(&init_user_ns, dir, dentry, mode, 0); + return sysv_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); } -static int sysv_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int sysv_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int err = -ENAMETOOLONG; @@ -110,7 +110,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int sysv_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int sysv_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode * inode; @@ -189,7 +189,7 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry) * Anybody can rename anything with this: the 
permission checks are left to the * higher-level routines. */ -static int sysv_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 99ddf033da4f..5e122a5673c1 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -141,7 +141,7 @@ extern struct inode *sysv_iget(struct super_block *, unsigned int); extern int sysv_write_inode(struct inode *, struct writeback_control *wbc); extern int sysv_sync_inode(struct inode *); extern void sysv_set_inode(struct inode *, dev_t); -extern int sysv_getattr(struct user_namespace *, const struct path *, +extern int sysv_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int sysv_init_icache(void); extern void sysv_destroy_icache(void); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index da85b3979195..57ac8aa4a724 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -67,7 +67,7 @@ static char *get_dname(struct dentry *dentry) return name; } -static int tracefs_syscall_mkdir(struct user_namespace *mnt_userns, +static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, struct inode *inode, struct dentry *dentry, umode_t mode) { diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 0f29cf201136..1e92c1730c16 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -95,7 +95,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, */ inode->i_flags |= S_NOCMTIME; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_mapping->nrpages = 0; @@ -283,7 +283,7 @@ static int ubifs_prepare_create(struct inode *dir, struct dentry *dentry, return fscrypt_setup_filename(dir, &dentry->d_name, 0, nm); } -static int 
ubifs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; @@ -426,7 +426,7 @@ static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) mutex_unlock(&ubifs_inode(inode1)->ui_mutex); } -static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct dentry *dentry = file->f_path.dentry; @@ -979,7 +979,7 @@ out_fname: return err; } -static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -1052,7 +1052,7 @@ out_budg: return err; } -static int ubifs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -1141,7 +1141,7 @@ out_budg: return err; } -static int ubifs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct inode *inode; @@ -1606,7 +1606,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, return err; } -static int ubifs_rename(struct user_namespace *mnt_userns, +static int ubifs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -1631,7 +1631,7 @@ static int ubifs_rename(struct user_namespace *mnt_userns, return do_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } -int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ubifs_getattr(struct mnt_idmap *idmap, const struct path *path, 
struct kstat *stat, u32 request_mask, unsigned int flags) { loff_t size; @@ -1654,7 +1654,7 @@ int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->blksize = UBIFS_BLOCK_SIZE; stat->size = ui->ui_size; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f2353dd676ef..8cb5d76b301c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1258,7 +1258,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, return err; } -int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ubifs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int err; @@ -1267,7 +1267,7 @@ int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, dbg_gen("ino %lu, mode %#x, ia_valid %#x", inode->i_ino, inode->i_mode, attr->ia_valid); - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) return err; @@ -1608,11 +1608,11 @@ static const char *ubifs_get_link(struct dentry *dentry, return fscrypt_get_symlink(inode, ui->data, ui->data_len, done); } -static int ubifs_symlink_getattr(struct user_namespace *mnt_userns, +static int ubifs_symlink_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - ubifs_getattr(mnt_userns, path, stat, request_mask, query_flags); + ubifs_getattr(idmap, path, stat, request_mask, query_flags); if (IS_ENCRYPTED(d_inode(path->dentry))) return fscrypt_symlink_getattr(path, stat); diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 71bcebe45f9c..67c5108abd89 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -144,7 +144,7 @@ int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int ubifs_fileattr_set(struct user_namespace *mnt_userns, +int 
ubifs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 478bbbb5382f..9063b73536f8 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -2020,15 +2020,15 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc); /* file.c */ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync); -int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ubifs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, umode_t mode, bool is_xattr); -int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags); +int ubifs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); int ubifs_check_dir_empty(struct inode *dir); /* xattr.c */ @@ -2085,7 +2085,7 @@ void ubifs_destroy_size_tree(struct ubifs_info *c); /* ioctl.c */ int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int ubifs_fileattr_set(struct user_namespace *mnt_userns, +int ubifs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); void ubifs_set_inode_flags(struct inode *inode); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 3db8486e3725..349228dd1191 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -699,7 +699,7 @@ static int xattr_get(const struct xattr_handler *handler, } static int xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) 
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 8e597db4d971..14b9db4c80f0 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -36,18 +36,41 @@ static int read_block_bitmap(struct super_block *sb, unsigned long bitmap_nr) { struct buffer_head *bh = NULL; - int retval = 0; + int i; + int max_bits, off, count; struct kernel_lb_addr loc; loc.logicalBlockNum = bitmap->s_extPosition; loc.partitionReferenceNum = UDF_SB(sb)->s_partition; - bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block)); + bh = sb_bread(sb, udf_get_lb_pblock(sb, &loc, block)); + bitmap->s_block_bitmap[bitmap_nr] = bh; if (!bh) - retval = -EIO; + return -EIO; - bitmap->s_block_bitmap[bitmap_nr] = bh; - return retval; + /* Check consistency of Space Bitmap buffer. */ + max_bits = sb->s_blocksize * 8; + if (!bitmap_nr) { + off = sizeof(struct spaceBitmapDesc) << 3; + count = min(max_bits - off, bitmap->s_nr_groups); + } else { + /* + * Rough check if bitmap number is too big to have any bitmap + * blocks reserved. + */ + if (bitmap_nr > + (bitmap->s_nr_groups >> (sb->s_blocksize_bits + 3)) + 2) + return 0; + off = 0; + count = bitmap->s_nr_groups - bitmap_nr * max_bits + + (sizeof(struct spaceBitmapDesc) << 3); + count = min(count, max_bits); + } + + for (i = 0; i < count; i++) + if (udf_test_bit(i + off, bh->b_data)) + return -EFSCORRUPTED; + return 0; } static int __load_block_bitmap(struct super_block *sb, diff --git a/fs/udf/dir.c b/fs/udf/dir.c index be640f4b2f2c..212393b12c22 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -39,26 +39,13 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) { struct inode *dir = file_inode(file); - struct udf_inode_info *iinfo = UDF_I(dir); - struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL}; - struct fileIdentDesc *fi = NULL; - struct fileIdentDesc cfi; - udf_pblk_t block, iblock; loff_t nf_pos, emit_pos = 0; int flen; - unsigned char *fname = NULL, *copy_name = NULL; - unsigned char *nameptr; - uint16_t liu; - uint8_t lfi; - 
loff_t size = udf_ext0_offset(dir) + dir->i_size; - struct buffer_head *tmp, *bha[16]; - struct kernel_lb_addr eloc; - uint32_t elen; - sector_t offset; - int i, num, ret = 0; - struct extent_position epos = { NULL, 0, {0, 0} }; + unsigned char *fname = NULL; + int ret = 0; struct super_block *sb = dir->i_sb; bool pos_valid = false; + struct udf_fileident_iter iter; if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) @@ -66,7 +53,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) ctx->pos = 1; } nf_pos = (ctx->pos - 1) << 2; - if (nf_pos >= size) + if (nf_pos >= dir->i_size) goto out; /* @@ -90,138 +77,57 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) goto out; } - if (nf_pos == 0) - nf_pos = udf_ext0_offset(dir); - - fibh.soffset = fibh.eoffset = nf_pos & (sb->s_blocksize - 1); - if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - if (inode_bmap(dir, nf_pos >> sb->s_blocksize_bits, - &epos, &eloc, &elen, &offset) - != (EXT_RECORDED_ALLOCATED >> 30)) { - ret = -ENOENT; - goto out; - } - block = udf_get_lb_pblock(sb, &eloc, offset); - if ((++offset << sb->s_blocksize_bits) < elen) { - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (iinfo->i_alloc_type == - ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - } else { - offset = 0; - } - - if (!(fibh.sbh = fibh.ebh = udf_tread(sb, block))) { - ret = -EIO; - goto out; - } - - if (!(offset & ((16 >> (sb->s_blocksize_bits - 9)) - 1))) { - i = 16 >> (sb->s_blocksize_bits - 9); - if (i + offset > (elen >> sb->s_blocksize_bits)) - i = (elen >> sb->s_blocksize_bits) - offset; - for (num = 0; i > 0; i--) { - block = udf_get_lb_pblock(sb, &eloc, offset + i); - tmp = udf_tgetblk(sb, block); - if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) - bha[num++] = tmp; - else - brelse(tmp); - } - if (num) { - bh_readahead_batch(num, bha, REQ_RAHEAD); - for (i = 0; i < num; i++) - brelse(bha[i]); - } - } - } - - while 
(nf_pos < size) { + for (ret = udf_fiiter_init(&iter, dir, nf_pos); + !ret && iter.pos < dir->i_size; + ret = udf_fiiter_advance(&iter)) { struct kernel_lb_addr tloc; - loff_t cur_pos = nf_pos; + udf_pblk_t iblock; - /* Update file position only if we got past the current one */ - if (nf_pos >= emit_pos) { - ctx->pos = (nf_pos >> 2) + 1; - pos_valid = true; - } - - fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, - &elen, &offset); - if (!fi) - goto out; /* Still not at offset where user asked us to read from? */ - if (cur_pos < emit_pos) + if (iter.pos < emit_pos) continue; - liu = le16_to_cpu(cfi.lengthOfImpUse); - lfi = cfi.lengthFileIdent; - - if (fibh.sbh == fibh.ebh) { - nameptr = udf_get_fi_ident(fi); - } else { - int poffset; /* Unpaded ending offset */ - - poffset = fibh.soffset + sizeof(struct fileIdentDesc) + liu + lfi; - - if (poffset >= lfi) { - nameptr = (char *)(fibh.ebh->b_data + poffset - lfi); - } else { - if (!copy_name) { - copy_name = kmalloc(UDF_NAME_LEN, - GFP_NOFS); - if (!copy_name) { - ret = -ENOMEM; - goto out; - } - } - nameptr = copy_name; - memcpy(nameptr, udf_get_fi_ident(fi), - lfi - poffset); - memcpy(nameptr + lfi - poffset, - fibh.ebh->b_data, poffset); - } - } + /* Update file position only if we got past the current one */ + pos_valid = true; + ctx->pos = (iter.pos >> 2) + 1; - if ((cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { + if (iter.fi.fileCharacteristics & FID_FILE_CHAR_DELETED) { if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) continue; } - if ((cfi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { + if (iter.fi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) { if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) continue; } - if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) { + if (iter.fi.fileCharacteristics & FID_FILE_CHAR_PARENT) { if (!dir_emit_dotdot(file, ctx)) - goto out; + goto out_iter; continue; } - flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); + flen = udf_get_filename(sb, 
iter.name, + iter.fi.lengthFileIdent, fname, UDF_NAME_LEN); if (flen < 0) continue; - tloc = lelb_to_cpu(cfi.icb.extLocation); + tloc = lelb_to_cpu(iter.fi.icb.extLocation); iblock = udf_get_lb_pblock(sb, &tloc, 0); if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN)) - goto out; - } /* end while */ - - ctx->pos = (nf_pos >> 2) + 1; - pos_valid = true; + goto out_iter; + } + if (!ret) { + ctx->pos = (iter.pos >> 2) + 1; + pos_valid = true; + } +out_iter: + udf_fiiter_release(&iter); out: if (pos_valid) file->f_version = inode_query_iversion(dir); - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - brelse(epos.bh); kfree(fname); - kfree(copy_name); return ret; } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 16bcf2c6b8b3..654536d2b609 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -17,183 +17,478 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/bio.h> +#include <linux/crc-itu-t.h> +#include <linux/iversion.h> -struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, - struct udf_fileident_bh *fibh, - struct fileIdentDesc *cfi, - struct extent_position *epos, - struct kernel_lb_addr *eloc, uint32_t *elen, - sector_t *offset) +static int udf_verify_fi(struct udf_fileident_iter *iter) { - struct fileIdentDesc *fi; - int i, num; - udf_pblk_t block; - struct buffer_head *tmp, *bha[16]; - struct udf_inode_info *iinfo = UDF_I(dir); - - fibh->soffset = fibh->eoffset; + unsigned int len; + + if (iter->fi.descTag.tagIdent != cpu_to_le16(TAG_IDENT_FID)) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has entry at pos %llu with incorrect tag %x\n", + iter->dir->i_ino, (unsigned long long)iter->pos, + le16_to_cpu(iter->fi.descTag.tagIdent)); + return -EFSCORRUPTED; + } + len = udf_dir_entry_len(&iter->fi); + if (le16_to_cpu(iter->fi.lengthOfImpUse) & 3) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has entry at pos %llu with unaligned length of impUse field\n", + iter->dir->i_ino, (unsigned 
long long)iter->pos); + return -EFSCORRUPTED; + } + /* + * This is in fact allowed by the spec due to long impUse field but + * we don't support it. If there is real media with this large impUse + * field, support can be added. + */ + if (len > 1 << iter->dir->i_blkbits) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has too big (%u) entry at pos %llu\n", + iter->dir->i_ino, len, (unsigned long long)iter->pos); + return -EFSCORRUPTED; + } + if (iter->pos + len > iter->dir->i_size) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has entry past directory size at pos %llu\n", + iter->dir->i_ino, (unsigned long long)iter->pos); + return -EFSCORRUPTED; + } + if (udf_dir_entry_len(&iter->fi) != + sizeof(struct tag) + le16_to_cpu(iter->fi.descTag.descCRCLength)) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has entry where CRC length (%u) does not match entry length (%u)\n", + iter->dir->i_ino, + (unsigned)le16_to_cpu(iter->fi.descTag.descCRCLength), + (unsigned)(udf_dir_entry_len(&iter->fi) - + sizeof(struct tag))); + return -EFSCORRUPTED; + } + return 0; +} +static int udf_copy_fi(struct udf_fileident_iter *iter) +{ + struct udf_inode_info *iinfo = UDF_I(iter->dir); + u32 blksize = 1 << iter->dir->i_blkbits; + u32 off, len, nameoff; + int err; + + /* Skip copying when we are at EOF */ + if (iter->pos >= iter->dir->i_size) { + iter->name = NULL; + return 0; + } + if (iter->dir->i_size < iter->pos + sizeof(struct fileIdentDesc)) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has entry straddling EOF\n", + iter->dir->i_ino); + return -EFSCORRUPTED; + } if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - fi = udf_get_fileident(iinfo->i_data - - (iinfo->i_efe ? 
- sizeof(struct extendedFileEntry) : - sizeof(struct fileEntry)), - dir->i_sb->s_blocksize, - &(fibh->eoffset)); - if (!fi) - return NULL; - - *nf_pos += fibh->eoffset - fibh->soffset; - - memcpy((uint8_t *)cfi, (uint8_t *)fi, + memcpy(&iter->fi, iinfo->i_data + iinfo->i_lenEAttr + iter->pos, sizeof(struct fileIdentDesc)); - - return fi; + err = udf_verify_fi(iter); + if (err < 0) + return err; + iter->name = iinfo->i_data + iinfo->i_lenEAttr + iter->pos + + sizeof(struct fileIdentDesc) + + le16_to_cpu(iter->fi.lengthOfImpUse); + return 0; } - if (fibh->eoffset == dir->i_sb->s_blocksize) { - uint32_t lextoffset = epos->offset; - unsigned char blocksize_bits = dir->i_sb->s_blocksize_bits; - - if (udf_next_aext(dir, epos, eloc, elen, 1) != - (EXT_RECORDED_ALLOCATED >> 30)) - return NULL; + off = iter->pos & (blksize - 1); + len = min_t(int, sizeof(struct fileIdentDesc), blksize - off); + memcpy(&iter->fi, iter->bh[0]->b_data + off, len); + if (len < sizeof(struct fileIdentDesc)) + memcpy((char *)(&iter->fi) + len, iter->bh[1]->b_data, + sizeof(struct fileIdentDesc) - len); + err = udf_verify_fi(iter); + if (err < 0) + return err; + + /* Handle directory entry name */ + nameoff = off + sizeof(struct fileIdentDesc) + + le16_to_cpu(iter->fi.lengthOfImpUse); + if (off + udf_dir_entry_len(&iter->fi) <= blksize) { + iter->name = iter->bh[0]->b_data + nameoff; + } else if (nameoff >= blksize) { + iter->name = iter->bh[1]->b_data + (nameoff - blksize); + } else { + iter->name = iter->namebuf; + len = blksize - nameoff; + memcpy(iter->name, iter->bh[0]->b_data + nameoff, len); + memcpy(iter->name + len, iter->bh[1]->b_data, + iter->fi.lengthFileIdent - len); + } + return 0; +} - block = udf_get_lb_pblock(dir->i_sb, eloc, *offset); +/* Readahead 8k once we are at 8k boundary */ +static void udf_readahead_dir(struct udf_fileident_iter *iter) +{ + unsigned int ralen = 16 >> (iter->dir->i_blkbits - 9); + struct buffer_head *tmp, *bha[16]; + int i, num; + udf_pblk_t blk; + + if 
(iter->loffset & (ralen - 1)) + return; + + if (iter->loffset + ralen > (iter->elen >> iter->dir->i_blkbits)) + ralen = (iter->elen >> iter->dir->i_blkbits) - iter->loffset; + num = 0; + for (i = 0; i < ralen; i++) { + blk = udf_get_lb_pblock(iter->dir->i_sb, &iter->eloc, + iter->loffset + i); + tmp = sb_getblk(iter->dir->i_sb, blk); + if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) + bha[num++] = tmp; + else + brelse(tmp); + } + if (num) { + bh_readahead_batch(num, bha, REQ_RAHEAD); + for (i = 0; i < num; i++) + brelse(bha[i]); + } +} - (*offset)++; +static struct buffer_head *udf_fiiter_bread_blk(struct udf_fileident_iter *iter) +{ + udf_pblk_t blk; - if ((*offset << blocksize_bits) >= *elen) - *offset = 0; - else - epos->offset = lextoffset; + udf_readahead_dir(iter); + blk = udf_get_lb_pblock(iter->dir->i_sb, &iter->eloc, iter->loffset); + return sb_bread(iter->dir->i_sb, blk); +} - brelse(fibh->sbh); - fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); - if (!fibh->sbh) - return NULL; - fibh->soffset = fibh->eoffset = 0; - - if (!(*offset & ((16 >> (blocksize_bits - 9)) - 1))) { - i = 16 >> (blocksize_bits - 9); - if (i + *offset > (*elen >> blocksize_bits)) - i = (*elen >> blocksize_bits)-*offset; - for (num = 0; i > 0; i--) { - block = udf_get_lb_pblock(dir->i_sb, eloc, - *offset + i); - tmp = udf_tgetblk(dir->i_sb, block); - if (tmp && !buffer_uptodate(tmp) && - !buffer_locked(tmp)) - bha[num++] = tmp; - else - brelse(tmp); - } - if (num) { - bh_readahead_batch(num, bha, REQ_RAHEAD); - for (i = 0; i < num; i++) - brelse(bha[i]); - } +/* + * Updates loffset to point to next directory block; eloc, elen & epos are + * updated if we need to traverse to the next extent as well. 
+ */ +static int udf_fiiter_advance_blk(struct udf_fileident_iter *iter) +{ + iter->loffset++; + if (iter->loffset < DIV_ROUND_UP(iter->elen, 1<<iter->dir->i_blkbits)) + return 0; + + iter->loffset = 0; + if (udf_next_aext(iter->dir, &iter->epos, &iter->eloc, &iter->elen, 1) + != (EXT_RECORDED_ALLOCATED >> 30)) { + if (iter->pos == iter->dir->i_size) { + iter->elen = 0; + return 0; } - } else if (fibh->sbh != fibh->ebh) { - brelse(fibh->sbh); - fibh->sbh = fibh->ebh; + udf_err(iter->dir->i_sb, + "extent after position %llu not allocated in directory (ino %lu)\n", + (unsigned long long)iter->pos, iter->dir->i_ino); + return -EFSCORRUPTED; } + return 0; +} - fi = udf_get_fileident(fibh->sbh->b_data, dir->i_sb->s_blocksize, - &(fibh->eoffset)); - - if (!fi) - return NULL; +static int udf_fiiter_load_bhs(struct udf_fileident_iter *iter) +{ + int blksize = 1 << iter->dir->i_blkbits; + int off = iter->pos & (blksize - 1); + int err; + struct fileIdentDesc *fi; - *nf_pos += fibh->eoffset - fibh->soffset; + /* Is there any further extent we can map from? */ + if (!iter->bh[0] && iter->elen) { + iter->bh[0] = udf_fiiter_bread_blk(iter); + if (!iter->bh[0]) { + err = -ENOMEM; + goto out_brelse; + } + if (!buffer_uptodate(iter->bh[0])) { + err = -EIO; + goto out_brelse; + } + } + /* There's no next block so we are done */ + if (iter->pos >= iter->dir->i_size) + return 0; + /* Need to fetch next block as well? */ + if (off + sizeof(struct fileIdentDesc) > blksize) + goto fetch_next; + fi = (struct fileIdentDesc *)(iter->bh[0]->b_data + off); + /* Need to fetch next block to get name? 
*/ + if (off + udf_dir_entry_len(fi) > blksize) { +fetch_next: + err = udf_fiiter_advance_blk(iter); + if (err) + goto out_brelse; + iter->bh[1] = udf_fiiter_bread_blk(iter); + if (!iter->bh[1]) { + err = -ENOMEM; + goto out_brelse; + } + if (!buffer_uptodate(iter->bh[1])) { + err = -EIO; + goto out_brelse; + } + } + return 0; +out_brelse: + brelse(iter->bh[0]); + brelse(iter->bh[1]); + iter->bh[0] = iter->bh[1] = NULL; + return err; +} - if (fibh->eoffset <= dir->i_sb->s_blocksize) { - memcpy((uint8_t *)cfi, (uint8_t *)fi, - sizeof(struct fileIdentDesc)); - } else if (fibh->eoffset > dir->i_sb->s_blocksize) { - uint32_t lextoffset = epos->offset; +int udf_fiiter_init(struct udf_fileident_iter *iter, struct inode *dir, + loff_t pos) +{ + struct udf_inode_info *iinfo = UDF_I(dir); + int err = 0; + + iter->dir = dir; + iter->bh[0] = iter->bh[1] = NULL; + iter->pos = pos; + iter->elen = 0; + iter->epos.bh = NULL; + iter->name = NULL; + /* + * When directory is verified, we don't expect directory iteration to + * fail and it can be difficult to undo without corrupting filesystem. + * So just do not allow memory allocation failures here. 
+ */ + iter->namebuf = kmalloc(UDF_NAME_LEN_CS0, GFP_KERNEL | __GFP_NOFAIL); - if (udf_next_aext(dir, epos, eloc, elen, 1) != - (EXT_RECORDED_ALLOCATED >> 30)) - return NULL; + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + err = udf_copy_fi(iter); + goto out; + } - block = udf_get_lb_pblock(dir->i_sb, eloc, *offset); + if (inode_bmap(dir, iter->pos >> dir->i_blkbits, &iter->epos, + &iter->eloc, &iter->elen, &iter->loffset) != + (EXT_RECORDED_ALLOCATED >> 30)) { + if (pos == dir->i_size) + return 0; + udf_err(dir->i_sb, + "position %llu not allocated in directory (ino %lu)\n", + (unsigned long long)pos, dir->i_ino); + err = -EFSCORRUPTED; + goto out; + } + err = udf_fiiter_load_bhs(iter); + if (err < 0) + goto out; + err = udf_copy_fi(iter); +out: + if (err < 0) + udf_fiiter_release(iter); + return err; +} - (*offset)++; +int udf_fiiter_advance(struct udf_fileident_iter *iter) +{ + unsigned int oldoff, len; + int blksize = 1 << iter->dir->i_blkbits; + int err; + + oldoff = iter->pos & (blksize - 1); + len = udf_dir_entry_len(&iter->fi); + iter->pos += len; + if (UDF_I(iter->dir)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + if (oldoff + len >= blksize) { + brelse(iter->bh[0]); + iter->bh[0] = NULL; + /* Next block already loaded? 
*/ + if (iter->bh[1]) { + iter->bh[0] = iter->bh[1]; + iter->bh[1] = NULL; + } else { + err = udf_fiiter_advance_blk(iter); + if (err < 0) + return err; + } + } + err = udf_fiiter_load_bhs(iter); + if (err < 0) + return err; + } + return udf_copy_fi(iter); +} - if ((*offset << dir->i_sb->s_blocksize_bits) >= *elen) - *offset = 0; - else - epos->offset = lextoffset; +void udf_fiiter_release(struct udf_fileident_iter *iter) +{ + iter->dir = NULL; + brelse(iter->bh[0]); + brelse(iter->bh[1]); + iter->bh[0] = iter->bh[1] = NULL; + kfree(iter->namebuf); + iter->namebuf = NULL; +} - fibh->soffset -= dir->i_sb->s_blocksize; - fibh->eoffset -= dir->i_sb->s_blocksize; +static void udf_copy_to_bufs(void *buf1, int len1, void *buf2, int len2, + int off, void *src, int len) +{ + int copy; + + if (off >= len1) { + off -= len1; + } else { + copy = min(off + len, len1) - off; + memcpy(buf1 + off, src, copy); + src += copy; + len -= copy; + off = 0; + } + if (len > 0) { + if (WARN_ON_ONCE(off + len > len2 || !buf2)) + return; + memcpy(buf2 + off, src, len); + } +} - fibh->ebh = udf_tread(dir->i_sb, block); - if (!fibh->ebh) - return NULL; +static uint16_t udf_crc_fi_bufs(void *buf1, int len1, void *buf2, int len2, + int off, int len) +{ + int copy; + uint16_t crc = 0; + + if (off >= len1) { + off -= len1; + } else { + copy = min(off + len, len1) - off; + crc = crc_itu_t(crc, buf1 + off, copy); + len -= copy; + off = 0; + } + if (len > 0) { + if (WARN_ON_ONCE(off + len > len2 || !buf2)) + return 0; + crc = crc_itu_t(crc, buf2 + off, len); + } + return crc; +} - if (sizeof(struct fileIdentDesc) > -fibh->soffset) { - int fi_len; +static void udf_copy_fi_to_bufs(char *buf1, int len1, char *buf2, int len2, + int off, struct fileIdentDesc *fi, + uint8_t *impuse, uint8_t *name) +{ + uint16_t crc; + int fioff = off; + int crcoff = off + sizeof(struct tag); + unsigned int crclen = udf_dir_entry_len(fi) - sizeof(struct tag); + char zeros[UDF_NAME_PAD] = {}; + int endoff = off + 
udf_dir_entry_len(fi); + + udf_copy_to_bufs(buf1, len1, buf2, len2, off, fi, + sizeof(struct fileIdentDesc)); + off += sizeof(struct fileIdentDesc); + if (impuse) + udf_copy_to_bufs(buf1, len1, buf2, len2, off, impuse, + le16_to_cpu(fi->lengthOfImpUse)); + off += le16_to_cpu(fi->lengthOfImpUse); + if (name) { + udf_copy_to_bufs(buf1, len1, buf2, len2, off, name, + fi->lengthFileIdent); + off += fi->lengthFileIdent; + udf_copy_to_bufs(buf1, len1, buf2, len2, off, zeros, + endoff - off); + } - memcpy((uint8_t *)cfi, (uint8_t *)fi, -fibh->soffset); - memcpy((uint8_t *)cfi - fibh->soffset, - fibh->ebh->b_data, - sizeof(struct fileIdentDesc) + fibh->soffset); + crc = udf_crc_fi_bufs(buf1, len1, buf2, len2, crcoff, crclen); + fi->descTag.descCRC = cpu_to_le16(crc); + fi->descTag.descCRCLength = cpu_to_le16(crclen); + fi->descTag.tagChecksum = udf_tag_checksum(&fi->descTag); - fi_len = udf_dir_entry_len(cfi); - *nf_pos += fi_len - (fibh->eoffset - fibh->soffset); - fibh->eoffset = fibh->soffset + fi_len; - } else { - memcpy((uint8_t *)cfi, (uint8_t *)fi, - sizeof(struct fileIdentDesc)); - } - } - /* Got last entry outside of dir size - fs is corrupted! 
*/ - if (*nf_pos > dir->i_size) - return NULL; - return fi; + udf_copy_to_bufs(buf1, len1, buf2, len2, fioff, fi, sizeof(struct tag)); } -struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) +void udf_fiiter_write_fi(struct udf_fileident_iter *iter, uint8_t *impuse) { - struct fileIdentDesc *fi; - int lengthThisIdent; - uint8_t *ptr; - int padlen; + struct udf_inode_info *iinfo = UDF_I(iter->dir); + void *buf1, *buf2 = NULL; + int len1, len2 = 0, off; + int blksize = 1 << iter->dir->i_blkbits; - if ((!buffer) || (!offset)) { - udf_debug("invalidparms, buffer=%p, offset=%p\n", - buffer, offset); - return NULL; + off = iter->pos & (blksize - 1); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + buf1 = iinfo->i_data + iinfo->i_lenEAttr; + len1 = iter->dir->i_size; + } else { + buf1 = iter->bh[0]->b_data; + len1 = blksize; + if (iter->bh[1]) { + buf2 = iter->bh[1]->b_data; + len2 = blksize; + } } - ptr = buffer; + udf_copy_fi_to_bufs(buf1, len1, buf2, len2, off, &iter->fi, impuse, + iter->name == iter->namebuf ? iter->name : NULL); - if ((*offset > 0) && (*offset < bufsize)) - ptr += *offset; - fi = (struct fileIdentDesc *)ptr; - if (fi->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FID)) { - udf_debug("0x%x != TAG_IDENT_FID\n", - le16_to_cpu(fi->descTag.tagIdent)); - udf_debug("offset: %d sizeof: %lu bufsize: %d\n", - *offset, (unsigned long)sizeof(struct fileIdentDesc), - bufsize); - return NULL; + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + mark_inode_dirty(iter->dir); + } else { + mark_buffer_dirty_inode(iter->bh[0], iter->dir); + if (iter->bh[1]) + mark_buffer_dirty_inode(iter->bh[1], iter->dir); } - if ((*offset + sizeof(struct fileIdentDesc)) > bufsize) - lengthThisIdent = sizeof(struct fileIdentDesc); - else - lengthThisIdent = sizeof(struct fileIdentDesc) + - fi->lengthFileIdent + le16_to_cpu(fi->lengthOfImpUse); + inode_inc_iversion(iter->dir); +} - /* we need to figure padding, too! 
*/ - padlen = lengthThisIdent % UDF_NAME_PAD; - if (padlen) - lengthThisIdent += (UDF_NAME_PAD - padlen); - *offset = *offset + lengthThisIdent; +void udf_fiiter_update_elen(struct udf_fileident_iter *iter, uint32_t new_elen) +{ + struct udf_inode_info *iinfo = UDF_I(iter->dir); + int diff = new_elen - iter->elen; + + /* Skip update when we already went past the last extent */ + if (!iter->elen) + return; + iter->elen = new_elen; + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + iter->epos.offset -= sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + iter->epos.offset -= sizeof(struct long_ad); + udf_write_aext(iter->dir, &iter->epos, &iter->eloc, iter->elen, 1); + iinfo->i_lenExtents += diff; + mark_inode_dirty(iter->dir); +} - return fi; +/* Append new block to directory. @iter is expected to point at EOF */ +int udf_fiiter_append_blk(struct udf_fileident_iter *iter) +{ + struct udf_inode_info *iinfo = UDF_I(iter->dir); + int blksize = 1 << iter->dir->i_blkbits; + struct buffer_head *bh; + sector_t block; + uint32_t old_elen = iter->elen; + int err; + + if (WARN_ON_ONCE(iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)) + return -EINVAL; + + /* Round up last extent in the file */ + udf_fiiter_update_elen(iter, ALIGN(iter->elen, blksize)); + + /* Allocate new block and refresh mapping information */ + block = iinfo->i_lenExtents >> iter->dir->i_blkbits; + bh = udf_bread(iter->dir, block, 1, &err); + if (!bh) { + udf_fiiter_update_elen(iter, old_elen); + return err; + } + if (inode_bmap(iter->dir, block, &iter->epos, &iter->eloc, &iter->elen, + &iter->loffset) != (EXT_RECORDED_ALLOCATED >> 30)) { + udf_err(iter->dir->i_sb, + "block %llu not allocated in directory (ino %lu)\n", + (unsigned long long)block, iter->dir->i_ino); + return -EFSCORRUPTED; + } + if (!(iter->pos & (blksize - 1))) { + brelse(iter->bh[0]); + iter->bh[0] = bh; + } else { + iter->bh[1] = bh; + } + return 0; } struct short_ad *udf_get_fileshortad(uint8_t *ptr, int 
maxoffset, uint32_t *offset, diff --git a/fs/udf/file.c b/fs/udf/file.c index 5c659e23e578..8238f742377b 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -38,100 +38,55 @@ #include "udf_i.h" #include "udf_sb.h" -static void __udf_adinicb_readpage(struct page *page) +static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) { - struct inode *inode = page->mapping->host; - char *kaddr; - struct udf_inode_info *iinfo = UDF_I(inode); - loff_t isize = i_size_read(inode); - - /* - * We have to be careful here as truncate can change i_size under us. - * So just sample it once and use the same value everywhere. - */ - kaddr = kmap_atomic(page); - memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, isize); - memset(kaddr + isize, 0, PAGE_SIZE - isize); - flush_dcache_page(page); - SetPageUptodate(page); - kunmap_atomic(kaddr); -} - -static int udf_adinicb_read_folio(struct file *file, struct folio *folio) -{ - BUG_ON(!folio_test_locked(folio)); - __udf_adinicb_readpage(&folio->page); - folio_unlock(folio); - - return 0; -} - -static int udf_adinicb_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - char *kaddr; - struct udf_inode_info *iinfo = UDF_I(inode); - - BUG_ON(!PageLocked(page)); - - kaddr = kmap_atomic(page); - memcpy(iinfo->i_data + iinfo->i_lenEAttr, kaddr, i_size_read(inode)); - SetPageUptodate(page); - kunmap_atomic(kaddr); - mark_inode_dirty(inode); - unlock_page(page); - - return 0; -} - -static int udf_adinicb_write_begin(struct file *file, - struct address_space *mapping, loff_t pos, - unsigned len, struct page **pagep, - void **fsdata) -{ - struct page *page; - - if (WARN_ON_ONCE(pos >= PAGE_SIZE)) - return -EIO; - page = grab_cache_page_write_begin(mapping, 0); - if (!page) - return -ENOMEM; - *pagep = page; - - if (!PageUptodate(page)) - __udf_adinicb_readpage(page); - return 0; -} - -static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - /* Fallback to buffered I/O. 
*/ - return 0; -} + struct vm_area_struct *vma = vmf->vma; + struct inode *inode = file_inode(vma->vm_file); + struct address_space *mapping = inode->i_mapping; + struct page *page = vmf->page; + loff_t size; + unsigned int end; + vm_fault_t ret = VM_FAULT_LOCKED; + int err; -static int udf_adinicb_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = page->mapping->host; - loff_t last_pos = pos + copied; - if (last_pos > inode->i_size) - i_size_write(inode, last_pos); + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + filemap_invalidate_lock_shared(mapping); + lock_page(page); + size = i_size_read(inode); + if (page->mapping != inode->i_mapping || page_offset(page) >= size) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out_unlock; + } + /* Space is already allocated for in-ICB file */ + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + goto out_dirty; + if (page->index == size >> PAGE_SHIFT) + end = size & ~PAGE_MASK; + else + end = PAGE_SIZE; + err = __block_write_begin(page, 0, end, udf_get_block); + if (!err) + err = block_commit_write(page, 0, end); + if (err < 0) { + unlock_page(page); + ret = block_page_mkwrite_return(err); + goto out_unlock; + } +out_dirty: set_page_dirty(page); - unlock_page(page); - put_page(page); - return copied; + wait_for_stable_page(page); +out_unlock: + filemap_invalidate_unlock_shared(mapping); + sb_end_pagefault(inode->i_sb); + return ret; } -const struct address_space_operations udf_adinicb_aops = { - .dirty_folio = block_dirty_folio, - .invalidate_folio = block_invalidate_folio, - .read_folio = udf_adinicb_read_folio, - .writepage = udf_adinicb_writepage, - .write_begin = udf_adinicb_write_begin, - .write_end = udf_adinicb_write_end, - .direct_IO = udf_adinicb_direct_IO, +static const struct vm_operations_struct udf_file_vm_ops = { + .fault = filemap_fault, + .map_pages = 
filemap_map_pages, + .page_mkwrite = udf_page_mkwrite, }; static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -140,7 +95,6 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct udf_inode_info *iinfo = UDF_I(inode); - int err; inode_lock(inode); @@ -148,27 +102,23 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (retval <= 0) goto out; - down_write(&iinfo->i_data_sem); - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - loff_t end = iocb->ki_pos + iov_iter_count(from); - - if (inode->i_sb->s_blocksize < - (udf_file_entry_alloc_offset(inode) + end)) { - err = udf_expand_file_adinicb(inode); - if (err) { - inode_unlock(inode); - udf_debug("udf_expand_adinicb: err=%d\n", err); - return err; - } - } else { - iinfo->i_lenAlloc = max(end, inode->i_size); - up_write(&iinfo->i_data_sem); - } - } else - up_write(&iinfo->i_data_sem); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && + inode->i_sb->s_blocksize < (udf_file_entry_alloc_offset(inode) + + iocb->ki_pos + iov_iter_count(from))) { + filemap_invalidate_lock(inode->i_mapping); + retval = udf_expand_file_adinicb(inode); + filemap_invalidate_unlock(inode->i_mapping); + if (retval) + goto out; + } retval = __generic_file_write_iter(iocb, from); out: + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && retval > 0) { + down_write(&iinfo->i_data_sem); + iinfo->i_lenAlloc = inode->i_size; + up_write(&iinfo->i_data_sem); + } inode_unlock(inode); if (retval > 0) { @@ -243,11 +193,19 @@ static int udf_release_file(struct inode *inode, struct file *filp) return 0; } +static int udf_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &udf_file_vm_ops; + + return 0; +} + const struct file_operations udf_file_operations = { .read_iter = generic_file_read_iter, .unlocked_ioctl = udf_ioctl, .open = 
generic_file_open, - .mmap = generic_file_mmap, + .mmap = udf_file_mmap, .write_iter = udf_file_write_iter, .release = udf_release_file, .fsync = generic_file_fsync, @@ -256,14 +214,14 @@ const struct file_operations udf_file_operations = { .llseek = generic_file_llseek, }; -static int udf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +static int udf_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -286,7 +244,7 @@ static int udf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (attr->ia_valid & ATTR_MODE) udf_update_extra_perms(inode, attr->ia_mode); - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index b5d611cee749..8d50121778a5 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -28,21 +28,7 @@ void udf_free_inode(struct inode *inode) { - struct super_block *sb = inode->i_sb; - struct udf_sb_info *sbi = UDF_SB(sb); - struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); - - if (lvidiu) { - mutex_lock(&sbi->s_alloc_mutex); - if (S_ISDIR(inode->i_mode)) - le32_add_cpu(&lvidiu->numDirs, -1); - else - le32_add_cpu(&lvidiu->numFiles, -1); - udf_updated_lvid(sb); - mutex_unlock(&sbi->s_alloc_mutex); - } - - udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1); + udf_free_blocks(inode->i_sb, NULL, &UDF_I(inode)->i_location, 0, 1); } struct inode *udf_new_inode(struct inode *dir, umode_t mode) @@ -54,7 +40,6 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; struct udf_inode_info *iinfo; struct udf_inode_info *dinfo = UDF_I(dir); - struct 
logicalVolIntegrityDescImpUse *lvidiu; int err; inode = new_inode(sb); @@ -92,20 +77,10 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) return ERR_PTR(err); } - lvidiu = udf_sb_lvidiu(sb); - if (lvidiu) { - iinfo->i_unique = lvid_get_unique_id(sb); - inode->i_generation = iinfo->i_unique; - mutex_lock(&sbi->s_alloc_mutex); - if (S_ISDIR(mode)) - le32_add_cpu(&lvidiu->numDirs, 1); - else - le32_add_cpu(&lvidiu->numFiles, 1); - udf_updated_lvid(sb); - mutex_unlock(&sbi->s_alloc_mutex); - } + iinfo->i_unique = lvid_get_unique_id(sb); + inode->i_generation = iinfo->i_unique; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) inode->i_uid = sbi->s_uid; if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 34e416327dd4..3b2adf4cbc57 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -52,21 +52,24 @@ #define FE_DELETE_PERMS (FE_PERM_U_DELETE | FE_PERM_G_DELETE | \ FE_PERM_O_DELETE) +struct udf_map_rq; + static umode_t udf_convert_permissions(struct fileEntry *); static int udf_update_inode(struct inode *, int); static int udf_sync_inode(struct inode *inode); static int udf_alloc_i_data(struct inode *inode, size_t size); -static sector_t inode_getblk(struct inode *, sector_t, int *, int *); -static int8_t udf_insert_aext(struct inode *, struct extent_position, - struct kernel_lb_addr, uint32_t); +static int inode_getblk(struct inode *inode, struct udf_map_rq *map); +static int udf_insert_aext(struct inode *, struct extent_position, + struct kernel_lb_addr, uint32_t); static void udf_split_extents(struct inode *, int *, int, udf_pblk_t, struct kernel_long_ad *, int *); static void udf_prealloc_extents(struct inode *, int, int, struct kernel_long_ad *, int *); static void udf_merge_extents(struct inode *, struct kernel_long_ad *, int *); -static void udf_update_extents(struct inode *, struct kernel_long_ad *, int, - int, 
struct extent_position *); -static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); +static int udf_update_extents(struct inode *, struct kernel_long_ad *, int, + int, struct extent_position *); +static int udf_get_block_wb(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create); static void __udf_clear_extent_cache(struct inode *inode) { @@ -182,14 +185,56 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) } } +static int udf_adinicb_writepage(struct page *page, + struct writeback_control *wbc, void *data) +{ + struct inode *inode = page->mapping->host; + struct udf_inode_info *iinfo = UDF_I(inode); + + BUG_ON(!PageLocked(page)); + memcpy_to_page(page, 0, iinfo->i_data + iinfo->i_lenEAttr, + i_size_read(inode)); + unlock_page(page); + mark_inode_dirty(inode); + + return 0; +} + static int udf_writepages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc) { - return mpage_writepages(mapping, wbc, udf_get_block); + struct inode *inode = mapping->host; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) + return mpage_writepages(mapping, wbc, udf_get_block_wb); + return write_cache_pages(mapping, wbc, udf_adinicb_writepage, NULL); +} + +static void udf_adinicb_readpage(struct page *page) +{ + struct inode *inode = page->mapping->host; + char *kaddr; + struct udf_inode_info *iinfo = UDF_I(inode); + loff_t isize = i_size_read(inode); + + kaddr = kmap_local_page(page); + memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, isize); + memset(kaddr + isize, 0, PAGE_SIZE - isize); + flush_dcache_page(page); + SetPageUptodate(page); + kunmap_local(kaddr); } static int udf_read_folio(struct file *file, struct folio *folio) { + struct udf_inode_info *iinfo = UDF_I(file_inode(file)); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + udf_adinicb_readpage(&folio->page); + folio_unlock(folio); + return 0; + } 
return mpage_read_folio(folio, udf_get_block); } @@ -199,15 +244,49 @@ static void udf_readahead(struct readahead_control *rac) } static int udf_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + loff_t pos, unsigned len, + struct page **pagep, void **fsdata) { + struct udf_inode_info *iinfo = UDF_I(file_inode(file)); + struct page *page; int ret; - ret = block_write_begin(mapping, pos, len, pagep, udf_get_block); - if (unlikely(ret)) - udf_write_failed(mapping, pos + len); - return ret; + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + ret = block_write_begin(mapping, pos, len, pagep, + udf_get_block); + if (unlikely(ret)) + udf_write_failed(mapping, pos + len); + return ret; + } + if (WARN_ON_ONCE(pos >= PAGE_SIZE)) + return -EIO; + page = grab_cache_page_write_begin(mapping, 0); + if (!page) + return -ENOMEM; + *pagep = page; + if (!PageUptodate(page)) + udf_adinicb_readpage(page); + return 0; +} + +static int udf_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = file_inode(file); + loff_t last_pos; + + if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) + return generic_write_end(file, mapping, pos, len, copied, page, + fsdata); + last_pos = pos + copied; + if (last_pos > inode->i_size) + i_size_write(inode, last_pos); + set_page_dirty(page); + unlock_page(page); + put_page(page); + + return copied; } static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter) @@ -218,6 +297,9 @@ static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter) size_t count = iov_iter_count(iter); ssize_t ret; + /* Fallback to buffered IO for in-ICB files */ + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + return 0; ret = blockdev_direct_IO(iocb, inode, iter, udf_get_block); if (unlikely(ret < 0 && iov_iter_rw(iter) == WRITE)) 
udf_write_failed(mapping, iocb->ki_pos + count); @@ -226,6 +308,10 @@ static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static sector_t udf_bmap(struct address_space *mapping, sector_t block) { + struct udf_inode_info *iinfo = UDF_I(mapping->host); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + return -EINVAL; return generic_block_bmap(mapping, block, udf_get_block); } @@ -236,7 +322,7 @@ const struct address_space_operations udf_aops = { .readahead = udf_readahead, .writepages = udf_writepages, .write_begin = udf_write_begin, - .write_end = generic_write_end, + .write_end = udf_write_end, .direct_IO = udf_direct_IO, .bmap = udf_bmap, .migrate_folio = buffer_migrate_folio, @@ -245,18 +331,17 @@ const struct address_space_operations udf_aops = { /* * Expand file stored in ICB to a normal one-block-file * - * This function requires i_data_sem for writing and releases it. * This function requires i_mutex held */ int udf_expand_file_adinicb(struct inode *inode) { struct page *page; - char *kaddr; struct udf_inode_info *iinfo = UDF_I(inode); int err; WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { + down_write(&iinfo->i_data_sem); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else @@ -267,26 +352,13 @@ int udf_expand_file_adinicb(struct inode *inode) mark_inode_dirty(inode); return 0; } - /* - * Release i_data_sem so that we can lock a page - page lock ranks - * above i_data_sem. i_mutex still protects us against file changes. 
- */ - up_write(&iinfo->i_data_sem); page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); if (!page) return -ENOMEM; - if (!PageUptodate(page)) { - kaddr = kmap_atomic(page); - memset(kaddr + iinfo->i_lenAlloc, 0x00, - PAGE_SIZE - iinfo->i_lenAlloc); - memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, - iinfo->i_lenAlloc); - flush_dcache_page(page); - SetPageUptodate(page); - kunmap_atomic(kaddr); - } + if (!PageUptodate(page)) + udf_adinicb_readpage(page); down_write(&iinfo->i_data_sem); memset(iinfo->i_data + iinfo->i_lenEAttr, 0x00, iinfo->i_lenAlloc); @@ -295,8 +367,6 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; - /* from now on we have normal address_space methods */ - inode->i_data.a_ops = &udf_aops; set_page_dirty(page); unlock_page(page); up_write(&iinfo->i_data_sem); @@ -305,12 +375,10 @@ int udf_expand_file_adinicb(struct inode *inode) /* Restore everything back so that we don't lose data... 
*/ lock_page(page); down_write(&iinfo->i_data_sem); - kaddr = kmap_atomic(page); - memcpy(iinfo->i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); - kunmap_atomic(kaddr); + memcpy_to_page(page, 0, iinfo->i_data + iinfo->i_lenEAttr, + inode->i_size); unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; - inode->i_data.a_ops = &udf_adinicb_aops; iinfo->i_lenAlloc = inode->i_size; up_write(&iinfo->i_data_sem); } @@ -320,162 +388,103 @@ int udf_expand_file_adinicb(struct inode *inode) return err; } -struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, - udf_pblk_t *block, int *err) -{ - udf_pblk_t newblock; - struct buffer_head *dbh = NULL; - struct kernel_lb_addr eloc; - uint8_t alloctype; - struct extent_position epos; +#define UDF_MAP_CREATE 0x01 /* Mapping can allocate new blocks */ +#define UDF_MAP_NOPREALLOC 0x02 /* Do not preallocate blocks */ - struct udf_fileident_bh sfibh, dfibh; - loff_t f_pos = udf_ext0_offset(inode); - int size = udf_ext0_offset(inode) + inode->i_size; - struct fileIdentDesc cfi, *sfi, *dfi; - struct udf_inode_info *iinfo = UDF_I(inode); +#define UDF_BLK_MAPPED 0x01 /* Block was successfully mapped */ +#define UDF_BLK_NEW 0x02 /* Block was freshly allocated */ - if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) - alloctype = ICBTAG_FLAG_AD_SHORT; - else - alloctype = ICBTAG_FLAG_AD_LONG; +struct udf_map_rq { + sector_t lblk; + udf_pblk_t pblk; + int iflags; /* UDF_MAP_ flags determining behavior */ + int oflags; /* UDF_BLK_ flags reporting results */ +}; - if (!inode->i_size) { - iinfo->i_alloc_type = alloctype; - mark_inode_dirty(inode); - return NULL; - } +static int udf_map_block(struct inode *inode, struct udf_map_rq *map) +{ + int err; + struct udf_inode_info *iinfo = UDF_I(inode); - /* alloc block, and copy data to it */ - *block = udf_new_block(inode->i_sb, inode, - iinfo->i_location.partitionReferenceNum, - iinfo->i_location.logicalBlockNum, err); - if (!(*block)) - return NULL; - newblock = 
udf_get_pblock(inode->i_sb, *block, - iinfo->i_location.partitionReferenceNum, - 0); - if (!newblock) - return NULL; - dbh = udf_tgetblk(inode->i_sb, newblock); - if (!dbh) - return NULL; - lock_buffer(dbh); - memset(dbh->b_data, 0x00, inode->i_sb->s_blocksize); - set_buffer_uptodate(dbh); - unlock_buffer(dbh); - mark_buffer_dirty_inode(dbh, inode); - - sfibh.soffset = sfibh.eoffset = - f_pos & (inode->i_sb->s_blocksize - 1); - sfibh.sbh = sfibh.ebh = NULL; - dfibh.soffset = dfibh.eoffset = 0; - dfibh.sbh = dfibh.ebh = dbh; - while (f_pos < size) { - iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; - sfi = udf_fileident_read(inode, &f_pos, &sfibh, &cfi, NULL, - NULL, NULL, NULL); - if (!sfi) { - brelse(dbh); - return NULL; - } - iinfo->i_alloc_type = alloctype; - sfi->descTag.tagLocation = cpu_to_le32(*block); - dfibh.soffset = dfibh.eoffset; - dfibh.eoffset += (sfibh.eoffset - sfibh.soffset); - dfi = (struct fileIdentDesc *)(dbh->b_data + dfibh.soffset); - if (udf_write_fi(inode, sfi, dfi, &dfibh, sfi->impUse, - udf_get_fi_ident(sfi))) { - iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; - brelse(dbh); - return NULL; + map->oflags = 0; + if (!(map->iflags & UDF_MAP_CREATE)) { + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t offset; + struct extent_position epos = {}; + + down_read(&iinfo->i_data_sem); + if (inode_bmap(inode, map->lblk, &epos, &eloc, &elen, &offset) + == (EXT_RECORDED_ALLOCATED >> 30)) { + map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, + offset); + map->oflags |= UDF_BLK_MAPPED; } - } - mark_buffer_dirty_inode(dbh, inode); + up_read(&iinfo->i_data_sem); + brelse(epos.bh); - memset(iinfo->i_data + iinfo->i_lenEAttr, 0, iinfo->i_lenAlloc); - iinfo->i_lenAlloc = 0; - eloc.logicalBlockNum = *block; - eloc.partitionReferenceNum = - iinfo->i_location.partitionReferenceNum; - iinfo->i_lenExtents = inode->i_size; - epos.bh = NULL; - epos.block = iinfo->i_location; - epos.offset = udf_file_entry_alloc_offset(inode); - udf_add_aext(inode, &epos, &eloc, 
inode->i_size, 0); - /* UniqueID stuff */ - - brelse(epos.bh); - mark_inode_dirty(inode); - return dbh; -} - -static int udf_get_block(struct inode *inode, sector_t block, - struct buffer_head *bh_result, int create) -{ - int err, new; - sector_t phys = 0; - struct udf_inode_info *iinfo; - - if (!create) { - phys = udf_block_map(inode, block); - if (phys) - map_bh(bh_result, inode->i_sb, phys); return 0; } - err = -EIO; - new = 0; - iinfo = UDF_I(inode); - down_write(&iinfo->i_data_sem); - if (block == iinfo->i_next_alloc_block + 1) { - iinfo->i_next_alloc_block++; - iinfo->i_next_alloc_goal++; - } - /* * Block beyond EOF and prealloc extents? Just discard preallocation * as it is not useful and complicates things. */ - if (((loff_t)block) << inode->i_blkbits > iinfo->i_lenExtents) + if (((loff_t)map->lblk) << inode->i_blkbits >= iinfo->i_lenExtents) udf_discard_prealloc(inode); udf_clear_extent_cache(inode); - phys = inode_getblk(inode, block, &err, &new); - if (!phys) - goto abort; - - if (new) - set_buffer_new(bh_result); - map_bh(bh_result, inode->i_sb, phys); - -abort: + err = inode_getblk(inode, map); up_write(&iinfo->i_data_sem); return err; } -static struct buffer_head *udf_getblk(struct inode *inode, udf_pblk_t block, - int create, int *err) +static int __udf_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int flags) { - struct buffer_head *bh; - struct buffer_head dummy; - - dummy.b_state = 0; - dummy.b_blocknr = -1000; - *err = udf_get_block(inode, block, &dummy, create); - if (!*err && buffer_mapped(&dummy)) { - bh = sb_getblk(inode->i_sb, dummy.b_blocknr); - if (buffer_new(&dummy)) { - lock_buffer(bh); - memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); - set_buffer_uptodate(bh); - unlock_buffer(bh); - mark_buffer_dirty_inode(bh, inode); - } - return bh; + int err; + struct udf_map_rq map = { + .lblk = block, + .iflags = flags, + }; + + err = udf_map_block(inode, &map); + if (err < 0) + return err; + if (map.oflags & 
UDF_BLK_MAPPED) { + map_bh(bh_result, inode->i_sb, map.pblk); + if (map.oflags & UDF_BLK_NEW) + set_buffer_new(bh_result); } + return 0; +} - return NULL; +int udf_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + int flags = create ? UDF_MAP_CREATE : 0; + + /* + * We preallocate blocks only for regular files. It also makes sense + * for directories but there's a problem when to drop the + * preallocation. We might use some delayed work for that but I feel + * it's overengineering for a filesystem like UDF. + */ + if (!S_ISREG(inode->i_mode)) + flags |= UDF_MAP_NOPREALLOC; + return __udf_get_block(inode, block, bh_result, flags); +} + +/* + * We shouldn't be allocating blocks on page writeback since we allocate them + * on page fault. We can spot dirty buffers without allocated blocks though + * when truncate expands file. These however don't have valid data so we can + * safely ignore them. So never allocate blocks from page writeback. + */ +static int udf_get_block_wb(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + return __udf_get_block(inode, block, bh_result, 0); } /* Extend the file with new blocks totaling 'new_block_bytes', @@ -509,6 +518,7 @@ static int udf_do_extend_file(struct inode *inode, ~(sb->s_blocksize - 1); } + add = 0; /* Can we merge with the previous extent? 
*/ if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == EXT_NOT_RECORDED_NOT_ALLOCATED) { @@ -521,8 +531,10 @@ static int udf_do_extend_file(struct inode *inode, } if (fake) { - udf_add_aext(inode, last_pos, &last_ext->extLocation, - last_ext->extLength, 1); + err = udf_add_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + if (err < 0) + goto out_err; count++; } else { struct kernel_lb_addr tmploc; @@ -539,6 +551,7 @@ static int udf_do_extend_file(struct inode *inode, if (new_block_bytes) udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0); } + iinfo->i_lenExtents += add; /* Managed to do everything necessary? */ if (!new_block_bytes) @@ -556,7 +569,8 @@ static int udf_do_extend_file(struct inode *inode, err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err) - return err; + goto out_err; + iinfo->i_lenExtents += add; count++; } if (new_block_bytes) { @@ -565,7 +579,8 @@ static int udf_do_extend_file(struct inode *inode, err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err) - return err; + goto out_err; + iinfo->i_lenExtents += new_block_bytes; count++; } @@ -579,6 +594,11 @@ out: return -EIO; return count; +out_err: + /* Remove extents we've created so far */ + udf_clear_extent_cache(inode); + udf_truncate_extents(inode); + return err; } /* Extend the final block of the file to final_block_len bytes */ @@ -626,6 +646,7 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) else BUG(); + down_write(&iinfo->i_data_sem); /* * When creating hole in file, just don't bother with preserving * preallocation. It likely won't be very useful anyway. 
@@ -668,14 +689,13 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) if (err < 0) goto out; err = 0; - iinfo->i_lenExtents = newsize; out: brelse(epos.bh); + up_write(&iinfo->i_data_sem); return err; } -static sector_t inode_getblk(struct inode *inode, sector_t block, - int *err, int *new) +static int inode_getblk(struct inode *inode, struct udf_map_rq *map) { struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; struct extent_position prev_epos, cur_epos, next_epos; @@ -684,21 +704,20 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, struct kernel_lb_addr eloc, tmpeloc; int c = 1; loff_t lbcount = 0, b_off = 0; - udf_pblk_t newblocknum, newblock = 0; + udf_pblk_t newblocknum; sector_t offset = 0; int8_t etype; struct udf_inode_info *iinfo = UDF_I(inode); udf_pblk_t goal = 0, pgoal = iinfo->i_location.logicalBlockNum; int lastblock = 0; bool isBeyondEOF; + int ret = 0; - *err = 0; - *new = 0; prev_epos.offset = udf_file_entry_alloc_offset(inode); prev_epos.block = iinfo->i_location; prev_epos.bh = NULL; cur_epos = next_epos = prev_epos; - b_off = (loff_t)block << inode->i_sb->s_blocksize_bits; + b_off = (loff_t)map->lblk << inode->i_sb->s_blocksize_bits; /* find the extent which contains the block we are looking for. alternate between laarr[0] and laarr[1] for locations of the @@ -757,15 +776,18 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, elen = EXT_RECORDED_ALLOCATED | ((elen + inode->i_sb->s_blocksize - 1) & ~(inode->i_sb->s_blocksize - 1)); + iinfo->i_lenExtents = + ALIGN(iinfo->i_lenExtents, + inode->i_sb->s_blocksize); udf_write_aext(inode, &cur_epos, &eloc, elen, 1); } - newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset); + map->oflags = UDF_BLK_MAPPED; + map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, offset); goto out_free; } /* Are we beyond EOF and preallocated extent? 
*/ if (etype == -1) { - int ret; loff_t hole_len; isBeyondEOF = true; @@ -785,26 +807,22 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, /* Create extents for the hole between EOF and offset */ hole_len = (loff_t)offset << inode->i_blkbits; ret = udf_do_extend_file(inode, &prev_epos, laarr, hole_len); - if (ret < 0) { - *err = ret; + if (ret < 0) goto out_free; - } c = 0; offset = 0; count += ret; - /* We are not covered by a preallocated extent? */ - if ((laarr[0].extLength & UDF_EXTENT_FLAG_MASK) != - EXT_NOT_RECORDED_ALLOCATED) { - /* Is there any real extent? - otherwise we overwrite - * the fake one... */ - if (count) - c = !c; - laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | - inode->i_sb->s_blocksize; - memset(&laarr[c].extLocation, 0x00, - sizeof(struct kernel_lb_addr)); - count++; - } + /* + * Is there any real extent? - otherwise we overwrite the fake + * one... + */ + if (count) + c = !c; + laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | + inode->i_sb->s_blocksize; + memset(&laarr[c].extLocation, 0x00, + sizeof(struct kernel_lb_addr)); + count++; endnum = c + 1; lastblock = 1; } else { @@ -838,7 +856,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, if ((laarr[c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) newblocknum = laarr[c].extLocation.logicalBlockNum + offset; else { /* otherwise, allocate a new block */ - if (iinfo->i_next_alloc_block == block) + if (iinfo->i_next_alloc_block == map->lblk) goal = iinfo->i_next_alloc_goal; if (!goal) { @@ -848,11 +866,9 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, newblocknum = udf_new_block(inode->i_sb, inode, iinfo->i_location.partitionReferenceNum, - goal, err); - if (!newblocknum) { - *err = -ENOSPC; + goal, &ret); + if (!newblocknum) goto out_free; - } if (isBeyondEOF) iinfo->i_lenExtents += inode->i_sb->s_blocksize; } @@ -863,11 +879,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, * block */ 
udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); - /* We preallocate blocks only for regular files. It also makes sense - * for directories but there's a problem when to drop the - * preallocation. We might use some delayed work for that but I feel - * it's overengineering for a filesystem like UDF. */ - if (S_ISREG(inode->i_mode)) + if (!(map->iflags & UDF_MAP_NOPREALLOC)) udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); /* merge any continuous blocks in laarr */ @@ -876,28 +888,31 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, /* write back the new extents, inserting new extents if the new number * of extents is greater than the old number, and deleting extents if * the new number of extents is less than the old number */ - udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); + ret = udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); + if (ret < 0) + goto out_free; - newblock = udf_get_pblock(inode->i_sb, newblocknum, + map->pblk = udf_get_pblock(inode->i_sb, newblocknum, iinfo->i_location.partitionReferenceNum, 0); - if (!newblock) { - *err = -EIO; + if (!map->pblk) { + ret = -EFSCORRUPTED; goto out_free; } - *new = 1; - iinfo->i_next_alloc_block = block; - iinfo->i_next_alloc_goal = newblocknum; + map->oflags = UDF_BLK_NEW | UDF_BLK_MAPPED; + iinfo->i_next_alloc_block = map->lblk + 1; + iinfo->i_next_alloc_goal = newblocknum + 1; inode->i_ctime = current_time(inode); if (IS_SYNC(inode)) udf_sync_inode(inode); else mark_inode_dirty(inode); + ret = 0; out_free: brelse(prev_epos.bh); brelse(cur_epos.bh); brelse(next_epos.bh); - return newblock; + return ret; } static void udf_split_extents(struct inode *inode, int *c, int offset, @@ -1080,23 +1095,8 @@ static void udf_merge_extents(struct inode *inode, struct kernel_long_ad *laarr, blocksize - 1) >> blocksize_bits)))) { if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + - (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + - blocksize - 1) & 
~UDF_EXTENT_LENGTH_MASK) { - lip1->extLength = (lip1->extLength - - (li->extLength & - UDF_EXTENT_LENGTH_MASK) + - UDF_EXTENT_LENGTH_MASK) & - ~(blocksize - 1); - li->extLength = (li->extLength & - UDF_EXTENT_FLAG_MASK) + - (UDF_EXTENT_LENGTH_MASK + 1) - - blocksize; - lip1->extLocation.logicalBlockNum = - li->extLocation.logicalBlockNum + - ((li->extLength & - UDF_EXTENT_LENGTH_MASK) >> - blocksize_bits); - } else { + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) <= UDF_EXTENT_LENGTH_MASK) { li->extLength = lip1->extLength + (((li->extLength & UDF_EXTENT_LENGTH_MASK) + @@ -1159,21 +1159,30 @@ static void udf_merge_extents(struct inode *inode, struct kernel_long_ad *laarr, } } -static void udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr, - int startnum, int endnum, - struct extent_position *epos) +static int udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr, + int startnum, int endnum, + struct extent_position *epos) { int start = 0, i; struct kernel_lb_addr tmploc; uint32_t tmplen; + int err; if (startnum > endnum) { for (i = 0; i < (startnum - endnum); i++) udf_delete_aext(inode, *epos); } else if (startnum < endnum) { for (i = 0; i < (endnum - startnum); i++) { - udf_insert_aext(inode, *epos, laarr[i].extLocation, - laarr[i].extLength); + err = udf_insert_aext(inode, *epos, + laarr[i].extLocation, + laarr[i].extLength); + /* + * If we fail here, we are likely corrupting the extent + * list and leaking blocks. At least stop early to + * limit the damage. 
+ */ + if (err < 0) + return err; udf_next_aext(inode, epos, &laarr[i].extLocation, &laarr[i].extLength, 1); start++; @@ -1185,17 +1194,36 @@ static void udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr udf_write_aext(inode, epos, &laarr[i].extLocation, laarr[i].extLength, 1); } + return 0; } struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, int create, int *err) { struct buffer_head *bh = NULL; + struct udf_map_rq map = { + .lblk = block, + .iflags = UDF_MAP_NOPREALLOC | (create ? UDF_MAP_CREATE : 0), + }; - bh = udf_getblk(inode, block, create, err); - if (!bh) + *err = udf_map_block(inode, &map); + if (*err || !(map.oflags & UDF_BLK_MAPPED)) return NULL; + bh = sb_getblk(inode->i_sb, map.pblk); + if (!bh) { + *err = -ENOMEM; + return NULL; + } + if (map.oflags & UDF_BLK_NEW) { + lock_buffer(bh); + memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + return bh; + } + if (bh_read(bh, 0) >= 0) return bh; @@ -1206,7 +1234,7 @@ struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, int udf_setsize(struct inode *inode, loff_t newsize) { - int err; + int err = 0; struct udf_inode_info *iinfo; unsigned int bsize = i_blocksize(inode); @@ -1216,28 +1244,25 @@ int udf_setsize(struct inode *inode, loff_t newsize) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; + filemap_invalidate_lock(inode->i_mapping); iinfo = UDF_I(inode); if (newsize > inode->i_size) { - down_write(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - if (bsize < + if (bsize >= (udf_file_entry_alloc_offset(inode) + newsize)) { - err = udf_expand_file_adinicb(inode); - if (err) - return err; down_write(&iinfo->i_data_sem); - } else { iinfo->i_lenAlloc = newsize; + up_write(&iinfo->i_data_sem); goto set_size; } + err = udf_expand_file_adinicb(inode); + if (err) + goto out_unlock; } err = udf_extend_file(inode, newsize); - if (err) { - 
up_write(&iinfo->i_data_sem); - return err; - } + if (err) + goto out_unlock; set_size: - up_write(&iinfo->i_data_sem); truncate_setsize(inode, newsize); } else { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { @@ -1254,14 +1279,14 @@ set_size: err = block_truncate_page(inode->i_mapping, newsize, udf_get_block); if (err) - return err; + goto out_unlock; truncate_setsize(inode, newsize); down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); err = udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); if (err) - return err; + goto out_unlock; } update_time: inode->i_mtime = inode->i_ctime = current_time(inode); @@ -1269,7 +1294,9 @@ update_time: udf_sync_inode(inode); else mark_inode_dirty(inode); - return 0; +out_unlock: + filemap_invalidate_unlock(inode->i_mapping); + return err; } /* @@ -1381,6 +1408,7 @@ reread: ret = -EIO; goto out; } + iinfo->i_hidden = hidden_inode; iinfo->i_unique = 0; iinfo->i_lenEAttr = 0; iinfo->i_lenExtents = 0; @@ -1537,10 +1565,7 @@ reread: case ICBTAG_FILE_TYPE_REGULAR: case ICBTAG_FILE_TYPE_UNDEF: case ICBTAG_FILE_TYPE_VAT20: - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - inode->i_data.a_ops = &udf_adinicb_aops; - else - inode->i_data.a_ops = &udf_aops; + inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; inode->i_mode |= S_IFREG; @@ -1671,7 +1696,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; struct udf_inode_info *iinfo = UDF_I(inode); - bh = udf_tgetblk(inode->i_sb, + bh = sb_getblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0)); if (!bh) { udf_debug("getblk failure\n"); @@ -1716,8 +1741,12 @@ static int udf_update_inode(struct inode *inode, int do_sync) if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0) fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1); - else - fe->fileLinkCount = cpu_to_le16(inode->i_nlink); + else { + if (iinfo->i_hidden) + 
fe->fileLinkCount = cpu_to_le16(0); + else + fe->fileLinkCount = cpu_to_le16(inode->i_nlink); + } fe->informationLength = cpu_to_le64(inode->i_size); @@ -1888,8 +1917,13 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->i_state & I_NEW)) { + if (UDF_I(inode)->i_hidden != hidden_inode) { + iput(inode); + return ERR_PTR(-EFSCORRUPTED); + } return inode; + } memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); err = udf_read_inode(inode, hidden_inode); @@ -1922,7 +1956,7 @@ int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block, neloc.logicalBlockNum = block; neloc.partitionReferenceNum = epos->block.partitionReferenceNum; - bh = udf_tgetblk(sb, udf_get_lb_pblock(sb, &neloc, 0)); + bh = sb_getblk(sb, udf_get_lb_pblock(sb, &neloc, 0)); if (!bh) return -EIO; lock_buffer(bh); @@ -2139,7 +2173,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, epos->offset = sizeof(struct allocExtDesc); brelse(epos->bh); block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0); - epos->bh = udf_tread(inode->i_sb, block); + epos->bh = sb_bread(inode->i_sb, block); if (!epos->bh) { udf_debug("reading block %u failed!\n", block); return -1; @@ -2203,12 +2237,13 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, return etype; } -static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, - struct kernel_lb_addr neloc, uint32_t nelen) +static int udf_insert_aext(struct inode *inode, struct extent_position epos, + struct kernel_lb_addr neloc, uint32_t nelen) { struct kernel_lb_addr oeloc; uint32_t oelen; int8_t etype; + int err; if (epos.bh) get_bh(epos.bh); @@ -2218,10 +2253,10 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, neloc = oeloc; nelen = (etype << 30) | oelen; } - udf_add_aext(inode, &epos, &neloc, nelen, 1); + err = udf_add_aext(inode, 
&epos, &neloc, nelen, 1); brelse(epos.bh); - return (nelen >> 30); + return err; } int8_t udf_delete_aext(struct inode *inode, struct extent_position epos) @@ -2339,28 +2374,3 @@ int8_t inode_bmap(struct inode *inode, sector_t block, return etype; } - -udf_pblk_t udf_block_map(struct inode *inode, sector_t block) -{ - struct kernel_lb_addr eloc; - uint32_t elen; - sector_t offset; - struct extent_position epos = {}; - udf_pblk_t ret; - - down_read(&UDF_I(inode)->i_data_sem); - - if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == - (EXT_RECORDED_ALLOCATED >> 30)) - ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset); - else - ret = 0; - - up_read(&UDF_I(inode)->i_data_sem); - brelse(epos.bh); - - if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV)) - return udf_fixed_to_variable(ret); - else - return ret; -} diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 46d697172197..c87ed942d076 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -45,7 +45,7 @@ unsigned int udf_get_last_session(struct super_block *sb) return 0; } -unsigned long udf_get_last_block(struct super_block *sb) +udf_pblk_t udf_get_last_block(struct super_block *sb) { struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); unsigned long lblock = 0; @@ -54,8 +54,11 @@ unsigned long udf_get_last_block(struct super_block *sb) * The cdrom layer call failed or returned obviously bogus value? * Try using the device size... 
*/ - if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0) + if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0) { + if (sb_bdev_nr_blocks(sb) > ~(udf_pblk_t)0) + return 0; lblock = sb_bdev_nr_blocks(sb); + } if (lblock) return lblock - 1; diff --git a/fs/udf/misc.c b/fs/udf/misc.c index 1614d308d0f0..3777468d06ce 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -28,22 +28,6 @@ #include "udf_i.h" #include "udf_sb.h" -struct buffer_head *udf_tgetblk(struct super_block *sb, udf_pblk_t block) -{ - if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV)) - return sb_getblk(sb, udf_fixed_to_variable(block)); - else - return sb_getblk(sb, block); -} - -struct buffer_head *udf_tread(struct super_block *sb, udf_pblk_t block) -{ - if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV)) - return sb_bread(sb, udf_fixed_to_variable(block)); - else - return sb_bread(sb, block); -} - struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, uint32_t type, uint8_t loc) { @@ -216,7 +200,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, if (block == 0xFFFFFFFF) return NULL; - bh = udf_tread(sb, block); + bh = sb_bread(sb, block); if (!bh) { udf_err(sb, "read failed, block=%u, location=%u\n", block, location); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 7c95c549dd64..fd20423d3ed2 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -41,283 +41,93 @@ static inline int udf_match(int len1, const unsigned char *name1, int len2, return !memcmp(name1, name2, len1); } -int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, - struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh, - uint8_t *impuse, uint8_t *fileident) -{ - uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag); - uint16_t crc; - int offset; - uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse); - uint8_t lfi = cfi->lengthFileIdent; - int padlen = fibh->eoffset - fibh->soffset - liu - lfi - - sizeof(struct fileIdentDesc); - int adinicb = 0; - - if 
(UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - adinicb = 1; - - offset = fibh->soffset + sizeof(struct fileIdentDesc); - - if (impuse) { - if (adinicb || (offset + liu < 0)) { - memcpy((uint8_t *)sfi->impUse, impuse, liu); - } else if (offset >= 0) { - memcpy(fibh->ebh->b_data + offset, impuse, liu); - } else { - memcpy((uint8_t *)sfi->impUse, impuse, -offset); - memcpy(fibh->ebh->b_data, impuse - offset, - liu + offset); - } - } - - offset += liu; - - if (fileident) { - if (adinicb || (offset + lfi < 0)) { - memcpy(sfi->impUse + liu, fileident, lfi); - } else if (offset >= 0) { - memcpy(fibh->ebh->b_data + offset, fileident, lfi); - } else { - memcpy(sfi->impUse + liu, fileident, -offset); - memcpy(fibh->ebh->b_data, fileident - offset, - lfi + offset); - } - } - - offset += lfi; - - if (adinicb || (offset + padlen < 0)) { - memset(sfi->impUse + liu + lfi, 0x00, padlen); - } else if (offset >= 0) { - memset(fibh->ebh->b_data + offset, 0x00, padlen); - } else { - memset(sfi->impUse + liu + lfi, 0x00, -offset); - memset(fibh->ebh->b_data, 0x00, padlen + offset); - } - - crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag), - sizeof(struct fileIdentDesc) - sizeof(struct tag)); - - if (fibh->sbh == fibh->ebh) { - crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, - crclen + sizeof(struct tag) - - sizeof(struct fileIdentDesc)); - } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) { - crc = crc_itu_t(crc, fibh->ebh->b_data + - sizeof(struct fileIdentDesc) + - fibh->soffset, - crclen + sizeof(struct tag) - - sizeof(struct fileIdentDesc)); - } else { - crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, - -fibh->soffset - sizeof(struct fileIdentDesc)); - crc = crc_itu_t(crc, fibh->ebh->b_data, fibh->eoffset); - } - - cfi->descTag.descCRC = cpu_to_le16(crc); - cfi->descTag.descCRCLength = cpu_to_le16(crclen); - cfi->descTag.tagChecksum = udf_tag_checksum(&cfi->descTag); - - if (adinicb || (sizeof(struct fileIdentDesc) <= -fibh->soffset)) { - memcpy((uint8_t *)sfi, 
(uint8_t *)cfi, - sizeof(struct fileIdentDesc)); - } else { - memcpy((uint8_t *)sfi, (uint8_t *)cfi, -fibh->soffset); - memcpy(fibh->ebh->b_data, (uint8_t *)cfi - fibh->soffset, - sizeof(struct fileIdentDesc) + fibh->soffset); - } - - if (adinicb) { - mark_inode_dirty(inode); - } else { - if (fibh->sbh != fibh->ebh) - mark_buffer_dirty_inode(fibh->ebh, inode); - mark_buffer_dirty_inode(fibh->sbh, inode); - } - inode_inc_iversion(inode); - - return 0; -} - /** - * udf_find_entry - find entry in given directory. + * udf_fiiter_find_entry - find entry in given directory. * * @dir: directory inode to search in * @child: qstr of the name - * @fibh: buffer head / inode with file identifier descriptor we found - * @cfi: found file identifier descriptor with given name + * @iter: iter to use for searching * * This function searches in the directory @dir for a file name @child. When - * found, @fibh points to the buffer head(s) (bh is NULL for in ICB - * directories) containing the file identifier descriptor (FID). In that case - * the function returns pointer to the FID in the buffer or inode - but note - * that FID may be split among two buffers (blocks) so accessing it via that - * pointer isn't easily possible. This pointer can be used only as an iterator - * for other directory manipulation functions. For inspection of the FID @cfi - * can be used - the found FID is copied there. + * found, @iter points to the position in the directory with given entry. * - * Returns pointer to FID, NULL when nothing found, or error code. + * Returns 0 on success, < 0 on error (including -ENOENT). 
*/ -static struct fileIdentDesc *udf_find_entry(struct inode *dir, - const struct qstr *child, - struct udf_fileident_bh *fibh, - struct fileIdentDesc *cfi) +static int udf_fiiter_find_entry(struct inode *dir, const struct qstr *child, + struct udf_fileident_iter *iter) { - struct fileIdentDesc *fi = NULL; - loff_t f_pos; - udf_pblk_t block; int flen; - unsigned char *fname = NULL, *copy_name = NULL; - unsigned char *nameptr; - uint8_t lfi; - uint16_t liu; - loff_t size; - struct kernel_lb_addr eloc; - uint32_t elen; - sector_t offset; - struct extent_position epos = {}; - struct udf_inode_info *dinfo = UDF_I(dir); + unsigned char *fname = NULL; + struct super_block *sb = dir->i_sb; int isdotdot = child->len == 2 && child->name[0] == '.' && child->name[1] == '.'; - struct super_block *sb = dir->i_sb; - - size = udf_ext0_offset(dir) + dir->i_size; - f_pos = udf_ext0_offset(dir); - - fibh->sbh = fibh->ebh = NULL; - fibh->soffset = fibh->eoffset = f_pos & (sb->s_blocksize - 1); - if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - if (inode_bmap(dir, f_pos >> sb->s_blocksize_bits, &epos, - &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { - fi = ERR_PTR(-EIO); - goto out_err; - } - - block = udf_get_lb_pblock(sb, &eloc, offset); - if ((++offset << sb->s_blocksize_bits) < elen) { - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - } else - offset = 0; - - fibh->sbh = fibh->ebh = udf_tread(sb, block); - if (!fibh->sbh) { - fi = ERR_PTR(-EIO); - goto out_err; - } - } + int ret; fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); - if (!fname) { - fi = ERR_PTR(-ENOMEM); - goto out_err; - } - - while (f_pos < size) { - fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, - &elen, &offset); - if (!fi) { - fi = ERR_PTR(-EIO); - goto out_err; - } - - liu = le16_to_cpu(cfi->lengthOfImpUse); - lfi = cfi->lengthFileIdent; - - if 
(fibh->sbh == fibh->ebh) { - nameptr = udf_get_fi_ident(fi); - } else { - int poffset; /* Unpaded ending offset */ - - poffset = fibh->soffset + sizeof(struct fileIdentDesc) + - liu + lfi; - - if (poffset >= lfi) - nameptr = (uint8_t *)(fibh->ebh->b_data + - poffset - lfi); - else { - if (!copy_name) { - copy_name = kmalloc(UDF_NAME_LEN_CS0, - GFP_NOFS); - if (!copy_name) { - fi = ERR_PTR(-ENOMEM); - goto out_err; - } - } - nameptr = copy_name; - memcpy(nameptr, udf_get_fi_ident(fi), - lfi - poffset); - memcpy(nameptr + lfi - poffset, - fibh->ebh->b_data, poffset); - } - } + if (!fname) + return -ENOMEM; - if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { + for (ret = udf_fiiter_init(iter, dir, 0); + !ret && iter->pos < dir->i_size; + ret = udf_fiiter_advance(iter)) { + if (iter->fi.fileCharacteristics & FID_FILE_CHAR_DELETED) { if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) continue; } - if ((cfi->fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { + if (iter->fi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) { if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) continue; } - if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) && + if ((iter->fi.fileCharacteristics & FID_FILE_CHAR_PARENT) && isdotdot) goto out_ok; - if (!lfi) + if (!iter->fi.lengthFileIdent) continue; - flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); + flen = udf_get_filename(sb, iter->name, + iter->fi.lengthFileIdent, fname, UDF_NAME_LEN); if (flen < 0) { - fi = ERR_PTR(flen); + ret = flen; goto out_err; } if (udf_match(flen, fname, child->len, child->name)) goto out_ok; } + if (!ret) + ret = -ENOENT; - fi = NULL; out_err: - if (fibh->sbh != fibh->ebh) - brelse(fibh->ebh); - brelse(fibh->sbh); + udf_fiiter_release(iter); out_ok: - brelse(epos.bh); kfree(fname); - kfree(copy_name); - return fi; + return ret; } static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = NULL; - struct fileIdentDesc cfi; - struct 
udf_fileident_bh fibh; - struct fileIdentDesc *fi; + struct udf_fileident_iter iter; + int err; if (dentry->d_name.len > UDF_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); - if (IS_ERR(fi)) - return ERR_CAST(fi); + err = udf_fiiter_find_entry(dir, &dentry->d_name, &iter); + if (err < 0 && err != -ENOENT) + return ERR_PTR(err); - if (fi) { + if (err == 0) { struct kernel_lb_addr loc; - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); + loc = lelb_to_cpu(iter.fi.icb.extLocation); + udf_fiiter_release(&iter); - loc = lelb_to_cpu(cfi.icb.extLocation); inode = udf_iget(dir->i_sb, &loc); if (IS_ERR(inode)) return ERR_CAST(inode); @@ -326,287 +136,249 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, return d_splice_alias(inode, dentry); } -static struct fileIdentDesc *udf_add_entry(struct inode *dir, - struct dentry *dentry, - struct udf_fileident_bh *fibh, - struct fileIdentDesc *cfi, int *err) +static int udf_expand_dir_adinicb(struct inode *inode, udf_pblk_t *block) { - struct super_block *sb = dir->i_sb; - struct fileIdentDesc *fi = NULL; - unsigned char *name = NULL; - int namelen; - loff_t f_pos; - loff_t size = udf_ext0_offset(dir) + dir->i_size; - int nfidlen; - udf_pblk_t block; + udf_pblk_t newblock; + struct buffer_head *dbh = NULL; struct kernel_lb_addr eloc; - uint32_t elen = 0; - sector_t offset; - struct extent_position epos = {}; - struct udf_inode_info *dinfo; + struct extent_position epos; + uint8_t alloctype; + struct udf_inode_info *iinfo = UDF_I(inode); + struct udf_fileident_iter iter; + uint8_t *impuse; + int ret; - fibh->sbh = fibh->ebh = NULL; - name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS); - if (!name) { - *err = -ENOMEM; - goto out_err; - } + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + alloctype = ICBTAG_FLAG_AD_SHORT; + else + alloctype = ICBTAG_FLAG_AD_LONG; - if (dentry) { - if (!dentry->d_name.len) { - *err = -EINVAL; - goto out_err; 
- } - namelen = udf_put_filename(sb, dentry->d_name.name, - dentry->d_name.len, - name, UDF_NAME_LEN_CS0); - if (!namelen) { - *err = -ENAMETOOLONG; - goto out_err; - } - } else { - namelen = 0; + if (!inode->i_size) { + iinfo->i_alloc_type = alloctype; + mark_inode_dirty(inode); + return 0; } - nfidlen = ALIGN(sizeof(struct fileIdentDesc) + namelen, UDF_NAME_PAD); - - f_pos = udf_ext0_offset(dir); - - fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1); - dinfo = UDF_I(dir); - if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, - &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { - block = udf_get_lb_pblock(dir->i_sb, - &dinfo->i_location, 0); - fibh->soffset = fibh->eoffset = sb->s_blocksize; - goto add; - } - block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); - if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - } else - offset = 0; - - fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); - if (!fibh->sbh) { - *err = -EIO; - goto out_err; - } + /* alloc block, and copy data to it */ + *block = udf_new_block(inode->i_sb, inode, + iinfo->i_location.partitionReferenceNum, + iinfo->i_location.logicalBlockNum, &ret); + if (!(*block)) + return ret; + newblock = udf_get_pblock(inode->i_sb, *block, + iinfo->i_location.partitionReferenceNum, + 0); + if (newblock == 0xffffffff) + return -EFSCORRUPTED; + dbh = sb_getblk(inode->i_sb, newblock); + if (!dbh) + return -ENOMEM; + lock_buffer(dbh); + memcpy(dbh->b_data, iinfo->i_data, inode->i_size); + memset(dbh->b_data + inode->i_size, 0, + inode->i_sb->s_blocksize - inode->i_size); + set_buffer_uptodate(dbh); + unlock_buffer(dbh); + + /* Drop inline data, add block instead */ + iinfo->i_alloc_type = alloctype; + memset(iinfo->i_data + 
iinfo->i_lenEAttr, 0, iinfo->i_lenAlloc); + iinfo->i_lenAlloc = 0; + eloc.logicalBlockNum = *block; + eloc.partitionReferenceNum = + iinfo->i_location.partitionReferenceNum; + iinfo->i_lenExtents = inode->i_size; + epos.bh = NULL; + epos.block = iinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(inode); + ret = udf_add_aext(inode, &epos, &eloc, inode->i_size, 0); + brelse(epos.bh); + if (ret < 0) { + brelse(dbh); + udf_free_blocks(inode->i_sb, inode, &eloc, 0, 1); + return ret; + } + mark_inode_dirty(inode); - block = dinfo->i_location.logicalBlockNum; + /* Now fixup tags in moved directory entries */ + for (ret = udf_fiiter_init(&iter, inode, 0); + !ret && iter.pos < inode->i_size; + ret = udf_fiiter_advance(&iter)) { + iter.fi.descTag.tagLocation = cpu_to_le32(*block); + if (iter.fi.lengthOfImpUse != cpu_to_le16(0)) + impuse = dbh->b_data + iter.pos + + sizeof(struct fileIdentDesc); + else + impuse = NULL; + udf_fiiter_write_fi(&iter, impuse); } + brelse(dbh); + /* + * We don't expect the iteration to fail as the directory has been + * already verified to be correct + */ + WARN_ON_ONCE(ret); + udf_fiiter_release(&iter); - while (f_pos < size) { - fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, - &elen, &offset); + return 0; +} - if (!fi) { - *err = -EIO; - goto out_err; - } +static int udf_fiiter_add_entry(struct inode *dir, struct dentry *dentry, + struct udf_fileident_iter *iter) +{ + struct udf_inode_info *dinfo = UDF_I(dir); + int nfidlen, namelen = 0; + int ret; + int off, blksize = 1 << dir->i_blkbits; + udf_pblk_t block; + char name[UDF_NAME_LEN_CS0]; + + if (dentry) { + if (!dentry->d_name.len) + return -EINVAL; + namelen = udf_put_filename(dir->i_sb, dentry->d_name.name, + dentry->d_name.len, + name, UDF_NAME_LEN_CS0); + if (!namelen) + return -ENAMETOOLONG; + } + nfidlen = ALIGN(sizeof(struct fileIdentDesc) + namelen, UDF_NAME_PAD); - if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { - if (udf_dir_entry_len(cfi) == 
nfidlen) { - cfi->descTag.tagSerialNum = cpu_to_le16(1); - cfi->fileVersionNum = cpu_to_le16(1); - cfi->fileCharacteristics = 0; - cfi->lengthFileIdent = namelen; - cfi->lengthOfImpUse = cpu_to_le16(0); - if (!udf_write_fi(dir, cfi, fi, fibh, NULL, - name)) - goto out_ok; - else { - *err = -EIO; - goto out_err; - } + for (ret = udf_fiiter_init(iter, dir, 0); + !ret && iter->pos < dir->i_size; + ret = udf_fiiter_advance(iter)) { + if (iter->fi.fileCharacteristics & FID_FILE_CHAR_DELETED) { + if (udf_dir_entry_len(&iter->fi) == nfidlen) { + iter->fi.descTag.tagSerialNum = cpu_to_le16(1); + iter->fi.fileVersionNum = cpu_to_le16(1); + iter->fi.fileCharacteristics = 0; + iter->fi.lengthFileIdent = namelen; + iter->fi.lengthOfImpUse = cpu_to_le16(0); + memcpy(iter->namebuf, name, namelen); + iter->name = iter->namebuf; + return 0; } } } - -add: - f_pos += nfidlen; - + if (ret) { + udf_fiiter_release(iter); + return ret; + } if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && - sb->s_blocksize - fibh->eoffset < nfidlen) { - brelse(epos.bh); - epos.bh = NULL; - fibh->soffset -= udf_ext0_offset(dir); - fibh->eoffset -= udf_ext0_offset(dir); - f_pos -= udf_ext0_offset(dir); - if (fibh->sbh != fibh->ebh) - brelse(fibh->ebh); - brelse(fibh->sbh); - fibh->sbh = fibh->ebh = - udf_expand_dir_adinicb(dir, &block, err); - if (!fibh->sbh) - goto out_err; - epos.block = dinfo->i_location; - epos.offset = udf_file_entry_alloc_offset(dir); - /* Load extent udf_expand_dir_adinicb() has created */ - udf_current_aext(dir, &epos, &eloc, &elen, 1); + blksize - udf_ext0_offset(dir) - iter->pos < nfidlen) { + udf_fiiter_release(iter); + ret = udf_expand_dir_adinicb(dir, &block); + if (ret) + return ret; + ret = udf_fiiter_init(iter, dir, dir->i_size); + if (ret < 0) + return ret; } - /* Entry fits into current block? 
*/ - if (sb->s_blocksize - fibh->eoffset >= nfidlen) { - fibh->soffset = fibh->eoffset; - fibh->eoffset += nfidlen; - if (fibh->sbh != fibh->ebh) { - brelse(fibh->sbh); - fibh->sbh = fibh->ebh; - } - - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - block = dinfo->i_location.logicalBlockNum; - fi = (struct fileIdentDesc *) - (dinfo->i_data + fibh->soffset - - udf_ext0_offset(dir) + - dinfo->i_lenEAttr); - } else { - block = eloc.logicalBlockNum + - ((elen - 1) >> - dir->i_sb->s_blocksize_bits); - fi = (struct fileIdentDesc *) - (fibh->sbh->b_data + fibh->soffset); - } + /* Get blocknumber to use for entry tag */ + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + block = dinfo->i_location.logicalBlockNum; } else { - /* Round up last extent in the file */ - elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - udf_write_aext(dir, &epos, &eloc, elen, 1); - dinfo->i_lenExtents = (dinfo->i_lenExtents + sb->s_blocksize - - 1) & ~(sb->s_blocksize - 1); - - fibh->soffset = fibh->eoffset - sb->s_blocksize; - fibh->eoffset += nfidlen - sb->s_blocksize; - if (fibh->sbh != fibh->ebh) { - brelse(fibh->sbh); - fibh->sbh = fibh->ebh; - } + block = iter->eloc.logicalBlockNum + + ((iter->elen - 1) >> dir->i_blkbits); + } + off = iter->pos & (blksize - 1); + if (!off) + off = blksize; + /* Entry fits into current block? 
*/ + if (blksize - udf_ext0_offset(dir) - off >= nfidlen) + goto store_fi; - block = eloc.logicalBlockNum + ((elen - 1) >> - dir->i_sb->s_blocksize_bits); - fibh->ebh = udf_bread(dir, - f_pos >> dir->i_sb->s_blocksize_bits, 1, err); - if (!fibh->ebh) - goto out_err; - /* Extents could have been merged, invalidate our position */ - brelse(epos.bh); - epos.bh = NULL; - epos.block = dinfo->i_location; - epos.offset = udf_file_entry_alloc_offset(dir); - - if (!fibh->soffset) { - /* Find the freshly allocated block */ - while (udf_next_aext(dir, &epos, &eloc, &elen, 1) == - (EXT_RECORDED_ALLOCATED >> 30)) - ; - block = eloc.logicalBlockNum + ((elen - 1) >> - dir->i_sb->s_blocksize_bits); - brelse(fibh->sbh); - fibh->sbh = fibh->ebh; - fi = (struct fileIdentDesc *)(fibh->sbh->b_data); - } else { - fi = (struct fileIdentDesc *) - (fibh->sbh->b_data + sb->s_blocksize + - fibh->soffset); - } + ret = udf_fiiter_append_blk(iter); + if (ret) { + udf_fiiter_release(iter); + return ret; } - memset(cfi, 0, sizeof(struct fileIdentDesc)); - if (UDF_SB(sb)->s_udfrev >= 0x0200) - udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block, + /* Entry will be completely in the new block? Update tag location... 
*/ + if (!(iter->pos & (blksize - 1))) + block = iter->eloc.logicalBlockNum + + ((iter->elen - 1) >> dir->i_blkbits); +store_fi: + memset(&iter->fi, 0, sizeof(struct fileIdentDesc)); + if (UDF_SB(dir->i_sb)->s_udfrev >= 0x0200) + udf_new_tag((char *)(&iter->fi), TAG_IDENT_FID, 3, 1, block, sizeof(struct tag)); else - udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block, + udf_new_tag((char *)(&iter->fi), TAG_IDENT_FID, 2, 1, block, sizeof(struct tag)); - cfi->fileVersionNum = cpu_to_le16(1); - cfi->lengthFileIdent = namelen; - cfi->lengthOfImpUse = cpu_to_le16(0); - if (!udf_write_fi(dir, cfi, fi, fibh, NULL, name)) { - dir->i_size += nfidlen; - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - dinfo->i_lenAlloc += nfidlen; - else { - /* Find the last extent and truncate it to proper size */ - while (udf_next_aext(dir, &epos, &eloc, &elen, 1) == - (EXT_RECORDED_ALLOCATED >> 30)) - ; - elen -= dinfo->i_lenExtents - dir->i_size; - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - udf_write_aext(dir, &epos, &eloc, elen, 1); - dinfo->i_lenExtents = dir->i_size; - } - - mark_inode_dirty(dir); - goto out_ok; + iter->fi.fileVersionNum = cpu_to_le16(1); + iter->fi.lengthFileIdent = namelen; + iter->fi.lengthOfImpUse = cpu_to_le16(0); + memcpy(iter->namebuf, name, namelen); + iter->name = iter->namebuf; + + dir->i_size += nfidlen; + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + dinfo->i_lenAlloc += nfidlen; } else { - *err = -EIO; - goto out_err; + /* Truncate last extent to proper size */ + udf_fiiter_update_elen(iter, iter->elen - + (dinfo->i_lenExtents - dir->i_size)); } + mark_inode_dirty(dir); -out_err: - fi = NULL; - if (fibh->sbh != fibh->ebh) - brelse(fibh->ebh); - brelse(fibh->sbh); -out_ok: - brelse(epos.bh); - kfree(name); - return fi; + return 0; } -static int udf_delete_entry(struct inode *inode, struct 
fileIdentDesc *fi, - struct udf_fileident_bh *fibh, - struct fileIdentDesc *cfi) +static void udf_fiiter_delete_entry(struct udf_fileident_iter *iter) { - cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED; + iter->fi.fileCharacteristics |= FID_FILE_CHAR_DELETED; + + if (UDF_QUERY_FLAG(iter->dir->i_sb, UDF_FLAG_STRICT)) + memset(&iter->fi.icb, 0x00, sizeof(struct long_ad)); + + udf_fiiter_write_fi(iter, NULL); +} - if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) - memset(&(cfi->icb), 0x00, sizeof(struct long_ad)); +static void udf_add_fid_counter(struct super_block *sb, bool dir, int val) +{ + struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); - return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); + if (!lvidiu) + return; + mutex_lock(&UDF_SB(sb)->s_alloc_mutex); + if (dir) + le32_add_cpu(&lvidiu->numDirs, val); + else + le32_add_cpu(&lvidiu->numFiles, val); + udf_updated_lvid(sb); + mutex_unlock(&UDF_SB(sb)->s_alloc_mutex); } static int udf_add_nondir(struct dentry *dentry, struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); struct inode *dir = d_inode(dentry->d_parent); - struct udf_fileident_bh fibh; - struct fileIdentDesc cfi, *fi; + struct udf_fileident_iter iter; int err; - fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (unlikely(!fi)) { + err = udf_fiiter_add_entry(dir, dentry, &iter); + if (err) { inode_dec_link_count(inode); discard_new_inode(inode); return err; } - cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); - *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + iter.fi.icb.extLocation = cpu_to_lelb(iinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); - udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + udf_fiiter_write_fi(&iter, NULL); dir->i_ctime = dir->i_mtime = 
current_time(dir); mark_inode_dirty(dir); - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); + udf_fiiter_release(&iter); + udf_add_fid_counter(dir->i_sb, false, 1); d_instantiate_new(dentry, inode); return 0; } -static int udf_create(struct user_namespace *mnt_userns, struct inode *dir, +static int udf_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode = udf_new_inode(dir, mode); @@ -614,10 +386,7 @@ static int udf_create(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) return PTR_ERR(inode); - if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - inode->i_data.a_ops = &udf_adinicb_aops; - else - inode->i_data.a_ops = &udf_aops; + inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; mark_inode_dirty(inode); @@ -625,7 +394,7 @@ static int udf_create(struct user_namespace *mnt_userns, struct inode *dir, return udf_add_nondir(dentry, inode); } -static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int udf_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode = udf_new_inode(dir, mode); @@ -633,10 +402,7 @@ static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) return PTR_ERR(inode); - if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - inode->i_data.a_ops = &udf_adinicb_aops; - else - inode->i_data.a_ops = &udf_aops; + inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; mark_inode_dirty(inode); @@ -645,7 +411,7 @@ static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return finish_open_simple(file, 0); } -static int udf_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int udf_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, 
dev_t rdev) { struct inode *inode; @@ -661,12 +427,11 @@ static int udf_mknod(struct user_namespace *mnt_userns, struct inode *dir, return udf_add_nondir(dentry, inode); } -static int udf_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - struct udf_fileident_bh fibh; - struct fileIdentDesc cfi, *fi; + struct udf_fileident_iter iter; int err; struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; @@ -678,183 +443,113 @@ static int udf_mkdir(struct user_namespace *mnt_userns, struct inode *dir, iinfo = UDF_I(inode); inode->i_op = &udf_dir_inode_operations; inode->i_fop = &udf_dir_operations; - fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); - if (!fi) { - inode_dec_link_count(inode); + err = udf_fiiter_add_entry(inode, NULL, &iter); + if (err) { + clear_nlink(inode); discard_new_inode(inode); - goto out; + return err; } set_nlink(inode, 2); - cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location); - *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + iter.fi.icb.extLocation = cpu_to_lelb(dinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(dinfo->i_unique & 0x00000000FFFFFFFFUL); - cfi.fileCharacteristics = + iter.fi.fileCharacteristics = FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT; - udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL); - brelse(fibh.sbh); + udf_fiiter_write_fi(&iter, NULL); + udf_fiiter_release(&iter); mark_inode_dirty(inode); - fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (!fi) { + err = udf_fiiter_add_entry(dir, dentry, &iter); + if (err) { clear_nlink(inode); - mark_inode_dirty(inode); discard_new_inode(inode); - goto out; + return err; } - cfi.icb.extLength = 
cpu_to_le32(inode->i_sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); - *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + iter.fi.icb.extLocation = cpu_to_lelb(iinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); - cfi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY; - udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + iter.fi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY; + udf_fiiter_write_fi(&iter, NULL); + udf_fiiter_release(&iter); + udf_add_fid_counter(dir->i_sb, true, 1); inc_nlink(dir); dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); d_instantiate_new(dentry, inode); - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - err = 0; -out: - return err; + return 0; } static int empty_dir(struct inode *dir) { - struct fileIdentDesc *fi, cfi; - struct udf_fileident_bh fibh; - loff_t f_pos; - loff_t size = udf_ext0_offset(dir) + dir->i_size; - udf_pblk_t block; - struct kernel_lb_addr eloc; - uint32_t elen; - sector_t offset; - struct extent_position epos = {}; - struct udf_inode_info *dinfo = UDF_I(dir); - - f_pos = udf_ext0_offset(dir); - fibh.soffset = fibh.eoffset = f_pos & (dir->i_sb->s_blocksize - 1); - - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - fibh.sbh = fibh.ebh = NULL; - else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, - &epos, &eloc, &elen, &offset) == - (EXT_RECORDED_ALLOCATED >> 30)) { - block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); - if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - } else - offset = 0; - - fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block); - if (!fibh.sbh) { - brelse(epos.bh); + 
struct udf_fileident_iter iter; + int ret; + + for (ret = udf_fiiter_init(&iter, dir, 0); + !ret && iter.pos < dir->i_size; + ret = udf_fiiter_advance(&iter)) { + if (iter.fi.lengthFileIdent && + !(iter.fi.fileCharacteristics & FID_FILE_CHAR_DELETED)) { + udf_fiiter_release(&iter); return 0; } - } else { - brelse(epos.bh); - return 0; } - - while (f_pos < size) { - fi = udf_fileident_read(dir, &f_pos, &fibh, &cfi, &epos, &eloc, - &elen, &offset); - if (!fi) { - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - brelse(epos.bh); - return 0; - } - - if (cfi.lengthFileIdent && - (cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) == 0) { - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - brelse(epos.bh); - return 0; - } - } - - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - brelse(epos.bh); + udf_fiiter_release(&iter); return 1; } static int udf_rmdir(struct inode *dir, struct dentry *dentry) { - int retval; + int ret; struct inode *inode = d_inode(dentry); - struct udf_fileident_bh fibh; - struct fileIdentDesc *fi, cfi; + struct udf_fileident_iter iter; struct kernel_lb_addr tloc; - retval = -ENOENT; - fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); - if (IS_ERR_OR_NULL(fi)) { - if (fi) - retval = PTR_ERR(fi); + ret = udf_fiiter_find_entry(dir, &dentry->d_name, &iter); + if (ret) goto out; - } - retval = -EIO; - tloc = lelb_to_cpu(cfi.icb.extLocation); + ret = -EFSCORRUPTED; + tloc = lelb_to_cpu(iter.fi.icb.extLocation); if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) goto end_rmdir; - retval = -ENOTEMPTY; + ret = -ENOTEMPTY; if (!empty_dir(inode)) goto end_rmdir; - retval = udf_delete_entry(dir, fi, &fibh, &cfi); - if (retval) - goto end_rmdir; + udf_fiiter_delete_entry(&iter); if (inode->i_nlink != 2) udf_warn(inode->i_sb, "empty directory has nlink != 2 (%u)\n", inode->i_nlink); clear_nlink(inode); inode->i_size = 0; inode_dec_link_count(dir); + udf_add_fid_counter(dir->i_sb, true, -1); 
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); mark_inode_dirty(dir); - + ret = 0; end_rmdir: - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - + udf_fiiter_release(&iter); out: - return retval; + return ret; } static int udf_unlink(struct inode *dir, struct dentry *dentry) { - int retval; + int ret; struct inode *inode = d_inode(dentry); - struct udf_fileident_bh fibh; - struct fileIdentDesc *fi; - struct fileIdentDesc cfi; + struct udf_fileident_iter iter; struct kernel_lb_addr tloc; - retval = -ENOENT; - fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); - - if (IS_ERR_OR_NULL(fi)) { - if (fi) - retval = PTR_ERR(fi); + ret = udf_fiiter_find_entry(dir, &dentry->d_name, &iter); + if (ret) goto out; - } - retval = -EIO; - tloc = lelb_to_cpu(cfi.icb.extLocation); + ret = -EFSCORRUPTED; + tloc = lelb_to_cpu(iter.fi.icb.extLocation); if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) goto end_unlink; @@ -863,25 +558,20 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } - retval = udf_delete_entry(dir, fi, &fibh, &cfi); - if (retval) - goto end_unlink; + udf_fiiter_delete_entry(&iter); dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); inode_dec_link_count(inode); + udf_add_fid_counter(dir->i_sb, false, -1); inode->i_ctime = dir->i_ctime; - retval = 0; - + ret = 0; end_unlink: - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - + udf_fiiter_release(&iter); out: - return retval; + return ret; } -static int udf_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int udf_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777); @@ -929,15 +619,20 @@ static int udf_symlink(struct user_namespace *mnt_userns, struct inode *dir, iinfo->i_location.partitionReferenceNum; bsize = sb->s_blocksize; 
iinfo->i_lenExtents = bsize; - udf_add_aext(inode, &epos, &eloc, bsize, 0); + err = udf_add_aext(inode, &epos, &eloc, bsize, 0); brelse(epos.bh); + if (err < 0) { + udf_free_blocks(sb, inode, &eloc, 0, 1); + goto out_no_entry; + } block = udf_get_pblock(sb, block, iinfo->i_location.partitionReferenceNum, 0); - epos.bh = udf_tgetblk(sb, block); + epos.bh = sb_getblk(sb, block); if (unlikely(!epos.bh)) { err = -ENOMEM; + udf_free_blocks(sb, inode, &eloc, 0, 1); goto out_no_entry; } lock_buffer(epos.bh); @@ -1038,28 +733,23 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); - struct udf_fileident_bh fibh; - struct fileIdentDesc cfi, *fi; + struct udf_fileident_iter iter; int err; - fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (!fi) { + err = udf_fiiter_add_entry(dir, dentry, &iter); + if (err) return err; - } - cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); + iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + iter.fi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); if (UDF_SB(inode->i_sb)->s_lvid_bh) { - *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(lvid_get_unique_id(inode->i_sb)); } - udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); - if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - mark_inode_dirty(dir); + udf_fiiter_write_fi(&iter, NULL); + udf_fiiter_release(&iter); - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); inc_nlink(inode); + udf_add_fid_counter(dir->i_sb, false, 1); inode->i_ctime = current_time(inode); mark_inode_dirty(inode); dir->i_ctime = dir->i_mtime = current_time(dir); @@ -1073,84 +763,81 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, /* Anybody can rename anything with this: the permission checks are left to 
the * higher-level routines. */ -static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct udf_fileident_bh ofibh, nfibh; - struct fileIdentDesc *ofi = NULL, *nfi = NULL, *dir_fi = NULL; - struct fileIdentDesc ocfi, ncfi; - struct buffer_head *dir_bh = NULL; - int retval = -ENOENT; + struct udf_fileident_iter oiter, niter, diriter; + bool has_diriter = false; + int retval; struct kernel_lb_addr tloc; - struct udf_inode_info *old_iinfo = UDF_I(old_inode); if (flags & ~RENAME_NOREPLACE) return -EINVAL; - ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); - if (!ofi || IS_ERR(ofi)) { - if (IS_ERR(ofi)) - retval = PTR_ERR(ofi); - goto end_rename; - } - - if (ofibh.sbh != ofibh.ebh) - brelse(ofibh.ebh); - - brelse(ofibh.sbh); - tloc = lelb_to_cpu(ocfi.icb.extLocation); - if (udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino) - goto end_rename; + retval = udf_fiiter_find_entry(old_dir, &old_dentry->d_name, &oiter); + if (retval) + return retval; - nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi); - if (IS_ERR(nfi)) { - retval = PTR_ERR(nfi); - goto end_rename; - } - if (nfi && !new_inode) { - if (nfibh.sbh != nfibh.ebh) - brelse(nfibh.ebh); - brelse(nfibh.sbh); - nfi = NULL; + tloc = lelb_to_cpu(oiter.fi.icb.extLocation); + if (udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino) { + retval = -ENOENT; + goto out_oiter; } - if (S_ISDIR(old_inode->i_mode)) { - int offset = udf_ext0_offset(old_inode); + if (S_ISDIR(old_inode->i_mode)) { if (new_inode) { retval = -ENOTEMPTY; if (!empty_dir(new_inode)) - goto end_rename; + goto out_oiter; + } + /* + * We need to protect against old_inode getting converted from + * ICB to normal 
directory. + */ + inode_lock_nested(old_inode, I_MUTEX_NONDIR2); + retval = udf_fiiter_find_entry(old_inode, &dotdot_name, + &diriter); + if (retval == -ENOENT) { + udf_err(old_inode->i_sb, + "directory (ino %lu) has no '..' entry\n", + old_inode->i_ino); + retval = -EFSCORRUPTED; } - retval = -EIO; - if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - dir_fi = udf_get_fileident( - old_iinfo->i_data - - (old_iinfo->i_efe ? - sizeof(struct extendedFileEntry) : - sizeof(struct fileEntry)), - old_inode->i_sb->s_blocksize, &offset); - } else { - dir_bh = udf_bread(old_inode, 0, 0, &retval); - if (!dir_bh) - goto end_rename; - dir_fi = udf_get_fileident(dir_bh->b_data, - old_inode->i_sb->s_blocksize, &offset); + if (retval) { + inode_unlock(old_inode); + goto out_oiter; } - if (!dir_fi) - goto end_rename; - tloc = lelb_to_cpu(dir_fi->icb.extLocation); + has_diriter = true; + tloc = lelb_to_cpu(diriter.fi.icb.extLocation); if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) != - old_dir->i_ino) - goto end_rename; + old_dir->i_ino) { + retval = -EFSCORRUPTED; + udf_err(old_inode->i_sb, + "directory (ino %lu) has parent entry pointing to another inode (%lu != %u)\n", + old_inode->i_ino, old_dir->i_ino, + udf_get_lb_pblock(old_inode->i_sb, &tloc, 0)); + goto out_oiter; + } + } + + retval = udf_fiiter_find_entry(new_dir, &new_dentry->d_name, &niter); + if (retval && retval != -ENOENT) + goto out_oiter; + /* Entry found but not passed by VFS? */ + if (!retval && !new_inode) { + retval = -EFSCORRUPTED; + udf_fiiter_release(&niter); + goto out_oiter; } - if (!nfi) { - nfi = udf_add_entry(new_dir, new_dentry, &nfibh, &ncfi, - &retval); - if (!nfi) - goto end_rename; + /* Entry not found? Need to add one... 
*/ + if (retval) { + udf_fiiter_release(&niter); + retval = udf_fiiter_add_entry(new_dir, new_dentry, &niter); + if (retval) + goto out_oiter; } /* @@ -1163,31 +850,46 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, /* * ok, that's it */ - ncfi.fileVersionNum = ocfi.fileVersionNum; - ncfi.fileCharacteristics = ocfi.fileCharacteristics; - memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(ocfi.icb)); - udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL); + niter.fi.fileVersionNum = oiter.fi.fileVersionNum; + niter.fi.fileCharacteristics = oiter.fi.fileCharacteristics; + memcpy(&(niter.fi.icb), &(oiter.fi.icb), sizeof(oiter.fi.icb)); + udf_fiiter_write_fi(&niter, NULL); + udf_fiiter_release(&niter); - /* The old fid may have moved - find it again */ - ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); - udf_delete_entry(old_dir, ofi, &ofibh, &ocfi); + /* + * The old entry may have moved due to new entry allocation. Find it + * again. + */ + udf_fiiter_release(&oiter); + retval = udf_fiiter_find_entry(old_dir, &old_dentry->d_name, &oiter); + if (retval) { + udf_err(old_dir->i_sb, + "failed to find renamed entry again in directory (ino %lu)\n", + old_dir->i_ino); + } else { + udf_fiiter_delete_entry(&oiter); + udf_fiiter_release(&oiter); + } if (new_inode) { new_inode->i_ctime = current_time(new_inode); inode_dec_link_count(new_inode); + udf_add_fid_counter(old_dir->i_sb, S_ISDIR(new_inode->i_mode), + -1); } old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir); mark_inode_dirty(old_dir); mark_inode_dirty(new_dir); - if (dir_fi) { - dir_fi->icb.extLocation = cpu_to_lelb(UDF_I(new_dir)->i_location); - udf_update_tag((char *)dir_fi, udf_dir_entry_len(dir_fi)); - if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - mark_inode_dirty(old_inode); - else - mark_buffer_dirty_inode(dir_bh, old_inode); + if (has_diriter) { + diriter.fi.icb.extLocation = + 
cpu_to_lelb(UDF_I(new_dir)->i_location); + udf_update_tag((char *)&diriter.fi, + udf_dir_entry_len(&diriter.fi)); + udf_fiiter_write_fi(&diriter, NULL); + udf_fiiter_release(&diriter); + inode_unlock(old_inode); inode_dec_link_count(old_dir); if (new_inode) @@ -1197,22 +899,13 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, mark_inode_dirty(new_dir); } } - - if (ofi) { - if (ofibh.sbh != ofibh.ebh) - brelse(ofibh.ebh); - brelse(ofibh.sbh); - } - - retval = 0; - -end_rename: - brelse(dir_bh); - if (nfi) { - if (nfibh.sbh != nfibh.ebh) - brelse(nfibh.ebh); - brelse(nfibh.sbh); + return 0; +out_oiter: + if (has_diriter) { + udf_fiiter_release(&diriter); + inode_unlock(old_inode); } + udf_fiiter_release(&oiter); return retval; } @@ -1221,17 +914,15 @@ static struct dentry *udf_get_parent(struct dentry *child) { struct kernel_lb_addr tloc; struct inode *inode = NULL; - struct fileIdentDesc cfi; - struct udf_fileident_bh fibh; - - if (!udf_find_entry(d_inode(child), &dotdot_name, &fibh, &cfi)) - return ERR_PTR(-EACCES); + struct udf_fileident_iter iter; + int err; - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); + err = udf_fiiter_find_entry(d_inode(child), &dotdot_name, &iter); + if (err) + return ERR_PTR(err); - tloc = lelb_to_cpu(cfi.icb.extLocation); + tloc = lelb_to_cpu(iter.fi.icb.extLocation); + udf_fiiter_release(&iter); inode = udf_iget(child->d_sb, &tloc); if (IS_ERR(inode)) return ERR_CAST(inode); diff --git a/fs/udf/partition.c b/fs/udf/partition.c index 4cbf40575965..5bcfe78d5cab 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -54,6 +54,7 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, struct udf_part_map *map; struct udf_virtual_data *vdata; struct udf_inode_info *iinfo = UDF_I(sbi->s_vat_inode); + int err; map = &sbi->s_partmaps[partition]; vdata = &map->s_type_specific.s_virtual; @@ -79,12 +80,10 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t 
block, index = vdata->s_start_offset / sizeof(uint32_t) + block; } - loc = udf_block_map(sbi->s_vat_inode, newblock); - - bh = sb_bread(sb, loc); + bh = udf_bread(sbi->s_vat_inode, newblock, 0, &err); if (!bh) { - udf_debug("get_pblock(UDF_VIRTUAL_MAP:%p,%u,%u) VAT: %u[%u]\n", - sb, block, partition, loc, index); + udf_debug("get_pblock(UDF_VIRTUAL_MAP:%p,%u,%u)\n", + sb, block, partition); return 0xFFFFFFFF; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 06eda8177b5f..6304e3c5c3d9 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -86,6 +86,13 @@ enum { #define UDF_MAX_LVID_NESTING 1000 enum { UDF_MAX_LINKS = 0xffff }; +/* + * We limit filesize to 4TB. This is arbitrary as the on-disk format supports + * more but because the file space is described by a linked list of extents, + * each of which can have at most 1GB, the creation and handling of extents + * gets unusably slow beyond certain point... + */ +#define UDF_MAX_FILESIZE (1ULL << 42) /* These are the "meat" - everything else is stuffing */ static int udf_fill_super(struct super_block *, void *, int); @@ -147,6 +154,7 @@ static struct inode *udf_alloc_inode(struct super_block *sb) ei->i_next_alloc_goal = 0; ei->i_strat4096 = 0; ei->i_streamdir = 0; + ei->i_hidden = 0; init_rwsem(&ei->i_data_sem); ei->cached_extent.lstart = -1; spin_lock_init(&ei->i_extent_cache_lock); @@ -733,7 +741,7 @@ static int udf_check_vsd(struct super_block *sb) * added */ for (; !nsr && sector < VSD_MAX_SECTOR_OFFSET; sector += sectorsize) { /* Read a block */ - bh = udf_tread(sb, sector >> sb->s_blocksize_bits); + bh = sb_bread(sb, sector >> sb->s_blocksize_bits); if (!bh) break; @@ -1175,7 +1183,6 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) struct udf_part_map *map = &sbi->s_partmaps[p_index]; struct buffer_head *bh = NULL; struct udf_inode_info *vati; - uint32_t pos; struct virtualAllocationTable20 *vat20; sector_t blocks = sb_bdev_nr_blocks(sb); @@ -1197,10 +1204,14 @@ static int 
udf_load_vat(struct super_block *sb, int p_index, int type1_index) } else if (map->s_partition_type == UDF_VIRTUAL_MAP20) { vati = UDF_I(sbi->s_vat_inode); if (vati->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - pos = udf_block_map(sbi->s_vat_inode, 0); - bh = sb_bread(sb, pos); - if (!bh) - return -EIO; + int err = 0; + + bh = udf_bread(sbi->s_vat_inode, 0, 0, &err); + if (!bh) { + if (!err) + err = -EFSCORRUPTED; + return err; + } vat20 = (struct virtualAllocationTable20 *)bh->b_data; } else { vat20 = (struct virtualAllocationTable20 *) @@ -1838,10 +1849,6 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block, uint16_t ident; int ret; - if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && - udf_fixed_to_variable(block) >= sb_bdev_nr_blocks(sb)) - return -EAGAIN; - bh = udf_read_tagged(sb, block, block, &ident); if (!bh) return -EAGAIN; @@ -1860,10 +1867,10 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block, * Returns < 0 on error, 0 on success. -EAGAIN is special - try next set * of anchors. */ -static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock, +static int udf_scan_anchors(struct super_block *sb, udf_pblk_t *lastblock, struct kernel_lb_addr *fileset) { - sector_t last[6]; + udf_pblk_t last[6]; int i; struct udf_sb_info *sbi = UDF_SB(sb); int last_count = 0; @@ -1924,46 +1931,6 @@ static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock, } /* - * Find an anchor volume descriptor and load Volume Descriptor Sequence from - * area specified by it. The function expects sbi->s_lastblock to be the last - * block on the media. - * - * Return <0 on error, 0 if anchor found. -EAGAIN is special meaning anchor - * was not found. 
- */ -static int udf_find_anchor(struct super_block *sb, - struct kernel_lb_addr *fileset) -{ - struct udf_sb_info *sbi = UDF_SB(sb); - sector_t lastblock = sbi->s_last_block; - int ret; - - ret = udf_scan_anchors(sb, &lastblock, fileset); - if (ret != -EAGAIN) - goto out; - - /* No anchor found? Try VARCONV conversion of block numbers */ - UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); - lastblock = udf_variable_to_fixed(sbi->s_last_block); - /* Firstly, we try to not convert number of the last block */ - ret = udf_scan_anchors(sb, &lastblock, fileset); - if (ret != -EAGAIN) - goto out; - - lastblock = sbi->s_last_block; - /* Secondly, we try with converted number of the last block */ - ret = udf_scan_anchors(sb, &lastblock, fileset); - if (ret < 0) { - /* VARCONV didn't help. Clear it. */ - UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); - } -out: - if (ret == 0) - sbi->s_last_block = lastblock; - return ret; -} - -/* * Check Volume Structure Descriptor, find Anchor block and load Volume * Descriptor Sequence. 
* @@ -2003,7 +1970,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, /* Look for anchor block and load Volume Descriptor Sequence */ sbi->s_anchor = uopt->anchor; - ret = udf_find_anchor(sb, fileset); + ret = udf_scan_anchors(sb, &sbi->s_last_block, fileset); if (ret < 0) { if (!silent && ret == -EAGAIN) udf_warn(sb, "No anchor found\n"); @@ -2297,7 +2264,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) ret = -ENOMEM; goto error_out; } - sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_maxbytes = UDF_MAX_FILESIZE; sb->s_max_links = UDF_MAX_LINKS; return 0; @@ -2454,7 +2421,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, if (bytes) { brelse(bh); newblock = udf_get_lb_pblock(sb, &loc, ++block); - bh = udf_tread(sb, newblock); + bh = sb_bread(sb, newblock); if (!bh) { udf_debug("read failed\n"); goto out; diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index f3642f9c23f8..a34c8c4e6d21 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -107,53 +107,45 @@ static int udf_symlink_filler(struct file *file, struct folio *folio) struct inode *inode = page->mapping->host; struct buffer_head *bh = NULL; unsigned char *symlink; - int err; + int err = 0; unsigned char *p = page_address(page); - struct udf_inode_info *iinfo; - uint32_t pos; + struct udf_inode_info *iinfo = UDF_I(inode); /* We don't support symlinks longer than one block */ if (inode->i_size > inode->i_sb->s_blocksize) { err = -ENAMETOOLONG; - goto out_unmap; + goto out_unlock; } - iinfo = UDF_I(inode); - pos = udf_block_map(inode, 0); - - down_read(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { symlink = iinfo->i_data + iinfo->i_lenEAttr; } else { - bh = sb_bread(inode->i_sb, pos); - + bh = udf_bread(inode, 0, 0, &err); if (!bh) { - err = -EIO; - goto out_unlock_inode; + if (!err) + err = -EFSCORRUPTED; + goto out_err; } - symlink = bh->b_data; } err = udf_pc_to_char(inode->i_sb, symlink, inode->i_size, 
p, PAGE_SIZE); brelse(bh); if (err) - goto out_unlock_inode; + goto out_err; - up_read(&iinfo->i_data_sem); SetPageUptodate(page); unlock_page(page); return 0; -out_unlock_inode: - up_read(&iinfo->i_data_sem); +out_err: SetPageError(page); -out_unmap: +out_unlock: unlock_page(page); return err; } -static int udf_symlink_getattr(struct user_namespace *mnt_userns, +static int udf_symlink_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -161,7 +153,7 @@ static int udf_symlink_getattr(struct user_namespace *mnt_userns, struct inode *inode = d_backing_inode(dentry); struct page *page; - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); page = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(page)) return PTR_ERR(page); diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 036ebd892b85..871856c69df5 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -125,7 +125,7 @@ void udf_discard_prealloc(struct inode *inode) struct kernel_lb_addr eloc; uint32_t elen; uint64_t lbcount = 0; - int8_t etype = -1, netype; + int8_t etype = -1; struct udf_inode_info *iinfo = UDF_I(inode); int bsize = 1 << inode->i_blkbits; @@ -136,7 +136,7 @@ void udf_discard_prealloc(struct inode *inode) epos.block = iinfo->i_location; /* Find the last extent in the file */ - while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 0)) != -1) { + while (udf_next_aext(inode, &epos, &eloc, &elen, 0) != -1) { brelse(prev_epos.bh); prev_epos = epos; if (prev_epos.bh) @@ -240,7 +240,7 @@ int udf_truncate_extents(struct inode *inode) brelse(epos.bh); epos.offset = sizeof(struct allocExtDesc); epos.block = eloc; - epos.bh = udf_tread(sb, + epos.bh = sb_bread(sb, udf_get_lb_pblock(sb, &eloc, 0)); /* Error reading indirect block? 
*/ if (!epos.bh) diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index 06ff7006b822..312b7c9ef10e 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -44,7 +44,8 @@ struct udf_inode_info { unsigned i_use : 1; /* unallocSpaceEntry */ unsigned i_strat4096 : 1; unsigned i_streamdir : 1; - unsigned reserved : 25; + unsigned i_hidden : 1; /* hidden system inode */ + unsigned reserved : 24; __u8 *i_data; struct kernel_lb_addr i_locStreamdir; __u64 i_lenStreams; diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 291b56dd011e..9af6ff7f9747 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -23,7 +23,6 @@ #define UDF_FLAG_STRICT 5 #define UDF_FLAG_UNDELETE 6 #define UDF_FLAG_UNHIDE 7 -#define UDF_FLAG_VARCONV 8 #define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */ #define UDF_FLAG_GID_FORGET 12 #define UDF_FLAG_UID_SET 13 @@ -55,6 +54,8 @@ #define MF_DUPLICATE_MD 0x01 #define MF_MIRROR_FE_LOADED 0x02 +#define EFSCORRUPTED EUCLEAN + struct udf_meta_data { __u32 s_meta_file_loc; __u32 s_mirror_file_loc; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 7e258f15b8ef..88692512a466 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -34,9 +34,6 @@ extern __printf(3, 4) void _udf_warn(struct super_block *sb, #define udf_debug(fmt, ...) 
\ pr_debug("%s:%d:%s: " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__) -#define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) ) -#define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) ) - #define UDF_EXTENT_LENGTH_MASK 0x3FFFFFFF #define UDF_EXTENT_FLAG_MASK 0xC0000000 @@ -83,14 +80,24 @@ extern const struct inode_operations udf_file_inode_operations; extern const struct file_operations udf_file_operations; extern const struct inode_operations udf_symlink_inode_operations; extern const struct address_space_operations udf_aops; -extern const struct address_space_operations udf_adinicb_aops; extern const struct address_space_operations udf_symlink_aops; -struct udf_fileident_bh { - struct buffer_head *sbh; - struct buffer_head *ebh; - int soffset; - int eoffset; +struct udf_fileident_iter { + struct inode *dir; /* Directory we are working with */ + loff_t pos; /* Logical position in a dir */ + struct buffer_head *bh[2]; /* Buffer containing 'pos' and possibly + * next buffer if entry straddles + * blocks */ + struct kernel_lb_addr eloc; /* Start of extent containing 'pos' */ + uint32_t elen; /* Length of extent containing 'pos' */ + sector_t loffset; /* Block offset of 'pos' within above + * extent */ + struct extent_position epos; /* Position after the above extent */ + struct fileIdentDesc fi; /* Copied directory entry */ + uint8_t *name; /* Pointer to entry name */ + uint8_t *namebuf; /* Storage for entry name in case + * the name is split between two blocks + */ }; struct udf_vds_record { @@ -121,22 +128,16 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb, u32 meta_file_loc, u32 partition_num); /* namei.c */ -extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, - struct fileIdentDesc *, struct udf_fileident_bh *, - uint8_t *, uint8_t *); static inline unsigned int udf_dir_entry_len(struct fileIdentDesc *cfi) { return ALIGN(sizeof(struct fileIdentDesc) + 
le16_to_cpu(cfi->lengthOfImpUse) + cfi->lengthFileIdent, UDF_NAME_PAD); } -static inline uint8_t *udf_get_fi_ident(struct fileIdentDesc *fi) -{ - return ((uint8_t *)(fi + 1)) + le16_to_cpu(fi->lengthOfImpUse); -} /* file.c */ extern long udf_ioctl(struct file *, unsigned int, unsigned long); + /* inode.c */ extern struct inode *__udf_iget(struct super_block *, struct kernel_lb_addr *, bool hidden_inode); @@ -151,16 +152,14 @@ static inline struct inode *udf_iget(struct super_block *sb, return __udf_iget(sb, ino, false); } extern int udf_expand_file_adinicb(struct inode *); -extern struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, - udf_pblk_t *block, int *err); extern struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, int create, int *err); extern int udf_setsize(struct inode *, loff_t); extern void udf_evict_inode(struct inode *); extern int udf_write_inode(struct inode *, struct writeback_control *wbc); -extern udf_pblk_t udf_block_map(struct inode *inode, sector_t block); extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, struct kernel_lb_addr *, uint32_t *, sector_t *); +int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); extern int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block, struct extent_position *epos); extern int __udf_add_aext(struct inode *inode, struct extent_position *epos, @@ -177,9 +176,6 @@ extern int8_t udf_current_aext(struct inode *, struct extent_position *, extern void udf_update_extra_perms(struct inode *inode, umode_t mode); /* misc.c */ -extern struct buffer_head *udf_tgetblk(struct super_block *sb, - udf_pblk_t block); -extern struct buffer_head *udf_tread(struct super_block *sb, udf_pblk_t block); extern struct genericFormat *udf_add_extendedattr(struct inode *, uint32_t, uint32_t, uint8_t); extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t, @@ -194,7 +190,7 @@ extern void udf_new_tag(char *, uint16_t, uint16_t, 
uint16_t, uint32_t, int); /* lowlevel.c */ extern unsigned int udf_get_last_session(struct super_block *); -extern unsigned long udf_get_last_block(struct super_block *); +udf_pblk_t udf_get_last_block(struct super_block *); /* partition.c */ extern uint32_t udf_get_pblock(struct super_block *, uint32_t, uint16_t, @@ -243,14 +239,13 @@ extern udf_pblk_t udf_new_block(struct super_block *sb, struct inode *inode, uint16_t partition, uint32_t goal, int *err); /* directory.c */ -extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, - struct udf_fileident_bh *, - struct fileIdentDesc *, - struct extent_position *, - struct kernel_lb_addr *, uint32_t *, - sector_t *); -extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, - int *offset); +int udf_fiiter_init(struct udf_fileident_iter *iter, struct inode *dir, + loff_t pos); +int udf_fiiter_advance(struct udf_fileident_iter *iter); +void udf_fiiter_release(struct udf_fileident_iter *iter); +void udf_fiiter_write_fi(struct udf_fileident_iter *iter, uint8_t *impuse); +void udf_fiiter_update_elen(struct udf_fileident_iter *iter, uint32_t new_elen); +int udf_fiiter_append_blk(struct udf_fileident_iter *iter); extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 7e3e08c0166f..06bd84d555bd 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -289,7 +289,7 @@ cg_found: ufs_mark_sb_dirty(sb); inode->i_ino = cg * uspi->s_ipg + bit; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_blocks = 0; inode->i_generation = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index a873de7dec1c..a4246c83a8cd 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1212,14 +1212,14 @@ out: return err; } -int 
ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ufs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); unsigned int ia_valid = attr->ia_valid; int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -1229,7 +1229,7 @@ int ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return error; } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 29d5a0e0c8f0..36154b5aca6d 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -69,7 +69,7 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsi * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ufs_create (struct user_namespace * mnt_userns, +static int ufs_create (struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { @@ -86,7 +86,7 @@ static int ufs_create (struct user_namespace * mnt_userns, return ufs_add_nondir(dentry, inode); } -static int ufs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ufs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -106,7 +106,7 @@ static int ufs_mknod(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int ufs_symlink (struct user_namespace * mnt_userns, struct inode * dir, +static int ufs_symlink (struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, const char * symname) { struct super_block * sb = dir->i_sb; @@ -166,7 +166,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return error; } -static int ufs_mkdir(struct user_namespace * mnt_userns, struct inode * dir, +static int 
ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, umode_t mode) { struct inode * inode; @@ -243,7 +243,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) return err; } -static int ufs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 550f7c5a3636..6b499180643b 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -123,7 +123,7 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long); extern int ufs_write_inode (struct inode *, struct writeback_control *); extern int ufs_sync_inode (struct inode *); extern void ufs_evict_inode (struct inode *); -extern int ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int ufs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); /* namei.c */ diff --git a/fs/utimes.c b/fs/utimes.c index 39f356017635..3701b3946f88 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -7,6 +7,7 @@ #include <linux/uaccess.h> #include <linux/compat.h> #include <asm/unistd.h> +#include <linux/filelock.h> static bool nsec_valid(long nsec) { @@ -62,7 +63,7 @@ int vfs_utimes(const struct path *path, struct timespec64 *times) } retry_deleg: inode_lock(inode); - error = notify_change(mnt_user_ns(path->mnt), path->dentry, &newattrs, + error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (delegated_inode) { diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index c4769a9396c5..075f15c43c78 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -294,14 +294,14 @@ out: return err; } -static int vboxsf_dir_mkfile(struct user_namespace *mnt_userns, +static int vboxsf_dir_mkfile(struct mnt_idmap *idmap, struct inode *parent, struct dentry *dentry, umode_t mode, bool excl) { return 
vboxsf_dir_create(parent, dentry, mode, false, excl, NULL); } -static int vboxsf_dir_mkdir(struct user_namespace *mnt_userns, +static int vboxsf_dir_mkdir(struct mnt_idmap *idmap, struct inode *parent, struct dentry *dentry, umode_t mode) { @@ -387,7 +387,7 @@ static int vboxsf_dir_unlink(struct inode *parent, struct dentry *dentry) return 0; } -static int vboxsf_dir_rename(struct user_namespace *mnt_userns, +static int vboxsf_dir_rename(struct mnt_idmap *idmap, struct inode *old_parent, struct dentry *old_dentry, struct inode *new_parent, @@ -430,7 +430,7 @@ err_put_old_path: return err; } -static int vboxsf_dir_symlink(struct user_namespace *mnt_userns, +static int vboxsf_dir_symlink(struct mnt_idmap *idmap, struct inode *parent, struct dentry *dentry, const char *symname) { diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index e1db0f3f7e5e..dd0ae1188e87 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -231,7 +231,7 @@ int vboxsf_inode_revalidate(struct dentry *dentry) return 0; } -int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path, +int vboxsf_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *kstat, u32 request_mask, unsigned int flags) { int err; @@ -252,11 +252,11 @@ int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path, if (err) return err; - generic_fillattr(&init_user_ns, d_inode(dentry), kstat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), kstat); return 0; } -int vboxsf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int vboxsf_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct vboxsf_inode *sf_i = VBOXSF_I(d_inode(dentry)); diff --git a/fs/vboxsf/vfsmod.h b/fs/vboxsf/vfsmod.h index 9047befa66c5..05973eb89d52 100644 --- a/fs/vboxsf/vfsmod.h +++ b/fs/vboxsf/vfsmod.h @@ -97,10 +97,10 @@ int vboxsf_stat(struct vboxsf_sbi *sbi, struct shfl_string *path, struct shfl_fsobjinfo *info); int vboxsf_stat_dentry(struct 
dentry *dentry, struct shfl_fsobjinfo *info); int vboxsf_inode_revalidate(struct dentry *dentry); -int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path, +int vboxsf_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *kstat, u32 request_mask, unsigned int query_flags); -int vboxsf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int vboxsf_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); struct shfl_string *vboxsf_path_from_dentry(struct vboxsf_sbi *sbi, struct dentry *dentry); diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig index aad1f1d998b9..a7ffd718f171 100644 --- a/fs/verity/Kconfig +++ b/fs/verity/Kconfig @@ -34,14 +34,6 @@ config FS_VERITY If unsure, say N. -config FS_VERITY_DEBUG - bool "FS Verity debugging" - depends on FS_VERITY - help - Enable debugging messages related to fs-verity by default. - - Say N unless you are an fs-verity developer. - config FS_VERITY_BUILTIN_SIGNATURES bool "FS Verity builtin signature support" depends on FS_VERITY diff --git a/fs/verity/enable.c b/fs/verity/enable.c index df6b499bf6a1..e13db6507b38 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -7,136 +7,50 @@ #include "fsverity_private.h" -#include <crypto/hash.h> -#include <linux/backing-dev.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> -/* - * Read a file data page for Merkle tree construction. Do aggressive readahead, - * since we're sequentially reading the entire file. 
- */ -static struct page *read_file_data_page(struct file *file, pgoff_t index, - struct file_ra_state *ra, - unsigned long remaining_pages) -{ - DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, index); - struct folio *folio; - - folio = __filemap_get_folio(ractl.mapping, index, FGP_ACCESSED, 0); - if (!folio || !folio_test_uptodate(folio)) { - if (folio) - folio_put(folio); - else - page_cache_sync_ra(&ractl, remaining_pages); - folio = read_cache_folio(ractl.mapping, index, NULL, file); - if (IS_ERR(folio)) - return &folio->page; - } - if (folio_test_readahead(folio)) - page_cache_async_ra(&ractl, folio, remaining_pages); - return folio_file_page(folio, index); -} +struct block_buffer { + u32 filled; + u8 *data; +}; -static int build_merkle_tree_level(struct file *filp, unsigned int level, - u64 num_blocks_to_hash, - const struct merkle_tree_params *params, - u8 *pending_hashes, - struct ahash_request *req) +/* Hash a block, writing the result to the next level's pending block buffer. */ +static int hash_one_block(struct inode *inode, + const struct merkle_tree_params *params, + struct ahash_request *req, struct block_buffer *cur) { - struct inode *inode = file_inode(filp); - const struct fsverity_operations *vops = inode->i_sb->s_vop; - struct file_ra_state ra = { 0 }; - unsigned int pending_size = 0; - u64 dst_block_num; - u64 i; + struct block_buffer *next = cur + 1; int err; - if (WARN_ON(params->block_size != PAGE_SIZE)) /* checked earlier too */ - return -EINVAL; - - if (level < params->num_levels) { - dst_block_num = params->level_start[level]; - } else { - if (WARN_ON(num_blocks_to_hash != 1)) - return -EINVAL; - dst_block_num = 0; /* unused */ - } + /* Zero-pad the block if it's shorter than the block size. 
*/ + memset(&cur->data[cur->filled], 0, params->block_size - cur->filled); - file_ra_state_init(&ra, filp->f_mapping); - - for (i = 0; i < num_blocks_to_hash; i++) { - struct page *src_page; - - if ((pgoff_t)i % 10000 == 0 || i + 1 == num_blocks_to_hash) - pr_debug("Hashing block %llu of %llu for level %u\n", - i + 1, num_blocks_to_hash, level); - - if (level == 0) { - /* Leaf: hashing a data block */ - src_page = read_file_data_page(filp, i, &ra, - num_blocks_to_hash - i); - if (IS_ERR(src_page)) { - err = PTR_ERR(src_page); - fsverity_err(inode, - "Error %d reading data page %llu", - err, i); - return err; - } - } else { - unsigned long num_ra_pages = - min_t(unsigned long, num_blocks_to_hash - i, - inode->i_sb->s_bdi->io_pages); - - /* Non-leaf: hashing hash block from level below */ - src_page = vops->read_merkle_tree_page(inode, - params->level_start[level - 1] + i, - num_ra_pages); - if (IS_ERR(src_page)) { - err = PTR_ERR(src_page); - fsverity_err(inode, - "Error %d reading Merkle tree page %llu", - err, params->level_start[level - 1] + i); - return err; - } - } + err = fsverity_hash_block(params, inode, req, virt_to_page(cur->data), + offset_in_page(cur->data), + &next->data[next->filled]); + if (err) + return err; + next->filled += params->digest_size; + cur->filled = 0; + return 0; +} - err = fsverity_hash_page(params, inode, req, src_page, - &pending_hashes[pending_size]); - put_page(src_page); - if (err) - return err; - pending_size += params->digest_size; - - if (level == params->num_levels) /* Root hash? 
*/ - return 0; - - if (pending_size + params->digest_size > params->block_size || - i + 1 == num_blocks_to_hash) { - /* Flush the pending hash block */ - memset(&pending_hashes[pending_size], 0, - params->block_size - pending_size); - err = vops->write_merkle_tree_block(inode, - pending_hashes, - dst_block_num, - params->log_blocksize); - if (err) { - fsverity_err(inode, - "Error %d writing Merkle tree block %llu", - err, dst_block_num); - return err; - } - dst_block_num++; - pending_size = 0; - } +static int write_merkle_tree_block(struct inode *inode, const u8 *buf, + unsigned long index, + const struct merkle_tree_params *params) +{ + u64 pos = (u64)index << params->log_blocksize; + int err; - if (fatal_signal_pending(current)) - return -EINTR; - cond_resched(); - } - return 0; + err = inode->i_sb->s_vop->write_merkle_tree_block(inode, buf, pos, + params->block_size); + if (err) + fsverity_err(inode, "Error %d writing Merkle tree block %lu", + err, index); + return err; } /* @@ -152,13 +66,17 @@ static int build_merkle_tree(struct file *filp, u8 *root_hash) { struct inode *inode = file_inode(filp); - u8 *pending_hashes; + const u64 data_size = inode->i_size; + const int num_levels = params->num_levels; struct ahash_request *req; - u64 blocks; - unsigned int level; - int err = -ENOMEM; + struct block_buffer _buffers[1 + FS_VERITY_MAX_LEVELS + 1] = {}; + struct block_buffer *buffers = &_buffers[1]; + unsigned long level_offset[FS_VERITY_MAX_LEVELS]; + int level; + u64 offset; + int err; - if (inode->i_size == 0) { + if (data_size == 0) { /* Empty file is a special case; root hash is all 0's */ memset(root_hash, 0, params->digest_size); return 0; @@ -167,29 +85,95 @@ static int build_merkle_tree(struct file *filp, /* This allocation never fails, since it's mempool-backed. 
*/ req = fsverity_alloc_hash_request(params->hash_alg, GFP_KERNEL); - pending_hashes = kmalloc(params->block_size, GFP_KERNEL); - if (!pending_hashes) - goto out; - /* - * Build each level of the Merkle tree, starting at the leaf level - * (level 0) and ascending to the root node (level 'num_levels - 1'). - * Then at the end (level 'num_levels'), calculate the root hash. + * Allocate the block buffers. Buffer "-1" is for data blocks. + * Buffers 0 <= level < num_levels are for the actual tree levels. + * Buffer 'num_levels' is for the root hash. */ - blocks = ((u64)inode->i_size + params->block_size - 1) >> - params->log_blocksize; - for (level = 0; level <= params->num_levels; level++) { - err = build_merkle_tree_level(filp, level, blocks, params, - pending_hashes, req); + for (level = -1; level < num_levels; level++) { + buffers[level].data = kzalloc(params->block_size, GFP_KERNEL); + if (!buffers[level].data) { + err = -ENOMEM; + goto out; + } + } + buffers[num_levels].data = root_hash; + + BUILD_BUG_ON(sizeof(level_offset) != sizeof(params->level_start)); + memcpy(level_offset, params->level_start, sizeof(level_offset)); + + /* Hash each data block, also hashing the tree blocks as they fill up */ + for (offset = 0; offset < data_size; offset += params->block_size) { + ssize_t bytes_read; + loff_t pos = offset; + + buffers[-1].filled = min_t(u64, params->block_size, + data_size - offset); + bytes_read = __kernel_read(filp, buffers[-1].data, + buffers[-1].filled, &pos); + if (bytes_read < 0) { + err = bytes_read; + fsverity_err(inode, "Error %d reading file data", err); + goto out; + } + if (bytes_read != buffers[-1].filled) { + err = -EINVAL; + fsverity_err(inode, "Short read of file data"); + goto out; + } + err = hash_one_block(inode, params, req, &buffers[-1]); if (err) goto out; - blocks = (blocks + params->hashes_per_block - 1) >> - params->log_arity; + for (level = 0; level < num_levels; level++) { + if (buffers[level].filled + params->digest_size <= + 
params->block_size) { + /* Next block at @level isn't full yet */ + break; + } + /* Next block at @level is full */ + + err = hash_one_block(inode, params, req, + &buffers[level]); + if (err) + goto out; + err = write_merkle_tree_block(inode, + buffers[level].data, + level_offset[level], + params); + if (err) + goto out; + level_offset[level]++; + } + if (fatal_signal_pending(current)) { + err = -EINTR; + goto out; + } + cond_resched(); + } + /* Finish all nonempty pending tree blocks. */ + for (level = 0; level < num_levels; level++) { + if (buffers[level].filled != 0) { + err = hash_one_block(inode, params, req, + &buffers[level]); + if (err) + goto out; + err = write_merkle_tree_block(inode, + buffers[level].data, + level_offset[level], + params); + if (err) + goto out; + } + } + /* The root hash was filled by the last call to hash_one_block(). */ + if (WARN_ON(buffers[num_levels].filled != params->digest_size)) { + err = -EINVAL; + goto out; } - memcpy(root_hash, pending_hashes, params->digest_size); err = 0; out: - kfree(pending_hashes); + for (level = -1; level < num_levels; level++) + kfree(buffers[level].data); fsverity_free_hash_request(params->hash_alg, req); return err; } @@ -263,15 +247,12 @@ static int enable_verity(struct file *filp, * ->begin_enable_verity() and ->end_enable_verity() using the inode * lock and only allow one process to be here at a time on a given file. */ - pr_debug("Building Merkle tree...\n"); BUILD_BUG_ON(sizeof(desc->root_hash) < FS_VERITY_MAX_DIGEST_SIZE); err = build_merkle_tree(filp, ¶ms, desc->root_hash); if (err) { fsverity_err(inode, "Error %d building Merkle tree", err); goto rollback; } - pr_debug("Done building Merkle tree. Root hash is %s:%*phN\n", - params.hash_alg->name, params.digest_size, desc->root_hash); /* * Create the fsverity_info. 
Don't bother trying to save work by @@ -286,10 +267,6 @@ static int enable_verity(struct file *filp, goto rollback; } - if (arg->sig_size) - pr_debug("Storing a %u-byte PKCS#7 signature alongside the file\n", - arg->sig_size); - /* * Tell the filesystem to finish enabling verity on the file. * Serialized with ->begin_enable_verity() by the inode lock. @@ -352,7 +329,7 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) memchr_inv(arg.__reserved2, 0, sizeof(arg.__reserved2))) return -EINVAL; - if (arg.block_size != PAGE_SIZE) + if (!is_power_of_2(arg.block_size)) return -EINVAL; if (arg.salt_size > sizeof_field(struct fsverity_descriptor, salt)) diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index c7fcb855e068..d34dcc033d72 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -8,10 +8,6 @@ #ifndef _FSVERITY_PRIVATE_H #define _FSVERITY_PRIVATE_H -#ifdef CONFIG_FS_VERITY_DEBUG -#define DEBUG -#endif - #define pr_fmt(fmt) "fs-verity: " fmt #include <linux/fsverity.h> @@ -46,17 +42,20 @@ struct merkle_tree_params { unsigned int digest_size; /* same as hash_alg->digest_size */ unsigned int block_size; /* size of data and tree blocks */ unsigned int hashes_per_block; /* number of hashes per tree block */ - unsigned int log_blocksize; /* log2(block_size) */ - unsigned int log_arity; /* log2(hashes_per_block) */ + unsigned int blocks_per_page; /* PAGE_SIZE / block_size */ + u8 log_digestsize; /* log2(digest_size) */ + u8 log_blocksize; /* log2(block_size) */ + u8 log_arity; /* log2(hashes_per_block) */ + u8 log_blocks_per_page; /* log2(blocks_per_page) */ unsigned int num_levels; /* number of levels in Merkle tree */ u64 tree_size; /* Merkle tree size in bytes */ - unsigned long level0_blocks; /* number of blocks in tree level 0 */ + unsigned long tree_pages; /* Merkle tree size in pages */ /* * Starting block index for each tree level, ordered from leaf level (0) * to root level ('num_levels - 1') */ 
- u64 level_start[FS_VERITY_MAX_LEVELS]; + unsigned long level_start[FS_VERITY_MAX_LEVELS]; }; /* @@ -73,9 +72,10 @@ struct fsverity_info { u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE]; u8 file_digest[FS_VERITY_MAX_DIGEST_SIZE]; const struct inode *inode; + unsigned long *hash_block_verified; + spinlock_t hash_page_init_lock; }; - #define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \ sizeof(struct fsverity_descriptor)) @@ -91,9 +91,9 @@ void fsverity_free_hash_request(struct fsverity_hash_alg *alg, struct ahash_request *req); const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg, const u8 *salt, size_t salt_size); -int fsverity_hash_page(const struct merkle_tree_params *params, - const struct inode *inode, - struct ahash_request *req, struct page *page, u8 *out); +int fsverity_hash_block(const struct merkle_tree_params *params, + const struct inode *inode, struct ahash_request *req, + struct page *page, unsigned int offset, u8 *out); int fsverity_hash_buffer(struct fsverity_hash_alg *alg, const void *data, size_t size, u8 *out); void __init fsverity_check_hash_algs(void); diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index 6f8170cf4ae7..13fcf31be844 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -220,35 +220,33 @@ err_free: } /** - * fsverity_hash_page() - hash a single data or hash page + * fsverity_hash_block() - hash a single data or hash block * @params: the Merkle tree's parameters * @inode: inode for which the hashing is being done * @req: preallocated hash request - * @page: the page to hash + * @page: the page containing the block to hash + * @offset: the offset of the block within @page * @out: output digest, size 'params->digest_size' bytes * - * Hash a single data or hash block, assuming block_size == PAGE_SIZE. - * The hash is salted if a salt is specified in the Merkle tree parameters. + * Hash a single data or hash block. 
The hash is salted if a salt is specified + * in the Merkle tree parameters. * * Return: 0 on success, -errno on failure */ -int fsverity_hash_page(const struct merkle_tree_params *params, - const struct inode *inode, - struct ahash_request *req, struct page *page, u8 *out) +int fsverity_hash_block(const struct merkle_tree_params *params, + const struct inode *inode, struct ahash_request *req, + struct page *page, unsigned int offset, u8 *out) { struct scatterlist sg; DECLARE_CRYPTO_WAIT(wait); int err; - if (WARN_ON(params->block_size != PAGE_SIZE)) - return -EINVAL; - sg_init_table(&sg, 1); - sg_set_page(&sg, page, PAGE_SIZE, 0); + sg_set_page(&sg, page, params->block_size, offset); ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &wait); - ahash_request_set_crypt(req, &sg, out, PAGE_SIZE); + ahash_request_set_crypt(req, &sg, out, params->block_size); if (params->hashstate) { err = crypto_ahash_import(req, params->hashstate); @@ -264,7 +262,7 @@ int fsverity_hash_page(const struct merkle_tree_params *params, err = crypto_wait_req(err, &wait); if (err) - fsverity_err(inode, "Error %d computing page hash", err); + fsverity_err(inode, "Error %d computing block hash", err); return err; } diff --git a/fs/verity/init.c b/fs/verity/init.c index c98b7016f446..023905151035 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -49,7 +49,6 @@ static int __init fsverity_init(void) if (err) goto err_exit_workqueue; - pr_debug("Initialized fs-verity\n"); return 0; err_exit_workqueue: diff --git a/fs/verity/open.c b/fs/verity/open.c index 81ff94442f7b..9366b441d01c 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -7,6 +7,7 @@ #include "fsverity_private.h" +#include <linux/mm.h> #include <linux/slab.h> static struct kmem_cache *fsverity_info_cachep; @@ -34,6 +35,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, struct fsverity_hash_alg *hash_alg; int err; u64 blocks; + u64 
blocks_in_level[FS_VERITY_MAX_LEVELS]; u64 offset; int level; @@ -54,7 +56,23 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, goto out_err; } - if (log_blocksize != PAGE_SHIFT) { + /* + * fs/verity/ directly assumes that the Merkle tree block size is a + * power of 2 less than or equal to PAGE_SIZE. Another restriction + * arises from the interaction between fs/verity/ and the filesystems + * themselves: filesystems expect to be able to verify a single + * filesystem block of data at a time. Therefore, the Merkle tree block + * size must also be less than or equal to the filesystem block size. + * + * The above are the only hard limitations, so in theory the Merkle tree + * block size could be as small as twice the digest size. However, + * that's not useful, and it would result in some unusually deep and + * large Merkle trees. So we currently require that the Merkle tree + * block size be at least 1024 bytes. That's small enough to test the + * sub-page block case on systems with 4K pages, but not too small. 
+ */ + if (log_blocksize < 10 || log_blocksize > PAGE_SHIFT || + log_blocksize > inode->i_blkbits) { fsverity_warn(inode, "Unsupported log_blocksize: %u", log_blocksize); err = -EINVAL; @@ -62,6 +80,8 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, } params->log_blocksize = log_blocksize; params->block_size = 1 << log_blocksize; + params->log_blocks_per_page = PAGE_SHIFT - log_blocksize; + params->blocks_per_page = 1 << params->log_blocks_per_page; if (WARN_ON(!is_power_of_2(params->digest_size))) { err = -EINVAL; @@ -74,13 +94,10 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, err = -EINVAL; goto out_err; } - params->log_arity = params->log_blocksize - ilog2(params->digest_size); + params->log_digestsize = ilog2(params->digest_size); + params->log_arity = log_blocksize - params->log_digestsize; params->hashes_per_block = 1 << params->log_arity; - pr_debug("Merkle tree uses %s with %u-byte blocks (%u hashes/block), salt=%*phN\n", - hash_alg->name, params->block_size, params->hashes_per_block, - (int)salt_size, salt); - /* * Compute the number of levels in the Merkle tree and create a map from * level to the starting block of that level. 
Level 'num_levels - 1' is @@ -90,31 +107,45 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, /* Compute number of levels and the number of blocks in each level */ blocks = ((u64)inode->i_size + params->block_size - 1) >> log_blocksize; - pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks); while (blocks > 1) { if (params->num_levels >= FS_VERITY_MAX_LEVELS) { fsverity_err(inode, "Too many levels in Merkle tree"); - err = -EINVAL; + err = -EFBIG; goto out_err; } blocks = (blocks + params->hashes_per_block - 1) >> params->log_arity; - /* temporarily using level_start[] to store blocks in level */ - params->level_start[params->num_levels++] = blocks; + blocks_in_level[params->num_levels++] = blocks; } - params->level0_blocks = params->level_start[0]; /* Compute the starting block of each level */ offset = 0; for (level = (int)params->num_levels - 1; level >= 0; level--) { - blocks = params->level_start[level]; params->level_start[level] = offset; - pr_debug("Level %d is %llu blocks starting at index %llu\n", - level, blocks, offset); - offset += blocks; + offset += blocks_in_level[level]; + } + + /* + * With block_size != PAGE_SIZE, an in-memory bitmap will need to be + * allocated to track the "verified" status of hash blocks. Don't allow + * this bitmap to get too large. For now, limit it to 1 MiB, which + * limits the file size to about 4.4 TB with SHA-256 and 4K blocks. + * + * Together with the fact that the data, and thus also the Merkle tree, + * cannot have more than ULONG_MAX pages, this implies that hash block + * indices can always fit in an 'unsigned long'. But to be safe, we + * explicitly check for that too. Note, this is only for hash block + * indices; data block indices might not fit in an 'unsigned long'. 
+ */ + if ((params->block_size != PAGE_SIZE && offset > 1 << 23) || + offset > ULONG_MAX) { + fsverity_err(inode, "Too many blocks in Merkle tree"); + err = -EFBIG; + goto out_err; } params->tree_size = offset << log_blocksize; + params->tree_pages = PAGE_ALIGN(params->tree_size) >> PAGE_SHIFT; return 0; out_err: @@ -165,7 +196,7 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, fsverity_err(inode, "Error %d initializing Merkle tree parameters", err); - goto out; + goto fail; } memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size); @@ -174,20 +205,48 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, vi->file_digest); if (err) { fsverity_err(inode, "Error %d computing file digest", err); - goto out; + goto fail; } - pr_debug("Computed file digest: %s:%*phN\n", - vi->tree_params.hash_alg->name, - vi->tree_params.digest_size, vi->file_digest); err = fsverity_verify_signature(vi, desc->signature, le32_to_cpu(desc->sig_size)); -out: - if (err) { - fsverity_free_info(vi); - vi = ERR_PTR(err); + if (err) + goto fail; + + if (vi->tree_params.block_size != PAGE_SIZE) { + /* + * When the Merkle tree block size and page size differ, we use + * a bitmap to keep track of which hash blocks have been + * verified. This bitmap must contain one bit per hash block, + * including alignment to a page boundary at the end. + * + * Eventually, to support extremely large files in an efficient + * way, it might be necessary to make pages of this bitmap + * reclaimable. But for now, simply allocating the whole bitmap + * is a simple solution that works well on the files on which + * fsverity is realistically used. E.g., with SHA-256 and 4K + * blocks, a 100MB file only needs a 24-byte bitmap, and the + * bitmap for any file under 17GB fits in a 4K page. 
+ */ + unsigned long num_bits = + vi->tree_params.tree_pages << + vi->tree_params.log_blocks_per_page; + + vi->hash_block_verified = kvcalloc(BITS_TO_LONGS(num_bits), + sizeof(unsigned long), + GFP_KERNEL); + if (!vi->hash_block_verified) { + err = -ENOMEM; + goto fail; + } + spin_lock_init(&vi->hash_page_init_lock); } + return vi; + +fail: + fsverity_free_info(vi); + return ERR_PTR(err); } void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) @@ -214,6 +273,7 @@ void fsverity_free_info(struct fsverity_info *vi) if (!vi) return; kfree(vi->tree_params.hashstate); + kvfree(vi->hash_block_verified); kmem_cache_free(fsverity_info_cachep, vi); } @@ -325,67 +385,28 @@ out_free_desc: return err; } -/** - * fsverity_file_open() - prepare to open a verity file - * @inode: the inode being opened - * @filp: the struct file being set up - * - * When opening a verity file, deny the open if it is for writing. Otherwise, - * set up the inode's ->i_verity_info if not already done. - * - * When combined with fscrypt, this must be called after fscrypt_file_open(). - * Otherwise, we won't have the key set up to decrypt the verity metadata. - * - * Return: 0 on success, -errno on failure - */ -int fsverity_file_open(struct inode *inode, struct file *filp) +int __fsverity_file_open(struct inode *inode, struct file *filp) { - if (!IS_VERITY(inode)) - return 0; - - if (filp->f_mode & FMODE_WRITE) { - pr_debug("Denying opening verity file (ino %lu) for write\n", - inode->i_ino); + if (filp->f_mode & FMODE_WRITE) return -EPERM; - } - return ensure_verity_info(inode); } -EXPORT_SYMBOL_GPL(fsverity_file_open); +EXPORT_SYMBOL_GPL(__fsverity_file_open); -/** - * fsverity_prepare_setattr() - prepare to change a verity inode's attributes - * @dentry: dentry through which the inode is being changed - * @attr: attributes to change - * - * Verity files are immutable, so deny truncates. 
This isn't covered by the - * open-time check because sys_truncate() takes a path, not a file descriptor. - * - * Return: 0 on success, -errno on failure - */ -int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) +int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) { - if (IS_VERITY(d_inode(dentry)) && (attr->ia_valid & ATTR_SIZE)) { - pr_debug("Denying truncate of verity file (ino %lu)\n", - d_inode(dentry)->i_ino); + if (attr->ia_valid & ATTR_SIZE) return -EPERM; - } return 0; } -EXPORT_SYMBOL_GPL(fsverity_prepare_setattr); +EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr); -/** - * fsverity_cleanup_inode() - free the inode's verity info, if present - * @inode: an inode being evicted - * - * Filesystems must call this on inode eviction to free ->i_verity_info. - */ -void fsverity_cleanup_inode(struct inode *inode) +void __fsverity_cleanup_inode(struct inode *inode) { fsverity_free_info(inode->i_verity_info); inode->i_verity_info = NULL; } -EXPORT_SYMBOL_GPL(fsverity_cleanup_inode); +EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode); int __init fsverity_init_info_cache(void) { diff --git a/fs/verity/signature.c b/fs/verity/signature.c index 143a530a8008..e7d3ca919a1e 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -82,8 +82,6 @@ int fsverity_verify_signature(const struct fsverity_info *vi, return err; } - pr_debug("Valid signature for file digest %s:%*phN\n", - hash_alg->name, hash_alg->digest_size, vi->file_digest); return 0; } diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 961ba248021f..f50e3b5b52c9 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -9,39 +9,12 @@ #include <crypto/hash.h> #include <linux/bio.h> -#include <linux/ratelimit.h> static struct workqueue_struct *fsverity_read_workqueue; -/** - * hash_at_level() - compute the location of the block's hash at the given level - * - * @params: (in) the Merkle tree parameters - * @dindex: (in) the index of the data block being verified - 
* @level: (in) the level of hash we want (0 is leaf level) - * @hindex: (out) the index of the hash block containing the wanted hash - * @hoffset: (out) the byte offset to the wanted hash within the hash block - */ -static void hash_at_level(const struct merkle_tree_params *params, - pgoff_t dindex, unsigned int level, pgoff_t *hindex, - unsigned int *hoffset) -{ - pgoff_t position; - - /* Offset of the hash within the level's region, in hashes */ - position = dindex >> (level * params->log_arity); - - /* Index of the hash block in the tree overall */ - *hindex = params->level_start[level] + (position >> params->log_arity); - - /* Offset of the wanted hash (in bytes) within the hash block */ - *hoffset = (position & ((1 << params->log_arity) - 1)) << - (params->log_blocksize - params->log_arity); -} - static inline int cmp_hashes(const struct fsverity_info *vi, const u8 *want_hash, const u8 *real_hash, - pgoff_t index, int level) + u64 data_pos, int level) { const unsigned int hsize = vi->tree_params.digest_size; @@ -49,159 +22,312 @@ static inline int cmp_hashes(const struct fsverity_info *vi, return 0; fsverity_err(vi->inode, - "FILE CORRUPTED! index=%lu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", - index, level, + "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", + data_pos, level, vi->tree_params.hash_alg->name, hsize, want_hash, vi->tree_params.hash_alg->name, hsize, real_hash); return -EBADMSG; } +static bool data_is_zeroed(struct inode *inode, struct page *page, + unsigned int len, unsigned int offset) +{ + void *virt = kmap_local_page(page); + + if (memchr_inv(virt + offset, 0, len)) { + kunmap_local(virt); + fsverity_err(inode, + "FILE CORRUPTED! Data past EOF is not zeroed"); + return false; + } + kunmap_local(virt); + return true; +} + +/* + * Returns true if the hash block with index @hblock_idx in the tree, located in + * @hpage, has already been verified. 
+ */ +static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, + unsigned long hblock_idx) +{ + bool verified; + unsigned int blocks_per_page; + unsigned int i; + + /* + * When the Merkle tree block size and page size are the same, then the + * ->hash_block_verified bitmap isn't allocated, and we use PG_checked + * to directly indicate whether the page's block has been verified. + * + * Using PG_checked also guarantees that we re-verify hash pages that + * get evicted and re-instantiated from the backing storage, as new + * pages always start out with PG_checked cleared. + */ + if (!vi->hash_block_verified) + return PageChecked(hpage); + + /* + * When the Merkle tree block size and page size differ, we use a bitmap + * to indicate whether each hash block has been verified. + * + * However, we still need to ensure that hash pages that get evicted and + * re-instantiated from the backing storage are re-verified. To do + * this, we use PG_checked again, but now it doesn't really mean + * "checked". Instead, now it just serves as an indicator for whether + * the hash page is newly instantiated or not. + * + * The first thread that sees PG_checked=0 must clear the corresponding + * bitmap bits, then set PG_checked=1. This requires a spinlock. To + * avoid having to take this spinlock in the common case of + * PG_checked=1, we start with an opportunistic lockless read. + */ + if (PageChecked(hpage)) { + /* + * A read memory barrier is needed here to give ACQUIRE + * semantics to the above PageChecked() test. 
+ */ + smp_rmb(); + return test_bit(hblock_idx, vi->hash_block_verified); + } + spin_lock(&vi->hash_page_init_lock); + if (PageChecked(hpage)) { + verified = test_bit(hblock_idx, vi->hash_block_verified); + } else { + blocks_per_page = vi->tree_params.blocks_per_page; + hblock_idx = round_down(hblock_idx, blocks_per_page); + for (i = 0; i < blocks_per_page; i++) + clear_bit(hblock_idx + i, vi->hash_block_verified); + /* + * A write memory barrier is needed here to give RELEASE + * semantics to the below SetPageChecked() operation. + */ + smp_wmb(); + SetPageChecked(hpage); + verified = false; + } + spin_unlock(&vi->hash_page_init_lock); + return verified; +} + /* - * Verify a single data page against the file's Merkle tree. + * Verify a single data block against the file's Merkle tree. * * In principle, we need to verify the entire path to the root node. However, - * for efficiency the filesystem may cache the hash pages. Therefore we need - * only ascend the tree until an already-verified page is seen, as indicated by - * the PageChecked bit being set; then verify the path to that page. - * - * This code currently only supports the case where the verity block size is - * equal to PAGE_SIZE. Doing otherwise would be possible but tricky, since we - * wouldn't be able to use the PageChecked bit. - * - * Note that multiple processes may race to verify a hash page and mark it - * Checked, but it doesn't matter; the result will be the same either way. + * for efficiency the filesystem may cache the hash blocks. Therefore we need + * only ascend the tree until an already-verified hash block is seen, and then + * verify the path to that block. * - * Return: true if the page is valid, else false. + * Return: %true if the data block is valid, else %false. 
*/ -static bool verify_page(struct inode *inode, const struct fsverity_info *vi, - struct ahash_request *req, struct page *data_page, - unsigned long level0_ra_pages) +static bool +verify_data_block(struct inode *inode, struct fsverity_info *vi, + struct ahash_request *req, struct page *data_page, + u64 data_pos, unsigned int dblock_offset_in_page, + unsigned long max_ra_pages) { const struct merkle_tree_params *params = &vi->tree_params; const unsigned int hsize = params->digest_size; - const pgoff_t index = data_page->index; int level; u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE]; const u8 *want_hash; u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE]; - struct page *hpages[FS_VERITY_MAX_LEVELS]; - unsigned int hoffsets[FS_VERITY_MAX_LEVELS]; + /* The hash blocks that are traversed, indexed by level */ + struct { + /* Page containing the hash block */ + struct page *page; + /* Index of the hash block in the tree overall */ + unsigned long index; + /* Byte offset of the hash block within @page */ + unsigned int offset_in_page; + /* Byte offset of the wanted hash within @page */ + unsigned int hoffset; + } hblocks[FS_VERITY_MAX_LEVELS]; + /* + * The index of the previous level's block within that level; also the + * index of that block's hash within the current level. + */ + u64 hidx = data_pos >> params->log_blocksize; int err; - if (WARN_ON_ONCE(!PageLocked(data_page) || PageUptodate(data_page))) - return false; - - pr_debug_ratelimited("Verifying data page %lu...\n", index); + if (unlikely(data_pos >= inode->i_size)) { + /* + * This can happen in the data page spanning EOF when the Merkle + * tree block size is less than the page size. The Merkle tree + * doesn't cover data blocks fully past EOF. But the entire + * page spanning EOF can be visible to userspace via a mmap, and + * any part past EOF should be all zeroes. Therefore, we need + * to verify that any data blocks fully past EOF are all zeroes. 
+ */ + return data_is_zeroed(inode, data_page, params->block_size, + dblock_offset_in_page); + } /* - * Starting at the leaf level, ascend the tree saving hash pages along - * the way until we find a verified hash page, indicated by PageChecked; - * or until we reach the root. + * Starting at the leaf level, ascend the tree saving hash blocks along + * the way until we find a hash block that has already been verified, or + * until we reach the root. */ for (level = 0; level < params->num_levels; level++) { - pgoff_t hindex; + unsigned long next_hidx; + unsigned long hblock_idx; + pgoff_t hpage_idx; + unsigned int hblock_offset_in_page; unsigned int hoffset; struct page *hpage; - hash_at_level(params, index, level, &hindex, &hoffset); + /* + * The index of the block in the current level; also the index + * of that block's hash within the next level. + */ + next_hidx = hidx >> params->log_arity; + + /* Index of the hash block in the tree overall */ + hblock_idx = params->level_start[level] + next_hidx; + + /* Index of the hash page in the tree overall */ + hpage_idx = hblock_idx >> params->log_blocks_per_page; - pr_debug_ratelimited("Level %d: hindex=%lu, hoffset=%u\n", - level, hindex, hoffset); + /* Byte offset of the hash block within the page */ + hblock_offset_in_page = + (hblock_idx << params->log_blocksize) & ~PAGE_MASK; - hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, hindex, - level == 0 ? level0_ra_pages : 0); + /* Byte offset of the hash within the page */ + hoffset = hblock_offset_in_page + + ((hidx << params->log_digestsize) & + (params->block_size - 1)); + + hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, + hpage_idx, level == 0 ? 
min(max_ra_pages, + params->tree_pages - hpage_idx) : 0); if (IS_ERR(hpage)) { err = PTR_ERR(hpage); fsverity_err(inode, "Error %d reading Merkle tree page %lu", - err, hindex); + err, hpage_idx); goto out; } - - if (PageChecked(hpage)) { + if (is_hash_block_verified(vi, hpage, hblock_idx)) { memcpy_from_page(_want_hash, hpage, hoffset, hsize); want_hash = _want_hash; put_page(hpage); - pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n", - params->hash_alg->name, - hsize, want_hash); goto descend; } - pr_debug_ratelimited("Hash page not yet checked\n"); - hpages[level] = hpage; - hoffsets[level] = hoffset; + hblocks[level].page = hpage; + hblocks[level].index = hblock_idx; + hblocks[level].offset_in_page = hblock_offset_in_page; + hblocks[level].hoffset = hoffset; + hidx = next_hidx; } want_hash = vi->root_hash; - pr_debug("Want root hash: %s:%*phN\n", - params->hash_alg->name, hsize, want_hash); descend: - /* Descend the tree verifying hash pages */ + /* Descend the tree verifying hash blocks. */ for (; level > 0; level--) { - struct page *hpage = hpages[level - 1]; - unsigned int hoffset = hoffsets[level - 1]; - - err = fsverity_hash_page(params, inode, req, hpage, real_hash); + struct page *hpage = hblocks[level - 1].page; + unsigned long hblock_idx = hblocks[level - 1].index; + unsigned int hblock_offset_in_page = + hblocks[level - 1].offset_in_page; + unsigned int hoffset = hblocks[level - 1].hoffset; + + err = fsverity_hash_block(params, inode, req, hpage, + hblock_offset_in_page, real_hash); if (err) goto out; - err = cmp_hashes(vi, want_hash, real_hash, index, level - 1); + err = cmp_hashes(vi, want_hash, real_hash, data_pos, level - 1); if (err) goto out; - SetPageChecked(hpage); + /* + * Mark the hash block as verified. This must be atomic and + * idempotent, as the same hash block might be verified by + * multiple threads concurrently. 
+ */ + if (vi->hash_block_verified) + set_bit(hblock_idx, vi->hash_block_verified); + else + SetPageChecked(hpage); memcpy_from_page(_want_hash, hpage, hoffset, hsize); want_hash = _want_hash; put_page(hpage); - pr_debug("Verified hash page at level %d, now want %s:%*phN\n", - level - 1, params->hash_alg->name, hsize, want_hash); } - /* Finally, verify the data page */ - err = fsverity_hash_page(params, inode, req, data_page, real_hash); + /* Finally, verify the data block. */ + err = fsverity_hash_block(params, inode, req, data_page, + dblock_offset_in_page, real_hash); if (err) goto out; - err = cmp_hashes(vi, want_hash, real_hash, index, -1); + err = cmp_hashes(vi, want_hash, real_hash, data_pos, -1); out: for (; level > 0; level--) - put_page(hpages[level - 1]); + put_page(hblocks[level - 1].page); return err == 0; } +static bool +verify_data_blocks(struct inode *inode, struct fsverity_info *vi, + struct ahash_request *req, struct folio *data_folio, + size_t len, size_t offset, unsigned long max_ra_pages) +{ + const unsigned int block_size = vi->tree_params.block_size; + u64 pos = (u64)data_folio->index << PAGE_SHIFT; + + if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size))) + return false; + if (WARN_ON_ONCE(!folio_test_locked(data_folio) || + folio_test_uptodate(data_folio))) + return false; + do { + struct page *data_page = + folio_page(data_folio, offset >> PAGE_SHIFT); + + if (!verify_data_block(inode, vi, req, data_page, pos + offset, + offset & ~PAGE_MASK, max_ra_pages)) + return false; + offset += block_size; + len -= block_size; + } while (len); + return true; +} + /** - * fsverity_verify_page() - verify a data page - * @page: the page to verity + * fsverity_verify_blocks() - verify data in a folio + * @folio: the folio containing the data to verify + * @len: the length of the data to verify in the folio + * @offset: the offset of the data to verify in the folio * - * Verify a page that has just been read from a verity file. 
The page must be a - * pagecache page that is still locked and not yet uptodate. + * Verify data that has just been read from a verity file. The data must be + * located in a pagecache folio that is still locked and not yet uptodate. The + * length and offset of the data must be Merkle tree block size aligned. * - * Return: true if the page is valid, else false. + * Return: %true if the data is valid, else %false. */ -bool fsverity_verify_page(struct page *page) +bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset) { - struct inode *inode = page->mapping->host; - const struct fsverity_info *vi = inode->i_verity_info; + struct inode *inode = folio->mapping->host; + struct fsverity_info *vi = inode->i_verity_info; struct ahash_request *req; bool valid; /* This allocation never fails, since it's mempool-backed. */ req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS); - valid = verify_page(inode, vi, req, page, 0); + valid = verify_data_blocks(inode, vi, req, folio, len, offset, 0); fsverity_free_hash_request(vi->tree_params.hash_alg, req); return valid; } -EXPORT_SYMBOL_GPL(fsverity_verify_page); +EXPORT_SYMBOL_GPL(fsverity_verify_blocks); #ifdef CONFIG_BLOCK /** * fsverity_verify_bio() - verify a 'read' bio that has just completed * @bio: the bio to verify * - * Verify a set of pages that have just been read from a verity file. The pages - * must be pagecache pages that are still locked and not yet uptodate. If a - * page fails verification, then bio->bi_status is set to an error status. + * Verify the bio's data against the file's Merkle tree. All bio data segments + * must be aligned to the file's Merkle tree block size. If any data fails + * verification, then bio->bi_status is set to an error status. * * This is a helper function for use by the ->readahead() method of filesystems * that issue bios to read data directly into the page cache. 
Filesystems that @@ -212,15 +338,13 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); void fsverity_verify_bio(struct bio *bio) { struct inode *inode = bio_first_page_all(bio)->mapping->host; - const struct fsverity_info *vi = inode->i_verity_info; - const struct merkle_tree_params *params = &vi->tree_params; + struct fsverity_info *vi = inode->i_verity_info; struct ahash_request *req; - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; unsigned long max_ra_pages = 0; /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(params->hash_alg, GFP_NOFS); + req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS); if (bio->bi_opf & REQ_RAHEAD) { /* @@ -232,24 +356,18 @@ void fsverity_verify_bio(struct bio *bio) * This improves sequential read performance, as it greatly * reduces the number of I/O requests made to the Merkle tree. */ - bio_for_each_segment_all(bv, bio, iter_all) - max_ra_pages++; - max_ra_pages /= 4; + max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2); } - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; - unsigned long level0_index = page->index >> params->log_arity; - unsigned long level0_ra_pages = - min(max_ra_pages, params->level0_blocks - level0_index); - - if (!verify_page(inode, vi, req, page, level0_ra_pages)) { + bio_for_each_folio_all(fi, bio) { + if (!verify_data_blocks(inode, vi, req, fi.folio, fi.length, + fi.offset, max_ra_pages)) { bio->bi_status = BLK_STS_IOERR; break; } } - fsverity_free_hash_request(params->hash_alg, req); + fsverity_free_hash_request(vi->tree_params.hash_alg, req); } EXPORT_SYMBOL_GPL(fsverity_verify_bio); #endif /* CONFIG_BLOCK */ diff --git a/fs/xattr.c b/fs/xattr.c index adab9a70b536..14a7eb3c8fa8 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -9,6 +9,7 @@ Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/slab.h> 
#include <linux/file.h> #include <linux/xattr.h> @@ -82,7 +83,7 @@ xattr_resolve_name(struct inode *inode, const char **name) /** * may_write_xattr - check whether inode allows writing xattr - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: the inode on which to set an xattr * * Check whether the inode allows writing xattrs. Specifically, we can never @@ -94,13 +95,13 @@ xattr_resolve_name(struct inode *inode, const char **name) * * Return: On success zero is returned. On error a negative errno is returned. */ -int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode) +int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode) { if (IS_IMMUTABLE(inode)) return -EPERM; if (IS_APPEND(inode)) return -EPERM; - if (HAS_UNMAPPED_ID(mnt_userns, inode)) + if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; return 0; } @@ -110,13 +111,13 @@ int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode) * because different namespaces have very different rules. */ static int -xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, +xattr_permission(struct mnt_idmap *idmap, struct inode *inode, const char *name, int mask) { if (mask & MAY_WRITE) { int ret; - ret = may_write_xattr(mnt_userns, inode); + ret = may_write_xattr(idmap, inode); if (ret) return ret; } @@ -148,11 +149,11 @@ xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, return (mask & MAY_WRITE) ? 
-EPERM : -ENODATA; if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && (mask & MAY_WRITE) && - !inode_owner_or_capable(mnt_userns, inode)) + !inode_owner_or_capable(idmap, inode)) return -EPERM; } - return inode_permission(mnt_userns, inode, mask); + return inode_permission(idmap, inode, mask); } /* @@ -183,7 +184,7 @@ xattr_supported_namespace(struct inode *inode, const char *prefix) EXPORT_SYMBOL(xattr_supported_namespace); int -__vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +__vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { @@ -199,7 +200,7 @@ __vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, return -EOPNOTSUPP; if (size == 0) value = ""; /* empty EA, do not remove */ - return handler->set(handler, mnt_userns, dentry, inode, name, value, + return handler->set(handler, idmap, dentry, inode, name, value, size, flags); } EXPORT_SYMBOL(__vfs_setxattr); @@ -208,7 +209,7 @@ EXPORT_SYMBOL(__vfs_setxattr); * __vfs_setxattr_noperm - perform setxattr operation without performing * permission checks. * - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to @@ -221,7 +222,7 @@ EXPORT_SYMBOL(__vfs_setxattr); * is executed. It also assumes that the caller will make the appropriate * permission checks. 
*/ -int __vfs_setxattr_noperm(struct user_namespace *mnt_userns, +int __vfs_setxattr_noperm(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { @@ -233,7 +234,7 @@ int __vfs_setxattr_noperm(struct user_namespace *mnt_userns, if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_opflags & IOP_XATTR) { - error = __vfs_setxattr(mnt_userns, dentry, inode, name, value, + error = __vfs_setxattr(idmap, dentry, inode, name, value, size, flags); if (!error) { fsnotify_xattr(dentry); @@ -264,7 +265,7 @@ int __vfs_setxattr_noperm(struct user_namespace *mnt_userns, * __vfs_setxattr_locked - set an extended attribute while holding the inode * lock * - * @mnt_userns: user namespace of the mount of the target inode + * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to @@ -274,18 +275,18 @@ int __vfs_setxattr_noperm(struct user_namespace *mnt_userns, * a delegation was broken on, NULL if none. 
*/ int -__vfs_setxattr_locked(struct user_namespace *mnt_userns, struct dentry *dentry, +__vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; - error = xattr_permission(mnt_userns, inode, name, MAY_WRITE); + error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; - error = security_inode_setxattr(mnt_userns, dentry, name, value, size, + error = security_inode_setxattr(idmap, dentry, name, value, size, flags); if (error) goto out; @@ -294,7 +295,7 @@ __vfs_setxattr_locked(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) goto out; - error = __vfs_setxattr_noperm(mnt_userns, dentry, name, value, + error = __vfs_setxattr_noperm(idmap, dentry, name, value, size, flags); out: @@ -303,7 +304,7 @@ out: EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); int -vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; @@ -312,7 +313,7 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { - error = cap_convert_nscap(mnt_userns, dentry, &value, size); + error = cap_convert_nscap(idmap, dentry, &value, size); if (error < 0) return error; size = error; @@ -320,7 +321,7 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, retry_deleg: inode_lock(inode); - error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size, + error = __vfs_setxattr_locked(idmap, dentry, name, value, size, flags, &delegated_inode); inode_unlock(inode); @@ -337,19 +338,19 @@ retry_deleg: EXPORT_SYMBOL_GPL(vfs_setxattr); static ssize_t -xattr_getsecurity(struct user_namespace *mnt_userns, struct inode *inode, 
+xattr_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void *value, size_t size) { void *buffer = NULL; ssize_t len; if (!value || !size) { - len = security_inode_getsecurity(mnt_userns, inode, name, + len = security_inode_getsecurity(idmap, inode, name, &buffer, false); goto out_noalloc; } - len = security_inode_getsecurity(mnt_userns, inode, name, &buffer, + len = security_inode_getsecurity(idmap, inode, name, &buffer, true); if (len < 0) return len; @@ -374,7 +375,7 @@ out_noalloc: * Returns the result of alloc, if failed, or the getxattr operation. */ int -vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry, +vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t xattr_size, gfp_t flags) { @@ -383,7 +384,7 @@ vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry, char *value = *xattr_value; int error; - error = xattr_permission(mnt_userns, inode, name, MAY_READ); + error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; @@ -427,13 +428,13 @@ __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, EXPORT_SYMBOL(__vfs_getxattr); ssize_t -vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; int error; - error = xattr_permission(mnt_userns, inode, name, MAY_READ); + error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; @@ -444,7 +445,7 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - int ret = xattr_getsecurity(mnt_userns, inode, suffix, value, + int ret = xattr_getsecurity(idmap, inode, suffix, value, size); /* * Only overwrite the return value if a 
security module @@ -480,7 +481,7 @@ vfs_listxattr(struct dentry *dentry, char *list, size_t size) EXPORT_SYMBOL_GPL(vfs_listxattr); int -__vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, +__vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = d_inode(dentry); @@ -494,7 +495,7 @@ __vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; - return handler->set(handler, mnt_userns, dentry, inode, name, NULL, 0, + return handler->set(handler, idmap, dentry, inode, name, NULL, 0, XATTR_REPLACE); } EXPORT_SYMBOL(__vfs_removexattr); @@ -503,25 +504,25 @@ EXPORT_SYMBOL(__vfs_removexattr); * __vfs_removexattr_locked - set an extended attribute while holding the inode * lock * - * @mnt_userns: user namespace of the mount of the target inode + * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: name of xattr to remove * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. 
*/ int -__vfs_removexattr_locked(struct user_namespace *mnt_userns, +__vfs_removexattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; - error = xattr_permission(mnt_userns, inode, name, MAY_WRITE); + error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; - error = security_inode_removexattr(mnt_userns, dentry, name); + error = security_inode_removexattr(idmap, dentry, name); if (error) goto out; @@ -529,7 +530,7 @@ __vfs_removexattr_locked(struct user_namespace *mnt_userns, if (error) goto out; - error = __vfs_removexattr(mnt_userns, dentry, name); + error = __vfs_removexattr(idmap, dentry, name); if (!error) { fsnotify_xattr(dentry); @@ -542,7 +543,7 @@ out: EXPORT_SYMBOL_GPL(__vfs_removexattr_locked); int -vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, +vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; @@ -551,7 +552,7 @@ vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, retry_deleg: inode_lock(inode); - error = __vfs_removexattr_locked(mnt_userns, dentry, + error = __vfs_removexattr_locked(idmap, dentry, name, &delegated_inode); inode_unlock(inode); @@ -605,7 +606,7 @@ int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, return do_set_acl(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size); - return vfs_setxattr(mnt_idmap_owner(idmap), dentry, ctx->kname->name, + return vfs_setxattr(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size, ctx->flags); } @@ -714,8 +715,7 @@ do_getxattr(struct mnt_idmap *idmap, struct dentry *d, if (is_posix_acl_xattr(ctx->kname->name)) error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size); else - error = vfs_getxattr(mnt_idmap_owner(idmap), d, kname, - ctx->kvalue, ctx->size); + error = vfs_getxattr(idmap, d, kname, ctx->kvalue, ctx->size); if (error > 
0) { if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; @@ -892,9 +892,9 @@ removexattr(struct mnt_idmap *idmap, struct dentry *d, return error; if (is_posix_acl_xattr(kname)) - return vfs_remove_acl(mnt_idmap_owner(idmap), d, kname); + return vfs_remove_acl(idmap, d, kname); - return vfs_removexattr(mnt_idmap_owner(idmap), d, kname); + return vfs_removexattr(idmap, d, kname); } static int path_removexattr(const char __user *pathname, diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index a05f44eb8178..791db7d9c849 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -242,7 +242,7 @@ xfs_acl_set_mode( } int -xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +xfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { umode_t mode; @@ -258,7 +258,7 @@ xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, return error; if (type == ACL_TYPE_ACCESS) { - error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl); + error = posix_acl_update_mode(idmap, inode, &mode, &acl); if (error) return error; set_mode = true; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index dcd176149c7a..bf7f960997d3 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -11,7 +11,7 @@ struct posix_acl; #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu); -extern int xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int xfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); void xfs_forget_acl(struct inode *inode, const char *name); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 595a5bcf46b9..d06c0cc62f61 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1047,7 +1047,7 @@ xfs_file_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; - error = 
xfs_vn_setattr_size(file_mnt_user_ns(file), + error = xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file), &iattr); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d354ea2b74f9..7f1d715faab5 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -777,7 +777,7 @@ xfs_inode_inherit_flags2( */ int xfs_init_new_inode( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_trans *tp, struct xfs_inode *pip, xfs_ino_t ino, @@ -823,11 +823,11 @@ xfs_init_new_inode( ip->i_projid = prid; if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) { - inode_fsuid_set(inode, mnt_userns); + inode_fsuid_set(inode, idmap); inode->i_gid = dir->i_gid; inode->i_mode = mode; } else { - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); } /* @@ -836,7 +836,7 @@ xfs_init_new_inode( * (and only if the irix_sgid_inherit compatibility variable is set). */ if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && - !vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode))) + !vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode))) inode->i_mode &= ~S_ISGID; ip->i_disk_size = 0; @@ -946,7 +946,7 @@ xfs_bumplink( int xfs_create( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, xfs_inode_t *dp, struct xfs_name *name, umode_t mode, @@ -978,8 +978,8 @@ xfs_create( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), - mapped_fsgid(mnt_userns, &init_user_ns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns), + mapped_fsgid(idmap, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1020,7 +1020,7 @@ xfs_create( */ error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); if (!error) - error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode, + error = xfs_init_new_inode(idmap, tp, dp, ino, mode, is_dir ? 
2 : 1, rdev, prid, init_xattrs, &ip); if (error) goto out_trans_cancel; @@ -1102,7 +1102,7 @@ xfs_create( int xfs_create_tmpfile( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_inode *dp, umode_t mode, struct xfs_inode **ipp) @@ -1127,8 +1127,8 @@ xfs_create_tmpfile( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), - mapped_fsgid(mnt_userns, &init_user_ns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns), + mapped_fsgid(idmap, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1144,7 +1144,7 @@ xfs_create_tmpfile( error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); if (!error) - error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode, + error = xfs_init_new_inode(idmap, tp, dp, ino, mode, 0, 0, prid, false, &ip); if (error) goto out_trans_cancel; @@ -2709,7 +2709,7 @@ out_trans_abort: */ static int xfs_rename_alloc_whiteout( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_name *src_name, struct xfs_inode *dp, struct xfs_inode **wip) @@ -2718,7 +2718,7 @@ xfs_rename_alloc_whiteout( struct qstr name; int error; - error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE, + error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE, &tmpfile); if (error) return error; @@ -2750,7 +2750,7 @@ xfs_rename_alloc_whiteout( */ int xfs_rename( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, @@ -2782,7 +2782,7 @@ xfs_rename( * appropriately. 
*/ if (flags & RENAME_WHITEOUT) { - error = xfs_rename_alloc_whiteout(mnt_userns, src_name, + error = xfs_rename_alloc_whiteout(idmap, src_name, target_dp, &wip); if (error) return error; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index fa780f08dc89..69d21e42c10a 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -473,18 +473,18 @@ int xfs_release(struct xfs_inode *ip); void xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); -int xfs_create(struct user_namespace *mnt_userns, +int xfs_create(struct mnt_idmap *idmap, struct xfs_inode *dp, struct xfs_name *name, umode_t mode, dev_t rdev, bool need_xattr, struct xfs_inode **ipp); -int xfs_create_tmpfile(struct user_namespace *mnt_userns, +int xfs_create_tmpfile(struct mnt_idmap *idmap, struct xfs_inode *dp, umode_t mode, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, struct xfs_name *target_name); -int xfs_rename(struct user_namespace *mnt_userns, +int xfs_rename(struct mnt_idmap *idmap, struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, struct xfs_inode *target_dp, struct xfs_name *target_name, @@ -515,7 +515,7 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); -int xfs_init_new_inode(struct user_namespace *mnt_userns, struct xfs_trans *tp, +int xfs_init_new_inode(struct mnt_idmap *idmap, struct xfs_trans *tp, struct xfs_inode *pip, xfs_ino_t ino, umode_t mode, xfs_nlink_t nlink, dev_t rdev, prid_t prid, bool init_xattrs, struct xfs_inode **ipp); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 736510bc241b..55bb01173cde 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -665,7 +665,7 @@ xfs_ioc_fsbulkstat( 
struct xfs_fsop_bulkreq bulkreq; struct xfs_ibulk breq = { .mp = mp, - .mnt_userns = file_mnt_user_ns(file), + .idmap = file_mnt_idmap(file), .ocount = 0, }; xfs_ino_t lastino; @@ -844,7 +844,7 @@ xfs_ioc_bulkstat( struct xfs_bulk_ireq hdr; struct xfs_ibulk breq = { .mp = mp, - .mnt_userns = file_mnt_user_ns(file), + .idmap = file_mnt_idmap(file), }; int error; @@ -1297,7 +1297,7 @@ xfs_ioctl_setattr_check_projid( int xfs_fileattr_set( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { @@ -1371,7 +1371,7 @@ xfs_fileattr_set( */ if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) && - !capable_wrt_inode_uidgid(mnt_userns, VFS_I(ip), CAP_FSETID)) + !capable_wrt_inode_uidgid(idmap, VFS_I(ip), CAP_FSETID)) VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID); /* Change the ownerships and register project quota modifications */ diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index d4abba2c13c1..38be600b5e1e 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -49,7 +49,7 @@ xfs_fileattr_get( extern int xfs_fileattr_set( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 2f54b701eead..ee35eea1ecce 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -204,7 +204,7 @@ xfs_compat_ioc_fsbulkstat( struct xfs_fsop_bulkreq bulkreq; struct xfs_ibulk breq = { .mp = mp, - .mnt_userns = file_mnt_user_ns(file), + .idmap = file_mnt_idmap(file), .ocount = 0, }; xfs_ino_t lastino; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 515318dfbc38..24718adb3c16 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -162,12 +162,12 @@ xfs_create_need_xattr( STATIC int xfs_generic_create( - struct user_namespace *mnt_userns, - struct inode *dir, - struct dentry *dentry, - umode_t mode, - dev_t rdev, - struct file *tmpfile) /* unnamed file */ + struct mnt_idmap *idmap, + struct inode *dir, + struct dentry 
*dentry, + umode_t mode, + dev_t rdev, + struct file *tmpfile) /* unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; @@ -196,11 +196,11 @@ xfs_generic_create( goto out_free_acl; if (!tmpfile) { - error = xfs_create(mnt_userns, XFS_I(dir), &name, mode, rdev, + error = xfs_create(idmap, XFS_I(dir), &name, mode, rdev, xfs_create_need_xattr(dir, default_acl, acl), &ip); } else { - error = xfs_create_tmpfile(mnt_userns, XFS_I(dir), mode, &ip); + error = xfs_create_tmpfile(idmap, XFS_I(dir), mode, &ip); } if (unlikely(error)) goto out_free_acl; @@ -255,35 +255,34 @@ xfs_generic_create( STATIC int xfs_vn_mknod( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, NULL); + return xfs_generic_create(idmap, dir, dentry, mode, rdev, NULL); } STATIC int xfs_vn_create( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool flags) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, NULL); + return xfs_generic_create(idmap, dir, dentry, mode, 0, NULL); } STATIC int xfs_vn_mkdir( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0, - NULL); + return xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL); } STATIC struct dentry * @@ -400,7 +399,7 @@ xfs_vn_unlink( STATIC int xfs_vn_symlink( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) @@ -417,7 +416,7 @@ xfs_vn_symlink( if (unlikely(error)) goto out; - error = xfs_symlink(mnt_userns, XFS_I(dir), &name, symname, mode, &cip); + error = xfs_symlink(idmap, XFS_I(dir), &name, symname, mode, &cip); if (unlikely(error)) goto out; @@ -443,7 +442,7 @@ xfs_vn_symlink( 
STATIC int xfs_vn_rename( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *odir, struct dentry *odentry, struct inode *ndir, @@ -472,7 +471,7 @@ xfs_vn_rename( if (unlikely(error)) return error; - return xfs_rename(mnt_userns, XFS_I(odir), &oname, + return xfs_rename(idmap, XFS_I(odir), &oname, XFS_I(d_inode(odentry)), XFS_I(ndir), &nname, new_inode ? XFS_I(new_inode) : NULL, flags); } @@ -549,7 +548,7 @@ xfs_stat_blksize( STATIC int xfs_vn_getattr( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, @@ -558,8 +557,8 @@ xfs_vn_getattr( struct inode *inode = d_inode(path->dentry); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); trace_xfs_getattr(ip); @@ -627,7 +626,7 @@ xfs_vn_getattr( static int xfs_vn_change_ok( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { @@ -639,7 +638,7 @@ xfs_vn_change_ok( if (xfs_is_shutdown(mp)) return -EIO; - return setattr_prepare(mnt_userns, dentry, iattr); + return setattr_prepare(idmap, dentry, iattr); } /* @@ -650,7 +649,7 @@ xfs_vn_change_ok( */ static int xfs_setattr_nonsize( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct xfs_inode *ip, struct iattr *iattr) @@ -679,14 +678,14 @@ xfs_setattr_nonsize( uint qflags = 0; if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { - uid = from_vfsuid(mnt_userns, i_user_ns(inode), + uid = from_vfsuid(idmap, i_user_ns(inode), iattr->ia_vfsuid); qflags |= XFS_QMOPT_UQUOTA; } else { uid = inode->i_uid; } if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { - gid = from_vfsgid(mnt_userns, i_user_ns(inode), + gid = from_vfsgid(idmap, i_user_ns(inode), 
iattr->ia_vfsgid); qflags |= XFS_QMOPT_GQUOTA; } else { @@ -719,18 +718,18 @@ xfs_setattr_nonsize( * also. */ if (XFS_IS_UQUOTA_ON(mp) && - i_uid_needs_update(mnt_userns, iattr, inode)) { + i_uid_needs_update(idmap, iattr, inode)) { ASSERT(udqp); old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } if (XFS_IS_GQUOTA_ON(mp) && - i_gid_needs_update(mnt_userns, iattr, inode)) { + i_gid_needs_update(idmap, iattr, inode)) { ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp)); ASSERT(gdqp); old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - setattr_copy(mnt_userns, inode, iattr); + setattr_copy(idmap, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); @@ -758,7 +757,7 @@ xfs_setattr_nonsize( * Posix ACL code seems to care about this issue either. */ if (mask & ATTR_MODE) { - error = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); + error = posix_acl_chmod(idmap, dentry, inode->i_mode); if (error) return error; } @@ -779,7 +778,7 @@ out_dqrele: */ STATIC int xfs_setattr_size( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct xfs_inode *ip, struct iattr *iattr) @@ -812,7 +811,7 @@ xfs_setattr_size( * Use the regular setattr path to update the timestamps. 
*/ iattr->ia_valid &= ~ATTR_SIZE; - return xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr); + return xfs_setattr_nonsize(idmap, dentry, ip, iattr); } /* @@ -956,7 +955,7 @@ xfs_setattr_size( } ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); - setattr_copy(mnt_userns, inode, iattr); + setattr_copy(idmap, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); @@ -977,7 +976,7 @@ out_trans_cancel: int xfs_vn_setattr_size( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { @@ -986,15 +985,15 @@ xfs_vn_setattr_size( trace_xfs_setattr(ip); - error = xfs_vn_change_ok(mnt_userns, dentry, iattr); + error = xfs_vn_change_ok(idmap, dentry, iattr); if (error) return error; - return xfs_setattr_size(mnt_userns, dentry, ip, iattr); + return xfs_setattr_size(idmap, dentry, ip, iattr); } STATIC int xfs_vn_setattr( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { @@ -1014,14 +1013,14 @@ xfs_vn_setattr( return error; } - error = xfs_vn_setattr_size(mnt_userns, dentry, iattr); + error = xfs_vn_setattr_size(idmap, dentry, iattr); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); } else { trace_xfs_setattr(ip); - error = xfs_vn_change_ok(mnt_userns, dentry, iattr); + error = xfs_vn_change_ok(idmap, dentry, iattr); if (!error) - error = xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr); + error = xfs_setattr_nonsize(idmap, dentry, ip, iattr); } return error; @@ -1092,12 +1091,12 @@ xfs_vn_fiemap( STATIC int xfs_vn_tmpfile( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { - int err = xfs_generic_create(mnt_userns, dir, file->f_path.dentry, mode, 0, file); + int err = xfs_generic_create(idmap, dir, file->f_path.dentry, mode, 0, file); return finish_open_simple(file, err); } diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index e570dcb5df8d..7f84a0843b24 100644 --- 
a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -13,7 +13,7 @@ extern const struct file_operations xfs_dir_file_operations; extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); -int xfs_vn_setattr_size(struct user_namespace *mnt_userns, +int xfs_vn_setattr_size(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *vap); int xfs_inode_init_security(struct inode *inode, struct inode *dir, diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index a1c2bcf65d37..f225413a993c 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -55,7 +55,7 @@ struct xfs_bstat_chunk { STATIC int xfs_bulkstat_one_int( struct xfs_mount *mp, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_trans *tp, xfs_ino_t ino, struct xfs_bstat_chunk *bc) @@ -83,8 +83,8 @@ xfs_bulkstat_one_int( ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); inode = VFS_I(ip); - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); - vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid = i_gid_into_vfsgid(idmap, inode); /* xfs_iget returns the following without needing * further change. 
@@ -178,7 +178,7 @@ xfs_bulkstat_one( struct xfs_trans *tp; int error; - if (breq->mnt_userns != &init_user_ns) { + if (breq->idmap != &nop_mnt_idmap) { xfs_warn_ratelimited(breq->mp, "bulkstat not supported inside of idmapped mounts."); return -EINVAL; @@ -199,7 +199,7 @@ xfs_bulkstat_one( if (error) goto out; - error = xfs_bulkstat_one_int(breq->mp, breq->mnt_userns, tp, + error = xfs_bulkstat_one_int(breq->mp, breq->idmap, tp, breq->startino, &bc); xfs_trans_cancel(tp); out: @@ -225,7 +225,7 @@ xfs_bulkstat_iwalk( struct xfs_bstat_chunk *bc = data; int error; - error = xfs_bulkstat_one_int(mp, bc->breq->mnt_userns, tp, ino, data); + error = xfs_bulkstat_one_int(mp, bc->breq->idmap, tp, ino, data); /* bulkstat just skips over missing inodes */ if (error == -ENOENT || error == -EINVAL) return 0; @@ -270,7 +270,7 @@ xfs_bulkstat( unsigned int iwalk_flags = 0; int error; - if (breq->mnt_userns != &init_user_ns) { + if (breq->idmap != &nop_mnt_idmap) { xfs_warn_ratelimited(breq->mp, "bulkstat not supported inside of idmapped mounts."); return -EINVAL; diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index e2d0eba43f35..1659f13f17a8 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -8,7 +8,7 @@ /* In-memory representation of a userspace request for batch inode data. 
*/ struct xfs_ibulk { struct xfs_mount *mp; - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; void __user *ubuffer; /* user output buffer */ xfs_ino_t startino; /* start with this inode */ unsigned int icount; /* number of elements in ubuffer */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index f9878021e7d0..e88f18f85e4b 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -34,6 +34,7 @@ typedef __u32 xfs_nlink_t; #include <linux/module.h> #include <linux/mutex.h> #include <linux/file.h> +#include <linux/filelock.h> #include <linux/swap.h> #include <linux/errno.h> #include <linux/sched/signal.h> diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 38d23f0e703a..23d16186e1a3 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -322,7 +322,7 @@ xfs_fs_commit_blocks( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); if (update_isize) { i_size_write(inode, iattr->ia_size); ip->i_disk_size = iattr->ia_size; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index e2c542f6dcd4..7dc0db7f5a76 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -787,7 +787,7 @@ xfs_qm_qino_alloc( error = xfs_dialloc(&tp, 0, S_IFREG, &ino); if (!error) - error = xfs_init_new_inode(&init_user_ns, tp, NULL, ino, + error = xfs_init_new_inode(&nop_mnt_idmap, tp, NULL, ino, S_IFREG, 1, 0, 0, false, ipp); if (error) { xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 8389f3ef88ef..85e433df6a3f 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -144,7 +144,7 @@ xfs_readlink( int xfs_symlink( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, @@ -193,8 +193,8 @@ xfs_symlink( /* * Make sure that we have allocated dquot(s) on disk. 
*/ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), - mapped_fsgid(mnt_userns, &init_user_ns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns), + mapped_fsgid(idmap, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -231,7 +231,7 @@ xfs_symlink( */ error = xfs_dialloc(&tp, dp->i_ino, S_IFLNK, &ino); if (!error) - error = xfs_init_new_inode(mnt_userns, tp, dp, ino, + error = xfs_init_new_inode(idmap, tp, dp, ino, S_IFLNK | (mode & ~S_IFMT), 1, 0, prid, false, &ip); if (error) diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h index 2586b7e393f3..d1ca1ce62a93 100644 --- a/fs/xfs/xfs_symlink.h +++ b/fs/xfs/xfs_symlink.h @@ -7,7 +7,7 @@ /* Kernel only symlink definitions */ -int xfs_symlink(struct user_namespace *mnt_userns, struct xfs_inode *dp, +int xfs_symlink(struct mnt_idmap *idmap, struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, umode_t mode, struct xfs_inode **ipp); int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 10aa1fd39d2b..7b9a0ed1b11f 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -133,7 +133,7 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, static int xfs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, struct dentry *unused, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index a9c5c3f720ad..72ef97320b99 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -604,7 +604,7 @@ unlock: return ret; } -static int zonefs_inode_setattr(struct user_namespace *mnt_userns, +static int zonefs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); @@ -613,7 +613,7 @@ static int 
zonefs_inode_setattr(struct user_namespace *mnt_userns, if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; - ret = setattr_prepare(&init_user_ns, dentry, iattr); + ret = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (ret) return ret; @@ -630,7 +630,7 @@ static int zonefs_inode_setattr(struct user_namespace *mnt_userns, !uid_eq(iattr->ia_uid, inode->i_uid)) || ((iattr->ia_valid & ATTR_GID) && !gid_eq(iattr->ia_gid, inode->i_gid))) { - ret = dquot_transfer(mnt_userns, inode, iattr); + ret = dquot_transfer(&nop_mnt_idmap, inode, iattr); if (ret) return ret; } @@ -641,7 +641,7 @@ static int zonefs_inode_setattr(struct user_namespace *mnt_userns, return ret; } - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); return 0; } @@ -1427,7 +1427,7 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, struct super_block *sb = parent->i_sb; inode->i_ino = bdev_nr_zones(sb->s_bdev) + type + 1; - inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555); + inode_init_owner(&nop_mnt_idmap, inode, parent, S_IFDIR | 0555); inode->i_op = &zonefs_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); diff --git a/include/acpi/actbl3.h b/include/acpi/actbl3.h index 7b9571e00cc4..832c6464f063 100644 --- a/include/acpi/actbl3.h +++ b/include/acpi/actbl3.h @@ -443,6 +443,7 @@ struct acpi_tpm2_phy { #define ACPI_TPM2_RESERVED10 10 #define ACPI_TPM2_COMMAND_BUFFER_WITH_ARM_SMC 11 /* V1.2 Rev 8 */ #define ACPI_TPM2_RESERVED 12 +#define ACPI_TPM2_COMMAND_BUFFER_WITH_PLUTON 13 /* Optional trailer appears after any start_method subtables */ diff --git a/include/drm/drm_client.h b/include/drm/drm_client.h index 4fc8018eddda..1220d185c776 100644 --- a/include/drm/drm_client.h +++ b/include/drm/drm_client.h @@ -127,11 +127,6 @@ struct drm_client_buffer { struct drm_client_dev *client; /** - * @handle: Buffer handle - */ - u32 handle; - - /** * @pitch: Buffer pitch */ u32 pitch; diff --git 
a/include/kunit/test.h b/include/kunit/test.h index 87ea90576b50..08d3559dd703 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -303,7 +303,6 @@ static inline int kunit_run_all_tests(void) */ #define kunit_test_init_section_suites(__suites...) \ __kunit_test_suites(CONCATENATE(__UNIQUE_ID(array), _probe), \ - CONCATENATE(__UNIQUE_ID(suites), _probe), \ ##__suites) #define kunit_test_init_section_suite(suite) \ @@ -683,8 +682,9 @@ do { \ .right_text = #right, \ }; \ \ - if (likely(memcmp(__left, __right, __size) op 0)) \ - break; \ + if (likely(__left && __right)) \ + if (likely(memcmp(__left, __right, __size) op 0)) \ + break; \ \ _KUNIT_FAILED(test, \ assert_type, \ diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 9270cd87da3f..6470f67e63c4 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -263,7 +263,7 @@ struct vgic_dist { struct vgic_io_device dist_iodev; bool has_its; - bool save_its_tables_in_progress; + bool table_write_in_progress; /* * Contains the attributes and gpa of the LPI configuration table. 
diff --git a/include/linux/audit.h b/include/linux/audit.h index 3608992848d3..31086a72e32a 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -14,6 +14,7 @@ #include <linux/audit_arch.h> #include <uapi/linux/audit.h> #include <uapi/linux/netfilter/nf_tables.h> +#include <uapi/linux/fanotify.h> #define AUDIT_INO_UNSET ((unsigned long)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) @@ -416,7 +417,7 @@ extern void __audit_log_capset(const struct cred *new, const struct cred *old); extern void __audit_mmap_fd(int fd, int flags); extern void __audit_openat2_how(struct open_how *how); extern void __audit_log_kern_module(char *name); -extern void __audit_fanotify(unsigned int response); +extern void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar); extern void __audit_tk_injoffset(struct timespec64 offset); extern void __audit_ntp_log(const struct audit_ntp_data *ad); extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, @@ -523,10 +524,10 @@ static inline void audit_log_kern_module(char *name) __audit_log_kern_module(name); } -static inline void audit_fanotify(unsigned int response) +static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { if (!audit_dummy_context()) - __audit_fanotify(response); + __audit_fanotify(response, friar); } static inline void audit_tk_injoffset(struct timespec64 offset) @@ -679,7 +680,7 @@ static inline void audit_log_kern_module(char *name) { } -static inline void audit_fanotify(unsigned int response) +static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { } static inline void audit_tk_injoffset(struct timespec64 offset) diff --git a/include/linux/bio.h b/include/linux/bio.h index c1da63f6c808..d766be7152e1 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -12,6 +12,8 @@ #define BIO_MAX_VECS 256U +struct queue_limits; + static inline unsigned int bio_max_segs(unsigned int nr_segs) { 
return min(nr_segs, BIO_MAX_VECS); @@ -375,6 +377,8 @@ static inline void bip_set_seed(struct bio_integrity_payload *bip, void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, struct bio_set *bs, unsigned max_bytes); /** * bio_next_split - get next @sectors from a bio, splitting if necessary diff --git a/include/linux/capability.h b/include/linux/capability.h index 65efb74c3585..03c2a613ad40 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -42,6 +42,7 @@ struct inode; struct dentry; struct task_struct; struct user_namespace; +struct mnt_idmap; extern const kernel_cap_t __cap_empty_set; extern const kernel_cap_t __cap_init_eff_set; @@ -248,9 +249,9 @@ static inline bool ns_capable_setid(struct user_namespace *ns, int cap) } #endif /* CONFIG_MULTIUSER */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const struct inode *inode); -bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns, +bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); @@ -271,11 +272,11 @@ static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns) } /* audit system wants to get cap info from files as well */ -int get_vfs_caps_from_disk(struct user_namespace *mnt_userns, +int get_vfs_caps_from_disk(struct mnt_idmap *idmap, const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); -int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry, +int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry, const void **ivalue, size_t size); 
#endif /* !_LINUX_CAPABILITY_H */ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 00af2c98da75..4497d0a6772c 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -99,16 +99,6 @@ struct ceph_options { #define CEPH_AUTH_NAME_DEFAULT "guest" -/* mount state */ -enum { - CEPH_MOUNT_MOUNTING, - CEPH_MOUNT_MOUNTED, - CEPH_MOUNT_UNMOUNTING, - CEPH_MOUNT_UNMOUNTED, - CEPH_MOUNT_SHUTDOWN, - CEPH_MOUNT_RECOVER, -}; - static inline unsigned long ceph_timeout_jiffies(unsigned long timeout) { return timeout ?: MAX_SCHEDULE_TIMEOUT; diff --git a/include/linux/efi.h b/include/linux/efi.h index 4b27519143f5..98598bd1d2fa 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -668,7 +668,8 @@ extern struct efi { #define EFI_RT_SUPPORTED_ALL 0x3fff -#define EFI_RT_SUPPORTED_TIME_SERVICES 0x000f +#define EFI_RT_SUPPORTED_TIME_SERVICES 0x0003 +#define EFI_RT_SUPPORTED_WAKEUP_SERVICES 0x000c #define EFI_RT_SUPPORTED_VARIABLE_SERVICES 0x0070 extern struct mm_struct efi_mm; diff --git a/include/linux/evm.h b/include/linux/evm.h index 7a9ee2157f69..7dc1ee74169f 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -21,34 +21,34 @@ extern enum integrity_status evm_verifyxattr(struct dentry *dentry, void *xattr_value, size_t xattr_value_len, struct integrity_iint_cache *iint); -extern int evm_inode_setattr(struct user_namespace *mnt_userns, +extern int evm_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void evm_inode_post_setattr(struct dentry *dentry, int ia_valid); -extern int evm_inode_setxattr(struct user_namespace *mnt_userns, +extern int evm_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size); extern void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len); -extern int evm_inode_removexattr(struct user_namespace *mnt_userns, +extern int 
evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name); extern void evm_inode_post_removexattr(struct dentry *dentry, const char *xattr_name); -static inline void evm_inode_post_remove_acl(struct user_namespace *mnt_userns, +static inline void evm_inode_post_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { evm_inode_post_removexattr(dentry, acl_name); } -extern int evm_inode_set_acl(struct user_namespace *mnt_userns, +extern int evm_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); -static inline int evm_inode_remove_acl(struct user_namespace *mnt_userns, +static inline int evm_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { - return evm_inode_set_acl(mnt_userns, dentry, acl_name, NULL); + return evm_inode_set_acl(idmap, dentry, acl_name, NULL); } static inline void evm_inode_post_set_acl(struct dentry *dentry, const char *acl_name, @@ -90,7 +90,7 @@ static inline enum integrity_status evm_verifyxattr(struct dentry *dentry, } #endif -static inline int evm_inode_setattr(struct user_namespace *mnt_userns, +static inline int evm_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { return 0; @@ -101,7 +101,7 @@ static inline void evm_inode_post_setattr(struct dentry *dentry, int ia_valid) return; } -static inline int evm_inode_setxattr(struct user_namespace *mnt_userns, +static inline int evm_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size) { @@ -116,7 +116,7 @@ static inline void evm_inode_post_setxattr(struct dentry *dentry, return; } -static inline int evm_inode_removexattr(struct user_namespace *mnt_userns, +static inline int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { @@ -129,21 +129,21 @@ static inline void evm_inode_post_removexattr(struct dentry 
*dentry, return; } -static inline void evm_inode_post_remove_acl(struct user_namespace *mnt_userns, +static inline void evm_inode_post_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return; } -static inline int evm_inode_set_acl(struct user_namespace *mnt_userns, +static inline int evm_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { return 0; } -static inline int evm_inode_remove_acl(struct user_namespace *mnt_userns, +static inline int evm_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index fe848901fcc3..9f4d4bcbf251 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -213,7 +213,6 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); - u64 (*fetch_iversion)(struct inode *); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 8ad743def6f3..4f1c4f603118 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -122,6 +122,11 @@ #define ALL_FANOTIFY_EVENT_BITS (FANOTIFY_OUTGOING_EVENTS | \ FANOTIFY_EVENT_FLAGS) +/* These masks check for invalid bits in permission responses. 
*/ +#define FANOTIFY_RESPONSE_ACCESS (FAN_ALLOW | FAN_DENY) +#define FANOTIFY_RESPONSE_FLAGS (FAN_AUDIT | FAN_INFO) +#define FANOTIFY_RESPONSE_VALID_MASK (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS) + /* Do not use these old uapi constants internally */ #undef FAN_ALL_CLASS_BITS #undef FAN_ALL_INIT_FLAGS diff --git a/include/linux/fb.h b/include/linux/fb.h index 96b96323e9cb..73eb1f85ea8e 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -662,6 +662,7 @@ extern int fb_deferred_io_init(struct fb_info *info); extern void fb_deferred_io_open(struct fb_info *info, struct inode *inode, struct file *file); +extern void fb_deferred_io_release(struct fb_info *info); extern void fb_deferred_io_cleanup(struct fb_info *info); extern int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/include/linux/fileattr.h b/include/linux/fileattr.h index 9e37e063ac69..47c05a9851d0 100644 --- a/include/linux/fileattr.h +++ b/include/linux/fileattr.h @@ -53,7 +53,7 @@ static inline bool fileattr_has_fsx(const struct fileattr *fa) } int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int vfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, +int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); #endif /* _LINUX_FILEATTR_H */ diff --git a/include/linux/filelock.h b/include/linux/filelock.h new file mode 100644 index 000000000000..efcdd1631d9b --- /dev/null +++ b/include/linux/filelock.h @@ -0,0 +1,439 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FILELOCK_H +#define _LINUX_FILELOCK_H + +#include <linux/fs.h> + +#define FL_POSIX 1 +#define FL_FLOCK 2 +#define FL_DELEG 4 /* NFSv4 delegation */ +#define FL_ACCESS 8 /* not trying to lock, just looking */ +#define FL_EXISTS 16 /* when unlocking, test for existence */ +#define FL_LEASE 32 /* lease held on this file */ +#define FL_CLOSE 64 /* unlock on close */ +#define FL_SLEEP 128 /* A blocking lock 
*/ +#define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ +#define FL_UNLOCK_PENDING 512 /* Lease is being broken */ +#define FL_OFDLCK 1024 /* lock is "owned" by struct file */ +#define FL_LAYOUT 2048 /* outstanding pNFS layout */ +#define FL_RECLAIM 4096 /* reclaiming from a reboot server */ + +#define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) + +/* + * Special return value from posix_lock_file() and vfs_lock_file() for + * asynchronous locking. + */ +#define FILE_LOCK_DEFERRED 1 + +struct file_lock; + +struct file_lock_operations { + void (*fl_copy_lock)(struct file_lock *, struct file_lock *); + void (*fl_release_private)(struct file_lock *); +}; + +struct lock_manager_operations { + void *lm_mod_owner; + fl_owner_t (*lm_get_owner)(fl_owner_t); + void (*lm_put_owner)(fl_owner_t); + void (*lm_notify)(struct file_lock *); /* unblock callback */ + int (*lm_grant)(struct file_lock *, int); + bool (*lm_break)(struct file_lock *); + int (*lm_change)(struct file_lock *, int, struct list_head *); + void (*lm_setup)(struct file_lock *, void **); + bool (*lm_breaker_owns_lease)(struct file_lock *); + bool (*lm_lock_expirable)(struct file_lock *cfl); + void (*lm_expire_lock)(void); +}; + +struct lock_manager { + struct list_head list; + /* + * NFSv4 and up also want opens blocked during the grace period; + * NLM doesn't care: + */ + bool block_opens; +}; + +struct net; +void locks_start_grace(struct net *, struct lock_manager *); +void locks_end_grace(struct lock_manager *); +bool locks_in_grace(struct net *); +bool opens_in_grace(struct net *); + +/* + * struct file_lock has a union that some filesystems use to track + * their own private info. The NFS side of things is defined here: + */ +#include <linux/nfs_fs_i.h> + +/* + * struct file_lock represents a generic "file lock". It's used to represent + * POSIX byte range locks, BSD (flock) locks, and leases. 
It's important to + * note that the same struct is used to represent both a request for a lock and + * the lock itself, but the same object is never used for both. + * + * FIXME: should we create a separate "struct lock_request" to help distinguish + * these two uses? + * + * The various i_flctx lists are ordered by: + * + * 1) lock owner + * 2) lock range start + * 3) lock range end + * + * Obviously, the last two criteria only matter for POSIX locks. + */ +struct file_lock { + struct file_lock *fl_blocker; /* The lock, that is blocking us */ + struct list_head fl_list; /* link into file_lock_context */ + struct hlist_node fl_link; /* node in global lists */ + struct list_head fl_blocked_requests; /* list of requests with + * ->fl_blocker pointing here + */ + struct list_head fl_blocked_member; /* node in + * ->fl_blocker->fl_blocked_requests + */ + fl_owner_t fl_owner; + unsigned int fl_flags; + unsigned char fl_type; + unsigned int fl_pid; + int fl_link_cpu; /* what cpu's list is this on? 
*/ + wait_queue_head_t fl_wait; + struct file *fl_file; + loff_t fl_start; + loff_t fl_end; + + struct fasync_struct * fl_fasync; /* for lease break notifications */ + /* for lease breaks: */ + unsigned long fl_break_time; + unsigned long fl_downgrade_time; + + const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ + const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ + union { + struct nfs_lock_info nfs_fl; + struct nfs4_lock_info nfs4_fl; + struct { + struct list_head link; /* link in AFS vnode's pending_locks list */ + int state; /* state of grant or error if -ve */ + unsigned int debug_id; + } afs; + struct { + struct inode *inode; + } ceph; + } fl_u; +} __randomize_layout; + +struct file_lock_context { + spinlock_t flc_lock; + struct list_head flc_flock; + struct list_head flc_posix; + struct list_head flc_lease; +}; + +#ifdef CONFIG_FILE_LOCKING +int fcntl_getlk(struct file *, unsigned int, struct flock *); +int fcntl_setlk(unsigned int, struct file *, unsigned int, + struct flock *); + +#if BITS_PER_LONG == 32 +int fcntl_getlk64(struct file *, unsigned int, struct flock64 *); +int fcntl_setlk64(unsigned int, struct file *, unsigned int, + struct flock64 *); +#endif + +int fcntl_setlease(unsigned int fd, struct file *filp, long arg); +int fcntl_getlease(struct file *filp); + +/* fs/locks.c */ +void locks_free_lock_context(struct inode *inode); +void locks_free_lock(struct file_lock *fl); +void locks_init_lock(struct file_lock *); +struct file_lock * locks_alloc_lock(void); +void locks_copy_lock(struct file_lock *, struct file_lock *); +void locks_copy_conflock(struct file_lock *, struct file_lock *); +void locks_remove_posix(struct file *, fl_owner_t); +void locks_remove_file(struct file *); +void locks_release_private(struct file_lock *); +void posix_test_lock(struct file *, struct file_lock *); +int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); +int locks_delete_block(struct 
file_lock *); +int vfs_test_lock(struct file *, struct file_lock *); +int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); +int vfs_cancel_lock(struct file *filp, struct file_lock *fl); +bool vfs_inode_has_locks(struct inode *inode); +int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); +int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); +void lease_get_mtime(struct inode *, struct timespec64 *time); +int generic_setlease(struct file *, long, struct file_lock **, void **priv); +int vfs_setlease(struct file *, long, struct file_lock **, void **); +int lease_modify(struct file_lock *, int, struct list_head *); + +struct notifier_block; +int lease_register_notifier(struct notifier_block *); +void lease_unregister_notifier(struct notifier_block *); + +struct files_struct; +void show_fd_locks(struct seq_file *f, + struct file *filp, struct files_struct *files); +bool locks_owner_has_blockers(struct file_lock_context *flctx, + fl_owner_t owner); + +static inline struct file_lock_context * +locks_inode_context(const struct inode *inode) +{ + return smp_load_acquire(&inode->i_flctx); +} + +#else /* !CONFIG_FILE_LOCKING */ +static inline int fcntl_getlk(struct file *file, unsigned int cmd, + struct flock __user *user) +{ + return -EINVAL; +} + +static inline int fcntl_setlk(unsigned int fd, struct file *file, + unsigned int cmd, struct flock __user *user) +{ + return -EACCES; +} + +#if BITS_PER_LONG == 32 +static inline int fcntl_getlk64(struct file *file, unsigned int cmd, + struct flock64 *user) +{ + return -EINVAL; +} + +static inline int fcntl_setlk64(unsigned int fd, struct file *file, + unsigned int cmd, struct flock64 *user) +{ + return -EACCES; +} +#endif +static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg) +{ + return -EINVAL; +} + +static inline int fcntl_getlease(struct file *filp) +{ + return F_UNLCK; +} + +static inline void +locks_free_lock_context(struct 
inode *inode) +{ +} + +static inline void locks_init_lock(struct file_lock *fl) +{ + return; +} + +static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) +{ + return; +} + +static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) +{ + return; +} + +static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) +{ + return; +} + +static inline void locks_remove_file(struct file *filp) +{ + return; +} + +static inline void posix_test_lock(struct file *filp, struct file_lock *fl) +{ + return; +} + +static inline int posix_lock_file(struct file *filp, struct file_lock *fl, + struct file_lock *conflock) +{ + return -ENOLCK; +} + +static inline int locks_delete_block(struct file_lock *waiter) +{ + return -ENOENT; +} + +static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) +{ + return 0; +} + +static inline int vfs_lock_file(struct file *filp, unsigned int cmd, + struct file_lock *fl, struct file_lock *conf) +{ + return -ENOLCK; +} + +static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) +{ + return 0; +} + +static inline bool vfs_inode_has_locks(struct inode *inode) +{ + return false; +} + +static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) +{ + return -ENOLCK; +} + +static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) +{ + return 0; +} + +static inline void lease_get_mtime(struct inode *inode, + struct timespec64 *time) +{ + return; +} + +static inline int generic_setlease(struct file *filp, long arg, + struct file_lock **flp, void **priv) +{ + return -EINVAL; +} + +static inline int vfs_setlease(struct file *filp, long arg, + struct file_lock **lease, void **priv) +{ + return -EINVAL; +} + +static inline int lease_modify(struct file_lock *fl, int arg, + struct list_head *dispose) +{ + return -EINVAL; +} + +struct files_struct; +static inline void show_fd_locks(struct seq_file *f, + struct file 
*filp, struct files_struct *files) {} +static inline bool locks_owner_has_blockers(struct file_lock_context *flctx, + fl_owner_t owner) +{ + return false; +} + +static inline struct file_lock_context * +locks_inode_context(const struct inode *inode) +{ + return NULL; +} + +#endif /* !CONFIG_FILE_LOCKING */ + +static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) +{ + return locks_lock_inode_wait(file_inode(filp), fl); +} + +#ifdef CONFIG_FILE_LOCKING +static inline int break_lease(struct inode *inode, unsigned int mode) +{ + /* + * Since this check is lockless, we must ensure that any refcounts + * taken are done before checking i_flctx->flc_lease. Otherwise, we + * could end up racing with tasks trying to set a new lease on this + * file. + */ + smp_mb(); + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) + return __break_lease(inode, mode, FL_LEASE); + return 0; +} + +static inline int break_deleg(struct inode *inode, unsigned int mode) +{ + /* + * Since this check is lockless, we must ensure that any refcounts + * taken are done before checking i_flctx->flc_lease. Otherwise, we + * could end up racing with tasks trying to set a new lease on this + * file. 
+ */ + smp_mb(); + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) + return __break_lease(inode, mode, FL_DELEG); + return 0; +} + +static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) +{ + int ret; + + ret = break_deleg(inode, O_WRONLY|O_NONBLOCK); + if (ret == -EWOULDBLOCK && delegated_inode) { + *delegated_inode = inode; + ihold(inode); + } + return ret; +} + +static inline int break_deleg_wait(struct inode **delegated_inode) +{ + int ret; + + ret = break_deleg(*delegated_inode, O_WRONLY); + iput(*delegated_inode); + *delegated_inode = NULL; + return ret; +} + +static inline int break_layout(struct inode *inode, bool wait) +{ + smp_mb(); + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) + return __break_lease(inode, + wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, + FL_LAYOUT); + return 0; +} + +#else /* !CONFIG_FILE_LOCKING */ +static inline int break_lease(struct inode *inode, unsigned int mode) +{ + return 0; +} + +static inline int break_deleg(struct inode *inode, unsigned int mode) +{ + return 0; +} + +static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) +{ + return 0; +} + +static inline int break_deleg_wait(struct inode **delegated_inode) +{ + BUG(); + return 0; +} + +static inline int break_layout(struct inode *inode, bool wait) +{ + return 0; +} + +#endif /* CONFIG_FILE_LOCKING */ + +#endif /* _LINUX_FILELOCK_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index c1769a2c5d70..2acc46fb5f97 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1003,135 +1003,11 @@ static inline struct file *get_file(struct file *f) #define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX) #endif -#define FL_POSIX 1 -#define FL_FLOCK 2 -#define FL_DELEG 4 /* NFSv4 delegation */ -#define FL_ACCESS 8 /* not trying to lock, just looking */ -#define FL_EXISTS 16 /* when unlocking, test for existence */ -#define FL_LEASE 32 /* lease held on this file */ -#define 
FL_CLOSE 64 /* unlock on close */ -#define FL_SLEEP 128 /* A blocking lock */ -#define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ -#define FL_UNLOCK_PENDING 512 /* Lease is being broken */ -#define FL_OFDLCK 1024 /* lock is "owned" by struct file */ -#define FL_LAYOUT 2048 /* outstanding pNFS layout */ -#define FL_RECLAIM 4096 /* reclaiming from a reboot server */ - -#define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) - -/* - * Special return value from posix_lock_file() and vfs_lock_file() for - * asynchronous locking. - */ -#define FILE_LOCK_DEFERRED 1 - /* legacy typedef, should eventually be removed */ typedef void *fl_owner_t; struct file_lock; -struct file_lock_operations { - void (*fl_copy_lock)(struct file_lock *, struct file_lock *); - void (*fl_release_private)(struct file_lock *); -}; - -struct lock_manager_operations { - void *lm_mod_owner; - fl_owner_t (*lm_get_owner)(fl_owner_t); - void (*lm_put_owner)(fl_owner_t); - void (*lm_notify)(struct file_lock *); /* unblock callback */ - int (*lm_grant)(struct file_lock *, int); - bool (*lm_break)(struct file_lock *); - int (*lm_change)(struct file_lock *, int, struct list_head *); - void (*lm_setup)(struct file_lock *, void **); - bool (*lm_breaker_owns_lease)(struct file_lock *); - bool (*lm_lock_expirable)(struct file_lock *cfl); - void (*lm_expire_lock)(void); -}; - -struct lock_manager { - struct list_head list; - /* - * NFSv4 and up also want opens blocked during the grace period; - * NLM doesn't care: - */ - bool block_opens; -}; - -struct net; -void locks_start_grace(struct net *, struct lock_manager *); -void locks_end_grace(struct lock_manager *); -bool locks_in_grace(struct net *); -bool opens_in_grace(struct net *); - -/* that will die - we need it for nfs_lock_info */ -#include <linux/nfs_fs_i.h> - -/* - * struct file_lock represents a generic "file lock". It's used to represent - * POSIX byte range locks, BSD (flock) locks, and leases. 
It's important to - * note that the same struct is used to represent both a request for a lock and - * the lock itself, but the same object is never used for both. - * - * FIXME: should we create a separate "struct lock_request" to help distinguish - * these two uses? - * - * The varous i_flctx lists are ordered by: - * - * 1) lock owner - * 2) lock range start - * 3) lock range end - * - * Obviously, the last two criteria only matter for POSIX locks. - */ -struct file_lock { - struct file_lock *fl_blocker; /* The lock, that is blocking us */ - struct list_head fl_list; /* link into file_lock_context */ - struct hlist_node fl_link; /* node in global lists */ - struct list_head fl_blocked_requests; /* list of requests with - * ->fl_blocker pointing here - */ - struct list_head fl_blocked_member; /* node in - * ->fl_blocker->fl_blocked_requests - */ - fl_owner_t fl_owner; - unsigned int fl_flags; - unsigned char fl_type; - unsigned int fl_pid; - int fl_link_cpu; /* what cpu's list is this on? 
*/ - wait_queue_head_t fl_wait; - struct file *fl_file; - loff_t fl_start; - loff_t fl_end; - - struct fasync_struct * fl_fasync; /* for lease break notifications */ - /* for lease breaks: */ - unsigned long fl_break_time; - unsigned long fl_downgrade_time; - - const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ - const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ - union { - struct nfs_lock_info nfs_fl; - struct nfs4_lock_info nfs4_fl; - struct { - struct list_head link; /* link in AFS vnode's pending_locks list */ - int state; /* state of grant or error if -ve */ - unsigned int debug_id; - } afs; - struct { - struct inode *inode; - } ceph; - } fl_u; -} __randomize_layout; - -struct file_lock_context { - spinlock_t flc_lock; - struct list_head flc_flock; - struct list_head flc_posix; - struct list_head flc_lease; -}; - /* The following constant reflects the upper bound of the file/locking space */ #ifndef OFFSET_MAX #define OFFSET_MAX type_max(loff_t) @@ -1140,216 +1016,6 @@ struct file_lock_context { extern void send_sigio(struct fown_struct *fown, int fd, int band); -#define locks_inode(f) file_inode(f) - -#ifdef CONFIG_FILE_LOCKING -extern int fcntl_getlk(struct file *, unsigned int, struct flock *); -extern int fcntl_setlk(unsigned int, struct file *, unsigned int, - struct flock *); - -#if BITS_PER_LONG == 32 -extern int fcntl_getlk64(struct file *, unsigned int, struct flock64 *); -extern int fcntl_setlk64(unsigned int, struct file *, unsigned int, - struct flock64 *); -#endif - -extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); -extern int fcntl_getlease(struct file *filp); - -/* fs/locks.c */ -void locks_free_lock_context(struct inode *inode); -void locks_free_lock(struct file_lock *fl); -extern void locks_init_lock(struct file_lock *); -extern struct file_lock * locks_alloc_lock(void); -extern void locks_copy_lock(struct file_lock *, struct file_lock *); -extern void 
locks_copy_conflock(struct file_lock *, struct file_lock *); -extern void locks_remove_posix(struct file *, fl_owner_t); -extern void locks_remove_file(struct file *); -extern void locks_release_private(struct file_lock *); -extern void posix_test_lock(struct file *, struct file_lock *); -extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); -extern int locks_delete_block(struct file_lock *); -extern int vfs_test_lock(struct file *, struct file_lock *); -extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); -extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl); -bool vfs_inode_has_locks(struct inode *inode); -extern int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); -extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); -extern void lease_get_mtime(struct inode *, struct timespec64 *time); -extern int generic_setlease(struct file *, long, struct file_lock **, void **priv); -extern int vfs_setlease(struct file *, long, struct file_lock **, void **); -extern int lease_modify(struct file_lock *, int, struct list_head *); - -struct notifier_block; -extern int lease_register_notifier(struct notifier_block *); -extern void lease_unregister_notifier(struct notifier_block *); - -struct files_struct; -extern void show_fd_locks(struct seq_file *f, - struct file *filp, struct files_struct *files); -extern bool locks_owner_has_blockers(struct file_lock_context *flctx, - fl_owner_t owner); - -static inline struct file_lock_context * -locks_inode_context(const struct inode *inode) -{ - return smp_load_acquire(&inode->i_flctx); -} - -#else /* !CONFIG_FILE_LOCKING */ -static inline int fcntl_getlk(struct file *file, unsigned int cmd, - struct flock __user *user) -{ - return -EINVAL; -} - -static inline int fcntl_setlk(unsigned int fd, struct file *file, - unsigned int cmd, struct flock __user *user) -{ - return -EACCES; -} - -#if BITS_PER_LONG == 
32 -static inline int fcntl_getlk64(struct file *file, unsigned int cmd, - struct flock64 *user) -{ - return -EINVAL; -} - -static inline int fcntl_setlk64(unsigned int fd, struct file *file, - unsigned int cmd, struct flock64 *user) -{ - return -EACCES; -} -#endif -static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg) -{ - return -EINVAL; -} - -static inline int fcntl_getlease(struct file *filp) -{ - return F_UNLCK; -} - -static inline void -locks_free_lock_context(struct inode *inode) -{ -} - -static inline void locks_init_lock(struct file_lock *fl) -{ - return; -} - -static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) -{ - return; -} - -static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) -{ - return; -} - -static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) -{ - return; -} - -static inline void locks_remove_file(struct file *filp) -{ - return; -} - -static inline void posix_test_lock(struct file *filp, struct file_lock *fl) -{ - return; -} - -static inline int posix_lock_file(struct file *filp, struct file_lock *fl, - struct file_lock *conflock) -{ - return -ENOLCK; -} - -static inline int locks_delete_block(struct file_lock *waiter) -{ - return -ENOENT; -} - -static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) -{ - return 0; -} - -static inline int vfs_lock_file(struct file *filp, unsigned int cmd, - struct file_lock *fl, struct file_lock *conf) -{ - return -ENOLCK; -} - -static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) -{ - return 0; -} - -static inline bool vfs_inode_has_locks(struct inode *inode) -{ - return false; -} - -static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) -{ - return -ENOLCK; -} - -static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) -{ - return 0; -} - -static inline void lease_get_mtime(struct inode *inode, - struct 
timespec64 *time) -{ - return; -} - -static inline int generic_setlease(struct file *filp, long arg, - struct file_lock **flp, void **priv) -{ - return -EINVAL; -} - -static inline int vfs_setlease(struct file *filp, long arg, - struct file_lock **lease, void **priv) -{ - return -EINVAL; -} - -static inline int lease_modify(struct file_lock *fl, int arg, - struct list_head *dispose) -{ - return -EINVAL; -} - -struct files_struct; -static inline void show_fd_locks(struct seq_file *f, - struct file *filp, struct files_struct *files) {} -static inline bool locks_owner_has_blockers(struct file_lock_context *flctx, - fl_owner_t owner) -{ - return false; -} - -static inline struct file_lock_context * -locks_inode_context(const struct inode *inode) -{ - return NULL; -} - -#endif /* !CONFIG_FILE_LOCKING */ - static inline struct inode *file_inode(const struct file *f) { return f->f_inode; @@ -1360,11 +1026,6 @@ static inline struct dentry *file_dentry(const struct file *file) return d_real(file->f_path.dentry, file_inode(file)); } -static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) -{ - return locks_lock_inode_wait(locks_inode(filp), fl); -} - struct fasync_struct { rwlock_t fa_lock; int magic; @@ -1635,22 +1296,22 @@ static inline void i_gid_write(struct inode *inode, gid_t gid) } /** - * i_uid_into_vfsuid - map an inode's i_uid down into a mnt_userns - * @mnt_userns: user namespace of the mount the inode was found from + * i_uid_into_vfsuid - map an inode's i_uid down according to an idmapping + * @idmap: idmap of the mount the inode was found from * @inode: inode to map * - * Return: whe inode's i_uid mapped down according to @mnt_userns. + * Return: the inode's i_uid mapped down according to @idmap. * If the inode's i_uid has no mapping INVALID_VFSUID is returned. 
*/ -static inline vfsuid_t i_uid_into_vfsuid(struct user_namespace *mnt_userns, +static inline vfsuid_t i_uid_into_vfsuid(struct mnt_idmap *idmap, const struct inode *inode) { - return make_vfsuid(mnt_userns, i_user_ns(inode), inode->i_uid); + return make_vfsuid(idmap, i_user_ns(inode), inode->i_uid); } /** * i_uid_needs_update - check whether inode's i_uid needs to be updated - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * @@ -1659,50 +1320,50 @@ static inline vfsuid_t i_uid_into_vfsuid(struct user_namespace *mnt_userns, * * Return: true if @inode's i_uid field needs to be updated, false if not. */ -static inline bool i_uid_needs_update(struct user_namespace *mnt_userns, +static inline bool i_uid_needs_update(struct mnt_idmap *idmap, const struct iattr *attr, const struct inode *inode) { return ((attr->ia_valid & ATTR_UID) && !vfsuid_eq(attr->ia_vfsuid, - i_uid_into_vfsuid(mnt_userns, inode))); + i_uid_into_vfsuid(idmap, inode))); } /** * i_uid_update - update @inode's i_uid field - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_uid field translating the vfsuid of any idmapped * mount into the filesystem kuid. 
*/ -static inline void i_uid_update(struct user_namespace *mnt_userns, +static inline void i_uid_update(struct mnt_idmap *idmap, const struct iattr *attr, struct inode *inode) { if (attr->ia_valid & ATTR_UID) - inode->i_uid = from_vfsuid(mnt_userns, i_user_ns(inode), + inode->i_uid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid); } /** - * i_gid_into_vfsgid - map an inode's i_gid down into a mnt_userns - * @mnt_userns: user namespace of the mount the inode was found from + * i_gid_into_vfsgid - map an inode's i_gid down according to an idmapping + * @idmap: idmap of the mount the inode was found from * @inode: inode to map * - * Return: the inode's i_gid mapped down according to @mnt_userns. + * Return: the inode's i_gid mapped down according to @idmap. * If the inode's i_gid has no mapping INVALID_VFSGID is returned. */ -static inline vfsgid_t i_gid_into_vfsgid(struct user_namespace *mnt_userns, +static inline vfsgid_t i_gid_into_vfsgid(struct mnt_idmap *idmap, const struct inode *inode) { - return make_vfsgid(mnt_userns, i_user_ns(inode), inode->i_gid); + return make_vfsgid(idmap, i_user_ns(inode), inode->i_gid); } /** * i_gid_needs_update - check whether inode's i_gid needs to be updated - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * @@ -1711,83 +1372,83 @@ static inline vfsgid_t i_gid_into_vfsgid(struct user_namespace *mnt_userns, * * Return: true if @inode's i_gid field needs to be updated, false if not. 
*/ -static inline bool i_gid_needs_update(struct user_namespace *mnt_userns, +static inline bool i_gid_needs_update(struct mnt_idmap *idmap, const struct iattr *attr, const struct inode *inode) { return ((attr->ia_valid & ATTR_GID) && !vfsgid_eq(attr->ia_vfsgid, - i_gid_into_vfsgid(mnt_userns, inode))); + i_gid_into_vfsgid(idmap, inode))); } /** * i_gid_update - update @inode's i_gid field - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_gid field translating the vfsgid of any idmapped * mount into the filesystem kgid. */ -static inline void i_gid_update(struct user_namespace *mnt_userns, +static inline void i_gid_update(struct mnt_idmap *idmap, const struct iattr *attr, struct inode *inode) { if (attr->ia_valid & ATTR_GID) - inode->i_gid = from_vfsgid(mnt_userns, i_user_ns(inode), + inode->i_gid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid); } /** * inode_fsuid_set - initialize inode's i_uid field with callers fsuid * @inode: inode to initialize - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * * Initialize the i_uid field of @inode. If the inode was found/created via - * an idmapped mount map the caller's fsuid according to @mnt_users. + * an idmapped mount map the caller's fsuid according to @idmap. */ static inline void inode_fsuid_set(struct inode *inode, - struct user_namespace *mnt_userns) + struct mnt_idmap *idmap) { - inode->i_uid = mapped_fsuid(mnt_userns, i_user_ns(inode)); + inode->i_uid = mapped_fsuid(idmap, i_user_ns(inode)); } /** * inode_fsgid_set - initialize inode's i_gid field with callers fsgid * @inode: inode to initialize - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * * Initialize the i_gid field of @inode. 
If the inode was found/created via - * an idmapped mount map the caller's fsgid according to @mnt_users. + * an idmapped mount map the caller's fsgid according to @idmap. */ static inline void inode_fsgid_set(struct inode *inode, - struct user_namespace *mnt_userns) + struct mnt_idmap *idmap) { - inode->i_gid = mapped_fsgid(mnt_userns, i_user_ns(inode)); + inode->i_gid = mapped_fsgid(idmap, i_user_ns(inode)); } /** * fsuidgid_has_mapping() - check whether caller's fsuid/fsgid is mapped * @sb: the superblock we want a mapping in - * @mnt_userns: user namespace of the relevant mount + * @idmap: idmap of the relevant mount * * Check whether the caller's fsuid and fsgid have a valid mapping in the * s_user_ns of the superblock @sb. If the caller is on an idmapped mount map - * the caller's fsuid and fsgid according to the @mnt_userns first. + * the caller's fsuid and fsgid according to the @idmap first. * * Return: true if fsuid and fsgid is mapped, false if not. */ static inline bool fsuidgid_has_mapping(struct super_block *sb, - struct user_namespace *mnt_userns) + struct mnt_idmap *idmap) { struct user_namespace *fs_userns = sb->s_user_ns; kuid_t kuid; kgid_t kgid; - kuid = mapped_fsuid(mnt_userns, fs_userns); + kuid = mapped_fsuid(idmap, fs_userns); if (!uid_valid(kuid)) return false; - kgid = mapped_fsgid(mnt_userns, fs_userns); + kgid = mapped_fsgid(idmap, fs_userns); if (!gid_valid(kgid)) return false; return kuid_has_mapping(fs_userns, kuid) && @@ -1941,42 +1602,42 @@ static inline bool sb_start_intwrite_trylock(struct super_block *sb) return __sb_start_write_trylock(sb, SB_FREEZE_FS); } -bool inode_owner_or_capable(struct user_namespace *mnt_userns, +bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode); /* * VFS helper functions.. 
*/ -int vfs_create(struct user_namespace *, struct inode *, +int vfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); -int vfs_mkdir(struct user_namespace *, struct inode *, +int vfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); -int vfs_mknod(struct user_namespace *, struct inode *, struct dentry *, +int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); -int vfs_symlink(struct user_namespace *, struct inode *, +int vfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); -int vfs_link(struct dentry *, struct user_namespace *, struct inode *, +int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, struct dentry *, struct inode **); -int vfs_rmdir(struct user_namespace *, struct inode *, struct dentry *); -int vfs_unlink(struct user_namespace *, struct inode *, struct dentry *, +int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *); +int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, struct inode **); /** * struct renamedata - contains all information required for renaming - * @old_mnt_userns: old user namespace of the mount the inode was found from + * @old_mnt_idmap: idmap of the old mount the inode was found from * @old_dir: parent of source * @old_dentry: source - * @new_mnt_userns: new user namespace of the mount the inode was found from + * @new_mnt_idmap: idmap of the new mount the inode was found from * @new_dir: parent of destination * @new_dentry: destination * @delegated_inode: returns an inode needing a delegation break * @flags: rename flags */ struct renamedata { - struct user_namespace *old_mnt_userns; + struct mnt_idmap *old_mnt_idmap; struct inode *old_dir; struct dentry *old_dentry; - struct user_namespace *new_mnt_userns; + struct mnt_idmap *new_mnt_idmap; struct inode *new_dir; struct dentry *new_dentry; struct inode **delegated_inode; @@ -1985,14 +1646,14 @@ struct renamedata { int vfs_rename(struct 
renamedata *); -static inline int vfs_whiteout(struct user_namespace *mnt_userns, +static inline int vfs_whiteout(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { - return vfs_mknod(mnt_userns, dir, dentry, S_IFCHR | WHITEOUT_MODE, + return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); } -struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, +struct file *vfs_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred); @@ -2016,10 +1677,10 @@ extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, /* * VFS file helper functions. */ -void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode, +void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, const struct inode *dir, umode_t mode); extern bool may_open_dev(const struct path *path); -umode_t mode_strip_sgid(struct user_namespace *mnt_userns, +umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode); /* @@ -2137,27 +1798,26 @@ struct file_operations { struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); - int (*permission) (struct user_namespace *, struct inode *, int); + int (*permission) (struct mnt_idmap *, struct inode *, int); struct posix_acl * (*get_inode_acl)(struct inode *, int, bool); int (*readlink) (struct dentry *, char __user *,int); - int (*create) (struct user_namespace *, struct inode *,struct dentry *, + int (*create) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t, bool); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); - int (*symlink) (struct user_namespace *, struct inode *,struct dentry *, + int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *, const char *); - int (*mkdir) (struct 
user_namespace *, struct inode *,struct dentry *, + int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t); int (*rmdir) (struct inode *,struct dentry *); - int (*mknod) (struct user_namespace *, struct inode *,struct dentry *, + int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); - int (*rename) (struct user_namespace *, struct inode *, struct dentry *, + int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); - int (*setattr) (struct user_namespace *, struct dentry *, - struct iattr *); - int (*getattr) (struct user_namespace *, const struct path *, + int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *); + int (*getattr) (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, @@ -2166,13 +1826,13 @@ struct inode_operations { int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); - int (*tmpfile) (struct user_namespace *, struct inode *, + int (*tmpfile) (struct mnt_idmap *, struct inode *, struct file *, umode_t); - struct posix_acl *(*get_acl)(struct user_namespace *, struct dentry *, + struct posix_acl *(*get_acl)(struct mnt_idmap *, struct dentry *, int); - int (*set_acl)(struct user_namespace *, struct dentry *, + int (*set_acl)(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); - int (*fileattr_set)(struct user_namespace *mnt_userns, + int (*fileattr_set)(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); } ____cacheline_aligned; @@ -2326,11 +1986,11 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) -static inline bool 
HAS_UNMAPPED_ID(struct user_namespace *mnt_userns, +static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap, struct inode *inode) { - return !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || - !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode)); + return !vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)); } static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) @@ -2624,96 +2284,6 @@ extern struct kobject *fs_kobj; #define MAX_RW_COUNT (INT_MAX & PAGE_MASK) -#ifdef CONFIG_FILE_LOCKING -static inline int break_lease(struct inode *inode, unsigned int mode) -{ - /* - * Since this check is lockless, we must ensure that any refcounts - * taken are done before checking i_flctx->flc_lease. Otherwise, we - * could end up racing with tasks trying to set a new lease on this - * file. - */ - smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) - return __break_lease(inode, mode, FL_LEASE); - return 0; -} - -static inline int break_deleg(struct inode *inode, unsigned int mode) -{ - /* - * Since this check is lockless, we must ensure that any refcounts - * taken are done before checking i_flctx->flc_lease. Otherwise, we - * could end up racing with tasks trying to set a new lease on this - * file. 
- */ - smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) - return __break_lease(inode, mode, FL_DELEG); - return 0; -} - -static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) -{ - int ret; - - ret = break_deleg(inode, O_WRONLY|O_NONBLOCK); - if (ret == -EWOULDBLOCK && delegated_inode) { - *delegated_inode = inode; - ihold(inode); - } - return ret; -} - -static inline int break_deleg_wait(struct inode **delegated_inode) -{ - int ret; - - ret = break_deleg(*delegated_inode, O_WRONLY); - iput(*delegated_inode); - *delegated_inode = NULL; - return ret; -} - -static inline int break_layout(struct inode *inode, bool wait) -{ - smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) - return __break_lease(inode, - wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, - FL_LAYOUT); - return 0; -} - -#else /* !CONFIG_FILE_LOCKING */ -static inline int break_lease(struct inode *inode, unsigned int mode) -{ - return 0; -} - -static inline int break_deleg(struct inode *inode, unsigned int mode) -{ - return 0; -} - -static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) -{ - return 0; -} - -static inline int break_deleg_wait(struct inode **delegated_inode) -{ - BUG(); - return 0; -} - -static inline int break_layout(struct inode *inode, bool wait) -{ - return 0; -} - -#endif /* CONFIG_FILE_LOCKING */ - /* fs/open.c */ struct audit_names; struct filename { @@ -2725,11 +2295,6 @@ struct filename { }; static_assert(offsetof(struct filename, iname) % sizeof(long) == 0); -static inline struct user_namespace *file_mnt_user_ns(struct file *file) -{ - return mnt_user_ns(file->f_path.mnt); -} - static inline struct mnt_idmap *file_mnt_idmap(struct file *file) { return mnt_idmap(file->f_path.mnt); @@ -2749,7 +2314,7 @@ static inline bool is_idmapped_mnt(const struct vfsmount *mnt) } extern long vfs_truncate(const struct path *, loff_t); -int do_truncate(struct user_namespace 
*, struct dentry *, loff_t start, +int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); @@ -2904,21 +2469,21 @@ static inline int bmap(struct inode *inode, sector_t *block) } #endif -int notify_change(struct user_namespace *, struct dentry *, +int notify_change(struct mnt_idmap *, struct dentry *, struct iattr *, struct inode **); -int inode_permission(struct user_namespace *, struct inode *, int); -int generic_permission(struct user_namespace *, struct inode *, int); +int inode_permission(struct mnt_idmap *, struct inode *, int); +int generic_permission(struct mnt_idmap *, struct inode *, int); static inline int file_permission(struct file *file, int mask) { - return inode_permission(file_mnt_user_ns(file), + return inode_permission(file_mnt_idmap(file), file_inode(file), mask); } static inline int path_permission(const struct path *path, int mask) { - return inode_permission(mnt_user_ns(path->mnt), + return inode_permission(mnt_idmap(path->mnt), d_inode(path->dentry), mask); } -int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir, +int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode); static inline bool execute_ok(struct inode *inode) @@ -3106,7 +2671,7 @@ extern void __destroy_inode(struct inode *); extern struct inode *new_inode_pseudo(struct super_block *sb); extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); -extern int setattr_should_drop_suidgid(struct user_namespace *, struct inode *); +extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); extern int file_remove_privs(struct file *); /* @@ -3265,7 +2830,7 @@ extern void page_put_link(void *); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern 
void kfree_link(void *); -void generic_fillattr(struct user_namespace *, struct inode *, struct kstat *); +void generic_fillattr(struct mnt_idmap *, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); @@ -3316,9 +2881,9 @@ extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); extern int dcache_readdir(struct file *, struct dir_context *); -extern int simple_setattr(struct user_namespace *, struct dentry *, +extern int simple_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); -extern int simple_getattr(struct user_namespace *, const struct path *, +extern int simple_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int simple_statfs(struct dentry *, struct kstatfs *); extern int simple_open(struct inode *inode, struct file *file); @@ -3327,7 +2892,7 @@ extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); -extern int simple_rename(struct user_namespace *, struct inode *, +extern int simple_rename(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); extern void simple_recursive_removal(struct dentry *, @@ -3369,11 +2934,11 @@ extern int generic_check_addressable(unsigned, u64); extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); -int may_setattr(struct user_namespace *mnt_userns, struct inode *inode, +int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); -int setattr_prepare(struct 
user_namespace *, struct dentry *, struct iattr *); +int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); extern int inode_newsize_ok(const struct inode *, loff_t offset); -void setattr_copy(struct user_namespace *, struct inode *inode, +void setattr_copy(struct mnt_idmap *, struct inode *inode, const struct iattr *attr); extern int file_update_time(struct file *file); @@ -3540,13 +3105,13 @@ static inline bool is_sxid(umode_t mode) return mode & (S_ISUID | S_ISGID); } -static inline int check_sticky(struct user_namespace *mnt_userns, +static inline int check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { if (!(dir->i_mode & S_ISVTX)) return 0; - return __check_sticky(mnt_userns, dir, inode); + return __check_sticky(idmap, dir, inode); } static inline void inode_has_no_xattr(struct inode *inode) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 4f5f8a651213..e0a49c3125eb 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -257,8 +257,8 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags); -int fscrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len, - unsigned int offs); +int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, + size_t offs); int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num); @@ -309,8 +309,6 @@ fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) /* keyring.c */ void fscrypt_destroy_keyring(struct super_block *sb); int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); -int fscrypt_add_test_dummy_key(struct super_block *sb, - const struct fscrypt_dummy_policy *dummy_policy); int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg); int 
fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); @@ -422,9 +420,8 @@ static inline int fscrypt_encrypt_block_inplace(const struct inode *inode, return -EOPNOTSUPP; } -static inline int fscrypt_decrypt_pagecache_blocks(struct page *page, - unsigned int len, - unsigned int offs) +static inline int fscrypt_decrypt_pagecache_blocks(struct folio *folio, + size_t len, size_t offs) { return -EOPNOTSUPP; } @@ -530,13 +527,6 @@ static inline int fscrypt_ioctl_add_key(struct file *filp, void __user *arg) return -EOPNOTSUPP; } -static inline int -fscrypt_add_test_dummy_key(struct super_block *sb, - const struct fscrypt_dummy_policy *dummy_policy) -{ - return 0; -} - static inline int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index 40f14e5fed9d..119a3266791f 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -12,6 +12,7 @@ #define _LINUX_FSVERITY_H #include <linux/fs.h> +#include <linux/mm.h> #include <crypto/hash_info.h> #include <crypto/sha2.h> #include <uapi/linux/fsverity.h> @@ -93,8 +94,7 @@ struct fsverity_operations { * isn't already cached. Implementations may ignore this * argument; it's only a performance optimization. * - * This can be called at any time on an open verity file, as well as - * between ->begin_enable_verity() and ->end_enable_verity(). It may be + * This can be called at any time on an open verity file. It may be * called by multiple processes concurrently, even with the same page. * * Note that this must retrieve a *page*, not necessarily a *block*. @@ -109,9 +109,9 @@ struct fsverity_operations { * Write a Merkle tree block to the given inode. 
* * @inode: the inode for which the Merkle tree is being built - * @buf: block to write - * @index: 0-based index of the block within the Merkle tree - * @log_blocksize: log base 2 of the Merkle tree block size + * @buf: the Merkle tree block to write + * @pos: the position of the block in the Merkle tree (in bytes) + * @size: the Merkle tree block size (in bytes) * * This is only called between ->begin_enable_verity() and * ->end_enable_verity(). @@ -119,7 +119,7 @@ struct fsverity_operations { * Return: 0 on success, -errno on failure */ int (*write_merkle_tree_block)(struct inode *inode, const void *buf, - u64 index, int log_blocksize); + u64 pos, unsigned int size); }; #ifdef CONFIG_FS_VERITY @@ -148,9 +148,21 @@ int fsverity_get_digest(struct inode *inode, /* open.c */ -int fsverity_file_open(struct inode *inode, struct file *filp); -int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr); -void fsverity_cleanup_inode(struct inode *inode); +int __fsverity_file_open(struct inode *inode, struct file *filp); +int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr); +void __fsverity_cleanup_inode(struct inode *inode); + +/** + * fsverity_cleanup_inode() - free the inode's verity info, if present + * @inode: an inode being evicted + * + * Filesystems must call this on inode eviction to free ->i_verity_info. 
+ */ +static inline void fsverity_cleanup_inode(struct inode *inode) +{ + if (inode->i_verity_info) + __fsverity_cleanup_inode(inode); +} /* read_metadata.c */ @@ -158,7 +170,7 @@ int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg); /* verify.c */ -bool fsverity_verify_page(struct page *page); +bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset); void fsverity_verify_bio(struct bio *bio); void fsverity_enqueue_verify_work(struct work_struct *work); @@ -193,15 +205,15 @@ static inline int fsverity_get_digest(struct inode *inode, /* open.c */ -static inline int fsverity_file_open(struct inode *inode, struct file *filp) +static inline int __fsverity_file_open(struct inode *inode, struct file *filp) { - return IS_VERITY(inode) ? -EOPNOTSUPP : 0; + return -EOPNOTSUPP; } -static inline int fsverity_prepare_setattr(struct dentry *dentry, - struct iattr *attr) +static inline int __fsverity_prepare_setattr(struct dentry *dentry, + struct iattr *attr) { - return IS_VERITY(d_inode(dentry)) ? -EOPNOTSUPP : 0; + return -EOPNOTSUPP; } static inline void fsverity_cleanup_inode(struct inode *inode) @@ -218,7 +230,8 @@ static inline int fsverity_ioctl_read_metadata(struct file *filp, /* verify.c */ -static inline bool fsverity_verify_page(struct page *page) +static inline bool fsverity_verify_blocks(struct folio *folio, size_t len, + size_t offset) { WARN_ON(1); return false; @@ -236,6 +249,16 @@ static inline void fsverity_enqueue_verify_work(struct work_struct *work) #endif /* !CONFIG_FS_VERITY */ +static inline bool fsverity_verify_folio(struct folio *folio) +{ + return fsverity_verify_blocks(folio, folio_size(folio), 0); +} + +static inline bool fsverity_verify_page(struct page *page) +{ + return fsverity_verify_blocks(page_folio(page), PAGE_SIZE, 0); +} + /** * fsverity_active() - do reads from the inode need to go through fs-verity? 
* @inode: inode to check @@ -254,4 +277,42 @@ static inline bool fsverity_active(const struct inode *inode) return fsverity_get_info(inode) != NULL; } +/** + * fsverity_file_open() - prepare to open a verity file + * @inode: the inode being opened + * @filp: the struct file being set up + * + * When opening a verity file, deny the open if it is for writing. Otherwise, + * set up the inode's ->i_verity_info if not already done. + * + * When combined with fscrypt, this must be called after fscrypt_file_open(). + * Otherwise, we won't have the key set up to decrypt the verity metadata. + * + * Return: 0 on success, -errno on failure + */ +static inline int fsverity_file_open(struct inode *inode, struct file *filp) +{ + if (IS_VERITY(inode)) + return __fsverity_file_open(inode, filp); + return 0; +} + +/** + * fsverity_prepare_setattr() - prepare to change a verity inode's attributes + * @dentry: dentry through which the inode is being changed + * @attr: attributes to change + * + * Verity files are immutable, so deny truncates. This isn't covered by the + * open-time check because sys_truncate() takes a path, not a file descriptor. 
+ * + * Return: 0 on success, -errno on failure + */ +static inline int fsverity_prepare_setattr(struct dentry *dentry, + struct iattr *attr) +{ + if (IS_VERITY(d_inode(dentry))) + return __fsverity_prepare_setattr(dentry, attr); + return 0; +} + #endif /* _LINUX_FSVERITY_H */ diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 034b1106d022..a3028e400a9c 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -152,7 +152,10 @@ static inline void totalhigh_pages_add(long count) static inline bool is_kmap_addr(const void *x) { unsigned long addr = (unsigned long)x; - return addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP); + + return (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) || + (addr >= __fix_to_virt(FIX_KMAP_END) && + addr < __fix_to_virt(FIX_KMAP_BEGIN)); } #else /* CONFIG_HIGHMEM */ @@ -200,7 +203,7 @@ static inline void *kmap_local_pfn(unsigned long pfn) static inline void __kunmap_local(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP - kunmap_flush_on_unmap(addr); + kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif } @@ -227,7 +230,7 @@ static inline void *kmap_atomic_pfn(unsigned long pfn) static inline void __kunmap_atomic(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP - kunmap_flush_on_unmap(addr); + kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 551834cd5299..9ab9d3105d5c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/hugetlb_inline.h> #include <linux/cgroup.h> +#include <linux/page_ref.h> #include <linux/list.h> #include <linux/kref.h> #include <linux/pgtable.h> @@ -742,7 +743,10 @@ static inline struct hstate *hstate_sizelog(int page_size_log) if (!page_size_log) return &default_hstate; - return size_to_hstate(1UL << page_size_log); + 
if (page_size_log < BITS_PER_LONG) + return size_to_hstate(1UL << page_size_log); + + return NULL; } static inline struct hstate *hstate_vma(struct vm_area_struct *vma) @@ -1187,6 +1191,18 @@ static inline __init void hugetlb_cma_reserve(int order) } #endif +#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +static inline bool hugetlb_pmd_shared(pte_t *pte) +{ + return page_count(virt_to_page(pte)) > 1; +} +#else +static inline bool hugetlb_pmd_shared(pte_t *pte) +{ + return false; +} +#endif + bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE diff --git a/include/linux/ima.h b/include/linux/ima.h index 5a0b2a285a18..172b113a9864 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -18,7 +18,7 @@ struct linux_binprm; extern enum hash_algo ima_get_current_hash_algo(void); extern int ima_bprm_check(struct linux_binprm *bprm); extern int ima_file_check(struct file *file, int mask); -extern void ima_post_create_tmpfile(struct user_namespace *mnt_userns, +extern void ima_post_create_tmpfile(struct mnt_idmap *idmap, struct inode *inode); extern void ima_file_free(struct file *file); extern int ima_file_mmap(struct file *file, unsigned long prot); @@ -30,7 +30,7 @@ extern int ima_read_file(struct file *file, enum kernel_read_file_id id, bool contents); extern int ima_post_read_file(struct file *file, void *buf, loff_t size, enum kernel_read_file_id id); -extern void ima_post_path_mknod(struct user_namespace *mnt_userns, +extern void ima_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry); extern int ima_file_hash(struct file *file, char *buf, size_t buf_size); extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size); @@ -66,7 +66,7 @@ static inline int ima_file_check(struct file *file, int mask) return 0; } -static inline void ima_post_create_tmpfile(struct user_namespace *mnt_userns, +static inline void ima_post_create_tmpfile(struct mnt_idmap *idmap, struct inode *inode) { } @@ 
-111,7 +111,7 @@ static inline int ima_post_read_file(struct file *file, void *buf, loff_t size, return 0; } -static inline void ima_post_path_mknod(struct user_namespace *mnt_userns, +static inline void ima_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry) { return; @@ -183,18 +183,18 @@ static inline void ima_post_key_create_or_update(struct key *keyring, #ifdef CONFIG_IMA_APPRAISE extern bool is_ima_appraise_enabled(void); -extern void ima_inode_post_setattr(struct user_namespace *mnt_userns, +extern void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry); extern int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len); -extern int ima_inode_set_acl(struct user_namespace *mnt_userns, +extern int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); -static inline int ima_inode_remove_acl(struct user_namespace *mnt_userns, +static inline int ima_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { - return ima_inode_set_acl(mnt_userns, dentry, acl_name, NULL); + return ima_inode_set_acl(idmap, dentry, acl_name, NULL); } extern int ima_inode_removexattr(struct dentry *dentry, const char *xattr_name); #else @@ -203,7 +203,7 @@ static inline bool is_ima_appraise_enabled(void) return 0; } -static inline void ima_inode_post_setattr(struct user_namespace *mnt_userns, +static inline void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry) { return; @@ -217,7 +217,7 @@ static inline int ima_inode_setxattr(struct dentry *dentry, return 0; } -static inline int ima_inode_set_acl(struct user_namespace *mnt_userns, +static inline int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { @@ -231,7 +231,7 @@ static inline int ima_inode_removexattr(struct dentry *dentry, return 0; } -static inline int 
ima_inode_remove_acl(struct user_namespace *mnt_userns, +static inline int ima_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 0983dfc9a203..fca43a4bd96b 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -58,8 +58,7 @@ struct vm_fault; #define IOMAP_F_SHARED (1U << 2) #define IOMAP_F_MERGED (1U << 3) #define IOMAP_F_BUFFER_HEAD (1U << 4) -#define IOMAP_F_ZONE_APPEND (1U << 5) -#define IOMAP_F_XATTR (1U << 6) +#define IOMAP_F_XATTR (1U << 5) /* * Flags set by the core iomap code during operations: diff --git a/include/linux/iversion.h b/include/linux/iversion.h index e27bd4f55d84..f174ff1b59ee 100644 --- a/include/linux/iversion.h +++ b/include/linux/iversion.h @@ -9,8 +9,26 @@ * --------------------------- * The change attribute (i_version) is mandated by NFSv4 and is mostly for * knfsd, but is also used for other purposes (e.g. IMA). The i_version must - * appear different to observers if there was a change to the inode's data or - * metadata since it was last queried. + * appear larger to observers if there was an explicit change to the inode's + * data or metadata since it was last queried. + * + * An explicit change is one that would ordinarily result in a change to the + * inode status change time (aka ctime). i_version must appear to change, even + * if the ctime does not (since the whole point is to avoid missing updates due + * to timestamp granularity). If POSIX or other relevant spec mandates that the + * ctime must change due to an operation, then the i_version counter must be + * incremented as well. + * + * Making the i_version update completely atomic with the operation itself would + * be prohibitively expensive. Traditionally the kernel has updated the times on + * directories after an operation that changes its contents. For regular files, + * the ctime is usually updated before the data is copied into the cache for a + * write. 
This means that there is a window of time when an observer can + * associate a new timestamp with old file contents. Since the purpose of the + * i_version is to allow for better cache coherency, the i_version must always + * be updated after the results of the operation are visible. Updating it before + * and after a change is also permitted. (Note that no filesystems currently do + * this. Fixing that is a work-in-progress). * * Observers see the i_version as a 64-bit number that never decreases. If it * remains the same since it was last checked, then nothing has changed in the @@ -234,42 +252,6 @@ inode_peek_iversion(const struct inode *inode) return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT; } -/** - * inode_query_iversion - read i_version for later use - * @inode: inode from which i_version should be read - * - * Read the inode i_version counter. This should be used by callers that wish - * to store the returned i_version for later comparison. This will guarantee - * that a later query of the i_version will result in a different value if - * anything has changed. - * - * In this implementation, we fetch the current value, set the QUERIED flag and - * then try to swap it into place with a cmpxchg, if it wasn't already set. If - * that fails, we try again with the newly fetched value from the cmpxchg. - */ -static inline u64 -inode_query_iversion(struct inode *inode) -{ - u64 cur, new; - - cur = inode_peek_iversion_raw(inode); - do { - /* If flag is already set, then no need to swap */ - if (cur & I_VERSION_QUERIED) { - /* - * This barrier (and the implicit barrier in the - * cmpxchg below) pairs with the barrier in - * inode_maybe_inc_iversion(). 
- */ - smp_mb(); - break; - } - - new = cur | I_VERSION_QUERIED; - } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); - return cur >> I_VERSION_QUERIED_SHIFT; -} - /* * For filesystems without any sort of change attribute, the best we can * do is fake one up from the ctime: @@ -283,6 +265,8 @@ static inline u64 time_to_chattr(struct timespec64 *t) return chattr; } +u64 inode_query_iversion(struct inode *inode); + /** * inode_eq_iversion_raw - check whether the raw i_version counter has changed * @inode: inode to check diff --git a/include/linux/key.h b/include/linux/key.h index d27477faf00d..8dc7f7c3088b 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -386,6 +386,14 @@ extern int wait_for_key_construction(struct key *key, bool intr); extern int key_validate(const struct key *key); +extern key_ref_t key_create(key_ref_t keyring, + const char *type, + const char *description, + const void *payload, + size_t plen, + key_perm_t perm, + unsigned long flags); + extern key_ref_t key_create_or_update(key_ref_t keyring, const char *type, const char *description, diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 70ce419e2709..2b7f067af3c4 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -312,7 +312,7 @@ static inline struct file *nlmsvc_file_file(struct nlm_file *file) static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) { - return locks_inode(nlmsvc_file_file(file)); + return file_inode(nlmsvc_file_file(file)); } static inline int __nlm_privileged_request4(const struct sockaddr *sap) @@ -372,7 +372,7 @@ static inline int nlm_privileged_requester(const struct svc_rqst *rqstp) static inline int nlm_compare_locks(const struct file_lock *fl1, const struct file_lock *fl2) { - return locks_inode(fl1->fl_file) == locks_inode(fl2->fl_file) + return file_inode(fl1->fl_file) == file_inode(fl2->fl_file) && fl1->fl_pid == fl2->fl_pid && fl1->fl_owner == fl2->fl_owner && fl1->fl_start == 
fl2->fl_start diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index 67e4a2c5500b..b60fbcd8cdfa 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -11,6 +11,7 @@ #define LOCKD_XDR_H #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/nfs.h> #include <linux/sunrpc/xdr.h> diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index ed6cb2ac55fa..094b76dc7164 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -136,25 +136,25 @@ LSM_HOOK(int, 0, inode_follow_link, struct dentry *dentry, struct inode *inode, LSM_HOOK(int, 0, inode_permission, struct inode *inode, int mask) LSM_HOOK(int, 0, inode_setattr, struct dentry *dentry, struct iattr *attr) LSM_HOOK(int, 0, inode_getattr, const struct path *path) -LSM_HOOK(int, 0, inode_setxattr, struct user_namespace *mnt_userns, +LSM_HOOK(int, 0, inode_setxattr, struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) LSM_HOOK(void, LSM_RET_VOID, inode_post_setxattr, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) LSM_HOOK(int, 0, inode_getxattr, struct dentry *dentry, const char *name) LSM_HOOK(int, 0, inode_listxattr, struct dentry *dentry) -LSM_HOOK(int, 0, inode_removexattr, struct user_namespace *mnt_userns, +LSM_HOOK(int, 0, inode_removexattr, struct mnt_idmap *idmap, struct dentry *dentry, const char *name) -LSM_HOOK(int, 0, inode_set_acl, struct user_namespace *mnt_userns, +LSM_HOOK(int, 0, inode_set_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) -LSM_HOOK(int, 0, inode_get_acl, struct user_namespace *mnt_userns, +LSM_HOOK(int, 0, inode_get_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) -LSM_HOOK(int, 0, inode_remove_acl, struct user_namespace *mnt_userns, +LSM_HOOK(int, 0, inode_remove_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char 
*acl_name) LSM_HOOK(int, 0, inode_need_killpriv, struct dentry *dentry) -LSM_HOOK(int, 0, inode_killpriv, struct user_namespace *mnt_userns, +LSM_HOOK(int, 0, inode_killpriv, struct mnt_idmap *idmap, struct dentry *dentry) -LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct user_namespace *mnt_userns, +LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) LSM_HOOK(int, -EOPNOTSUPP, inode_setsecurity, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 0a5ba81f7367..6e156d2acffc 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -475,7 +475,7 @@ * @inode_killpriv: * The setuid bit is being removed. Remove similar security labels. * Called with the dentry->d_inode->i_mutex held. - * @mnt_userns: user namespace of the mount. + * @idmap: idmap of the mount. * @dentry is the dentry being changed. * Return 0 on success. If error is returned, then the operation * causing setuid bit removal is failed. 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d3c8203cab6c..85dc9b88ea37 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1666,10 +1666,13 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { + struct mem_cgroup *memcg; + if (mem_cgroup_disabled()) return; - if (unlikely(&folio_memcg(folio)->css != wb->memcg_css)) + memcg = folio_memcg(folio); + if (unlikely(memcg && &memcg->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(folio, wb); } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 76ef2e4fde38..333c1fec72f8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -573,6 +573,14 @@ struct mlx5_debugfs_entries { struct dentry *lag_debugfs; }; +enum mlx5_func_type { + MLX5_PF, + MLX5_VF, + MLX5_SF, + MLX5_HOST_PF, + MLX5_FUNC_TYPE_NUM, +}; + struct mlx5_ft_pool; struct mlx5_priv { /* IRQ table valid only for real pci devices PF or VF */ @@ -583,11 +591,10 @@ struct mlx5_priv { struct mlx5_nb pg_nb; struct workqueue_struct *pg_wq; struct xarray page_root_xa; - u32 fw_pages; atomic_t reg_pages; struct list_head free_list; - u32 vfs_pages; - u32 host_pf_pages; + u32 fw_pages; + u32 page_counters[MLX5_FUNC_TYPE_NUM]; u32 fw_pages_alloc_failed; u32 give_pages_dropped; u32 reclaim_pages_discard; diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f857163ac89..bd3197748562 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -137,7 +137,7 @@ extern int mmap_rnd_compat_bits __read_mostly; * define their own version of this macro in <asm/pgtable.h> */ #if BITS_PER_LONG == 64 -/* This function must be updated when the size of struct page grows above 80 +/* This function must be updated when the size of struct page grows above 96 * or reduces below 56. 
The idea that compiler optimizes out switch() * statement, and only leaves move/store instructions. Also the compiler can * combine write statements if they are both assignments and can be reordered, @@ -148,12 +148,18 @@ static inline void __mm_zero_struct_page(struct page *page) { unsigned long *_pp = (void *)page; - /* Check that struct page is either 56, 64, 72, or 80 bytes */ + /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */ BUILD_BUG_ON(sizeof(struct page) & 7); BUILD_BUG_ON(sizeof(struct page) < 56); - BUILD_BUG_ON(sizeof(struct page) > 80); + BUILD_BUG_ON(sizeof(struct page) > 96); switch (sizeof(struct page)) { + case 96: + _pp[11] = 0; + fallthrough; + case 88: + _pp[10] = 0; + fallthrough; case 80: _pp[9] = 0; fallthrough; @@ -2095,8 +2101,6 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim); struct kvec; -int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, - struct page **pages); struct page *get_dump_page(unsigned long addr); bool folio_mark_dirty(struct folio *folio); diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index 0ccca33a7a6d..057c89867aa2 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -113,167 +113,23 @@ static inline bool vfsgid_eq_kgid(vfsgid_t vfsgid, kgid_t kgid) #define AS_KUIDT(val) (kuid_t){ __vfsuid_val(val) } #define AS_KGIDT(val) (kgid_t){ __vfsgid_val(val) } -#ifdef CONFIG_MULTIUSER -/** - * vfsgid_in_group_p() - check whether a vfsuid matches the caller's groups - * @vfsgid: the mnt gid to match - * - * This function can be used to determine whether @vfsuid matches any of the - * caller's groups. - * - * Return: 1 if vfsuid matches caller's groups, 0 if not. 
- */ -static inline int vfsgid_in_group_p(vfsgid_t vfsgid) -{ - return in_group_p(AS_KGIDT(vfsgid)); -} -#else -static inline int vfsgid_in_group_p(vfsgid_t vfsgid) -{ - return 1; -} -#endif +int vfsgid_in_group_p(vfsgid_t vfsgid); -/** - * initial_idmapping - check whether this is the initial mapping - * @ns: idmapping to check - * - * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1, - * [...], 1000 to 1000 [...]. - * - * Return: true if this is the initial mapping, false if not. - */ -static inline bool initial_idmapping(const struct user_namespace *ns) -{ - return ns == &init_user_ns; -} +vfsuid_t make_vfsuid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, kuid_t kuid); -/** - * no_idmapping - check whether we can skip remapping a kuid/gid - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * - * This function can be used to check whether a remapping between two - * idmappings is required. - * An idmapped mount is a mount that has an idmapping attached to it that - * is different from the filsystem's idmapping and the initial idmapping. - * If the initial mapping is used or the idmapping of the mount and the - * filesystem are identical no remapping is required. - * - * Return: true if remapping can be skipped, false if not. - */ -static inline bool no_idmapping(const struct user_namespace *mnt_userns, - const struct user_namespace *fs_userns) -{ - return initial_idmapping(mnt_userns) || mnt_userns == fs_userns; -} +vfsgid_t make_vfsgid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, kgid_t kgid); -/** - * make_vfsuid - map a filesystem kuid into a mnt_userns - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @kuid : kuid to be mapped - * - * Take a @kuid and remap it from @fs_userns into @mnt_userns. Use this - * function when preparing a @kuid to be reported to userspace. 
- * - * If no_idmapping() determines that this is not an idmapped mount we can - * simply return @kuid unchanged. - * If initial_idmapping() tells us that the filesystem is not mounted with an - * idmapping we know the value of @kuid won't change when calling - * from_kuid() so we can simply retrieve the value via __kuid_val() - * directly. - * - * Return: @kuid mapped according to @mnt_userns. - * If @kuid has no mapping in either @mnt_userns or @fs_userns INVALID_UID is - * returned. - */ +kuid_t from_vfsuid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, vfsuid_t vfsuid); -static inline vfsuid_t make_vfsuid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - kuid_t kuid) -{ - uid_t uid; - - if (no_idmapping(mnt_userns, fs_userns)) - return VFSUIDT_INIT(kuid); - if (initial_idmapping(fs_userns)) - uid = __kuid_val(kuid); - else - uid = from_kuid(fs_userns, kuid); - if (uid == (uid_t)-1) - return INVALID_VFSUID; - return VFSUIDT_INIT(make_kuid(mnt_userns, uid)); -} - -/** - * make_vfsgid - map a filesystem kgid into a mnt_userns - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @kgid : kgid to be mapped - * - * Take a @kgid and remap it from @fs_userns into @mnt_userns. Use this - * function when preparing a @kgid to be reported to userspace. - * - * If no_idmapping() determines that this is not an idmapped mount we can - * simply return @kgid unchanged. - * If initial_idmapping() tells us that the filesystem is not mounted with an - * idmapping we know the value of @kgid won't change when calling - * from_kgid() so we can simply retrieve the value via __kgid_val() - * directly. - * - * Return: @kgid mapped according to @mnt_userns. - * If @kgid has no mapping in either @mnt_userns or @fs_userns INVALID_GID is - * returned. 
- */ - -static inline vfsgid_t make_vfsgid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - kgid_t kgid) -{ - gid_t gid; - - if (no_idmapping(mnt_userns, fs_userns)) - return VFSGIDT_INIT(kgid); - if (initial_idmapping(fs_userns)) - gid = __kgid_val(kgid); - else - gid = from_kgid(fs_userns, kgid); - if (gid == (gid_t)-1) - return INVALID_VFSGID; - return VFSGIDT_INIT(make_kgid(mnt_userns, gid)); -} - -/** - * from_vfsuid - map a vfsuid into the filesystem idmapping - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @vfsuid : vfsuid to be mapped - * - * Map @vfsuid into the filesystem idmapping. This function has to be used in - * order to e.g. write @vfsuid to inode->i_uid. - * - * Return: @vfsuid mapped into the filesystem idmapping - */ -static inline kuid_t from_vfsuid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - vfsuid_t vfsuid) -{ - uid_t uid; - - if (no_idmapping(mnt_userns, fs_userns)) - return AS_KUIDT(vfsuid); - uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid)); - if (uid == (uid_t)-1) - return INVALID_UID; - if (initial_idmapping(fs_userns)) - return KUIDT_INIT(uid); - return make_kuid(fs_userns, uid); -} +kgid_t from_vfsgid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, vfsgid_t vfsgid); /** * vfsuid_has_fsmapping - check whether a vfsuid maps into the filesystem - * @mnt_userns: the mount's idmapping + * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsuid: vfsuid to be mapped * @@ -283,11 +139,11 @@ static inline kuid_t from_vfsuid(struct user_namespace *mnt_userns, * * Return: true if @vfsuid has a mapping in the filesystem, false if not. 
*/ -static inline bool vfsuid_has_fsmapping(struct user_namespace *mnt_userns, +static inline bool vfsuid_has_fsmapping(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsuid_t vfsuid) { - return uid_valid(from_vfsuid(mnt_userns, fs_userns, vfsuid)); + return uid_valid(from_vfsuid(idmap, fs_userns, vfsuid)); } static inline bool vfsuid_has_mapping(struct user_namespace *userns, @@ -310,35 +166,8 @@ static inline kuid_t vfsuid_into_kuid(vfsuid_t vfsuid) } /** - * from_vfsgid - map a vfsgid into the filesystem idmapping - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @vfsgid : vfsgid to be mapped - * - * Map @vfsgid into the filesystem idmapping. This function has to be used in - * order to e.g. write @vfsgid to inode->i_gid. - * - * Return: @vfsgid mapped into the filesystem idmapping - */ -static inline kgid_t from_vfsgid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - vfsgid_t vfsgid) -{ - gid_t gid; - - if (no_idmapping(mnt_userns, fs_userns)) - return AS_KGIDT(vfsgid); - gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid)); - if (gid == (gid_t)-1) - return INVALID_GID; - if (initial_idmapping(fs_userns)) - return KGIDT_INIT(gid); - return make_kgid(fs_userns, gid); -} - -/** * vfsgid_has_fsmapping - check whether a vfsgid maps into the filesystem - * @mnt_userns: the mount's idmapping + * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsgid: vfsgid to be mapped * @@ -348,11 +177,11 @@ static inline kgid_t from_vfsgid(struct user_namespace *mnt_userns, * * Return: true if @vfsgid has a mapping in the filesystem, false if not. 
*/ -static inline bool vfsgid_has_fsmapping(struct user_namespace *mnt_userns, +static inline bool vfsgid_has_fsmapping(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsgid_t vfsgid) { - return gid_valid(from_vfsgid(mnt_userns, fs_userns, vfsgid)); + return gid_valid(from_vfsgid(idmap, fs_userns, vfsgid)); } static inline bool vfsgid_has_mapping(struct user_namespace *userns, @@ -375,8 +204,8 @@ static inline kgid_t vfsgid_into_kgid(vfsgid_t vfsgid) } /** - * mapped_fsuid - return caller's fsuid mapped up into a mnt_userns - * @mnt_userns: the mount's idmapping + * mapped_fsuid - return caller's fsuid mapped according to an idmapping + * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * * Use this helper to initialize a new vfs or filesystem object based on @@ -385,18 +214,17 @@ static inline kgid_t vfsgid_into_kgid(vfsgid_t vfsgid) * O_CREAT. Other examples include the allocation of quotas for a specific * user. * - * Return: the caller's current fsuid mapped up according to @mnt_userns. + * Return: the caller's current fsuid mapped up according to @idmap. */ -static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns, +static inline kuid_t mapped_fsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns) { - return from_vfsuid(mnt_userns, fs_userns, - VFSUIDT_INIT(current_fsuid())); + return from_vfsuid(idmap, fs_userns, VFSUIDT_INIT(current_fsuid())); } /** - * mapped_fsgid - return caller's fsgid mapped up into a mnt_userns - * @mnt_userns: the mount's idmapping + * mapped_fsgid - return caller's fsgid mapped according to an idmapping + * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * * Use this helper to initialize a new vfs or filesystem object based on @@ -405,13 +233,15 @@ static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns, * O_CREAT. Other examples include the allocation of quotas for a specific * user. 
* - * Return: the caller's current fsgid mapped up according to @mnt_userns. + * Return: the caller's current fsgid mapped up according to @idmap. */ -static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns, +static inline kgid_t mapped_fsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns) { - return from_vfsgid(mnt_userns, fs_userns, - VFSGIDT_INIT(current_fsgid())); + return from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(current_fsgid())); } +bool check_fsmapping(const struct mnt_idmap *idmap, + const struct super_block *sb); + #endif /* _LINUX_MNT_IDMAPPING_H */ diff --git a/include/linux/mount.h b/include/linux/mount.h index 62475996fac6..52f452b2259a 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -74,8 +74,6 @@ struct vfsmount { struct mnt_idmap *mnt_idmap; } __randomize_layout; -struct user_namespace *mnt_user_ns(const struct vfsmount *mnt); -struct user_namespace *mnt_idmap_owner(const struct mnt_idmap *idmap); static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt) { /* Pairs with smp_store_release() in do_idmap_mount(). 
*/ diff --git a/include/linux/namei.h b/include/linux/namei.h index 00fee52df842..0d4531fd46e7 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -68,11 +68,11 @@ extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int); extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int); -struct dentry *lookup_one(struct user_namespace *, const char *, struct dentry *, int); -struct dentry *lookup_one_unlocked(struct user_namespace *mnt_userns, +struct dentry *lookup_one(struct mnt_idmap *, const char *, struct dentry *, int); +struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len); -struct dentry *lookup_one_positive_unlocked(struct user_namespace *mnt_userns, +struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index aad12a179e54..e6e02184c25a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2839,8 +2839,6 @@ int unregister_netdevice_notifier(struct notifier_block *nb); int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb); int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb); -void move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, - struct notifier_block *nb); int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index d92fdfd2444c..d6c119e31d7a 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -392,11 +392,11 @@ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_post_op_update_inode(struct inode 
*inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr); -extern int nfs_getattr(struct user_namespace *, const struct path *, +extern int nfs_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *, const struct cred *); extern void nfs_access_set_mask(struct nfs_access_entry *, u32); -extern int nfs_permission(struct user_namespace *, struct inode *, int); +extern int nfs_permission(struct mnt_idmap *, struct inode *, int); extern int nfs_open(struct inode *, struct file *); extern int nfs_attribute_cache_expired(struct inode *inode); extern int nfs_revalidate_inode(struct inode *inode, unsigned long flags); @@ -405,7 +405,7 @@ extern int nfs_clear_invalid_mapping(struct address_space *mapping); extern bool nfs_mapping_need_revalidate_inode(struct inode *inode); extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); extern int nfs_revalidate_mapping_rcu(struct inode *inode); -extern int nfs_setattr(struct user_namespace *, struct dentry *, struct iattr *); +extern int nfs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr); extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 50caa117cb62..bb15c9234e21 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -70,7 +70,6 @@ struct nvmem_keepout { * @word_size: Minimum read/write access granularity. * @stride: Minimum read/write access stride. * @priv: User context passed to read/write callbacks. 
- * @wp-gpio: Write protect pin * @ignore_wp: Write Protect pin is managed by the provider. * * Note: A default "nvmem<id>" name will be assigned to the device if @@ -85,7 +84,6 @@ struct nvmem_config { const char *name; int id; struct module *owner; - struct gpio_desc *wp_gpio; const struct nvmem_cell_info *cells; int ncells; const struct nvmem_keepout *keepout; diff --git a/include/linux/pci.h b/include/linux/pci.h index adffd65e84b4..254c8a4126a8 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1621,6 +1621,18 @@ pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, flags, NULL); } +static inline struct msi_map pci_msix_alloc_irq_at(struct pci_dev *dev, unsigned int index, + const struct irq_affinity_desc *affdesc) +{ + struct msi_map map = { .index = -ENOSYS, }; + + return map; +} + +static inline void pci_msix_free_irq(struct pci_dev *pdev, struct msi_map map) +{ +} + static inline void pci_free_irq_vectors(struct pci_dev *dev) { } diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index ef914a600087..525b5d64e394 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -100,7 +100,6 @@ struct arm_pmu { void (*stop)(struct arm_pmu *); void (*reset)(void *); int (*map_event)(struct perf_event *event); - bool (*filter)(struct pmu *pmu, int cpu); int num_events; bool secure_access; /* 32-bit ARM only */ #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 diff --git a/include/linux/poison.h b/include/linux/poison.h index 2d3249eb0e62..0e8a1f2ceb2f 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -84,4 +84,7 @@ /********** kernel/bpf/ **********/ #define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA)) +/********** VFS **********/ +#define VFS_PTR_POISON ((void *)(0xF5 + POISON_POINTER_DELTA)) + #endif diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index ee608d22ecb9..21cc29b8a9e8 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ 
-69,20 +69,20 @@ extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *); extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); -int set_posix_acl(struct user_namespace *, struct dentry *, int, +int set_posix_acl(struct mnt_idmap *, struct dentry *, int, struct posix_acl *); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags); #ifdef CONFIG_FS_POSIX_ACL -int posix_acl_chmod(struct user_namespace *, struct dentry *, umode_t); +int posix_acl_chmod(struct mnt_idmap *, struct dentry *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, struct posix_acl **); -int posix_acl_update_mode(struct user_namespace *, struct inode *, umode_t *, +int posix_acl_update_mode(struct mnt_idmap *, struct inode *, umode_t *, struct posix_acl **); -int simple_set_acl(struct user_namespace *, struct dentry *, +int simple_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); extern int simple_acl_create(struct inode *, struct inode *); @@ -91,7 +91,7 @@ void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); void forget_cached_acl(struct inode *inode, int type); void forget_all_cached_acls(struct inode *inode); int posix_acl_valid(struct user_namespace *, const struct posix_acl *); -int posix_acl_permission(struct user_namespace *, struct inode *, +int posix_acl_permission(struct mnt_idmap *, struct inode *, const struct posix_acl *, int); static inline void cache_no_acl(struct inode *inode) @@ -100,14 +100,14 @@ static inline void cache_no_acl(struct inode *inode) inode->i_default_acl = NULL; } -int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); -struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, 
+struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); -int vfs_remove_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); #else -static inline int posix_acl_chmod(struct user_namespace *mnt_userns, +static inline int posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) { return 0; @@ -134,21 +134,21 @@ static inline void forget_all_cached_acls(struct inode *inode) { } -static inline int vfs_set_acl(struct user_namespace *mnt_userns, +static inline int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *acl) { return -EOPNOTSUPP; } -static inline struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, +static inline struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ERR_PTR(-EOPNOTSUPP); } -static inline int vfs_remove_acl(struct user_namespace *mnt_userns, +static inline int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return -EOPNOTSUPP; diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 0d8625d71733..11a4becff3a9 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -20,12 +20,12 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb) } /* i_mutex must being held */ -static inline bool is_quota_modification(struct user_namespace *mnt_userns, +static inline bool is_quota_modification(struct mnt_idmap *idmap, struct inode *inode, struct iattr *ia) { return ((ia->ia_valid & ATTR_SIZE) || - i_uid_needs_update(mnt_userns, ia, inode) || - i_gid_needs_update(mnt_userns, ia, inode)); + i_uid_needs_update(idmap, ia, inode) || + i_gid_needs_update(idmap, ia, inode)); } #if defined(CONFIG_QUOTA) @@ -116,7 +116,7 @@ int dquot_set_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int 
__dquot_transfer(struct inode *inode, struct dquot **transfer_to); -int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, +int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr); static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) @@ -236,7 +236,7 @@ static inline void dquot_free_inode(struct inode *inode) { } -static inline int dquot_transfer(struct user_namespace *mnt_userns, +static inline int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr) { return 0; diff --git a/include/linux/security.h b/include/linux/security.h index 5b67f208f7de..5984d0d550b4 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -153,12 +153,11 @@ extern int cap_capset(struct cred *new, const struct cred *old, extern int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file); int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); -int cap_inode_removexattr(struct user_namespace *mnt_userns, +int cap_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name); int cap_inode_need_killpriv(struct dentry *dentry); -int cap_inode_killpriv(struct user_namespace *mnt_userns, - struct dentry *dentry); -int cap_inode_getsecurity(struct user_namespace *mnt_userns, +int cap_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry); +int cap_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc); extern int cap_mmap_addr(unsigned long addr); @@ -356,29 +355,28 @@ int security_inode_readlink(struct dentry *dentry); int security_inode_follow_link(struct dentry *dentry, struct inode *inode, bool rcu); int security_inode_permission(struct inode *inode, int mask); -int security_inode_setattr(struct user_namespace *mnt_userns, +int security_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int 
security_inode_getattr(const struct path *path); -int security_inode_setxattr(struct user_namespace *mnt_userns, +int security_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags); -int security_inode_set_acl(struct user_namespace *mnt_userns, +int security_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); -int security_inode_get_acl(struct user_namespace *mnt_userns, +int security_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); -int security_inode_remove_acl(struct user_namespace *mnt_userns, +int security_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); void security_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int security_inode_getxattr(struct dentry *dentry, const char *name); int security_inode_listxattr(struct dentry *dentry); -int security_inode_removexattr(struct user_namespace *mnt_userns, +int security_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name); int security_inode_need_killpriv(struct dentry *dentry); -int security_inode_killpriv(struct user_namespace *mnt_userns, - struct dentry *dentry); -int security_inode_getsecurity(struct user_namespace *mnt_userns, +int security_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry); +int security_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc); int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags); @@ -862,7 +860,7 @@ static inline int security_inode_permission(struct inode *inode, int mask) return 0; } -static inline int security_inode_setattr(struct user_namespace *mnt_userns, +static inline int security_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) 
{ @@ -874,14 +872,14 @@ static inline int security_inode_getattr(const struct path *path) return 0; } -static inline int security_inode_setxattr(struct user_namespace *mnt_userns, +static inline int security_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { return cap_inode_setxattr(dentry, name, value, size, flags); } -static inline int security_inode_set_acl(struct user_namespace *mnt_userns, +static inline int security_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) @@ -889,14 +887,14 @@ static inline int security_inode_set_acl(struct user_namespace *mnt_userns, return 0; } -static inline int security_inode_get_acl(struct user_namespace *mnt_userns, +static inline int security_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return 0; } -static inline int security_inode_remove_acl(struct user_namespace *mnt_userns, +static inline int security_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { @@ -918,11 +916,11 @@ static inline int security_inode_listxattr(struct dentry *dentry) return 0; } -static inline int security_inode_removexattr(struct user_namespace *mnt_userns, +static inline int security_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { - return cap_inode_removexattr(mnt_userns, dentry, name); + return cap_inode_removexattr(idmap, dentry, name); } static inline int security_inode_need_killpriv(struct dentry *dentry) @@ -930,18 +928,18 @@ static inline int security_inode_need_killpriv(struct dentry *dentry) return cap_inode_need_killpriv(dentry); } -static inline int security_inode_killpriv(struct user_namespace *mnt_userns, +static inline int security_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry) { - return cap_inode_killpriv(mnt_userns, dentry); + return cap_inode_killpriv(idmap, dentry); } 
-static inline int security_inode_getsecurity(struct user_namespace *mnt_userns, +static inline int security_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { - return cap_inode_getsecurity(mnt_userns, inode, name, buffer, alloc); + return cap_inode_getsecurity(idmap, inode, name, buffer, alloc); } static inline int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 71310efe2fab..7bde8e1c228a 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -107,7 +107,7 @@ extern void synchronize_shrinkers(void); #ifdef CONFIG_SHRINKER_DEBUG extern int shrinker_debugfs_add(struct shrinker *shrinker); -extern void shrinker_debugfs_remove(struct shrinker *shrinker); +extern struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker); extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...); #else /* CONFIG_SHRINKER_DEBUG */ @@ -115,8 +115,9 @@ static inline int shrinker_debugfs_add(struct shrinker *shrinker) { return 0; } -static inline void shrinker_debugfs_remove(struct shrinker *shrinker) +static inline struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker) { + return NULL; } static inline __printf(2, 3) int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) 
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 1341f7d62da4..be48f1cb1878 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -476,6 +476,15 @@ extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, #define atomic_dec_and_lock_irqsave(atomic, lock, flags) \ __cond_lock(lock, _atomic_dec_and_lock_irqsave(atomic, lock, &(flags))) +extern int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock); +#define atomic_dec_and_raw_lock(atomic, lock) \ + __cond_lock(lock, _atomic_dec_and_raw_lock(atomic, lock)) + +extern int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, + unsigned long *flags); +#define atomic_dec_and_raw_lock_irqsave(atomic, lock, flags) \ + __cond_lock(lock, _atomic_dec_and_raw_lock_irqsave(atomic, lock, &(flags))) + int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, size_t max_size, unsigned int cpu_mult, gfp_t gfp, const char *name, diff --git a/include/linux/stat.h b/include/linux/stat.h index ff277ced50e9..52150570d37a 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -52,6 +52,15 @@ struct kstat { u64 mnt_id; u32 dio_mem_align; u32 dio_offset_align; + u64 change_cookie; }; +/* These definitions are internal to the kernel for now. Mainly used by nfsd. 
*/ + +/* mask values */ +#define STATX_CHANGE_COOKIE 0x40000000U /* Want/got stx_change_attr */ + +/* file attribute values */ +#define STATX_ATTR_CHANGE_MONOTONIC 0x8000000000000000ULL /* version monotonically increases */ + #endif diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 83ca2e8eb6b5..a152678b82b7 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -252,6 +252,7 @@ struct plat_stmmacenet_data { int rss_en; int mac_port_sel_speed; bool en_tx_lpi_clockgating; + bool rx_clk_runs_in_lpi; int has_xgmac; bool vlan_fail_q_en; u8 vlan_fail_q; diff --git a/include/linux/swap.h b/include/linux/swap.h index 2787b84eaf12..0ceed49516ad 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -418,8 +418,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options, - nodemask_t *nodemask); + unsigned int reclaim_options); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, diff --git a/include/linux/tpm.h b/include/linux/tpm.h index dfeb25a0362d..4dc97b9f65fb 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -265,6 +265,7 @@ enum tpm2_startup_types { enum tpm2_cc_attrs { TPM2_CC_ATTR_CHANDLES = 25, TPM2_CC_ATTR_RHANDLE = 28, + TPM2_CC_ATTR_VENDOR = 29, }; #define TPM_VID_INTEL 0x8086 diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 4342e996bcdb..0e373222a6df 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -270,6 +270,7 @@ struct trace_event_fields { const int align; const int is_signed; const int filter_type; + const int len; }; int (*define_fields)(struct trace_event_call *); }; diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h index 72299f261b25..43db6e47503c 100644 --- a/include/linux/util_macros.h +++ 
b/include/linux/util_macros.h @@ -38,4 +38,16 @@ */ #define find_closest_descending(x, a, as) __find_closest(x, a, as, >=) +/** + * is_insidevar - check if the @ptr points inside the @var memory range. + * @ptr: the pointer to a memory address. + * @var: the variable which address and size identify the memory range. + * + * Evaluates to true if the address in @ptr lies within the memory + * range allocated to @var. + */ +#define is_insidevar(ptr, var) \ + ((uintptr_t)(ptr) >= (uintptr_t)(var) && \ + (uintptr_t)(ptr) < (uintptr_t)(var) + sizeof(var)) + #endif diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 2e7dd44926e4..6af72461397d 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -42,7 +42,7 @@ struct xattr_handler { struct inode *inode, const char *name, void *buffer, size_t size); int (*set)(const struct xattr_handler *, - struct user_namespace *mnt_userns, struct dentry *dentry, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags); }; @@ -56,25 +56,25 @@ struct xattr { }; ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t); -ssize_t vfs_getxattr(struct user_namespace *, struct dentry *, const char *, +ssize_t vfs_getxattr(struct mnt_idmap *, struct dentry *, const char *, void *, size_t); ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size); -int __vfs_setxattr(struct user_namespace *, struct dentry *, struct inode *, +int __vfs_setxattr(struct mnt_idmap *, struct dentry *, struct inode *, const char *, const void *, size_t, int); -int __vfs_setxattr_noperm(struct user_namespace *, struct dentry *, +int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); -int __vfs_setxattr_locked(struct user_namespace *, struct dentry *, +int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int, struct inode **); -int 
vfs_setxattr(struct user_namespace *, struct dentry *, const char *, +int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); -int __vfs_removexattr(struct user_namespace *, struct dentry *, const char *); -int __vfs_removexattr_locked(struct user_namespace *, struct dentry *, +int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); +int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *, const char *, struct inode **); -int vfs_removexattr(struct user_namespace *, struct dentry *, const char *); +int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); -int vfs_getxattr_alloc(struct user_namespace *mnt_userns, +int vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t size, gfp_t flags); diff --git a/include/net/sock.h b/include/net/sock.h index dcd72e6285b2..556209727633 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2434,6 +2434,19 @@ static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struc return false; } +static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk) +{ + skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); + if (skb) { + if (sk_rmem_schedule(sk, skb, skb->truesize)) { + skb_set_owner_r(skb, sk); + return skb; + } + __kfree_skb(skb); + } + return NULL; +} + static inline void skb_prepare_for_gro(struct sk_buff *skb) { if (skb->destructor != sock_wfree) { diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 6548b5b5aa60..75d7d22c3a27 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -32,6 +32,7 @@ struct prelim_ref; struct btrfs_space_info; struct btrfs_raid_bio; struct raid56_bio_trace_info; +struct find_free_extent_ctl; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -1241,76 +1242,156 @@ 
DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, TRACE_EVENT(find_free_extent, - TP_PROTO(const struct btrfs_root *root, u64 num_bytes, - u64 empty_size, u64 data), + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(root, num_bytes, empty_size, data), + TP_ARGS(root, ffe_ctl), TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, num_bytes ) __field( u64, empty_size ) - __field( u64, data ) + __field( u64, flags ) ), TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = root->root_key.objectid; - __entry->num_bytes = num_bytes; - __entry->empty_size = empty_size; - __entry->data = data; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; ), TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)", show_root_type(__entry->root_objectid), - __entry->num_bytes, __entry->empty_size, __entry->data, - __print_flags((unsigned long)__entry->data, "|", + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", + BTRFS_GROUP_FLAGS)) +); + +TRACE_EVENT(find_free_extent_search_loop, + + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl), + + TP_ARGS(root, ffe_ctl), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, num_bytes ) + __field( u64, empty_size ) + __field( u64, flags ) + __field( u64, loop ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->root_key.objectid; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; + __entry->loop = ffe_ctl->loop; + ), + + TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu", + show_root_type(__entry->root_objectid), + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", 
BTRFS_GROUP_FLAGS), + __entry->loop) +); + +TRACE_EVENT(find_free_extent_have_block_group, + + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl, + const struct btrfs_block_group *block_group), + + TP_ARGS(root, ffe_ctl, block_group), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, num_bytes ) + __field( u64, empty_size ) + __field( u64, flags ) + __field( u64, loop ) + __field( bool, hinted ) + __field( u64, bg_start ) + __field( u64, bg_flags ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->root_key.objectid; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; + __entry->loop = ffe_ctl->loop; + __entry->hinted = ffe_ctl->hinted; + __entry->bg_start = block_group->start; + __entry->bg_flags = block_group->flags; + ), + + TP_printk_btrfs( +"root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu hinted=%d block_group=%llu bg_flags=%llu(%s)", + show_root_type(__entry->root_objectid), + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), + __entry->loop, __entry->hinted, + __entry->bg_start, __entry->bg_flags, + __print_flags((unsigned long)__entry->bg_flags, "|", BTRFS_GROUP_FLAGS)) ); DECLARE_EVENT_CLASS(btrfs__reserve_extent, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len), + TP_ARGS(block_group, ffe_ctl), TP_STRUCT__entry_btrfs( __field( u64, bg_objectid ) __field( u64, flags ) + __field( int, bg_size_class ) __field( u64, start ) __field( u64, len ) + __field( u64, loop ) + __field( bool, hinted ) + __field( int, size_class ) ), TP_fast_assign_btrfs(block_group->fs_info, __entry->bg_objectid = block_group->start; __entry->flags = block_group->flags; - 
__entry->start = start; - __entry->len = len; + __entry->bg_size_class = block_group->size_class; + __entry->start = ffe_ctl->search_start; + __entry->len = ffe_ctl->num_bytes; + __entry->loop = ffe_ctl->loop; + __entry->hinted = ffe_ctl->hinted; + __entry->size_class = ffe_ctl->size_class; ), - TP_printk_btrfs("root=%llu(%s) block_group=%llu flags=%llu(%s) " - "start=%llu len=%llu", + TP_printk_btrfs( +"root=%llu(%s) block_group=%llu flags=%llu(%s) bg_size_class=%d start=%llu len=%llu loop=%llu hinted=%d size_class=%d", show_root_type(BTRFS_EXTENT_TREE_OBJECTID), __entry->bg_objectid, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), - __entry->start, __entry->len) + __entry->bg_size_class, __entry->start, __entry->len, + __entry->loop, __entry->hinted, __entry->size_class) ); DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len) + TP_ARGS(block_group, ffe_ctl) ); DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len) + TP_ARGS(block_group, ffe_ctl) ); TRACE_EVENT(btrfs_find_cluster, diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index 4f4c44ea3a65..cf4a0d28b178 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -19,12 +19,17 @@ struct erofs_map_blocks; { 1, "DIR" }) #define show_map_flags(flags) __print_flags(flags, "|", \ - { EROFS_GET_BLOCKS_RAW, "RAW" }) + { EROFS_GET_BLOCKS_FIEMAP, "FIEMAP" }, \ + { EROFS_GET_BLOCKS_READMORE, "READMORE" }, \ + { EROFS_GET_BLOCKS_FINDTAIL, "FINDTAIL" }) #define show_mflags(flags) 
__print_flags(flags, "", \ - { EROFS_MAP_MAPPED, "M" }, \ - { EROFS_MAP_META, "I" }, \ - { EROFS_MAP_ENCODED, "E" }) + { EROFS_MAP_MAPPED, "M" }, \ + { EROFS_MAP_META, "I" }, \ + { EROFS_MAP_ENCODED, "E" }, \ + { EROFS_MAP_FULL_MAPPED, "F" }, \ + { EROFS_MAP_FRAGMENT, "R" }, \ + { EROFS_MAP_PARTIAL_REF, "P" }) TRACE_EVENT(erofs_lookup, @@ -66,8 +71,8 @@ TRACE_EVENT(erofs_fill_inode, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; - __entry->blkaddr = erofs_blknr(iloc(EROFS_I_SB(inode), __entry->nid)); - __entry->ofs = erofs_blkoff(iloc(EROFS_I_SB(inode), __entry->nid)); + __entry->blkaddr = erofs_blknr(erofs_iloc(inode)); + __entry->ofs = erofs_blkoff(erofs_iloc(inode)); ), TP_printk("dev = (%d,%d), nid = %llu, blkaddr %u ofs %u", diff --git a/include/trace/stages/stage4_event_fields.h b/include/trace/stages/stage4_event_fields.h index affd541fd25e..b6f679ae21aa 100644 --- a/include/trace/stages/stage4_event_fields.h +++ b/include/trace/stages/stage4_event_fields.h @@ -26,7 +26,8 @@ #define __array(_type, _item, _len) { \ .type = #_type"["__stringify(_len)"]", .name = #_item, \ .size = sizeof(_type[_len]), .align = ALIGN_STRUCTFIELD(_type), \ - .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }, + .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER,\ + .len = _len }, #undef __dynamic_array #define __dynamic_array(_type, _item, _len) { \ diff --git a/include/uapi/drm/virtgpu_drm.h b/include/uapi/drm/virtgpu_drm.h index 0512fde5e697..7b158fcb02b4 100644 --- a/include/uapi/drm/virtgpu_drm.h +++ b/include/uapi/drm/virtgpu_drm.h @@ -64,6 +64,7 @@ struct drm_virtgpu_map { __u32 pad; }; +/* fence_fd is modified on success if VIRTGPU_EXECBUF_FENCE_FD_OUT flag is set. 
*/ struct drm_virtgpu_execbuffer { __u32 flags; __u32 size; diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 436258214bb0..cd14c94e9a1e 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -188,15 +188,43 @@ struct fanotify_event_info_error { __u32 error_count; }; +/* + * User space may need to record additional information about its decision. + * The extra information type records what kind of information is included. + * The default is none. We also define an extra information buffer whose + * size is determined by the extra information type. + * + * If the information type is Audit Rule, then the information following + * is the rule number that triggered the user space decision that + * requires auditing. + */ + +#define FAN_RESPONSE_INFO_NONE 0 +#define FAN_RESPONSE_INFO_AUDIT_RULE 1 + struct fanotify_response { __s32 fd; __u32 response; }; +struct fanotify_response_info_header { + __u8 type; + __u8 pad; + __u16 len; +}; + +struct fanotify_response_info_audit_rule { + struct fanotify_response_info_header hdr; + __u32 rule_number; + __u32 subj_trust; + __u32 obj_trust; +}; + /* Legit userspace responses to a _PERM event */ #define FAN_ALLOW 0x01 #define FAN_DENY 0x02 -#define FAN_AUDIT 0x10 /* Bit mask to create audit record for result */ +#define FAN_AUDIT 0x10 /* Bitmask to create audit record for result */ +#define FAN_INFO 0x20 /* Bitmask to indicate additional information */ /* No fd set in event */ #define FAN_NOFD -1 diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 874a92349bf5..283dec7e3645 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -18,6 +18,7 @@ #ifndef _UAPI_LINUX_IP_H #define _UAPI_LINUX_IP_H #include <linux/types.h> +#include <linux/stddef.h> #include <asm/byteorder.h> #define IPTOS_TOS_MASK 0x1E diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 81f4243bebb1..53326dfc59ec 100644 --- a/include/uapi/linux/ipv6.h +++ 
b/include/uapi/linux/ipv6.h @@ -4,6 +4,7 @@ #include <linux/libc-compat.h> #include <linux/types.h> +#include <linux/stddef.h> #include <linux/in6.h> #include <asm/byteorder.h> diff --git a/ipc/mqueue.c b/ipc/mqueue.c index d09aa1c1e3e6..0160e9f2b07c 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -608,7 +608,7 @@ out_unlock: return error; } -static int mqueue_create(struct user_namespace *mnt_userns, struct inode *dir, +static int mqueue_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return mqueue_create_attr(dentry, mode, NULL); @@ -887,7 +887,7 @@ static int prepare_open(struct dentry *dentry, int oflag, int ro, if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) return -EINVAL; acc = oflag2acc[oflag & O_ACCMODE]; - return inode_permission(&init_user_ns, d_inode(dentry), acc); + return inode_permission(&nop_mnt_idmap, d_inode(dentry), acc); } static int do_mq_open(const char __user *u_name, int oflag, umode_t mode, @@ -979,7 +979,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) err = -ENOENT; } else { ihold(inode); - err = vfs_unlink(&init_user_ns, d_inode(dentry->d_parent), + err = vfs_unlink(&nop_mnt_idmap, d_inode(dentry->d_parent), dentry, NULL); } dput(dentry); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 547c88be8a28..93d0b87f3283 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -64,6 +64,7 @@ #include <uapi/linux/limits.h> #include <uapi/linux/netfilter/nf_tables.h> #include <uapi/linux/openat2.h> // struct open_how +#include <uapi/linux/fanotify.h> #include "audit.h" @@ -2252,7 +2253,7 @@ static inline int audit_copy_fcaps(struct audit_names *name, if (!dentry) return 0; - rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps); + rc = get_vfs_caps_from_disk(&nop_mnt_idmap, dentry, &caps); if (rc) return rc; @@ -2807,7 +2808,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, ax->d.next = context->aux; context->aux = (void *)ax; - get_vfs_caps_from_disk(&init_user_ns, 
+ get_vfs_caps_from_disk(&nop_mnt_idmap, bprm->file->f_path.dentry, &vcaps); ax->fcap.permitted = vcaps.permitted; @@ -2877,10 +2878,21 @@ void __audit_log_kern_module(char *name) context->type = AUDIT_KERN_MODULE; } -void __audit_fanotify(unsigned int response) +void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { - audit_log(audit_context(), GFP_KERNEL, - AUDIT_FANOTIFY, "resp=%u", response); + /* {subj,obj}_trust values are {0,1,2}: no,yes,unknown */ + switch (friar->hdr.type) { + case FAN_RESPONSE_INFO_NONE: + audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY, + "resp=%u fan_type=%u fan_info=0 subj_trust=2 obj_trust=2", + response, FAN_RESPONSE_INFO_NONE); + break; + case FAN_RESPONSE_INFO_AUDIT_RULE: + audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY, + "resp=%u fan_type=%u fan_info=%X subj_trust=%u obj_trust=%u", + response, friar->hdr.type, friar->rule_number, + friar->subj_trust, friar->obj_trust); + } } void __audit_tk_injoffset(struct timespec64 offset) diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index a4a41ee3e80b..e14c822f8911 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -51,7 +51,6 @@ BTF_SET_END(bpf_lsm_current_hooks) */ BTF_SET_START(bpf_lsm_locked_sockopt_hooks) #ifdef CONFIG_SECURITY_NETWORK -BTF_ID(func, bpf_lsm_socket_sock_rcv_skb) BTF_ID(func, bpf_lsm_sock_graft) BTF_ID(func, bpf_lsm_inet_csk_clone) BTF_ID(func, bpf_lsm_inet_conn_established) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f7dd8af06413..b7017cae6fd1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7782,9 +7782,9 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL); - return 0; end: - btf_free_dtor_kfunc_tab(btf); + if (ret) + btf_free_dtor_kfunc_tab(btf); btf_put(btf); return ret; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 4f841e16779e..9948b542a470 100644 --- 
a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -122,7 +122,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, inode->i_mtime = inode->i_atime; inode->i_ctime = inode->i_atime; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); return inode; } @@ -152,7 +152,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, dir->i_ctime = dir->i_mtime; } -static int bpf_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -382,7 +382,7 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) return simple_lookup(dir, dentry, flags); } -static int bpf_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *target) { char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); @@ -559,7 +559,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags) static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) { struct bpf_prog *prog; - int ret = inode_permission(&init_user_ns, inode, MAY_READ); + int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ); if (ret) return ERR_PTR(ret); diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index ebcc3dd0fa19..1db156405b68 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -71,7 +71,7 @@ static int bpf_mem_cache_idx(size_t size) if (size <= 192) return size_index[(size - 1) / 8] - 1; - return fls(size - 1) - 1; + return fls(size - 1) - 2; } #define NUM_CACHES 11 diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dbef0b0967ae..7ee218827259 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3243,13 +3243,24 @@ static bool __is_pointer_value(bool allow_ptr_leaks, return reg->type != SCALAR_VALUE; } +/* Copy 
src state preserving dst->parent and dst->live fields */ +static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) +{ + struct bpf_reg_state *parent = dst->parent; + enum bpf_reg_liveness live = dst->live; + + *dst = *src; + dst->parent = parent; + dst->live = live; +} + static void save_register_state(struct bpf_func_state *state, int spi, struct bpf_reg_state *reg, int size) { int i; - state->stack[spi].spilled_ptr = *reg; + copy_register_state(&state->stack[spi].spilled_ptr, reg); if (size == BPF_REG_SIZE) state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; @@ -3577,7 +3588,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, */ s32 subreg_def = state->regs[dst_regno].subreg_def; - state->regs[dst_regno] = *reg; + copy_register_state(&state->regs[dst_regno], reg); state->regs[dst_regno].subreg_def = subreg_def; } else { for (i = 0; i < size; i++) { @@ -3598,7 +3609,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, if (dst_regno >= 0) { /* restore register state from stack */ - state->regs[dst_regno] = *reg; + copy_register_state(&state->regs[dst_regno], reg); /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions @@ -9592,7 +9603,7 @@ do_sim: */ if (!ptr_is_dst_reg) { tmp = *dst_reg; - *dst_reg = *ptr_reg; + copy_register_state(dst_reg, ptr_reg); } ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx); @@ -10845,7 +10856,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * to propagate min/max range. 
*/ src_reg->id = ++env->id_gen; - *dst_reg = *src_reg; + copy_register_state(dst_reg, src_reg); dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { @@ -10856,7 +10867,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg); return -EACCES; } else if (src_reg->type == SCALAR_VALUE) { - *dst_reg = *src_reg; + copy_register_state(dst_reg, src_reg); /* Make sure ID is cleared otherwise * dst_reg min/max could be incorrectly * propagated into src_reg by find_equal_scalars() @@ -11655,7 +11666,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, bpf_for_each_reg_in_vstate(vstate, state, reg, ({ if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) - *reg = *known_reg; + copy_register_state(reg, known_reg); })); } diff --git a/kernel/capability.c b/kernel/capability.c index 860fd22117c1..339a44dfe2f4 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -486,11 +486,11 @@ EXPORT_SYMBOL(file_ns_capable); * Return true if the inode uid and gid are within the namespace. */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const struct inode *inode) { - return vfsuid_has_mapping(ns, i_uid_into_vfsuid(mnt_userns, inode)) && - vfsgid_has_mapping(ns, i_gid_into_vfsgid(mnt_userns, inode)); + return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) && + vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode)); } /** @@ -502,13 +502,13 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, * its own user namespace and that the given inode's uid and gid are * mapped into the current user namespace. 
*/ -bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns, +bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); return ns_capable(ns, cap) && - privileged_wrt_inode_uidgid(ns, mnt_userns, inode); + privileged_wrt_inode_uidgid(ns, idmap, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c099cf3fa02d..935e8121b21e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5065,7 +5065,7 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) if (!inode) return -ENOMEM; - ret = inode_permission(&init_user_ns, inode, MAY_WRITE); + ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE); iput(inode); return ret; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a29c0b13706b..ca826bd1eba3 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1205,12 +1205,13 @@ void rebuild_sched_domains(void) /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed + * @new_cpus: the temp variable for the new effective_cpus mask * * Iterate through each task of @cs updating its cpus_allowed to the * effective cpuset's. As this function is called with cpuset_rwsem held, * cpuset membership stays stable. 
*/ -static void update_tasks_cpumask(struct cpuset *cs) +static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { struct css_task_iter it; struct task_struct *task; @@ -1224,7 +1225,10 @@ static void update_tasks_cpumask(struct cpuset *cs) if (top_cs && (task->flags & PF_KTHREAD) && kthread_is_per_cpu(task)) continue; - set_cpus_allowed_ptr(task, cs->effective_cpus); + + cpumask_and(new_cpus, cs->effective_cpus, + task_cpu_possible_mask(task)); + set_cpus_allowed_ptr(task, new_cpus); } css_task_iter_end(&it); } @@ -1346,7 +1350,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, * A parent can be left with no CPU as long as there is no * task directly associated with the parent partition. */ - if (!cpumask_intersects(cs->cpus_allowed, parent->effective_cpus) && + if (cpumask_subset(parent->effective_cpus, cs->cpus_allowed) && partition_is_populated(parent, cs)) return PERR_NOCPUS; @@ -1509,7 +1513,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, spin_unlock_irq(&callback_lock); if (adding || deleting) - update_tasks_cpumask(parent); + update_tasks_cpumask(parent, tmp->new_cpus); /* * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary. @@ -1661,7 +1665,7 @@ update_parent_subparts: WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); - update_tasks_cpumask(cp); + update_tasks_cpumask(cp, tmp->new_cpus); /* * On legacy hierarchy, if the effective cpumask of any non- @@ -2309,7 +2313,7 @@ static int update_prstate(struct cpuset *cs, int new_prs) } } - update_tasks_cpumask(parent); + update_tasks_cpumask(parent, tmpmask.new_cpus); if (parent->child_ecpus_count) update_sibling_cpumasks(parent, cs, &tmpmask); @@ -2324,6 +2328,7 @@ out: new_prs = -new_prs; spin_lock_irq(&callback_lock); cs->partition_root_state = new_prs; + WRITE_ONCE(cs->prs_err, err); spin_unlock_irq(&callback_lock); /* * Update child cpusets, if present. 
@@ -3347,7 +3352,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, * as the tasks will be migrated to an ancestor. */ if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) - update_tasks_cpumask(cs); + update_tasks_cpumask(cs, new_cpus); if (mems_updated && !nodes_empty(cs->mems_allowed)) update_tasks_nodemask(cs); @@ -3384,7 +3389,7 @@ hotplug_update_tasks(struct cpuset *cs, spin_unlock_irq(&callback_lock); if (cpus_updated) - update_tasks_cpumask(cs); + update_tasks_cpumask(cs, new_cpus); if (mems_updated) update_tasks_nodemask(cs); } @@ -3691,15 +3696,38 @@ void __init cpuset_init_smp(void) * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty * subset of cpu_online_mask, even if this means going outside the - * tasks cpuset. + * tasks cpuset, except when the task is in the top cpuset. **/ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { unsigned long flags; + struct cpuset *cs; spin_lock_irqsave(&callback_lock, flags); - guarantee_online_cpus(tsk, pmask); + rcu_read_lock(); + + cs = task_cs(tsk); + if (cs != &top_cpuset) + guarantee_online_cpus(tsk, pmask); + /* + * Tasks in the top cpuset won't get update to their cpumasks + * when a hotplug online/offline event happens. So we include all + * offline cpus in the allowed cpu list. + */ + if ((cs == &top_cpuset) || cpumask_empty(pmask)) { + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + + /* + * We first exclude cpus allocated to partitions. If there is no + * allowable online cpu left, we fall back to all possible cpus. 
+ */ + cpumask_andnot(pmask, possible_mask, top_cpuset.subparts_cpus); + if (!cpumask_intersects(pmask, cpu_online_mask)) + cpumask_copy(pmask, possible_mask); + } + + rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); } diff --git a/kernel/events/core.c b/kernel/events/core.c index d56328e5080e..c4be13e50547 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4813,19 +4813,17 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); epc = &cpc->epc; - + raw_spin_lock_irq(&ctx->lock); if (!epc->ctx) { atomic_set(&epc->refcount, 1); epc->embedded = 1; - raw_spin_lock_irq(&ctx->lock); list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; - raw_spin_unlock_irq(&ctx->lock); } else { WARN_ON_ONCE(epc->ctx != ctx); atomic_inc(&epc->refcount); } - + raw_spin_unlock_irq(&ctx->lock); return epc; } @@ -4896,33 +4894,30 @@ static void free_epc_rcu(struct rcu_head *head) static void put_pmu_ctx(struct perf_event_pmu_context *epc) { + struct perf_event_context *ctx = epc->ctx; unsigned long flags; - if (!atomic_dec_and_test(&epc->refcount)) + /* + * XXX + * + * lockdep_assert_held(&ctx->mutex); + * + * can't because of the call-site in _free_event()/put_event() + * which isn't always called under ctx->mutex. + */ + if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags)) return; - if (epc->ctx) { - struct perf_event_context *ctx = epc->ctx; + WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); - /* - * XXX - * - * lockdep_assert_held(&ctx->mutex); - * - * can't because of the call-site in _free_event()/put_event() - * which isn't always called under ctx->mutex. 
- */ - - WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); - raw_spin_lock_irqsave(&ctx->lock, flags); - list_del_init(&epc->pmu_ctx_entry); - epc->ctx = NULL; - raw_spin_unlock_irqrestore(&ctx->lock, flags); - } + list_del_init(&epc->pmu_ctx_entry); + epc->ctx = NULL; WARN_ON_ONCE(!list_empty(&epc->pinned_active)); WARN_ON_ONCE(!list_empty(&epc->flexible_active)); + raw_spin_unlock_irqrestore(&ctx->lock, flags); + if (epc->embedded) return; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5c3fb6168eef..798a9042421f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1915,7 +1915,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d) static void debugfs_remove_domain_dir(struct irq_domain *d) { - debugfs_remove(debugfs_lookup(d->name, domain_dir)); + debugfs_lookup_and_remove(d->name, domain_dir); } void __init irq_domain_debugfs_init(struct dentry *root) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 010cf4e6d0b8..728f434de2bb 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -901,8 +901,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * then we need to wake the new top waiter up to try * to get the lock. 
*/ - if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) - wake_up_state(waiter->task, waiter->wake_state); + top_waiter = rt_mutex_top_waiter(lock); + if (prerequeue_top_waiter != top_waiter) + wake_up_state(top_waiter->task, top_waiter->wake_state); raw_spin_unlock_irq(&lock->wait_lock); return 0; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e838feb6adc5..2a4918a1faa9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2951,8 +2951,11 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, } if (!(ctx->flags & SCA_MIGRATE_ENABLE)) { - if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) + if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) { + if (ctx->flags & SCA_USER) + swap(p->user_cpus_ptr, ctx->user_mask); goto out; + } if (WARN_ON_ONCE(p == current && is_migration_disabled(p) && diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 8ac8b81bfee6..02e011cabe91 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1343,10 +1343,11 @@ void psi_trigger_destroy(struct psi_trigger *t) group = t->group; /* - * Wakeup waiters to stop polling. Can happen if cgroup is deleted - * from under a polling process. + * Wakeup waiters to stop polling and clear the queue to prevent it from + * being accessed later. Can happen if cgroup is deleted from under a + * polling process. 
*/ - wake_up_interruptible(&t->event_wait); + wake_up_pollfree(&t->event_wait); mutex_lock(&group->trigger_lock); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5897828b9d7e..7e5dff602585 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -470,11 +470,35 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) } EXPORT_SYMBOL_GPL(alarm_forward); -u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throttle) { struct alarm_base *base = &alarm_bases[alarm->type]; + ktime_t now = base->get_ktime(); + + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && throttle) { + /* + * Same issue as with posix_timer_fn(). Timers which are + * periodic but the signal is ignored can starve the system + * with a very small interval. The real fix which was + * promised in the context of posix_timer_fn() never + * materialized, but someone should really work on it. + * + * To prevent DOS fake @now to be 1 jiffie out which keeps + * the overrun accounting correct but creates an + * inconsistency vs. timer_gettime(2). + */ + ktime_t kj = NSEC_PER_SEC / HZ; + + if (interval < kj) + now = ktime_add(now, kj); + } + + return alarm_forward(alarm, now, interval); +} - return alarm_forward(alarm, base->get_ktime(), interval); +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +{ + return __alarm_forward_now(alarm, interval, false); } EXPORT_SYMBOL_GPL(alarm_forward_now); @@ -551,9 +575,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, if (posix_timer_event(ptr, si_private) && ptr->it_interval) { /* * Handle ignored signals and rearm the timer. This will go - * away once we handle ignored signals proper. + * away once we handle ignored signals proper. Ensure that + * small intervals cannot starve the system. 
*/ - ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval); + ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true); ++ptr->it_requeue_pending; ptr->it_active = 1; result = ALARMTIMER_RESTART; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f47274de012b..c09792c551bf 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -833,6 +833,7 @@ static void do_bpf_send_signal(struct irq_work *entry) work = container_of(entry, struct send_signal_irq_work, irq_work); group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type); + put_task_struct(work->task); } static int bpf_send_signal_common(u32 sig, enum pid_type type) @@ -867,7 +868,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) * to the irq_work. The current task may change when queued * irq works get executed. */ - work->task = current; + work->task = get_task_struct(current); work->sig = sig; work->type = type; irq_work_queue(&work->irq_work); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 78ed5f1baa8c..c9e40f692650 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9148,9 +9148,6 @@ buffer_percent_write(struct file *filp, const char __user *ubuf, if (val > 100) return -EINVAL; - if (!val) - val = 1; - tr->buffer_percent = val; (*ppos)++; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4eb6d6b97a9f..085a31b978a5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1282,6 +1282,7 @@ struct ftrace_event_field { int offset; int size; int is_signed; + int len; }; struct prog_entry; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 33e0b4f8ebe6..6a942fa275c7 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -114,7 +114,7 @@ trace_find_event_field(struct trace_event_call *call, char *name) static int __trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, - int is_signed, int 
filter_type) + int is_signed, int filter_type, int len) { struct ftrace_event_field *field; @@ -133,6 +133,7 @@ static int __trace_define_field(struct list_head *head, const char *type, field->offset = offset; field->size = size; field->is_signed = is_signed; + field->len = len; list_add(&field->link, head); @@ -150,14 +151,28 @@ int trace_define_field(struct trace_event_call *call, const char *type, head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, - is_signed, filter_type); + is_signed, filter_type, 0); } EXPORT_SYMBOL_GPL(trace_define_field); +static int trace_define_field_ext(struct trace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed, + int filter_type, int len) +{ + struct list_head *head; + + if (WARN_ON(!call->class)) + return 0; + + head = trace_get_fields(call); + return __trace_define_field(head, type, name, offset, size, + is_signed, filter_type, len); +} + #define __generic_field(type, item, filter_type) \ ret = __trace_define_field(&ftrace_generic_fields, #type, \ #item, 0, 0, is_signed_type(type), \ - filter_type); \ + filter_type, 0); \ if (ret) \ return ret; @@ -166,7 +181,7 @@ EXPORT_SYMBOL_GPL(trace_define_field); "common_" #item, \ offsetof(typeof(ent), item), \ sizeof(ent.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), FILTER_OTHER, 0); \ if (ret) \ return ret; @@ -1588,12 +1603,17 @@ static int f_show(struct seq_file *m, void *v) seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", field->type, field->name, field->offset, field->size, !!field->is_signed); - else - seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + else if (field->len) + seq_printf(m, "\tfield:%.*s %s[%d];\toffset:%u;\tsize:%u;\tsigned:%d;\n", (int)(array_descriptor - field->type), field->type, field->name, - array_descriptor, field->offset, + field->len, field->offset, field->size, !!field->is_signed); + else + seq_printf(m, 
"\tfield:%.*s %s[];\toffset:%u;\tsize:%u;\tsigned:%d;\n", + (int)(array_descriptor - field->type), + field->type, field->name, + field->offset, field->size, !!field->is_signed); return 0; } @@ -2379,9 +2399,10 @@ event_define_fields(struct trace_event_call *call) } offset = ALIGN(offset, field->align); - ret = trace_define_field(call, field->type, field->name, + ret = trace_define_field_ext(call, field->type, field->name, offset, field->size, - field->is_signed, field->filter_type); + field->is_signed, field->filter_type, + field->len); if (WARN_ON_ONCE(ret)) { pr_err("error code is %d\n", ret); break; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d960f6b11b5e..58f3946081e2 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -111,7 +111,8 @@ static void __always_unused ____ftrace_check_##name(void) \ #define __array(_type, _item, _len) { \ .type = #_type"["__stringify(_len)"]", .name = #_item, \ .size = sizeof(_type[_len]), .align = __alignof__(_type), \ - is_signed_type(_type), .filter_type = FILTER_OTHER }, + is_signed_type(_type), .filter_type = FILTER_OTHER, \ + .len = _len }, #undef __array_desc #define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len) diff --git a/kernel/umh.c b/kernel/umh.c index 850631518665..fbf872c624cb 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -438,21 +438,27 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; - if (wait & UMH_KILLABLE) - state |= TASK_KILLABLE; - if (wait & UMH_FREEZABLE) state |= TASK_FREEZABLE; - retval = wait_for_completion_state(&done, state); - if (!retval) - goto wait_done; - if (wait & UMH_KILLABLE) { + retval = wait_for_completion_state(&done, state | TASK_KILLABLE); + if (!retval) + goto wait_done; + /* umh_complete() will see NULL and free sub_info */ if (xchg(&sub_info->complete, NULL)) goto unlock; + + /* + * fallthrough; in case of 
-ERESTARTSYS now do uninterruptible + * wait_for_completion_state(). Since umh_complete() shall call + * complete() in a moment if xchg() above returned NULL, this + * uninterruptible wait_for_completion_state() will not block + * SIGKILL'ed processes for long. + */ } + wait_for_completion_state(&done, state); wait_done: retval = sub_info->retval; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 61a9425a311f..02ee440f7be3 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -754,6 +754,7 @@ config DEBUG_KMEMLEAK select KALLSYMS select CRC32 select STACKDEPOT + select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF help Say Y here if you want to enable the memory leak detector. The memory allocation/freeing is traced in a way @@ -1207,7 +1208,7 @@ config SCHED_DEBUG depends on DEBUG_KERNEL && PROC_FS default y help - If you say Y here, the /proc/sched_debug file will be provided + If you say Y here, the /sys/kernel/debug/sched file will be provided that can help debug the scheduler. The runtime overhead of this option is minimal. diff --git a/lib/dec_and_lock.c b/lib/dec_and_lock.c index 9555b68bb774..1dcca8f2e194 100644 --- a/lib/dec_and_lock.c +++ b/lib/dec_and_lock.c @@ -49,3 +49,34 @@ int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, return 0; } EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave); + +int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + + /* Otherwise do it the slow way */ + raw_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + raw_spin_unlock(lock); + return 0; +} +EXPORT_SYMBOL(_atomic_dec_and_raw_lock); + +int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, + unsigned long *flags) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. 
it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + + /* Otherwise do it the slow way */ + raw_spin_lock_irqsave(lock, *flags); + if (atomic_dec_and_test(atomic)) + return 1; + raw_spin_unlock_irqrestore(lock, *flags); + return 0; +} +EXPORT_SYMBOL(_atomic_dec_and_raw_lock_irqsave); diff --git a/lib/kunit/assert.c b/lib/kunit/assert.c index f5b50babe38d..05a09652f5a1 100644 --- a/lib/kunit/assert.c +++ b/lib/kunit/assert.c @@ -241,24 +241,34 @@ void kunit_mem_assert_format(const struct kunit_assert *assert, mem_assert = container_of(assert, struct kunit_mem_assert, assert); - string_stream_add(stream, - KUNIT_SUBTEST_INDENT "Expected %s %s %s, but\n", - mem_assert->text->left_text, - mem_assert->text->operation, - mem_assert->text->right_text); + if (!mem_assert->left_value) { + string_stream_add(stream, + KUNIT_SUBTEST_INDENT "Expected %s is not null, but is\n", + mem_assert->text->left_text); + } else if (!mem_assert->right_value) { + string_stream_add(stream, + KUNIT_SUBTEST_INDENT "Expected %s is not null, but is\n", + mem_assert->text->right_text); + } else { + string_stream_add(stream, + KUNIT_SUBTEST_INDENT "Expected %s %s %s, but\n", + mem_assert->text->left_text, + mem_assert->text->operation, + mem_assert->text->right_text); - string_stream_add(stream, KUNIT_SUBSUBTEST_INDENT "%s ==\n", - mem_assert->text->left_text); - kunit_assert_hexdump(stream, mem_assert->left_value, - mem_assert->right_value, mem_assert->size); + string_stream_add(stream, KUNIT_SUBSUBTEST_INDENT "%s ==\n", + mem_assert->text->left_text); + kunit_assert_hexdump(stream, mem_assert->left_value, + mem_assert->right_value, mem_assert->size); - string_stream_add(stream, "\n"); + string_stream_add(stream, "\n"); - string_stream_add(stream, KUNIT_SUBSUBTEST_INDENT "%s ==\n", - mem_assert->text->right_text); - kunit_assert_hexdump(stream, mem_assert->right_value, - mem_assert->left_value, mem_assert->size); + string_stream_add(stream, KUNIT_SUBSUBTEST_INDENT "%s ==\n", + 
mem_assert->text->right_text); + kunit_assert_hexdump(stream, mem_assert->right_value, + mem_assert->left_value, mem_assert->size); - kunit_assert_print_msg(message, stream); + kunit_assert_print_msg(message, stream); + } } EXPORT_SYMBOL_GPL(kunit_mem_assert_format); diff --git a/lib/kunit/test.c b/lib/kunit/test.c index c9ebf975e56b..890ba5b3a981 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -21,6 +21,7 @@ #include "try-catch-impl.h" DEFINE_STATIC_KEY_FALSE(kunit_running); +EXPORT_SYMBOL_GPL(kunit_running); #if IS_BUILTIN(CONFIG_KUNIT) /* diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 26e2045d3cda..5a976393c9ae 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -670,12 +670,13 @@ static inline unsigned long mte_pivot(const struct maple_enode *mn, unsigned char piv) { struct maple_node *node = mte_to_node(mn); + enum maple_type type = mte_node_type(mn); - if (piv >= mt_pivots[piv]) { + if (piv >= mt_pivots[type]) { WARN_ON(1); return 0; } - switch (mte_node_type(mn)) { + switch (type) { case maple_arange_64: return node->ma64.pivot[piv]; case maple_range_64: @@ -4887,7 +4888,7 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) unsigned long *pivots, *gaps; void __rcu **slots; unsigned long gap = 0; - unsigned long max, min, index; + unsigned long max, min; unsigned char offset; if (unlikely(mas_is_err(mas))) @@ -4909,8 +4910,7 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) min = mas_safe_min(mas, pivots, --offset); max = mas_safe_pivot(mas, pivots, offset, type); - index = mas->index; - while (index <= max) { + while (mas->index <= max) { gap = 0; if (gaps) gap = gaps[offset]; @@ -4941,10 +4941,8 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) min = mas_safe_min(mas, pivots, offset); } - if (unlikely(index > max)) { - mas_set_err(mas, -EBUSY); - return false; - } + if (unlikely((mas->index > max) || (size - 1 > max - mas->index))) + goto no_space; if 
(unlikely(ma_is_leaf(type))) { mas->offset = offset; @@ -4961,9 +4959,11 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) return false; ascend: - if (mte_is_root(mas->node)) - mas_set_err(mas, -EBUSY); + if (!mte_is_root(mas->node)) + return false; +no_space: + mas_set_err(mas, -EBUSY); return false; } diff --git a/lib/parser.c b/lib/parser.c index bcb23484100e..2b5e2b480253 100644 --- a/lib/parser.c +++ b/lib/parser.c @@ -11,6 +11,15 @@ #include <linux/slab.h> #include <linux/string.h> +/* + * max size needed by different bases to express U64 + * HEX: "0xFFFFFFFFFFFFFFFF" --> 18 + * DEC: "18446744073709551615" --> 20 + * OCT: "01777777777777777777777" --> 23 + * pick the max one to define NUMBER_BUF_LEN + */ +#define NUMBER_BUF_LEN 24 + /** * match_one - Determines if a string matches a simple pattern * @s: the string to examine for presence of the pattern @@ -129,14 +138,12 @@ EXPORT_SYMBOL(match_token); static int match_number(substring_t *s, int *result, int base) { char *endp; - char *buf; + char buf[NUMBER_BUF_LEN]; int ret; long val; - buf = match_strdup(s); - if (!buf) - return -ENOMEM; - + if (match_strlcpy(buf, s, NUMBER_BUF_LEN) >= NUMBER_BUF_LEN) + return -ERANGE; ret = 0; val = simple_strtol(buf, &endp, base); if (endp == buf) @@ -145,7 +152,6 @@ static int match_number(substring_t *s, int *result, int base) ret = -ERANGE; else *result = (int) val; - kfree(buf); return ret; } @@ -163,18 +169,15 @@ static int match_number(substring_t *s, int *result, int base) */ static int match_u64int(substring_t *s, u64 *result, int base) { - char *buf; + char buf[NUMBER_BUF_LEN]; int ret; u64 val; - buf = match_strdup(s); - if (!buf) - return -ENOMEM; - + if (match_strlcpy(buf, s, NUMBER_BUF_LEN) >= NUMBER_BUF_LEN) + return -ERANGE; ret = kstrtoull(buf, base, &val); if (!ret) *result = val; - kfree(buf); return ret; } @@ -206,14 +209,12 @@ EXPORT_SYMBOL(match_int); */ int match_uint(substring_t *s, unsigned int *result) { - int err = -ENOMEM; - 
char *buf = match_strdup(s); + char buf[NUMBER_BUF_LEN]; - if (buf) { - err = kstrtouint(buf, 10, result); - kfree(buf); - } - return err; + if (match_strlcpy(buf, s, NUMBER_BUF_LEN) >= NUMBER_BUF_LEN) + return -ERANGE; + + return kstrtouint(buf, 10, result); } EXPORT_SYMBOL(match_uint); diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 497fc93ccf9e..ec847bf4dcb4 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -2517,6 +2517,91 @@ static noinline void check_bnode_min_spanning(struct maple_tree *mt) mt_set_non_kernel(0); } +static noinline void check_empty_area_window(struct maple_tree *mt) +{ + unsigned long i, nr_entries = 20; + MA_STATE(mas, mt, 0, 0); + + for (i = 1; i <= nr_entries; i++) + mtree_store_range(mt, i*10, i*10 + 9, + xa_mk_value(i), GFP_KERNEL); + + /* Create another hole besides the one at 0 */ + mtree_store_range(mt, 160, 169, NULL, GFP_KERNEL); + + /* Check lower bounds that don't fit */ + rcu_read_lock(); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 90, 10) != -EBUSY); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 6, 90, 5) != -EBUSY); + + /* Check lower bound that does fit */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 90, 5) != 0); + MT_BUG_ON(mt, mas.index != 5); + MT_BUG_ON(mt, mas.last != 9); + rcu_read_unlock(); + + /* Check one gap that doesn't fit and one that does */ + rcu_read_lock(); + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 217, 9) != 0); + MT_BUG_ON(mt, mas.index != 161); + MT_BUG_ON(mt, mas.last != 169); + + /* Check one gap that does fit above the min */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 218, 3) != 0); + MT_BUG_ON(mt, mas.index != 216); + MT_BUG_ON(mt, mas.last != 218); + + /* Check size that doesn't fit any gap */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 218, 16) != -EBUSY); + + /* + * Check size that doesn't fit the lower end of the window but + * does fit the gap + */ + mas_reset(&mas); + 
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 167, 200, 4) != -EBUSY); + + /* + * Check size that doesn't fit the upper end of the window but + * does fit the gap + */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 162, 4) != -EBUSY); + + /* Check mas_empty_area forward */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 9) != 0); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 8); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 4) != 0); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 3); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 11) != -EBUSY); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 5, 100, 6) != -EBUSY); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 8, 10) != -EBUSY); + + mas_reset(&mas); + mas_empty_area(&mas, 100, 165, 3); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 100, 163, 6) != -EBUSY); + rcu_read_unlock(); +} + static DEFINE_MTREE(tree); static int maple_tree_seed(void) { @@ -2765,6 +2850,10 @@ static int maple_tree_seed(void) check_bnode_min_spanning(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_empty_area_window(&tree); + mtree_destroy(&tree); + #if defined(BENCH) skip: #endif diff --git a/mm/filemap.c b/mm/filemap.c index c4d4ace9cc70..0e20a8d6dd93 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2588,18 +2588,19 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, struct folio *folio; int err = 0; + /* "last_index" is the index of the page beyond the end of the read */ last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE); retry: if (fatal_signal_pending(current)) return -EINTR; - filemap_get_read_batch(mapping, index, last_index, fbatch); + filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; page_cache_sync_readahead(mapping, ra, filp, index, 
last_index - index); - filemap_get_read_batch(mapping, index, last_index, fbatch); + filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) @@ -1914,7 +1914,7 @@ static unsigned long collect_longterm_unpinnable_pages( drain_allow = false; } - if (!folio_isolate_lru(folio)) + if (folio_isolate_lru(folio)) continue; list_add_tail(&folio->lru, movable_page_list); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index abe6cfd92ffa..1b791b26d72d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3272,8 +3272,6 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); - if (is_writable_migration_entry(entry)) - pmde = maybe_pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); if (!is_migration_entry_young(entry)) @@ -3281,6 +3279,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) /* NOTE: this may contain setting soft-dirty on some archs */ if (PageDirty(new) && is_migration_entry_dirty(entry)) pmde = pmd_mkdirty(pmde); + if (is_writable_migration_entry(entry)) + pmde = maybe_pmd_mkwrite(pmde, vma); + else + pmde = pmd_wrprotect(pmde); if (PageAnon(new)) { rmap_t rmap_flags = RMAP_COMPOUND; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7fcdb98c9e68..bdbfeb6fb393 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5051,6 +5051,9 @@ again: entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry); } else if (unlikely(is_pte_marker(entry))) { + /* No swap on hugetlb */ + WARN_ON_ONCE( + is_swapin_error_entry(pte_to_swp_entry(entry))); /* * We copy the pte marker only if the dst vma has * uffd-wp enabled. 
diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 833bf2cfd2a3..21e66d7f261d 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -246,6 +246,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) { + if (!kasan_arch_is_ready()) + return false; + if (ptr != page_address(virt_to_head_page(ptr))) { kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE); return true; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index b076f597a378..cb762982c8ba 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -191,7 +191,12 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write, bool kasan_byte_accessible(const void *addr) { - s8 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr)); + s8 shadow_byte; + + if (!kasan_arch_is_ready()) + return true; + + shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr)); return shadow_byte >= 0 && shadow_byte < KASAN_GRANULE_SIZE; } diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 2fba1f51f042..15cfb34d16a1 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -291,6 +291,9 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) unsigned long shadow_start, shadow_end; int ret; + if (!kasan_arch_is_ready()) + return 0; + if (!is_vmalloc_or_module_addr((void *)addr)) return 0; @@ -459,6 +462,9 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long region_start, region_end; unsigned long size; + if (!kasan_arch_is_ready()) + return; + region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE); region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE); @@ -502,6 +508,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. 
*/ + if (!kasan_arch_is_ready()) + return (void *)start; + if (!is_vmalloc_or_module_addr(start)) return (void *)start; @@ -524,6 +533,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, */ void __kasan_poison_vmalloc(const void *start, unsigned long size) { + if (!kasan_arch_is_ready()) + return; + if (!is_vmalloc_or_module_addr(start)) return; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 79be13133322..a26a28e3738c 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -847,6 +847,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_SUCCEED; } +/* + * See pmd_trans_unstable() for how the result may change out from + * underneath us, even if we hold mmap_lock in read. + */ static int find_pmd_or_thp_or_none(struct mm_struct *mm, unsigned long address, pmd_t **pmd) @@ -865,8 +869,12 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, #endif if (pmd_none(pmde)) return SCAN_PMD_NONE; + if (!pmd_present(pmde)) + return SCAN_PMD_NULL; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; + if (pmd_devmap(pmde)) + return SCAN_PMD_NULL; if (pmd_bad(pmde)) return SCAN_PMD_NULL; return SCAN_SUCCEED; @@ -1642,7 +1650,7 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, * has higher cost too. It would also probably require locking * the anon_vma. */ - if (vma->anon_vma) { + if (READ_ONCE(vma->anon_vma)) { result = SCAN_PAGE_ANON; goto next; } @@ -1671,6 +1679,18 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, if ((cc->is_khugepaged || is_target) && mmap_write_trylock(mm)) { /* + * Re-check whether we have an ->anon_vma, because + * collapse_and_free_pmd() requires that either no + * ->anon_vma exists or the anon_vma is locked. + * We already checked ->anon_vma above, but that check + * is racy because ->anon_vma can be populated under the + * mmap lock in read mode. 
+ */ + if (vma->anon_vma) { + result = SCAN_PAGE_ANON; + goto unlock_next; + } + /* * When a vma is registered with uffd-wp, we can't * recycle the pmd pgtable because there can be pte * markers installed. Skip it only, so the rest mm/vma @@ -2591,6 +2611,7 @@ static int madvise_collapse_errno(enum scan_result r) case SCAN_CGROUP_CHARGE_FAIL: return -EBUSY; /* Resource temporary unavailable - trying again might succeed */ + case SCAN_PAGE_COUNT: case SCAN_PAGE_LOCK: case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 92f670edbf51..55dc8b8b0616 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -2070,8 +2070,10 @@ static int __init kmemleak_boot_config(char *str) return -EINVAL; if (strcmp(str, "off") == 0) kmemleak_disable(); - else if (strcmp(str, "on") == 0) + else if (strcmp(str, "on") == 0) { kmemleak_skip_disable = 1; + stack_depot_want_early_init(); + } else return -EINVAL; return 0; @@ -2093,7 +2095,6 @@ void __init kmemleak_init(void) if (kmemleak_error) return; - stack_depot_init(); jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); @@ -2629,8 +2629,11 @@ struct page *ksm_might_need_to_copy(struct page *page, new_page = NULL; } if (new_page) { - copy_user_highpage(new_page, page, address, vma); - + if (copy_mc_user_highpage(new_page, page, address, vma)) { + put_page(new_page); + memory_failure_queue(page_to_pfn(page), 0); + return ERR_PTR(-EHWPOISON); + } SetPageDirty(new_page); __SetPageUptodate(new_page); __SetPageLocked(new_page); diff --git a/mm/madvise.c b/mm/madvise.c index b6ea204d4e23..18c2e2affac4 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -329,7 +329,7 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma) * otherwise we'd be including shared non-exclusive mappings, which * opens a side channel. 
*/ - return inode_owner_or_capable(&init_user_ns, + return inode_owner_or_capable(&nop_mnt_idmap, file_inode(vma->vm_file)) || file_permission(vma->vm_file, MAY_WRITE) == 0; } diff --git a/mm/memblock.c b/mm/memblock.c index 685e30e6d27c..d036c7861310 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1640,13 +1640,7 @@ void __init memblock_free_late(phys_addr_t base, phys_addr_t size) end = PFN_DOWN(base + size); for (; cursor < end; cursor++) { - /* - * Reserved pages are always initialized by the end of - * memblock_free_all() (by memmap_init() and, if deferred - * initialization is enabled, memmap_init_reserved_pages()), so - * these pages can be released directly to the buddy allocator. - */ - __free_pages_core(pfn_to_page(cursor), 0); + memblock_free_pages(pfn_to_page(cursor), cursor, 0); totalram_pages_inc(); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ab457f0394ab..73afff8062f9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -63,7 +63,6 @@ #include <linux/resume_user_mode.h> #include <linux/psi.h> #include <linux/seq_buf.h> -#include <linux/parser.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -2393,8 +2392,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, - MEMCG_RECLAIM_MAY_SWAP, - NULL); + MEMCG_RECLAIM_MAY_SWAP); psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2685,8 +2683,7 @@ retry: psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, - gfp_mask, reclaim_options, - NULL); + gfp_mask, reclaim_options); psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) @@ -3506,8 +3503,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, } if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, - NULL)) { + memsw ? 
0 : MEMCG_RECLAIM_MAY_SWAP)) { ret = -EBUSY; break; } @@ -3618,8 +3614,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return -EINTR; if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - MEMCG_RECLAIM_MAY_SWAP, - NULL)) + MEMCG_RECLAIM_MAY_SWAP)) nr_retries--; } @@ -6429,8 +6424,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, } reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, - NULL); + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); if (!reclaimed && !nr_retries--) break; @@ -6479,8 +6473,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_reclaims) { if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, - NULL)) + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) nr_reclaims--; continue; } @@ -6603,54 +6596,21 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } -enum { - MEMORY_RECLAIM_NODES = 0, - MEMORY_RECLAIM_NULL, -}; - -static const match_table_t if_tokens = { - { MEMORY_RECLAIM_NODES, "nodes=%s" }, - { MEMORY_RECLAIM_NULL, NULL }, -}; - static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; - unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP | - MEMCG_RECLAIM_PROACTIVE; - char *old_buf, *start; - substring_t args[MAX_OPT_ARGS]; - int token; - char value[256]; - nodemask_t nodemask = NODE_MASK_ALL; - - buf = strstrip(buf); - - old_buf = buf; - nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; - if (buf == old_buf) - return -EINVAL; + unsigned int reclaim_options; + int err; buf = strstrip(buf); + err = page_counter_memparse(buf, "", &nr_to_reclaim); + if (err) + return err; - while ((start = strsep(&buf, " ")) != NULL) { - if (!strlen(start)) - continue; - token = match_token(start, if_tokens, 
args); - match_strlcpy(value, args, sizeof(value)); - switch (token) { - case MEMORY_RECLAIM_NODES: - if (nodelist_parse(value, nodemask) < 0) - return -EINVAL; - break; - default: - return -EINVAL; - } - } - + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; while (nr_reclaimed < nr_to_reclaim) { unsigned long reclaimed; @@ -6667,8 +6627,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim - nr_reclaimed, - GFP_KERNEL, reclaim_options, - &nodemask); + GFP_KERNEL, reclaim_options); if (!reclaimed && !nr_retries--) return -EAGAIN; diff --git a/mm/memory.c b/mm/memory.c index aad226daf41b..f526b9152bef 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -828,12 +828,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, return -EBUSY; return -ENOENT; } else if (is_pte_marker_entry(entry)) { - /* - * We're copying the pgtable should only because dst_vma has - * uffd-wp enabled, do sanity check. - */ - WARN_ON_ONCE(!userfaultfd_wp(dst_vma)); - set_pte_at(dst_mm, addr, dst_pte, pte); + if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma)) + set_pte_at(dst_mm, addr, dst_pte, pte); return 0; } if (!userfaultfd_wp(dst_vma)) @@ -3629,8 +3625,12 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) /* * Be careful so that we will only recover a special uffd-wp pte into a * none pte. Otherwise it means the pte could have changed, so retry. + * + * This should also cover the case where e.g. the pte changed + * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR. + * So is_pte_marker() check is not enough to safely drop the pte. 
*/ - if (is_pte_marker(*vmf->pte)) + if (pte_same(vmf->orig_pte, *vmf->pte)) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -3840,6 +3840,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!page)) { ret = VM_FAULT_OOM; goto out_page; + } else if (unlikely(PTR_ERR(page) == -EHWPOISON)) { + ret = VM_FAULT_HWPOISON; + goto out_page; } folio = page_folio(page); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 02c8a712282f..f940395667c8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -600,7 +600,8 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ if (flags & (MPOL_MF_MOVE_ALL) || - (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) { + (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 && + !hugetlb_pmd_shared(pte))) { if (isolate_hugetlb(page, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* diff --git a/mm/migrate.c b/mm/migrate.c index a4d3fc65085f..cc5455614e01 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -224,6 +224,8 @@ static bool remove_migration_pte(struct folio *folio, pte = maybe_mkwrite(pte, vma); else if (pte_swp_uffd_wp(*pvmw.pte)) pte = pte_mkuffd_wp(pte); + else + pte = pte_wrprotect(pte); if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) rmap_flags |= RMAP_EXCLUSIVE; diff --git a/mm/mincore.c b/mm/mincore.c index a085a2aeabd8..cd69b9db0081 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -168,7 +168,7 @@ static inline bool can_do_mincore(struct vm_area_struct *vma) * for writing; otherwise we'd be including shared non-exclusive * mappings, which opens a side channel. 
*/ - return inode_owner_or_capable(&init_user_ns, + return inode_owner_or_capable(&nop_mnt_idmap, file_inode(vma->vm_file)) || file_permission(vma->vm_file, MAY_WRITE) == 0; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 908df12caa26..61cf60015a8b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -245,7 +245,13 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, newpte = pte_swp_mksoft_dirty(newpte); if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); - } else if (pte_marker_entry_uffd_wp(entry)) { + } else if (is_pte_marker_entry(entry)) { + /* + * Ignore swapin errors unconditionally, + * because any access should sigbus anyway. + */ + if (is_swapin_error_entry(entry)) + continue; /* * If this is uffd-wp pte marker and we'd like * to unprotect it, drop it; the next page diff --git a/mm/mremap.c b/mm/mremap.c index fe587c5d6591..930f65c315c0 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1027,16 +1027,29 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } /* - * Function vma_merge() is called on the extension we are adding to - * the already existing vma, vma_merge() will merge this extension with - * the already existing vma (expand operation itself) and possibly also - * with the next vma if it becomes adjacent to the expanded vma and - * otherwise compatible. + * Function vma_merge() is called on the extension we + * are adding to the already existing vma, vma_merge() + * will merge this extension with the already existing + * vma (expand operation itself) and possibly also with + * the next vma if it becomes adjacent to the expanded + * vma and otherwise compatible. + * + * However, vma_merge() can currently fail due to + * is_mergeable_vma() check for vm_ops->close (see the + * comment there). Yet this should not prevent vma + * expanding, so perform a simple expand for such vma. + * Ideally the check for close op should be only done + * when a vma would be actually removed due to a merge. 
*/ - vma = vma_merge(mm, vma, extension_start, extension_end, + if (!vma->vm_ops || !vma->vm_ops->close) { + vma = vma_merge(mm, vma, extension_start, extension_end, vma->vm_flags, vma->anon_vma, vma->vm_file, extension_pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + } else if (vma_adjust(vma, vma->vm_start, addr + new_len, + vma->vm_pgoff, NULL)) { + vma = NULL; + } if (!vma) { vm_unacct_memory(pages); ret = -ENOMEM; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0745aedebb37..3bb3484563ed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5631,9 +5631,12 @@ EXPORT_SYMBOL(get_zeroed_page); */ void __free_pages(struct page *page, unsigned int order) { + /* get PageHead before we drop reference */ + int head = PageHead(page); + if (put_page_testzero(page)) free_the_page(page, order); - else if (!PageHead(page)) + else if (!head) while (order-- > 0) free_the_page(page + (1 << order), order); } diff --git a/mm/secretmem.c b/mm/secretmem.c index 04c3ac9448a1..afcf46e99cda 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -162,7 +162,7 @@ const struct address_space_operations secretmem_aops = { .migrate_folio = secretmem_migrate_folio, }; -static int secretmem_setattr(struct user_namespace *mnt_userns, +static int secretmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); @@ -175,7 +175,7 @@ static int secretmem_setattr(struct user_namespace *mnt_userns, if ((ia_valid & ATTR_SIZE) && inode->i_size) ret = -EINVAL; else - ret = simple_setattr(mnt_userns, dentry, iattr); + ret = simple_setattr(idmap, dentry, iattr); filemap_invalidate_unlock(mapping); diff --git a/mm/shmem.c b/mm/shmem.c index 0005ab2c29af..41f82c5a5e28 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1045,7 +1045,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) } EXPORT_SYMBOL_GPL(shmem_truncate_range); -static int shmem_getattr(struct user_namespace *mnt_userns, +static int 
shmem_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -1066,7 +1066,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns, stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(idmap, inode, stat); if (shmem_is_huge(NULL, inode, 0, false)) stat->blksize = HPAGE_PMD_SIZE; @@ -1080,7 +1080,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns, return 0; } -static int shmem_setattr(struct user_namespace *mnt_userns, +static int shmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -1089,7 +1089,7 @@ static int shmem_setattr(struct user_namespace *mnt_userns, bool update_mtime = false; bool update_ctime = true; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(idmap, dentry, attr); if (error) return error; @@ -1127,9 +1127,9 @@ static int shmem_setattr(struct user_namespace *mnt_userns, } } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(idmap, inode, attr); if (attr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + error = posix_acl_chmod(idmap, dentry, inode->i_mode); if (!error && update_ctime) { inode->i_ctime = current_time(inode); if (update_mtime) @@ -2327,8 +2327,9 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) #define shmem_initxattrs NULL #endif -static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, - umode_t mode, dev_t dev, unsigned long flags) +static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, + struct inode *dir, umode_t mode, dev_t dev, + unsigned long flags) { struct inode *inode; struct shmem_inode_info *info; @@ -2341,7 +2342,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, 
inode = new_inode(sb); if (inode) { inode->i_ino = ino; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_generation = get_random_u32(); @@ -2913,13 +2914,13 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) * File creation. Allocate an inode, and we're done.. */ static int -shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir, +shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; int error = -ENOSPC; - inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); + inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { error = simple_acl_create(dir, inode); if (error) @@ -2944,13 +2945,13 @@ out_iput: } static int -shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; int error = -ENOSPC; - inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE); + inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); if (inode) { error = security_inode_init_security(inode, dir, NULL, @@ -2968,22 +2969,22 @@ out_iput: return error; } -static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int error; - if ((error = shmem_mknod(&init_user_ns, dir, dentry, - mode | S_IFDIR, 0))) + error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); + if (error) return error; inc_nlink(dir); return 0; } -static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir, +static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return 
shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); + return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0); } /* @@ -3043,7 +3044,7 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) return shmem_unlink(dir, dentry); } -static int shmem_whiteout(struct user_namespace *mnt_userns, +static int shmem_whiteout(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry) { struct dentry *whiteout; @@ -3053,7 +3054,7 @@ static int shmem_whiteout(struct user_namespace *mnt_userns, if (!whiteout) return -ENOMEM; - error = shmem_mknod(&init_user_ns, old_dir, whiteout, + error = shmem_mknod(idmap, old_dir, whiteout, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); dput(whiteout); if (error) @@ -3076,7 +3077,7 @@ static int shmem_whiteout(struct user_namespace *mnt_userns, * it exists so that the VFS layer correctly free's it when it * gets overwritten. */ -static int shmem_rename2(struct user_namespace *mnt_userns, +static int shmem_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -3096,7 +3097,7 @@ static int shmem_rename2(struct user_namespace *mnt_userns, if (flags & RENAME_WHITEOUT) { int error; - error = shmem_whiteout(&init_user_ns, old_dir, old_dentry); + error = shmem_whiteout(idmap, old_dir, old_dentry); if (error) return error; } @@ -3122,7 +3123,7 @@ static int shmem_rename2(struct user_namespace *mnt_userns, return 0; } -static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int error; @@ -3134,7 +3135,7 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (len > PAGE_SIZE) return -ENAMETOOLONG; - inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0, + inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, VM_NORESERVE); if (!inode) return 
-ENOSPC; @@ -3227,7 +3228,7 @@ static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -static int shmem_fileattr_set(struct user_namespace *mnt_userns, +static int shmem_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -3301,7 +3302,7 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler, } static int shmem_xattr_handler_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -3817,7 +3818,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) #endif uuid_gen(&sb->s_uuid); - inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); + inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, + VM_NORESERVE); if (!inode) goto failed; inode->i_uid = sbinfo->uid; @@ -4042,7 +4044,11 @@ static struct file_system_type shmem_fs_type = { .parameters = shmem_fs_parameters, #endif .kill_sb = kill_litter_super, +#ifdef CONFIG_SHMEM + .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP, +#else .fs_flags = FS_USERNS_MOUNT, +#endif }; void __init shmem_init(void) @@ -4194,7 +4200,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_vm_ops generic_file_vm_ops #define shmem_anon_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations -#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) +#define shmem_get_inode(idmap, sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) @@ -4217,8 +4223,11 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l if (shmem_acct_size(flags, size)) return ERR_PTR(-ENOMEM); - inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG 
| S_IRWXUGO, 0, - flags); + if (is_idmapped_mnt(mnt)) + return ERR_PTR(-EINVAL); + + inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, + S_IFREG | S_IRWXUGO, 0, flags); if (unlikely(!inode)) { shmem_unacct_size(flags, size); return ERR_PTR(-ENOSPC); diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index b05295bab322..39c3491e28a3 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -246,18 +246,21 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) } EXPORT_SYMBOL(shrinker_debugfs_rename); -void shrinker_debugfs_remove(struct shrinker *shrinker) +struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker) { + struct dentry *entry = shrinker->debugfs_entry; + lockdep_assert_held(&shrinker_rwsem); kfree_const(shrinker->name); shrinker->name = NULL; - if (!shrinker->debugfs_entry) - return; + if (entry) { + ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id); + shrinker->debugfs_entry = NULL; + } - debugfs_remove_recursive(shrinker->debugfs_entry); - ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id); + return entry; } static int __init shrinker_debugfs_init(void) diff --git a/mm/swap.c b/mm/swap.c index 70e2063ef43a..4c03ecab698e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -158,36 +158,6 @@ void put_pages_list(struct list_head *pages) } EXPORT_SYMBOL(put_pages_list); -/* - * get_kernel_pages() - pin kernel pages in memory - * @kiov: An array of struct kvec structures - * @nr_segs: number of segments to pin - * @write: pinning for read/write, currently ignored - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_segs long. - * - * Returns number of pages pinned. This may be fewer than the number requested. - * If nr_segs is 0 or negative, returns 0. If no pages were pinned, returns 0. - * Each page returned must be released with a put_page() call when it is - * finished with. 
- */ -int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, - struct page **pages) -{ - int seg; - - for (seg = 0; seg < nr_segs; seg++) { - if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE)) - return seg; - - pages[seg] = kmap_to_page(kiov[seg].iov_base); - get_page(pages[seg]); - } - - return seg; -} -EXPORT_SYMBOL_GPL(get_kernel_pages); - typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) diff --git a/mm/swapfile.c b/mm/swapfile.c index 908a529bca12..eb9b0bf1fcdd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1100,6 +1100,7 @@ start_over: goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); + cond_resched(); spin_lock(&swap_avail_lock); nextsi: @@ -1763,12 +1764,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, struct page *swapcache; spinlock_t *ptl; pte_t *pte, new_pte; + bool hwposioned = false; int ret = 1; swapcache = page; page = ksm_might_need_to_copy(page, vma, addr); if (unlikely(!page)) return -ENOMEM; + else if (unlikely(PTR_ERR(page) == -EHWPOISON)) + hwposioned = true; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { @@ -1776,15 +1780,19 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } - if (unlikely(!PageUptodate(page))) { - pte_t pteval; + if (unlikely(hwposioned || !PageUptodate(page))) { + swp_entry_t swp_entry; dec_mm_counter(vma->vm_mm, MM_SWAPENTS); - pteval = swp_entry_to_pte(make_swapin_error_entry()); - set_pte_at(vma->vm_mm, addr, pte, pteval); - swap_free(entry); + if (hwposioned) { + swp_entry = make_hwpoison_entry(swapcache); + page = swapcache; + } else { + swp_entry = make_swapin_error_entry(); + } + new_pte = swp_entry_to_pte(swp_entry); ret = 0; - goto out; + goto setpte; } /* See do_swap_page() */ @@ -1816,6 +1824,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 
new_pte = pte_mksoft_dirty(new_pte); if (pte_swp_uffd_wp(*pte)) new_pte = pte_mkuffd_wp(new_pte); +setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); swap_free(entry); out: diff --git a/mm/vmscan.c b/mm/vmscan.c index bd6637fcd8f9..5b7b8d4f5297 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -741,6 +741,8 @@ EXPORT_SYMBOL(register_shrinker); */ void unregister_shrinker(struct shrinker *shrinker) { + struct dentry *debugfs_entry; + if (!(shrinker->flags & SHRINKER_REGISTERED)) return; @@ -749,9 +751,11 @@ void unregister_shrinker(struct shrinker *shrinker) shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); - shrinker_debugfs_remove(shrinker); + debugfs_entry = shrinker_debugfs_remove(shrinker); up_write(&shrinker_rwsem); + debugfs_remove_recursive(debugfs_entry); + kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; } @@ -3323,13 +3327,16 @@ void lru_gen_migrate_mm(struct mm_struct *mm) if (mem_cgroup_disabled()) return; + /* migration can happen before addition */ + if (!mm->lru_gen.memcg) + return; + rcu_read_lock(); memcg = mem_cgroup_from_task(task); rcu_read_unlock(); if (memcg == mm->lru_gen.memcg) return; - VM_WARN_ON_ONCE(!mm->lru_gen.memcg); VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); lru_gen_del_mm(mm); @@ -6754,8 +6761,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options, - nodemask_t *nodemask) + unsigned int reclaim_options) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; @@ -6770,7 +6776,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), - .nodemask = nodemask, }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put diff --git 
a/mm/zsmalloc.c b/mm/zsmalloc.c index 9445bee6b014..702bc3fd687a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -113,7 +113,23 @@ * have room for two bit at least. */ #define OBJ_ALLOCATED_TAG 1 -#define OBJ_TAG_BITS 1 + +#ifdef CONFIG_ZPOOL +/* + * The second least-significant bit in the object's header identifies if the + * value stored at the header is a deferred handle from the last reclaim + * attempt. + * + * As noted above, this is valid because we have room for two bits. + */ +#define OBJ_DEFERRED_HANDLE_TAG 2 +#define OBJ_TAG_BITS 2 +#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG) +#else +#define OBJ_TAG_BITS 1 +#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG +#endif /* CONFIG_ZPOOL */ + #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) @@ -222,6 +238,12 @@ struct link_free { * Handle of allocated object. */ unsigned long handle; +#ifdef CONFIG_ZPOOL + /* + * Deferred handle of a reclaimed object. 
+ */ + unsigned long deferred_handle; +#endif }; }; @@ -272,8 +294,6 @@ struct zspage { /* links the zspage to the lru list in the pool */ struct list_head lru; bool under_reclaim; - /* list of unfreed handles whose objects have been reclaimed */ - unsigned long *deferred_handles; #endif struct zs_pool *pool; @@ -897,7 +917,8 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) +static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle, + int tag) { unsigned long handle; struct zspage *zspage = get_zspage(page); @@ -908,13 +929,27 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) } else handle = *(unsigned long *)obj; - if (!(handle & OBJ_ALLOCATED_TAG)) + if (!(handle & tag)) return false; - *phandle = handle & ~OBJ_ALLOCATED_TAG; + /* Clear all tags before returning the handle */ + *phandle = handle & ~OBJ_TAG_MASK; return true; } +static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) +{ + return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG); +} + +#ifdef CONFIG_ZPOOL +static bool obj_stores_deferred_handle(struct page *page, void *obj, + unsigned long *phandle) +{ + return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG); +} +#endif + static void reset_page(struct page *page) { __ClearPageMovable(page); @@ -946,22 +981,36 @@ unlock: } #ifdef CONFIG_ZPOOL +static unsigned long find_deferred_handle_obj(struct size_class *class, + struct page *page, int *obj_idx); + /* * Free all the deferred handles whose objects are freed in zs_free. 
*/ -static void free_handles(struct zs_pool *pool, struct zspage *zspage) +static void free_handles(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) { - unsigned long handle = (unsigned long)zspage->deferred_handles; + int obj_idx = 0; + struct page *page = get_first_page(zspage); + unsigned long handle; - while (handle) { - unsigned long nxt_handle = handle_to_obj(handle); + while (1) { + handle = find_deferred_handle_obj(class, page, &obj_idx); + if (!handle) { + page = get_next_page(page); + if (!page) + break; + obj_idx = 0; + continue; + } cache_free_handle(pool, handle); - handle = nxt_handle; + obj_idx++; } } #else -static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {} +static inline void free_handles(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) {} #endif static void __free_zspage(struct zs_pool *pool, struct size_class *class, @@ -979,7 +1028,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, VM_BUG_ON(fg != ZS_EMPTY); /* Free all deferred handles from zs_free */ - free_handles(pool, zspage); + free_handles(pool, class, zspage); next = page = get_first_page(zspage); do { @@ -1067,7 +1116,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) #ifdef CONFIG_ZPOOL INIT_LIST_HEAD(&zspage->lru); zspage->under_reclaim = false; - zspage->deferred_handles = NULL; #endif set_freeobj(zspage, 0); @@ -1568,7 +1616,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) } EXPORT_SYMBOL_GPL(zs_malloc); -static void obj_free(int class_size, unsigned long obj) +static void obj_free(int class_size, unsigned long obj, unsigned long *handle) { struct link_free *link; struct zspage *zspage; @@ -1582,15 +1630,29 @@ static void obj_free(int class_size, unsigned long obj) zspage = get_zspage(f_page); vaddr = kmap_atomic(f_page); - - /* Insert this object in containing zspage's freelist */ link = (struct link_free *)(vaddr + f_offset); 
- if (likely(!ZsHugePage(zspage))) - link->next = get_freeobj(zspage) << OBJ_TAG_BITS; - else - f_page->index = 0; + + if (handle) { +#ifdef CONFIG_ZPOOL + /* Stores the (deferred) handle in the object's header */ + *handle |= OBJ_DEFERRED_HANDLE_TAG; + *handle &= ~OBJ_ALLOCATED_TAG; + + if (likely(!ZsHugePage(zspage))) + link->deferred_handle = *handle; + else + f_page->index = *handle; +#endif + } else { + /* Insert this object in containing zspage's freelist */ + if (likely(!ZsHugePage(zspage))) + link->next = get_freeobj(zspage) << OBJ_TAG_BITS; + else + f_page->index = 0; + set_freeobj(zspage, f_objidx); + } + kunmap_atomic(vaddr); - set_freeobj(zspage, f_objidx); mod_zspage_inuse(zspage, -1); } @@ -1615,7 +1677,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle) zspage = get_zspage(f_page); class = zspage_class(pool, zspage); - obj_free(class->size, obj); class_stat_dec(class, OBJ_USED, 1); #ifdef CONFIG_ZPOOL @@ -1624,15 +1685,15 @@ void zs_free(struct zs_pool *pool, unsigned long handle) * Reclaim needs the handles during writeback. It'll free * them along with the zspage when it's done with them. * - * Record current deferred handle at the memory location - * whose address is given by handle. + * Record current deferred handle in the object's header. */ - record_obj(handle, (unsigned long)zspage->deferred_handles); - zspage->deferred_handles = (unsigned long *)handle; + obj_free(class->size, obj, &handle); spin_unlock(&pool->lock); return; } #endif + obj_free(class->size, obj, NULL); + fullness = fix_fullness_group(class, zspage); if (fullness == ZS_EMPTY) free_zspage(pool, class, zspage); @@ -1713,11 +1774,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, } /* - * Find alloced object in zspage from index object and + * Find object with a certain tag in zspage from index object and * return handle. 
*/ -static unsigned long find_alloced_obj(struct size_class *class, - struct page *page, int *obj_idx) +static unsigned long find_tagged_obj(struct size_class *class, + struct page *page, int *obj_idx, int tag) { unsigned int offset; int index = *obj_idx; @@ -1728,7 +1789,7 @@ static unsigned long find_alloced_obj(struct size_class *class, offset += class->size * index; while (offset < PAGE_SIZE) { - if (obj_allocated(page, addr + offset, &handle)) + if (obj_tagged(page, addr + offset, &handle, tag)) break; offset += class->size; @@ -1742,6 +1803,28 @@ static unsigned long find_alloced_obj(struct size_class *class, return handle; } +/* + * Find alloced object in zspage from index object and + * return handle. + */ +static unsigned long find_alloced_obj(struct size_class *class, + struct page *page, int *obj_idx) +{ + return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG); +} + +#ifdef CONFIG_ZPOOL +/* + * Find object storing a deferred handle in header in zspage from index object + * and return handle. 
+ */ +static unsigned long find_deferred_handle_obj(struct size_class *class, + struct page *page, int *obj_idx) +{ + return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG); +} +#endif + struct zs_compact_control { /* Source spage for migration which could be a subpage of zspage */ struct page *s_page; @@ -1784,7 +1867,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, zs_object_copy(class, free_obj, used_obj); obj_idx++; record_obj(handle, free_obj); - obj_free(class->size, used_obj); + obj_free(class->size, used_obj, NULL); } /* Remember last position in this iteration */ @@ -2478,6 +2561,90 @@ void zs_destroy_pool(struct zs_pool *pool) EXPORT_SYMBOL_GPL(zs_destroy_pool); #ifdef CONFIG_ZPOOL +static void restore_freelist(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) +{ + unsigned int obj_idx = 0; + unsigned long handle, off = 0; /* off is within-page offset */ + struct page *page = get_first_page(zspage); + struct link_free *prev_free = NULL; + void *prev_page_vaddr = NULL; + + /* in case no free object found */ + set_freeobj(zspage, (unsigned int)(-1UL)); + + while (page) { + void *vaddr = kmap_atomic(page); + struct page *next_page; + + while (off < PAGE_SIZE) { + void *obj_addr = vaddr + off; + + /* skip allocated object */ + if (obj_allocated(page, obj_addr, &handle)) { + obj_idx++; + off += class->size; + continue; + } + + /* free deferred handle from reclaim attempt */ + if (obj_stores_deferred_handle(page, obj_addr, &handle)) + cache_free_handle(pool, handle); + + if (prev_free) + prev_free->next = obj_idx << OBJ_TAG_BITS; + else /* first free object found */ + set_freeobj(zspage, obj_idx); + + prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free); + /* if last free object in a previous page, need to unmap */ + if (prev_page_vaddr) { + kunmap_atomic(prev_page_vaddr); + prev_page_vaddr = NULL; + } + + obj_idx++; + off += class->size; + } + + /* + * Handle the last (full or 
partial) object on this page. + */ + next_page = get_next_page(page); + if (next_page) { + if (!prev_free || prev_page_vaddr) { + /* + * There is no free object in this page, so we can safely + * unmap it. + */ + kunmap_atomic(vaddr); + } else { + /* update prev_page_vaddr since prev_free is on this page */ + prev_page_vaddr = vaddr; + } + } else { /* this is the last page */ + if (prev_free) { + /* + * Reset OBJ_TAG_BITS bit to last link to tell + * whether it's allocated object or not. + */ + prev_free->next = -1UL << OBJ_TAG_BITS; + } + + /* unmap previous page (if not done yet) */ + if (prev_page_vaddr) { + kunmap_atomic(prev_page_vaddr); + prev_page_vaddr = NULL; + } + + kunmap_atomic(vaddr); + } + + page = next_page; + off %= PAGE_SIZE; + } +} + static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries) { int i, obj_idx, ret = 0; @@ -2561,6 +2728,12 @@ next: return 0; } + /* + * Eviction fails on one of the handles, so we need to restore zspage. + * We need to rebuild its freelist (and free stored deferred handles), + * put it back to the correct size class, and add it to the LRU list. 
+ */ + restore_freelist(pool, class, zspage); putback_zspage(class, zspage); list_add(&zspage->lru, &pool->lru); unlock_zspage(zspage); diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index f20f4373ff40..9554abcfd5b4 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -871,6 +871,7 @@ static unsigned int ip_sabotage_in(void *priv, if (nf_bridge && !nf_bridge->in_prerouting && !netif_is_l3_master(skb->dev) && !netif_is_l3_slave(skb->dev)) { + nf_bridge_info_free(skb); state->okfn(state->net, state->sk, skb); return NF_STOLEN; } diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 748be7253248..78c9729a6057 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -1015,6 +1015,7 @@ static void caif_sock_destructor(struct sock *sk) return; } sk_stream_kill_queues(&cf_sk->sk); + WARN_ON_ONCE(sk->sk_forward_alloc); caif_free_client(&cf_sk->layer); } diff --git a/net/can/isotp.c b/net/can/isotp.c index 608f8c24ae46..fc81d77724a1 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -140,7 +140,7 @@ struct isotp_sock { canid_t rxid; ktime_t tx_gap; ktime_t lastrxcf_tstamp; - struct hrtimer rxtimer, txtimer; + struct hrtimer rxtimer, txtimer, txfrtimer; struct can_isotp_options opt; struct can_isotp_fc_options rxfc, txfc; struct can_isotp_ll_options ll; @@ -871,7 +871,7 @@ static void isotp_rcv_echo(struct sk_buff *skb, void *data) } /* start timer to send next consecutive frame with correct delay */ - hrtimer_start(&so->txtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT); + hrtimer_start(&so->txfrtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT); } static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) @@ -879,49 +879,39 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txtimer); struct sock *sk = &so->sk; - enum hrtimer_restart restart = HRTIMER_NORESTART; - switch (so->tx.state) { - 
case ISOTP_SENDING: - - /* cfecho should be consumed by isotp_rcv_echo() here */ - if (!so->cfecho) { - /* start timeout for unlikely lost echo skb */ - hrtimer_set_expires(&so->txtimer, - ktime_add(ktime_get(), - ktime_set(ISOTP_ECHO_TIMEOUT, 0))); - restart = HRTIMER_RESTART; + /* don't handle timeouts in IDLE state */ + if (so->tx.state == ISOTP_IDLE) + return HRTIMER_NORESTART; - /* push out the next consecutive frame */ - isotp_send_cframe(so); - break; - } + /* we did not get any flow control or echo frame in time */ - /* cfecho has not been cleared in isotp_rcv_echo() */ - pr_notice_once("can-isotp: cfecho %08X timeout\n", so->cfecho); - fallthrough; + /* report 'communication error on send' */ + sk->sk_err = ECOMM; + if (!sock_flag(sk, SOCK_DEAD)) + sk_error_report(sk); - case ISOTP_WAIT_FC: - case ISOTP_WAIT_FIRST_FC: + /* reset tx state */ + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); - /* we did not get any flow control frame in time */ + return HRTIMER_NORESTART; +} - /* report 'communication error on send' */ - sk->sk_err = ECOMM; - if (!sock_flag(sk, SOCK_DEAD)) - sk_error_report(sk); +static enum hrtimer_restart isotp_txfr_timer_handler(struct hrtimer *hrtimer) +{ + struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, + txfrtimer); - /* reset tx state */ - so->tx.state = ISOTP_IDLE; - wake_up_interruptible(&so->wait); - break; + /* start echo timeout handling and cover below protocol error */ + hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), + HRTIMER_MODE_REL_SOFT); - default: - WARN_ONCE(1, "can-isotp: tx timer state %08X cfecho %08X\n", - so->tx.state, so->cfecho); - } + /* cfecho should be consumed by isotp_rcv_echo() here */ + if (so->tx.state == ISOTP_SENDING && !so->cfecho) + isotp_send_cframe(so); - return restart; + return HRTIMER_NORESTART; } static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) @@ -1162,6 +1152,10 @@ static int isotp_release(struct socket *sock) /* wait 
for complete transmission of current pdu */ wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + /* force state machines to be idle also when a signal occurred */ + so->tx.state = ISOTP_IDLE; + so->rx.state = ISOTP_IDLE; + spin_lock(&isotp_notifier_lock); while (isotp_busy_notifier == so) { spin_unlock(&isotp_notifier_lock); @@ -1194,6 +1188,7 @@ static int isotp_release(struct socket *sock) } } + hrtimer_cancel(&so->txfrtimer); hrtimer_cancel(&so->txtimer); hrtimer_cancel(&so->rxtimer); @@ -1597,6 +1592,8 @@ static int isotp_init(struct sock *sk) so->rxtimer.function = isotp_rx_timer_handler; hrtimer_init(&so->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); so->txtimer.function = isotp_tx_timer_handler; + hrtimer_init(&so->txfrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + so->txfrtimer.function = isotp_txfr_timer_handler; init_waitqueue_head(&so->wait); spin_lock_init(&so->rx_lock); diff --git a/net/can/j1939/address-claim.c b/net/can/j1939/address-claim.c index f33c47327927..ca4ad6cdd5cb 100644 --- a/net/can/j1939/address-claim.c +++ b/net/can/j1939/address-claim.c @@ -165,6 +165,46 @@ static void j1939_ac_process(struct j1939_priv *priv, struct sk_buff *skb) * leaving this function. */ ecu = j1939_ecu_get_by_name_locked(priv, name); + + if (ecu && ecu->addr == skcb->addr.sa) { + /* The ISO 11783-5 standard, in "4.5.2 - Address claim + * requirements", states: + * d) No CF shall begin, or resume, transmission on the + * network until 250 ms after it has successfully claimed + * an address except when responding to a request for + * address-claimed. + * + * But "Figure 6" and "Figure 7" in "4.5.4.2 - Address-claim + * prioritization" show that the CF begins the transmission + * after 250 ms from the first AC (address-claimed) message + * even if it sends another AC message during that time window + * to resolve the address contention with another CF. 
+ * + * As stated in "4.4.2.3 - Address-claimed message": + * In order to successfully claim an address, the CF sending + * an address claimed message shall not receive a contending + * claim from another CF for at least 250 ms. + * + * As stated in "4.4.3.2 - NAME management (NM) message": + * 1) A commanding CF can + * d) request that a CF with a specified NAME transmit + * the address-claimed message with its current NAME. + * 2) A target CF shall + * d) send an address-claimed message in response to a + * request for a matching NAME + * + * Taking the above arguments into account, the 250 ms wait is + * requested only during network initialization. + * + * Do not restart the timer on AC message if both the NAME and + * the address match and so if the address has already been + * claimed (timer has expired) or the AC message has been sent + * to resolve the contention with another CF (timer is still + * running). + */ + goto out_ecu_put; + } + if (!ecu && j1939_address_is_unicast(skcb->addr.sa)) ecu = j1939_ecu_create_locked(priv, name); diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 5c722b55fe23..fce9b9ebf13f 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1092,10 +1092,6 @@ static bool j1939_session_deactivate(struct j1939_session *session) bool active; j1939_session_list_lock(priv); - /* This function should be called with a session ref-count of at - * least 2. 
- */ - WARN_ON_ONCE(kref_read(&session->kref) < 2); active = j1939_session_deactivate_locked(session); j1939_session_list_unlock(priv); diff --git a/net/can/raw.c b/net/can/raw.c index 81071cdb0301..ba86782ba8bb 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -132,8 +132,8 @@ static void raw_rcv(struct sk_buff *oskb, void *data) return; /* make sure to not pass oversized frames to the socket */ - if ((can_is_canfd_skb(oskb) && !ro->fd_frames && !ro->xl_frames) || - (can_is_canxl_skb(oskb) && !ro->xl_frames)) + if ((!ro->fd_frames && can_is_canfd_skb(oskb)) || + (!ro->xl_frames && can_is_canxl_skb(oskb))) return; /* eliminate multiple filter matches for the same skb */ @@ -670,6 +670,11 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (copy_from_sockptr(&ro->fd_frames, optval, optlen)) return -EFAULT; + /* Enabling CAN XL includes CAN FD */ + if (ro->xl_frames && !ro->fd_frames) { + ro->fd_frames = ro->xl_frames; + return -EINVAL; + } break; case CAN_RAW_XL_FRAMES: @@ -679,6 +684,9 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (copy_from_sockptr(&ro->xl_frames, optval, optlen)) return -EFAULT; + /* Enabling CAN XL includes CAN FD */ + if (ro->xl_frames) + ro->fd_frames = ro->xl_frames; break; case CAN_RAW_JOIN_FILTERS: @@ -786,6 +794,25 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, return 0; } +static bool raw_bad_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu) +{ + /* Classical CAN -> no checks for flags and device capabilities */ + if (can_is_can_skb(skb)) + return false; + + /* CAN FD -> needs to be enabled and a CAN FD or CAN XL device */ + if (ro->fd_frames && can_is_canfd_skb(skb) && + (mtu == CANFD_MTU || can_is_canxl_dev_mtu(mtu))) + return false; + + /* CAN XL -> needs to be enabled and a CAN XL device */ + if (ro->xl_frames && can_is_canxl_skb(skb) && + can_is_canxl_dev_mtu(mtu)) + return false; + + return true; +} + static int raw_sendmsg(struct socket 
*sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; @@ -833,20 +860,8 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) goto free_skb; err = -EINVAL; - if (ro->xl_frames && can_is_canxl_dev_mtu(dev->mtu)) { - /* CAN XL, CAN FD and Classical CAN */ - if (!can_is_canxl_skb(skb) && !can_is_canfd_skb(skb) && - !can_is_can_skb(skb)) - goto free_skb; - } else if (ro->fd_frames && dev->mtu == CANFD_MTU) { - /* CAN FD and Classical CAN */ - if (!can_is_canfd_skb(skb) && !can_is_can_skb(skb)) - goto free_skb; - } else { - /* Classical CAN */ - if (!can_is_can_skb(skb)) - goto free_skb; - } + if (raw_bad_txframe(ro, skb, dev->mtu)) + goto free_skb; sockcm_init(&sockc, sk); if (msg->msg_controllen) { diff --git a/net/core/dev.c b/net/core/dev.c index b76fb37b381e..f23e287602b7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1869,14 +1869,6 @@ static void __move_netdevice_notifier_net(struct net *src_net, __register_netdevice_notifier_net(dst_net, nb, true); } -void move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, - struct notifier_block *nb) -{ - rtnl_lock(); - __move_netdevice_notifier_net(src_net, dst_net, nb); - rtnl_unlock(); -} - int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) @@ -10375,7 +10367,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) - dst[i] = atomic_long_read(&src[i]); + dst[i] = (unsigned long)atomic_long_read(&src[i]); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + n * sizeof(u64), 0, sizeof(*stats64) - n * sizeof(u64)); diff --git a/net/core/devlink.c b/net/core/devlink.c index 032d6d0a5ce6..0bfc144df8b9 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4742,11 +4742,8 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, if (err) return err; - if 
(dest_net && !net_eq(dest_net, curr_net)) { - move_netdevice_notifier_net(curr_net, dest_net, - &devlink->netdevice_nb); + if (dest_net && !net_eq(dest_net, curr_net)) write_pnet(&devlink->_net, dest_net); - } err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); devlink_reload_failed_set(devlink, !!err); @@ -9979,7 +9976,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, goto err_xa_alloc; devlink->netdevice_nb.notifier_call = devlink_netdevice_event; - ret = register_netdevice_notifier_net(net, &devlink->netdevice_nb); + ret = register_netdevice_notifier(&devlink->netdevice_nb); if (ret) goto err_register_netdevice_notifier; @@ -10171,8 +10168,7 @@ void devlink_free(struct devlink *devlink) xa_destroy(&devlink->snapshot_ids); xa_destroy(&devlink->ports); - WARN_ON_ONCE(unregister_netdevice_notifier_net(devlink_net(devlink), - &devlink->netdevice_nb)); + WARN_ON_ONCE(unregister_netdevice_notifier(&devlink->netdevice_nb)); xa_erase(&devlinks, devlink->index); @@ -10503,6 +10499,8 @@ static int devlink_netdevice_event(struct notifier_block *nb, break; case NETDEV_REGISTER: case NETDEV_CHANGENAME: + if (devlink_net(devlink) != dev_net(netdev)) + return NOTIFY_OK; /* Set the netdev on top of previously set type. Note this * event happens also during net namespace change so here * we take into account netdev pointer appearing in this @@ -10512,6 +10510,8 @@ static int devlink_netdevice_event(struct notifier_block *nb, netdev); break; case NETDEV_UNREGISTER: + if (devlink_net(devlink) != dev_net(netdev)) + return NOTIFY_OK; /* Clear netdev pointer, but not the type. This event happens * also during net namespace change so we need to clear * pointer to netdev that is going to another net namespace. 
diff --git a/net/core/gro.c b/net/core/gro.c index 506f83d715f8..4bac7ea6e025 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -162,6 +162,15 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) struct sk_buff *lp; int segs; + /* Do not splice page pool based packets w/ non-page pool + * packets. This can result in reference count issues as page + * pool pages will not decrement the reference count and will + * instead be immediately returned to the pool or have frag + * count decremented. + */ + if (p->pp_recycle != skb->pp_recycle) + return -ETOOMANYREFS; + /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ gro_max_size = READ_ONCE(p->dev->gro_max_size); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f00a79fc301b..4edd2176e238 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -269,7 +269,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) (n->nud_state == NUD_NOARP) || (tbl->is_multicast && tbl->is_multicast(n->primary_key)) || - time_after(tref, n->updated)) + !time_in_range(n->updated, tref, jiffies)) remove = true; write_unlock(&n->lock); @@ -289,7 +289,17 @@ static int neigh_forced_gc(struct neigh_table *tbl) static void neigh_add_timer(struct neighbour *n, unsigned long when) { + /* Use safe distance from the jiffies - LONG_MAX point while timer + * is running in DELAY/PROBE state but still show to user space + * large times in the past. 
+ */ + unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ); + neigh_hold(n); + if (!time_in_range(n->confirmed, mint, jiffies)) + n->confirmed = mint; + if (time_before(n->used, n->confirmed)) + n->used = n->confirmed; if (unlikely(mod_timer(&n->timer, when))) { printk("NEIGH: BUG, double timer add, state is %x\n", n->nud_state); @@ -1001,12 +1011,14 @@ static void neigh_periodic_work(struct work_struct *work) goto next_elt; } - if (time_before(n->used, n->confirmed)) + if (time_before(n->used, n->confirmed) && + time_is_before_eq_jiffies(n->confirmed)) n->used = n->confirmed; if (refcount_read(&n->refcnt) == 1 && (state == NUD_FAILED || - time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { + !time_in_range_open(jiffies, n->used, + n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { *np = n->next; neigh_mark_dead(n); write_unlock(&n->lock); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 078a0a420c8a..7b69cf882b8e 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -304,6 +304,12 @@ struct net *get_net_ns_by_id(const struct net *net, int id) } EXPORT_SYMBOL_GPL(get_net_ns_by_id); +/* init code that must occur even if setup_net() is not called. */ +static __net_init void preinit_net(struct net *net) +{ + ref_tracker_dir_init(&net->notrefcnt_tracker, 128); +} + /* * setup_net runs the initializers for the network namespace object. 
*/ @@ -316,7 +322,6 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128); - ref_tracker_dir_init(&net->notrefcnt_tracker, 128); refcount_set(&net->passive, 1); get_random_bytes(&net->hash_mix, sizeof(u32)); @@ -472,6 +477,8 @@ struct net *copy_net_ns(unsigned long flags, rv = -ENOMEM; goto dec_ucounts; } + + preinit_net(net); refcount_set(&net->passive, 1); net->ucounts = ucounts; get_user_ns(user_ns); @@ -1118,6 +1125,7 @@ void __init net_ns_init(void) init_net.key_domain = &init_net_key_domain; #endif down_write(&pernet_ops_rwsem); + preinit_net(&init_net); if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4a0eb5593275..a31ff4d83ecc 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4100,7 +4100,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_shinfo(skb)->frag_list = NULL; - do { + while (list_skb) { nskb = list_skb; list_skb = list_skb->next; @@ -4146,8 +4146,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, if (skb_needs_linearize(nskb, features) && __skb_linearize(nskb)) goto err_linearize; - - } while (list_skb); + } skb->truesize = skb->truesize - delta_truesize; skb->data_len = skb->data_len - delta_len; diff --git a/net/core/sock.c b/net/core/sock.c index f954d5893e79..6f27c24016fe 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1531,6 +1531,8 @@ set_sndbuf: ret = -EINVAL; break; } + if ((u8)val == SOCK_TXREHASH_DEFAULT) + val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); /* Paired with READ_ONCE() in tcp_rtx_synack() */ WRITE_ONCE(sk->sk_txrehash, (u8)val); break; @@ -3451,7 +3453,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; - sk->sk_txrehash = SOCK_TXREHASH_DEFAULT; sk_rx_queue_clear(sk); /* 
diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 22fa2c5bc6ec..a68a7290a3b2 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1569,15 +1569,16 @@ void sock_map_unhash(struct sock *sk) psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); - if (sk->sk_prot->unhash) - sk->sk_prot->unhash(sk); - return; + saved_unhash = READ_ONCE(sk->sk_prot)->unhash; + } else { + saved_unhash = psock->saved_unhash; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); } - - saved_unhash = psock->saved_unhash; - sock_map_remove_links(sk, psock); - rcu_read_unlock(); - saved_unhash(sk); + if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) + return; + if (saved_unhash) + saved_unhash(sk); } EXPORT_SYMBOL_GPL(sock_map_unhash); @@ -1590,17 +1591,18 @@ void sock_map_destroy(struct sock *sk) psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); - if (sk->sk_prot->destroy) - sk->sk_prot->destroy(sk); - return; + saved_destroy = READ_ONCE(sk->sk_prot)->destroy; + } else { + saved_destroy = psock->saved_destroy; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + sk_psock_stop(psock); + sk_psock_put(sk, psock); } - - saved_destroy = psock->saved_destroy; - sock_map_remove_links(sk, psock); - rcu_read_unlock(); - sk_psock_stop(psock); - sk_psock_put(sk, psock); - saved_destroy(sk); + if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) + return; + if (saved_destroy) + saved_destroy(sk); } EXPORT_SYMBOL_GPL(sock_map_destroy); @@ -1615,16 +1617,21 @@ void sock_map_close(struct sock *sk, long timeout) if (unlikely(!psock)) { rcu_read_unlock(); release_sock(sk); - return sk->sk_prot->close(sk, timeout); + saved_close = READ_ONCE(sk->sk_prot)->close; + } else { + saved_close = psock->saved_close; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + sk_psock_stop(psock); + release_sock(sk); + cancel_work_sync(&psock->work); + sk_psock_put(sk, psock); } - - saved_close = psock->saved_close; - sock_map_remove_links(sk, psock); - 
rcu_read_unlock(); - sk_psock_stop(psock); - release_sock(sk); - cancel_work_sync(&psock->work); - sk_psock_put(sk, psock); + /* Make sure we do not recurse. This is a bug. + * Leak the socket instead of crashing on a stack overflow. + */ + if (WARN_ON_ONCE(saved_close == sock_map_close)) + return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); diff --git a/net/core/stream.c b/net/core/stream.c index cd06750dd329..434446ab14c5 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -209,7 +209,6 @@ void sk_stream_kill_queues(struct sock *sk) sk_mem_reclaim_final(sk); WARN_ON_ONCE(sk->sk_wmem_queued); - WARN_ON_ONCE(sk->sk_forward_alloc); /* It is _impossible_ for the backlog to contain anything * when we get here. All user references to this socket diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4260fe466993..b9d7c3dd1cb3 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -551,11 +551,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); /* Clone pktoptions received with SYN, if we own the req */ if (*own_req && ireq->pktopts) { - newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC); + newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; - if (newnp->pktoptions) - skb_set_owner_r(newnp->pktoptions, newsk); } return newsk; @@ -615,7 +613,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) --ANK (980728) */ if (np->rxopt.all) - opt_skb = skb_clone(skb, GFP_ATOMIC); + opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == DCCP_OPEN) { /* Fast path */ if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len)) @@ -679,7 +677,6 @@ ipv6_pktoptions: np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &DCCP_SKB_CB(opt_skb)->header.h6)) { - skb_set_owner_r(opt_skb, sk); memmove(IP6CB(opt_skb), &DCCP_SKB_CB(opt_skb)->header.h6, 
sizeof(struct inet6_skb_parm)); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6c0ec2789943..cf11f10927e1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -347,6 +347,7 @@ lookup_protocol: sk->sk_destruct = inet_sock_destruct; sk->sk_protocol = protocol; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; + sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); inet->uc_ttl = -1; inet->mc_loop = 1; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d1f837579398..f2c43f67187d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1225,9 +1225,6 @@ int inet_csk_listen_start(struct sock *sk) sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); - if (sk->sk_txrehash == SOCK_TXREHASH_DEFAULT) - sk->sk_txrehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); - /* There is race window here: we announce ourselves listening, * but this transition is still not validated by get_port(). * It is OK, because this socket enters to hash table only diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 94aad3870c5f..cf26d65ca389 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -6,6 +6,7 @@ #include <linux/bpf.h> #include <linux/init.h> #include <linux/wait.h> +#include <linux/util_macros.h> #include <net/inet_common.h> #include <net/tls.h> @@ -639,10 +640,9 @@ EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); */ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) { - int family = sk->sk_family == AF_INET6 ? 
TCP_BPF_IPV6 : TCP_BPF_IPV4; struct proto *prot = newsk->sk_prot; - if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE]) + if (is_insidevar(prot, tcp_bpf_prots)) newsk->sk_prot = sk->sk_prot_creator; } #endif /* CONFIG_BPF_SYSCALL */ diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f7a84a4acffc..faa47f9ea73a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3127,17 +3127,17 @@ static void add_v4_addrs(struct inet6_dev *idev) offset = sizeof(struct in6_addr) - 4; memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4); - if (idev->dev->flags&IFF_POINTOPOINT) { + if (!(idev->dev->flags & IFF_POINTOPOINT) && idev->dev->type == ARPHRD_SIT) { + scope = IPV6_ADDR_COMPATv4; + plen = 96; + pflags |= RTF_NONEXTHOP; + } else { if (idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_NONE) return; addr.s6_addr32[0] = htonl(0xfe800000); scope = IFA_LINK; plen = 64; - } else { - scope = IPV6_ADDR_COMPATv4; - plen = 96; - pflags |= RTF_NONEXTHOP; } if (addr.s6_addr32[3]) { @@ -3447,6 +3447,30 @@ static void addrconf_gre_config(struct net_device *dev) } #endif +static void addrconf_init_auto_addrs(struct net_device *dev) +{ + switch (dev->type) { +#if IS_ENABLED(CONFIG_IPV6_SIT) + case ARPHRD_SIT: + addrconf_sit_config(dev); + break; +#endif +#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) + case ARPHRD_IP6GRE: + case ARPHRD_IPGRE: + addrconf_gre_config(dev); + break; +#endif + case ARPHRD_LOOPBACK: + init_loopback(dev); + break; + + default: + addrconf_dev_config(dev); + break; + } +} + static int fixup_permanent_addr(struct net *net, struct inet6_dev *idev, struct inet6_ifaddr *ifp) @@ -3615,26 +3639,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, run_pending = 1; } - switch (dev->type) { -#if IS_ENABLED(CONFIG_IPV6_SIT) - case ARPHRD_SIT: - addrconf_sit_config(dev); - break; -#endif -#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) - case ARPHRD_IP6GRE: - case ARPHRD_IPGRE: - 
addrconf_gre_config(dev); - break; -#endif - case ARPHRD_LOOPBACK: - init_loopback(dev); - break; - - default: - addrconf_dev_config(dev); - break; - } + addrconf_init_auto_addrs(dev); if (!IS_ERR_OR_NULL(idev)) { if (run_pending) @@ -6397,7 +6402,7 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, if (idev->cnf.addr_gen_mode != new_val) { idev->cnf.addr_gen_mode = new_val; - addrconf_dev_config(idev->dev); + addrconf_init_auto_addrs(idev->dev); } } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) { struct net_device *dev; @@ -6408,7 +6413,7 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, if (idev && idev->cnf.addr_gen_mode != new_val) { idev->cnf.addr_gen_mode = new_val; - addrconf_dev_config(idev->dev); + addrconf_init_auto_addrs(idev->dev); } } } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index fee9163382c2..847934763868 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -222,6 +222,7 @@ lookup_protocol: np->pmtudisc = IPV6_PMTUDISC_WANT; np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; + sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); /* Init the ipv4 part of the socket since we can have sockets * using v6 API for ipv4. 
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index e624497fa992..9b6818453afe 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -51,7 +51,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) fl6->flowi6_mark = sk->sk_mark; fl6->fl6_dport = inet->inet_dport; fl6->fl6_sport = inet->inet_sport; - fl6->flowlabel = np->flow_label; + fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6->flowi6_uid = sk->sk_uid; if (!oif) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 11b736a76bd7..a52a4f12f146 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -272,6 +272,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = sk->sk_v6_daddr; fl6.saddr = saddr ? *saddr : np->saddr; + fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; @@ -1387,14 +1388,11 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * /* Clone pktoptions received with SYN, if we own the req */ if (ireq->pktopts) { - newnp->pktoptions = skb_clone(ireq->pktopts, - sk_gfp_mask(sk, GFP_ATOMIC)); + newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; - if (newnp->pktoptions) { + if (newnp->pktoptions) tcp_v6_restore_cb(newnp->pktoptions); - skb_set_owner_r(newnp->pktoptions, newsk); - } } } else { if (!req_unhash && found_dup_sk) { @@ -1466,7 +1464,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) --ANK (980728) */ if (np->rxopt.all) - opt_skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); + opt_skb = skb_clone_and_charge_r(skb, sk); reason = SKB_DROP_REASON_NOT_SPECIFIED; if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ @@ -1552,7 +1550,6 @@ ipv6_pktoptions: if (np->repflow) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, 
opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { - skb_set_owner_r(opt_skb, sk); tcp_v6_restore_cb(opt_skb); opt_skb = xchg(&np->pktoptions, opt_skb); } else { diff --git a/net/key/af_key.c b/net/key/af_key.c index 2bdbcec781cd..a815f5ab4c49 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1261,7 +1261,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, const struct sadb_x_nat_t_type* n_type; struct xfrm_encap_tmpl *natt; - x->encap = kmalloc(sizeof(*x->encap), GFP_KERNEL); + x->encap = kzalloc(sizeof(*x->encap), GFP_KERNEL); if (!x->encap) { err = -ENOMEM; goto out; diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index c2aae2a6d6a6..97bb4401dd3e 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -213,7 +213,6 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local, ret = ieee802154_parse_frame_start(skb, &hdr); if (ret) { pr_debug("got invalid frame\n"); - kfree_skb(skb); return; } diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index 45bbe3e54cc2..3150f3f0c872 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -587,6 +587,11 @@ static void mctp_sk_unhash(struct sock *sk) del_timer_sync(&msk->key_expiry); } +static void mctp_sk_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_receive_queue); +} + static struct proto mctp_proto = { .name = "MCTP", .owner = THIS_MODULE, @@ -623,6 +628,7 @@ static int mctp_pf_create(struct net *net, struct socket *sock, return -ENOMEM; sock_init_data(sock, sk); + sk->sk_destruct = mctp_sk_destruct; rc = 0; if (sk->sk_prot->init) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 35b5f806fdda..dc5165d3eec4 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1428,6 +1428,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev, free: kfree(table); out: + mdev->sysctl = NULL; return -ENOBUFS; } @@ -1437,6 +1438,9 @@ static void mpls_dev_sysctl_unregister(struct net_device *dev, struct net *net = dev_net(dev); struct ctl_table *table; + if 
(!mdev->sysctl) + return; + table = mdev->sysctl->ctl_table_arg; unregister_net_sysctl_table(mdev->sysctl); kfree(table); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 2ea7eae43bdb..10fe9771a852 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -998,8 +998,8 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, { int addrlen = sizeof(struct sockaddr_in); struct sockaddr_storage addr; - struct mptcp_sock *msk; struct socket *ssock; + struct sock *newsk; int backlog = 1024; int err; @@ -1008,11 +1008,13 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, if (err) return err; - msk = mptcp_sk(entry->lsk->sk); - if (!msk) + newsk = entry->lsk->sk; + if (!newsk) return -EINVAL; - ssock = __mptcp_nmpc_socket(msk); + lock_sock(newsk); + ssock = __mptcp_nmpc_socket(mptcp_sk(newsk)); + release_sock(newsk); if (!ssock) return -EINVAL; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8cd6cc67c2c5..bc6c1f62a690 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2897,6 +2897,7 @@ bool __mptcp_close(struct sock *sk, long timeout) struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool do_cancel_work = false; + int subflows_alive = 0; sk->sk_shutdown = SHUTDOWN_MASK; @@ -2922,6 +2923,8 @@ cleanup: struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast_nested(ssk); + subflows_alive += ssk->sk_state != TCP_CLOSE; + /* since the close timeout takes precedence on the fail one, * cancel the latter */ @@ -2937,6 +2940,12 @@ cleanup: } sock_orphan(sk); + /* all the subflows are closed, only timeout can change the msk + * state, let's not keep resources busy for no reasons + */ + if (subflows_alive == 0) + inet_sk_state_store(sk, TCP_CLOSE); + sock_hold(sk); pr_debug("msk=%p state=%d", sk, sk->sk_state); if (msk->token) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index d4b1e6ec1b36..7f2c3727ab23 100644 --- a/net/mptcp/sockopt.c +++ 
b/net/mptcp/sockopt.c @@ -760,14 +760,21 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname, sockptr_t optval, unsigned int optlen) { + struct sock *sk = (struct sock *)msk; struct socket *sock; + int ret = -EINVAL; /* Limit to first subflow, before the connection establishment */ + lock_sock(sk); sock = __mptcp_nmpc_socket(msk); if (!sock) - return -EINVAL; + goto unlock; - return tcp_setsockopt(sock->sk, level, optname, optval, optlen); + ret = tcp_setsockopt(sock->sk, level, optname, optval, optlen); + +unlock: + release_sock(sk); + return ret; } static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ec54413fb31f..32904c76c6a1 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1399,6 +1399,7 @@ void __mptcp_error_report(struct sock *sk) mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int err = sock_error(ssk); + int ssk_state; if (!err) continue; @@ -1409,7 +1410,14 @@ void __mptcp_error_report(struct sock *sk) if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk)) continue; - inet_sk_state_store(sk, inet_sk_state_load(ssk)); + /* We need to propagate only transition to CLOSE state. + * Orphaned socket will see such state change via + * subflow_sched_work_if_closed() and that path will properly + * destroy the msk as needed. 
+ */ + ssk_state = inet_sk_state_load(ssk); + if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) + inet_sk_state_store(sk, ssk_state); sk->sk_err = -err; /* This barrier is coupled with smp_rmb() in mptcp_poll() */ @@ -1679,7 +1687,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, if (err) return err; - lock_sock(sf->sk); + lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING); /* the newly created socket has to be in the same cgroup as its parent */ mptcp_attach_cgroup(sk, sf->sk); diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 945dd40e7077..011d414038ea 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -142,10 +142,11 @@ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) } #endif +/* do_basic_checks ensures sch->length > 0, do not use before */ #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ - ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))) && \ - (sch)->length; \ + (offset) < (skb)->len && \ + ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \ (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++) /* Some validity checks to make sure the chunks are fine */ diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 6f7f4392cffb..5a4cb796150f 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -400,6 +400,11 @@ static int nr_listen(struct socket *sock, int backlog) struct sock *sk = sock->sk; lock_sock(sk); + if (sock->state != SS_UNCONNECTED) { + release_sock(sk); + return -EINVAL; + } + if (sk->sk_state != TCP_LISTEN) { memset(&nr_sk(sk)->user_addr, 0, AX25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index a71795355aec..fcee6012293b 100644 --- a/net/openvswitch/datapath.c +++ 
b/net/openvswitch/datapath.c @@ -1004,14 +1004,14 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) { error = -ENOMEM; - goto err_kfree_key; + goto err_kfree_flow; } ovs_match_init(&match, key, false, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) - goto err_kfree_flow; + goto err_kfree_key; ovs_flow_mask_key(&new_flow->key, key, true, &mask); @@ -1019,14 +1019,14 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], key, log); if (error) - goto err_kfree_flow; + goto err_kfree_key; /* Validate actions. */ error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, &acts, log); if (error) { OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); - goto err_kfree_flow; + goto err_kfree_key; } reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false, @@ -1126,10 +1126,10 @@ err_unlock_ovs: kfree_skb(reply); err_kfree_acts: ovs_nla_free_flow_actions(acts); -err_kfree_flow: - ovs_flow_free(new_flow, false); err_kfree_key: kfree(key); +err_kfree_flow: + ovs_flow_free(new_flow, false); error: return error; } diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 6e38f68f88c2..f2698d2316df 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -449,7 +449,7 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) err = attach_meter(meter_tbl, meter); if (err) - goto exit_unlock; + goto exit_free_old_meter; ovs_unlock(); @@ -472,6 +472,8 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); +exit_free_old_meter: + ovs_meter_free(old_meter); exit_unlock: ovs_unlock(); nlmsg_free(reply); diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 1990d496fcfc..e595079c2caf 
100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -83,7 +83,10 @@ static struct qrtr_node *node_get(unsigned int node_id) node->id = node_id; - radix_tree_insert(&nodes, node_id, node); + if (radix_tree_insert(&nodes, node_id, node)) { + kfree(node); + return NULL; + } return node; } diff --git a/net/rds/message.c b/net/rds/message.c index b47e4f0a1639..c19c93561227 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -104,9 +104,9 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs, spin_lock_irqsave(&q->lock, flags); head = &q->zcookie_head; if (!list_empty(head)) { - info = list_entry(head, struct rds_msg_zcopy_info, - rs_zcookie_next); - if (info && rds_zcookie_add(info, cookie)) { + info = list_first_entry(head, struct rds_msg_zcopy_info, + rs_zcookie_next); + if (rds_zcookie_add(info, cookie)) { spin_unlock_irqrestore(&q->lock, flags); kfree(rds_info_from_znotifier(znotif)); /* caller invokes rds_wake_sk_sleep() */ diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 36fefc3957d7..ca2b17f32670 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -488,6 +488,12 @@ static int rose_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; + lock_sock(sk); + if (sock->state != SS_UNCONNECTED) { + release_sock(sk); + return -EINVAL; + } + if (sk->sk_state != TCP_LISTEN) { struct rose_sock *rose = rose_sk(sk); @@ -497,8 +503,10 @@ static int rose_listen(struct socket *sock, int backlog) memset(rose->dest_digis, 0, AX25_ADDR_LEN * ROSE_MAX_DIGIS); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; + release_sock(sk); return 0; } + release_sock(sk); return -EOPNOTSUPP; } diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 4b1b59da5c0b..4d15b6a6169c 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -93,7 +93,7 @@ TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb, cp = rcu_dereference_bh(ca->params); tcf_lastuse_update(&ca->tcf_tm); - bstats_update(&ca->tcf_bstats, skb); + 
tcf_action_update_bstats(&ca->common, skb); action = READ_ONCE(ca->tcf_action); wlen = skb_network_offset(skb); @@ -212,8 +212,8 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, index = actparm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_ctinfo_ops, bind, false, flags); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_ctinfo_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index ee2a050c887b..6640e75eaa02 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -12,6 +12,7 @@ #include <linux/errno.h> #include <linux/slab.h> #include <linux/refcount.h> +#include <linux/rcupdate.h> #include <net/act_api.h> #include <net/netlink.h> #include <net/pkt_cls.h> @@ -339,6 +340,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tcf_result cr = {}; int err, balloc = 0; struct tcf_exts e; + bool update_h = false; err = tcf_exts_init(&e, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) @@ -456,10 +458,13 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } } - if (cp->perfect) + if (cp->perfect) { r = cp->perfect + handle; - else - r = tcindex_lookup(cp, handle) ? 
: &new_filter_result; + } else { + /* imperfect area is updated in-place using rcu */ + update_h = !!tcindex_lookup(cp, handle); + r = &new_filter_result; + } if (r == &new_filter_result) { f = kzalloc(sizeof(*f), GFP_KERNEL); @@ -485,7 +490,28 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, rcu_assign_pointer(tp->root, cp); - if (r == &new_filter_result) { + if (update_h) { + struct tcindex_filter __rcu **fp; + struct tcindex_filter *cf; + + f->result.res = r->res; + tcf_exts_change(&f->result.exts, &r->exts); + + /* imperfect area bucket */ + fp = cp->h + (handle % cp->hash); + + /* lookup the filter, guaranteed to exist */ + for (cf = rcu_dereference_bh_rtnl(*fp); cf; + fp = &cf->next, cf = rcu_dereference_bh_rtnl(*fp)) + if (cf->key == (u16)handle) + break; + + f->next = cf->next; + + cf = rcu_replace_pointer(*fp, f, 1); + tcf_exts_get_net(&cf->result.exts); + tcf_queue_work(&cf->rwork, tcindex_destroy_fexts_work); + } else if (r == &new_filter_result) { struct tcindex_filter *nfp; struct tcindex_filter __rcu **fp; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index f46643850df8..92f2975b6a82 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -431,7 +431,10 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) while (cl->cmode == HTB_MAY_BORROW && p && mask) { m = mask; while (m) { - int prio = ffz(~m); + unsigned int prio = ffz(~m); + + if (WARN_ON_ONCE(prio >= ARRAY_SIZE(p->inner.clprio))) + break; m &= ~(1 << prio); if (p->inner.clprio[prio].feed.rb_node) diff --git a/net/sctp/diag.c b/net/sctp/diag.c index a557009e9832..c3d6b92dd386 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -343,11 +343,9 @@ static int sctp_sock_filter(struct sctp_endpoint *ep, struct sctp_transport *tsp struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; const struct inet_diag_req_v2 *r = commp->r; - struct sctp_association *assoc = - list_entry(ep->asocs.next, struct sctp_association, 
asocs); /* find the ep only once through the transports by this condition */ - if (tsp->asoc != assoc) + if (!list_is_first(&tsp->asoc->asocs, &ep->asocs)) return 0; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) diff --git a/net/sctp/transport.c b/net/sctp/transport.c index ca1eba95c293..2f66a2006517 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -196,9 +196,7 @@ void sctp_transport_reset_hb_timer(struct sctp_transport *transport) /* When a data chunk is sent, reset the heartbeat interval. */ expires = jiffies + sctp_transport_timeout(transport); - if ((time_before(transport->hb_timer.expires, expires) || - !timer_pending(&transport->hb_timer)) && - !mod_timer(&transport->hb_timer, + if (!mod_timer(&transport->hb_timer, expires + get_random_u32_below(transport->rto))) sctp_transport_hold(transport); } diff --git a/net/socket.c b/net/socket.c index 888cd618a968..c6c44e26e954 100644 --- a/net/socket.c +++ b/net/socket.c @@ -385,7 +385,7 @@ static const struct xattr_handler sockfs_xattr_handler = { }; static int sockfs_security_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) @@ -589,10 +589,10 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, return used; } -static int sockfs_setattr(struct user_namespace *mnt_userns, +static int sockfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { - int err = simple_setattr(&init_user_ns, dentry, iattr); + int err = simple_setattr(&nop_mnt_idmap, dentry, iattr); if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry)); @@ -971,9 +971,12 @@ static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, static void sock_recv_mark(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { - if (sock_flag(sk, SOCK_RCVMARK) && skb) - 
put_cmsg(msg, SOL_SOCKET, SO_MARK, sizeof(__u32), - &skb->mark); + if (sock_flag(sk, SOCK_RCVMARK) && skb) { + /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */ + __u32 mark = skb->mark; + + put_cmsg(msg, SOL_SOCKET, SO_MARK, sizeof(__u32), &mark); + } } void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b35c8701876a..a38733f2197a 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2614,6 +2614,7 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest, /* Send a 'SYN-' to destination */ m.msg_name = dest; m.msg_namelen = destlen; + iov_iter_kvec(&m.msg_iter, ITER_SOURCE, NULL, 0, 0); /* If connect is in non-blocking case, set MSG_DONTWAIT to * indicate send_msg() is never blocked. @@ -2776,6 +2777,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, __skb_queue_head(&new_sk->sk_receive_queue, buf); skb_set_owner_r(buf, new_sk); } + iov_iter_kvec(&m.msg_iter, ITER_SOURCE, NULL, 0, 0); __tipc_sendstream(new_sock, &m, 0); release_sock(new_sk); exit: diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 9ed978634125..a83d2b4275fa 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2427,7 +2427,7 @@ static bool tls_is_tx_ready(struct tls_sw_context_tx *ctx) { struct tls_rec *rec; - rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); + rec = list_first_entry_or_null(&ctx->tx_list, struct tls_rec, list); if (!rec) return false; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f0c2293f1d3b..81ff98298996 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1190,7 +1190,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); struct net *net = sock_net(sk); - struct user_namespace *ns; // barf... 
+ struct mnt_idmap *idmap; struct unix_address *addr; struct dentry *dentry; struct path parent; @@ -1217,10 +1217,10 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, /* * All right, let's create it. */ - ns = mnt_user_ns(parent.mnt); + idmap = mnt_idmap(parent.mnt); err = security_path_mknod(&parent, dentry, mode, 0); if (!err) - err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0); + err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); if (err) goto out_path; err = mutex_lock_interruptible(&u->bindlock); @@ -1245,7 +1245,7 @@ out_unlock: err = -EINVAL; out_unlink: /* failed after successful mknod? unlink what we'd created... */ - vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL); + vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); out_path: done_path_create(&parent, dentry); out: diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index a0f62fa02e06..8cbf45a8bcdc 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -5,6 +5,7 @@ * Based on code and translator idea by: Florian Westphal <fw@strlen.de> */ #include <linux/compat.h> +#include <linux/nospec.h> #include <linux/xfrm.h> #include <net/xfrm.h> @@ -302,7 +303,7 @@ static int xfrm_xlate64(struct sk_buff *dst, const struct nlmsghdr *nlh_src) nla_for_each_attr(nla, attrs, len, remaining) { int err; - switch (type) { + switch (nlh_src->nlmsg_type) { case XFRM_MSG_NEWSPDINFO: err = xfrm_nla_cpy(dst, nla, nla_len(nla)); break; @@ -437,6 +438,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, NL_SET_ERR_MSG(extack, "Bad attribute"); return -EOPNOTSUPP; } + type = array_index_nospec(type, XFRMA_MAX + 1); if (nla_len(nla) < compat_policy[type].len) { NL_SET_ERR_MSG(extack, "Attribute bad length"); return -EOPNOTSUPP; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index c06e54a10540..436d29640ac2 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -279,8 +279,7 @@ static int 
xfrm6_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv6_copy_dscp(ipv6_get_dsfield(ipv6_hdr(skb)), - ipipv6_hdr(skb)); + ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipipv6_hdr(skb)); if (!(x->props.flags & XFRM_STATE_NOECN)) ipip6_ecn_decapsulate(skb); diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 1f99dc469027..35279c220bd7 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -310,6 +310,52 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) skb->mark = 0; } +static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type, unsigned short family) +{ + struct sec_path *sp; + + sp = skb_sec_path(skb); + if (sp && (sp->len || sp->olen) && + !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + goto discard; + + XFRM_SPI_SKB_CB(skb)->family = family; + if (family == AF_INET) { + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + } else { + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + } + + return xfrm_input(skb, nexthdr, spi, encap_type); +discard: + kfree_skb(skb); + return 0; +} + +static int xfrmi4_rcv(struct sk_buff *skb) +{ + return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); +} + +static int xfrmi6_rcv(struct sk_buff *skb) +{ + return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], + 0, 0, AF_INET6); +} + +static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); +} + +static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); +} + static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; @@ -945,8 +991,8 @@ static 
struct pernet_operations xfrmi_net_ops = { }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { - .handler = xfrm6_rcv, - .input_handler = xfrm_input, + .handler = xfrmi6_rcv, + .input_handler = xfrmi6_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -996,8 +1042,8 @@ static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { - .handler = xfrm4_rcv, - .input_handler = xfrm_input, + .handler = xfrmi4_rcv, + .input_handler = xfrmi4_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e9eb82c5457d..5c61ec04b839 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -336,7 +336,7 @@ static void xfrm_policy_timer(struct timer_list *t) } if (xp->lft.hard_use_expires_seconds) { time64_t tmo = xp->lft.hard_use_expires_seconds + - (xp->curlft.use_time ? : xp->curlft.add_time) - now; + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) goto expired; if (tmo < next) @@ -354,7 +354,7 @@ static void xfrm_policy_timer(struct timer_list *t) } if (xp->lft.soft_use_expires_seconds) { time64_t tmo = xp->lft.soft_use_expires_seconds + - (xp->curlft.use_time ? : xp->curlft.add_time) - now; + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; @@ -3661,7 +3661,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, return 1; } - pol->curlft.use_time = ktime_get_real_seconds(); + /* This lockless write can happen from different cpus. */ + WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds()); pols[0] = pol; npols++; @@ -3676,7 +3677,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, xfrm_pol_put(pols[0]); return 0; } - pols[1]->curlft.use_time = ktime_get_real_seconds(); + /* This write can happen from different cpus. 
*/ + WRITE_ONCE(pols[1]->curlft.use_time, + ktime_get_real_seconds()); npols++; } } @@ -3742,6 +3745,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, goto reject; } + if (if_id) + secpath_reset(skb); + xfrm_pols_put(pols, npols); return 1; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 89c731f4f0c7..00afe831c71c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -577,7 +577,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) if (x->km.state == XFRM_STATE_EXPIRED) goto expired; if (x->lft.hard_add_expires_seconds) { - long tmo = x->lft.hard_add_expires_seconds + + time64_t tmo = x->lft.hard_add_expires_seconds + x->curlft.add_time - now; if (tmo <= 0) { if (x->xflags & XFRM_SOFT_EXPIRE) { @@ -594,8 +594,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) next = tmo; } if (x->lft.hard_use_expires_seconds) { - long tmo = x->lft.hard_use_expires_seconds + - (x->curlft.use_time ? : now) - now; + time64_t tmo = x->lft.hard_use_expires_seconds + + (READ_ONCE(x->curlft.use_time) ? : now) - now; if (tmo <= 0) goto expired; if (tmo < next) @@ -604,7 +604,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) if (x->km.dying) goto resched; if (x->lft.soft_add_expires_seconds) { - long tmo = x->lft.soft_add_expires_seconds + + time64_t tmo = x->lft.soft_add_expires_seconds + x->curlft.add_time - now; if (tmo <= 0) { warn = 1; @@ -616,8 +616,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) } } if (x->lft.soft_use_expires_seconds) { - long tmo = x->lft.soft_use_expires_seconds + - (x->curlft.use_time ? : now) - now; + time64_t tmo = x->lft.soft_use_expires_seconds + + (READ_ONCE(x->curlft.use_time) ? 
: now) - now; if (tmo <= 0) warn = 1; else if (tmo < next) @@ -1906,7 +1906,7 @@ out: hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); - if (x1->curlft.use_time) + if (READ_ONCE(x1->curlft.use_time)) xfrm_state_check_expire(x1); if (x->props.smark.m || x->props.smark.v || x->if_id) { @@ -1940,8 +1940,8 @@ int xfrm_state_check_expire(struct xfrm_state *x) { xfrm_dev_state_update_curlft(x); - if (!x->curlft.use_time) - x->curlft.use_time = ktime_get_real_seconds(); + if (!READ_ONCE(x->curlft.use_time)) + WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds()); if (x->curlft.bytes >= x->lft.hard_byte_limit || x->curlft.packets >= x->lft.hard_packet_limit) { diff --git a/rust/Makefile b/rust/Makefile index ff70c4c916f8..8a521f2b6422 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -50,6 +50,7 @@ core-cfgs = \ --cfg no_fp_fmt_parse alloc-cfgs = \ + --cfg no_borrow \ --cfg no_fmt \ --cfg no_global_oom_handling \ --cfg no_macros \ @@ -359,8 +360,22 @@ rust-analyzer: $(Q)$(srctree)/scripts/generate_rust_analyzer.py $(srctree) $(objtree) \ $(RUST_LIB_SRC) > $(objtree)/rust-project.json +redirect-intrinsics = \ + __eqsf2 __gesf2 __lesf2 __nesf2 __unordsf2 \ + __unorddf2 \ + __muloti4 __multi3 \ + __udivmodti4 __udivti3 __umodti3 + +ifneq ($(or $(CONFIG_ARM64),$(and $(CONFIG_RISCV),$(CONFIG_64BIT))),) + # These intrinsics are defined for ARM64 and RISCV64 + redirect-intrinsics += \ + __ashrti3 \ + __ashlti3 __lshrti3 +endif + $(obj)/core.o: private skip_clippy = 1 $(obj)/core.o: private skip_flags = -Dunreachable_pub +$(obj)/core.o: private rustc_objcopy = $(foreach sym,$(redirect-intrinsics),--redefine-sym $(sym)=__rust$(sym)) $(obj)/core.o: private rustc_target_flags = $(core-cfgs) $(obj)/core.o: $(RUST_LIB_SRC)/core/src/lib.rs $(obj)/target.json FORCE $(call if_changed_dep,rustc_library) diff --git a/rust/alloc/borrow.rs b/rust/alloc/borrow.rs deleted file mode 100644 index dde4957200d4..000000000000 --- a/rust/alloc/borrow.rs +++ /dev/null @@ -1,498 
+0,0 @@ -// SPDX-License-Identifier: Apache-2.0 OR MIT - -//! A module for working with borrowed data. - -#![stable(feature = "rust1", since = "1.0.0")] - -use core::cmp::Ordering; -use core::hash::{Hash, Hasher}; -use core::ops::Deref; -#[cfg(not(no_global_oom_handling))] -use core::ops::{Add, AddAssign}; - -#[stable(feature = "rust1", since = "1.0.0")] -pub use core::borrow::{Borrow, BorrowMut}; - -use core::fmt; -#[cfg(not(no_global_oom_handling))] -use crate::string::String; - -use Cow::*; - -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a, B: ?Sized> Borrow<B> for Cow<'a, B> -where - B: ToOwned, - <B as ToOwned>::Owned: 'a, -{ - fn borrow(&self) -> &B { - &**self - } -} - -/// A generalization of `Clone` to borrowed data. -/// -/// Some types make it possible to go from borrowed to owned, usually by -/// implementing the `Clone` trait. But `Clone` works only for going from `&T` -/// to `T`. The `ToOwned` trait generalizes `Clone` to construct owned data -/// from any borrow of a given type. -#[cfg_attr(not(test), rustc_diagnostic_item = "ToOwned")] -#[stable(feature = "rust1", since = "1.0.0")] -pub trait ToOwned { - /// The resulting type after obtaining ownership. - #[stable(feature = "rust1", since = "1.0.0")] - type Owned: Borrow<Self>; - - /// Creates owned data from borrowed data, usually by cloning. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// let s: &str = "a"; - /// let ss: String = s.to_owned(); - /// - /// let v: &[i32] = &[1, 2]; - /// let vv: Vec<i32> = v.to_owned(); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] - #[must_use = "cloning is often expensive and is not expected to have side effects"] - fn to_owned(&self) -> Self::Owned; - - /// Uses borrowed data to replace owned data, usually by cloning. - /// - /// This is borrow-generalized version of `Clone::clone_from`. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// # #![feature(toowned_clone_into)] - /// let mut s: String = String::new(); - /// "hello".clone_into(&mut s); - /// - /// let mut v: Vec<i32> = Vec::new(); - /// [1, 2][..].clone_into(&mut v); - /// ``` - #[unstable(feature = "toowned_clone_into", reason = "recently added", issue = "41263")] - fn clone_into(&self, target: &mut Self::Owned) { - *target = self.to_owned(); - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<T> ToOwned for T -where - T: Clone, -{ - type Owned = T; - fn to_owned(&self) -> T { - self.clone() - } - - fn clone_into(&self, target: &mut T) { - target.clone_from(self); - } -} - -/// A clone-on-write smart pointer. -/// -/// The type `Cow` is a smart pointer providing clone-on-write functionality: it -/// can enclose and provide immutable access to borrowed data, and clone the -/// data lazily when mutation or ownership is required. The type is designed to -/// work with general borrowed data via the `Borrow` trait. -/// -/// `Cow` implements `Deref`, which means that you can call -/// non-mutating methods directly on the data it encloses. If mutation -/// is desired, `to_mut` will obtain a mutable reference to an owned -/// value, cloning if necessary. -/// -/// If you need reference-counting pointers, note that -/// [`Rc::make_mut`][crate::rc::Rc::make_mut] and -/// [`Arc::make_mut`][crate::sync::Arc::make_mut] can provide clone-on-write -/// functionality as well. -/// -/// # Examples -/// -/// ``` -/// use std::borrow::Cow; -/// -/// fn abs_all(input: &mut Cow<[i32]>) { -/// for i in 0..input.len() { -/// let v = input[i]; -/// if v < 0 { -/// // Clones into a vector if not already owned. -/// input.to_mut()[i] = -v; -/// } -/// } -/// } -/// -/// // No clone occurs because `input` doesn't need to be mutated. 
-/// let slice = [0, 1, 2]; -/// let mut input = Cow::from(&slice[..]); -/// abs_all(&mut input); -/// -/// // Clone occurs because `input` needs to be mutated. -/// let slice = [-1, 0, 1]; -/// let mut input = Cow::from(&slice[..]); -/// abs_all(&mut input); -/// -/// // No clone occurs because `input` is already owned. -/// let mut input = Cow::from(vec![-1, 0, 1]); -/// abs_all(&mut input); -/// ``` -/// -/// Another example showing how to keep `Cow` in a struct: -/// -/// ``` -/// use std::borrow::Cow; -/// -/// struct Items<'a, X: 'a> where [X]: ToOwned<Owned = Vec<X>> { -/// values: Cow<'a, [X]>, -/// } -/// -/// impl<'a, X: Clone + 'a> Items<'a, X> where [X]: ToOwned<Owned = Vec<X>> { -/// fn new(v: Cow<'a, [X]>) -> Self { -/// Items { values: v } -/// } -/// } -/// -/// // Creates a container from borrowed values of a slice -/// let readonly = [1, 2]; -/// let borrowed = Items::new((&readonly[..]).into()); -/// match borrowed { -/// Items { values: Cow::Borrowed(b) } => println!("borrowed {b:?}"), -/// _ => panic!("expect borrowed value"), -/// } -/// -/// let mut clone_on_write = borrowed; -/// // Mutates the data from slice into owned vec and pushes a new value on top -/// clone_on_write.values.to_mut().push(3); -/// println!("clone_on_write = {:?}", clone_on_write.values); -/// -/// // The data was mutated. Let's check it out. -/// match clone_on_write { -/// Items { values: Cow::Owned(_) } => println!("clone_on_write contains owned data"), -/// _ => panic!("expect owned data"), -/// } -/// ``` -#[stable(feature = "rust1", since = "1.0.0")] -#[cfg_attr(not(test), rustc_diagnostic_item = "Cow")] -pub enum Cow<'a, B: ?Sized + 'a> -where - B: ToOwned, -{ - /// Borrowed data. - #[stable(feature = "rust1", since = "1.0.0")] - Borrowed(#[stable(feature = "rust1", since = "1.0.0")] &'a B), - - /// Owned data. 
- #[stable(feature = "rust1", since = "1.0.0")] - Owned(#[stable(feature = "rust1", since = "1.0.0")] <B as ToOwned>::Owned), -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<B: ?Sized + ToOwned> Clone for Cow<'_, B> { - fn clone(&self) -> Self { - match *self { - Borrowed(b) => Borrowed(b), - Owned(ref o) => { - let b: &B = o.borrow(); - Owned(b.to_owned()) - } - } - } - - fn clone_from(&mut self, source: &Self) { - match (self, source) { - (&mut Owned(ref mut dest), &Owned(ref o)) => o.borrow().clone_into(dest), - (t, s) => *t = s.clone(), - } - } -} - -impl<B: ?Sized + ToOwned> Cow<'_, B> { - /// Returns true if the data is borrowed, i.e. if `to_mut` would require additional work. - /// - /// # Examples - /// - /// ``` - /// #![feature(cow_is_borrowed)] - /// use std::borrow::Cow; - /// - /// let cow = Cow::Borrowed("moo"); - /// assert!(cow.is_borrowed()); - /// - /// let bull: Cow<'_, str> = Cow::Owned("...moo?".to_string()); - /// assert!(!bull.is_borrowed()); - /// ``` - #[unstable(feature = "cow_is_borrowed", issue = "65143")] - #[rustc_const_unstable(feature = "const_cow_is_borrowed", issue = "65143")] - pub const fn is_borrowed(&self) -> bool { - match *self { - Borrowed(_) => true, - Owned(_) => false, - } - } - - /// Returns true if the data is owned, i.e. if `to_mut` would be a no-op. - /// - /// # Examples - /// - /// ``` - /// #![feature(cow_is_borrowed)] - /// use std::borrow::Cow; - /// - /// let cow: Cow<'_, str> = Cow::Owned("moo".to_string()); - /// assert!(cow.is_owned()); - /// - /// let bull = Cow::Borrowed("...moo?"); - /// assert!(!bull.is_owned()); - /// ``` - #[unstable(feature = "cow_is_borrowed", issue = "65143")] - #[rustc_const_unstable(feature = "const_cow_is_borrowed", issue = "65143")] - pub const fn is_owned(&self) -> bool { - !self.is_borrowed() - } - - /// Acquires a mutable reference to the owned form of the data. - /// - /// Clones the data if it is not already owned. 
- /// - /// # Examples - /// - /// ``` - /// use std::borrow::Cow; - /// - /// let mut cow = Cow::Borrowed("foo"); - /// cow.to_mut().make_ascii_uppercase(); - /// - /// assert_eq!( - /// cow, - /// Cow::Owned(String::from("FOO")) as Cow<str> - /// ); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] - pub fn to_mut(&mut self) -> &mut <B as ToOwned>::Owned { - match *self { - Borrowed(borrowed) => { - *self = Owned(borrowed.to_owned()); - match *self { - Borrowed(..) => unreachable!(), - Owned(ref mut owned) => owned, - } - } - Owned(ref mut owned) => owned, - } - } - - /// Extracts the owned data. - /// - /// Clones the data if it is not already owned. - /// - /// # Examples - /// - /// Calling `into_owned` on a `Cow::Borrowed` returns a clone of the borrowed data: - /// - /// ``` - /// use std::borrow::Cow; - /// - /// let s = "Hello world!"; - /// let cow = Cow::Borrowed(s); - /// - /// assert_eq!( - /// cow.into_owned(), - /// String::from(s) - /// ); - /// ``` - /// - /// Calling `into_owned` on a `Cow::Owned` returns the owned data. The data is moved out of the - /// `Cow` without being cloned. 
- /// - /// ``` - /// use std::borrow::Cow; - /// - /// let s = "Hello world!"; - /// let cow: Cow<str> = Cow::Owned(String::from(s)); - /// - /// assert_eq!( - /// cow.into_owned(), - /// String::from(s) - /// ); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] - pub fn into_owned(self) -> <B as ToOwned>::Owned { - match self { - Borrowed(borrowed) => borrowed.to_owned(), - Owned(owned) => owned, - } - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -#[rustc_const_unstable(feature = "const_deref", issue = "88955")] -impl<B: ?Sized + ToOwned> const Deref for Cow<'_, B> -where - B::Owned: ~const Borrow<B>, -{ - type Target = B; - - fn deref(&self) -> &B { - match *self { - Borrowed(borrowed) => borrowed, - Owned(ref owned) => owned.borrow(), - } - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<B: ?Sized> Eq for Cow<'_, B> where B: Eq + ToOwned {} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<B: ?Sized> Ord for Cow<'_, B> -where - B: Ord + ToOwned, -{ - #[inline] - fn cmp(&self, other: &Self) -> Ordering { - Ord::cmp(&**self, &**other) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a, 'b, B: ?Sized, C: ?Sized> PartialEq<Cow<'b, C>> for Cow<'a, B> -where - B: PartialEq<C> + ToOwned, - C: ToOwned, -{ - #[inline] - fn eq(&self, other: &Cow<'b, C>) -> bool { - PartialEq::eq(&**self, &**other) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a, B: ?Sized> PartialOrd for Cow<'a, B> -where - B: PartialOrd + ToOwned, -{ - #[inline] - fn partial_cmp(&self, other: &Cow<'a, B>) -> Option<Ordering> { - PartialOrd::partial_cmp(&**self, &**other) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<B: ?Sized> fmt::Debug for Cow<'_, B> -where - B: fmt::Debug + ToOwned<Owned: fmt::Debug>, -{ - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match *self { - Borrowed(ref b) => fmt::Debug::fmt(b, f), - Owned(ref o) => fmt::Debug::fmt(o, f), - } - } -} - -#[stable(feature = "rust1", since = 
"1.0.0")] -impl<B: ?Sized> fmt::Display for Cow<'_, B> -where - B: fmt::Display + ToOwned<Owned: fmt::Display>, -{ - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match *self { - Borrowed(ref b) => fmt::Display::fmt(b, f), - Owned(ref o) => fmt::Display::fmt(o, f), - } - } -} - -#[stable(feature = "default", since = "1.11.0")] -impl<B: ?Sized> Default for Cow<'_, B> -where - B: ToOwned<Owned: Default>, -{ - /// Creates an owned Cow<'a, B> with the default value for the contained owned value. - fn default() -> Self { - Owned(<B as ToOwned>::Owned::default()) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<B: ?Sized> Hash for Cow<'_, B> -where - B: Hash + ToOwned, -{ - #[inline] - fn hash<H: Hasher>(&self, state: &mut H) { - Hash::hash(&**self, state) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<T: ?Sized + ToOwned> AsRef<T> for Cow<'_, T> { - fn as_ref(&self) -> &T { - self - } -} - -#[cfg(not(no_global_oom_handling))] -#[stable(feature = "cow_add", since = "1.14.0")] -impl<'a> Add<&'a str> for Cow<'a, str> { - type Output = Cow<'a, str>; - - #[inline] - fn add(mut self, rhs: &'a str) -> Self::Output { - self += rhs; - self - } -} - -#[cfg(not(no_global_oom_handling))] -#[stable(feature = "cow_add", since = "1.14.0")] -impl<'a> Add<Cow<'a, str>> for Cow<'a, str> { - type Output = Cow<'a, str>; - - #[inline] - fn add(mut self, rhs: Cow<'a, str>) -> Self::Output { - self += rhs; - self - } -} - -#[cfg(not(no_global_oom_handling))] -#[stable(feature = "cow_add", since = "1.14.0")] -impl<'a> AddAssign<&'a str> for Cow<'a, str> { - fn add_assign(&mut self, rhs: &'a str) { - if self.is_empty() { - *self = Cow::Borrowed(rhs) - } else if !rhs.is_empty() { - if let Cow::Borrowed(lhs) = *self { - let mut s = String::with_capacity(lhs.len() + rhs.len()); - s.push_str(lhs); - *self = Cow::Owned(s); - } - self.to_mut().push_str(rhs); - } - } -} - -#[cfg(not(no_global_oom_handling))] -#[stable(feature = "cow_add", since = "1.14.0")] 
-impl<'a> AddAssign<Cow<'a, str>> for Cow<'a, str> { - fn add_assign(&mut self, rhs: Cow<'a, str>) { - if self.is_empty() { - *self = rhs - } else if !rhs.is_empty() { - if let Cow::Borrowed(lhs) = *self { - let mut s = String::with_capacity(lhs.len() + rhs.len()); - s.push_str(lhs); - *self = Cow::Owned(s); - } - self.to_mut().push_str(&rhs); - } - } -} diff --git a/rust/alloc/lib.rs b/rust/alloc/lib.rs index 233bcd5e4654..3aebf83c9967 100644 --- a/rust/alloc/lib.rs +++ b/rust/alloc/lib.rs @@ -100,7 +100,7 @@ #![cfg_attr(not(no_global_oom_handling), feature(const_alloc_error))] #![feature(const_box)] #![cfg_attr(not(no_global_oom_handling), feature(const_btree_new))] -#![feature(const_cow_is_borrowed)] +#![cfg_attr(not(no_borrow), feature(const_cow_is_borrowed))] #![feature(const_convert)] #![feature(const_size_of_val)] #![feature(const_align_of_val)] @@ -215,6 +215,7 @@ pub mod boxed; mod boxed { pub use std::boxed::Box; } +#[cfg(not(no_borrow))] pub mod borrow; pub mod collections; #[cfg(not(no_global_oom_handling))] diff --git a/rust/alloc/vec/mod.rs b/rust/alloc/vec/mod.rs index 8ac6c1e3b2a8..f77c7368d534 100644 --- a/rust/alloc/vec/mod.rs +++ b/rust/alloc/vec/mod.rs @@ -72,6 +72,7 @@ use core::ptr::{self, NonNull}; use core::slice::{self, SliceIndex}; use crate::alloc::{Allocator, Global}; +#[cfg(not(no_borrow))] use crate::borrow::{Cow, ToOwned}; use crate::boxed::Box; use crate::collections::TryReserveError; @@ -94,6 +95,7 @@ pub use self::drain::Drain; mod drain; +#[cfg(not(no_borrow))] #[cfg(not(no_global_oom_handling))] mod cow; @@ -3103,6 +3105,7 @@ impl<T, const N: usize> From<[T; N]> for Vec<T> { } } +#[cfg(not(no_borrow))] #[stable(feature = "vec_from_cow_slice", since = "1.14.0")] impl<'a, T> From<Cow<'a, [T]>> for Vec<T> where diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index c48bc284214a..75d85bd6c592 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -7,6 +7,7 @@ */ #include 
<linux/slab.h> +#include <linux/refcount.h> /* `bindgen` gets confused at certain things. */ const gfp_t BINDINGS_GFP_KERNEL = GFP_KERNEL; diff --git a/rust/bindings/lib.rs b/rust/bindings/lib.rs index 6c50ee62c56b..7b246454e009 100644 --- a/rust/bindings/lib.rs +++ b/rust/bindings/lib.rs @@ -41,6 +41,7 @@ mod bindings_raw { #[allow(dead_code)] mod bindings_helper { // Import the generated bindings for types. + use super::bindings_raw::*; include!(concat!( env!("OBJTREE"), "/rust/bindings/bindings_helpers_generated.rs" diff --git a/rust/compiler_builtins.rs b/rust/compiler_builtins.rs index f8f39a3e6855..43378357ece9 100644 --- a/rust/compiler_builtins.rs +++ b/rust/compiler_builtins.rs @@ -28,7 +28,7 @@ macro_rules! define_panicking_intrinsics( ($reason: tt, { $($ident: ident, )* }) => { $( #[doc(hidden)] - #[no_mangle] + #[export_name = concat!("__rust", stringify!($ident))] pub extern "C" fn $ident() { panic!($reason); } @@ -61,3 +61,6 @@ define_panicking_intrinsics!("`u128` should not be used", { __udivti3, __umodti3, }); + +// NOTE: if you are adding a new intrinsic here, you should also add it to +// `redirect-intrinsics` in `rust/Makefile`. 
diff --git a/rust/helpers.c b/rust/helpers.c index b4f15eee2ffd..09a4d93f9d62 100644 --- a/rust/helpers.c +++ b/rust/helpers.c @@ -20,6 +20,7 @@ #include <linux/bug.h> #include <linux/build_bug.h> +#include <linux/refcount.h> __noreturn void rust_helper_BUG(void) { @@ -27,6 +28,24 @@ __noreturn void rust_helper_BUG(void) } EXPORT_SYMBOL_GPL(rust_helper_BUG); +refcount_t rust_helper_REFCOUNT_INIT(int n) +{ + return (refcount_t)REFCOUNT_INIT(n); +} +EXPORT_SYMBOL_GPL(rust_helper_REFCOUNT_INIT); + +void rust_helper_refcount_inc(refcount_t *r) +{ + refcount_inc(r); +} +EXPORT_SYMBOL_GPL(rust_helper_refcount_inc); + +bool rust_helper_refcount_dec_and_test(refcount_t *r) +{ + return refcount_dec_and_test(r); +} +EXPORT_SYMBOL_GPL(rust_helper_refcount_dec_and_test); + /* * We use `bindgen`'s `--size_t-is-usize` option to bind the C `size_t` type * as the Rust `usize` type, so we can use it in contexts where Rust diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index 53040fa9e897..223564f9f0cc 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -13,7 +13,12 @@ #![no_std] #![feature(allocator_api)] +#![feature(coerce_unsized)] #![feature(core_ffi_c)] +#![feature(dispatch_from_dyn)] +#![feature(generic_associated_types)] +#![feature(receiver_trait)] +#![feature(unsize)] // Ensure conditional compilation based on the kernel configuration works; // otherwise we may silently break things like initcall handling. @@ -31,6 +36,7 @@ mod static_assert; #[doc(hidden)] pub mod std_vendor; pub mod str; +pub mod sync; pub mod types; #[doc(hidden)] diff --git a/rust/kernel/prelude.rs b/rust/kernel/prelude.rs index 7a90249ee9b9..0bc1c97e5604 100644 --- a/rust/kernel/prelude.rs +++ b/rust/kernel/prelude.rs @@ -11,15 +11,21 @@ //! use kernel::prelude::*; //! 
``` +#[doc(no_inline)] pub use core::pin::Pin; +#[doc(no_inline)] pub use alloc::{boxed::Box, vec::Vec}; +#[doc(no_inline)] pub use macros::{module, vtable}; pub use super::build_assert; -pub use super::{dbg, pr_alert, pr_crit, pr_debug, pr_emerg, pr_err, pr_info, pr_notice, pr_warn}; +// `super::std_vendor` is hidden, which makes the macro inline for some reason. +#[doc(no_inline)] +pub use super::dbg; +pub use super::{pr_alert, pr_crit, pr_debug, pr_emerg, pr_err, pr_info, pr_notice, pr_warn}; pub use super::static_assert; diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs new file mode 100644 index 000000000000..33da23e3076d --- /dev/null +++ b/rust/kernel/sync.rs @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Synchronisation primitives. +//! +//! This module contains the kernel APIs related to synchronisation that have been ported or +//! wrapped for usage by Rust code in the kernel. + +mod arc; + +pub use arc::{Arc, ArcBorrow, UniqueArc}; diff --git a/rust/kernel/sync/arc.rs b/rust/kernel/sync/arc.rs new file mode 100644 index 000000000000..f2f1c83d72ba --- /dev/null +++ b/rust/kernel/sync/arc.rs @@ -0,0 +1,524 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! A reference-counted pointer. +//! +//! This module implements a way for users to create reference-counted objects and pointers to +//! them. Such a pointer automatically increments and decrements the count, and drops the +//! underlying object when it reaches zero. It is also safe to use concurrently from multiple +//! threads. +//! +//! It is different from the standard library's [`Arc`] in a few ways: +//! 1. It is backed by the kernel's `refcount_t` type. +//! 2. It does not support weak references, which allows it to be half the size. +//! 3. It saturates the reference count instead of aborting when it goes over a threshold. +//! 4. It does not provide a `get_mut` method, so the ref counted object is pinned. +//! +//! 
[`Arc`]: https://doc.rust-lang.org/std/sync/struct.Arc.html + +use crate::{ + bindings, + error::Result, + types::{ForeignOwnable, Opaque}, +}; +use alloc::boxed::Box; +use core::{ + marker::{PhantomData, Unsize}, + mem::{ManuallyDrop, MaybeUninit}, + ops::{Deref, DerefMut}, + pin::Pin, + ptr::NonNull, +}; + +/// A reference-counted pointer to an instance of `T`. +/// +/// The reference count is incremented when new instances of [`Arc`] are created, and decremented +/// when they are dropped. When the count reaches zero, the underlying `T` is also dropped. +/// +/// # Invariants +/// +/// The reference count on an instance of [`Arc`] is always non-zero. +/// The object pointed to by [`Arc`] is always pinned. +/// +/// # Examples +/// +/// ``` +/// use kernel::sync::Arc; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// // Create a ref-counted instance of `Example`. +/// let obj = Arc::try_new(Example { a: 10, b: 20 })?; +/// +/// // Get a new pointer to `obj` and increment the refcount. +/// let cloned = obj.clone(); +/// +/// // Assert that both `obj` and `cloned` point to the same underlying object. +/// assert!(core::ptr::eq(&*obj, &*cloned)); +/// +/// // Destroy `obj` and decrement its refcount. +/// drop(obj); +/// +/// // Check that the values are still accessible through `cloned`. +/// assert_eq!(cloned.a, 10); +/// assert_eq!(cloned.b, 20); +/// +/// // The refcount drops to zero when `cloned` goes out of scope, and the memory is freed. +/// ``` +/// +/// Using `Arc<T>` as the type of `self`: +/// +/// ``` +/// use kernel::sync::Arc; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// impl Example { +/// fn take_over(self: Arc<Self>) { +/// // ... +/// } +/// +/// fn use_reference(self: &Arc<Self>) { +/// // ... 
+/// } +/// } +/// +/// let obj = Arc::try_new(Example { a: 10, b: 20 })?; +/// obj.use_reference(); +/// obj.take_over(); +/// ``` +/// +/// Coercion from `Arc<Example>` to `Arc<dyn MyTrait>`: +/// +/// ``` +/// use kernel::sync::{Arc, ArcBorrow}; +/// +/// trait MyTrait { +/// // Trait has a function whose `self` type is `Arc<Self>`. +/// fn example1(self: Arc<Self>) {} +/// +/// // Trait has a function whose `self` type is `ArcBorrow<'_, Self>`. +/// fn example2(self: ArcBorrow<'_, Self>) {} +/// } +/// +/// struct Example; +/// impl MyTrait for Example {} +/// +/// // `obj` has type `Arc<Example>`. +/// let obj: Arc<Example> = Arc::try_new(Example)?; +/// +/// // `coerced` has type `Arc<dyn MyTrait>`. +/// let coerced: Arc<dyn MyTrait> = obj; +/// ``` +pub struct Arc<T: ?Sized> { + ptr: NonNull<ArcInner<T>>, + _p: PhantomData<ArcInner<T>>, +} + +#[repr(C)] +struct ArcInner<T: ?Sized> { + refcount: Opaque<bindings::refcount_t>, + data: T, +} + +// This is to allow [`Arc`] (and variants) to be used as the type of `self`. +impl<T: ?Sized> core::ops::Receiver for Arc<T> {} + +// This is to allow coercion from `Arc<T>` to `Arc<U>` if `T` can be converted to the +// dynamically-sized type (DST) `U`. +impl<T: ?Sized + Unsize<U>, U: ?Sized> core::ops::CoerceUnsized<Arc<U>> for Arc<T> {} + +// This is to allow `Arc<U>` to be dispatched on when `Arc<T>` can be coerced into `Arc<U>`. +impl<T: ?Sized + Unsize<U>, U: ?Sized> core::ops::DispatchFromDyn<Arc<U>> for Arc<T> {} + +// SAFETY: It is safe to send `Arc<T>` to another thread when the underlying `T` is `Sync` because +// it effectively means sharing `&T` (which is safe because `T` is `Sync`); additionally, it needs +// `T` to be `Send` because any thread that has an `Arc<T>` may ultimately access `T` directly, for +// example, when the reference count reaches zero and `T` is dropped. 
+unsafe impl<T: ?Sized + Sync + Send> Send for Arc<T> {} + +// SAFETY: It is safe to send `&Arc<T>` to another thread when the underlying `T` is `Sync` for the +// same reason as above. `T` needs to be `Send` as well because a thread can clone an `&Arc<T>` +// into an `Arc<T>`, which may lead to `T` being accessed by the same reasoning as above. +unsafe impl<T: ?Sized + Sync + Send> Sync for Arc<T> {} + +impl<T> Arc<T> { + /// Constructs a new reference counted instance of `T`. + pub fn try_new(contents: T) -> Result<Self> { + // INVARIANT: The refcount is initialised to a non-zero value. + let value = ArcInner { + // SAFETY: There are no safety requirements for this FFI call. + refcount: Opaque::new(unsafe { bindings::REFCOUNT_INIT(1) }), + data: contents, + }; + + let inner = Box::try_new(value)?; + + // SAFETY: We just created `inner` with a reference count of 1, which is owned by the new + // `Arc` object. + Ok(unsafe { Self::from_inner(Box::leak(inner).into()) }) + } +} + +impl<T: ?Sized> Arc<T> { + /// Constructs a new [`Arc`] from an existing [`ArcInner`]. + /// + /// # Safety + /// + /// The caller must ensure that `inner` points to a valid location and has a non-zero reference + /// count, one of which will be owned by the new [`Arc`] instance. + unsafe fn from_inner(inner: NonNull<ArcInner<T>>) -> Self { + // INVARIANT: By the safety requirements, the invariants hold. + Arc { + ptr: inner, + _p: PhantomData, + } + } + + /// Returns an [`ArcBorrow`] from the given [`Arc`]. + /// + /// This is useful when the argument of a function call is an [`ArcBorrow`] (e.g., in a method + /// receiver), but we have an [`Arc`] instead. Getting an [`ArcBorrow`] is free when optimised. + #[inline] + pub fn as_arc_borrow(&self) -> ArcBorrow<'_, T> { + // SAFETY: The constraint that the lifetime of the shared reference must outlive that of + // the returned `ArcBorrow` ensures that the object remains alive and that no mutable + // reference can be created. 
+ unsafe { ArcBorrow::new(self.ptr) } + } +} + +impl<T: 'static> ForeignOwnable for Arc<T> { + type Borrowed<'a> = ArcBorrow<'a, T>; + + fn into_foreign(self) -> *const core::ffi::c_void { + ManuallyDrop::new(self).ptr.as_ptr() as _ + } + + unsafe fn borrow<'a>(ptr: *const core::ffi::c_void) -> ArcBorrow<'a, T> { + // SAFETY: By the safety requirement of this function, we know that `ptr` came from + // a previous call to `Arc::into_foreign`. + let inner = NonNull::new(ptr as *mut ArcInner<T>).unwrap(); + + // SAFETY: The safety requirements of `from_foreign` ensure that the object remains alive + // for the lifetime of the returned value. Additionally, the safety requirements of + // `ForeignOwnable::borrow_mut` ensure that no new mutable references are created. + unsafe { ArcBorrow::new(inner) } + } + + unsafe fn from_foreign(ptr: *const core::ffi::c_void) -> Self { + // SAFETY: By the safety requirement of this function, we know that `ptr` came from + // a previous call to `Arc::into_foreign`, which guarantees that `ptr` is valid and + // holds a reference count increment that is transferrable to us. + unsafe { Self::from_inner(NonNull::new(ptr as _).unwrap()) } + } +} + +impl<T: ?Sized> Deref for Arc<T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + // SAFETY: By the type invariant, there is necessarily a reference to the object, so it is + // safe to dereference it. + unsafe { &self.ptr.as_ref().data } + } +} + +impl<T: ?Sized> Clone for Arc<T> { + fn clone(&self) -> Self { + // INVARIANT: C `refcount_inc` saturates the refcount, so it cannot overflow to zero. + // SAFETY: By the type invariant, there is necessarily a reference to the object, so it is + // safe to increment the refcount. + unsafe { bindings::refcount_inc(self.ptr.as_ref().refcount.get()) }; + + // SAFETY: We just incremented the refcount. This increment is now owned by the new `Arc`. 
+ unsafe { Self::from_inner(self.ptr) } + } +} + +impl<T: ?Sized> Drop for Arc<T> { + fn drop(&mut self) { + // SAFETY: By the type invariant, there is necessarily a reference to the object. We cannot + // touch `refcount` after it's decremented to a non-zero value because another thread/CPU + // may concurrently decrement it to zero and free it. It is ok to have a raw pointer to + // freed/invalid memory as long as it is never dereferenced. + let refcount = unsafe { self.ptr.as_ref() }.refcount.get(); + + // INVARIANT: If the refcount reaches zero, there are no other instances of `Arc`, and + // this instance is being dropped, so the broken invariant is not observable. + // SAFETY: Also by the type invariant, we are allowed to decrement the refcount. + let is_zero = unsafe { bindings::refcount_dec_and_test(refcount) }; + if is_zero { + // The count reached zero, we must free the memory. + // + // SAFETY: The pointer was initialised from the result of `Box::leak`. + unsafe { Box::from_raw(self.ptr.as_ptr()) }; + } + } +} + +impl<T: ?Sized> From<UniqueArc<T>> for Arc<T> { + fn from(item: UniqueArc<T>) -> Self { + item.inner + } +} + +impl<T: ?Sized> From<Pin<UniqueArc<T>>> for Arc<T> { + fn from(item: Pin<UniqueArc<T>>) -> Self { + // SAFETY: The type invariants of `Arc` guarantee that the data is pinned. + unsafe { Pin::into_inner_unchecked(item).inner } + } +} + +/// A borrowed reference to an [`Arc`] instance. +/// +/// For cases when one doesn't ever need to increment the refcount on the allocation, it is simpler +/// to use just `&T`, which we can trivially get from an `Arc<T>` instance. +/// +/// However, when one may need to increment the refcount, it is preferable to use an `ArcBorrow<T>` +/// over `&Arc<T>` because the latter results in a double-indirection: a pointer (shared reference) +/// to a pointer (`Arc<T>`) to the object (`T`). 
An [`ArcBorrow`] eliminates this double +/// indirection while still allowing one to increment the refcount and get an `Arc<T>` when/if +/// needed. +/// +/// # Invariants +/// +/// There are no mutable references to the underlying [`Arc`], and it remains valid for the +/// lifetime of the [`ArcBorrow`] instance. +/// +/// # Example +/// +/// ``` +/// use kernel::sync::{Arc, ArcBorrow}; +/// +/// struct Example; +/// +/// fn do_something(e: ArcBorrow<'_, Example>) -> Arc<Example> { +/// e.into() +/// } +/// +/// let obj = Arc::try_new(Example)?; +/// let cloned = do_something(obj.as_arc_borrow()); +/// +/// // Assert that both `obj` and `cloned` point to the same underlying object. +/// assert!(core::ptr::eq(&*obj, &*cloned)); +/// ``` +/// +/// Using `ArcBorrow<T>` as the type of `self`: +/// +/// ``` +/// use kernel::sync::{Arc, ArcBorrow}; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// impl Example { +/// fn use_reference(self: ArcBorrow<'_, Self>) { +/// // ... +/// } +/// } +/// +/// let obj = Arc::try_new(Example { a: 10, b: 20 })?; +/// obj.as_arc_borrow().use_reference(); +/// ``` +pub struct ArcBorrow<'a, T: ?Sized + 'a> { + inner: NonNull<ArcInner<T>>, + _p: PhantomData<&'a ()>, +} + +// This is to allow [`ArcBorrow`] (and variants) to be used as the type of `self`. +impl<T: ?Sized> core::ops::Receiver for ArcBorrow<'_, T> {} + +// This is to allow `ArcBorrow<U>` to be dispatched on when `ArcBorrow<T>` can be coerced into +// `ArcBorrow<U>`. +impl<T: ?Sized + Unsize<U>, U: ?Sized> core::ops::DispatchFromDyn<ArcBorrow<'_, U>> + for ArcBorrow<'_, T> +{ +} + +impl<T: ?Sized> Clone for ArcBorrow<'_, T> { + fn clone(&self) -> Self { + *self + } +} + +impl<T: ?Sized> Copy for ArcBorrow<'_, T> {} + +impl<T: ?Sized> ArcBorrow<'_, T> { + /// Creates a new [`ArcBorrow`] instance. + /// + /// # Safety + /// + /// Callers must ensure the following for the lifetime of the returned [`ArcBorrow`] instance: + /// 1. 
That `inner` remains valid; + /// 2. That no mutable references to `inner` are created. + unsafe fn new(inner: NonNull<ArcInner<T>>) -> Self { + // INVARIANT: The safety requirements guarantee the invariants. + Self { + inner, + _p: PhantomData, + } + } +} + +impl<T: ?Sized> From<ArcBorrow<'_, T>> for Arc<T> { + fn from(b: ArcBorrow<'_, T>) -> Self { + // SAFETY: The existence of `b` guarantees that the refcount is non-zero. `ManuallyDrop` + // guarantees that `drop` isn't called, so it's ok that the temporary `Arc` doesn't own the + // increment. + ManuallyDrop::new(unsafe { Arc::from_inner(b.inner) }) + .deref() + .clone() + } +} + +impl<T: ?Sized> Deref for ArcBorrow<'_, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + // SAFETY: By the type invariant, the underlying object is still alive with no mutable + // references to it, so it is safe to create a shared reference. + unsafe { &self.inner.as_ref().data } + } +} + +/// A refcounted object that is known to have a refcount of 1. +/// +/// It is mutable and can be converted to an [`Arc`] so that it can be shared. +/// +/// # Invariants +/// +/// `inner` always has a reference count of 1. +/// +/// # Examples +/// +/// In the following example, we make changes to the inner object before turning it into an +/// `Arc<Example>` object (after which point, it cannot be mutated directly). Note that `x.into()` +/// cannot fail. +/// +/// ``` +/// use kernel::sync::{Arc, UniqueArc}; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// fn test() -> Result<Arc<Example>> { +/// let mut x = UniqueArc::try_new(Example { a: 10, b: 20 })?; +/// x.a += 1; +/// x.b += 1; +/// Ok(x.into()) +/// } +/// +/// # test().unwrap(); +/// ``` +/// +/// In the following example we first allocate memory for a ref-counted `Example` but we don't +/// initialise it on allocation. We do initialise it later with a call to [`UniqueArc::write`], +/// followed by a conversion to `Arc<Example>`. 
This is particularly useful when allocation happens +/// in one context (e.g., sleepable) and initialisation in another (e.g., atomic): +/// +/// ``` +/// use kernel::sync::{Arc, UniqueArc}; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// fn test() -> Result<Arc<Example>> { +/// let x = UniqueArc::try_new_uninit()?; +/// Ok(x.write(Example { a: 10, b: 20 }).into()) +/// } +/// +/// # test().unwrap(); +/// ``` +/// +/// In the last example below, the caller gets a pinned instance of `Example` while converting to +/// `Arc<Example>`; this is useful in scenarios where one needs a pinned reference during +/// initialisation, for example, when initialising fields that are wrapped in locks. +/// +/// ``` +/// use kernel::sync::{Arc, UniqueArc}; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// fn test() -> Result<Arc<Example>> { +/// let mut pinned = Pin::from(UniqueArc::try_new(Example { a: 10, b: 20 })?); +/// // We can modify `pinned` because it is `Unpin`. +/// pinned.as_mut().a += 1; +/// Ok(pinned.into()) +/// } +/// +/// # test().unwrap(); +/// ``` +pub struct UniqueArc<T: ?Sized> { + inner: Arc<T>, +} + +impl<T> UniqueArc<T> { + /// Tries to allocate a new [`UniqueArc`] instance. + pub fn try_new(value: T) -> Result<Self> { + Ok(Self { + // INVARIANT: The newly-created object has a ref-count of 1. + inner: Arc::try_new(value)?, + }) + } + + /// Tries to allocate a new [`UniqueArc`] instance whose contents are not initialised yet. + pub fn try_new_uninit() -> Result<UniqueArc<MaybeUninit<T>>> { + Ok(UniqueArc::<MaybeUninit<T>> { + // INVARIANT: The newly-created object has a ref-count of 1. + inner: Arc::try_new(MaybeUninit::uninit())?, + }) + } +} + +impl<T> UniqueArc<MaybeUninit<T>> { + /// Converts a `UniqueArc<MaybeUninit<T>>` into a `UniqueArc<T>` by writing a value into it. 
+ pub fn write(mut self, value: T) -> UniqueArc<T> { + self.deref_mut().write(value); + let inner = ManuallyDrop::new(self).inner.ptr; + UniqueArc { + // SAFETY: The new `Arc` is taking over `ptr` from `self.inner` (which won't be + // dropped). The types are compatible because `MaybeUninit<T>` is compatible with `T`. + inner: unsafe { Arc::from_inner(inner.cast()) }, + } + } +} + +impl<T: ?Sized> From<UniqueArc<T>> for Pin<UniqueArc<T>> { + fn from(obj: UniqueArc<T>) -> Self { + // SAFETY: It is not possible to move/replace `T` inside a `Pin<UniqueArc<T>>` (unless `T` + // is `Unpin`), so it is ok to convert it to `Pin<UniqueArc<T>>`. + unsafe { Pin::new_unchecked(obj) } + } +} + +impl<T: ?Sized> Deref for UniqueArc<T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.inner.deref() + } +} + +impl<T: ?Sized> DerefMut for UniqueArc<T> { + fn deref_mut(&mut self) -> &mut Self::Target { + // SAFETY: By the `Arc` type invariant, there is necessarily a reference to the object, so + // it is safe to dereference it. Additionally, we know there is only one reference when + // it's inside a `UniqueArc`, so it is safe to get a mutable reference. + unsafe { &mut self.inner.ptr.as_mut().data } + } +} diff --git a/rust/kernel/types.rs b/rust/kernel/types.rs index e84e51ec9716..9d0fdbc55843 100644 --- a/rust/kernel/types.rs +++ b/rust/kernel/types.rs @@ -2,7 +2,220 @@ //! Kernel types. -use core::{cell::UnsafeCell, mem::MaybeUninit}; +use alloc::boxed::Box; +use core::{ + cell::UnsafeCell, + mem::MaybeUninit, + ops::{Deref, DerefMut}, +}; + +/// Used to transfer ownership to and from foreign (non-Rust) languages. +/// +/// Ownership is transferred from Rust to a foreign language by calling [`Self::into_foreign`] and +/// later may be transferred back to Rust by calling [`Self::from_foreign`]. +/// +/// This trait is meant to be used in cases when Rust objects are stored in C objects and +/// eventually "freed" back to Rust. 
+pub trait ForeignOwnable: Sized { + /// Type of values borrowed between calls to [`ForeignOwnable::into_foreign`] and + /// [`ForeignOwnable::from_foreign`]. + type Borrowed<'a>; + + /// Converts a Rust-owned object to a foreign-owned one. + /// + /// The foreign representation is a pointer to void. + fn into_foreign(self) -> *const core::ffi::c_void; + + /// Borrows a foreign-owned object. + /// + /// # Safety + /// + /// `ptr` must have been returned by a previous call to [`ForeignOwnable::into_foreign`] for + /// which a previous matching [`ForeignOwnable::from_foreign`] hasn't been called yet. + /// Additionally, all instances (if any) of values returned by [`ForeignOwnable::borrow_mut`] + /// for this object must have been dropped. + unsafe fn borrow<'a>(ptr: *const core::ffi::c_void) -> Self::Borrowed<'a>; + + /// Mutably borrows a foreign-owned object. + /// + /// # Safety + /// + /// `ptr` must have been returned by a previous call to [`ForeignOwnable::into_foreign`] for + /// which a previous matching [`ForeignOwnable::from_foreign`] hasn't been called yet. + /// Additionally, all instances (if any) of values returned by [`ForeignOwnable::borrow`] and + /// [`ForeignOwnable::borrow_mut`] for this object must have been dropped. + unsafe fn borrow_mut(ptr: *const core::ffi::c_void) -> ScopeGuard<Self, fn(Self)> { + // SAFETY: The safety requirements ensure that `ptr` came from a previous call to + // `into_foreign`. + ScopeGuard::new_with_data(unsafe { Self::from_foreign(ptr) }, |d| { + d.into_foreign(); + }) + } + + /// Converts a foreign-owned object back to a Rust-owned one. + /// + /// # Safety + /// + /// `ptr` must have been returned by a previous call to [`ForeignOwnable::into_foreign`] for + /// which a previous matching [`ForeignOwnable::from_foreign`] hasn't been called yet. + /// Additionally, all instances (if any) of values returned by [`ForeignOwnable::borrow`] and + /// [`ForeignOwnable::borrow_mut`] for this object must have been dropped. 
+ unsafe fn from_foreign(ptr: *const core::ffi::c_void) -> Self; +} + +impl<T: 'static> ForeignOwnable for Box<T> { + type Borrowed<'a> = &'a T; + + fn into_foreign(self) -> *const core::ffi::c_void { + Box::into_raw(self) as _ + } + + unsafe fn borrow<'a>(ptr: *const core::ffi::c_void) -> &'a T { + // SAFETY: The safety requirements for this function ensure that the object is still alive, + // so it is safe to dereference the raw pointer. + // The safety requirements of `from_foreign` also ensure that the object remains alive for + // the lifetime of the returned value. + unsafe { &*ptr.cast() } + } + + unsafe fn from_foreign(ptr: *const core::ffi::c_void) -> Self { + // SAFETY: The safety requirements of this function ensure that `ptr` comes from a previous + // call to `Self::into_foreign`. + unsafe { Box::from_raw(ptr as _) } + } +} + +impl ForeignOwnable for () { + type Borrowed<'a> = (); + + fn into_foreign(self) -> *const core::ffi::c_void { + core::ptr::NonNull::dangling().as_ptr() + } + + unsafe fn borrow<'a>(_: *const core::ffi::c_void) -> Self::Borrowed<'a> {} + + unsafe fn from_foreign(_: *const core::ffi::c_void) -> Self {} +} + +/// Runs a cleanup function/closure when dropped. +/// +/// The [`ScopeGuard::dismiss`] function prevents the cleanup function from running. 
+/// +/// # Examples +/// +/// In the example below, we have multiple exit paths and we want to log regardless of which one is +/// taken: +/// ``` +/// # use kernel::ScopeGuard; +/// fn example1(arg: bool) { +/// let _log = ScopeGuard::new(|| pr_info!("example1 completed\n")); +/// +/// if arg { +/// return; +/// } +/// +/// pr_info!("Do something...\n"); +/// } +/// +/// # example1(false); +/// # example1(true); +/// ``` +/// +/// In the example below, we want to log the same message on all early exits but a different one on +/// the main exit path: +/// ``` +/// # use kernel::ScopeGuard; +/// fn example2(arg: bool) { +/// let log = ScopeGuard::new(|| pr_info!("example2 returned early\n")); +/// +/// if arg { +/// return; +/// } +/// +/// // (Other early returns...) +/// +/// log.dismiss(); +/// pr_info!("example2 no early return\n"); +/// } +/// +/// # example2(false); +/// # example2(true); +/// ``` +/// +/// In the example below, we need a mutable object (the vector) to be accessible within the log +/// function, so we wrap it in the [`ScopeGuard`]: +/// ``` +/// # use kernel::ScopeGuard; +/// fn example3(arg: bool) -> Result { +/// let mut vec = +/// ScopeGuard::new_with_data(Vec::new(), |v| pr_info!("vec had {} elements\n", v.len())); +/// +/// vec.try_push(10u8)?; +/// if arg { +/// return Ok(()); +/// } +/// vec.try_push(20u8)?; +/// Ok(()) +/// } +/// +/// # assert_eq!(example3(false), Ok(())); +/// # assert_eq!(example3(true), Ok(())); +/// ``` +/// +/// # Invariants +/// +/// The value stored in the struct is nearly always `Some(_)`, except between +/// [`ScopeGuard::dismiss`] and [`ScopeGuard::drop`]: in this case, it will be `None` as the value +/// will have been returned to the caller. Since [`ScopeGuard::dismiss`] consumes the guard, +/// callers won't be able to use it anymore. 
+pub struct ScopeGuard<T, F: FnOnce(T)>(Option<(T, F)>); + +impl<T, F: FnOnce(T)> ScopeGuard<T, F> { + /// Creates a new guarded object wrapping the given data and with the given cleanup function. + pub fn new_with_data(data: T, cleanup_func: F) -> Self { + // INVARIANT: The struct is being initialised with `Some(_)`. + Self(Some((data, cleanup_func))) + } + + /// Prevents the cleanup function from running and returns the guarded data. + pub fn dismiss(mut self) -> T { + // INVARIANT: This is the exception case in the invariant; it is not visible to callers + // because this function consumes `self`. + self.0.take().unwrap().0 + } +} + +impl ScopeGuard<(), fn(())> { + /// Creates a new guarded object with the given cleanup function. + pub fn new(cleanup: impl FnOnce()) -> ScopeGuard<(), impl FnOnce(())> { + ScopeGuard::new_with_data((), move |_| cleanup()) + } +} + +impl<T, F: FnOnce(T)> Deref for ScopeGuard<T, F> { + type Target = T; + + fn deref(&self) -> &T { + // The type invariants guarantee that `unwrap` will succeed. + &self.0.as_ref().unwrap().0 + } +} + +impl<T, F: FnOnce(T)> DerefMut for ScopeGuard<T, F> { + fn deref_mut(&mut self) -> &mut T { + // The type invariants guarantee that `unwrap` will succeed. + &mut self.0.as_mut().unwrap().0 + } +} + +impl<T, F: FnOnce(T)> Drop for ScopeGuard<T, F> { + fn drop(&mut self) { + // Run the cleanup function if one is still present. + if let Some((data, cleanup)) = self.0.take() { + cleanup(data) + } + } +} /// Stores an opaque value. /// diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst index 836391e5d209..4815a8e32227 100644 --- a/scripts/Makefile.modinst +++ b/scripts/Makefile.modinst @@ -66,9 +66,13 @@ endif # Don't stop modules_install even if we can't sign external modules. 
# ifeq ($(CONFIG_MODULE_SIG_ALL),y) +ifeq ($(filter pkcs11:%, $(CONFIG_MODULE_SIG_KEY)),) sig-key := $(if $(wildcard $(CONFIG_MODULE_SIG_KEY)),,$(srctree)/)$(CONFIG_MODULE_SIG_KEY) +else +sig-key := $(CONFIG_MODULE_SIG_KEY) +endif quiet_cmd_sign = SIGN $@ - cmd_sign = scripts/sign-file $(CONFIG_MODULE_SIG_HASH) $(sig-key) certs/signing_key.x509 $@ \ + cmd_sign = scripts/sign-file $(CONFIG_MODULE_SIG_HASH) "$(sig-key)" certs/signing_key.x509 $@ \ $(if $(KBUILD_EXTMOD),|| true) else quiet_cmd_sign := diff --git a/scripts/gdb/linux/cpus.py b/scripts/gdb/linux/cpus.py index 15fc4626d236..9ee99f9fae8d 100644 --- a/scripts/gdb/linux/cpus.py +++ b/scripts/gdb/linux/cpus.py @@ -163,7 +163,7 @@ def get_current_task(cpu): task_ptr_type = task_type.get_type().pointer() if utils.is_target_arch("x86"): - var_ptr = gdb.parse_and_eval("&current_task") + var_ptr = gdb.parse_and_eval("&pcpu_hot.current_task") return per_cpu(var_ptr, cpu).dereference() elif utils.is_target_arch("aarch64"): current_task_addr = gdb.parse_and_eval("$SP_EL0") diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 424b2c1e586d..db7a51acf9db 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -1793,7 +1793,7 @@ fail2: return error; } -static int ns_mkdir_op(struct user_namespace *mnt_userns, struct inode *dir, +static int ns_mkdir_op(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct aa_ns *ns, *parent; diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 6dd3cc5309bf..f3715cda59c5 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -313,7 +313,7 @@ static int aa_xattrs_match(const struct linux_binprm *bprm, d = bprm->file->f_path.dentry; for (i = 0; i < attach->xattr_count; i++) { - size = vfs_getxattr_alloc(&init_user_ns, d, attach->xattrs[i], + size = vfs_getxattr_alloc(&nop_mnt_idmap, d, attach->xattrs[i], &value, value_size, GFP_KERNEL); if (size >= 0) { u32 
index, perm; @@ -862,7 +862,7 @@ int apparmor_bprm_creds_for_exec(struct linux_binprm *bprm) const char *info = NULL; int error = 0; bool unsafe = false; - vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_user_ns(bprm->file), + vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(bprm->file), file_inode(bprm->file)); struct path_cond cond = { vfsuid_into_kuid(vfsuid), diff --git a/security/apparmor/file.c b/security/apparmor/file.c index cb3d3060d104..9119ddda6217 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -459,7 +459,7 @@ static int __file_path_perm(const char *op, struct aa_label *label, { struct aa_profile *profile; struct aa_perms perms = {}; - vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_user_ns(file), + vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(file), file_inode(file)); struct path_cond cond = { .uid = vfsuid_into_kuid(vfsuid), diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index c6728a629437..d6cc4812ca53 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -227,8 +227,7 @@ static int common_perm(const char *op, const struct path *path, u32 mask, */ static int common_perm_cond(const char *op, const struct path *path, u32 mask) { - struct user_namespace *mnt_userns = mnt_user_ns(path->mnt); - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_idmap(path->mnt), d_backing_inode(path->dentry)); struct path_cond cond = { vfsuid_into_kuid(vfsuid), @@ -273,14 +272,13 @@ static int common_perm_rm(const char *op, const struct path *dir, struct dentry *dentry, u32 mask) { struct inode *inode = d_backing_inode(dentry); - struct user_namespace *mnt_userns = mnt_user_ns(dir->mnt); struct path_cond cond = { }; vfsuid_t vfsuid; if (!inode || !path_mediated_fs(dentry)) return 0; - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(mnt_idmap(dir->mnt), inode); cond.uid = vfsuid_into_kuid(vfsuid); cond.mode = inode->i_mode; @@ -379,7 +377,7 @@ 
static int apparmor_path_rename(const struct path *old_dir, struct dentry *old_d label = begin_current_label_crit_section(); if (!unconfined(label)) { - struct user_namespace *mnt_userns = mnt_user_ns(old_dir->mnt); + struct mnt_idmap *idmap = mnt_idmap(old_dir->mnt); vfsuid_t vfsuid; struct path old_path = { .mnt = old_dir->mnt, .dentry = old_dentry }; @@ -388,14 +386,14 @@ static int apparmor_path_rename(const struct path *old_dir, struct dentry *old_d struct path_cond cond = { .mode = d_backing_inode(old_dentry)->i_mode }; - vfsuid = i_uid_into_vfsuid(mnt_userns, d_backing_inode(old_dentry)); + vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond.uid = vfsuid_into_kuid(vfsuid); if (flags & RENAME_EXCHANGE) { struct path_cond cond_exchange = { .mode = d_backing_inode(new_dentry)->i_mode, }; - vfsuid = i_uid_into_vfsuid(mnt_userns, d_backing_inode(old_dentry)); + vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond_exchange.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_RENAME_SRC, label, &new_path, 0, @@ -460,13 +458,13 @@ static int apparmor_file_open(struct file *file) label = aa_get_newest_cred_label(file->f_cred); if (!unconfined(label)) { - struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); vfsuid_t vfsuid; struct path_cond cond = { .mode = inode->i_mode, }; - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); cond.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_OPEN, label, &file->f_path, 0, diff --git a/security/apparmor/policy_compat.c b/security/apparmor/policy_compat.c index 9e52e218bf30..cc89d1e88fb7 100644 --- a/security/apparmor/policy_compat.c +++ b/security/apparmor/policy_compat.c @@ -160,8 +160,7 @@ static struct aa_perms *compute_fperms(struct aa_dfa *dfa) if (!table) return NULL; - /* zero init so skip the trap state (state == 0) */ - for (state = 1; state < 
state_count; state++) { + for (state = 0; state < state_count; state++) { table[state * 2] = compute_fperms_user(dfa, state); table[state * 2 + 1] = compute_fperms_other(dfa, state); } diff --git a/security/commoncap.c b/security/commoncap.c index 1164278b97fd..aec62db55271 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -305,24 +305,24 @@ int cap_inode_need_killpriv(struct dentry *dentry) /** * cap_inode_killpriv - Erase the security markings on an inode * - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: The inode/dentry to alter * * Erase the privilege-enhancing security markings on an inode. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply passs @nop_mnt_idmap. * * Return: 0 if successful, -ve on error. */ -int cap_inode_killpriv(struct user_namespace *mnt_userns, struct dentry *dentry) +int cap_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry) { int error; - error = __vfs_removexattr(mnt_userns, dentry, XATTR_NAME_CAPS); + error = __vfs_removexattr(idmap, dentry, XATTR_NAME_CAPS); if (error == -EOPNOTSUPP) error = 0; return error; @@ -377,7 +377,7 @@ static bool is_v3header(int size, const struct vfs_cap_data *cap) * by the integrity subsystem, which really wants the unconverted values - * so that's good. 
*/ -int cap_inode_getsecurity(struct user_namespace *mnt_userns, +int cap_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { @@ -398,7 +398,7 @@ int cap_inode_getsecurity(struct user_namespace *mnt_userns, dentry = d_find_any_alias(inode); if (!dentry) return -EINVAL; - size = vfs_getxattr_alloc(mnt_userns, dentry, XATTR_NAME_CAPS, &tmpbuf, + size = vfs_getxattr_alloc(idmap, dentry, XATTR_NAME_CAPS, &tmpbuf, sizeof(struct vfs_ns_cap_data), GFP_NOFS); dput(dentry); /* gcc11 complains if we don't check for !tmpbuf */ @@ -420,7 +420,7 @@ int cap_inode_getsecurity(struct user_namespace *mnt_userns, kroot = make_kuid(fs_ns, root); /* If this is an idmapped mount shift the kuid. */ - vfsroot = make_vfsuid(mnt_userns, fs_ns, kroot); + vfsroot = make_vfsuid(idmap, fs_ns, kroot); /* If the root kuid maps to a valid uid in current ns, then return * this as a nscap. */ @@ -510,7 +510,7 @@ static bool validheader(size_t size, const struct vfs_cap_data *cap) /** * cap_convert_nscap - check vfs caps * - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: used to retrieve inode to check permissions on * @ivalue: vfs caps value which may be modified by this function * @size: size of @ivalue @@ -518,15 +518,15 @@ static bool validheader(size_t size, const struct vfs_cap_data *cap) * User requested a write of security.capability. If needed, update the * xattr to change from v2 to v3, or to fixup the v3 rootid. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. 
This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply passs @nop_mnt_idmap. * * Return: On success, return the new size; on error, return < 0. */ -int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry, +int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry, const void **ivalue, size_t size) { struct vfs_ns_cap_data *nscap; @@ -544,9 +544,9 @@ int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry, return -EINVAL; if (!validheader(size, cap)) return -EINVAL; - if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP)) + if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP)) return -EPERM; - if (size == XATTR_CAPS_SZ_2 && (mnt_userns == fs_ns)) + if (size == XATTR_CAPS_SZ_2 && (idmap == &nop_mnt_idmap)) if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP)) /* user is privileged, just write the v2 */ return size; @@ -555,7 +555,7 @@ int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry, if (!vfsuid_valid(vfsrootid)) return -EINVAL; - rootid = from_vfsuid(mnt_userns, fs_ns, vfsrootid); + rootid = from_vfsuid(idmap, fs_ns, vfsrootid); if (!uid_valid(rootid)) return -EINVAL; @@ -626,19 +626,19 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, /** * get_vfs_caps_from_disk - retrieve vfs caps from disk * - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: dentry from which @inode is retrieved * @cpu_caps: vfs capabilities * * Extract the on-exec-apply capability sets for an executable file. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. 
This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply passs @nop_mnt_idmap. */ -int get_vfs_caps_from_disk(struct user_namespace *mnt_userns, +int get_vfs_caps_from_disk(struct mnt_idmap *idmap, const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps) { @@ -695,7 +695,7 @@ int get_vfs_caps_from_disk(struct user_namespace *mnt_userns, return -EINVAL; } - rootvfsuid = make_vfsuid(mnt_userns, fs_ns, rootkuid); + rootvfsuid = make_vfsuid(idmap, fs_ns, rootkuid); if (!vfsuid_valid(rootvfsuid)) return -ENODATA; @@ -747,7 +747,7 @@ static int get_file_caps(struct linux_binprm *bprm, struct file *file, if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns)) return 0; - rc = get_vfs_caps_from_disk(file_mnt_user_ns(file), + rc = get_vfs_caps_from_disk(file_mnt_idmap(file), file->f_path.dentry, &vcaps); if (rc < 0) { if (rc == -EINVAL) @@ -1016,23 +1016,23 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name, /** * cap_inode_removexattr - Determine whether an xattr may be removed * - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: The inode/dentry being altered * @name: The name of the xattr to be changed * * Determine whether an xattr may be removed from an inode, returning 0 if * permission is granted, -ve if denied. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. 
This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. * * This is used to make sure security xattrs don't get removed by those who * aren't privileged to remove them. */ -int cap_inode_removexattr(struct user_namespace *mnt_userns, +int cap_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct user_namespace *user_ns = dentry->d_sb->s_user_ns; @@ -1047,7 +1047,7 @@ int cap_inode_removexattr(struct user_namespace *mnt_userns, struct inode *inode = d_backing_inode(dentry); if (!inode) return -EINVAL; - if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP)) + if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP)) return -EPERM; return 0; } diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c index fa5ff13fa8c9..52b811da6989 100644 --- a/security/integrity/evm/evm_crypto.c +++ b/security/integrity/evm/evm_crypto.c @@ -265,7 +265,7 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry, req_xattr_value_len); continue; } - size = vfs_getxattr_alloc(&init_user_ns, dentry, xattr->name, + size = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, xattr->name, &xattr_value, xattr_size, GFP_NOFS); if (size == -ENOMEM) { error = -ENOMEM; @@ -274,7 +274,7 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry, if (size < 0) continue; - user_space_size = vfs_getxattr(&init_user_ns, dentry, + user_space_size = vfs_getxattr(&nop_mnt_idmap, dentry, xattr->name, NULL, 0); if (user_space_size != size) pr_debug("file %s: xattr %s size mismatch (kernel: %d, user: %d)\n", 
@@ -331,7 +331,7 @@ static int evm_is_immutable(struct dentry *dentry, struct inode *inode) return 1; /* Do this the hard way */ - rc = vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_EVM, + rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0) { if (rc == -ENODATA) @@ -376,12 +376,12 @@ int evm_update_evmxattr(struct dentry *dentry, const char *xattr_name, xattr_value_len, &data); if (rc == 0) { data.hdr.xattr.sha1.type = EVM_XATTR_HMAC; - rc = __vfs_setxattr_noperm(&init_user_ns, dentry, + rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_EVM, &data.hdr.xattr.data[1], SHA1_DIGEST_SIZE + 1, 0); } else if (rc == -ENODATA && (inode->i_opflags & IOP_XATTR)) { - rc = __vfs_removexattr(&init_user_ns, dentry, XATTR_NAME_EVM); + rc = __vfs_removexattr(&nop_mnt_idmap, dentry, XATTR_NAME_EVM); } return rc; } diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index f02e609460e2..cf24c5255583 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -184,7 +184,7 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry, /* if status is not PASS, try to check again - against -ENOMEM */ /* first need to know the sig type */ - rc = vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_EVM, + rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0) { evm_status = INTEGRITY_FAIL; @@ -436,7 +436,7 @@ static enum integrity_status evm_verify_current_integrity(struct dentry *dentry) /* * evm_xattr_change - check if passed xattr value differs from current value - * @mnt_userns: user namespace of the idmapped mount + * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: requested xattr * @xattr_value: requested xattr value @@ -446,14 +446,14 @@ static enum integrity_status evm_verify_current_integrity(struct dentry *dentry) * * Returns 1 if passed 
xattr value differs from current value, 0 otherwise. */ -static int evm_xattr_change(struct user_namespace *mnt_userns, +static int evm_xattr_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { char *xattr_data = NULL; int rc = 0; - rc = vfs_getxattr_alloc(&init_user_ns, dentry, xattr_name, &xattr_data, + rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, xattr_name, &xattr_data, 0, GFP_NOFS); if (rc < 0) { rc = 1; @@ -482,7 +482,7 @@ out: * For posix xattr acls only, permit security.evm, even if it currently * doesn't exist, to be updated unless the EVM signature is immutable. */ -static int evm_protect_xattr(struct user_namespace *mnt_userns, +static int evm_protect_xattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { @@ -538,7 +538,7 @@ out: return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && - !evm_xattr_change(mnt_userns, dentry, xattr_name, xattr_value, + !evm_xattr_change(idmap, dentry, xattr_name, xattr_value, xattr_value_len)) return 0; @@ -553,7 +553,7 @@ out: /** * evm_inode_setxattr - protect the EVM extended attribute - * @mnt_userns: user namespace of the idmapped mount + * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * @xattr_value: pointer to the new extended attribute value @@ -565,7 +565,7 @@ out: * userspace from writing HMAC value. Writing 'security.evm' requires * requires CAP_SYS_ADMIN privileges. 
*/ -int evm_inode_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int evm_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { @@ -584,20 +584,20 @@ int evm_inode_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, xattr_data->type != EVM_XATTR_PORTABLE_DIGSIG) return -EPERM; } - return evm_protect_xattr(mnt_userns, dentry, xattr_name, xattr_value, + return evm_protect_xattr(idmap, dentry, xattr_name, xattr_value, xattr_value_len); } /** * evm_inode_removexattr - protect the EVM extended attribute - * @mnt_userns: user namespace of the idmapped mount + * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * * Removing 'security.evm' requires CAP_SYS_ADMIN privileges and that * the current value is valid. */ -int evm_inode_removexattr(struct user_namespace *mnt_userns, +int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { /* Policy permits modification of the protected xattrs even though @@ -606,11 +606,11 @@ int evm_inode_removexattr(struct user_namespace *mnt_userns, if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; - return evm_protect_xattr(mnt_userns, dentry, xattr_name, NULL, 0); + return evm_protect_xattr(idmap, dentry, xattr_name, NULL, 0); } #ifdef CONFIG_FS_POSIX_ACL -static int evm_inode_set_acl_change(struct user_namespace *mnt_userns, +static int evm_inode_set_acl_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *kacl) { @@ -622,14 +622,14 @@ static int evm_inode_set_acl_change(struct user_namespace *mnt_userns, if (!kacl) return 1; - rc = posix_acl_update_mode(mnt_userns, inode, &mode, &kacl); + rc = posix_acl_update_mode(idmap, inode, &mode, &kacl); if (rc || (inode->i_mode != mode)) return 1; return 0; } #else -static inline int 
evm_inode_set_acl_change(struct user_namespace *mnt_userns, +static inline int evm_inode_set_acl_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *kacl) @@ -640,7 +640,7 @@ static inline int evm_inode_set_acl_change(struct user_namespace *mnt_userns, /** * evm_inode_set_acl - protect the EVM extended attribute from posix acls - * @mnt_userns: user namespace of the idmapped mount + * @idmap: idmap of the idmapped mount * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * @kacl: pointer to the posix acls @@ -649,7 +649,7 @@ static inline int evm_inode_set_acl_change(struct user_namespace *mnt_userns, * and 'security.evm' xattr updated, unless the existing 'security.evm' is * valid. */ -int evm_inode_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int evm_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { enum integrity_status evm_status; @@ -678,7 +678,7 @@ int evm_inode_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && - !evm_inode_set_acl_change(mnt_userns, dentry, acl_name, kacl)) + !evm_inode_set_acl_change(idmap, dentry, acl_name, kacl)) return 0; if (evm_status != INTEGRITY_PASS_IMMUTABLE) @@ -779,14 +779,14 @@ void evm_inode_post_removexattr(struct dentry *dentry, const char *xattr_name) evm_update_evmxattr(dentry, xattr_name, NULL, 0); } -static int evm_attr_change(struct user_namespace *mnt_userns, +static int evm_attr_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_backing_inode(dentry); unsigned int ia_valid = attr->ia_valid; - if (!i_uid_needs_update(mnt_userns, attr, inode) && - !i_gid_needs_update(mnt_userns, attr, inode) && + if (!i_uid_needs_update(idmap, attr, inode) && + !i_gid_needs_update(idmap, attr, inode) && (!(ia_valid & ATTR_MODE) || attr->ia_mode == inode->i_mode)) return 0; 
@@ -800,7 +800,7 @@ static int evm_attr_change(struct user_namespace *mnt_userns, * Permit update of file attributes when files have a valid EVM signature, * except in the case of them having an immutable portable signature. */ -int evm_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int evm_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; @@ -827,7 +827,7 @@ int evm_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && - !evm_attr_change(mnt_userns, dentry, attr)) + !evm_attr_change(idmap, dentry, attr)) return 0; integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry), diff --git a/security/integrity/evm/evm_secfs.c b/security/integrity/evm/evm_secfs.c index 8a9db7dfca7e..9b907c2fee60 100644 --- a/security/integrity/evm/evm_secfs.c +++ b/security/integrity/evm/evm_secfs.c @@ -228,7 +228,7 @@ static ssize_t evm_write_xattrs(struct file *file, const char __user *buf, newattrs.ia_valid = ATTR_MODE; inode = evm_xattrs->d_inode; inode_lock(inode); - err = simple_setattr(&init_user_ns, evm_xattrs, &newattrs); + err = simple_setattr(&nop_mnt_idmap, evm_xattrs, &newattrs); inode_unlock(inode); if (!err) err = count; diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 03b440921e61..d8530e722515 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -254,7 +254,7 @@ static inline void ima_process_queued_keys(void) {} #endif /* CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS */ /* LIM API function definitions */ -int ima_get_action(struct user_namespace *mnt_userns, struct inode *inode, +int ima_get_action(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, u32 secid, int mask, enum ima_hooks func, int *pcr, struct ima_template_desc **template_desc, @@ -268,7 +268,7 @@ void ima_store_measurement(struct integrity_iint_cache *iint, 
struct file *file, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, int pcr, struct ima_template_desc *template_desc); -int process_buffer_measurement(struct user_namespace *mnt_userns, +int process_buffer_measurement(struct mnt_idmap *idmap, struct inode *inode, const void *buf, int size, const char *eventname, enum ima_hooks func, int pcr, const char *func_data, @@ -285,7 +285,7 @@ void ima_free_template_entry(struct ima_template_entry *entry); const char *ima_d_path(const struct path *path, char **pathbuf, char *filename); /* IMA policy related functions */ -int ima_match_policy(struct user_namespace *mnt_userns, struct inode *inode, +int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, u32 secid, enum ima_hooks func, int mask, int flags, int *pcr, struct ima_template_desc **template_desc, @@ -318,7 +318,7 @@ int ima_appraise_measurement(enum ima_hooks func, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig); -int ima_must_appraise(struct user_namespace *mnt_userns, struct inode *inode, +int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func); void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file); enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint, @@ -346,7 +346,7 @@ static inline int ima_appraise_measurement(enum ima_hooks func, return INTEGRITY_UNKNOWN; } -static inline int ima_must_appraise(struct user_namespace *mnt_userns, +static inline int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index c1e76282b5ee..9345fd66f5b8 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -163,7 +163,7 @@ err_out: /** * ima_get_action - appraise & measure decision 
based on policy. - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: pointer to the inode associated with the object being validated * @cred: pointer to credentials structure to validate * @secid: secid of the task being validated @@ -186,7 +186,7 @@ err_out: * Returns IMA_MEASURE, IMA_APPRAISE mask. * */ -int ima_get_action(struct user_namespace *mnt_userns, struct inode *inode, +int ima_get_action(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, u32 secid, int mask, enum ima_hooks func, int *pcr, struct ima_template_desc **template_desc, @@ -196,7 +196,7 @@ int ima_get_action(struct user_namespace *mnt_userns, struct inode *inode, flags &= ima_policy_flag; - return ima_match_policy(mnt_userns, inode, cred, secid, func, mask, + return ima_match_policy(idmap, inode, cred, secid, func, mask, flags, pcr, template_desc, func_data, allowed_algos); } diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index ee6f7e237f2e..555342d337f9 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -70,7 +70,7 @@ bool is_ima_appraise_enabled(void) * * Return 1 to appraise or hash */ -int ima_must_appraise(struct user_namespace *mnt_userns, struct inode *inode, +int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { u32 secid; @@ -79,7 +79,7 @@ int ima_must_appraise(struct user_namespace *mnt_userns, struct inode *inode, return 0; security_current_getsecid_subj(&secid); - return ima_match_policy(mnt_userns, inode, current_cred(), secid, + return ima_match_policy(idmap, inode, current_cred(), secid, func, mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL, NULL); } @@ -98,7 +98,7 @@ static int ima_fix_xattr(struct dentry *dentry, iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG; iint->ima_hash->xattr.ng.algo = algo; } - rc = 
__vfs_setxattr_noperm(&init_user_ns, dentry, XATTR_NAME_IMA, + rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, &iint->ima_hash->xattr.data[offset], (sizeof(iint->ima_hash->xattr) - offset) + iint->ima_hash->length, 0); @@ -225,7 +225,7 @@ int ima_read_xattr(struct dentry *dentry, { int ret; - ret = vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_IMA, + ret = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, (char **)xattr_value, xattr_len, GFP_NOFS); if (ret == -EOPNOTSUPP) ret = 0; @@ -456,7 +456,7 @@ int ima_check_blacklist(struct integrity_iint_cache *iint, rc = is_binary_blacklisted(digest, digestsize); if ((rc == -EPERM) && (iint->flags & IMA_MEASURE)) - process_buffer_measurement(&init_user_ns, NULL, digest, digestsize, + process_buffer_measurement(&nop_mnt_idmap, NULL, digest, digestsize, "blacklisted-hash", NONE, pcr, NULL, false, NULL, 0); } @@ -622,7 +622,7 @@ void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file) /** * ima_inode_post_setattr - reflect file metadata changes - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: pointer to the affected dentry * * Changes to a dentry's metadata might result in needing to appraise. @@ -630,7 +630,7 @@ void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file) * This function is called from notify_change(), which expects the caller * to lock the inode's i_mutex. 
*/ -void ima_inode_post_setattr(struct user_namespace *mnt_userns, +void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); @@ -641,7 +641,7 @@ void ima_inode_post_setattr(struct user_namespace *mnt_userns, || !(inode->i_opflags & IOP_XATTR)) return; - action = ima_must_appraise(mnt_userns, inode, MAY_ACCESS, POST_SETATTR); + action = ima_must_appraise(idmap, inode, MAY_ACCESS, POST_SETATTR); iint = integrity_iint_find(inode); if (iint) { set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags); @@ -774,7 +774,7 @@ int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name, return result; } -int ima_inode_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { if (evm_revalidate_status(acl_name)) diff --git a/security/integrity/ima/ima_asymmetric_keys.c b/security/integrity/ima/ima_asymmetric_keys.c index f6aa0b47a772..caacfe6860b1 100644 --- a/security/integrity/ima/ima_asymmetric_keys.c +++ b/security/integrity/ima/ima_asymmetric_keys.c @@ -60,7 +60,7 @@ void ima_post_key_create_or_update(struct key *keyring, struct key *key, * if the IMA policy is configured to measure a key linked * to the given keyring. */ - process_buffer_measurement(&init_user_ns, NULL, payload, payload_len, + process_buffer_measurement(&nop_mnt_idmap, NULL, payload, payload_len, keyring->description, KEY_CHECK, 0, keyring->description, false, NULL, 0); } diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 377300973e6c..358578267fea 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -224,7 +224,7 @@ static int process_measurement(struct file *file, const struct cred *cred, * bitmask based on the appraise/audit/measurement policy. * Included is the appraise submask. 
*/ - action = ima_get_action(file_mnt_user_ns(file), inode, cred, secid, + action = ima_get_action(file_mnt_idmap(file), inode, cred, secid, mask, func, &pcr, &template_desc, NULL, &allowed_algos); violation_check = ((func == FILE_CHECK || func == MMAP_CHECK) && @@ -451,7 +451,7 @@ int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot) security_current_getsecid_subj(&secid); inode = file_inode(vma->vm_file); - action = ima_get_action(file_mnt_user_ns(vma->vm_file), inode, + action = ima_get_action(file_mnt_idmap(vma->vm_file), inode, current_cred(), secid, MAY_EXEC, MMAP_CHECK, &pcr, &template, NULL, NULL); @@ -638,14 +638,14 @@ EXPORT_SYMBOL_GPL(ima_inode_hash); /** * ima_post_create_tmpfile - mark newly created tmpfile as new - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode of the newly created tmpfile * * No measuring, appraising or auditing of newly created tmpfiles is needed. * Skip calling process_measurement(), but indicate which newly, created * tmpfiles are in policy. */ -void ima_post_create_tmpfile(struct user_namespace *mnt_userns, +void ima_post_create_tmpfile(struct mnt_idmap *idmap, struct inode *inode) { struct integrity_iint_cache *iint; @@ -654,7 +654,7 @@ void ima_post_create_tmpfile(struct user_namespace *mnt_userns, if (!ima_policy_flag || !S_ISREG(inode->i_mode)) return; - must_appraise = ima_must_appraise(mnt_userns, inode, MAY_ACCESS, + must_appraise = ima_must_appraise(idmap, inode, MAY_ACCESS, FILE_CHECK); if (!must_appraise) return; @@ -671,13 +671,13 @@ void ima_post_create_tmpfile(struct user_namespace *mnt_userns, /** * ima_post_path_mknod - mark as a new inode - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: newly created dentry * * Mark files created via the mknodat syscall as new, so that the * file data can be written later. 
*/ -void ima_post_path_mknod(struct user_namespace *mnt_userns, +void ima_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry) { struct integrity_iint_cache *iint; @@ -687,7 +687,7 @@ void ima_post_path_mknod(struct user_namespace *mnt_userns, if (!ima_policy_flag || !S_ISREG(inode->i_mode)) return; - must_appraise = ima_must_appraise(mnt_userns, inode, MAY_ACCESS, + must_appraise = ima_must_appraise(idmap, inode, MAY_ACCESS, FILE_CHECK); if (!must_appraise) return; @@ -869,7 +869,7 @@ int ima_post_load_data(char *buf, loff_t size, /** * process_buffer_measurement - Measure the buffer or the buffer data hash - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode associated with the object being measured (NULL for KEY_CHECK) * @buf: pointer to the buffer that needs to be added to the log. * @size: size of buffer(in bytes). @@ -887,7 +887,7 @@ int ima_post_load_data(char *buf, loff_t size, * has been written to the passed location but not added to a measurement entry, * a negative value otherwise. 
*/ -int process_buffer_measurement(struct user_namespace *mnt_userns, +int process_buffer_measurement(struct mnt_idmap *idmap, struct inode *inode, const void *buf, int size, const char *eventname, enum ima_hooks func, int pcr, const char *func_data, @@ -931,7 +931,7 @@ int process_buffer_measurement(struct user_namespace *mnt_userns, */ if (func) { security_current_getsecid_subj(&secid); - action = ima_get_action(mnt_userns, inode, current_cred(), + action = ima_get_action(idmap, inode, current_cred(), secid, 0, func, &pcr, &template, func_data, NULL); if (!(action & IMA_MEASURE) && !digest) @@ -1011,7 +1011,7 @@ void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) if (!f.file) return; - process_buffer_measurement(file_mnt_user_ns(f.file), file_inode(f.file), + process_buffer_measurement(file_mnt_idmap(f.file), file_inode(f.file), buf, size, "kexec-cmdline", KEXEC_CMDLINE, 0, NULL, false, NULL, 0); fdput(f); @@ -1044,7 +1044,7 @@ int ima_measure_critical_data(const char *event_label, if (!event_name || !event_label || !buf || !buf_len) return -ENOPARAM; - return process_buffer_measurement(&init_user_ns, NULL, buf, buf_len, + return process_buffer_measurement(&nop_mnt_idmap, NULL, buf, buf_len, event_name, CRITICAL_DATA, 0, event_label, hash, digest, digest_len); diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 6a68ec270822..fc128a6b4abe 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -552,7 +552,7 @@ static bool ima_match_rule_data(struct ima_rule_entry *rule, /** * ima_match_rules - determine whether an inode matches the policy rule. 
* @rule: a pointer to a rule - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: a pointer to an inode * @cred: a pointer to a credentials structure for user validation * @secid: the secid of the task to be validated @@ -563,7 +563,7 @@ static bool ima_match_rule_data(struct ima_rule_entry *rule, * Returns true on rule match, false on failure. */ static bool ima_match_rules(struct ima_rule_entry *rule, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, u32 secid, enum ima_hooks func, int mask, const char *func_data) @@ -624,11 +624,11 @@ static bool ima_match_rules(struct ima_rule_entry *rule, return false; } if ((rule->flags & IMA_FOWNER) && - !rule->fowner_op(i_uid_into_vfsuid(mnt_userns, inode), + !rule->fowner_op(i_uid_into_vfsuid(idmap, inode), rule->fowner)) return false; if ((rule->flags & IMA_FGROUP) && - !rule->fgroup_op(i_gid_into_vfsgid(mnt_userns, inode), + !rule->fgroup_op(i_gid_into_vfsgid(idmap, inode), rule->fgroup)) return false; for (i = 0; i < MAX_LSM_RULES; i++) { @@ -713,7 +713,7 @@ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func) /** * ima_match_policy - decision based on LSM and other conditions - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: pointer to an inode for which the policy decision is being made * @cred: pointer to a credentials structure for which the policy decision is * being made @@ -732,7 +732,7 @@ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func) * list when walking it. Reads are many orders of magnitude more numerous * than writes so ima_match_policy() is classical RCU candidate. 
*/ -int ima_match_policy(struct user_namespace *mnt_userns, struct inode *inode, +int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, u32 secid, enum ima_hooks func, int mask, int flags, int *pcr, struct ima_template_desc **template_desc, @@ -752,7 +752,7 @@ int ima_match_policy(struct user_namespace *mnt_userns, struct inode *inode, if (!(entry->action & actmask)) continue; - if (!ima_match_rules(entry, mnt_userns, inode, cred, secid, + if (!ima_match_rules(entry, idmap, inode, cred, secid, func, mask, func_data)) continue; diff --git a/security/integrity/ima/ima_queue_keys.c b/security/integrity/ima/ima_queue_keys.c index 93056c03bf5a..4f0aea155bf9 100644 --- a/security/integrity/ima/ima_queue_keys.c +++ b/security/integrity/ima/ima_queue_keys.c @@ -159,7 +159,7 @@ void ima_process_queued_keys(void) list_for_each_entry_safe(entry, tmp, &ima_keys, list) { if (!timer_expired) - process_buffer_measurement(&init_user_ns, NULL, + process_buffer_measurement(&nop_mnt_idmap, NULL, entry->payload, entry->payload_len, entry->keyring_name, diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c index 4564faae7d67..6cd0add524cd 100644 --- a/security/integrity/ima/ima_template_lib.c +++ b/security/integrity/ima/ima_template_lib.c @@ -598,7 +598,7 @@ int ima_eventevmsig_init(struct ima_event_data *event_data, if (!event_data->file) return 0; - rc = vfs_getxattr_alloc(&init_user_ns, file_dentry(event_data->file), + rc = vfs_getxattr_alloc(&nop_mnt_idmap, file_dentry(event_data->file), XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0 || xattr_data->type != EVM_XATTR_PORTABLE_DIGSIG) { diff --git a/security/keys/key.c b/security/keys/key.c index c45afdd1dfbb..5c0c7df833f8 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -788,38 +788,18 @@ error: goto out; } -/** - * key_create_or_update - Update or create and instantiate a key. 
- * @keyring_ref: A pointer to the destination keyring with possession flag. - * @type: The type of key. - * @description: The searchable description for the key. - * @payload: The data to use to instantiate or update the key. - * @plen: The length of @payload. - * @perm: The permissions mask for a new key. - * @flags: The quota flags for a new key. - * - * Search the destination keyring for a key of the same description and if one - * is found, update it, otherwise create and instantiate a new one and create a - * link to it from that keyring. - * - * If perm is KEY_PERM_UNDEF then an appropriate key permissions mask will be - * concocted. - * - * Returns a pointer to the new key if successful, -ENODEV if the key type - * wasn't available, -ENOTDIR if the keyring wasn't a keyring, -EACCES if the - * caller isn't permitted to modify the keyring or the LSM did not permit - * creation of the key. - * - * On success, the possession flag from the keyring ref will be tacked on to - * the key ref before it is returned. +/* + * Create or potentially update a key. 
The combined logic behind + * key_create_or_update() and key_create() */ -key_ref_t key_create_or_update(key_ref_t keyring_ref, - const char *type, - const char *description, - const void *payload, - size_t plen, - key_perm_t perm, - unsigned long flags) +static key_ref_t __key_create_or_update(key_ref_t keyring_ref, + const char *type, + const char *description, + const void *payload, + size_t plen, + key_perm_t perm, + unsigned long flags, + bool allow_update) { struct keyring_index_key index_key = { .description = description, @@ -906,14 +886,23 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, goto error_link_end; } - /* if it's possible to update this type of key, search for an existing - * key of the same type and description in the destination keyring and - * update that instead if possible + /* if it's requested and possible to update this type of key, search + * for an existing key of the same type and description in the + * destination keyring and update that instead if possible */ - if (index_key.type->update) { + if (allow_update) { + if (index_key.type->update) { + key_ref = find_key_to_update(keyring_ref, &index_key); + if (key_ref) + goto found_matching_key; + } + } else { key_ref = find_key_to_update(keyring_ref, &index_key); - if (key_ref) - goto found_matching_key; + if (key_ref) { + key_ref_put(key_ref); + key_ref = ERR_PTR(-EEXIST); + goto error_link_end; + } } /* if the client doesn't provide, decide on the permissions we want */ @@ -985,9 +974,83 @@ error: goto error_free_prep; } + +/** + * key_create_or_update - Update or create and instantiate a key. + * @keyring_ref: A pointer to the destination keyring with possession flag. + * @type: The type of key. + * @description: The searchable description for the key. + * @payload: The data to use to instantiate or update the key. + * @plen: The length of @payload. + * @perm: The permissions mask for a new key. + * @flags: The quota flags for a new key. 
+ * + * Search the destination keyring for a key of the same description and if one + * is found, update it, otherwise create and instantiate a new one and create a + * link to it from that keyring. + * + * If perm is KEY_PERM_UNDEF then an appropriate key permissions mask will be + * concocted. + * + * Returns a pointer to the new key if successful, -ENODEV if the key type + * wasn't available, -ENOTDIR if the keyring wasn't a keyring, -EACCES if the + * caller isn't permitted to modify the keyring or the LSM did not permit + * creation of the key. + * + * On success, the possession flag from the keyring ref will be tacked on to + * the key ref before it is returned. + */ +key_ref_t key_create_or_update(key_ref_t keyring_ref, + const char *type, + const char *description, + const void *payload, + size_t plen, + key_perm_t perm, + unsigned long flags) +{ + return __key_create_or_update(keyring_ref, type, description, payload, + plen, perm, flags, true); +} EXPORT_SYMBOL(key_create_or_update); /** + * key_create - Create and instantiate a key. + * @keyring_ref: A pointer to the destination keyring with possession flag. + * @type: The type of key. + * @description: The searchable description for the key. + * @payload: The data to use to instantiate or update the key. + * @plen: The length of @payload. + * @perm: The permissions mask for a new key. + * @flags: The quota flags for a new key. + * + * Create and instantiate a new key and link to it from the destination keyring. + * + * If perm is KEY_PERM_UNDEF then an appropriate key permissions mask will be + * concocted. + * + * Returns a pointer to the new key if successful, -EEXIST if a key with the + * same description already exists, -ENODEV if the key type wasn't available, + * -ENOTDIR if the keyring wasn't a keyring, -EACCES if the caller isn't + * permitted to modify the keyring or the LSM did not permit creation of the + * key. 
+ * + * On success, the possession flag from the keyring ref will be tacked on to + * the key ref before it is returned. + */ +key_ref_t key_create(key_ref_t keyring_ref, + const char *type, + const char *description, + const void *payload, + size_t plen, + key_perm_t perm, + unsigned long flags) +{ + return __key_create_or_update(keyring_ref, type, description, payload, + plen, perm, flags, false); +} +EXPORT_SYMBOL(key_create); + +/** * key_update - Update a key's contents. * @key_ref: The pointer (plus possession flag) to the key. * @payload: The data to be used to update the key. diff --git a/security/security.c b/security/security.c index d1571900a8c7..4e1150c44ab7 100644 --- a/security/security.c +++ b/security/security.c @@ -1354,7 +1354,7 @@ int security_inode_permission(struct inode *inode, int mask) return call_int_hook(inode_permission, 0, inode, mask); } -int security_inode_setattr(struct user_namespace *mnt_userns, +int security_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int ret; @@ -1364,7 +1364,7 @@ int security_inode_setattr(struct user_namespace *mnt_userns, ret = call_int_hook(inode_setattr, 0, dentry, attr); if (ret) return ret; - return evm_inode_setattr(mnt_userns, dentry, attr); + return evm_inode_setattr(idmap, dentry, attr); } EXPORT_SYMBOL_GPL(security_inode_setattr); @@ -1375,7 +1375,7 @@ int security_inode_getattr(const struct path *path) return call_int_hook(inode_getattr, 0, path); } -int security_inode_setxattr(struct user_namespace *mnt_userns, +int security_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { @@ -1387,7 +1387,7 @@ int security_inode_setxattr(struct user_namespace *mnt_userns, * SELinux and Smack integrate the cap call, * so assume that all LSMs supplying this call do so. 
*/ - ret = call_int_hook(inode_setxattr, 1, mnt_userns, dentry, name, value, + ret = call_int_hook(inode_setxattr, 1, idmap, dentry, name, value, size, flags); if (ret == 1) @@ -1397,10 +1397,10 @@ int security_inode_setxattr(struct user_namespace *mnt_userns, ret = ima_inode_setxattr(dentry, name, value, size); if (ret) return ret; - return evm_inode_setxattr(mnt_userns, dentry, name, value, size); + return evm_inode_setxattr(idmap, dentry, name, value, size); } -int security_inode_set_acl(struct user_namespace *mnt_userns, +int security_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { @@ -1408,38 +1408,38 @@ int security_inode_set_acl(struct user_namespace *mnt_userns, if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; - ret = call_int_hook(inode_set_acl, 0, mnt_userns, dentry, acl_name, + ret = call_int_hook(inode_set_acl, 0, idmap, dentry, acl_name, kacl); if (ret) return ret; - ret = ima_inode_set_acl(mnt_userns, dentry, acl_name, kacl); + ret = ima_inode_set_acl(idmap, dentry, acl_name, kacl); if (ret) return ret; - return evm_inode_set_acl(mnt_userns, dentry, acl_name, kacl); + return evm_inode_set_acl(idmap, dentry, acl_name, kacl); } -int security_inode_get_acl(struct user_namespace *mnt_userns, +int security_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; - return call_int_hook(inode_get_acl, 0, mnt_userns, dentry, acl_name); + return call_int_hook(inode_get_acl, 0, idmap, dentry, acl_name); } -int security_inode_remove_acl(struct user_namespace *mnt_userns, +int security_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { int ret; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; - ret = call_int_hook(inode_remove_acl, 0, mnt_userns, dentry, acl_name); + ret = call_int_hook(inode_remove_acl, 0, idmap, dentry, acl_name); if (ret) return ret; 
- ret = ima_inode_remove_acl(mnt_userns, dentry, acl_name); + ret = ima_inode_remove_acl(idmap, dentry, acl_name); if (ret) return ret; - return evm_inode_remove_acl(mnt_userns, dentry, acl_name); + return evm_inode_remove_acl(idmap, dentry, acl_name); } void security_inode_post_setxattr(struct dentry *dentry, const char *name, @@ -1465,7 +1465,7 @@ int security_inode_listxattr(struct dentry *dentry) return call_int_hook(inode_listxattr, 0, dentry); } -int security_inode_removexattr(struct user_namespace *mnt_userns, +int security_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { int ret; @@ -1476,15 +1476,15 @@ int security_inode_removexattr(struct user_namespace *mnt_userns, * SELinux and Smack integrate the cap call, * so assume that all LSMs supplying this call do so. */ - ret = call_int_hook(inode_removexattr, 1, mnt_userns, dentry, name); + ret = call_int_hook(inode_removexattr, 1, idmap, dentry, name); if (ret == 1) - ret = cap_inode_removexattr(mnt_userns, dentry, name); + ret = cap_inode_removexattr(idmap, dentry, name); if (ret) return ret; ret = ima_inode_removexattr(dentry, name); if (ret) return ret; - return evm_inode_removexattr(mnt_userns, dentry, name); + return evm_inode_removexattr(idmap, dentry, name); } int security_inode_need_killpriv(struct dentry *dentry) @@ -1492,13 +1492,13 @@ int security_inode_need_killpriv(struct dentry *dentry) return call_int_hook(inode_need_killpriv, 0, dentry); } -int security_inode_killpriv(struct user_namespace *mnt_userns, +int security_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry) { - return call_int_hook(inode_killpriv, 0, mnt_userns, dentry); + return call_int_hook(inode_killpriv, 0, idmap, dentry); } -int security_inode_getsecurity(struct user_namespace *mnt_userns, +int security_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { @@ -1511,7 +1511,7 @@ int security_inode_getsecurity(struct 
user_namespace *mnt_userns, * Only one module will provide an attribute with a given name. */ hlist_for_each_entry(hp, &security_hook_heads.inode_getsecurity, list) { - rc = hp->hook.inode_getsecurity(mnt_userns, inode, name, buffer, alloc); + rc = hp->hook.inode_getsecurity(idmap, inode, name, buffer, alloc); if (rc != LSM_RET_DEFAULT(inode_getsecurity)) return rc; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 3c5be76a9199..9a5bdfc21314 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3145,7 +3145,7 @@ static bool has_cap_mac_admin(bool audit) return true; } -static int selinux_inode_setxattr(struct user_namespace *mnt_userns, +static int selinux_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { @@ -3167,13 +3167,13 @@ static int selinux_inode_setxattr(struct user_namespace *mnt_userns, } if (!selinux_initialized(&selinux_state)) - return (inode_owner_or_capable(mnt_userns, inode) ? 0 : -EPERM); + return (inode_owner_or_capable(idmap, inode) ? 
0 : -EPERM); sbsec = selinux_superblock(inode->i_sb); if (!(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; ad.type = LSM_AUDIT_DATA_DENTRY; @@ -3240,20 +3240,20 @@ static int selinux_inode_setxattr(struct user_namespace *mnt_userns, &ad); } -static int selinux_inode_set_acl(struct user_namespace *mnt_userns, +static int selinux_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); } -static int selinux_inode_get_acl(struct user_namespace *mnt_userns, +static int selinux_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return dentry_has_perm(current_cred(), dentry, FILE__GETATTR); } -static int selinux_inode_remove_acl(struct user_namespace *mnt_userns, +static int selinux_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); @@ -3313,11 +3313,11 @@ static int selinux_inode_listxattr(struct dentry *dentry) return dentry_has_perm(cred, dentry, FILE__GETATTR); } -static int selinux_inode_removexattr(struct user_namespace *mnt_userns, +static int selinux_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { if (strcmp(name, XATTR_NAME_SELINUX)) { - int rc = cap_inode_removexattr(mnt_userns, dentry, name); + int rc = cap_inode_removexattr(idmap, dentry, name); if (rc) return rc; @@ -3383,7 +3383,7 @@ static int selinux_path_notify(const struct path *path, u64 mask, * * Permission check is handled by selinux_inode_getxattr hook. 
*/ -static int selinux_inode_getsecurity(struct user_namespace *mnt_userns, +static int selinux_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { @@ -6588,14 +6588,14 @@ static int selinux_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen */ static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) { - return __vfs_setxattr_noperm(&init_user_ns, dentry, XATTR_NAME_SELINUX, + return __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_SELINUX, ctx, ctxlen, 0); } static int selinux_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) { int len = 0; - len = selinux_inode_getsecurity(&init_user_ns, inode, + len = selinux_inode_getsecurity(&nop_mnt_idmap, inode, XATTR_SELINUX_SUFFIX, ctx, true); if (len < 0) return len; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 9a82a15685d1..cfcbb748da25 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1207,7 +1207,7 @@ static int smack_inode_getattr(const struct path *path) /** * smack_inode_setxattr - Smack check for setting xattrs - * @mnt_userns: active user namespace + * @idmap: idmap of the mount * @dentry: the object * @name: name of the attribute * @value: value of the attribute @@ -1218,7 +1218,7 @@ static int smack_inode_getattr(const struct path *path) * * Returns 0 if access is permitted, an error code otherwise */ -static int smack_inode_setxattr(struct user_namespace *mnt_userns, +static int smack_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { @@ -1334,7 +1334,7 @@ static int smack_inode_getxattr(struct dentry *dentry, const char *name) /** * smack_inode_removexattr - Smack check on removexattr - * @mnt_userns: active user namespace + * @idmap: idmap of the mount * @dentry: the object * @name: name of the attribute * @@ -1342,7 +1342,7 @@ static int smack_inode_getxattr(struct 
dentry *dentry, const char *name) * * Returns 0 if access is permitted, an error code otherwise */ -static int smack_inode_removexattr(struct user_namespace *mnt_userns, +static int smack_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode_smack *isp; @@ -1358,7 +1358,7 @@ static int smack_inode_removexattr(struct user_namespace *mnt_userns, if (!smack_privileged(CAP_MAC_ADMIN)) rc = -EPERM; } else - rc = cap_inode_removexattr(mnt_userns, dentry, name); + rc = cap_inode_removexattr(idmap, dentry, name); if (rc != 0) return rc; @@ -1394,14 +1394,14 @@ static int smack_inode_removexattr(struct user_namespace *mnt_userns, /** * smack_inode_set_acl - Smack check for setting posix acls - * @mnt_userns: the userns attached to the mnt this request came from + * @idmap: idmap of the mnt this request came from * @dentry: the object * @acl_name: name of the posix acl * @kacl: the posix acls * * Returns 0 if access is permitted, an error code otherwise */ -static int smack_inode_set_acl(struct user_namespace *mnt_userns, +static int smack_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { @@ -1418,13 +1418,13 @@ static int smack_inode_set_acl(struct user_namespace *mnt_userns, /** * smack_inode_get_acl - Smack check for getting posix acls - * @mnt_userns: the userns attached to the mnt this request came from + * @idmap: idmap of the mnt this request came from * @dentry: the object * @acl_name: name of the posix acl * * Returns 0 if access is permitted, an error code otherwise */ -static int smack_inode_get_acl(struct user_namespace *mnt_userns, +static int smack_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { struct smk_audit_info ad; @@ -1440,13 +1440,13 @@ static int smack_inode_get_acl(struct user_namespace *mnt_userns, /** * smack_inode_remove_acl - Smack check for getting posix acls - * @mnt_userns: the userns attached to the mnt 
this request came from + * @idmap: idmap of the mnt this request came from * @dentry: the object * @acl_name: name of the posix acl * * Returns 0 if access is permitted, an error code otherwise */ -static int smack_inode_remove_acl(struct user_namespace *mnt_userns, +static int smack_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { struct smk_audit_info ad; @@ -1462,7 +1462,7 @@ static int smack_inode_remove_acl(struct user_namespace *mnt_userns, /** * smack_inode_getsecurity - get smack xattrs - * @mnt_userns: active user namespace + * @idmap: idmap of the mount * @inode: the object * @name: attribute name * @buffer: where to put the result @@ -1470,7 +1470,7 @@ static int smack_inode_remove_acl(struct user_namespace *mnt_userns, * * Returns the size of the attribute or an error code */ -static int smack_inode_getsecurity(struct user_namespace *mnt_userns, +static int smack_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { @@ -3507,7 +3507,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) */ if (isp->smk_flags & SMK_INODE_CHANGED) { isp->smk_flags &= ~SMK_INODE_CHANGED; - rc = __vfs_setxattr(&init_user_ns, dp, inode, + rc = __vfs_setxattr(&nop_mnt_idmap, dp, inode, XATTR_NAME_SMACKTRANSMUTE, TRANS_TRUE, TRANS_TRUE_SIZE, 0); @@ -4686,7 +4686,7 @@ static int smack_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) static int smack_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) { - return __vfs_setxattr_noperm(&init_user_ns, dentry, XATTR_NAME_SMACK, + return __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_SMACK, ctx, ctxlen, 0); } diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index 81025f50a542..f901504b5afc 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -541,16 +541,15 @@ static void *snd_dma_noncontig_alloc(struct snd_dma_buffer *dmab, size_t size) struct sg_table 
*sgt; void *p; +#ifdef CONFIG_SND_DMA_SGBUF + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return snd_dma_sg_fallback_alloc(dmab, size); +#endif sgt = dma_alloc_noncontiguous(dmab->dev.dev, size, dmab->dev.dir, DEFAULT_GFP, 0); #ifdef CONFIG_SND_DMA_SGBUF - if (!sgt && !get_dma_ops(dmab->dev.dev)) { - if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG) - dmab->dev.type = SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK; - else - dmab->dev.type = SNDRV_DMA_TYPE_DEV_SG_FALLBACK; + if (!sgt && !get_dma_ops(dmab->dev.dev)) return snd_dma_sg_fallback_alloc(dmab, size); - } #endif if (!sgt) return NULL; @@ -717,19 +716,38 @@ static const struct snd_malloc_ops snd_dma_sg_wc_ops = { /* Fallback SG-buffer allocations for x86 */ struct snd_dma_sg_fallback { + bool use_dma_alloc_coherent; size_t count; struct page **pages; + /* DMA address array; the first page contains #pages in ~PAGE_MASK */ + dma_addr_t *addrs; }; static void __snd_dma_sg_fallback_free(struct snd_dma_buffer *dmab, struct snd_dma_sg_fallback *sgbuf) { - bool wc = dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK; - size_t i; - - for (i = 0; i < sgbuf->count && sgbuf->pages[i]; i++) - do_free_pages(page_address(sgbuf->pages[i]), PAGE_SIZE, wc); + size_t i, size; + + if (sgbuf->pages && sgbuf->addrs) { + i = 0; + while (i < sgbuf->count) { + if (!sgbuf->pages[i] || !sgbuf->addrs[i]) + break; + size = sgbuf->addrs[i] & ~PAGE_MASK; + if (WARN_ON(!size)) + break; + if (sgbuf->use_dma_alloc_coherent) + dma_free_coherent(dmab->dev.dev, size << PAGE_SHIFT, + page_address(sgbuf->pages[i]), + sgbuf->addrs[i] & PAGE_MASK); + else + do_free_pages(page_address(sgbuf->pages[i]), + size << PAGE_SHIFT, false); + i += size; + } + } kvfree(sgbuf->pages); + kvfree(sgbuf->addrs); kfree(sgbuf); } @@ -738,24 +756,36 @@ static void *snd_dma_sg_fallback_alloc(struct snd_dma_buffer *dmab, size_t size) struct snd_dma_sg_fallback *sgbuf; struct page **pagep, *curp; size_t chunk, npages; + dma_addr_t *addrp; dma_addr_t addr; void *p; - bool wc = 
dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK; + + /* correct the type */ + if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_SG) + dmab->dev.type = SNDRV_DMA_TYPE_DEV_SG_FALLBACK; + else if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG) + dmab->dev.type = SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK; sgbuf = kzalloc(sizeof(*sgbuf), GFP_KERNEL); if (!sgbuf) return NULL; + sgbuf->use_dma_alloc_coherent = cpu_feature_enabled(X86_FEATURE_XENPV); size = PAGE_ALIGN(size); sgbuf->count = size >> PAGE_SHIFT; sgbuf->pages = kvcalloc(sgbuf->count, sizeof(*sgbuf->pages), GFP_KERNEL); - if (!sgbuf->pages) + sgbuf->addrs = kvcalloc(sgbuf->count, sizeof(*sgbuf->addrs), GFP_KERNEL); + if (!sgbuf->pages || !sgbuf->addrs) goto error; pagep = sgbuf->pages; - chunk = size; + addrp = sgbuf->addrs; + chunk = (PAGE_SIZE - 1) << PAGE_SHIFT; /* to fit in low bits in addrs */ while (size > 0) { chunk = min(size, chunk); - p = do_alloc_pages(dmab->dev.dev, chunk, &addr, wc); + if (sgbuf->use_dma_alloc_coherent) + p = dma_alloc_coherent(dmab->dev.dev, chunk, &addr, DEFAULT_GFP); + else + p = do_alloc_pages(dmab->dev.dev, chunk, &addr, false); if (!p) { if (chunk <= PAGE_SIZE) goto error; @@ -767,17 +797,25 @@ static void *snd_dma_sg_fallback_alloc(struct snd_dma_buffer *dmab, size_t size) size -= chunk; /* fill pages */ npages = chunk >> PAGE_SHIFT; + *addrp = npages; /* store in lower bits */ curp = virt_to_page(p); - while (npages--) + while (npages--) { *pagep++ = curp++; + *addrp++ |= addr; + addr += PAGE_SIZE; + } } p = vmap(sgbuf->pages, sgbuf->count, VM_MAP, PAGE_KERNEL); if (!p) goto error; + + if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK) + set_pages_array_wc(sgbuf->pages, sgbuf->count); + dmab->private_data = sgbuf; /* store the first page address for convenience */ - dmab->addr = snd_sgbuf_get_addr(dmab, 0); + dmab->addr = sgbuf->addrs[0] & PAGE_MASK; return p; error: @@ -787,10 +825,23 @@ static void *snd_dma_sg_fallback_alloc(struct snd_dma_buffer *dmab, size_t size) static void 
snd_dma_sg_fallback_free(struct snd_dma_buffer *dmab) { + struct snd_dma_sg_fallback *sgbuf = dmab->private_data; + + if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK) + set_pages_array_wb(sgbuf->pages, sgbuf->count); vunmap(dmab->area); __snd_dma_sg_fallback_free(dmab, dmab->private_data); } +static dma_addr_t snd_dma_sg_fallback_get_addr(struct snd_dma_buffer *dmab, + size_t offset) +{ + struct snd_dma_sg_fallback *sgbuf = dmab->private_data; + size_t index = offset >> PAGE_SHIFT; + + return (sgbuf->addrs[index] & PAGE_MASK) | (offset & ~PAGE_MASK); +} + static int snd_dma_sg_fallback_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { @@ -805,8 +856,8 @@ static const struct snd_malloc_ops snd_dma_sg_fallback_ops = { .alloc = snd_dma_sg_fallback_alloc, .free = snd_dma_sg_fallback_free, .mmap = snd_dma_sg_fallback_mmap, + .get_addr = snd_dma_sg_fallback_get_addr, /* reuse vmalloc helpers */ - .get_addr = snd_dma_vmalloc_get_addr, .get_page = snd_dma_vmalloc_get_page, .get_chunk_size = snd_dma_vmalloc_get_chunk_size, }; diff --git a/sound/firewire/motu/motu-hwdep.c b/sound/firewire/motu/motu-hwdep.c index a900fc0e7644..88d1f4b56e4b 100644 --- a/sound/firewire/motu/motu-hwdep.c +++ b/sound/firewire/motu/motu-hwdep.c @@ -87,6 +87,10 @@ static long hwdep_read(struct snd_hwdep *hwdep, char __user *buf, long count, return -EFAULT; count = consumed; + } else { + spin_unlock_irq(&motu->lock); + + count = 0; } return count; diff --git a/sound/pci/hda/hda_bind.c b/sound/pci/hda/hda_bind.c index 1a868dd9dc4b..890c2f7c33fc 100644 --- a/sound/pci/hda/hda_bind.c +++ b/sound/pci/hda/hda_bind.c @@ -144,6 +144,7 @@ static int hda_codec_driver_probe(struct device *dev) error: snd_hda_codec_cleanup_for_unbind(codec); + codec->preset = NULL; return err; } @@ -166,6 +167,7 @@ static int hda_codec_driver_remove(struct device *dev) if (codec->patch_ops.free) codec->patch_ops.free(codec); snd_hda_codec_cleanup_for_unbind(codec); + codec->preset = NULL; 
module_put(dev->driver->owner); return 0; } diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index edd653ece70d..2e728aad6771 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -795,7 +795,6 @@ void snd_hda_codec_cleanup_for_unbind(struct hda_codec *codec) snd_array_free(&codec->cvt_setups); snd_array_free(&codec->spdif_out); snd_array_free(&codec->verbs); - codec->preset = NULL; codec->follower_dig_outs = NULL; codec->spdif_status_reset = 0; snd_array_free(&codec->mixers); @@ -928,7 +927,6 @@ snd_hda_codec_device_init(struct hda_bus *bus, unsigned int codec_addr, codec->depop_delay = -1; codec->fixup_id = HDA_FIXUP_ID_NOT_SET; codec->core.dev.release = snd_hda_codec_dev_release; - codec->core.exec_verb = codec_exec_verb; codec->core.type = HDA_DEV_LEGACY; mutex_init(&codec->spdif_mutex); @@ -999,6 +997,7 @@ int snd_hda_codec_device_new(struct hda_bus *bus, struct snd_card *card, if (snd_BUG_ON(codec_addr > HDA_MAX_CODEC_ADDRESS)) return -EINVAL; + codec->core.exec_verb = codec_exec_verb; codec->card = card; codec->addr = codec_addr; diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index 7b1a30a551f6..75e1d00074b9 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -1125,6 +1125,7 @@ static const struct hda_device_id snd_hda_id_conexant[] = { HDA_CODEC_ENTRY(0x14f11f87, "SN6140", patch_conexant_auto), HDA_CODEC_ENTRY(0x14f12008, "CX8200", patch_conexant_auto), HDA_CODEC_ENTRY(0x14f120d0, "CX11970", patch_conexant_auto), + HDA_CODEC_ENTRY(0x14f120d1, "SN6180", patch_conexant_auto), HDA_CODEC_ENTRY(0x14f15045, "CX20549 (Venice)", patch_conexant_auto), HDA_CODEC_ENTRY(0x14f15047, "CX20551 (Waikiki)", patch_conexant_auto), HDA_CODEC_ENTRY(0x14f15051, "CX20561 (Hermosa)", patch_conexant_auto), diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 6fab7c8fc19a..e103bb3693c0 100644 --- a/sound/pci/hda/patch_realtek.c +++ 
b/sound/pci/hda/patch_realtek.c @@ -832,7 +832,7 @@ do_sku: alc_setup_gpio(codec, 0x02); break; case 7: - alc_setup_gpio(codec, 0x03); + alc_setup_gpio(codec, 0x04); break; case 5: default: @@ -9202,6 +9202,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1025, 0x142b, "Acer Swift SF314-42", ALC255_FIXUP_ACER_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1025, 0x1430, "Acer TravelMate B311R-31", ALC256_FIXUP_ACER_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1025, 0x1466, "Acer Aspire A515-56", ALC255_FIXUP_ACER_HEADPHONE_AND_MIC), + SND_PCI_QUIRK(0x1025, 0x1534, "Acer Predator PH315-54", ALC255_FIXUP_ACER_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x0470, "Dell M101z", ALC269_FIXUP_DELL_M101Z), SND_PCI_QUIRK(0x1028, 0x053c, "Dell Latitude E5430", ALC292_FIXUP_DELL_E7X), SND_PCI_QUIRK(0x1028, 0x054b, "Dell XPS one 2710", ALC275_FIXUP_DELL_XPS), @@ -9422,6 +9423,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x89c3, "Zbook Studio G9", ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89c6, "Zbook Fury 17 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89ca, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x89d3, "HP EliteBook 645 G9 (MB 89D2)", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8a78, "HP Dev One", ALC285_FIXUP_HP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x103c, 0x8aa0, "HP ProBook 440 G9 (MB 8A9E)", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8aa3, "HP ProBook 450 G9 (MB 8AA1)", ALC236_FIXUP_HP_GPIO_LED), @@ -9430,8 +9432,21 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8abb, "HP ZBook Firefly 14 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ad1, "HP EliteBook 840 14 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ad2, "HP EliteBook 860 16 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b42, 
"HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b43, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b44, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b45, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b46, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b47, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8b5d, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8b5e, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8b7a, "HP", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b7d, "HP", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b87, "HP", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b8a, "HP", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b8b, "HP", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b8d, "HP", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b92, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8bf0, "HP", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300), @@ -9478,6 +9493,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1d4e, "ASUS TM420", ALC256_FIXUP_ASUS_HPE), SND_PCI_QUIRK(0x1043, 0x1e02, "ASUS UX3402", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1e11, "ASUS Zephyrus G15", ALC289_FIXUP_ASUS_GA502), + SND_PCI_QUIRK(0x1043, 0x1e12, "ASUS UM3402", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1e51, "ASUS Zephyrus M15", ALC294_FIXUP_ASUS_GU502_PINS), SND_PCI_QUIRK(0x1043, 0x1e5e, "ASUS ROG Strix G513", ALC294_FIXUP_ASUS_G513_PINS), SND_PCI_QUIRK(0x1043, 0x1e8e, "ASUS Zephyrus G15", ALC289_FIXUP_ASUS_GA401), @@ -9521,6 +9537,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x144d, 0xc812, "Samsung 
Notebook Pen S (NT950SBE-X58)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc830, "Samsung Galaxy Book Ion (NT950XCJ-X716A)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc832, "Samsung Galaxy Book Flex Alpha (NP730QCJ)", ALC256_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET), + SND_PCI_QUIRK(0x144d, 0xca03, "Samsung Galaxy Book2 Pro 360 (NP930QED)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x1458, 0xfa53, "Gigabyte BXBT-2807", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1462, 0xb120, "MSI Cubi MS-B120", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1462, 0xb171, "Cubi N 8GL (MS-B171)", ALC283_FIXUP_HEADSET_MIC), @@ -9699,6 +9716,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1b7d, 0xa831, "Ordissimo EVE2 ", ALC269VB_FIXUP_ORDISSIMO_EVE2), /* Also known as Malata PC-B1303 */ SND_PCI_QUIRK(0x1c06, 0x2013, "Lemote A1802", ALC269_FIXUP_LEMOTE_A1802), SND_PCI_QUIRK(0x1c06, 0x2015, "Lemote A190X", ALC269_FIXUP_LEMOTE_A190X), + SND_PCI_QUIRK(0x1c6c, 0x1251, "Positivo N14KP6-TG", ALC288_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1d05, 0x1132, "TongFang PHxTxX1", ALC256_FIXUP_SET_COEF_DEFAULTS), SND_PCI_QUIRK(0x1d05, 0x1096, "TongFang GMxMRxx", ALC269_FIXUP_NO_SHUTUP), SND_PCI_QUIRK(0x1d05, 0x1100, "TongFang GKxNRxx", ALC269_FIXUP_NO_SHUTUP), diff --git a/sound/pci/hda/patch_via.c b/sound/pci/hda/patch_via.c index aea7fae2ca4b..2994f85bc1b9 100644 --- a/sound/pci/hda/patch_via.c +++ b/sound/pci/hda/patch_via.c @@ -819,6 +819,9 @@ static int add_secret_dac_path(struct hda_codec *codec) return 0; nums = snd_hda_get_connections(codec, spec->gen.mixer_nid, conn, ARRAY_SIZE(conn) - 1); + if (nums < 0) + return nums; + for (i = 0; i < nums; i++) { if (get_wcaps_type(get_wcaps(codec, conn[i])) == AC_WID_AUD_OUT) return 0; diff --git a/sound/pci/lx6464es/lx_core.c b/sound/pci/lx6464es/lx_core.c index d3f58a3d17fb..b5b0d43bb8dc 100644 --- a/sound/pci/lx6464es/lx_core.c +++ b/sound/pci/lx6464es/lx_core.c @@ -493,12 +493,11 @@ int lx_buffer_ask(struct 
lx6464es *chip, u32 pipe, int is_capture, dev_dbg(chip->card->dev, "CMD_08_ASK_BUFFERS: needed %d, freed %d\n", *r_needed, *r_freed); - for (i = 0; i < MAX_STREAM_BUFFER; ++i) { - for (i = 0; i != chip->rmh.stat_len; ++i) - dev_dbg(chip->card->dev, - " stat[%d]: %x, %x\n", i, - chip->rmh.stat[i], - chip->rmh.stat[i] & MASK_DATA_SIZE); + for (i = 0; i < MAX_STREAM_BUFFER && i < chip->rmh.stat_len; + ++i) { + dev_dbg(chip->card->dev, " stat[%d]: %x, %x\n", i, + chip->rmh.stat[i], + chip->rmh.stat[i] & MASK_DATA_SIZE); } } diff --git a/sound/soc/amd/acp-es8336.c b/sound/soc/amd/acp-es8336.c index 2fe8df86053a..89499542c803 100644 --- a/sound/soc/amd/acp-es8336.c +++ b/sound/soc/amd/acp-es8336.c @@ -198,9 +198,11 @@ static int st_es8336_late_probe(struct snd_soc_card *card) int ret; adev = acpi_dev_get_first_match_dev("ESSX8336", NULL, -1); - if (adev) - put_device(&adev->dev); + if (!adev) + return -ENODEV; + codec_dev = acpi_get_first_physical_node(adev); + acpi_dev_put(adev); if (!codec_dev) dev_err(card->dev, "can not find codec dev\n"); diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index 0d283e41f66d..36314753923b 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -230,10 +230,31 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { { .driver_data = &acp6x_card, .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "TIMI"), + DMI_MATCH(DMI_PRODUCT_NAME, "Redmi Book Pro 15 2022"), + } + }, + { + .driver_data = &acp6x_card, + .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Razer"), DMI_MATCH(DMI_PRODUCT_NAME, "Blade 14 (2022) - RZ09-0427"), } }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "RB"), + DMI_MATCH(DMI_PRODUCT_NAME, "Swift SFA16-41"), + } + }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IRBIS"), + DMI_MATCH(DMI_PRODUCT_NAME, "15NBC1011"), + } + }, {} }; diff --git a/sound/soc/codecs/cs42l56.c b/sound/soc/codecs/cs42l56.c index 
26066682c983..3b0e715549c9 100644 --- a/sound/soc/codecs/cs42l56.c +++ b/sound/soc/codecs/cs42l56.c @@ -1191,18 +1191,12 @@ static int cs42l56_i2c_probe(struct i2c_client *i2c_client) if (pdata) { cs42l56->pdata = *pdata; } else { - pdata = devm_kzalloc(&i2c_client->dev, sizeof(*pdata), - GFP_KERNEL); - if (!pdata) - return -ENOMEM; - if (i2c_client->dev.of_node) { ret = cs42l56_handle_of_data(i2c_client, &cs42l56->pdata); if (ret != 0) return ret; } - cs42l56->pdata = *pdata; } if (cs42l56->pdata.gpio_nreset) { diff --git a/sound/soc/codecs/es8326.c b/sound/soc/codecs/es8326.c index 9ddf6a35e91c..28a0565c2a95 100644 --- a/sound/soc/codecs/es8326.c +++ b/sound/soc/codecs/es8326.c @@ -729,14 +729,16 @@ static int es8326_probe(struct snd_soc_component *component) } dev_dbg(component->dev, "jack-pol %x", es8326->jack_pol); - ret = device_property_read_u8(component->dev, "everest,interrupt-src", &es8326->jack_pol); + ret = device_property_read_u8(component->dev, "everest,interrupt-src", + &es8326->interrupt_src); if (ret != 0) { dev_dbg(component->dev, "interrupt-src return %d", ret); es8326->interrupt_src = ES8326_HP_DET_SRC_PIN9; } dev_dbg(component->dev, "interrupt-src %x", es8326->interrupt_src); - ret = device_property_read_u8(component->dev, "everest,interrupt-clk", &es8326->jack_pol); + ret = device_property_read_u8(component->dev, "everest,interrupt-clk", + &es8326->interrupt_clk); if (ret != 0) { dev_dbg(component->dev, "interrupt-clk return %d", ret); es8326->interrupt_clk = 0x45; diff --git a/sound/soc/codecs/rt715-sdca-sdw.c b/sound/soc/codecs/rt715-sdca-sdw.c index 3f981a9e7fb6..c54ecf3e6987 100644 --- a/sound/soc/codecs/rt715-sdca-sdw.c +++ b/sound/soc/codecs/rt715-sdca-sdw.c @@ -167,7 +167,7 @@ static int rt715_sdca_read_prop(struct sdw_slave *slave) } /* set the timeout values */ - prop->clk_stop_timeout = 20; + prop->clk_stop_timeout = 200; return 0; } diff --git a/sound/soc/codecs/tas5805m.c b/sound/soc/codecs/tas5805m.c index 
beb4ec629a03..4e38eb7acea1 100644 --- a/sound/soc/codecs/tas5805m.c +++ b/sound/soc/codecs/tas5805m.c @@ -154,6 +154,7 @@ static const uint32_t tas5805m_volume[] = { #define TAS5805M_VOLUME_MIN 0 struct tas5805m_priv { + struct i2c_client *i2c; struct regulator *pvdd; struct gpio_desc *gpio_pdn_n; @@ -165,6 +166,9 @@ struct tas5805m_priv { int vol[2]; bool is_powered; bool is_muted; + + struct work_struct work; + struct mutex lock; }; static void set_dsp_scale(struct regmap *rm, int offset, int vol) @@ -181,13 +185,11 @@ static void set_dsp_scale(struct regmap *rm, int offset, int vol) regmap_bulk_write(rm, offset, v, ARRAY_SIZE(v)); } -static void tas5805m_refresh(struct snd_soc_component *component) +static void tas5805m_refresh(struct tas5805m_priv *tas5805m) { - struct tas5805m_priv *tas5805m = - snd_soc_component_get_drvdata(component); struct regmap *rm = tas5805m->regmap; - dev_dbg(component->dev, "refresh: is_muted=%d, vol=%d/%d\n", + dev_dbg(&tas5805m->i2c->dev, "refresh: is_muted=%d, vol=%d/%d\n", tas5805m->is_muted, tas5805m->vol[0], tas5805m->vol[1]); regmap_write(rm, REG_PAGE, 0x00); @@ -201,6 +203,9 @@ static void tas5805m_refresh(struct snd_soc_component *component) set_dsp_scale(rm, 0x24, tas5805m->vol[0]); set_dsp_scale(rm, 0x28, tas5805m->vol[1]); + regmap_write(rm, REG_PAGE, 0x00); + regmap_write(rm, REG_BOOK, 0x00); + /* Set/clear digital soft-mute */ regmap_write(rm, REG_DEVICE_CTRL_2, (tas5805m->is_muted ? 
DCTRL2_MUTE : 0) | @@ -226,8 +231,11 @@ static int tas5805m_vol_get(struct snd_kcontrol *kcontrol, struct tas5805m_priv *tas5805m = snd_soc_component_get_drvdata(component); + mutex_lock(&tas5805m->lock); ucontrol->value.integer.value[0] = tas5805m->vol[0]; ucontrol->value.integer.value[1] = tas5805m->vol[1]; + mutex_unlock(&tas5805m->lock); + return 0; } @@ -243,11 +251,13 @@ static int tas5805m_vol_put(struct snd_kcontrol *kcontrol, snd_soc_kcontrol_component(kcontrol); struct tas5805m_priv *tas5805m = snd_soc_component_get_drvdata(component); + int ret = 0; if (!(volume_is_valid(ucontrol->value.integer.value[0]) && volume_is_valid(ucontrol->value.integer.value[1]))) return -EINVAL; + mutex_lock(&tas5805m->lock); if (tas5805m->vol[0] != ucontrol->value.integer.value[0] || tas5805m->vol[1] != ucontrol->value.integer.value[1]) { tas5805m->vol[0] = ucontrol->value.integer.value[0]; @@ -256,11 +266,12 @@ static int tas5805m_vol_put(struct snd_kcontrol *kcontrol, tas5805m->vol[0], tas5805m->vol[1], tas5805m->is_powered); if (tas5805m->is_powered) - tas5805m_refresh(component); - return 1; + tas5805m_refresh(tas5805m); + ret = 1; } + mutex_unlock(&tas5805m->lock); - return 0; + return ret; } static const struct snd_kcontrol_new tas5805m_snd_controls[] = { @@ -294,54 +305,83 @@ static int tas5805m_trigger(struct snd_pcm_substream *substream, int cmd, struct snd_soc_component *component = dai->component; struct tas5805m_priv *tas5805m = snd_soc_component_get_drvdata(component); - struct regmap *rm = tas5805m->regmap; - unsigned int chan, global1, global2; switch (cmd) { case SNDRV_PCM_TRIGGER_START: case SNDRV_PCM_TRIGGER_RESUME: case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: - dev_dbg(component->dev, "DSP startup\n"); - - /* We mustn't issue any I2C transactions until the I2S - * clock is stable. Furthermore, we must allow a 5ms - * delay after the first set of register writes to - * allow the DSP to boot before configuring it. 
- */ - usleep_range(5000, 10000); - send_cfg(rm, dsp_cfg_preboot, - ARRAY_SIZE(dsp_cfg_preboot)); - usleep_range(5000, 15000); - send_cfg(rm, tas5805m->dsp_cfg_data, - tas5805m->dsp_cfg_len); - - tas5805m->is_powered = true; - tas5805m_refresh(component); + dev_dbg(component->dev, "clock start\n"); + schedule_work(&tas5805m->work); break; case SNDRV_PCM_TRIGGER_STOP: case SNDRV_PCM_TRIGGER_SUSPEND: case SNDRV_PCM_TRIGGER_PAUSE_PUSH: - dev_dbg(component->dev, "DSP shutdown\n"); + break; - tas5805m->is_powered = false; + default: + return -EINVAL; + } - regmap_write(rm, REG_PAGE, 0x00); - regmap_write(rm, REG_BOOK, 0x00); + return 0; +} - regmap_read(rm, REG_CHAN_FAULT, &chan); - regmap_read(rm, REG_GLOBAL_FAULT1, &global1); - regmap_read(rm, REG_GLOBAL_FAULT2, &global2); +static void do_work(struct work_struct *work) +{ + struct tas5805m_priv *tas5805m = + container_of(work, struct tas5805m_priv, work); + struct regmap *rm = tas5805m->regmap; - dev_dbg(component->dev, - "fault regs: CHAN=%02x, GLOBAL1=%02x, GLOBAL2=%02x\n", - chan, global1, global2); + dev_dbg(&tas5805m->i2c->dev, "DSP startup\n"); - regmap_write(rm, REG_DEVICE_CTRL_2, DCTRL2_MODE_HIZ); - break; + mutex_lock(&tas5805m->lock); + /* We mustn't issue any I2C transactions until the I2S + * clock is stable. Furthermore, we must allow a 5ms + * delay after the first set of register writes to + * allow the DSP to boot before configuring it. 
+ */ + usleep_range(5000, 10000); + send_cfg(rm, dsp_cfg_preboot, ARRAY_SIZE(dsp_cfg_preboot)); + usleep_range(5000, 15000); + send_cfg(rm, tas5805m->dsp_cfg_data, tas5805m->dsp_cfg_len); + + tas5805m->is_powered = true; + tas5805m_refresh(tas5805m); + mutex_unlock(&tas5805m->lock); +} - default: - return -EINVAL; +static int tas5805m_dac_event(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + struct tas5805m_priv *tas5805m = + snd_soc_component_get_drvdata(component); + struct regmap *rm = tas5805m->regmap; + + if (event & SND_SOC_DAPM_PRE_PMD) { + unsigned int chan, global1, global2; + + dev_dbg(component->dev, "DSP shutdown\n"); + cancel_work_sync(&tas5805m->work); + + mutex_lock(&tas5805m->lock); + if (tas5805m->is_powered) { + tas5805m->is_powered = false; + + regmap_write(rm, REG_PAGE, 0x00); + regmap_write(rm, REG_BOOK, 0x00); + + regmap_read(rm, REG_CHAN_FAULT, &chan); + regmap_read(rm, REG_GLOBAL_FAULT1, &global1); + regmap_read(rm, REG_GLOBAL_FAULT2, &global2); + + dev_dbg(component->dev, "fault regs: CHAN=%02x, " + "GLOBAL1=%02x, GLOBAL2=%02x\n", + chan, global1, global2); + + regmap_write(rm, REG_DEVICE_CTRL_2, DCTRL2_MODE_HIZ); + } + mutex_unlock(&tas5805m->lock); } return 0; @@ -354,7 +394,8 @@ static const struct snd_soc_dapm_route tas5805m_audio_map[] = { static const struct snd_soc_dapm_widget tas5805m_dapm_widgets[] = { SND_SOC_DAPM_AIF_IN("DAC IN", "Playback", 0, SND_SOC_NOPM, 0, 0), - SND_SOC_DAPM_DAC("DAC", NULL, SND_SOC_NOPM, 0, 0), + SND_SOC_DAPM_DAC_E("DAC", NULL, SND_SOC_NOPM, 0, 0, + tas5805m_dac_event, SND_SOC_DAPM_PRE_PMD), SND_SOC_DAPM_OUTPUT("OUT") }; @@ -375,11 +416,14 @@ static int tas5805m_mute(struct snd_soc_dai *dai, int mute, int direction) struct tas5805m_priv *tas5805m = snd_soc_component_get_drvdata(component); + mutex_lock(&tas5805m->lock); dev_dbg(component->dev, "set mute=%d (is_powered=%d)\n", mute, 
tas5805m->is_powered); + tas5805m->is_muted = mute; if (tas5805m->is_powered) - tas5805m_refresh(component); + tas5805m_refresh(tas5805m); + mutex_unlock(&tas5805m->lock); return 0; } @@ -434,6 +478,7 @@ static int tas5805m_i2c_probe(struct i2c_client *i2c) if (!tas5805m) return -ENOMEM; + tas5805m->i2c = i2c; tas5805m->pvdd = devm_regulator_get(dev, "pvdd"); if (IS_ERR(tas5805m->pvdd)) { dev_err(dev, "failed to get pvdd supply: %ld\n", @@ -507,6 +552,9 @@ static int tas5805m_i2c_probe(struct i2c_client *i2c) gpiod_set_value(tas5805m->gpio_pdn_n, 1); usleep_range(10000, 15000); + INIT_WORK(&tas5805m->work, do_work); + mutex_init(&tas5805m->lock); + /* Don't register through devm. We need to be able to unregister * the component prior to deasserting PDN# */ @@ -527,6 +575,7 @@ static void tas5805m_i2c_remove(struct i2c_client *i2c) struct device *dev = &i2c->dev; struct tas5805m_priv *tas5805m = dev_get_drvdata(dev); + cancel_work_sync(&tas5805m->work); snd_soc_unregister_component(dev); gpiod_set_value(tas5805m->gpio_pdn_n, 0); usleep_range(10000, 15000); diff --git a/sound/soc/codecs/wsa883x.c b/sound/soc/codecs/wsa883x.c index 966ba4909204..58fdb4e9fd97 100644 --- a/sound/soc/codecs/wsa883x.c +++ b/sound/soc/codecs/wsa883x.c @@ -1359,8 +1359,8 @@ static struct snd_soc_dai_driver wsa883x_dais[] = { .stream_name = "SPKR Playback", .rates = WSA883X_RATES | WSA883X_FRAC_RATES, .formats = WSA883X_FORMATS, - .rate_max = 8000, - .rate_min = 352800, + .rate_min = 8000, + .rate_max = 352800, .channels_min = 1, .channels_max = 1, }, diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c index 1c9be8a5dcb1..35a52c3a020d 100644 --- a/sound/soc/fsl/fsl_sai.c +++ b/sound/soc/fsl/fsl_sai.c @@ -1141,6 +1141,7 @@ static int fsl_sai_check_version(struct device *dev) sai->verid.version = val & (FSL_SAI_VERID_MAJOR_MASK | FSL_SAI_VERID_MINOR_MASK); + sai->verid.version >>= FSL_SAI_VERID_MINOR_SHIFT; sai->verid.feature = val & FSL_SAI_VERID_FEATURE_MASK; ret = 
regmap_read(sai->regmap, FSL_SAI_PARAM, &val); diff --git a/sound/soc/intel/avs/core.c b/sound/soc/intel/avs/core.c index 2ca24273c491..637501850728 100644 --- a/sound/soc/intel/avs/core.c +++ b/sound/soc/intel/avs/core.c @@ -481,6 +481,29 @@ err_remap_bar0: return ret; } +static void avs_pci_shutdown(struct pci_dev *pci) +{ + struct hdac_bus *bus = pci_get_drvdata(pci); + struct avs_dev *adev = hdac_to_avs(bus); + + cancel_work_sync(&adev->probe_work); + avs_ipc_block(adev->ipc); + + snd_hdac_stop_streams(bus); + avs_dsp_op(adev, int_control, false); + snd_hdac_ext_bus_ppcap_int_enable(bus, false); + snd_hdac_ext_bus_link_power_down_all(bus); + + snd_hdac_bus_stop_chip(bus); + snd_hdac_display_power(bus, HDA_CODEC_IDX_CONTROLLER, false); + + if (avs_platattr_test(adev, CLDMA)) + pci_free_irq(pci, 0, &code_loader); + pci_free_irq(pci, 0, adev); + pci_free_irq(pci, 0, bus); + pci_free_irq_vectors(pci); +} + static void avs_pci_remove(struct pci_dev *pci) { struct hdac_device *hdev, *save; @@ -739,6 +762,7 @@ static struct pci_driver avs_pci_driver = { .id_table = avs_ids, .probe = avs_pci_probe, .remove = avs_pci_remove, + .shutdown = avs_pci_shutdown, .driver = { .pm = &avs_dev_pm, }, diff --git a/sound/soc/intel/boards/bytcht_es8316.c b/sound/soc/intel/boards/bytcht_es8316.c index 09d1f0f6d686..df157b01df8b 100644 --- a/sound/soc/intel/boards/bytcht_es8316.c +++ b/sound/soc/intel/boards/bytcht_es8316.c @@ -497,21 +497,28 @@ static int snd_byt_cht_es8316_mc_probe(struct platform_device *pdev) if (adev) { snprintf(codec_name, sizeof(codec_name), "i2c-%s", acpi_dev_name(adev)); - put_device(&adev->dev); byt_cht_es8316_dais[dai_index].codecs->name = codec_name; } else { dev_err(dev, "Error cannot find '%s' dev\n", mach->id); return -ENXIO; } + codec_dev = acpi_get_first_physical_node(adev); + acpi_dev_put(adev); + if (!codec_dev) + return -EPROBE_DEFER; + priv->codec_dev = get_device(codec_dev); + /* override platform name, if required */ byt_cht_es8316_card.dev = 
dev; platform_name = mach->mach_params.platform; ret = snd_soc_fixup_dai_links_platform_name(&byt_cht_es8316_card, platform_name); - if (ret) + if (ret) { + put_device(codec_dev); return ret; + } /* Check for BYTCR or other platform and setup quirks */ dmi_id = dmi_first_match(byt_cht_es8316_quirk_table); @@ -539,13 +546,10 @@ static int snd_byt_cht_es8316_mc_probe(struct platform_device *pdev) /* get the clock */ priv->mclk = devm_clk_get(dev, "pmc_plt_clk_3"); - if (IS_ERR(priv->mclk)) + if (IS_ERR(priv->mclk)) { + put_device(codec_dev); return dev_err_probe(dev, PTR_ERR(priv->mclk), "clk_get pmc_plt_clk_3 failed\n"); - - codec_dev = acpi_get_first_physical_node(adev); - if (!codec_dev) - return -EPROBE_DEFER; - priv->codec_dev = get_device(codec_dev); + } if (quirk & BYT_CHT_ES8316_JD_INVERTED) props[cnt++] = PROPERTY_ENTRY_BOOL("everest,jack-detect-inverted"); diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c index 4699ca79f3ea..79e0039c79a3 100644 --- a/sound/soc/intel/boards/bytcr_rt5640.c +++ b/sound/soc/intel/boards/bytcr_rt5640.c @@ -1636,13 +1636,18 @@ static int snd_byt_rt5640_mc_probe(struct platform_device *pdev) if (adev) { snprintf(byt_rt5640_codec_name, sizeof(byt_rt5640_codec_name), "i2c-%s", acpi_dev_name(adev)); - put_device(&adev->dev); byt_rt5640_dais[dai_index].codecs->name = byt_rt5640_codec_name; } else { dev_err(dev, "Error cannot find '%s' dev\n", mach->id); return -ENXIO; } + codec_dev = acpi_get_first_physical_node(adev); + acpi_dev_put(adev); + if (!codec_dev) + return -EPROBE_DEFER; + priv->codec_dev = get_device(codec_dev); + /* * swap SSP0 if bytcr is detected * (will be overridden if DMI quirk is detected) @@ -1717,11 +1722,6 @@ static int snd_byt_rt5640_mc_probe(struct platform_device *pdev) byt_rt5640_quirk = quirk_override; } - codec_dev = acpi_get_first_physical_node(adev); - if (!codec_dev) - return -EPROBE_DEFER; - priv->codec_dev = get_device(codec_dev); - if (byt_rt5640_quirk & 
BYT_RT5640_JD_HP_ELITEP_1000G2) { acpi_dev_add_driver_gpios(ACPI_COMPANION(priv->codec_dev), byt_rt5640_hp_elitepad_1000g2_gpios); diff --git a/sound/soc/intel/boards/bytcr_rt5651.c b/sound/soc/intel/boards/bytcr_rt5651.c index 81ac6eeda2e6..8fca9b82d4d0 100644 --- a/sound/soc/intel/boards/bytcr_rt5651.c +++ b/sound/soc/intel/boards/bytcr_rt5651.c @@ -922,7 +922,6 @@ static int snd_byt_rt5651_mc_probe(struct platform_device *pdev) if (adev) { snprintf(byt_rt5651_codec_name, sizeof(byt_rt5651_codec_name), "i2c-%s", acpi_dev_name(adev)); - put_device(&adev->dev); byt_rt5651_dais[dai_index].codecs->name = byt_rt5651_codec_name; } else { dev_err(dev, "Error cannot find '%s' dev\n", mach->id); @@ -930,6 +929,7 @@ static int snd_byt_rt5651_mc_probe(struct platform_device *pdev) } codec_dev = acpi_get_first_physical_node(adev); + acpi_dev_put(adev); if (!codec_dev) return -EPROBE_DEFER; priv->codec_dev = get_device(codec_dev); diff --git a/sound/soc/intel/boards/bytcr_wm5102.c b/sound/soc/intel/boards/bytcr_wm5102.c index 1669eb3bd80f..c0706537f673 100644 --- a/sound/soc/intel/boards/bytcr_wm5102.c +++ b/sound/soc/intel/boards/bytcr_wm5102.c @@ -411,9 +411,9 @@ static int snd_byt_wm5102_mc_probe(struct platform_device *pdev) return -ENOENT; } snprintf(codec_name, sizeof(codec_name), "spi-%s", acpi_dev_name(adev)); - put_device(&adev->dev); codec_dev = bus_find_device_by_name(&spi_bus_type, NULL, codec_name); + acpi_dev_put(adev); if (!codec_dev) return -EPROBE_DEFER; diff --git a/sound/soc/intel/boards/sof_cs42l42.c b/sound/soc/intel/boards/sof_cs42l42.c index e38bd2831e6a..e9d190cb13b0 100644 --- a/sound/soc/intel/boards/sof_cs42l42.c +++ b/sound/soc/intel/boards/sof_cs42l42.c @@ -336,6 +336,9 @@ static int create_spk_amp_dai_links(struct device *dev, links[*id].platforms = platform_component; links[*id].num_platforms = ARRAY_SIZE(platform_component); links[*id].dpcm_playback = 1; + /* firmware-generated echo reference */ + links[*id].dpcm_capture = 1; + 
links[*id].no_pcm = 1; links[*id].cpus = &cpus[*id]; links[*id].num_cpus = 1; diff --git a/sound/soc/intel/boards/sof_es8336.c b/sound/soc/intel/boards/sof_es8336.c index 773e5d1d87d4..894b6610b9e2 100644 --- a/sound/soc/intel/boards/sof_es8336.c +++ b/sound/soc/intel/boards/sof_es8336.c @@ -681,7 +681,6 @@ static int sof_es8336_probe(struct platform_device *pdev) if (adev) { snprintf(codec_name, sizeof(codec_name), "i2c-%s", acpi_dev_name(adev)); - put_device(&adev->dev); dai_links[0].codecs->name = codec_name; /* also fixup codec dai name if relevant */ @@ -692,16 +691,19 @@ static int sof_es8336_probe(struct platform_device *pdev) return -ENXIO; } - ret = snd_soc_fixup_dai_links_platform_name(&sof_es8336_card, - mach->mach_params.platform); - if (ret) - return ret; - codec_dev = acpi_get_first_physical_node(adev); + acpi_dev_put(adev); if (!codec_dev) return -EPROBE_DEFER; priv->codec_dev = get_device(codec_dev); + ret = snd_soc_fixup_dai_links_platform_name(&sof_es8336_card, + mach->mach_params.platform); + if (ret) { + put_device(codec_dev); + return ret; + } + if (quirk & SOF_ES8336_JD_INVERTED) props[cnt++] = PROPERTY_ENTRY_BOOL("everest,jack-detect-inverted"); diff --git a/sound/soc/intel/boards/sof_nau8825.c b/sound/soc/intel/boards/sof_nau8825.c index a800854c2831..6794a0249a9a 100644 --- a/sound/soc/intel/boards/sof_nau8825.c +++ b/sound/soc/intel/boards/sof_nau8825.c @@ -487,8 +487,6 @@ static struct snd_soc_dai_link *sof_card_dai_links_create(struct device *dev, links[id].num_codecs = ARRAY_SIZE(max_98373_components); links[id].init = max_98373_spk_codec_init; links[id].ops = &max_98373_ops; - /* feedback stream */ - links[id].dpcm_capture = 1; } else if (sof_nau8825_quirk & SOF_MAX98360A_SPEAKER_AMP_PRESENT) { max_98360a_dai_link(&links[id]); @@ -506,6 +504,9 @@ static struct snd_soc_dai_link *sof_card_dai_links_create(struct device *dev, links[id].platforms = platform_component; links[id].num_platforms = ARRAY_SIZE(platform_component); 
links[id].dpcm_playback = 1; + /* feedback stream or firmware-generated echo reference */ + links[id].dpcm_capture = 1; + links[id].no_pcm = 1; links[id].cpus = &cpus[id]; links[id].num_cpus = 1; diff --git a/sound/soc/intel/boards/sof_rt5682.c b/sound/soc/intel/boards/sof_rt5682.c index 2eabc4b0fafa..71a11d747622 100644 --- a/sound/soc/intel/boards/sof_rt5682.c +++ b/sound/soc/intel/boards/sof_rt5682.c @@ -761,8 +761,6 @@ static struct snd_soc_dai_link *sof_card_dai_links_create(struct device *dev, links[id].num_codecs = ARRAY_SIZE(max_98373_components); links[id].init = max_98373_spk_codec_init; links[id].ops = &max_98373_ops; - /* feedback stream */ - links[id].dpcm_capture = 1; } else if (sof_rt5682_quirk & SOF_MAX98360A_SPEAKER_AMP_PRESENT) { max_98360a_dai_link(&links[id]); @@ -789,6 +787,9 @@ static struct snd_soc_dai_link *sof_card_dai_links_create(struct device *dev, links[id].platforms = platform_component; links[id].num_platforms = ARRAY_SIZE(platform_component); links[id].dpcm_playback = 1; + /* feedback stream or firmware-generated echo reference */ + links[id].dpcm_capture = 1; + links[id].no_pcm = 1; links[id].cpus = &cpus[id]; links[id].num_cpus = 1; diff --git a/sound/soc/intel/boards/sof_ssp_amp.c b/sound/soc/intel/boards/sof_ssp_amp.c index 94d25aeb6e7c..7b74f122e340 100644 --- a/sound/soc/intel/boards/sof_ssp_amp.c +++ b/sound/soc/intel/boards/sof_ssp_amp.c @@ -258,13 +258,12 @@ static struct snd_soc_dai_link *sof_card_dai_links_create(struct device *dev, sof_rt1308_dai_link(&links[id]); } else if (sof_ssp_amp_quirk & SOF_CS35L41_SPEAKER_AMP_PRESENT) { cs35l41_set_dai_link(&links[id]); - - /* feedback from amplifier */ - links[id].dpcm_capture = 1; } links[id].platforms = platform_component; links[id].num_platforms = ARRAY_SIZE(platform_component); links[id].dpcm_playback = 1; + /* feedback from amplifier or firmware-generated echo reference */ + links[id].dpcm_capture = 1; links[id].no_pcm = 1; links[id].cpus = &cpus[id]; links[id].num_cpus = 
1; diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index c3be24b2fac5..a79a2fb260b8 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -1401,13 +1401,17 @@ static int soc_tplg_dapm_widget_create(struct soc_tplg *tplg, template.num_kcontrols = le32_to_cpu(w->num_kcontrols); kc = devm_kcalloc(tplg->dev, le32_to_cpu(w->num_kcontrols), sizeof(*kc), GFP_KERNEL); - if (!kc) + if (!kc) { + ret = -ENOMEM; goto hdr_err; + } kcontrol_type = devm_kcalloc(tplg->dev, le32_to_cpu(w->num_kcontrols), sizeof(unsigned int), GFP_KERNEL); - if (!kcontrol_type) + if (!kcontrol_type) { + ret = -ENOMEM; goto hdr_err; + } for (i = 0; i < le32_to_cpu(w->num_kcontrols); i++) { control_hdr = (struct snd_soc_tplg_ctl_hdr *)tplg->pos; diff --git a/sound/soc/sof/amd/acp.c b/sound/soc/sof/amd/acp.c index 6bd2888fbb66..d5ccd4d09278 100644 --- a/sound/soc/sof/amd/acp.c +++ b/sound/soc/sof/amd/acp.c @@ -318,7 +318,6 @@ static irqreturn_t acp_irq_thread(int irq, void *context) { struct snd_sof_dev *sdev = context; const struct sof_amd_acp_desc *desc = get_chip_info(sdev->pdata); - unsigned int base = desc->dsp_intr_base; unsigned int val, count = ACP_HW_SEM_RETRY_COUNT; val = snd_sof_dsp_read(sdev, ACP_DSP_BAR, desc->ext_intr_stat); @@ -328,28 +327,20 @@ static irqreturn_t acp_irq_thread(int irq, void *context) return IRQ_HANDLED; } - val = snd_sof_dsp_read(sdev, ACP_DSP_BAR, base + DSP_SW_INTR_STAT_OFFSET); - if (val & ACP_DSP_TO_HOST_IRQ) { - while (snd_sof_dsp_read(sdev, ACP_DSP_BAR, desc->hw_semaphore_offset)) { - /* Wait until acquired HW Semaphore lock or timeout */ - count--; - if (!count) { - dev_err(sdev->dev, "%s: Failed to acquire HW lock\n", __func__); - return IRQ_NONE; - } + while (snd_sof_dsp_read(sdev, ACP_DSP_BAR, desc->hw_semaphore_offset)) { + /* Wait until acquired HW Semaphore lock or timeout */ + count--; + if (!count) { + dev_err(sdev->dev, "%s: Failed to acquire HW lock\n", __func__); + return IRQ_NONE; } - - 
sof_ops(sdev)->irq_thread(irq, sdev); - val |= ACP_DSP_TO_HOST_IRQ; - snd_sof_dsp_write(sdev, ACP_DSP_BAR, base + DSP_SW_INTR_STAT_OFFSET, val); - - /* Unlock or Release HW Semaphore */ - snd_sof_dsp_write(sdev, ACP_DSP_BAR, desc->hw_semaphore_offset, 0x0); - - return IRQ_HANDLED; } - return IRQ_NONE; + sof_ops(sdev)->irq_thread(irq, sdev); + /* Unlock or Release HW Semaphore */ + snd_sof_dsp_write(sdev, ACP_DSP_BAR, desc->hw_semaphore_offset, 0x0); + + return IRQ_HANDLED; }; static irqreturn_t acp_irq_handler(int irq, void *dev_id) @@ -360,8 +351,11 @@ static irqreturn_t acp_irq_handler(int irq, void *dev_id) unsigned int val; val = snd_sof_dsp_read(sdev, ACP_DSP_BAR, base + DSP_SW_INTR_STAT_OFFSET); - if (val) + if (val) { + val |= ACP_DSP_TO_HOST_IRQ; + snd_sof_dsp_write(sdev, ACP_DSP_BAR, base + DSP_SW_INTR_STAT_OFFSET, val); return IRQ_WAKE_THREAD; + } return IRQ_NONE; } diff --git a/sound/soc/sof/intel/hda-dai.c b/sound/soc/sof/intel/hda-dai.c index 1c3d4887aa30..a642c3067ec5 100644 --- a/sound/soc/sof/intel/hda-dai.c +++ b/sound/soc/sof/intel/hda-dai.c @@ -216,6 +216,10 @@ static int hda_link_dma_hw_params(struct snd_pcm_substream *substream, sdev = snd_soc_component_get_drvdata(cpu_dai->component); bus = sof_to_bus(sdev); + hlink = snd_hdac_ext_bus_get_hlink_by_name(bus, codec_dai->component->name); + if (!hlink) + return -EINVAL; + hext_stream = snd_soc_dai_get_dma_data(cpu_dai, substream); if (!hext_stream) { hext_stream = hda_link_stream_assign(bus, substream); @@ -225,10 +229,6 @@ static int hda_link_dma_hw_params(struct snd_pcm_substream *substream, snd_soc_dai_set_dma_data(cpu_dai, substream, (void *)hext_stream); } - hlink = snd_hdac_ext_bus_get_hlink_by_name(bus, codec_dai->component->name); - if (!hlink) - return -EINVAL; - /* set the hdac_stream in the codec dai */ snd_soc_dai_set_stream(codec_dai, hdac_stream(hext_stream), substream->stream); diff --git a/sound/soc/sof/ipc4-mtrace.c b/sound/soc/sof/ipc4-mtrace.c index 70dea8ae706e..0ec6ef681012 
100644 --- a/sound/soc/sof/ipc4-mtrace.c +++ b/sound/soc/sof/ipc4-mtrace.c @@ -344,9 +344,10 @@ static ssize_t sof_ipc4_priority_mask_dfs_write(struct file *file, size_t count, loff_t *ppos) { struct sof_mtrace_priv *priv = file->private_data; - int id, ret; + unsigned int id; char *buf; u32 mask; + int ret; /* * To update Nth mask entry, write: @@ -357,9 +358,9 @@ static ssize_t sof_ipc4_priority_mask_dfs_write(struct file *file, if (IS_ERR(buf)) return PTR_ERR(buf); - ret = sscanf(buf, "%d,0x%x", &id, &mask); + ret = sscanf(buf, "%u,0x%x", &id, &mask); if (ret != 2) { - ret = sscanf(buf, "%d,%x", &id, &mask); + ret = sscanf(buf, "%u,%x", &id, &mask); if (ret != 2) { ret = -EINVAL; goto out; diff --git a/sound/soc/sof/ops.h b/sound/soc/sof/ops.h index c52752250565..3b3f3cf7af38 100644 --- a/sound/soc/sof/ops.h +++ b/sound/soc/sof/ops.h @@ -357,7 +357,7 @@ static inline u64 snd_sof_dsp_read64(struct snd_sof_dev *sdev, u32 bar, } static inline void snd_sof_dsp_update8(struct snd_sof_dev *sdev, u32 bar, - u32 offset, u8 value, u8 mask) + u32 offset, u8 mask, u8 value) { u8 reg; diff --git a/sound/soc/sof/sof-audio.c b/sound/soc/sof/sof-audio.c index 7306a2649857..865c367eb2f2 100644 --- a/sound/soc/sof/sof-audio.c +++ b/sound/soc/sof/sof-audio.c @@ -271,9 +271,9 @@ sof_unprepare_widgets_in_path(struct snd_sof_dev *sdev, struct snd_soc_dapm_widg struct snd_sof_widget *swidget = widget->dobj.private; struct snd_soc_dapm_path *p; - /* return if the widget is in use or if it is already unprepared */ - if (!swidget->prepared || swidget->use_count > 1) - return; + /* skip if the widget is in use or if it is already unprepared */ + if (!swidget || !swidget->prepared || swidget->use_count > 0) + goto sink_unprepare; if (widget_ops[widget->id].ipc_unprepare) /* unprepare the source widget */ @@ -281,6 +281,7 @@ sof_unprepare_widgets_in_path(struct snd_sof_dev *sdev, struct snd_soc_dapm_widg swidget->prepared = false; +sink_unprepare: /* unprepare all widgets in the sink paths 
*/ snd_soc_dapm_widget_for_each_sink_path(widget, p) { if (!p->walking && p->sink->dobj.private) { @@ -303,7 +304,7 @@ sof_prepare_widgets_in_path(struct snd_sof_dev *sdev, struct snd_soc_dapm_widget struct snd_soc_dapm_path *p; int ret; - if (!widget_ops[widget->id].ipc_prepare || swidget->prepared) + if (!swidget || !widget_ops[widget->id].ipc_prepare || swidget->prepared) goto sink_prepare; /* prepare the source widget */ @@ -326,7 +327,8 @@ sink_prepare: p->walking = false; if (ret < 0) { /* unprepare the source widget */ - if (widget_ops[widget->id].ipc_unprepare && swidget->prepared) { + if (widget_ops[widget->id].ipc_unprepare && + swidget && swidget->prepared) { widget_ops[widget->id].ipc_unprepare(swidget); swidget->prepared = false; } @@ -429,11 +431,11 @@ sof_walk_widgets_in_order(struct snd_sof_dev *sdev, struct snd_soc_dapm_widget_l for_each_dapm_widgets(list, i, widget) { /* starting widget for playback is AIF type */ - if (dir == SNDRV_PCM_STREAM_PLAYBACK && !WIDGET_IS_AIF(widget->id)) + if (dir == SNDRV_PCM_STREAM_PLAYBACK && widget->id != snd_soc_dapm_aif_in) continue; /* starting widget for capture is DAI type */ - if (dir == SNDRV_PCM_STREAM_CAPTURE && !WIDGET_IS_DAI(widget->id)) + if (dir == SNDRV_PCM_STREAM_CAPTURE && widget->id != snd_soc_dapm_dai_out) continue; switch (op) { diff --git a/sound/synth/emux/emux_nrpn.c b/sound/synth/emux/emux_nrpn.c index 8056422ed7c5..0d6b82ae2955 100644 --- a/sound/synth/emux/emux_nrpn.c +++ b/sound/synth/emux/emux_nrpn.c @@ -349,6 +349,9 @@ int snd_emux_xg_control(struct snd_emux_port *port, struct snd_midi_channel *chan, int param) { + if (param >= ARRAY_SIZE(chan->control)) + return -EINVAL; + return send_converted_effect(xg_effects, ARRAY_SIZE(xg_effects), port, chan, param, chan->control[param], diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 3d13fdf7590c..3ecd1ba7fd4b 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2152,6 +2152,8 @@ static const struct usb_audio_quirk_flags_table 
quirk_flags_table[] = { QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x0525, 0xa4ad, /* Hamedal C20 usb camero */ QUIRK_FLAG_IFACE_SKIP_CLOSE), + DEVICE_FLG(0x0ecb, 0x205c, /* JBL Quantum610 Wireless */ + QUIRK_FLAG_FIXED_RATE), DEVICE_FLG(0x0ecb, 0x2069, /* JBL Quantum810 Wireless */ QUIRK_FLAG_FIXED_RATE), diff --git a/tools/testing/memblock/internal.h b/tools/testing/memblock/internal.h index 85973e55489e..fdb7f5db7308 100644 --- a/tools/testing/memblock/internal.h +++ b/tools/testing/memblock/internal.h @@ -15,10 +15,6 @@ bool mirrored_kernelcore = false; struct page {}; -void __free_pages_core(struct page *page, unsigned int order) -{ -} - void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 2cf0c7a3fe23..567e07c19ecc 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -30,6 +30,8 @@ #define MAX_STRERR_LEN 256 #define MAX_TEST_NAME 80 +#define __always_unused __attribute__((__unused__)) + #define _FAIL(errnum, fmt...) 
\ ({ \ error_at_line(0, (errnum), __func__, __LINE__, fmt); \ @@ -321,7 +323,8 @@ static int socket_loopback(int family, int sotype) return socket_loopback_reuseport(family, sotype, -1); } -static void test_insert_invalid(int family, int sotype, int mapfd) +static void test_insert_invalid(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u32 key = 0; u64 value; @@ -338,7 +341,8 @@ static void test_insert_invalid(int family, int sotype, int mapfd) FAIL_ERRNO("map_update: expected EBADF"); } -static void test_insert_opened(int family, int sotype, int mapfd) +static void test_insert_opened(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u32 key = 0; u64 value; @@ -359,7 +363,8 @@ static void test_insert_opened(int family, int sotype, int mapfd) xclose(s); } -static void test_insert_bound(int family, int sotype, int mapfd) +static void test_insert_bound(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { struct sockaddr_storage addr; socklen_t len; @@ -386,7 +391,8 @@ close: xclose(s); } -static void test_insert(int family, int sotype, int mapfd) +static void test_insert(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u64 value; u32 key; @@ -402,7 +408,8 @@ static void test_insert(int family, int sotype, int mapfd) xclose(s); } -static void test_delete_after_insert(int family, int sotype, int mapfd) +static void test_delete_after_insert(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u64 value; u32 key; @@ -419,7 +426,8 @@ static void test_delete_after_insert(int family, int sotype, int mapfd) xclose(s); } -static void test_delete_after_close(int family, int sotype, int mapfd) +static void test_delete_after_close(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { int err, s; u64 value; @@ -442,7 +450,8 @@ static void 
test_delete_after_close(int family, int sotype, int mapfd) FAIL_ERRNO("map_delete: expected EINVAL/EINVAL"); } -static void test_lookup_after_insert(int family, int sotype, int mapfd) +static void test_lookup_after_insert(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u64 cookie, value; socklen_t len; @@ -470,7 +479,8 @@ static void test_lookup_after_insert(int family, int sotype, int mapfd) xclose(s); } -static void test_lookup_after_delete(int family, int sotype, int mapfd) +static void test_lookup_after_delete(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { int err, s; u64 value; @@ -493,7 +503,8 @@ static void test_lookup_after_delete(int family, int sotype, int mapfd) xclose(s); } -static void test_lookup_32_bit_value(int family, int sotype, int mapfd) +static void test_lookup_32_bit_value(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u32 key, value32; int err, s; @@ -523,7 +534,8 @@ close: xclose(s); } -static void test_update_existing(int family, int sotype, int mapfd) +static void test_update_existing(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { int s1, s2; u64 value; @@ -551,7 +563,7 @@ close_s1: /* Exercise the code path where we destroy child sockets that never * got accept()'ed, aka orphans, when parent socket gets closed. 
*/ -static void test_destroy_orphan_child(int family, int sotype, int mapfd) +static void do_destroy_orphan_child(int family, int sotype, int mapfd) { struct sockaddr_storage addr; socklen_t len; @@ -582,10 +594,38 @@ close_srv: xclose(s); } +static void test_destroy_orphan_child(struct test_sockmap_listen *skel, + int family, int sotype, int mapfd) +{ + int msg_verdict = bpf_program__fd(skel->progs.prog_msg_verdict); + int skb_verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + const struct test { + int progfd; + enum bpf_attach_type atype; + } tests[] = { + { -1, -1 }, + { msg_verdict, BPF_SK_MSG_VERDICT }, + { skb_verdict, BPF_SK_SKB_VERDICT }, + }; + const struct test *t; + + for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { + if (t->progfd != -1 && + xbpf_prog_attach(t->progfd, mapfd, t->atype, 0) != 0) + return; + + do_destroy_orphan_child(family, sotype, mapfd); + + if (t->progfd != -1) + xbpf_prog_detach2(t->progfd, mapfd, t->atype); + } +} + /* Perform a passive open after removing listening socket from SOCKMAP * to ensure that callbacks get restored properly. */ -static void test_clone_after_delete(int family, int sotype, int mapfd) +static void test_clone_after_delete(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { struct sockaddr_storage addr; socklen_t len; @@ -621,7 +661,8 @@ close_srv: * SOCKMAP, but got accept()'ed only after the parent has been removed * from SOCKMAP, gets cloned without parent psock state or callbacks. */ -static void test_accept_after_delete(int family, int sotype, int mapfd) +static void test_accept_after_delete(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { struct sockaddr_storage addr; const u32 zero = 0; @@ -675,7 +716,8 @@ close_srv: /* Check that child socket that got created and accepted while parent * was in a SOCKMAP is cloned without parent psock state or callbacks. 
*/ -static void test_accept_before_delete(int family, int sotype, int mapfd) +static void test_accept_before_delete(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { struct sockaddr_storage addr; const u32 zero = 0, one = 1; @@ -784,7 +826,8 @@ done: return NULL; } -static void test_syn_recv_insert_delete(int family, int sotype, int mapfd) +static void test_syn_recv_insert_delete(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { struct connect_accept_ctx ctx = { 0 }; struct sockaddr_storage addr; @@ -847,7 +890,8 @@ static void *listen_thread(void *arg) return NULL; } -static void test_race_insert_listen(int family, int socktype, int mapfd) +static void test_race_insert_listen(struct test_sockmap_listen *skel __always_unused, + int family, int socktype, int mapfd) { struct connect_accept_ctx ctx = { 0 }; const u32 zero = 0; @@ -1473,7 +1517,8 @@ static void test_ops(struct test_sockmap_listen *skel, struct bpf_map *map, int family, int sotype) { const struct op_test { - void (*fn)(int family, int sotype, int mapfd); + void (*fn)(struct test_sockmap_listen *skel, + int family, int sotype, int mapfd); const char *name; int sotype; } tests[] = { @@ -1520,7 +1565,7 @@ static void test_ops(struct test_sockmap_listen *skel, struct bpf_map *map, if (!test__start_subtest(s)) continue; - t->fn(family, sotype, map_fd); + t->fn(skel, family, sotype, map_fd); test_ops_cleanup(map); } } diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h index 92331053dba3..7bd76b9e0f98 100644 --- a/tools/testing/selftests/bpf/progs/profiler.inc.h +++ b/tools/testing/selftests/bpf/progs/profiler.inc.h @@ -826,7 +826,7 @@ out: SEC("kprobe/vfs_link") int BPF_KPROBE(kprobe__vfs_link, - struct dentry* old_dentry, struct user_namespace *mnt_userns, + struct dentry* old_dentry, struct mnt_idmap *idmap, struct inode* dir, struct dentry* new_dentry, struct inode** 
delegated_inode) { diff --git a/tools/testing/selftests/bpf/verifier/search_pruning.c b/tools/testing/selftests/bpf/verifier/search_pruning.c index 68b14fdfebdb..d63fd8991b03 100644 --- a/tools/testing/selftests/bpf/verifier/search_pruning.c +++ b/tools/testing/selftests/bpf/verifier/search_pruning.c @@ -225,3 +225,39 @@ .result_unpriv = ACCEPT, .insn_processed = 15, }, +/* The test performs a conditional 64-bit write to a stack location + * fp[-8], this is followed by an unconditional 8-bit write to fp[-8], + * then data is read from fp[-8]. This sequence is unsafe. + * + * The test would be mistakenly marked as safe w/o dst register parent + * preservation in verifier.c:copy_register_state() function. + * + * Note the usage of BPF_F_TEST_STATE_FREQ to force creation of the + * checkpoint state after conditional 64-bit assignment. + */ +{ + "write tracking and register parent chain bug", + .insns = { + /* r6 = ktime_get_ns() */ + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* r0 = ktime_get_ns() */ + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + /* if r0 > r6 goto +1 */ + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_6, 1), + /* *(u64 *)(r10 - 8) = 0xdeadbeef */ + BPF_ST_MEM(BPF_DW, BPF_REG_FP, -8, 0xdeadbeef), + /* r1 = 42 */ + BPF_MOV64_IMM(BPF_REG_1, 42), + /* *(u8 *)(r10 - 8) = r1 */ + BPF_STX_MEM(BPF_B, BPF_REG_FP, BPF_REG_1, -8), + /* r2 = *(u64 *)(r10 - 8) */ + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_FP, -8), + /* exit(0) */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .flags = BPF_F_TEST_STATE_FREQ, + .errstr = "invalid read from stack off -8+1 size 8", + .result = REJECT, +}, diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 186e1c26867e..75c100de90ff 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -268,6 +268,7 @@ TEST_MATRIX=( # Taking away all CPUs from parent or itself if 
there are tasks # will make the partition invalid. " S+ C2-3:P1:S+ C3:P1 . . T C2-3 . . 0 A1:2-3,A2:2-3 A1:P1,A2:P-1" + " S+ C3:P1:S+ C3 . . T P1 . . 0 A1:3,A2:3 A1:P1,A2:P-1" " S+ $SETUP_A123_PARTITIONS . T:C2-3 . . . 0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1" " S+ $SETUP_A123_PARTITIONS . T:C2-3:C1-3 . . . 0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1" diff --git a/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh b/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh index 9c79bbcce5a8..aff0a59f92d9 100755 --- a/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh +++ b/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh @@ -246,7 +246,7 @@ test_vlan_ingress_modify() bridge vlan add dev $swp2 vid 300 tc filter add dev $swp1 ingress chain $(IS1 2) pref 3 \ - protocol 802.1Q flower skip_sw vlan_id 200 \ + protocol 802.1Q flower skip_sw vlan_id 200 src_mac $h1_mac \ action vlan modify id 300 \ action goto chain $(IS2 0 0) diff --git a/tools/testing/selftests/filesystems/fat/run_fat_tests.sh b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh index 7f35dc3d15df..7f35dc3d15df 100644..100755 --- a/tools/testing/selftests/filesystems/fat/run_fat_tests.sh +++ b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index beb944fa6fd4..54680dc5887f 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -237,6 +237,11 @@ static void guest_check_s1ptw_wr_in_dirty_log(void) GUEST_SYNC(CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG); } +static void guest_check_no_s1ptw_wr_in_dirty_log(void) +{ + GUEST_SYNC(CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG); +} + static void guest_exec(void) { int (*code)(void) = (int (*)(void))TEST_EXEC_GVA; @@ -304,7 +309,7 @@ static struct uffd_args { /* Returns true to continue the test, and false if it should be skipped. 
*/ static int uffd_generic_handler(int uffd_mode, int uffd, struct uffd_msg *msg, - struct uffd_args *args, bool expect_write) + struct uffd_args *args) { uint64_t addr = msg->arg.pagefault.address; uint64_t flags = msg->arg.pagefault.flags; @@ -313,7 +318,6 @@ static int uffd_generic_handler(int uffd_mode, int uffd, struct uffd_msg *msg, TEST_ASSERT(uffd_mode == UFFDIO_REGISTER_MODE_MISSING, "The only expected UFFD mode is MISSING"); - ASSERT_EQ(!!(flags & UFFD_PAGEFAULT_FLAG_WRITE), expect_write); ASSERT_EQ(addr, (uint64_t)args->hva); pr_debug("uffd fault: addr=%p write=%d\n", @@ -337,19 +341,14 @@ static int uffd_generic_handler(int uffd_mode, int uffd, struct uffd_msg *msg, return 0; } -static int uffd_pt_write_handler(int mode, int uffd, struct uffd_msg *msg) -{ - return uffd_generic_handler(mode, uffd, msg, &pt_args, true); -} - -static int uffd_data_write_handler(int mode, int uffd, struct uffd_msg *msg) +static int uffd_pt_handler(int mode, int uffd, struct uffd_msg *msg) { - return uffd_generic_handler(mode, uffd, msg, &data_args, true); + return uffd_generic_handler(mode, uffd, msg, &pt_args); } -static int uffd_data_read_handler(int mode, int uffd, struct uffd_msg *msg) +static int uffd_data_handler(int mode, int uffd, struct uffd_msg *msg) { - return uffd_generic_handler(mode, uffd, msg, &data_args, false); + return uffd_generic_handler(mode, uffd, msg, &data_args); } static void setup_uffd_args(struct userspace_mem_region *region, @@ -471,9 +470,12 @@ static bool handle_cmd(struct kvm_vm *vm, int cmd) { struct userspace_mem_region *data_region, *pt_region; bool continue_test = true; + uint64_t pte_gpa, pte_pg; data_region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); pt_region = vm_get_mem_region(vm, MEM_REGION_PT); + pte_gpa = addr_hva2gpa(vm, virt_get_pte_hva(vm, TEST_GVA)); + pte_pg = (pte_gpa - pt_region->region.guest_phys_addr) / getpagesize(); if (cmd == CMD_SKIP_TEST) continue_test = false; @@ -486,13 +488,13 @@ static bool handle_cmd(struct 
kvm_vm *vm, int cmd) TEST_ASSERT(check_write_in_dirty_log(vm, data_region, 0), "Missing write in dirty log"); if (cmd & CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG) - TEST_ASSERT(check_write_in_dirty_log(vm, pt_region, 0), + TEST_ASSERT(check_write_in_dirty_log(vm, pt_region, pte_pg), "Missing s1ptw write in dirty log"); if (cmd & CMD_CHECK_NO_WRITE_IN_DIRTY_LOG) TEST_ASSERT(!check_write_in_dirty_log(vm, data_region, 0), "Unexpected write in dirty log"); if (cmd & CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG) - TEST_ASSERT(!check_write_in_dirty_log(vm, pt_region, 0), + TEST_ASSERT(!check_write_in_dirty_log(vm, pt_region, pte_pg), "Unexpected s1ptw write in dirty log"); return continue_test; @@ -797,7 +799,7 @@ static void help(char *name) .expected_events = { .uffd_faults = _uffd_faults, }, \ } -#define TEST_DIRTY_LOG(_access, _with_af, _test_check) \ +#define TEST_DIRTY_LOG(_access, _with_af, _test_check, _pt_check) \ { \ .name = SCAT3(dirty_log, _access, _with_af), \ .data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ @@ -805,13 +807,12 @@ static void help(char *name) .guest_prepare = { _PREPARE(_with_af), \ _PREPARE(_access) }, \ .guest_test = _access, \ - .guest_test_check = { _CHECK(_with_af), _test_check, \ - guest_check_s1ptw_wr_in_dirty_log}, \ + .guest_test_check = { _CHECK(_with_af), _test_check, _pt_check }, \ .expected_events = { 0 }, \ } #define TEST_UFFD_AND_DIRTY_LOG(_access, _with_af, _uffd_data_handler, \ - _uffd_faults, _test_check) \ + _uffd_faults, _test_check, _pt_check) \ { \ .name = SCAT3(uffd_and_dirty_log, _access, _with_af), \ .data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ @@ -820,16 +821,17 @@ static void help(char *name) _PREPARE(_access) }, \ .guest_test = _access, \ .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ - .guest_test_check = { _CHECK(_with_af), _test_check }, \ + .guest_test_check = { _CHECK(_with_af), _test_check, _pt_check }, \ .uffd_data_handler = _uffd_data_handler, \ - .uffd_pt_handler = uffd_pt_write_handler, \ + .uffd_pt_handler = 
uffd_pt_handler, \ .expected_events = { .uffd_faults = _uffd_faults, }, \ } #define TEST_RO_MEMSLOT(_access, _mmio_handler, _mmio_exits) \ { \ - .name = SCAT3(ro_memslot, _access, _with_af), \ + .name = SCAT2(ro_memslot, _access), \ .data_memslot_flags = KVM_MEM_READONLY, \ + .pt_memslot_flags = KVM_MEM_READONLY, \ .guest_prepare = { _PREPARE(_access) }, \ .guest_test = _access, \ .mmio_handler = _mmio_handler, \ @@ -840,6 +842,7 @@ static void help(char *name) { \ .name = SCAT2(ro_memslot_no_syndrome, _access), \ .data_memslot_flags = KVM_MEM_READONLY, \ + .pt_memslot_flags = KVM_MEM_READONLY, \ .guest_test = _access, \ .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ .expected_events = { .fail_vcpu_runs = 1 }, \ @@ -848,9 +851,9 @@ static void help(char *name) #define TEST_RO_MEMSLOT_AND_DIRTY_LOG(_access, _mmio_handler, _mmio_exits, \ _test_check) \ { \ - .name = SCAT3(ro_memslot, _access, _with_af), \ + .name = SCAT2(ro_memslot, _access), \ .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ - .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ .guest_prepare = { _PREPARE(_access) }, \ .guest_test = _access, \ .guest_test_check = { _test_check }, \ @@ -862,7 +865,7 @@ static void help(char *name) { \ .name = SCAT2(ro_memslot_no_syn_and_dlog, _access), \ .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ - .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ .guest_test = _access, \ .guest_test_check = { _test_check }, \ .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ @@ -874,11 +877,12 @@ static void help(char *name) { \ .name = SCAT2(ro_memslot_uffd, _access), \ .data_memslot_flags = KVM_MEM_READONLY, \ + .pt_memslot_flags = KVM_MEM_READONLY, \ .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ .guest_prepare = { _PREPARE(_access) }, \ .guest_test = _access, \ 
.uffd_data_handler = _uffd_data_handler, \ - .uffd_pt_handler = uffd_pt_write_handler, \ + .uffd_pt_handler = uffd_pt_handler, \ .mmio_handler = _mmio_handler, \ .expected_events = { .mmio_exits = _mmio_exits, \ .uffd_faults = _uffd_faults }, \ @@ -889,10 +893,11 @@ static void help(char *name) { \ .name = SCAT2(ro_memslot_no_syndrome, _access), \ .data_memslot_flags = KVM_MEM_READONLY, \ + .pt_memslot_flags = KVM_MEM_READONLY, \ .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ .guest_test = _access, \ .uffd_data_handler = _uffd_data_handler, \ - .uffd_pt_handler = uffd_pt_write_handler, \ + .uffd_pt_handler = uffd_pt_handler, \ .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ .expected_events = { .fail_vcpu_runs = 1, \ .uffd_faults = _uffd_faults }, \ @@ -933,44 +938,51 @@ static struct test_desc tests[] = { * (S1PTW). */ TEST_UFFD(guest_read64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_read_handler, uffd_pt_write_handler, 2), - /* no_af should also lead to a PT write. */ + uffd_data_handler, uffd_pt_handler, 2), TEST_UFFD(guest_read64, no_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_read_handler, uffd_pt_write_handler, 2), - /* Note how that cas invokes the read handler. */ + uffd_data_handler, uffd_pt_handler, 2), TEST_UFFD(guest_cas, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_read_handler, uffd_pt_write_handler, 2), + uffd_data_handler, uffd_pt_handler, 2), /* * Can't test guest_at with_af as it's IMPDEF whether the AF is set. * The S1PTW fault should still be marked as a write. 
*/ TEST_UFFD(guest_at, no_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_read_handler, uffd_pt_write_handler, 1), + uffd_no_handler, uffd_pt_handler, 1), TEST_UFFD(guest_ld_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_read_handler, uffd_pt_write_handler, 2), + uffd_data_handler, uffd_pt_handler, 2), TEST_UFFD(guest_write64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_write_handler, uffd_pt_write_handler, 2), + uffd_data_handler, uffd_pt_handler, 2), TEST_UFFD(guest_dc_zva, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_write_handler, uffd_pt_write_handler, 2), + uffd_data_handler, uffd_pt_handler, 2), TEST_UFFD(guest_st_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_write_handler, uffd_pt_write_handler, 2), + uffd_data_handler, uffd_pt_handler, 2), TEST_UFFD(guest_exec, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_read_handler, uffd_pt_write_handler, 2), + uffd_data_handler, uffd_pt_handler, 2), /* * Try accesses when the data and PT memory regions are both * tracked for dirty logging. */ - TEST_DIRTY_LOG(guest_read64, with_af, guest_check_no_write_in_dirty_log), - /* no_af should also lead to a PT write. 
*/ - TEST_DIRTY_LOG(guest_read64, no_af, guest_check_no_write_in_dirty_log), - TEST_DIRTY_LOG(guest_ld_preidx, with_af, guest_check_no_write_in_dirty_log), - TEST_DIRTY_LOG(guest_at, no_af, guest_check_no_write_in_dirty_log), - TEST_DIRTY_LOG(guest_exec, with_af, guest_check_no_write_in_dirty_log), - TEST_DIRTY_LOG(guest_write64, with_af, guest_check_write_in_dirty_log), - TEST_DIRTY_LOG(guest_cas, with_af, guest_check_write_in_dirty_log), - TEST_DIRTY_LOG(guest_dc_zva, with_af, guest_check_write_in_dirty_log), - TEST_DIRTY_LOG(guest_st_preidx, with_af, guest_check_write_in_dirty_log), + TEST_DIRTY_LOG(guest_read64, with_af, guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_read64, no_af, guest_check_no_write_in_dirty_log, + guest_check_no_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_ld_preidx, with_af, + guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_at, no_af, guest_check_no_write_in_dirty_log, + guest_check_no_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_exec, with_af, guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_write64, with_af, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_cas, with_af, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_dc_zva, with_af, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_st_preidx, with_af, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), /* * Access when the data and PT memory regions are both marked for @@ -980,29 +992,43 @@ static struct test_desc tests[] = { * fault, and nothing in the dirty log. Any S1PTW should result in * a write in the dirty log and a userfaultfd write. 
*/ - TEST_UFFD_AND_DIRTY_LOG(guest_read64, with_af, uffd_data_read_handler, 2, - guest_check_no_write_in_dirty_log), - /* no_af should also lead to a PT write. */ - TEST_UFFD_AND_DIRTY_LOG(guest_read64, no_af, uffd_data_read_handler, 2, - guest_check_no_write_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_ld_preidx, with_af, uffd_data_read_handler, - 2, guest_check_no_write_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_at, with_af, 0, 1, - guest_check_no_write_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_exec, with_af, uffd_data_read_handler, 2, - guest_check_no_write_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_write64, with_af, uffd_data_write_handler, - 2, guest_check_write_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_cas, with_af, uffd_data_read_handler, 2, - guest_check_write_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_dc_zva, with_af, uffd_data_write_handler, - 2, guest_check_write_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_read64, with_af, + uffd_data_handler, 2, + guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_read64, no_af, + uffd_data_handler, 2, + guest_check_no_write_in_dirty_log, + guest_check_no_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_ld_preidx, with_af, + uffd_data_handler, + 2, guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_at, with_af, uffd_no_handler, 1, + guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_exec, with_af, + uffd_data_handler, 2, + guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_write64, with_af, + uffd_data_handler, + 2, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_cas, with_af, + uffd_data_handler, 2, + guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_dc_zva, with_af, + 
uffd_data_handler, + 2, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), TEST_UFFD_AND_DIRTY_LOG(guest_st_preidx, with_af, - uffd_data_write_handler, 2, - guest_check_write_in_dirty_log), - + uffd_data_handler, 2, + guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), /* - * Try accesses when the data memory region is marked read-only + * Access when both the PT and data regions are marked read-only * (with KVM_MEM_READONLY). Writes with a syndrome result in an * MMIO exit, writes with no syndrome (e.g., CAS) result in a * failed vcpu run, and reads/execs with and without syndroms do @@ -1018,7 +1044,7 @@ static struct test_desc tests[] = { TEST_RO_MEMSLOT_NO_SYNDROME(guest_st_preidx), /* - * Access when both the data region is both read-only and marked + * The PT and data regions are both read-only and marked * for dirty logging at the same time. The expected result is that * for writes there should be no write in the dirty log. The * readonly handling is the same as if the memslot was not marked @@ -1043,7 +1069,7 @@ static struct test_desc tests[] = { guest_check_no_write_in_dirty_log), /* - * Access when the data region is both read-only and punched with + * The PT and data regions are both read-only and punched with * holes tracked with userfaultfd. The expected result is the * union of both userfaultfd and read-only behaviors. For example, * write accesses result in a userfaultfd write fault and an MMIO @@ -1051,22 +1077,15 @@ static struct test_desc tests[] = { * no userfaultfd write fault. Reads result in userfaultfd getting * triggered. 
*/ - TEST_RO_MEMSLOT_AND_UFFD(guest_read64, 0, 0, - uffd_data_read_handler, 2), - TEST_RO_MEMSLOT_AND_UFFD(guest_ld_preidx, 0, 0, - uffd_data_read_handler, 2), - TEST_RO_MEMSLOT_AND_UFFD(guest_at, 0, 0, - uffd_no_handler, 1), - TEST_RO_MEMSLOT_AND_UFFD(guest_exec, 0, 0, - uffd_data_read_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_read64, 0, 0, uffd_data_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_ld_preidx, 0, 0, uffd_data_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_at, 0, 0, uffd_no_handler, 1), + TEST_RO_MEMSLOT_AND_UFFD(guest_exec, 0, 0, uffd_data_handler, 2), TEST_RO_MEMSLOT_AND_UFFD(guest_write64, mmio_on_test_gpa_handler, 1, - uffd_data_write_handler, 2), - TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_cas, - uffd_data_read_handler, 2), - TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_dc_zva, - uffd_no_handler, 1), - TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_st_preidx, - uffd_no_handler, 1), + uffd_data_handler, 2), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_cas, uffd_data_handler, 2), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_dc_zva, uffd_no_handler, 1), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_st_preidx, uffd_no_handler, 1), { 0 } }; diff --git a/tools/testing/selftests/net/cmsg_ipv6.sh b/tools/testing/selftests/net/cmsg_ipv6.sh index 2d89cb0ad288..330d0b1ceced 100755 --- a/tools/testing/selftests/net/cmsg_ipv6.sh +++ b/tools/testing/selftests/net/cmsg_ipv6.sh @@ -6,7 +6,7 @@ ksft_skip=4 NS=ns IP6=2001:db8:1::1/64 TGT6=2001:db8:1::2 -TMPF=`mktemp` +TMPF=$(mktemp --suffix ".pcap") cleanup() { diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index c245476fa29d..63c3eaec8d30 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -10,8 +10,10 @@ ret=0 PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} IP="ip -netns testns" +IP_PEER="ip -netns peerns" RTABLE=100 +RTABLE_PEER=101 GW_IP4=192.51.100.2 SRC_IP=192.51.100.3 GW_IP6=2001:db8:1::2 @@ -20,7 +22,9 
@@ SRC_IP6=2001:db8:1::3 DEV_ADDR=192.51.100.1 DEV_ADDR6=2001:db8:1::1 DEV=dummy0 -TESTS="fib_rule6 fib_rule4" +TESTS="fib_rule6 fib_rule4 fib_rule6_connect fib_rule4_connect" + +SELFTEST_PATH="" log_test() { @@ -52,6 +56,31 @@ log_section() echo "######################################################################" } +check_nettest() +{ + if which nettest > /dev/null 2>&1; then + return 0 + fi + + # Add the selftest directory to PATH if not already done + if [ "${SELFTEST_PATH}" = "" ]; then + SELFTEST_PATH="$(dirname $0)" + PATH="${PATH}:${SELFTEST_PATH}" + + # Now retry with the new path + if which nettest > /dev/null 2>&1; then + return 0 + fi + + if [ "${ret}" -eq 0 ]; then + ret="${ksft_skip}" + fi + echo "nettest not found (try 'make -C ${SELFTEST_PATH} nettest')" + fi + + return 1 +} + setup() { set -e @@ -72,6 +101,39 @@ cleanup() ip netns del testns } +setup_peer() +{ + set -e + + ip netns add peerns + $IP_PEER link set dev lo up + + ip link add name veth0 netns testns type veth \ + peer name veth1 netns peerns + $IP link set dev veth0 up + $IP_PEER link set dev veth1 up + + $IP address add 192.0.2.10 peer 192.0.2.11/32 dev veth0 + $IP_PEER address add 192.0.2.11 peer 192.0.2.10/32 dev veth1 + + $IP address add 2001:db8::10 peer 2001:db8::11/128 dev veth0 nodad + $IP_PEER address add 2001:db8::11 peer 2001:db8::10/128 dev veth1 nodad + + $IP_PEER address add 198.51.100.11/32 dev lo + $IP route add table $RTABLE_PEER 198.51.100.11/32 via 192.0.2.11 + + $IP_PEER address add 2001:db8::1:11/128 dev lo + $IP route add table $RTABLE_PEER 2001:db8::1:11/128 via 2001:db8::11 + + set +e +} + +cleanup_peer() +{ + $IP link del dev veth0 + ip netns del peerns +} + fib_check_iproute_support() { ip rule help 2>&1 | grep -q $1 @@ -190,6 +252,37 @@ fib_rule6_test() fi } +# Verify that the IPV6_TCLASS option of UDPv6 and TCPv6 sockets is properly +# taken into account when connecting the socket and when sending packets. 
+fib_rule6_connect_test() +{ + local dsfield + + if ! check_nettest; then + echo "SKIP: Could not run test without nettest tool" + return + fi + + setup_peer + $IP -6 rule add dsfield 0x04 table $RTABLE_PEER + + # Combine the base DS Field value (0x04) with all possible ECN values + # (Not-ECT: 0, ECT(1): 1, ECT(0): 2, CE: 3). + # The ECN bits shouldn't influence the result of the test. + for dsfield in 0x04 0x05 0x06 0x07; do + nettest -q -6 -B -t 5 -N testns -O peerns -U -D \ + -Q "${dsfield}" -l 2001:db8::1:11 -r 2001:db8::1:11 + log_test $? 0 "rule6 dsfield udp connect (dsfield ${dsfield})" + + nettest -q -6 -B -t 5 -N testns -O peerns -Q "${dsfield}" \ + -l 2001:db8::1:11 -r 2001:db8::1:11 + log_test $? 0 "rule6 dsfield tcp connect (dsfield ${dsfield})" + done + + $IP -6 rule del dsfield 0x04 table $RTABLE_PEER + cleanup_peer +} + fib_rule4_del() { $IP rule del $1 @@ -296,6 +389,37 @@ fib_rule4_test() fi } +# Verify that the IP_TOS option of UDPv4 and TCPv4 sockets is properly taken +# into account when connecting the socket and when sending packets. +fib_rule4_connect_test() +{ + local dsfield + + if ! check_nettest; then + echo "SKIP: Could not run test without nettest tool" + return + fi + + setup_peer + $IP -4 rule add dsfield 0x04 table $RTABLE_PEER + + # Combine the base DS Field value (0x04) with all possible ECN values + # (Not-ECT: 0, ECT(1): 1, ECT(0): 2, CE: 3). + # The ECN bits shouldn't influence the result of the test. + for dsfield in 0x04 0x05 0x06 0x07; do + nettest -q -B -t 5 -N testns -O peerns -D -U -Q "${dsfield}" \ + -l 198.51.100.11 -r 198.51.100.11 + log_test $? 0 "rule4 dsfield udp connect (dsfield ${dsfield})" + + nettest -q -B -t 5 -N testns -O peerns -Q "${dsfield}" \ + -l 198.51.100.11 -r 198.51.100.11 + log_test $? 
0 "rule4 dsfield tcp connect (dsfield ${dsfield})" + done + + $IP -4 rule del dsfield 0x04 table $RTABLE_PEER + cleanup_peer +} + run_fibrule_tests() { log_section "IPv4 fib rule" @@ -345,6 +469,8 @@ do case $t in fib_rule6_test|fib_rule6) fib_rule6_test;; fib_rule4_test|fib_rule4) fib_rule4_test;; + fib_rule6_connect_test|fib_rule6_connect) fib_rule6_connect_test;; + fib_rule4_connect_test|fib_rule4_connect) fib_rule4_connect_test;; help) echo "Test names: $TESTS"; exit 0;; diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 1c4f866de7d7..3d8e4ebda1b6 100755 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -914,14 +914,14 @@ sysctl_set() local value=$1; shift SYSCTL_ORIG[$key]=$(sysctl -n $key) - sysctl -qw $key=$value + sysctl -qw $key="$value" } sysctl_restore() { local key=$1; shift - sysctl -qw $key=${SYSCTL_ORIG["$key"]} + sysctl -qw $key="${SYSCTL_ORIG[$key]}" } forwarding_enable() diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index d11d3d566608..079f8f46849d 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -498,6 +498,12 @@ kill_events_pids() kill_wait $evts_ns2_pid } +kill_tests_wait() +{ + kill -SIGUSR1 $(ip netns pids $ns2) $(ip netns pids $ns1) + wait +} + pm_nl_set_limits() { local ns=$1 @@ -1694,6 +1700,7 @@ chk_subflow_nr() local subflow_nr=$3 local cnt1 local cnt2 + local dump_stats if [ -n "${need_title}" ]; then printf "%03u %-36s %s" "${TEST_COUNT}" "${TEST_NAME}" "${msg}" @@ -1711,7 +1718,12 @@ chk_subflow_nr() echo "[ ok ]" fi - [ "${dump_stats}" = 1 ] && ( ss -N $ns1 -tOni ; ss -N $ns1 -tOni | grep token; ip -n $ns1 mptcp endpoint ) + if [ "${dump_stats}" = 1 ]; then + ss -N $ns1 -tOni + ss -N $ns1 -tOni | grep token + ip -n $ns1 mptcp endpoint + dump_stats + fi } chk_link_usage() @@ -3049,7 
+3061,7 @@ endpoint_tests() pm_nl_set_limits $ns1 2 2 pm_nl_set_limits $ns2 2 2 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal - run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow & + run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow 2>/dev/null & wait_mpj $ns1 pm_nl_check_endpoint 1 "creation" \ @@ -3062,14 +3074,14 @@ endpoint_tests() pm_nl_add_endpoint $ns2 10.0.2.2 flags signal pm_nl_check_endpoint 0 "modif is allowed" \ $ns2 10.0.2.2 id 1 flags signal - wait + kill_tests_wait fi if reset "delete and re-add"; then pm_nl_set_limits $ns1 1 1 pm_nl_set_limits $ns2 1 1 pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow - run_tests $ns1 $ns2 10.0.1.1 4 0 0 slow & + run_tests $ns1 $ns2 10.0.1.1 4 0 0 speed_20 2>/dev/null & wait_mpj $ns2 pm_nl_del_endpoint $ns2 2 10.0.2.2 @@ -3079,7 +3091,7 @@ endpoint_tests() pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow wait_mpj $ns2 chk_subflow_nr "" "after re-add" 2 - wait + kill_tests_wait fi } diff --git a/tools/testing/selftests/net/nettest.c b/tools/testing/selftests/net/nettest.c index 7900fa98eccb..ee9a72982705 100644 --- a/tools/testing/selftests/net/nettest.c +++ b/tools/testing/selftests/net/nettest.c @@ -87,6 +87,7 @@ struct sock_args { int use_setsockopt; int use_freebind; int use_cmsg; + uint8_t dsfield; const char *dev; const char *server_dev; int ifindex; @@ -580,6 +581,36 @@ static int set_reuseaddr(int sd) return rc; } +static int set_dsfield(int sd, int version, int dsfield) +{ + if (!dsfield) + return 0; + + switch (version) { + case AF_INET: + if (setsockopt(sd, SOL_IP, IP_TOS, &dsfield, + sizeof(dsfield)) < 0) { + log_err_errno("setsockopt(IP_TOS)"); + return -1; + } + break; + + case AF_INET6: + if (setsockopt(sd, SOL_IPV6, IPV6_TCLASS, &dsfield, + sizeof(dsfield)) < 0) { + log_err_errno("setsockopt(IPV6_TCLASS)"); + return -1; + } + break; + + default: + log_error("Invalid address family\n"); + return -1; + } + + return 0; +} + static int str_to_uint(const char *str, int min, int max, unsigned int *value) 
{ int number; @@ -1317,6 +1348,9 @@ static int msock_init(struct sock_args *args, int server) (char *)&one, sizeof(one)) < 0) log_err_errno("Setting SO_BROADCAST error"); + if (set_dsfield(sd, AF_INET, args->dsfield) != 0) + goto out_err; + if (args->dev && bind_to_device(sd, args->dev) != 0) goto out_err; else if (args->use_setsockopt && @@ -1445,6 +1479,9 @@ static int lsock_init(struct sock_args *args) if (set_reuseport(sd) != 0) goto err; + if (set_dsfield(sd, args->version, args->dsfield) != 0) + goto err; + if (args->dev && bind_to_device(sd, args->dev) != 0) goto err; else if (args->use_setsockopt && @@ -1658,6 +1695,9 @@ static int connectsock(void *addr, socklen_t alen, struct sock_args *args) if (set_reuseport(sd) != 0) goto err; + if (set_dsfield(sd, args->version, args->dsfield) != 0) + goto err; + if (args->dev && bind_to_device(sd, args->dev) != 0) goto err; else if (args->use_setsockopt && @@ -1862,7 +1902,7 @@ static int ipc_parent(int cpid, int fd, struct sock_args *args) return client_status; } -#define GETOPT_STR "sr:l:c:p:t:g:P:DRn:M:X:m:d:I:BN:O:SUCi6xL:0:1:2:3:Fbqf" +#define GETOPT_STR "sr:l:c:Q:p:t:g:P:DRn:M:X:m:d:I:BN:O:SUCi6xL:0:1:2:3:Fbqf" #define OPT_FORCE_BIND_KEY_IFINDEX 1001 #define OPT_NO_BIND_KEY_IFINDEX 1002 @@ -1893,6 +1933,8 @@ static void print_usage(char *prog) " -D|R datagram (D) / raw (R) socket (default stream)\n" " -l addr local address to bind to in server mode\n" " -c addr local address to bind to in client mode\n" + " -Q dsfield DS Field value of the socket (the IP_TOS or\n" + " IPV6_TCLASS socket option)\n" " -x configure XFRM policy on socket\n" "\n" " -d dev bind socket to given device name\n" @@ -1971,6 +2013,13 @@ int main(int argc, char *argv[]) args.has_local_ip = 1; args.client_local_addr_str = optarg; break; + case 'Q': + if (str_to_uint(optarg, 0, 255, &tmp) != 0) { + fprintf(stderr, "Invalid DS Field\n"); + return 1; + } + args.dsfield = tmp; + break; case 'p': if (str_to_uint(optarg, 1, 65535, &tmp) != 0) { 
fprintf(stderr, "Invalid port\n"); diff --git a/tools/testing/selftests/net/test_vxlan_vnifiltering.sh b/tools/testing/selftests/net/test_vxlan_vnifiltering.sh index 704997ffc244..8c3ac0a72545 100755 --- a/tools/testing/selftests/net/test_vxlan_vnifiltering.sh +++ b/tools/testing/selftests/net/test_vxlan_vnifiltering.sh @@ -293,19 +293,11 @@ setup-vm() { elif [[ -n $vtype && $vtype == "vnifilterg" ]]; then # Add per vni group config with 'bridge vni' api if [ -n "$group" ]; then - if [ "$family" == "v4" ]; then - if [ $mcast -eq 1 ]; then - bridge -netns hv-$hvid vni add dev $vxlandev vni $tid group $group - else - bridge -netns hv-$hvid vni add dev $vxlandev vni $tid remote $group - fi - else - if [ $mcast -eq 1 ]; then - bridge -netns hv-$hvid vni add dev $vxlandev vni $tid group6 $group - else - bridge -netns hv-$hvid vni add dev $vxlandev vni $tid remote6 $group - fi - fi + if [ $mcast -eq 1 ]; then + bridge -netns hv-$hvid vni add dev $vxlandev vni $tid group $group + else + bridge -netns hv-$hvid vni add dev $vxlandev vni $tid remote $group + fi fi fi done diff --git a/tools/testing/selftests/net/udpgso_bench.sh b/tools/testing/selftests/net/udpgso_bench.sh index dc932fd65363..640bc43452fa 100755 --- a/tools/testing/selftests/net/udpgso_bench.sh +++ b/tools/testing/selftests/net/udpgso_bench.sh @@ -7,6 +7,7 @@ readonly GREEN='\033[0;92m' readonly YELLOW='\033[0;33m' readonly RED='\033[0;31m' readonly NC='\033[0m' # No Color +readonly TESTPORT=8000 readonly KSFT_PASS=0 readonly KSFT_FAIL=1 @@ -56,11 +57,26 @@ trap wake_children EXIT run_one() { local -r args=$@ + local nr_socks=0 + local i=0 + local -r timeout=10 + + ./udpgso_bench_rx -p "$TESTPORT" & + ./udpgso_bench_rx -p "$TESTPORT" -t & + + # Wait for the above test program to get ready to receive connections. 
+ while [ "$i" -lt "$timeout" ]; do + nr_socks="$(ss -lnHi | grep -c "\*:${TESTPORT}")" + [ "$nr_socks" -eq 2 ] && break + i=$((i + 1)) + sleep 1 + done + if [ "$nr_socks" -ne 2 ]; then + echo "timed out while waiting for udpgso_bench_rx" + exit 1 + fi - ./udpgso_bench_rx & - ./udpgso_bench_rx -t & - - ./udpgso_bench_tx ${args} + ./udpgso_bench_tx -p "$TESTPORT" ${args} } run_in_netns() { diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c index 6a193425c367..4058c7451e70 100644 --- a/tools/testing/selftests/net/udpgso_bench_rx.c +++ b/tools/testing/selftests/net/udpgso_bench_rx.c @@ -250,7 +250,7 @@ static int recv_msg(int fd, char *buf, int len, int *gso_size) static void do_flush_udp(int fd) { static char rbuf[ETH_MAX_MTU]; - int ret, len, gso_size, budget = 256; + int ret, len, gso_size = 0, budget = 256; len = cfg_read_all ? sizeof(rbuf) : 0; while (budget--) { @@ -336,6 +336,8 @@ static void parse_opts(int argc, char **argv) cfg_verify = true; cfg_read_all = true; break; + default: + exit(1); } } diff --git a/tools/testing/selftests/net/udpgso_bench_tx.c b/tools/testing/selftests/net/udpgso_bench_tx.c index f1fdaa270291..477392715a9a 100644 --- a/tools/testing/selftests/net/udpgso_bench_tx.c +++ b/tools/testing/selftests/net/udpgso_bench_tx.c @@ -62,6 +62,7 @@ static int cfg_payload_len = (1472 * 42); static int cfg_port = 8000; static int cfg_runtime_ms = -1; static bool cfg_poll; +static int cfg_poll_loop_timeout_ms = 2000; static bool cfg_segment; static bool cfg_sendmmsg; static bool cfg_tcp; @@ -235,16 +236,17 @@ static void flush_errqueue_recv(int fd) } } -static void flush_errqueue(int fd, const bool do_poll) +static void flush_errqueue(int fd, const bool do_poll, + unsigned long poll_timeout, const bool poll_err) { if (do_poll) { struct pollfd fds = {0}; int ret; fds.fd = fd; - ret = poll(&fds, 1, 500); + ret = poll(&fds, 1, poll_timeout); if (ret == 0) { - if (cfg_verbose) + if ((cfg_verbose) && 
(poll_err)) fprintf(stderr, "poll timeout\n"); } else if (ret < 0) { error(1, errno, "poll"); @@ -254,6 +256,20 @@ static void flush_errqueue(int fd, const bool do_poll) flush_errqueue_recv(fd); } +static void flush_errqueue_retry(int fd, unsigned long num_sends) +{ + unsigned long tnow, tstop; + bool first_try = true; + + tnow = gettimeofday_ms(); + tstop = tnow + cfg_poll_loop_timeout_ms; + do { + flush_errqueue(fd, true, tstop - tnow, first_try); + first_try = false; + tnow = gettimeofday_ms(); + } while ((stat_zcopies != num_sends) && (tnow < tstop)); +} + static int send_tcp(int fd, char *data) { int ret, done = 0, count = 0; @@ -413,7 +429,8 @@ static int send_udp_segment(int fd, char *data) static void usage(const char *filepath) { - error(1, 0, "Usage: %s [-46acmHPtTuvz] [-C cpu] [-D dst ip] [-l secs] [-M messagenr] [-p port] [-s sendsize] [-S gsosize]", + error(1, 0, "Usage: %s [-46acmHPtTuvz] [-C cpu] [-D dst ip] [-l secs] " + "[-L secs] [-M messagenr] [-p port] [-s sendsize] [-S gsosize]", filepath); } @@ -423,7 +440,7 @@ static void parse_opts(int argc, char **argv) int max_len, hdrlen; int c; - while ((c = getopt(argc, argv, "46acC:D:Hl:mM:p:s:PS:tTuvz")) != -1) { + while ((c = getopt(argc, argv, "46acC:D:Hl:L:mM:p:s:PS:tTuvz")) != -1) { switch (c) { case '4': if (cfg_family != PF_UNSPEC) @@ -452,6 +469,9 @@ static void parse_opts(int argc, char **argv) case 'l': cfg_runtime_ms = strtoul(optarg, NULL, 10) * 1000; break; + case 'L': + cfg_poll_loop_timeout_ms = strtoul(optarg, NULL, 10) * 1000; + break; case 'm': cfg_sendmmsg = true; break; @@ -490,6 +510,8 @@ static void parse_opts(int argc, char **argv) case 'z': cfg_zerocopy = true; break; + default: + exit(1); } } @@ -677,7 +699,7 @@ int main(int argc, char **argv) num_sends += send_udp(fd, buf[i]); num_msgs++; if ((cfg_zerocopy && ((num_msgs & 0xF) == 0)) || cfg_tx_tstamp) - flush_errqueue(fd, cfg_poll); + flush_errqueue(fd, cfg_poll, 500, true); if (cfg_msg_nr && num_msgs >= cfg_msg_nr) break; @@ 
-696,7 +718,7 @@ int main(int argc, char **argv) } while (!interrupted && (cfg_runtime_ms == -1 || tnow < tstop)); if (cfg_zerocopy || cfg_tx_tstamp) - flush_errqueue(fd, true); + flush_errqueue_retry(fd, num_sends); if (close(fd)) error(1, errno, "close"); diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c index a634f47d1e56..9a127a8fe176 100644 --- a/tools/testing/selftests/vm/hugetlb-madvise.c +++ b/tools/testing/selftests/vm/hugetlb-madvise.c @@ -17,7 +17,6 @@ #include <stdio.h> #include <unistd.h> #include <sys/mman.h> -#define __USE_GNU #include <fcntl.h> #define MIN_FREE_PAGES 20 diff --git a/tools/virtio/linux/bug.h b/tools/virtio/linux/bug.h index 813baf13f62a..51a919083d9b 100644 --- a/tools/virtio/linux/bug.h +++ b/tools/virtio/linux/bug.h @@ -1,13 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef BUG_H -#define BUG_H +#ifndef _LINUX_BUG_H +#define _LINUX_BUG_H #include <asm/bug.h> #define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond)) -#define BUILD_BUG_ON(x) - #define BUG() abort() -#endif /* BUG_H */ +#endif /* _LINUX_BUG_H */ diff --git a/tools/virtio/linux/build_bug.h b/tools/virtio/linux/build_bug.h new file mode 100644 index 000000000000..cdbb75e28a60 --- /dev/null +++ b/tools/virtio/linux/build_bug.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_BUILD_BUG_H +#define _LINUX_BUILD_BUG_H + +#define BUILD_BUG_ON(x) + +#endif /* _LINUX_BUILD_BUG_H */ diff --git a/tools/virtio/linux/cpumask.h b/tools/virtio/linux/cpumask.h new file mode 100644 index 000000000000..307da69d6b26 --- /dev/null +++ b/tools/virtio/linux/cpumask.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CPUMASK_H +#define _LINUX_CPUMASK_H + +#include <linux/kernel.h> + +#endif /* _LINUX_CPUMASK_H */ diff --git a/tools/virtio/linux/gfp.h b/tools/virtio/linux/gfp.h new file mode 100644 index 000000000000..43d146f236f1 --- /dev/null +++ b/tools/virtio/linux/gfp.h @@ -0,0 +1,7 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_GFP_H +#define __LINUX_GFP_H + +#include <linux/topology.h> + +#endif diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h index 21593bf97755..8b877167933d 100644 --- a/tools/virtio/linux/kernel.h +++ b/tools/virtio/linux/kernel.h @@ -10,6 +10,7 @@ #include <stdarg.h> #include <linux/compiler.h> +#include <linux/log2.h> #include <linux/types.h> #include <linux/overflow.h> #include <linux/list.h> diff --git a/tools/virtio/linux/kmsan.h b/tools/virtio/linux/kmsan.h new file mode 100644 index 000000000000..272b5aa285d5 --- /dev/null +++ b/tools/virtio/linux/kmsan.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_KMSAN_H +#define _LINUX_KMSAN_H + +#include <linux/gfp.h> + +inline void kmsan_handle_dma(struct page *page, size_t offset, size_t size, + enum dma_data_direction dir) +{ +} + +#endif /* _LINUX_KMSAN_H */ diff --git a/tools/virtio/linux/scatterlist.h b/tools/virtio/linux/scatterlist.h index 369ee308b668..74d9e1825748 100644 --- a/tools/virtio/linux/scatterlist.h +++ b/tools/virtio/linux/scatterlist.h @@ -2,6 +2,7 @@ #ifndef SCATTERLIST_H #define SCATTERLIST_H #include <linux/kernel.h> +#include <linux/bug.h> struct scatterlist { unsigned long page_link; diff --git a/tools/virtio/linux/topology.h b/tools/virtio/linux/topology.h new file mode 100644 index 000000000000..910794afb993 --- /dev/null +++ b/tools/virtio/linux/topology.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TOPOLOGY_H +#define _LINUX_TOPOLOGY_H + +#include <linux/cpumask.h> + +#endif /* _LINUX_TOPOLOGY_H */ |