diff options
399 files changed, 5656 insertions, 3544 deletions
@@ -60,6 +60,10 @@ Arnd Bergmann <arnd@arndb.de> Atish Patra <atishp@atishpatra.org> <atish.patra@wdc.com> Axel Dyks <xl@xlsigned.net> Axel Lin <axel.lin@gmail.com> +Baolin Wang <baolin.wang@linux.alibaba.com> <baolin.wang@linaro.org> +Baolin Wang <baolin.wang@linux.alibaba.com> <baolin.wang@spreadtrum.com> +Baolin Wang <baolin.wang@linux.alibaba.com> <baolin.wang@unisoc.com> +Baolin Wang <baolin.wang@linux.alibaba.com> <baolin.wang7@gmail.com> Bart Van Assche <bvanassche@acm.org> <bart.vanassche@sandisk.com> Bart Van Assche <bvanassche@acm.org> <bart.vanassche@wdc.com> Ben Gardner <bgardner@wabtec.com> @@ -135,6 +139,8 @@ Frank Rowand <frowand.list@gmail.com> <frowand@mvista.com> Frank Zago <fzago@systemfabricworks.com> Gao Xiang <xiang@kernel.org> <gaoxiang25@huawei.com> Gao Xiang <xiang@kernel.org> <hsiangkao@aol.com> +Gao Xiang <xiang@kernel.org> <hsiangkao@linux.alibaba.com> +Gao Xiang <xiang@kernel.org> <hsiangkao@redhat.com> Gerald Schaefer <gerald.schaefer@linux.ibm.com> <geraldsc@de.ibm.com> Gerald Schaefer <gerald.schaefer@linux.ibm.com> <gerald.schaefer@de.ibm.com> Gerald Schaefer <gerald.schaefer@linux.ibm.com> <geraldsc@linux.vnet.ibm.com> @@ -371,6 +377,7 @@ Sean Nyekjaer <sean@geanix.com> <sean.nyekjaer@prevas.dk> Sebastian Reichel <sre@kernel.org> <sebastian.reichel@collabora.co.uk> Sebastian Reichel <sre@kernel.org> <sre@debian.org> Sedat Dilek <sedat.dilek@gmail.com> <sedat.dilek@credativ.de> +Seth Forshee <sforshee@kernel.org> <seth.forshee@canonical.com> Shiraz Hashim <shiraz.linux.kernel@gmail.com> <shiraz.hashim@st.com> Shuah Khan <shuah@kernel.org> <shuahkhan@gmail.com> Shuah Khan <shuah@kernel.org> <shuah.khan@hp.com> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f2d26cb7e853..cc3ea8febc62 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3176,6 +3176,7 @@ no_entry_flush [PPC] no_uaccess_flush [PPC] mmio_stale_data=off [X86] + retbleed=off [X86] Exceptions: This does not have any effect on @@ -3198,6 +3199,7 @@ mds=full,nosmt [X86] tsx_async_abort=full,nosmt [X86] mmio_stale_data=full,nosmt [X86] + retbleed=auto,nosmt [X86] mminit_loglevel= [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this @@ -5796,6 +5798,24 @@ expediting. Set to zero to disable automatic expediting. + srcutree.srcu_max_nodelay [KNL] + Specifies the number of no-delay instances + per jiffy for which the SRCU grace period + worker thread will be rescheduled with zero + delay. Beyond this limit, worker thread will + be rescheduled with a sleep delay of one jiffy. + + srcutree.srcu_max_nodelay_phase [KNL] + Specifies the per-grace-period phase, number of + non-sleeping polls of readers. Beyond this limit, + grace period worker thread will be rescheduled + with a sleep delay of one jiffy, between each + rescan of the readers, for a grace period phase. + + srcutree.srcu_retry_check_delay [KNL] + Specifies number of microseconds of non-sleeping + delay between each non-sleeping poll of readers. + srcutree.small_contention_lim [KNL] Specifies the number of update-side contention events per jiffy will be tolerated before diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst index aec2cd2aaea7..19754beb5a4e 100644 --- a/Documentation/admin-guide/pm/cpuidle.rst +++ b/Documentation/admin-guide/pm/cpuidle.rst @@ -612,8 +612,8 @@ the ``menu`` governor to be used on the systems that use the ``ladder`` governor by default this way, for example. The other kernel command line parameters controlling CPU idle time management -described below are only relevant for the *x86* architecture and some of -them affect Intel processors only. +described below are only relevant for the *x86* architecture and references +to ``intel_idle`` affect Intel processors only. The *x86* architecture support code recognizes three kernel command line options related to CPU idle time management: ``idle=poll``, ``idle=halt``, @@ -635,10 +635,13 @@ idle, so it very well may hurt single-thread computations performance as well as energy-efficiency. Thus using it for performance reasons may not be a good idea at all.] -The ``idle=nomwait`` option disables the ``intel_idle`` driver and causes -``acpi_idle`` to be used (as long as all of the information needed by it is -there in the system's ACPI tables), but it is not allowed to use the -``MWAIT`` instruction of the CPUs to ask the hardware to enter idle states. +The ``idle=nomwait`` option prevents the use of ``MWAIT`` instruction of +the CPU to enter idle states. When this option is used, the ``acpi_idle`` +driver will use the ``HLT`` instruction instead of ``MWAIT``. On systems +running Intel processors, this option disables the ``intel_idle`` driver +and forces the use of the ``acpi_idle`` driver instead. Note that in either +case, ``acpi_idle`` driver will function only if all the information needed +by it is in the system's ACPI tables. In addition to the architecture-level kernel command line options affecting CPU idle time management, there are parameters affecting individual ``CPUIdle`` diff --git a/Documentation/core-api/protection-keys.rst b/Documentation/core-api/protection-keys.rst index ec575e72d0b2..bf28ac0401f3 100644 --- a/Documentation/core-api/protection-keys.rst +++ b/Documentation/core-api/protection-keys.rst @@ -4,31 +4,29 @@ Memory Protection Keys ====================== -Memory Protection Keys for Userspace (PKU aka PKEYs) is a feature -which is found on Intel's Skylake (and later) "Scalable Processor" -Server CPUs. It will be available in future non-server Intel parts -and future AMD processors. - -For anyone wishing to test or use this feature, it is available in -Amazon's EC2 C5 instances and is known to work there using an Ubuntu -17.04 image. - -Memory Protection Keys provides a mechanism for enforcing page-based -protections, but without requiring modification of the page tables -when an application changes protection domains. It works by -dedicating 4 previously ignored bits in each page table entry to a -"protection key", giving 16 possible keys. - -There is also a new user-accessible register (PKRU) with two separate -bits (Access Disable and Write Disable) for each key. Being a CPU -register, PKRU is inherently thread-local, potentially giving each +Memory Protection Keys provide a mechanism for enforcing page-based +protections, but without requiring modification of the page tables when an +application changes protection domains. + +Pkeys Userspace (PKU) is a feature which can be found on: + * Intel server CPUs, Skylake and later + * Intel client CPUs, Tiger Lake (11th Gen Core) and later + * Future AMD CPUs + +Pkeys work by dedicating 4 previously Reserved bits in each page table entry to +a "protection key", giving 16 possible keys. + +Protections for each key are defined with a per-CPU user-accessible register +(PKRU). Each of these is a 32-bit register storing two bits (Access Disable +and Write Disable) for each of 16 keys. + +Being a CPU register, PKRU is inherently thread-local, potentially giving each thread a different set of protections from every other thread. -There are two new instructions (RDPKRU/WRPKRU) for reading and writing -to the new register. The feature is only available in 64-bit mode, -even though there is theoretically space in the PAE PTEs. These -permissions are enforced on data access only and have no effect on -instruction fetches. +There are two instructions (RDPKRU/WRPKRU) for reading and writing to the +register. The feature is only available in 64-bit mode, even though there is +theoretically space in the PAE PTEs. These permissions are enforced on data +access only and have no effect on instruction fetches. Syscalls ======== diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml index 4f15463611f8..170cd201adc2 100644 --- a/Documentation/devicetree/bindings/net/ethernet-controller.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml @@ -167,70 +167,65 @@ properties: - in-band-status fixed-link: - allOf: - - if: - type: array - then: - deprecated: true - items: - - minimum: 0 - maximum: 31 - description: - Emulated PHY ID, choose any but unique to the all - specified fixed-links - - - enum: [0, 1] - description: - Duplex configuration. 0 for half duplex or 1 for - full duplex - - - enum: [10, 100, 1000, 2500, 10000] - description: - Link speed in Mbits/sec. - - - enum: [0, 1] - description: - Pause configuration. 0 for no pause, 1 for pause - - - enum: [0, 1] - description: - Asymmetric pause configuration. 0 for no asymmetric - pause, 1 for asymmetric pause - - - - if: - type: object - then: - properties: - speed: - description: - Link speed. - $ref: /schemas/types.yaml#/definitions/uint32 - enum: [10, 100, 1000, 2500, 10000] - - full-duplex: - $ref: /schemas/types.yaml#/definitions/flag - description: - Indicates that full-duplex is used. When absent, half - duplex is assumed. - - pause: - $ref: /schemas/types.yaml#definitions/flag - description: - Indicates that pause should be enabled. - - asym-pause: - $ref: /schemas/types.yaml#/definitions/flag - description: - Indicates that asym_pause should be enabled. - - link-gpios: - maxItems: 1 - description: - GPIO to determine if the link is up - - required: - - speed + oneOf: + - $ref: /schemas/types.yaml#/definitions/uint32-array + deprecated: true + items: + - minimum: 0 + maximum: 31 + description: + Emulated PHY ID, choose any but unique to the all + specified fixed-links + + - enum: [0, 1] + description: + Duplex configuration. 0 for half duplex or 1 for + full duplex + + - enum: [10, 100, 1000, 2500, 10000] + description: + Link speed in Mbits/sec. + + - enum: [0, 1] + description: + Pause configuration. 0 for no pause, 1 for pause + + - enum: [0, 1] + description: + Asymmetric pause configuration. 0 for no asymmetric + pause, 1 for asymmetric pause + - type: object + additionalProperties: false + properties: + speed: + description: + Link speed. + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [10, 100, 1000, 2500, 10000] + + full-duplex: + $ref: /schemas/types.yaml#/definitions/flag + description: + Indicates that full-duplex is used. When absent, half + duplex is assumed. + + pause: + $ref: /schemas/types.yaml#definitions/flag + description: + Indicates that pause should be enabled. + + asym-pause: + $ref: /schemas/types.yaml#/definitions/flag + description: + Indicates that asym_pause should be enabled. + + link-gpios: + maxItems: 1 + description: + GPIO to determine if the link is up + + required: + - speed additionalProperties: true diff --git a/Documentation/devicetree/bindings/net/fsl,fec.yaml b/Documentation/devicetree/bindings/net/fsl,fec.yaml index daa2f79a294f..1b1853062cd3 100644 --- a/Documentation/devicetree/bindings/net/fsl,fec.yaml +++ b/Documentation/devicetree/bindings/net/fsl,fec.yaml @@ -183,6 +183,7 @@ properties: Should specify the gpio for phy reset. phy-reset-duration: + $ref: /schemas/types.yaml#/definitions/uint32 deprecated: true description: Reset duration in milliseconds. Should present only if property @@ -191,12 +192,14 @@ properties: and 1 millisecond will be used instead. phy-reset-active-high: + type: boolean deprecated: true description: If present then the reset sequence using the GPIO specified in the "phy-reset-gpios" property is reversed (H=reset state, L=operation state). phy-reset-post-delay: + $ref: /schemas/types.yaml#/definitions/uint32 deprecated: true description: Post reset delay in milliseconds. If present then a delay of phy-reset-post-delay diff --git a/Documentation/filesystems/overlayfs.rst b/Documentation/filesystems/overlayfs.rst index 316cfd8b1891..7da6c30ed596 100644 --- a/Documentation/filesystems/overlayfs.rst +++ b/Documentation/filesystems/overlayfs.rst @@ -466,10 +466,6 @@ overlay filesystem and the value of st_ino for filesystem objects may not be persistent and could change even while the overlay filesystem is mounted, as summarized in the `Inode properties`_ table above. -4) "idmapped mounts" -When the upper or lower layers are idmapped mounts overlayfs will be mounted -without support for POSIX Access Control Lists (ACLs). This limitation will -eventually be lifted. Changes to underlying filesystems --------------------------------- diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index ed7fa76e7a40..d742ba6bd211 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -503,26 +503,108 @@ per-port PHY specific details: interface connection, MDIO bus location, etc. Driver development ================== -DSA switch drivers need to implement a dsa_switch_ops structure which will +DSA switch drivers need to implement a ``dsa_switch_ops`` structure which will contain the various members described below. -``register_switch_driver()`` registers this dsa_switch_ops in its internal list -of drivers to probe for. ``unregister_switch_driver()`` does the exact opposite. +Probing, registration and device lifetime +----------------------------------------- -Unless requested differently by setting the priv_size member accordingly, DSA -does not allocate any driver private context space. +DSA switches are regular ``device`` structures on buses (be they platform, SPI, +I2C, MDIO or otherwise). The DSA framework is not involved in their probing +with the device core. + +Switch registration from the perspective of a driver means passing a valid +``struct dsa_switch`` pointer to ``dsa_register_switch()``, usually from the +switch driver's probing function. The following members must be valid in the +provided structure: + +- ``ds->dev``: will be used to parse the switch's OF node or platform data. + +- ``ds->num_ports``: will be used to create the port list for this switch, and + to validate the port indices provided in the OF node. + +- ``ds->ops``: a pointer to the ``dsa_switch_ops`` structure holding the DSA + method implementations. + +- ``ds->priv``: backpointer to a driver-private data structure which can be + retrieved in all further DSA method callbacks. + +In addition, the following flags in the ``dsa_switch`` structure may optionally +be configured to obtain driver-specific behavior from the DSA core. Their +behavior when set is documented through comments in ``include/net/dsa.h``. + +- ``ds->vlan_filtering_is_global`` + +- ``ds->needs_standalone_vlan_filtering`` + +- ``ds->configure_vlan_while_not_filtering`` + +- ``ds->untag_bridge_pvid`` + +- ``ds->assisted_learning_on_cpu_port`` + +- ``ds->mtu_enforcement_ingress`` + +- ``ds->fdb_isolation`` + +Internally, DSA keeps an array of switch trees (group of switches) global to +the kernel, and attaches a ``dsa_switch`` structure to a tree on registration. +The tree ID to which the switch is attached is determined by the first u32 +number of the ``dsa,member`` property of the switch's OF node (0 if missing). +The switch ID within the tree is determined by the second u32 number of the +same OF property (0 if missing). Registering multiple switches with the same +switch ID and tree ID is illegal and will cause an error. Using platform data, +a single switch and a single switch tree is permitted. + +In case of a tree with multiple switches, probing takes place asymmetrically. +The first N-1 callers of ``dsa_register_switch()`` only add their ports to the +port list of the tree (``dst->ports``), each port having a backpointer to its +associated switch (``dp->ds``). Then, these switches exit their +``dsa_register_switch()`` call early, because ``dsa_tree_setup_routing_table()`` +has determined that the tree is not yet complete (not all ports referenced by +DSA links are present in the tree's port list). The tree becomes complete when +the last switch calls ``dsa_register_switch()``, and this triggers the effective +continuation of initialization (including the call to ``ds->ops->setup()``) for +all switches within that tree, all as part of the calling context of the last +switch's probe function. + +The opposite of registration takes place when calling ``dsa_unregister_switch()``, +which removes a switch's ports from the port list of the tree. The entire tree +is torn down when the first switch unregisters. + +It is mandatory for DSA switch drivers to implement the ``shutdown()`` callback +of their respective bus, and call ``dsa_switch_shutdown()`` from it (a minimal +version of the full teardown performed by ``dsa_unregister_switch()``). +The reason is that DSA keeps a reference on the master net device, and if the +driver for the master device decides to unbind on shutdown, DSA's reference +will block that operation from finalizing. + +Either ``dsa_switch_shutdown()`` or ``dsa_unregister_switch()`` must be called, +but not both, and the device driver model permits the bus' ``remove()`` method +to be called even if ``shutdown()`` was already called. Therefore, drivers are +expected to implement a mutual exclusion method between ``remove()`` and +``shutdown()`` by setting their drvdata to NULL after any of these has run, and +checking whether the drvdata is NULL before proceeding to take any action. + +After ``dsa_switch_shutdown()`` or ``dsa_unregister_switch()`` was called, no +further callbacks via the provided ``dsa_switch_ops`` may take place, and the +driver may free the data structures associated with the ``dsa_switch``. Switch configuration -------------------- -- ``tag_protocol``: this is to indicate what kind of tagging protocol is supported, - should be a valid value from the ``dsa_tag_protocol`` enum +- ``get_tag_protocol``: this is to indicate what kind of tagging protocol is + supported, should be a valid value from the ``dsa_tag_protocol`` enum. + The returned information does not have to be static; the driver is passed the + CPU port number, as well as the tagging protocol of a possibly stacked + upstream switch, in case there are hardware limitations in terms of supported + tag formats. -- ``probe``: probe routine which will be invoked by the DSA platform device upon - registration to test for the presence/absence of a switch device. For MDIO - devices, it is recommended to issue a read towards internal registers using - the switch pseudo-PHY and return whether this is a supported device. For other - buses, return a non-NULL string +- ``change_tag_protocol``: when the default tagging protocol has compatibility + problems with the master or other issues, the driver may support changing it + at runtime, either through a device tree property or through sysfs. In that + case, further calls to ``get_tag_protocol`` should report the protocol in + current use. - ``setup``: setup function for the switch, this function is responsible for setting up the ``dsa_switch_ops`` private structure with all it needs: register maps, @@ -535,7 +617,17 @@ Switch configuration fully configured and ready to serve any kind of request. It is recommended to issue a software reset of the switch during this setup function in order to avoid relying on what a previous software agent such as a bootloader/firmware - may have previously configured. + may have previously configured. The method responsible for undoing any + applicable allocations or operations done here is ``teardown``. + +- ``port_setup`` and ``port_teardown``: methods for initialization and + destruction of per-port data structures. It is mandatory for some operations + such as registering and unregistering devlink port regions to be done from + these methods, otherwise they are optional. A port will be torn down only if + it has been previously set up. It is possible for a port to be set up during + probing only to be torn down immediately afterwards, for example in case its + PHY cannot be found. In this case, probing of the DSA switch continues + without that particular port. PHY devices and link management ------------------------------- @@ -635,26 +727,198 @@ Power management ``BR_STATE_DISABLED`` and propagating changes to the hardware if this port is disabled while being a bridge member +Address databases +----------------- + +Switching hardware is expected to have a table for FDB entries, however not all +of them are active at the same time. An address database is the subset (partition) +of FDB entries that is active (can be matched by address learning on RX, or FDB +lookup on TX) depending on the state of the port. An address database may +occasionally be called "FID" (Filtering ID) in this document, although the +underlying implementation may choose whatever is available to the hardware. + +For example, all ports that belong to a VLAN-unaware bridge (which is +*currently* VLAN-unaware) are expected to learn source addresses in the +database associated by the driver with that bridge (and not with other +VLAN-unaware bridges). During forwarding and FDB lookup, a packet received on a +VLAN-unaware bridge port should be able to find a VLAN-unaware FDB entry having +the same MAC DA as the packet, which is present on another port member of the +same bridge. At the same time, the FDB lookup process must be able to not find +an FDB entry having the same MAC DA as the packet, if that entry points towards +a port which is a member of a different VLAN-unaware bridge (and is therefore +associated with a different address database). + +Similarly, each VLAN of each offloaded VLAN-aware bridge should have an +associated address database, which is shared by all ports which are members of +that VLAN, but not shared by ports belonging to different bridges that are +members of the same VID. + +In this context, a VLAN-unaware database means that all packets are expected to +match on it irrespective of VLAN ID (only MAC address lookup), whereas a +VLAN-aware database means that packets are supposed to match based on the VLAN +ID from the classified 802.1Q header (or the pvid if untagged). + +At the bridge layer, VLAN-unaware FDB entries have the special VID value of 0, +whereas VLAN-aware FDB entries have non-zero VID values. Note that a +VLAN-unaware bridge may have VLAN-aware (non-zero VID) FDB entries, and a +VLAN-aware bridge may have VLAN-unaware FDB entries. As in hardware, the +software bridge keeps separate address databases, and offloads to hardware the +FDB entries belonging to these databases, through switchdev, asynchronously +relative to the moment when the databases become active or inactive. + +When a user port operates in standalone mode, its driver should configure it to +use a separate database called a port private database. This is different from +the databases described above, and should impede operation as standalone port +(packet in, packet out to the CPU port) as little as possible. For example, +on ingress, it should not attempt to learn the MAC SA of ingress traffic, since +learning is a bridging layer service and this is a standalone port, therefore +it would consume useless space. With no address learning, the port private +database should be empty in a naive implementation, and in this case, all +received packets should be trivially flooded to the CPU port. + +DSA (cascade) and CPU ports are also called "shared" ports because they service +multiple address databases, and the database that a packet should be associated +to is usually embedded in the DSA tag. This means that the CPU port may +simultaneously transport packets coming from a standalone port (which were +classified by hardware in one address database), and from a bridge port (which +were classified to a different address database). + +Switch drivers which satisfy certain criteria are able to optimize the naive +configuration by removing the CPU port from the flooding domain of the switch, +and just program the hardware with FDB entries pointing towards the CPU port +for which it is known that software is interested in those MAC addresses. +Packets which do not match a known FDB entry will not be delivered to the CPU, +which will save CPU cycles required for creating an skb just to drop it. + +DSA is able to perform host address filtering for the following kinds of +addresses: + +- Primary unicast MAC addresses of ports (``dev->dev_addr``). These are + associated with the port private database of the respective user port, + and the driver is notified to install them through ``port_fdb_add`` towards + the CPU port. + +- Secondary unicast and multicast MAC addresses of ports (addresses added + through ``dev_uc_add()`` and ``dev_mc_add()``). These are also associated + with the port private database of the respective user port. + +- Local/permanent bridge FDB entries (``BR_FDB_LOCAL``). These are the MAC + addresses of the bridge ports, for which packets must be terminated locally + and not forwarded. They are associated with the address database for that + bridge. + +- Static bridge FDB entries installed towards foreign (non-DSA) interfaces + present in the same bridge as some DSA switch ports. These are also + associated with the address database for that bridge. + +- Dynamically learned FDB entries on foreign interfaces present in the same + bridge as some DSA switch ports, only if ``ds->assisted_learning_on_cpu_port`` + is set to true by the driver. These are associated with the address database + for that bridge. + +For various operations detailed below, DSA provides a ``dsa_db`` structure +which can be of the following types: + +- ``DSA_DB_PORT``: the FDB (or MDB) entry to be installed or deleted belongs to + the port private database of user port ``db->dp``. +- ``DSA_DB_BRIDGE``: the entry belongs to one of the address databases of bridge + ``db->bridge``. Separation between the VLAN-unaware database and the per-VID + databases of this bridge is expected to be done by the driver. +- ``DSA_DB_LAG``: the entry belongs to the address database of LAG ``db->lag``. + Note: ``DSA_DB_LAG`` is currently unused and may be removed in the future. + +The drivers which act upon the ``dsa_db`` argument in ``port_fdb_add``, +``port_mdb_add`` etc should declare ``ds->fdb_isolation`` as true. + +DSA associates each offloaded bridge and each offloaded LAG with a one-based ID +(``struct dsa_bridge :: num``, ``struct dsa_lag :: id``) for the purposes of +refcounting addresses on shared ports. Drivers may piggyback on DSA's numbering +scheme (the ID is readable through ``db->bridge.num`` and ``db->lag.id`` or may +implement their own. + +Only the drivers which declare support for FDB isolation are notified of FDB +entries on the CPU port belonging to ``DSA_DB_PORT`` databases. +For compatibility/legacy reasons, ``DSA_DB_BRIDGE`` addresses are notified to +drivers even if they do not support FDB isolation. However, ``db->bridge.num`` +and ``db->lag.id`` are always set to 0 in that case (to denote the lack of +isolation, for refcounting purposes). + +Note that it is not mandatory for a switch driver to implement physically +separate address databases for each standalone user port. Since FDB entries in +the port private databases will always point to the CPU port, there is no risk +for incorrect forwarding decisions. In this case, all standalone ports may +share the same database, but the reference counting of host-filtered addresses +(not deleting the FDB entry for a port's MAC address if it's still in use by +another port) becomes the responsibility of the driver, because DSA is unaware +that the port databases are in fact shared. This can be achieved by calling +``dsa_fdb_present_in_other_db()`` and ``dsa_mdb_present_in_other_db()``. +The down side is that the RX filtering lists of each user port are in fact +shared, which means that user port A may accept a packet with a MAC DA it +shouldn't have, only because that MAC address was in the RX filtering list of +user port B. These packets will still be dropped in software, however. + Bridge layer ------------ +Offloading the bridge forwarding plane is optional and handled by the methods +below. They may be absent, return -EOPNOTSUPP, or ``ds->max_num_bridges`` may +be non-zero and exceeded, and in this case, joining a bridge port is still +possible, but the packet forwarding will take place in software, and the ports +under a software bridge must remain configured in the same way as for +standalone operation, i.e. have all bridging service functions (address +learning etc) disabled, and send all received packets to the CPU port only. + +Concretely, a port starts offloading the forwarding plane of a bridge once it +returns success to the ``port_bridge_join`` method, and stops doing so after +``port_bridge_leave`` has been called. Offloading the bridge means autonomously +learning FDB entries in accordance with the software bridge port's state, and +autonomously forwarding (or flooding) received packets without CPU intervention. +This is optional even when offloading a bridge port. Tagging protocol drivers +are expected to call ``dsa_default_offload_fwd_mark(skb)`` for packets which +have already been autonomously forwarded in the forwarding domain of the +ingress switch port. DSA, through ``dsa_port_devlink_setup()``, considers all +switch ports part of the same tree ID to be part of the same bridge forwarding +domain (capable of autonomous forwarding to each other). + +Offloading the TX forwarding process of a bridge is a distinct concept from +simply offloading its forwarding plane, and refers to the ability of certain +driver and tag protocol combinations to transmit a single skb coming from the +bridge device's transmit function to potentially multiple egress ports (and +thereby avoid its cloning in software). + +Packets for which the bridge requests this behavior are called data plane +packets and have ``skb->offload_fwd_mark`` set to true in the tag protocol +driver's ``xmit`` function. Data plane packets are subject to FDB lookup, +hardware learning on the CPU port, and do not override the port STP state. +Additionally, replication of data plane packets (multicast, flooding) is +handled in hardware and the bridge driver will transmit a single skb for each +packet that may or may not need replication. + +When the TX forwarding offload is enabled, the tag protocol driver is +responsible to inject packets into the data plane of the hardware towards the +correct bridging domain (FID) that the port is a part of. The port may be +VLAN-unaware, and in this case the FID must be equal to the FID used by the +driver for its VLAN-unaware address database associated with that bridge. +Alternatively, the bridge may be VLAN-aware, and in that case, it is guaranteed +that the packet is also VLAN-tagged with the VLAN ID that the bridge processed +this packet in. It is the responsibility of the hardware to untag the VID on +the egress-untagged ports, or keep the tag on the egress-tagged ones. + - ``port_bridge_join``: bridge layer function invoked when a given switch port is added to a bridge, this function should do what's necessary at the switch level to permit the joining port to be added to the relevant logical domain for it to ingress/egress traffic with other members of the bridge. + By setting the ``tx_fwd_offload`` argument to true, the TX forwarding process + of this bridge is also offloaded. - ``port_bridge_leave``: bridge layer function invoked when a given switch port is removed from a bridge, this function should do what's necessary at the switch level to deny the leaving port from ingress/egress traffic from the - remaining bridge members. When the port leaves the bridge, it should be aged - out at the switch hardware for the switch to (re) learn MAC addresses behind - this port. + remaining bridge members. - ``port_stp_state_set``: bridge layer function invoked when a given switch port STP state is computed by the bridge layer and should be propagated to switch - hardware to forward/block/learn traffic. The switch driver is responsible for - computing a STP state change based on current and asked parameters and perform - the relevant ageing based on the intersection results + hardware to forward/block/learn traffic. - ``port_bridge_flags``: bridge layer function invoked when a port must configure its settings for e.g. flooding of unknown traffic or source address @@ -667,21 +931,11 @@ Bridge layer CPU port, and flooding towards the CPU port should also be enabled, due to a lack of an explicit address filtering mechanism in the DSA core. -- ``port_bridge_tx_fwd_offload``: bridge layer function invoked after - ``port_bridge_join`` when a driver sets ``ds->num_fwd_offloading_bridges`` to - a non-zero value. Returning success in this function activates the TX - forwarding offload bridge feature for this port, which enables the tagging - protocol driver to inject data plane packets towards the bridging domain that - the port is a part of. Data plane packets are subject to FDB lookup, hardware - learning on the CPU port, and do not override the port STP state. - Additionally, replication of data plane packets (multicast, flooding) is - handled in hardware and the bridge driver will transmit a single skb for each - packet that needs replication. The method is provided as a configuration - point for drivers that need to configure the hardware for enabling this - feature. - -- ``port_bridge_tx_fwd_unoffload``: bridge layer function invoked when a driver - leaves a bridge port which had the TX forwarding offload feature enabled. +- ``port_fast_age``: bridge layer function invoked when flushing the + dynamically learned FDB entries on the port is necessary. This is called when + transitioning from an STP state where learning should take place to an STP + state where it shouldn't, or when leaving a bridge, or when address learning + is turned off via ``port_bridge_flags``. Bridge VLAN filtering --------------------- @@ -697,55 +951,44 @@ Bridge VLAN filtering allowed. - ``port_vlan_add``: bridge layer function invoked when a VLAN is configured - (tagged or untagged) for the given switch port. If the operation is not - supported by the hardware, this function should return ``-EOPNOTSUPP`` to - inform the bridge code to fallback to a software implementation. + (tagged or untagged) for the given switch port. The CPU port becomes a member + of a VLAN only if a foreign bridge port is also a member of it (and + forwarding needs to take place in software), or the VLAN is installed to the + VLAN group of the bridge device itself, for termination purposes + (``bridge vlan add dev br0 vid 100 self``). VLANs on shared ports are + reference counted and removed when there is no user left. Drivers do not need + to manually install a VLAN on the CPU port. - ``port_vlan_del``: bridge layer function invoked when a VLAN is removed from the given switch port -- ``port_vlan_dump``: bridge layer function invoked with a switchdev callback - function that the driver has to call for each VLAN the given port is a member - of. A switchdev object is used to carry the VID and bridge flags. - - ``port_fdb_add``: bridge layer function invoked when the bridge wants to install a Forwarding Database entry, the switch hardware should be programmed with the specified address in the specified VLAN Id in the forwarding database - associated with this VLAN ID. If the operation is not supported, this - function should return ``-EOPNOTSUPP`` to inform the bridge code to fallback to - a software implementation. - -.. note:: VLAN ID 0 corresponds to the port private database, which, in the context - of DSA, would be its port-based VLAN, used by the associated bridge device. + associated with this VLAN ID. - ``port_fdb_del``: bridge layer function invoked when the bridge wants to remove a Forwarding Database entry, the switch hardware should be programmed to delete the specified MAC address from the specified VLAN ID if it was mapped into this port forwarding database -- ``port_fdb_dump``: bridge layer function invoked with a switchdev callback - function that the driver has to call for each MAC address known to be behind - the given port. A switchdev object is used to carry the VID and FDB info. +- ``port_fdb_dump``: bridge bypass function invoked by ``ndo_fdb_dump`` on the + physical DSA port interfaces. Since DSA does not attempt to keep in sync its + hardware FDB entries with the software bridge, this method is implemented as + a means to view the entries visible on user ports in the hardware database. + The entries reported by this function have the ``self`` flag in the output of + the ``bridge fdb show`` command. - ``port_mdb_add``: bridge layer function invoked when the bridge wants to install - a multicast database entry. If the operation is not supported, this function - should return ``-EOPNOTSUPP`` to inform the bridge code to fallback to a - software implementation. The switch hardware should be programmed with the + a multicast database entry. The switch hardware should be programmed with the specified address in the specified VLAN ID in the forwarding database associated with this VLAN ID. -.. note:: VLAN ID 0 corresponds to the port private database, which, in the context - of DSA, would be its port-based VLAN, used by the associated bridge device. - - ``port_mdb_del``: bridge layer function invoked when the bridge wants to remove a multicast database entry, the switch hardware should be programmed to delete the specified MAC address from the specified VLAN ID if it was mapped into this port forwarding database. -- ``port_mdb_dump``: bridge layer function invoked with a switchdev callback - function that the driver has to call for each MAC address known to be behind - the given port. A switchdev object is used to carry the VID and MDB info. - Link aggregation ---------------- diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index b3a534ed0e7c..d7a1bf1a55b5 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1052,11 +1052,7 @@ udp_rmem_min - INTEGER Default: 4K udp_wmem_min - INTEGER - Minimal size of send buffer used by UDP sockets in moderation. - Each UDP socket is able to use the size for sending data, even if - total pages of UDP sockets exceed udp_mem pressure. The unit is byte. - - Default: 4K + UDP does not have tx memory accounting and this tunable has no effect. RAW variables ============= @@ -2870,7 +2866,14 @@ sctp_rmem - vector of 3 INTEGERs: min, default, max Default: 4K sctp_wmem - vector of 3 INTEGERs: min, default, max - Currently this tunable has no effect. + Only the first value ("min") is used, "default" and "max" are + ignored. + + min: Minimum size of send buffer that can be used by SCTP sockets. + It is guaranteed to each SCTP socket (but not association) even + under moderate memory pressure. + + Default: 4K addr_scope_policy - INTEGER Control IPv4 address scoping - draft-stewart-tsvwg-sctp-ipv4-00 diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 6e090fb96a0e..98a283930307 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5658,7 +5658,7 @@ by a string of size ``name_size``. #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_BOOLEAN (0x4 << KVM_STATS_UNIT_SHIFT) - #define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES + #define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_BOOLEAN #define KVM_STATS_BASE_SHIFT 8 #define KVM_STATS_BASE_MASK (0xF << KVM_STATS_BASE_SHIFT) diff --git a/MAINTAINERS b/MAINTAINERS index 651616ed8ae2..e10b1835e5d7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7485,6 +7485,8 @@ F: include/video/s1d13xxxfb.h EROFS FILE SYSTEM M: Gao Xiang <xiang@kernel.org> M: Chao Yu <chao@kernel.org> +R: Yue Hu <huyue2@coolpad.com> +R: Jeffle Xu <jefflexu@linux.alibaba.com> L: linux-erofs@lists.ozlabs.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git @@ -9618,6 +9620,7 @@ F: drivers/input/misc/ideapad_slidebar.c IDMAPPED MOUNTS M: Christian Brauner <brauner@kernel.org> +M: Seth Forshee <sforshee@kernel.org> L: linux-fsdevel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git @@ -15849,7 +15852,7 @@ PIN CONTROLLER - FREESCALE M: Dong Aisheng <aisheng.dong@nxp.com> M: Fabio Estevam <festevam@gmail.com> M: Shawn Guo <shawnguo@kernel.org> -M: Stefan Agner <stefan@agner.ch> +M: Jacky Bai <ping.bai@nxp.com> R: Pengutronix Kernel Team <kernel@pengutronix.de> L: linux-gpio@vger.kernel.org S: Maintained @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 19 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Superb Owl # *DOCUMENTATION* diff --git a/arch/Kconfig b/arch/Kconfig index fcf9a41a4ef5..71b9272acb28 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -438,6 +438,13 @@ config MMU_GATHER_PAGE_SIZE config MMU_GATHER_NO_RANGE bool + select MMU_GATHER_MERGE_VMAS + +config MMU_GATHER_NO_FLUSH_CACHE + bool + +config MMU_GATHER_MERGE_VMAS + bool config MMU_GATHER_NO_GATHER bool diff --git a/arch/arm/boot/dts/lan966x.dtsi b/arch/arm/boot/dts/lan966x.dtsi index 3cb02fffe716..38e90a31d2dd 100644 --- a/arch/arm/boot/dts/lan966x.dtsi +++ b/arch/arm/boot/dts/lan966x.dtsi @@ -38,7 +38,7 @@ sys_clk: sys_clk { compatible = "fixed-clock"; #clock-cells = <0>; - clock-frequency = <162500000>; + clock-frequency = <165625000>; }; cpu_clk: cpu_clk { diff --git a/arch/arm/include/asm/dma.h b/arch/arm/include/asm/dma.h index a81dda65c576..45180a2cc47c 100644 --- a/arch/arm/include/asm/dma.h +++ b/arch/arm/include/asm/dma.h @@ -10,7 +10,7 @@ #else #define MAX_DMA_ADDRESS ({ \ extern phys_addr_t arm_dma_zone_size; \ - arm_dma_zone_size && arm_dma_zone_size < (0x10000000 - PAGE_OFFSET) ? \ + arm_dma_zone_size && arm_dma_zone_size < (0x100000000ULL - PAGE_OFFSET) ? \ (PAGE_OFFSET + arm_dma_zone_size) : 0xffffffffUL; }) #endif diff --git a/arch/arm/lib/findbit.S b/arch/arm/lib/findbit.S index b5e8b9ae4c7d..7fd3600db8ef 100644 --- a/arch/arm/lib/findbit.S +++ b/arch/arm/lib/findbit.S @@ -40,8 +40,8 @@ ENDPROC(_find_first_zero_bit_le) * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset) */ ENTRY(_find_next_zero_bit_le) - teq r1, #0 - beq 3b + cmp r2, r1 + bhs 3b ands ip, r2, #7 beq 1b @ If new byte, goto old routine ARM( ldrb r3, [r0, r2, lsr #3] ) @@ -81,8 +81,8 @@ ENDPROC(_find_first_bit_le) * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset) */ ENTRY(_find_next_bit_le) - teq r1, #0 - beq 3b + cmp r2, r1 + bhs 3b ands ip, r2, #7 beq 1b @ If new byte, goto old routine ARM( ldrb r3, [r0, r2, lsr #3] ) @@ -115,8 +115,8 @@ ENTRY(_find_first_zero_bit_be) ENDPROC(_find_first_zero_bit_be) ENTRY(_find_next_zero_bit_be) - teq r1, #0 - beq 3b + cmp r2, r1 + bhs 3b ands ip, r2, #7 beq 1b @ If new byte, goto old routine eor r3, r2, #0x18 @ big endian byte ordering @@ -149,8 +149,8 @@ ENTRY(_find_first_bit_be) ENDPROC(_find_first_bit_be) ENTRY(_find_next_bit_be) - teq r1, #0 - beq 3b + cmp r2, r1 + bhs 3b ands ip, r2, #7 beq 1b @ If new byte, goto old routine eor r3, r2, #0x18 @ big endian byte ordering diff --git a/arch/arm/mach-pxa/corgi.c b/arch/arm/mach-pxa/corgi.c index c546356d0f02..5738496717e2 100644 --- a/arch/arm/mach-pxa/corgi.c +++ b/arch/arm/mach-pxa/corgi.c @@ -549,7 +549,7 @@ static struct pxa2xx_spi_controller corgi_spi_info = { }; static struct gpiod_lookup_table corgi_spi_gpio_table = { - .dev_id = "pxa2xx-spi.1", + .dev_id = "spi1", .table = { GPIO_LOOKUP_IDX("gpio-pxa", CORGI_GPIO_ADS7846_CS, "cs", 0, GPIO_ACTIVE_LOW), GPIO_LOOKUP_IDX("gpio-pxa", CORGI_GPIO_LCDCON_CS, "cs", 1, GPIO_ACTIVE_LOW), diff --git a/arch/arm/mach-pxa/hx4700.c b/arch/arm/mach-pxa/hx4700.c index 2ae06edf413c..2fd665944103 100644 --- a/arch/arm/mach-pxa/hx4700.c +++ b/arch/arm/mach-pxa/hx4700.c @@ -635,7 +635,7 @@ static struct pxa2xx_spi_controller pxa_ssp2_master_info = { }; static struct gpiod_lookup_table pxa_ssp2_gpio_table = { - .dev_id = "pxa2xx-spi.2", + .dev_id = "spi2", .table = { GPIO_LOOKUP_IDX("gpio-pxa", GPIO88_HX4700_TSC2046_CS, "cs", 0, GPIO_ACTIVE_LOW), { }, diff --git a/arch/arm/mach-pxa/icontrol.c b/arch/arm/mach-pxa/icontrol.c index 753fe166ab68..624088257cfc 100644 --- a/arch/arm/mach-pxa/icontrol.c +++ b/arch/arm/mach-pxa/icontrol.c @@ -140,7 +140,7 @@ struct platform_device pxa_spi_ssp4 = { }; static struct gpiod_lookup_table pxa_ssp3_gpio_table = { - .dev_id = "pxa2xx-spi.3", + .dev_id = "spi3", .table = { GPIO_LOOKUP_IDX("gpio-pxa", ICONTROL_MCP251x_nCS1, "cs", 0, GPIO_ACTIVE_LOW), GPIO_LOOKUP_IDX("gpio-pxa", ICONTROL_MCP251x_nCS2, "cs", 1, GPIO_ACTIVE_LOW), @@ -149,7 +149,7 @@ static struct gpiod_lookup_table pxa_ssp3_gpio_table = { }; static struct gpiod_lookup_table pxa_ssp4_gpio_table = { - .dev_id = "pxa2xx-spi.4", + .dev_id = "spi4", .table = { GPIO_LOOKUP_IDX("gpio-pxa", ICONTROL_MCP251x_nCS3, "cs", 0, GPIO_ACTIVE_LOW), GPIO_LOOKUP_IDX("gpio-pxa", ICONTROL_MCP251x_nCS4, "cs", 1, GPIO_ACTIVE_LOW), diff --git a/arch/arm/mach-pxa/littleton.c b/arch/arm/mach-pxa/littleton.c index f98dc61e87af..98423a96f440 100644 --- a/arch/arm/mach-pxa/littleton.c +++ b/arch/arm/mach-pxa/littleton.c @@ -207,7 +207,7 @@ static struct spi_board_info littleton_spi_devices[] __initdata = { }; static struct gpiod_lookup_table littleton_spi_gpio_table = { - .dev_id = "pxa2xx-spi.2", + .dev_id = "spi2", .table = { GPIO_LOOKUP_IDX("gpio-pxa", LITTLETON_GPIO_LCD_CS, "cs", 0, GPIO_ACTIVE_LOW), { }, diff --git a/arch/arm/mach-pxa/magician.c b/arch/arm/mach-pxa/magician.c index 20456a55c4c5..0827ebca1d38 100644 --- a/arch/arm/mach-pxa/magician.c +++ b/arch/arm/mach-pxa/magician.c @@ -994,7 +994,7 @@ static struct pxa2xx_spi_controller magician_spi_info = { }; static struct gpiod_lookup_table magician_spi_gpio_table = { - .dev_id = "pxa2xx-spi.2", + .dev_id = "spi2", .table = { /* NOTICE must be GPIO, incompatibility with hw PXA SPI framing */ GPIO_LOOKUP_IDX("gpio-pxa", GPIO14_MAGICIAN_TSC2046_CS, "cs", 0, GPIO_ACTIVE_LOW), diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c index dd88953adc9d..9964729cd428 100644 --- a/arch/arm/mach-pxa/spitz.c +++ b/arch/arm/mach-pxa/spitz.c @@ -578,7 +578,7 @@ static struct pxa2xx_spi_controller spitz_spi_info = { }; static struct gpiod_lookup_table spitz_spi_gpio_table = { - .dev_id = "pxa2xx-spi.2", + .dev_id = "spi2", .table = { GPIO_LOOKUP_IDX("gpio-pxa", SPITZ_GPIO_ADS7846_CS, "cs", 0, GPIO_ACTIVE_LOW), GPIO_LOOKUP_IDX("gpio-pxa", SPITZ_GPIO_LCDCON_CS, "cs", 1, GPIO_ACTIVE_LOW), diff --git a/arch/arm/mach-pxa/z2.c b/arch/arm/mach-pxa/z2.c index d03520555497..c4d4162a7e6e 100644 --- a/arch/arm/mach-pxa/z2.c +++ b/arch/arm/mach-pxa/z2.c @@ -623,7 +623,7 @@ static struct pxa2xx_spi_controller pxa_ssp2_master_info = { }; static struct gpiod_lookup_table pxa_ssp1_gpio_table = { - .dev_id = "pxa2xx-spi.1", + .dev_id = "spi1", .table = { GPIO_LOOKUP_IDX("gpio-pxa", GPIO24_ZIPITZ2_WIFI_CS, "cs", 0, GPIO_ACTIVE_LOW), { }, @@ -631,7 +631,7 @@ static struct gpiod_lookup_table pxa_ssp1_gpio_table = { }; static struct gpiod_lookup_table pxa_ssp2_gpio_table = { - .dev_id = "pxa2xx-spi.2", + .dev_id = "spi2", .table = { GPIO_LOOKUP_IDX("gpio-pxa", GPIO88_ZIPITZ2_LCD_CS, "cs", 0, GPIO_ACTIVE_LOW), { }, diff --git a/arch/csky/include/asm/tlb.h b/arch/csky/include/asm/tlb.h index 3498e65f59f8..702861c68874 100644 --- a/arch/csky/include/asm/tlb.h +++ b/arch/csky/include/asm/tlb.h @@ -4,21 +4,6 @@ #define __ASM_CSKY_TLB_H #include <asm/cacheflush.h> - -#define tlb_start_vma(tlb, vma) \ - do { \ - if (!(tlb)->fullmm) \ - flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end); \ - } while (0) - -#define tlb_end_vma(tlb, vma) \ - do { \ - if (!(tlb)->fullmm) \ - flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end); \ - } while (0) - -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <asm-generic/tlb.h> #endif /* __ASM_CSKY_TLB_H */ diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 53a912befb62..62b5b07fa4e1 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -69,7 +69,6 @@ config LOONGARCH select GENERIC_TIME_VSYSCALL select GPIOLIB select HAVE_ARCH_AUDITSYSCALL - select HAVE_ARCH_COMPILER_H select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK @@ -108,6 +107,7 @@ config LOONGARCH select TRACE_IRQFLAGS_SUPPORT select USE_PERCPU_NUMA_NODE_ID select ZONE_DMA32 + select MMU_GATHER_MERGE_VMAS if MMU config 32BIT bool diff --git a/arch/loongarch/include/asm/asmmacro.h b/arch/loongarch/include/asm/asmmacro.h index a1a04083bd67..be037a40580d 100644 --- a/arch/loongarch/include/asm/asmmacro.h +++ b/arch/loongarch/include/asm/asmmacro.h @@ -274,16 +274,4 @@ nor \dst, \src, zero .endm -.macro bgt r0 r1 label - blt \r1, \r0, \label -.endm - -.macro bltz r0 label - blt \r0, zero, \label -.endm - -.macro bgez r0 label - bge \r0, zero, \label -.endm - #endif /* _ASM_ASMMACRO_H */ diff --git a/arch/loongarch/include/asm/atomic.h b/arch/loongarch/include/asm/atomic.h index 979367ad4e2c..6b9aca9ab6e9 100644 --- a/arch/loongarch/include/asm/atomic.h +++ b/arch/loongarch/include/asm/atomic.h @@ -10,7 +10,6 @@ #include <linux/types.h> #include <asm/barrier.h> #include <asm/cmpxchg.h> -#include <asm/compiler.h> #if __SIZEOF_LONG__ == 4 #define __LL "ll.w " @@ -157,27 +156,25 @@ static inline int arch_atomic_sub_if_positive(int i, atomic_t *v) __asm__ __volatile__( "1: ll.w %1, %2 # atomic_sub_if_positive\n" " addi.w %0, %1, %3 \n" - " or %1, %0, $zero \n" - " blt %0, $zero, 2f \n" + " move %1, %0 \n" + " bltz %0, 2f \n" " sc.w %1, %2 \n" - " beq $zero, %1, 1b \n" + " beqz %1, 1b \n" "2: \n" __WEAK_LLSC_MB - : "=&r" (result), "=&r" (temp), - "+" GCC_OFF_SMALL_ASM() (v->counter) + : "=&r" (result), "=&r" (temp), "+ZC" (v->counter) : "I" (-i)); } else { __asm__ __volatile__( "1: ll.w %1, %2 # atomic_sub_if_positive\n" " sub.w %0, %1, %3 \n" - " or %1, %0, $zero \n" - " blt %0, $zero, 2f \n" + " move %1, %0 \n" + " bltz %0, 2f \n" " sc.w %1, %2 \n" - " beq $zero, %1, 1b \n" + " beqz %1, 1b \n" "2: \n" __WEAK_LLSC_MB - : "=&r" (result), "=&r" (temp), - "+" GCC_OFF_SMALL_ASM() (v->counter) + : "=&r" (result), "=&r" (temp), "+ZC" (v->counter) : "r" (i)); } @@ -320,27 +317,25 @@ static inline long arch_atomic64_sub_if_positive(long i, atomic64_t *v) __asm__ __volatile__( "1: ll.d %1, %2 # atomic64_sub_if_positive \n" " addi.d %0, %1, %3 \n" - " or %1, %0, $zero \n" - " blt %0, $zero, 2f \n" + " move %1, %0 \n" + " bltz %0, 2f \n" " sc.d %1, %2 \n" - " beq %1, $zero, 1b \n" + " beqz %1, 1b \n" "2: \n" __WEAK_LLSC_MB - : "=&r" (result), "=&r" (temp), - "+" GCC_OFF_SMALL_ASM() (v->counter) + : "=&r" (result), "=&r" (temp), "+ZC" (v->counter) : "I" (-i)); } else { __asm__ __volatile__( "1: ll.d %1, %2 # atomic64_sub_if_positive \n" " sub.d %0, %1, %3 \n" - " or %1, %0, $zero \n" - " blt %0, $zero, 2f \n" + " move %1, %0 \n" + " bltz %0, 2f \n" " sc.d %1, %2 \n" - " beq %1, $zero, 1b \n" + " beqz %1, 1b \n" "2: \n" __WEAK_LLSC_MB - : "=&r" (result), "=&r" (temp), - "+" GCC_OFF_SMALL_ASM() (v->counter) + : "=&r" (result), "=&r" (temp), "+ZC" (v->counter) : "r" (i)); } diff --git a/arch/loongarch/include/asm/barrier.h b/arch/loongarch/include/asm/barrier.h index b6517eeeb141..cda977675854 100644 --- a/arch/loongarch/include/asm/barrier.h +++ b/arch/loongarch/include/asm/barrier.h @@ -48,9 +48,9 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, __asm__ __volatile__( "sltu %0, %1, %2\n\t" #if (__SIZEOF_LONG__ == 4) - "sub.w %0, $r0, %0\n\t" + "sub.w %0, $zero, %0\n\t" #elif (__SIZEOF_LONG__ == 8) - "sub.d %0, $r0, %0\n\t" + "sub.d %0, $zero, %0\n\t" #endif : "=r" (mask) : "r" (index), "r" (size) diff --git a/arch/loongarch/include/asm/cmpxchg.h b/arch/loongarch/include/asm/cmpxchg.h index 75b3a4478652..0a9b0fac1eee 100644 --- a/arch/loongarch/include/asm/cmpxchg.h +++ b/arch/loongarch/include/asm/cmpxchg.h @@ -55,9 +55,9 @@ static inline unsigned long __xchg(volatile void *ptr, unsigned long x, __asm__ __volatile__( \ "1: " ld " %0, %2 # __cmpxchg_asm \n" \ " bne %0, %z3, 2f \n" \ - " or $t0, %z4, $zero \n" \ + " move $t0, %z4 \n" \ " " st " $t0, %1 \n" \ - " beq $zero, $t0, 1b \n" \ + " beqz $t0, 1b \n" \ "2: \n" \ __WEAK_LLSC_MB \ : "=&r" (__ret), "=ZB"(*m) \ diff --git a/arch/loongarch/include/asm/compiler.h b/arch/loongarch/include/asm/compiler.h deleted file mode 100644 index 657cebe70ace..000000000000 --- a/arch/loongarch/include/asm/compiler.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2020-2022 Loongson Technology Corporation Limited - */ -#ifndef _ASM_COMPILER_H -#define _ASM_COMPILER_H - -#define GCC_OFF_SMALL_ASM() "ZC" - -#define LOONGARCH_ISA_LEVEL "loongarch" -#define LOONGARCH_ISA_ARCH_LEVEL "arch=loongarch" -#define LOONGARCH_ISA_LEVEL_RAW loongarch -#define LOONGARCH_ISA_ARCH_LEVEL_RAW LOONGARCH_ISA_LEVEL_RAW - -#endif /* _ASM_COMPILER_H */ diff --git a/arch/loongarch/include/asm/elf.h b/arch/loongarch/include/asm/elf.h index f3960b18a90e..5f3ff4781fda 100644 --- a/arch/loongarch/include/asm/elf.h +++ b/arch/loongarch/include/asm/elf.h @@ -288,8 +288,6 @@ struct arch_elf_state { .interp_fp_abi = LOONGARCH_ABI_FP_ANY, \ } -#define elf_read_implies_exec(ex, exec_stk) (exec_stk == EXSTACK_DEFAULT) - extern int arch_elf_pt_proc(void *ehdr, void *phdr, struct file *elf, bool is_interp, struct arch_elf_state *state); diff --git a/arch/loongarch/include/asm/futex.h b/arch/loongarch/include/asm/futex.h index 9de8231694ec..feb6658c84ff 100644 --- a/arch/loongarch/include/asm/futex.h +++ b/arch/loongarch/include/asm/futex.h @@ -8,7 +8,6 @@ #include <linux/futex.h> #include <linux/uaccess.h> #include <asm/barrier.h> -#include <asm/compiler.h> #include <asm/errno.h> #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \ @@ -17,7 +16,7 @@ "1: ll.w %1, %4 # __futex_atomic_op\n" \ " " insn " \n" \ "2: sc.w $t0, %2 \n" \ - " beq $t0, $zero, 1b \n" \ + " beqz $t0, 1b \n" \ "3: \n" \ " .section .fixup,\"ax\" \n" \ "4: li.w %0, %6 \n" \ @@ -82,9 +81,9 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newv "# futex_atomic_cmpxchg_inatomic \n" "1: ll.w %1, %3 \n" " bne %1, %z4, 3f \n" - " or $t0, %z5, $zero \n" + " move $t0, %z5 \n" "2: sc.w $t0, %2 \n" - " beq $zero, $t0, 1b \n" + " beqz $t0, 1b \n" "3: \n" __WEAK_LLSC_MB " .section .fixup,\"ax\" \n" @@ -95,8 +94,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newv " "__UA_ADDR "\t1b, 4b \n" " "__UA_ADDR "\t2b, 4b \n" " .previous \n" - : "+r" (ret), "=&r" (val), "=" GCC_OFF_SMALL_ASM() (*uaddr) - : GCC_OFF_SMALL_ASM() (*uaddr), "Jr" (oldval), "Jr" (newval), + : "+r" (ret), "=&r" (val), "=ZC" (*uaddr) + : "ZC" (*uaddr), "Jr" (oldval), "Jr" (newval), "i" (-EFAULT) : "memory", "t0"); diff --git a/arch/loongarch/include/asm/irqflags.h b/arch/loongarch/include/asm/irqflags.h index 52121cd791fe..319a8c616f1f 100644 --- a/arch/loongarch/include/asm/irqflags.h +++ b/arch/loongarch/include/asm/irqflags.h @@ -9,7 +9,6 @@ #include <linux/compiler.h> #include <linux/stringify.h> -#include <asm/compiler.h> #include <asm/loongarch.h> static inline void arch_local_irq_enable(void) diff --git a/arch/loongarch/include/asm/local.h b/arch/loongarch/include/asm/local.h index 2052a2267337..65fbbae9fc4d 100644 --- a/arch/loongarch/include/asm/local.h +++ b/arch/loongarch/include/asm/local.h @@ -9,7 +9,6 @@ #include <linux/bitops.h> #include <linux/atomic.h> #include <asm/cmpxchg.h> -#include <asm/compiler.h> typedef struct { atomic_long_t a; diff --git a/arch/loongarch/include/asm/loongson.h b/arch/loongarch/include/asm/loongson.h index 6a8038725ba7..6e8f6972ceb6 100644 --- a/arch/loongarch/include/asm/loongson.h +++ b/arch/loongarch/include/asm/loongson.h @@ -39,18 +39,6 @@ extern const struct plat_smp_ops loongson3_smp_ops; #define MAX_PACKAGES 16 -/* Chip Config register of each physical cpu package */ -extern u64 loongson_chipcfg[MAX_PACKAGES]; -#define LOONGSON_CHIPCFG(id) (*(volatile u32 *)(loongson_chipcfg[id])) - -/* Chip Temperature register of each physical cpu package */ -extern u64 loongson_chiptemp[MAX_PACKAGES]; -#define LOONGSON_CHIPTEMP(id) (*(volatile u32 *)(loongson_chiptemp[id])) - -/* Freq Control register of each physical cpu package */ -extern u64 loongson_freqctrl[MAX_PACKAGES]; -#define LOONGSON_FREQCTRL(id) (*(volatile u32 *)(loongson_freqctrl[id])) - #define xconf_readl(addr) readl(addr) #define xconf_readq(addr) readq(addr) @@ -58,7 +46,7 @@ static inline void xconf_writel(u32 val, volatile void __iomem *addr) { asm volatile ( " st.w %[v], %[hw], 0 \n" - " ld.b $r0, %[hw], 0 \n" + " ld.b $zero, %[hw], 0 \n" : : [hw] "r" (addr), [v] "r" (val) ); @@ -68,7 +56,7 @@ static inline void xconf_writeq(u64 val64, volatile void __iomem *addr) { asm volatile ( " st.d %[v], %[hw], 0 \n" - " ld.b $r0, %[hw], 0 \n" + " ld.b $zero, %[hw], 0 \n" : : [hw] "r" (addr), [v] "r" (val64) ); diff --git a/arch/loongarch/include/asm/stacktrace.h b/arch/loongarch/include/asm/stacktrace.h index 26483e396ad1..6b5c2a7aa706 100644 --- a/arch/loongarch/include/asm/stacktrace.h +++ b/arch/loongarch/include/asm/stacktrace.h @@ -23,13 +23,13 @@ static __always_inline void prepare_frametrace(struct pt_regs *regs) { __asm__ __volatile__( - /* Save $r1 */ + /* Save $ra */ STORE_ONE_REG(1) - /* Use $r1 to save PC */ - "pcaddi $r1, 0\n\t" - STR_LONG_S " $r1, %0\n\t" - /* Restore $r1 */ - STR_LONG_L " $r1, %1, "STR_LONGSIZE"\n\t" + /* Use $ra to save PC */ + "pcaddi $ra, 0\n\t" + STR_LONG_S " $ra, %0\n\t" + /* Restore $ra */ + STR_LONG_L " $ra, %1, "STR_LONGSIZE"\n\t" STORE_ONE_REG(2) STORE_ONE_REG(3) STORE_ONE_REG(4) diff --git a/arch/loongarch/include/asm/thread_info.h b/arch/loongarch/include/asm/thread_info.h index 99beb11c2fa8..b7dd9f19a5a9 100644 --- a/arch/loongarch/include/asm/thread_info.h +++ b/arch/loongarch/include/asm/thread_info.h @@ -44,14 +44,14 @@ struct thread_info { } /* How to get the thread information struct from C. */ -register struct thread_info *__current_thread_info __asm__("$r2"); +register struct thread_info *__current_thread_info __asm__("$tp"); static inline struct thread_info *current_thread_info(void) { return __current_thread_info; } -register unsigned long current_stack_pointer __asm__("$r3"); +register unsigned long current_stack_pointer __asm__("$sp"); #endif /* !__ASSEMBLY__ */ diff --git a/arch/loongarch/include/asm/tlb.h b/arch/loongarch/include/asm/tlb.h index 4f629ae9d5a9..dd24f5898f65 100644 --- a/arch/loongarch/include/asm/tlb.h +++ b/arch/loongarch/include/asm/tlb.h @@ -137,16 +137,6 @@ static inline void invtlb_all(u32 op, u32 info, u64 addr) ); } -/* - * LoongArch doesn't need any special per-pte or per-vma handling, except - * we need to flush cache for area to be unmapped. - */ -#define tlb_start_vma(tlb, vma) \ - do { \ - if (!(tlb)->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ - } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) static void tlb_flush(struct mmu_gather *tlb); diff --git a/arch/loongarch/include/asm/uaccess.h b/arch/loongarch/include/asm/uaccess.h index 217c6a3727b1..2b44edc604a2 100644 --- a/arch/loongarch/include/asm/uaccess.h +++ b/arch/loongarch/include/asm/uaccess.h @@ -162,7 +162,7 @@ do { \ "2: \n" \ " .section .fixup,\"ax\" \n" \ "3: li.w %0, %3 \n" \ - " or %1, $r0, $r0 \n" \ + " move %1, $zero \n" \ " b 2b \n" \ " .previous \n" \ " .section __ex_table,\"a\" \n" \ diff --git a/arch/loongarch/kernel/cacheinfo.c b/arch/loongarch/kernel/cacheinfo.c index b38f5489d094..4662b06269f4 100644 --- a/arch/loongarch/kernel/cacheinfo.c +++ b/arch/loongarch/kernel/cacheinfo.c @@ -4,8 +4,9 @@ * * Copyright (C) 2020-2022 Loongson Technology Corporation Limited */ -#include <asm/cpu-info.h> #include <linux/cacheinfo.h> +#include <asm/bootinfo.h> +#include <asm/cpu-info.h> /* Populates leaf and increments to next leaf */ #define populate_cache(cache, leaf, c_level, c_type) \ @@ -17,6 +18,8 @@ do { \ leaf->ways_of_associativity = c->cache.ways; \ leaf->size = c->cache.linesz * c->cache.sets * \ c->cache.ways; \ + if (leaf->level > 2) \ + leaf->size *= nodes_per_package; \ leaf++; \ } while (0) @@ -95,11 +98,15 @@ static void cache_cpumap_setup(unsigned int cpu) int populate_cache_leaves(unsigned int cpu) { - int level = 1; + int level = 1, nodes_per_package = 1; struct cpuinfo_loongarch *c = ¤t_cpu_data; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct cacheinfo *this_leaf = this_cpu_ci->info_list; + if (loongson_sysconf.nr_nodes > 1) + nodes_per_package = loongson_sysconf.cores_per_package + / loongson_sysconf.cores_per_node; + if (c->icache.waysize) { populate_cache(dcache, this_leaf, level, CACHE_TYPE_DATA); populate_cache(icache, this_leaf, level++, CACHE_TYPE_INST); diff --git a/arch/loongarch/kernel/entry.S b/arch/loongarch/kernel/entry.S index d5b3dbcf5425..d53b631c9022 100644 --- a/arch/loongarch/kernel/entry.S +++ b/arch/loongarch/kernel/entry.S @@ -27,7 +27,7 @@ SYM_FUNC_START(handle_syscall) addi.d sp, sp, -PT_SIZE cfi_st t2, PT_R3 - cfi_rel_offset sp, PT_R3 + cfi_rel_offset sp, PT_R3 st.d zero, sp, PT_R0 csrrd t2, LOONGARCH_CSR_PRMD st.d t2, sp, PT_PRMD @@ -50,7 +50,7 @@ SYM_FUNC_START(handle_syscall) cfi_st a7, PT_R11 csrrd ra, LOONGARCH_CSR_ERA st.d ra, sp, PT_ERA - cfi_rel_offset ra, PT_ERA + cfi_rel_offset ra, PT_ERA cfi_st tp, PT_R2 cfi_st u0, PT_R21 diff --git a/arch/loongarch/kernel/env.c b/arch/loongarch/kernel/env.c index 467946ecf451..82b478a5c665 100644 --- a/arch/loongarch/kernel/env.c +++ b/arch/loongarch/kernel/env.c @@ -17,21 +17,6 @@ u64 efi_system_table; struct loongson_system_configuration loongson_sysconf; EXPORT_SYMBOL(loongson_sysconf); -u64 loongson_chipcfg[MAX_PACKAGES]; -u64 loongson_chiptemp[MAX_PACKAGES]; -u64 loongson_freqctrl[MAX_PACKAGES]; -unsigned long long smp_group[MAX_PACKAGES]; - -static void __init register_addrs_set(u64 *registers, const u64 addr, int num) -{ - u64 i; - - for (i = 0; i < num; i++) { - *registers = (i << 44) | addr; - registers++; - } -} - void __init init_environ(void) { int efi_boot = fw_arg0; @@ -50,11 +35,6 @@ void __init init_environ(void) efi_memmap_init_early(&data); memblock_reserve(data.phys_map & PAGE_MASK, PAGE_ALIGN(data.size + (data.phys_map & ~PAGE_MASK))); - - register_addrs_set(smp_group, TO_UNCACHE(0x1fe01000), 16); - register_addrs_set(loongson_chipcfg, TO_UNCACHE(0x1fe00180), 16); - register_addrs_set(loongson_chiptemp, TO_UNCACHE(0x1fe0019c), 16); - register_addrs_set(loongson_freqctrl, TO_UNCACHE(0x1fe001d0), 16); } static int __init init_cpu_fullname(void) diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S index a631a7137667..576b3370a296 100644 --- a/arch/loongarch/kernel/fpu.S +++ b/arch/loongarch/kernel/fpu.S @@ -27,78 +27,78 @@ .endm .macro sc_save_fp base - EX fst.d $f0, \base, (0 * FPU_REG_WIDTH) - EX fst.d $f1, \base, (1 * FPU_REG_WIDTH) - EX fst.d $f2, \base, (2 * FPU_REG_WIDTH) - EX fst.d $f3, \base, (3 * FPU_REG_WIDTH) - EX fst.d $f4, \base, (4 * FPU_REG_WIDTH) - EX fst.d $f5, \base, (5 * FPU_REG_WIDTH) - EX fst.d $f6, \base, (6 * FPU_REG_WIDTH) - EX fst.d $f7, \base, (7 * FPU_REG_WIDTH) - EX fst.d $f8, \base, (8 * FPU_REG_WIDTH) - EX fst.d $f9, \base, (9 * FPU_REG_WIDTH) - EX fst.d $f10, \base, (10 * FPU_REG_WIDTH) - EX fst.d $f11, \base, (11 * FPU_REG_WIDTH) - EX fst.d $f12, \base, (12 * FPU_REG_WIDTH) - EX fst.d $f13, \base, (13 * FPU_REG_WIDTH) - EX fst.d $f14, \base, (14 * FPU_REG_WIDTH) - EX fst.d $f15, \base, (15 * FPU_REG_WIDTH) - EX fst.d $f16, \base, (16 * FPU_REG_WIDTH) - EX fst.d $f17, \base, (17 * FPU_REG_WIDTH) - EX fst.d $f18, \base, (18 * FPU_REG_WIDTH) - EX fst.d $f19, \base, (19 * FPU_REG_WIDTH) - EX fst.d $f20, \base, (20 * FPU_REG_WIDTH) - EX fst.d $f21, \base, (21 * FPU_REG_WIDTH) - EX fst.d $f22, \base, (22 * FPU_REG_WIDTH) - EX fst.d $f23, \base, (23 * FPU_REG_WIDTH) - EX fst.d $f24, \base, (24 * FPU_REG_WIDTH) - EX fst.d $f25, \base, (25 * FPU_REG_WIDTH) - EX fst.d $f26, \base, (26 * FPU_REG_WIDTH) - EX fst.d $f27, \base, (27 * FPU_REG_WIDTH) - EX fst.d $f28, \base, (28 * FPU_REG_WIDTH) - EX fst.d $f29, \base, (29 * FPU_REG_WIDTH) - EX fst.d $f30, \base, (30 * FPU_REG_WIDTH) - EX fst.d $f31, \base, (31 * FPU_REG_WIDTH) + EX fst.d $f0, \base, (0 * FPU_REG_WIDTH) + EX fst.d $f1, \base, (1 * FPU_REG_WIDTH) + EX fst.d $f2, \base, (2 * FPU_REG_WIDTH) + EX fst.d $f3, \base, (3 * FPU_REG_WIDTH) + EX fst.d $f4, \base, (4 * FPU_REG_WIDTH) + EX fst.d $f5, \base, (5 * FPU_REG_WIDTH) + EX fst.d $f6, \base, (6 * FPU_REG_WIDTH) + EX fst.d $f7, \base, (7 * FPU_REG_WIDTH) + EX fst.d $f8, \base, (8 * FPU_REG_WIDTH) + EX fst.d $f9, \base, (9 * FPU_REG_WIDTH) + EX fst.d $f10, \base, (10 * FPU_REG_WIDTH) + EX fst.d $f11, \base, (11 * FPU_REG_WIDTH) + EX fst.d $f12, \base, (12 * FPU_REG_WIDTH) + EX fst.d $f13, \base, (13 * FPU_REG_WIDTH) + EX fst.d $f14, \base, (14 * FPU_REG_WIDTH) + EX fst.d $f15, \base, (15 * FPU_REG_WIDTH) + EX fst.d $f16, \base, (16 * FPU_REG_WIDTH) + EX fst.d $f17, \base, (17 * FPU_REG_WIDTH) + EX fst.d $f18, \base, (18 * FPU_REG_WIDTH) + EX fst.d $f19, \base, (19 * FPU_REG_WIDTH) + EX fst.d $f20, \base, (20 * FPU_REG_WIDTH) + EX fst.d $f21, \base, (21 * FPU_REG_WIDTH) + EX fst.d $f22, \base, (22 * FPU_REG_WIDTH) + EX fst.d $f23, \base, (23 * FPU_REG_WIDTH) + EX fst.d $f24, \base, (24 * FPU_REG_WIDTH) + EX fst.d $f25, \base, (25 * FPU_REG_WIDTH) + EX fst.d $f26, \base, (26 * FPU_REG_WIDTH) + EX fst.d $f27, \base, (27 * FPU_REG_WIDTH) + EX fst.d $f28, \base, (28 * FPU_REG_WIDTH) + EX fst.d $f29, \base, (29 * FPU_REG_WIDTH) + EX fst.d $f30, \base, (30 * FPU_REG_WIDTH) + EX fst.d $f31, \base, (31 * FPU_REG_WIDTH) .endm .macro sc_restore_fp base - EX fld.d $f0, \base, (0 * FPU_REG_WIDTH) - EX fld.d $f1, \base, (1 * FPU_REG_WIDTH) - EX fld.d $f2, \base, (2 * FPU_REG_WIDTH) - EX fld.d $f3, \base, (3 * FPU_REG_WIDTH) - EX fld.d $f4, \base, (4 * FPU_REG_WIDTH) - EX fld.d $f5, \base, (5 * FPU_REG_WIDTH) - EX fld.d $f6, \base, (6 * FPU_REG_WIDTH) - EX fld.d $f7, \base, (7 * FPU_REG_WIDTH) - EX fld.d $f8, \base, (8 * FPU_REG_WIDTH) - EX fld.d $f9, \base, (9 * FPU_REG_WIDTH) - EX fld.d $f10, \base, (10 * FPU_REG_WIDTH) - EX fld.d $f11, \base, (11 * FPU_REG_WIDTH) - EX fld.d $f12, \base, (12 * FPU_REG_WIDTH) - EX fld.d $f13, \base, (13 * FPU_REG_WIDTH) - EX fld.d $f14, \base, (14 * FPU_REG_WIDTH) - EX fld.d $f15, \base, (15 * FPU_REG_WIDTH) - EX fld.d $f16, \base, (16 * FPU_REG_WIDTH) - EX fld.d $f17, \base, (17 * FPU_REG_WIDTH) - EX fld.d $f18, \base, (18 * FPU_REG_WIDTH) - EX fld.d $f19, \base, (19 * FPU_REG_WIDTH) - EX fld.d $f20, \base, (20 * FPU_REG_WIDTH) - EX fld.d $f21, \base, (21 * FPU_REG_WIDTH) - EX fld.d $f22, \base, (22 * FPU_REG_WIDTH) - EX fld.d $f23, \base, (23 * FPU_REG_WIDTH) - EX fld.d $f24, \base, (24 * FPU_REG_WIDTH) - EX fld.d $f25, \base, (25 * FPU_REG_WIDTH) - EX fld.d $f26, \base, (26 * FPU_REG_WIDTH) - EX fld.d $f27, \base, (27 * FPU_REG_WIDTH) - EX fld.d $f28, \base, (28 * FPU_REG_WIDTH) - EX fld.d $f29, \base, (29 * FPU_REG_WIDTH) - EX fld.d $f30, \base, (30 * FPU_REG_WIDTH) - EX fld.d $f31, \base, (31 * FPU_REG_WIDTH) + EX fld.d $f0, \base, (0 * FPU_REG_WIDTH) + EX fld.d $f1, \base, (1 * FPU_REG_WIDTH) + EX fld.d $f2, \base, (2 * FPU_REG_WIDTH) + EX fld.d $f3, \base, (3 * FPU_REG_WIDTH) + EX fld.d $f4, \base, (4 * FPU_REG_WIDTH) + EX fld.d $f5, \base, (5 * FPU_REG_WIDTH) + EX fld.d $f6, \base, (6 * FPU_REG_WIDTH) + EX fld.d $f7, \base, (7 * FPU_REG_WIDTH) + EX fld.d $f8, \base, (8 * FPU_REG_WIDTH) + EX fld.d $f9, \base, (9 * FPU_REG_WIDTH) + EX fld.d $f10, \base, (10 * FPU_REG_WIDTH) + EX fld.d $f11, \base, (11 * FPU_REG_WIDTH) + EX fld.d $f12, \base, (12 * FPU_REG_WIDTH) + EX fld.d $f13, \base, (13 * FPU_REG_WIDTH) + EX fld.d $f14, \base, (14 * FPU_REG_WIDTH) + EX fld.d $f15, \base, (15 * FPU_REG_WIDTH) + EX fld.d $f16, \base, (16 * FPU_REG_WIDTH) + EX fld.d $f17, \base, (17 * FPU_REG_WIDTH) + EX fld.d $f18, \base, (18 * FPU_REG_WIDTH) + EX fld.d $f19, \base, (19 * FPU_REG_WIDTH) + EX fld.d $f20, \base, (20 * FPU_REG_WIDTH) + EX fld.d $f21, \base, (21 * FPU_REG_WIDTH) + EX fld.d $f22, \base, (22 * FPU_REG_WIDTH) + EX fld.d $f23, \base, (23 * FPU_REG_WIDTH) + EX fld.d $f24, \base, (24 * FPU_REG_WIDTH) + EX fld.d $f25, \base, (25 * FPU_REG_WIDTH) + EX fld.d $f26, \base, (26 * FPU_REG_WIDTH) + EX fld.d $f27, \base, (27 * FPU_REG_WIDTH) + EX fld.d $f28, \base, (28 * FPU_REG_WIDTH) + EX fld.d $f29, \base, (29 * FPU_REG_WIDTH) + EX fld.d $f30, \base, (30 * FPU_REG_WIDTH) + EX fld.d $f31, \base, (31 * FPU_REG_WIDTH) .endm .macro sc_save_fcc base, tmp0, tmp1 movcf2gr \tmp0, $fcc0 - move \tmp1, \tmp0 + move \tmp1, \tmp0 movcf2gr \tmp0, $fcc1 bstrins.d \tmp1, \tmp0, 15, 8 movcf2gr \tmp0, $fcc2 @@ -113,11 +113,11 @@ bstrins.d \tmp1, \tmp0, 55, 48 movcf2gr \tmp0, $fcc7 bstrins.d \tmp1, \tmp0, 63, 56 - EX st.d \tmp1, \base, 0 + EX st.d \tmp1, \base, 0 .endm .macro sc_restore_fcc base, tmp0, tmp1 - EX ld.d \tmp0, \base, 0 + EX ld.d \tmp0, \base, 0 bstrpick.d \tmp1, \tmp0, 7, 0 movgr2cf $fcc0, \tmp1 bstrpick.d \tmp1, \tmp0, 15, 8 @@ -138,11 +138,11 @@ .macro sc_save_fcsr base, tmp0 movfcsr2gr \tmp0, fcsr0 - EX st.w \tmp0, \base, 0 + EX st.w \tmp0, \base, 0 .endm .macro sc_restore_fcsr base, tmp0 - EX ld.w \tmp0, \base, 0 + EX ld.w \tmp0, \base, 0 movgr2fcsr fcsr0, \tmp0 .endm @@ -151,9 +151,9 @@ */ SYM_FUNC_START(_save_fp) fpu_save_csr a0 t1 - fpu_save_double a0 t1 # clobbers t1 + fpu_save_double a0 t1 # clobbers t1 fpu_save_cc a0 t1 t2 # clobbers t1, t2 - jirl zero, ra, 0 + jr ra SYM_FUNC_END(_save_fp) EXPORT_SYMBOL(_save_fp) @@ -161,10 +161,10 @@ EXPORT_SYMBOL(_save_fp) * Restore a thread's fp context. */ SYM_FUNC_START(_restore_fp) - fpu_restore_double a0 t1 # clobbers t1 - fpu_restore_csr a0 t1 - fpu_restore_cc a0 t1 t2 # clobbers t1, t2 - jirl zero, ra, 0 + fpu_restore_double a0 t1 # clobbers t1 + fpu_restore_csr a0 t1 + fpu_restore_cc a0 t1 t2 # clobbers t1, t2 + jr ra SYM_FUNC_END(_restore_fp) /* @@ -216,7 +216,7 @@ SYM_FUNC_START(_init_fpu) movgr2fr.d $f30, t1 movgr2fr.d $f31, t1 - jirl zero, ra, 0 + jr ra SYM_FUNC_END(_init_fpu) /* @@ -225,11 +225,11 @@ SYM_FUNC_END(_init_fpu) * a2: fcsr */ SYM_FUNC_START(_save_fp_context) - sc_save_fcc a1 t1 t2 - sc_save_fcsr a2 t1 - sc_save_fp a0 - li.w a0, 0 # success - jirl zero, ra, 0 + sc_save_fcc a1 t1 t2 + sc_save_fcsr a2 t1 + sc_save_fp a0 + li.w a0, 0 # success + jr ra SYM_FUNC_END(_save_fp_context) /* @@ -238,14 +238,14 @@ SYM_FUNC_END(_save_fp_context) * a2: fcsr */ SYM_FUNC_START(_restore_fp_context) - sc_restore_fp a0 - sc_restore_fcc a1 t1 t2 - sc_restore_fcsr a2 t1 - li.w a0, 0 # success - jirl zero, ra, 0 + sc_restore_fp a0 + sc_restore_fcc a1 t1 t2 + sc_restore_fcsr a2 t1 + li.w a0, 0 # success + jr ra SYM_FUNC_END(_restore_fp_context) SYM_FUNC_START(fault) li.w a0, -EFAULT # failure - jirl zero, ra, 0 + jr ra SYM_FUNC_END(fault) diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S index 93496852b3cc..75e5be807a0d 100644 --- a/arch/loongarch/kernel/genex.S +++ b/arch/loongarch/kernel/genex.S @@ -28,23 +28,23 @@ SYM_FUNC_START(__arch_cpu_idle) nop idle 0 /* end of rollback region */ -1: jirl zero, ra, 0 +1: jr ra SYM_FUNC_END(__arch_cpu_idle) SYM_FUNC_START(handle_vint) BACKUP_T0T1 SAVE_ALL la.abs t1, __arch_cpu_idle - LONG_L t0, sp, PT_ERA + LONG_L t0, sp, PT_ERA /* 32 byte rollback region */ ori t0, t0, 0x1f xori t0, t0, 0x1f bne t0, t1, 1f - LONG_S t0, sp, PT_ERA + LONG_S t0, sp, PT_ERA 1: move a0, sp move a1, sp la.abs t0, do_vint - jirl ra, t0, 0 + jirl ra, t0, 0 RESTORE_ALL_AND_RET SYM_FUNC_END(handle_vint) @@ -72,7 +72,7 @@ SYM_FUNC_END(except_vec_cex) build_prep_\prep move a0, sp la.abs t0, do_\handler - jirl ra, t0, 0 + jirl ra, t0, 0 RESTORE_ALL_AND_RET SYM_FUNC_END(handle_\exception) .endm @@ -91,5 +91,5 @@ SYM_FUNC_END(except_vec_cex) SYM_FUNC_START(handle_sys) la.abs t0, handle_syscall - jirl zero, t0, 0 + jr t0 SYM_FUNC_END(handle_sys) diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S index d01e62dd414f..7062cdf0e33e 100644 --- a/arch/loongarch/kernel/head.S +++ b/arch/loongarch/kernel/head.S @@ -32,7 +32,7 @@ SYM_CODE_START(kernel_entry) # kernel entry point /* We might not get launched at the address the kernel is linked to, so we jump there. */ la.abs t0, 0f - jirl zero, t0, 0 + jr t0 0: la t0, __bss_start # clear .bss st.d zero, t0, 0 @@ -50,7 +50,7 @@ SYM_CODE_START(kernel_entry) # kernel entry point /* KSave3 used for percpu base, initialized as 0 */ csrwr zero, PERCPU_BASE_KS /* GPR21 used for percpu base (runtime), initialized as 0 */ - or u0, zero, zero + move u0, zero la tp, init_thread_union /* Set the SP after an empty pt_regs. */ @@ -85,8 +85,8 @@ SYM_CODE_START(smpboot_entry) ld.d sp, t0, CPU_BOOT_STACK ld.d tp, t0, CPU_BOOT_TINFO - la.abs t0, 0f - jirl zero, t0, 0 + la.abs t0, 0f + jr t0 0: bl start_secondary SYM_CODE_END(smpboot_entry) diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c index e6ab87948e1d..dc2b82ea894c 100644 --- a/arch/loongarch/kernel/ptrace.c +++ b/arch/loongarch/kernel/ptrace.c @@ -193,7 +193,7 @@ static int fpr_set(struct task_struct *target, const void *kbuf, const void __user *ubuf) { const int fcc_start = NUM_FPU_REGS * sizeof(elf_fpreg_t); - const int fcc_end = fcc_start + sizeof(u64); + const int fcsr_start = fcc_start + sizeof(u64); int err; BUG_ON(count % sizeof(elf_fpreg_t)); @@ -209,10 +209,12 @@ static int fpr_set(struct task_struct *target, if (err) return err; - if (count > 0) - err |= user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.fcc, - fcc_start, fcc_end); + err |= user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.fcc, fcc_start, + fcc_start + sizeof(u64)); + err |= user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.fcsr, fcsr_start, + fcsr_start + sizeof(u32)); return err; } diff --git a/arch/loongarch/kernel/reset.c b/arch/loongarch/kernel/reset.c index 2b86469e4718..800c965a17ea 100644 --- a/arch/loongarch/kernel/reset.c +++ b/arch/loongarch/kernel/reset.c @@ -13,7 +13,6 @@ #include <linux/console.h> #include <acpi/reboot.h> -#include <asm/compiler.h> #include <asm/idle.h> #include <asm/loongarch.h> #include <asm/reboot.h> diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index c74860b53375..8f5c2f9a1a83 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -126,7 +126,7 @@ static void __init parse_bios_table(const struct dmi_header *dm) char *dmi_data = (char *)dm; bios_extern = *(dmi_data + SMBIOS_BIOSEXTERN_OFFSET); - b_info.bios_size = *(dmi_data + SMBIOS_BIOSSIZE_OFFSET); + b_info.bios_size = (*(dmi_data + SMBIOS_BIOSSIZE_OFFSET) + 1) << 6; if (bios_extern & LOONGSON_EFI_ENABLE) set_bit(EFI_BOOT, &efi.flags); diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index 73cec62504fb..09743103d9b3 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -278,116 +278,29 @@ void loongson3_cpu_die(unsigned int cpu) mb(); } -/* - * The target CPU should go to XKPRANGE (uncached area) and flush - * ICache/DCache/VCache before the control CPU can safely disable its clock. - */ -static void loongson3_play_dead(int *state_addr) +void play_dead(void) { - register int val; - register void *addr; + register uint64_t addr; register void (*init_fn)(void); - __asm__ __volatile__( - " li.d %[addr], 0x8000000000000000\n" - "1: cacop 0x8, %[addr], 0 \n" /* flush ICache */ - " cacop 0x8, %[addr], 1 \n" - " cacop 0x8, %[addr], 2 \n" - " cacop 0x8, %[addr], 3 \n" - " cacop 0x9, %[addr], 0 \n" /* flush DCache */ - " cacop 0x9, %[addr], 1 \n" - " cacop 0x9, %[addr], 2 \n" - " cacop 0x9, %[addr], 3 \n" - " addi.w %[sets], %[sets], -1 \n" - " addi.d %[addr], %[addr], 0x40 \n" - " bnez %[sets], 1b \n" - " li.d %[addr], 0x8000000000000000\n" - "2: cacop 0xa, %[addr], 0 \n" /* flush VCache */ - " cacop 0xa, %[addr], 1 \n" - " cacop 0xa, %[addr], 2 \n" - " cacop 0xa, %[addr], 3 \n" - " cacop 0xa, %[addr], 4 \n" - " cacop 0xa, %[addr], 5 \n" - " cacop 0xa, %[addr], 6 \n" - " cacop 0xa, %[addr], 7 \n" - " cacop 0xa, %[addr], 8 \n" - " cacop 0xa, %[addr], 9 \n" - " cacop 0xa, %[addr], 10 \n" - " cacop 0xa, %[addr], 11 \n" - " cacop 0xa, %[addr], 12 \n" - " cacop 0xa, %[addr], 13 \n" - " cacop 0xa, %[addr], 14 \n" - " cacop 0xa, %[addr], 15 \n" - " addi.w %[vsets], %[vsets], -1 \n" - " addi.d %[addr], %[addr], 0x40 \n" - " bnez %[vsets], 2b \n" - " li.w %[val], 0x7 \n" /* *state_addr = CPU_DEAD; */ - " st.w %[val], %[state_addr], 0 \n" - " dbar 0 \n" - " cacop 0x11, %[state_addr], 0 \n" /* flush entry of *state_addr */ - : [addr] "=&r" (addr), [val] "=&r" (val) - : [state_addr] "r" (state_addr), - [sets] "r" (cpu_data[smp_processor_id()].dcache.sets), - [vsets] "r" (cpu_data[smp_processor_id()].vcache.sets)); - + idle_task_exit(); local_irq_enable(); - change_csr_ecfg(ECFG0_IM, ECFGF_IPI); + set_csr_ecfg(ECFGF_IPI); + __this_cpu_write(cpu_state, CPU_DEAD); + + __smp_mb(); + do { + __asm__ __volatile__("idle 0\n\t"); + addr = iocsr_read64(LOONGARCH_IOCSR_MBUF0); + } while (addr == 0); - __asm__ __volatile__( - " idle 0 \n" - " li.w $t0, 0x1020 \n" - " iocsrrd.d %[init_fn], $t0 \n" /* Get init PC */ - : [init_fn] "=&r" (addr) - : /* No Input */ - : "a0"); - init_fn = __va(addr); + init_fn = (void *)TO_CACHE(addr); + iocsr_write32(0xffffffff, LOONGARCH_IOCSR_IPI_CLEAR); init_fn(); unreachable(); } -void play_dead(void) -{ - int *state_addr; - unsigned int cpu = smp_processor_id(); - void (*play_dead_uncached)(int *s); - - idle_task_exit(); - play_dead_uncached = (void *)TO_UNCACHE(__pa((unsigned long)loongson3_play_dead)); - state_addr = &per_cpu(cpu_state, cpu); - mb(); - play_dead_uncached(state_addr); -} - -static int loongson3_enable_clock(unsigned int cpu) -{ - uint64_t core_id = cpu_data[cpu].core; - uint64_t package_id = cpu_data[cpu].package; - - LOONGSON_FREQCTRL(package_id) |= 1 << (core_id * 4 + 3); - - return 0; -} - -static int loongson3_disable_clock(unsigned int cpu) -{ - uint64_t core_id = cpu_data[cpu].core; - uint64_t package_id = cpu_data[cpu].package; - - LOONGSON_FREQCTRL(package_id) &= ~(1 << (core_id * 4 + 3)); - - return 0; -} - -static int register_loongson3_notifier(void) -{ - return cpuhp_setup_state_nocalls(CPUHP_LOONGARCH_SOC_PREPARE, - "loongarch/loongson:prepare", - loongson3_enable_clock, - loongson3_disable_clock); -} -early_initcall(register_loongson3_notifier); - #endif /* diff --git a/arch/loongarch/kernel/switch.S b/arch/loongarch/kernel/switch.S index 53e2fa8e580e..37e84ac8ffc2 100644 --- a/arch/loongarch/kernel/switch.S +++ b/arch/loongarch/kernel/switch.S @@ -24,8 +24,8 @@ SYM_FUNC_START(__switch_to) move tp, a2 cpu_restore_nonscratch a1 - li.w t0, _THREAD_SIZE - 32 - PTR_ADD t0, t0, tp + li.w t0, _THREAD_SIZE - 32 + PTR_ADD t0, t0, tp set_saved_sp t0, t1, t2 ldptr.d t1, a1, THREAD_CSRPRMD diff --git a/arch/loongarch/lib/clear_user.S b/arch/loongarch/lib/clear_user.S index 25d9be5fbb19..16ba2b8dd68a 100644 --- a/arch/loongarch/lib/clear_user.S +++ b/arch/loongarch/lib/clear_user.S @@ -32,7 +32,7 @@ SYM_FUNC_START(__clear_user) 1: st.b zero, a0, 0 addi.d a0, a0, 1 addi.d a1, a1, -1 - bgt a1, zero, 1b + bgtz a1, 1b 2: move a0, a1 jr ra diff --git a/arch/loongarch/lib/copy_user.S b/arch/loongarch/lib/copy_user.S index 9ae507f851b5..97d20327a69e 100644 --- a/arch/loongarch/lib/copy_user.S +++ b/arch/loongarch/lib/copy_user.S @@ -35,7 +35,7 @@ SYM_FUNC_START(__copy_user) addi.d a0, a0, 1 addi.d a1, a1, 1 addi.d a2, a2, -1 - bgt a2, zero, 1b + bgtz a2, 1b 3: move a0, a2 jr ra diff --git a/arch/loongarch/lib/delay.c b/arch/loongarch/lib/delay.c index 5d856694fcfe..831d4761f385 100644 --- a/arch/loongarch/lib/delay.c +++ b/arch/loongarch/lib/delay.c @@ -7,7 +7,6 @@ #include <linux/smp.h> #include <linux/timex.h> -#include <asm/compiler.h> #include <asm/processor.h> void __delay(unsigned long cycles) diff --git a/arch/loongarch/mm/page.S b/arch/loongarch/mm/page.S index ddc78ab33c7b..4c874a7af0ad 100644 --- a/arch/loongarch/mm/page.S +++ b/arch/loongarch/mm/page.S @@ -10,75 +10,75 @@ .align 5 SYM_FUNC_START(clear_page) - lu12i.w t0, 1 << (PAGE_SHIFT - 12) - add.d t0, t0, a0 + lu12i.w t0, 1 << (PAGE_SHIFT - 12) + add.d t0, t0, a0 1: - st.d zero, a0, 0 - st.d zero, a0, 8 - st.d zero, a0, 16 - st.d zero, a0, 24 - st.d zero, a0, 32 - st.d zero, a0, 40 - st.d zero, a0, 48 - st.d zero, a0, 56 - addi.d a0, a0, 128 - st.d zero, a0, -64 - st.d zero, a0, -56 - st.d zero, a0, -48 - st.d zero, a0, -40 - st.d zero, a0, -32 - st.d zero, a0, -24 - st.d zero, a0, -16 - st.d zero, a0, -8 - bne t0, a0, 1b + st.d zero, a0, 0 + st.d zero, a0, 8 + st.d zero, a0, 16 + st.d zero, a0, 24 + st.d zero, a0, 32 + st.d zero, a0, 40 + st.d zero, a0, 48 + st.d zero, a0, 56 + addi.d a0, a0, 128 + st.d zero, a0, -64 + st.d zero, a0, -56 + st.d zero, a0, -48 + st.d zero, a0, -40 + st.d zero, a0, -32 + st.d zero, a0, -24 + st.d zero, a0, -16 + st.d zero, a0, -8 + bne t0, a0, 1b - jirl $r0, ra, 0 + jr ra SYM_FUNC_END(clear_page) EXPORT_SYMBOL(clear_page) .align 5 SYM_FUNC_START(copy_page) - lu12i.w t8, 1 << (PAGE_SHIFT - 12) - add.d t8, t8, a0 + lu12i.w t8, 1 << (PAGE_SHIFT - 12) + add.d t8, t8, a0 1: - ld.d t0, a1, 0 - ld.d t1, a1, 8 - ld.d t2, a1, 16 - ld.d t3, a1, 24 - ld.d t4, a1, 32 - ld.d t5, a1, 40 - ld.d t6, a1, 48 - ld.d t7, a1, 56 + ld.d t0, a1, 0 + ld.d t1, a1, 8 + ld.d t2, a1, 16 + ld.d t3, a1, 24 + ld.d t4, a1, 32 + ld.d t5, a1, 40 + ld.d t6, a1, 48 + ld.d t7, a1, 56 - st.d t0, a0, 0 - st.d t1, a0, 8 - ld.d t0, a1, 64 - ld.d t1, a1, 72 - st.d t2, a0, 16 - st.d t3, a0, 24 - ld.d t2, a1, 80 - ld.d t3, a1, 88 - st.d t4, a0, 32 - st.d t5, a0, 40 - ld.d t4, a1, 96 - ld.d t5, a1, 104 - st.d t6, a0, 48 - st.d t7, a0, 56 - ld.d t6, a1, 112 - ld.d t7, a1, 120 - addi.d a0, a0, 128 - addi.d a1, a1, 128 + st.d t0, a0, 0 + st.d t1, a0, 8 + ld.d t0, a1, 64 + ld.d t1, a1, 72 + st.d t2, a0, 16 + st.d t3, a0, 24 + ld.d t2, a1, 80 + ld.d t3, a1, 88 + st.d t4, a0, 32 + st.d t5, a0, 40 + ld.d t4, a1, 96 + ld.d t5, a1, 104 + st.d t6, a0, 48 + st.d t7, a0, 56 + ld.d t6, a1, 112 + ld.d t7, a1, 120 + addi.d a0, a0, 128 + addi.d a1, a1, 128 - st.d t0, a0, -64 - st.d t1, a0, -56 - st.d t2, a0, -48 - st.d t3, a0, -40 - st.d t4, a0, -32 - st.d t5, a0, -24 - st.d t6, a0, -16 - st.d t7, a0, -8 + st.d t0, a0, -64 + st.d t1, a0, -56 + st.d t2, a0, -48 + st.d t3, a0, -40 + st.d t4, a0, -32 + st.d t5, a0, -24 + st.d t6, a0, -16 + st.d t7, a0, -8 - bne t8, a0, 1b - jirl $r0, ra, 0 + bne t8, a0, 1b + jr ra SYM_FUNC_END(copy_page) EXPORT_SYMBOL(copy_page) diff --git a/arch/loongarch/mm/tlbex.S b/arch/loongarch/mm/tlbex.S index 7eee40271577..de19fa2d7f0d 100644 --- a/arch/loongarch/mm/tlbex.S +++ b/arch/loongarch/mm/tlbex.S @@ -18,7 +18,7 @@ REG_S a2, sp, PT_BVADDR li.w a1, \write la.abs t0, do_page_fault - jirl ra, t0, 0 + jirl ra, t0, 0 RESTORE_ALL_AND_RET SYM_FUNC_END(tlb_do_page_fault_\write) .endm @@ -34,7 +34,7 @@ SYM_FUNC_START(handle_tlb_protect) csrrd a2, LOONGARCH_CSR_BADV REG_S a2, sp, PT_BVADDR la.abs t0, do_page_fault - jirl ra, t0, 0 + jirl ra, t0, 0 RESTORE_ALL_AND_RET SYM_FUNC_END(handle_tlb_protect) @@ -47,7 +47,7 @@ SYM_FUNC_START(handle_tlb_load) * The vmalloc handling is not in the hotpath. */ csrrd t0, LOONGARCH_CSR_BADV - blt t0, $r0, vmalloc_load + bltz t0, vmalloc_load csrrd t1, LOONGARCH_CSR_PGDL vmalloc_done_load: @@ -80,7 +80,7 @@ vmalloc_done_load: * see if we need to jump to huge tlb processing. */ andi t0, ra, _PAGE_HUGE - bne t0, $r0, tlb_huge_update_load + bnez t0, tlb_huge_update_load csrrd t0, LOONGARCH_CSR_BADV srli.d t0, t0, (PAGE_SHIFT + PTE_ORDER) @@ -100,12 +100,12 @@ smp_pgtable_change_load: srli.d ra, t0, _PAGE_PRESENT_SHIFT andi ra, ra, 1 - beq ra, $r0, nopage_tlb_load + beqz ra, nopage_tlb_load ori t0, t0, _PAGE_VALID #ifdef CONFIG_SMP sc.d t0, t1, 0 - beq t0, $r0, smp_pgtable_change_load + beqz t0, smp_pgtable_change_load #else st.d t0, t1, 0 #endif @@ -139,23 +139,23 @@ tlb_huge_update_load: #endif srli.d ra, t0, _PAGE_PRESENT_SHIFT andi ra, ra, 1 - beq ra, $r0, nopage_tlb_load + beqz ra, nopage_tlb_load tlbsrch ori t0, t0, _PAGE_VALID #ifdef CONFIG_SMP sc.d t0, t1, 0 - beq t0, $r0, tlb_huge_update_load + beqz t0, tlb_huge_update_load ld.d t0, t1, 0 #else st.d t0, t1, 0 #endif - addu16i.d t1, $r0, -(CSR_TLBIDX_EHINV >> 16) - addi.d ra, t1, 0 - csrxchg ra, t1, LOONGARCH_CSR_TLBIDX + addu16i.d t1, zero, -(CSR_TLBIDX_EHINV >> 16) + addi.d ra, t1, 0 + csrxchg ra, t1, LOONGARCH_CSR_TLBIDX tlbwr - csrxchg $r0, t1, LOONGARCH_CSR_TLBIDX + csrxchg zero, t1, LOONGARCH_CSR_TLBIDX /* * A huge PTE describes an area the size of the @@ -178,27 +178,27 @@ tlb_huge_update_load: addi.d t0, ra, 0 /* Convert to entrylo1 */ - addi.d t1, $r0, 1 + addi.d t1, zero, 1 slli.d t1, t1, (HPAGE_SHIFT - 1) add.d t0, t0, t1 csrwr t0, LOONGARCH_CSR_TLBELO1 /* Set huge page tlb entry size */ - addu16i.d t0, $r0, (CSR_TLBIDX_PS >> 16) - addu16i.d t1, $r0, (PS_HUGE_SIZE << (CSR_TLBIDX_PS_SHIFT - 16)) + addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16) + addu16i.d t1, zero, (PS_HUGE_SIZE << (CSR_TLBIDX_PS_SHIFT - 16)) csrxchg t1, t0, LOONGARCH_CSR_TLBIDX tlbfill - addu16i.d t0, $r0, (CSR_TLBIDX_PS >> 16) - addu16i.d t1, $r0, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16)) + addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16) + addu16i.d t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16)) csrxchg t1, t0, LOONGARCH_CSR_TLBIDX nopage_tlb_load: dbar 0 csrrd ra, EXCEPTION_KS2 la.abs t0, tlb_do_page_fault_0 - jirl $r0, t0, 0 + jr t0 SYM_FUNC_END(handle_tlb_load) SYM_FUNC_START(handle_tlb_store) @@ -210,7 +210,7 @@ SYM_FUNC_START(handle_tlb_store) * The vmalloc handling is not in the hotpath. */ csrrd t0, LOONGARCH_CSR_BADV - blt t0, $r0, vmalloc_store + bltz t0, vmalloc_store csrrd t1, LOONGARCH_CSR_PGDL vmalloc_done_store: @@ -244,7 +244,7 @@ vmalloc_done_store: * see if we need to jump to huge tlb processing. */ andi t0, ra, _PAGE_HUGE - bne t0, $r0, tlb_huge_update_store + bnez t0, tlb_huge_update_store csrrd t0, LOONGARCH_CSR_BADV srli.d t0, t0, (PAGE_SHIFT + PTE_ORDER) @@ -265,12 +265,12 @@ smp_pgtable_change_store: srli.d ra, t0, _PAGE_PRESENT_SHIFT andi ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT) xori ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT) - bne ra, $r0, nopage_tlb_store + bnez ra, nopage_tlb_store ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED) #ifdef CONFIG_SMP sc.d t0, t1, 0 - beq t0, $r0, smp_pgtable_change_store + beqz t0, smp_pgtable_change_store #else st.d t0, t1, 0 #endif @@ -306,24 +306,24 @@ tlb_huge_update_store: srli.d ra, t0, _PAGE_PRESENT_SHIFT andi ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT) xori ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT) - bne ra, $r0, nopage_tlb_store + bnez ra, nopage_tlb_store tlbsrch ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED) #ifdef CONFIG_SMP sc.d t0, t1, 0 - beq t0, $r0, tlb_huge_update_store + beqz t0, tlb_huge_update_store ld.d t0, t1, 0 #else st.d t0, t1, 0 #endif - addu16i.d t1, $r0, -(CSR_TLBIDX_EHINV >> 16) - addi.d ra, t1, 0 - csrxchg ra, t1, LOONGARCH_CSR_TLBIDX + addu16i.d t1, zero, -(CSR_TLBIDX_EHINV >> 16) + addi.d ra, t1, 0 + csrxchg ra, t1, LOONGARCH_CSR_TLBIDX tlbwr - csrxchg $r0, t1, LOONGARCH_CSR_TLBIDX + csrxchg zero, t1, LOONGARCH_CSR_TLBIDX /* * A huge PTE describes an area the size of the * configured huge page size. This is twice the @@ -345,28 +345,28 @@ tlb_huge_update_store: addi.d t0, ra, 0 /* Convert to entrylo1 */ - addi.d t1, $r0, 1 + addi.d t1, zero, 1 slli.d t1, t1, (HPAGE_SHIFT - 1) add.d t0, t0, t1 csrwr t0, LOONGARCH_CSR_TLBELO1 /* Set huge page tlb entry size */ - addu16i.d t0, $r0, (CSR_TLBIDX_PS >> 16) - addu16i.d t1, $r0, (PS_HUGE_SIZE << (CSR_TLBIDX_PS_SHIFT - 16)) + addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16) + addu16i.d t1, zero, (PS_HUGE_SIZE << (CSR_TLBIDX_PS_SHIFT - 16)) csrxchg t1, t0, LOONGARCH_CSR_TLBIDX tlbfill /* Reset default page size */ - addu16i.d t0, $r0, (CSR_TLBIDX_PS >> 16) - addu16i.d t1, $r0, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16)) + addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16) + addu16i.d t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16)) csrxchg t1, t0, LOONGARCH_CSR_TLBIDX nopage_tlb_store: dbar 0 csrrd ra, EXCEPTION_KS2 la.abs t0, tlb_do_page_fault_1 - jirl $r0, t0, 0 + jr t0 SYM_FUNC_END(handle_tlb_store) SYM_FUNC_START(handle_tlb_modify) @@ -378,7 +378,7 @@ SYM_FUNC_START(handle_tlb_modify) * The vmalloc handling is not in the hotpath. */ csrrd t0, LOONGARCH_CSR_BADV - blt t0, $r0, vmalloc_modify + bltz t0, vmalloc_modify csrrd t1, LOONGARCH_CSR_PGDL vmalloc_done_modify: @@ -411,7 +411,7 @@ vmalloc_done_modify: * see if we need to jump to huge tlb processing. */ andi t0, ra, _PAGE_HUGE - bne t0, $r0, tlb_huge_update_modify + bnez t0, tlb_huge_update_modify csrrd t0, LOONGARCH_CSR_BADV srli.d t0, t0, (PAGE_SHIFT + PTE_ORDER) @@ -431,12 +431,12 @@ smp_pgtable_change_modify: srli.d ra, t0, _PAGE_WRITE_SHIFT andi ra, ra, 1 - beq ra, $r0, nopage_tlb_modify + beqz ra, nopage_tlb_modify ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED) #ifdef CONFIG_SMP sc.d t0, t1, 0 - beq t0, $r0, smp_pgtable_change_modify + beqz t0, smp_pgtable_change_modify #else st.d t0, t1, 0 #endif @@ -454,7 +454,7 @@ leave_modify: ertn #ifdef CONFIG_64BIT vmalloc_modify: - la.abs t1, swapper_pg_dir + la.abs t1, swapper_pg_dir b vmalloc_done_modify #endif @@ -471,14 +471,14 @@ tlb_huge_update_modify: srli.d ra, t0, _PAGE_WRITE_SHIFT andi ra, ra, 1 - beq ra, $r0, nopage_tlb_modify + beqz ra, nopage_tlb_modify tlbsrch ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED) #ifdef CONFIG_SMP sc.d t0, t1, 0 - beq t0, $r0, tlb_huge_update_modify + beqz t0, tlb_huge_update_modify ld.d t0, t1, 0 #else st.d t0, t1, 0 @@ -504,28 +504,28 @@ tlb_huge_update_modify: addi.d t0, ra, 0 /* Convert to entrylo1 */ - addi.d t1, $r0, 1 + addi.d t1, zero, 1 slli.d t1, t1, (HPAGE_SHIFT - 1) add.d t0, t0, t1 csrwr t0, LOONGARCH_CSR_TLBELO1 /* Set huge page tlb entry size */ - addu16i.d t0, $r0, (CSR_TLBIDX_PS >> 16) - addu16i.d t1, $r0, (PS_HUGE_SIZE << (CSR_TLBIDX_PS_SHIFT - 16)) - csrxchg t1, t0, LOONGARCH_CSR_TLBIDX + addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16) + addu16i.d t1, zero, (PS_HUGE_SIZE << (CSR_TLBIDX_PS_SHIFT - 16)) + csrxchg t1, t0, LOONGARCH_CSR_TLBIDX tlbwr /* Reset default page size */ - addu16i.d t0, $r0, (CSR_TLBIDX_PS >> 16) - addu16i.d t1, $r0, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16)) - csrxchg t1, t0, LOONGARCH_CSR_TLBIDX + addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16) + addu16i.d t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16)) + csrxchg t1, t0, LOONGARCH_CSR_TLBIDX nopage_tlb_modify: dbar 0 csrrd ra, EXCEPTION_KS2 la.abs t0, tlb_do_page_fault_1 - jirl $r0, t0, 0 + jr t0 SYM_FUNC_END(handle_tlb_modify) SYM_FUNC_START(handle_tlb_refill) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7aa12e88c580..4d8f26c1399b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -256,6 +256,7 @@ config PPC select IRQ_FORCED_THREADING select MMU_GATHER_PAGE_SIZE select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE if PPC64 || NOT_COHERENT_CACHE select NEED_PER_CPU_EMBED_FIRST_CHUNK if PPC64 @@ -281,6 +282,10 @@ config PPC # Please keep this list sorted alphabetically. # +config PPC_LONG_DOUBLE_128 + depends on PPC64 + def_bool $(success,test "$(shell,echo __LONG_DOUBLE_128__ | $(CC) -E -P -)" = 1) + config PPC_BARRIER_NOSPEC bool default y diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index 09a9ae5f3656..b3de6102a907 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -19,8 +19,6 @@ #include <linux/pagemap.h> -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry #define tlb_flush tlb_flush diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index f91f0f29a566..c8cf924bf9c0 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -20,6 +20,7 @@ CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_prom_init.o += -fno-stack-protector CFLAGS_prom_init.o += -DDISABLE_BRANCH_PROFILING CFLAGS_prom_init.o += -ffreestanding +CFLAGS_prom_init.o += $(call cc-option, -ftrivial-auto-var-init=uninitialized) ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 34cf8a598617..81029d40a672 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -73,6 +73,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y) endif KBUILD_CFLAGS_MODULE += $(call cc-option,-mno-relax) +KBUILD_AFLAGS_MODULE += $(call as-option,-Wa$(comma)-mno-relax) # GCC versions that support the "-mstrict-align" option default to allowing # unaligned accesses. While unaligned accesses are explicitly allowed in the @@ -110,7 +111,7 @@ PHONY += vdso_install vdso_install: $(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso $@ $(if $(CONFIG_COMPAT),$(Q)$(MAKE) \ - $(build)=arch/riscv/kernel/compat_vdso $@) + $(build)=arch/riscv/kernel/compat_vdso compat_$@) ifeq ($(KBUILD_EXTMOD),) ifeq ($(CONFIG_MMU),y) diff --git a/arch/riscv/boot/dts/canaan/canaan_kd233.dts b/arch/riscv/boot/dts/canaan/canaan_kd233.dts index 039b92abf046..f72540bd14a3 100644 --- a/arch/riscv/boot/dts/canaan/canaan_kd233.dts +++ b/arch/riscv/boot/dts/canaan/canaan_kd233.dts @@ -35,7 +35,7 @@ gpio-keys { compatible = "gpio-keys"; - key0 { + key { label = "KEY0"; linux,code = <BTN_0>; gpios = <&gpio0 10 GPIO_ACTIVE_LOW>; diff --git a/arch/riscv/boot/dts/canaan/sipeed_maix_bit.dts b/arch/riscv/boot/dts/canaan/sipeed_maix_bit.dts index b9e30df127fe..8abdbe26a1d0 100644 --- a/arch/riscv/boot/dts/canaan/sipeed_maix_bit.dts +++ b/arch/riscv/boot/dts/canaan/sipeed_maix_bit.dts @@ -47,7 +47,7 @@ gpio-keys { compatible = "gpio-keys"; - boot { + key-boot { label = "BOOT"; linux,code = <BTN_0>; gpios = <&gpio0 0 GPIO_ACTIVE_LOW>; diff --git a/arch/riscv/boot/dts/canaan/sipeed_maix_dock.dts b/arch/riscv/boot/dts/canaan/sipeed_maix_dock.dts index 8d23401b0bbb..3c6df1ecf76f 100644 --- a/arch/riscv/boot/dts/canaan/sipeed_maix_dock.dts +++ b/arch/riscv/boot/dts/canaan/sipeed_maix_dock.dts @@ -52,7 +52,7 @@ gpio-keys { compatible = "gpio-keys"; - boot { + key-boot { label = "BOOT"; linux,code = <BTN_0>; gpios = <&gpio0 0 GPIO_ACTIVE_LOW>; diff --git a/arch/riscv/boot/dts/canaan/sipeed_maix_go.dts b/arch/riscv/boot/dts/canaan/sipeed_maix_go.dts index 24fd83b43d9d..03c9843d503e 100644 --- a/arch/riscv/boot/dts/canaan/sipeed_maix_go.dts +++ b/arch/riscv/boot/dts/canaan/sipeed_maix_go.dts @@ -46,19 +46,19 @@ gpio-keys { compatible = "gpio-keys"; - up { + key-up { label = "UP"; linux,code = <BTN_1>; gpios = <&gpio1_0 7 GPIO_ACTIVE_LOW>; }; - press { + key-press { label = "PRESS"; linux,code = <BTN_0>; gpios = <&gpio0 0 GPIO_ACTIVE_LOW>; }; - down { + key-down { label = "DOWN"; linux,code = <BTN_2>; gpios = <&gpio0 1 GPIO_ACTIVE_LOW>; diff --git a/arch/riscv/boot/dts/canaan/sipeed_maixduino.dts b/arch/riscv/boot/dts/canaan/sipeed_maixduino.dts index 25341f38292a..7164ad063178 100644 --- a/arch/riscv/boot/dts/canaan/sipeed_maixduino.dts +++ b/arch/riscv/boot/dts/canaan/sipeed_maixduino.dts @@ -23,7 +23,7 @@ gpio-keys { compatible = "gpio-keys"; - boot { + key-boot { label = "BOOT"; linux,code = <BTN_0>; gpios = <&gpio0 0 GPIO_ACTIVE_LOW>; diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index c71d6591d539..33bb60a354cd 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -78,7 +78,7 @@ obj-$(CONFIG_SMP) += cpu_ops_sbi.o endif obj-$(CONFIG_HOTPLUG_CPU) += cpu-hotplug.o obj-$(CONFIG_KGDB) += kgdb.o -obj-$(CONFIG_KEXEC) += kexec_relocate.o crash_save_regs.o machine_kexec.o +obj-$(CONFIG_KEXEC_CORE) += kexec_relocate.o crash_save_regs.o machine_kexec.o obj-$(CONFIG_KEXEC_FILE) += elf_kexec.o machine_kexec_file.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c index 9cb85095fd45..0cb94992c15b 100644 --- a/arch/riscv/kernel/elf_kexec.c +++ b/arch/riscv/kernel/elf_kexec.c @@ -349,7 +349,7 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, { const char *strtab, *name, *shstrtab; const Elf_Shdr *sechdrs; - Elf_Rela *relas; + Elf64_Rela *relas; int i, r_type; /* String & section header string table */ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8cd9e56c629b..5a1a8dfda6f8 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -204,6 +204,7 @@ config S390 select IOMMU_SUPPORT if PCI select MMU_GATHER_NO_GATHER select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE if PCI select NEED_SG_DMA_LENGTH if PCI diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h index 2c6e1c6ecbe7..4120c428dc37 100644 --- a/arch/s390/include/asm/archrandom.h +++ b/arch/s390/include/asm/archrandom.h @@ -2,7 +2,7 @@ /* * Kernel interface for the s390 arch_random_* functions * - * Copyright IBM Corp. 2017, 2020 + * Copyright IBM Corp. 2017, 2022 * * Author: Harald Freudenberger <freude@de.ibm.com> * @@ -14,6 +14,7 @@ #ifdef CONFIG_ARCH_RANDOM #include <linux/static_key.h> +#include <linux/preempt.h> #include <linux/atomic.h> #include <asm/cpacf.h> @@ -32,7 +33,8 @@ static inline bool __must_check arch_get_random_int(unsigned int *v) static inline bool __must_check arch_get_random_seed_long(unsigned long *v) { - if (static_branch_likely(&s390_arch_random_available)) { + if (static_branch_likely(&s390_arch_random_available) && + in_task()) { cpacf_trng(NULL, 0, (u8 *)v, sizeof(*v)); atomic64_add(sizeof(*v), &s390_arch_random_counter); return true; @@ -42,7 +44,8 @@ static inline bool __must_check arch_get_random_seed_long(unsigned long *v) static inline bool __must_check arch_get_random_seed_int(unsigned int *v) { - if (static_branch_likely(&s390_arch_random_available)) { + if (static_branch_likely(&s390_arch_random_available) && + in_task()) { cpacf_trng(NULL, 0, (u8 *)v, sizeof(*v)); atomic64_add(sizeof(*v), &s390_arch_random_counter); return true; diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index fe6407f0eb1b..3a5c8fb590e5 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -27,9 +27,6 @@ static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size); -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) - #define tlb_flush tlb_flush #define pte_free_tlb pte_free_tlb #define pmd_free_tlb pmd_free_tlb diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index ba449c47effd..4f7d1dfbc608 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -67,6 +67,8 @@ config SPARC64 select HAVE_KRETPROBES select HAVE_KPROBES select MMU_GATHER_RCU_TABLE_FREE if SMP + select MMU_GATHER_MERGE_VMAS + select MMU_GATHER_NO_FLUSH_CACHE select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h index 779a5a0f0608..3037187482db 100644 --- a/arch/sparc/include/asm/tlb_64.h +++ b/arch/sparc/include/asm/tlb_64.h @@ -22,8 +22,6 @@ void smp_flush_tlb_mm(struct mm_struct *mm); void __flush_tlb_pending(unsigned long, unsigned long, unsigned long *); void flush_tlb_pending(void); -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) #define tlb_flush(tlb) flush_tlb_pending() /* diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e58798f636d4..52a7f91527fe 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -245,6 +245,7 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_MERGE_VMAS select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if UNWINDER_ORC || STACK_VALIDATION @@ -2473,7 +2474,7 @@ config RETHUNK bool "Enable return-thunks" depends on RETPOLINE && CC_HAS_RETURN_THUNK select OBJTOOL if HAVE_OBJTOOL - default y + default y if X86_64 help Compile the kernel with the return-thunks compiler option to guard against kernel-to-user data leaks by avoiding return speculation. @@ -2482,21 +2483,21 @@ config RETHUNK config CPU_UNRET_ENTRY bool "Enable UNRET on kernel entry" - depends on CPU_SUP_AMD && RETHUNK + depends on CPU_SUP_AMD && RETHUNK && X86_64 default y help Compile the kernel with support for the retbleed=unret mitigation. config CPU_IBPB_ENTRY bool "Enable IBPB on kernel entry" - depends on CPU_SUP_AMD + depends on CPU_SUP_AMD && X86_64 default y help Compile the kernel with support for the retbleed=ibpb mitigation. config CPU_IBRS_ENTRY bool "Enable IBRS on kernel entry" - depends on CPU_SUP_INTEL + depends on CPU_SUP_INTEL && X86_64 default y help Compile the kernel with support for the spectre_v2=ibrs mitigation. diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1f40dad30d50..7854685c5f25 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -27,6 +27,7 @@ RETHUNK_CFLAGS := -mfunction-return=thunk-extern RETPOLINE_CFLAGS += $(RETHUNK_CFLAGS) endif +export RETHUNK_CFLAGS export RETPOLINE_CFLAGS export RETPOLINE_VDSO_CFLAGS diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 13179f31fe10..4f70fb6c2c1e 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -278,9 +278,9 @@ enum { }; /* - * For formats with LBR_TSX flags (e.g. LBR_FORMAT_EIP_FLAGS2), bits 61:62 in - * MSR_LAST_BRANCH_FROM_x are the TSX flags when TSX is supported, but when - * TSX is not supported they have no consistent behavior: + * For format LBR_FORMAT_EIP_FLAGS2, bits 61:62 in MSR_LAST_BRANCH_FROM_x + * are the TSX flags when TSX is supported, but when TSX is not supported + * they have no consistent behavior: * * - For wrmsr(), bits 61:62 are considered part of the sign extension. * - For HW updates (branch captures) bits 61:62 are always OFF and are not @@ -288,7 +288,7 @@ enum { * * Therefore, if: * - * 1) LBR has TSX format + * 1) LBR format LBR_FORMAT_EIP_FLAGS2 * 2) CPU has no TSX support enabled * * ... then any value passed to wrmsr() must be sign extended to 63 bits and any @@ -300,7 +300,7 @@ static inline bool lbr_from_signext_quirk_needed(void) bool tsx_support = boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM); - return !tsx_support && x86_pmu.lbr_has_tsx; + return !tsx_support; } static DEFINE_STATIC_KEY_FALSE(lbr_from_quirk_key); @@ -1609,9 +1609,6 @@ void intel_pmu_lbr_init_hsw(void) x86_pmu.lbr_sel_map = hsw_lbr_sel_map; x86_get_pmu(smp_processor_id())->task_ctx_cache = create_lbr_kmem_cache(size, 0); - - if (lbr_from_signext_quirk_needed()) - static_branch_enable(&lbr_from_quirk_key); } /* skylake */ @@ -1702,7 +1699,11 @@ void intel_pmu_lbr_init(void) switch (x86_pmu.intel_cap.lbr_format) { case LBR_FORMAT_EIP_FLAGS2: x86_pmu.lbr_has_tsx = 1; - fallthrough; + x86_pmu.lbr_from_flags = 1; + if (lbr_from_signext_quirk_needed()) + static_branch_enable(&lbr_from_quirk_key); + break; + case LBR_FORMAT_EIP_FLAGS: x86_pmu.lbr_from_flags = 1; break; diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 00f5227c8459..5fe7f6c8a7a4 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -219,7 +219,7 @@ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 or above (Zen) */ +#define X86_FEATURE_ZEN (7*32+28) /* "" CPU based on Zen microarchitecture */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ #define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ @@ -302,6 +302,7 @@ #define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ #define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ +#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 6b0f31fb53f7..503a577814b2 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -164,4 +164,6 @@ static inline bool fpstate_is_confidential(struct fpu_guest *gfpu) /* prctl */ extern long fpu_xstate_prctl(int option, unsigned long arg2); +extern void fpu_idle_fpregs(void); + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 29dd27b5a339..3a8fdf881313 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -13,6 +13,7 @@ #define MWAIT_SUBSTATE_SIZE 4 #define MWAIT_HINT2CSTATE(hint) (((hint) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) #define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK) +#define MWAIT_C1_SUBSTATE_MASK 0xf0 #define CPUID_MWAIT_LEAF 5 #define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 42c09bc8ae18..cba942006ffe 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -309,6 +309,8 @@ do { \ alternative_msr_write(MSR_IA32_SPEC_CTRL, \ spec_ctrl_current() | SPEC_CTRL_IBRS, \ X86_FEATURE_USE_IBRS_FW); \ + alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, \ + X86_FEATURE_USE_IBPB_FW); \ } while (0) #define firmware_restrict_branch_speculation_end() \ diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 19514524f0f8..4a23e52fe0ee 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -72,7 +72,6 @@ static inline u64 lower_bits(u64 val, unsigned int bits) struct real_mode_header; enum stack_type; -struct ghcb; /* Early IDT entry points for #VC handler */ extern void vc_no_ghcb(void); @@ -156,11 +155,7 @@ static __always_inline void sev_es_nmi_complete(void) __sev_es_nmi_complete(); } extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); -extern enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, - bool set_ghcb_msr, - struct es_em_ctxt *ctxt, - u64 exit_code, u64 exit_info_1, - u64 exit_info_2); + static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { int rc; diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 45b18eb94fa1..35f709f619fb 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -295,6 +295,15 @@ static inline int enqcmds(void __iomem *dst, const void *src) return 0; } +static inline void tile_release(void) +{ + /* + * Instruction opcode for TILERELEASE; supported in binutils + * version >= 2.36. + */ + asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0"); +} + #endif /* __KERNEL__ */ #endif /* _ASM_X86_SPECIAL_INSNS_H */ diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 1bfe979bb9bc..580636cdc257 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -2,9 +2,6 @@ #ifndef _ASM_X86_TLB_H #define _ASM_X86_TLB_H -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) - #define tlb_flush tlb_flush static inline void tlb_flush(struct mmu_gather *tlb); diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 4af5579c7ef7..cda3118f3b27 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -16,6 +16,7 @@ void __flush_tlb_all(void); #define TLB_FLUSH_ALL -1UL +#define TLB_GENERATION_INVALID 0 void cr4_update_irqsoff(unsigned long set, unsigned long clear); unsigned long cr4_read_shadow(void); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d6858533e6e5..62f6b8b7c4a5 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -555,7 +555,9 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) dest = addr + insn.length + insn.immediate.value; if (__static_call_fixup(addr, op, dest) || - WARN_ON_ONCE(dest != &__x86_return_thunk)) + WARN_ONCE(dest != &__x86_return_thunk, + "missing return thunk: %pS-%pS: %*ph", + addr, dest, 5, addr)) continue; DPRINTK("return thunk at: %pS (%px) len: %d to: %pS", diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 190e0f763375..4266b64631a4 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -19,17 +19,23 @@ #define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 #define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 #define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 +#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5 #define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4 +#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8 +#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8 #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 +#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728 #define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 #define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1 #define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5 #define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e +#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4 +#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 /* Protect the PCI config register pairs used for SMN. */ static DEFINE_MUTEX(smn_mutex); @@ -41,8 +47,11 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) }, {} }; @@ -61,12 +70,15 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) }, {} }; @@ -81,6 +93,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) }, diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index aa34f908c39f..6761668100b9 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -975,6 +975,7 @@ static inline const char *spectre_v2_module_string(void) { return ""; } #define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" #define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" #define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n" #ifdef CONFIG_BPF_SYSCALL void unpriv_ebpf_notify(int new_state) @@ -1415,6 +1416,8 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_IBRS: setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS); + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) + pr_warn(SPECTRE_V2_IBRS_PERF_MSG); break; case SPECTRE_V2_LFENCE: @@ -1516,7 +1519,17 @@ static void __init spectre_v2_select_mitigation(void) * the CPU supports Enhanced IBRS, kernel might un-intentionally not * enable IBRS around firmware calls. */ - if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { + if (boot_cpu_has_bug(X86_BUG_RETBLEED) && + boot_cpu_has(X86_FEATURE_IBPB) && + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) { + + if (retbleed_cmd != RETBLEED_CMD_IBPB) { + setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW); + pr_info("Enabling Speculation Barrier for firmware calls\n"); + } + + } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fd5dead8371c..663f6e6dd288 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -682,9 +682,9 @@ static void init_intel(struct cpuinfo_x86 *c) unsigned int l1, l2; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); - if (!(l1 & (1<<11))) + if (!(l1 & MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)) set_cpu_cap(c, X86_FEATURE_BTS); - if (!(l1 & (1<<12))) + if (!(l1 & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL)) set_cpu_cap(c, X86_FEATURE_PEBS); } diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 5fbd7ffb3233..12cf2e7ca33c 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -33,6 +33,8 @@ #include "internal.h" +static bool hw_injection_possible = true; + /* * Collect all the MCi_XXX settings */ @@ -339,6 +341,8 @@ static int __set_inj(const char *buf) for (i = 0; i < N_INJ_TYPES; i++) { if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { + if (i > SW_INJ && !hw_injection_possible) + continue; inj_type = i; return 0; } @@ -717,11 +721,54 @@ static void __init debugfs_init(void) &i_mce, dfs_fls[i].fops); } +static void check_hw_inj_possible(void) +{ + int cpu; + u8 bank; + + /* + * This behavior exists only on SMCA systems though its not directly + * related to SMCA. + */ + if (!cpu_feature_enabled(X86_FEATURE_SMCA)) + return; + + cpu = get_cpu(); + + for (bank = 0; bank < MAX_NR_BANKS; ++bank) { + u64 status = MCI_STATUS_VAL, ipid; + + /* Check whether bank is populated */ + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), ipid); + if (!ipid) + continue; + + toggle_hw_mce_inject(cpu, true); + + wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status); + rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status); + + if (!status) { + hw_injection_possible = false; + pr_warn("Platform does not allow *hardware* error injection." + "Try using APEI EINJ instead.\n"); + } + + toggle_hw_mce_inject(cpu, false); + + break; + } + + put_cpu(); +} + static int __init inject_init(void) { if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; + check_hw_inj_possible(); + debugfs_init(); register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 4ae0e603f7fa..7e03f5b7f6bd 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -211,7 +211,7 @@ noinstr u64 mce_rdmsrl(u32 msr); static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) { - if (mce_flags.smca) { + if (cpu_feature_enabled(X86_FEATURE_SMCA)) { switch (reg) { case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank); case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index c04b933f48d3..02039ec3597d 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -476,8 +476,8 @@ static bool __init vmware_legacy_x2apic_available(void) { uint32_t eax, ebx, ecx, edx; VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx); - return (eax & (1 << VMWARE_CMD_VCPU_RESERVED)) == 0 && - (eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0; + return !(eax & BIT(VMWARE_CMD_VCPU_RESERVED)) && + (eax & BIT(VMWARE_CMD_LEGACY_X2APIC)); } #ifdef CONFIG_AMD_MEM_ENCRYPT diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 0531d6a06df5..3b28c5b25e12 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -851,3 +851,17 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) */ return 0; } + +/* + * Initialize register state that may prevent from entering low-power idle. + * This function will be invoked from the cpuidle driver only when needed. + */ +void fpu_idle_fpregs(void) +{ + /* Note: AMX_TILE being enabled implies XGETBV1 support */ + if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) && + (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { + tile_release(); + fpregs_deactivate(¤t->thread.fpu); + } +} diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 6b07faaa1579..23154d24b117 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -27,6 +27,11 @@ static __init int register_e820_pmem(void) * simply here to trigger the module to load on demand. */ pdev = platform_device_alloc("e820_pmem", -1); - return platform_device_add(pdev); + + rc = platform_device_add(pdev); + if (rc) + platform_device_put(pdev); + + return rc; } device_initcall(register_e820_pmem); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index d456ce21c255..58a6ea472db9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -810,24 +810,43 @@ static void amd_e400_idle(void) } /* - * Intel Core2 and older machines prefer MWAIT over HALT for C1. - * We can't rely on cpuidle installing MWAIT, because it will not load - * on systems that support only C1 -- so the boot default must be MWAIT. + * Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf + * exists and whenever MONITOR/MWAIT extensions are present there is at + * least one C1 substate. * - * Some AMD machines are the opposite, they depend on using HALT. - * - * So for default C1, which is used during boot until cpuidle loads, - * use MWAIT-C1 on Intel HW that has it, else use HALT. + * Do not prefer MWAIT if MONITOR instruction has a bug or idle=nomwait + * is passed to kernel commandline parameter. */ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) { - if (c->x86_vendor != X86_VENDOR_INTEL) + u32 eax, ebx, ecx, edx; + + /* User has disallowed the use of MWAIT. Fallback to HALT */ + if (boot_option_idle_override == IDLE_NOMWAIT) return 0; - if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_cpu_has_bug(X86_BUG_MONITOR)) + /* MWAIT is not supported on this platform. Fallback to HALT */ + if (!cpu_has(c, X86_FEATURE_MWAIT)) return 0; - return 1; + /* Monitor has a bug. Fallback to HALT */ + if (boot_cpu_has_bug(X86_BUG_MONITOR)) + return 0; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + /* + * If MWAIT extensions are not available, it is safe to use MWAIT + * with EAX=0, ECX=0. + */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) + return 1; + + /* + * If MWAIT extensions are available, there should be at least one + * MWAIT C1 substate present. + */ + return (edx & MWAIT_C1_SUBSTATE_MASK); } /* @@ -932,9 +951,8 @@ static int __init idle_setup(char *str) } else if (!strcmp(str, "nomwait")) { /* * If the boot option of "idle=nomwait" is added, - * it means that mwait will be disabled for CPU C2/C3 - * states. In such case it won't touch the variable - * of boot_option_idle_override. + * it means that mwait will be disabled for CPU C1/C2/C3 + * states. */ boot_option_idle_override = IDLE_NOMWAIT; } else diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index b478edf43bec..3a5b0c9c4fcc 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -219,9 +219,10 @@ static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt return ES_VMM_ERROR; } -enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, - struct es_em_ctxt *ctxt, u64 exit_code, - u64 exit_info_1, u64 exit_info_2) +static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) { /* Fill in protocol and format specifiers */ ghcb->protocol_version = ghcb_version; @@ -231,14 +232,7 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, ghcb_set_sw_exit_info_1(ghcb, exit_info_1); ghcb_set_sw_exit_info_2(ghcb, exit_info_2); - /* - * Hyper-V unenlightened guests use a paravisor for communicating and - * GHCB pages are being allocated and set up by that paravisor. Linux - * should not change the GHCB page's physical address. - */ - if (set_ghcb_msr) - sev_es_wr_ghcb_msr(__pa(ghcb)); - + sev_es_wr_ghcb_msr(__pa(ghcb)); VMGEXIT(); return verify_exception_info(ghcb, ctxt); @@ -795,7 +789,7 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) */ sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); ghcb_set_sw_scratch(ghcb, sw_scratch); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_IOIO, + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, exit_info_2); if (ret != ES_OK) return ret; @@ -837,8 +831,7 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) ghcb_set_rax(ghcb, rax); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, - SVM_EXIT_IOIO, exit_info_1, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); if (ret != ES_OK) return ret; @@ -894,7 +887,7 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb, /* xgetbv will cause #GP - use reset value for xcr0 */ ghcb_set_xcr0(ghcb, 1); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_CPUID, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); if (ret != ES_OK) return ret; @@ -919,7 +912,7 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); enum es_result ret; - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); if (ret != ES_OK) return ret; diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index c05f0124c410..63dc626627a0 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -786,7 +786,7 @@ static int vmgexit_psc(struct snp_psc_desc *desc) ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); /* This will advance the shared buffer data points to. */ - ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, SVM_VMGEXIT_PSC, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); /* * Page State Change VMGEXIT can pass error code through @@ -1212,8 +1212,7 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) ghcb_set_rdx(ghcb, regs->dx); } - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_MSR, - exit_info_1, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); if ((ret == ES_OK) && (!exit_info_1)) { regs->ax = ghcb->save.rax; @@ -1452,7 +1451,7 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); - return sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, exit_info_1, exit_info_2); + return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); } /* @@ -1628,7 +1627,7 @@ static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, /* Using a value of 0 for ExitInfo1 means RAX holds the value */ ghcb_set_rax(ghcb, val); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); if (ret != ES_OK) return ret; @@ -1658,7 +1657,7 @@ static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { - return sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WBINVD, 0, 0); + return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); } static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) @@ -1667,7 +1666,7 @@ static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt ghcb_set_rcx(ghcb, ctxt->regs->cx); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_RDPMC, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); if (ret != ES_OK) return ret; @@ -1708,7 +1707,7 @@ static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, if (x86_platform.hyper.sev_es_hcall_prepare) x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_VMMCALL, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); if (ret != ES_OK) return ret; @@ -2197,7 +2196,7 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned ghcb_set_rbx(ghcb, input->data_npages); } - ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, exit_code, input->req_gpa, input->resp_gpa); + ret = sev_es_ghcb_hv_call(ghcb, &ctxt, exit_code, input->req_gpa, input->resp_gpa); if (ret) goto e_put; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 143e37298d8a..e5fa335a4ea7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6029,6 +6029,11 @@ split_irqchip_unlock: r = 0; break; case KVM_CAP_X86_USER_SPACE_MSR: + r = -EINVAL; + if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL | + KVM_MSR_EXIT_REASON_UNKNOWN | + KVM_MSR_EXIT_REASON_FILTER)) + break; kvm->arch.user_space_msr_mask = cap->args[0]; r = 0; break; @@ -6183,6 +6188,9 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) return -EFAULT; + if (filter.flags & ~KVM_MSR_FILTER_DEFAULT_DENY) + return -EINVAL; + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) empty &= !filter.ranges[i].nmsrs; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 57ba5502aecf..82a042c03824 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -856,7 +856,7 @@ int devmem_is_allowed(unsigned long pagenr) /* * This must follow RAM test, since System RAM is considered a - * restricted resource under CONFIG_STRICT_IOMEM. + * restricted resource under CONFIG_STRICT_DEVMEM. */ if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) { /* Low 1MB bypasses iomem restrictions. */ diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index e44e938885b7..7418c367e328 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -110,7 +110,7 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey return vma_pkey(vma); } -#define PKRU_AD_KEY(pkey) (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY)) +#define PKRU_AD_MASK(pkey) (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY)) /* * Make the default PKRU value (at execve() time) as restrictive @@ -118,11 +118,14 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey * in the process's lifetime will not accidentally get access * to data which is pkey-protected later on. */ -u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | - PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) | - PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) | - PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) | - PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15); +u32 init_pkru_value = PKRU_AD_MASK( 1) | PKRU_AD_MASK( 2) | + PKRU_AD_MASK( 3) | PKRU_AD_MASK( 4) | + PKRU_AD_MASK( 5) | PKRU_AD_MASK( 6) | + PKRU_AD_MASK( 7) | PKRU_AD_MASK( 8) | + PKRU_AD_MASK( 9) | PKRU_AD_MASK(10) | + PKRU_AD_MASK(11) | PKRU_AD_MASK(12) | + PKRU_AD_MASK(13) | PKRU_AD_MASK(14) | + PKRU_AD_MASK(15); static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index d400b6d9d246..c1e31e9a85d7 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -734,10 +734,10 @@ static void flush_tlb_func(void *info) const struct flush_tlb_info *f = info; struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); bool local = smp_processor_id() == f->initiating_cpu; unsigned long nr_invalidate = 0; + u64 mm_tlb_gen; /* This code cannot presently handle being reentered. */ VM_WARN_ON(!irqs_disabled()); @@ -771,6 +771,23 @@ static void flush_tlb_func(void *info) return; } + if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID && + f->new_tlb_gen <= local_tlb_gen)) { + /* + * The TLB is already up to date in respect to f->new_tlb_gen. + * While the core might be still behind mm_tlb_gen, checking + * mm_tlb_gen unnecessarily would have negative caching effects + * so avoid it. + */ + return; + } + + /* + * Defer mm_tlb_gen reading as long as possible to avoid cache + * contention. + */ + mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); + if (unlikely(local_tlb_gen == mm_tlb_gen)) { /* * There's nothing to do: we're already up to date. This can @@ -827,6 +844,12 @@ static void flush_tlb_func(void *info) /* Partial flush */ unsigned long addr = f->start; + /* Partial flush cannot have invalid generations */ + VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID); + + /* Partial flush must have valid mm */ + VM_WARN_ON(f->mm == NULL); + nr_invalidate = (f->end - f->start) >> f->stride_shift; while (addr < f->end) { @@ -1029,7 +1052,8 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) struct flush_tlb_info *info; preempt_disable(); - info = get_flush_tlb_info(NULL, start, end, 0, false, 0); + info = get_flush_tlb_info(NULL, start, end, 0, false, + TLB_GENERATION_INVALID); on_each_cpu(do_kernel_range_flush, info, 1); @@ -1198,7 +1222,8 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) int cpu = get_cpu(); - info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, 0); + info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, + TLB_GENERATION_INVALID); /* * flush_tlb_multi() is not optimized for the common case in which only * a local TLB flush is needed. Optimize this use-case by calling diff --git a/certs/Kconfig b/certs/Kconfig index 476755703cf8..bf9b511573d7 100644 --- a/certs/Kconfig +++ b/certs/Kconfig @@ -43,6 +43,7 @@ config SYSTEM_TRUSTED_KEYRING bool "Provide system-wide ring of trusted keys" depends on KEYS depends on ASYMMETRIC_KEY_TYPE + depends on X509_CERTIFICATE_PARSER help Provide a system keyring to which trusted keys can be added. Keys in the keyring are considered to be trusted. Keys may be added at will diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 6ff1901d7d43..3c6d4ef87be0 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -782,7 +782,8 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) if (!osc_cpc_flexible_adr_space_confirmed) { pr_debug("Flexible address space capability not supported\n"); - goto out_free; + if (!cpc_supported_by_cpu()) + goto out_free; } addr = ioremap(gas_t->address, gas_t->bit_width/8); @@ -809,7 +810,8 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) } if (!osc_cpc_flexible_adr_space_confirmed) { pr_debug("Flexible address space capability not supported\n"); - goto out_free; + if (!cpc_supported_by_cpu()) + goto out_free; } } else { if (gas_t->space_id != ACPI_ADR_SPACE_FIXED_HARDWARE || !cpc_ffh_supported()) { diff --git a/drivers/clk/clk-lan966x.c b/drivers/clk/clk-lan966x.c index d1535ac13e89..81cb90955d68 100644 --- a/drivers/clk/clk-lan966x.c +++ b/drivers/clk/clk-lan966x.c @@ -213,7 +213,7 @@ static int lan966x_gate_clk_register(struct device *dev, hw_data->hws[i] = devm_clk_hw_register_gate(dev, clk_gate_desc[idx].name, - "lan966x", 0, base, + "lan966x", 0, gate_base, clk_gate_desc[idx].bit_idx, 0, &clk_gate_lock); diff --git a/drivers/clk/sunxi-ng/ccu-sun50i-h6-r.c b/drivers/clk/sunxi-ng/ccu-sun50i-h6-r.c index 29a8c710ae06..b7962e5149a5 100644 --- a/drivers/clk/sunxi-ng/ccu-sun50i-h6-r.c +++ b/drivers/clk/sunxi-ng/ccu-sun50i-h6-r.c @@ -138,6 +138,7 @@ static struct ccu_common *sun50i_h6_r_ccu_clks[] = { &r_apb2_rsb_clk.common, &r_apb1_ir_clk.common, &r_apb1_w1_clk.common, + &r_apb1_rtc_clk.common, &ir_clk.common, &w1_clk.common, }; diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index 59b0bedc9c24..c8fa7dcfdbd0 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -103,9 +103,14 @@ static void dimm_setup_label(struct dimm_info *dimm, u16 handle) dmi_memdev_name(handle, &bank, &device); - /* both strings must be non-zero */ - if (bank && *bank && device && *device) - snprintf(dimm->label, sizeof(dimm->label), "%s %s", bank, device); + /* + * Set to a NULL string when both bank and device are zero. In this case, + * the label assigned by default will be preserved. + */ + snprintf(dimm->label, sizeof(dimm->label), "%s%s%s", + (bank && *bank) ? bank : "", + (bank && *bank && device && *device) ? " " : "", + (device && *device) ? device : ""); } static void assign_dmi_dimm_info(struct dimm_info *dimm, struct memdev_dmi_entry *entry) diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index 1cee64b80a7e..f7d37c282819 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -514,6 +514,28 @@ static void handle_error(struct mem_ctl_info *mci, struct synps_ecc_status *p) memset(p, 0, sizeof(*p)); } +static void enable_intr(struct synps_edac_priv *priv) +{ + /* Enable UE/CE Interrupts */ + if (priv->p_data->quirks & DDR_ECC_INTR_SELF_CLEAR) + writel(DDR_UE_MASK | DDR_CE_MASK, + priv->baseaddr + ECC_CLR_OFST); + else + writel(DDR_QOSUE_MASK | DDR_QOSCE_MASK, + priv->baseaddr + DDR_QOS_IRQ_EN_OFST); + +} + +static void disable_intr(struct synps_edac_priv *priv) +{ + /* Disable UE/CE Interrupts */ + if (priv->p_data->quirks & DDR_ECC_INTR_SELF_CLEAR) + writel(0x0, priv->baseaddr + ECC_CLR_OFST); + else + writel(DDR_QOSUE_MASK | DDR_QOSCE_MASK, + priv->baseaddr + DDR_QOS_IRQ_DB_OFST); +} + /** * intr_handler - Interrupt Handler for ECC interrupts. * @irq: IRQ number. @@ -555,6 +577,9 @@ static irqreturn_t intr_handler(int irq, void *dev_id) /* v3.0 of the controller does not have this register */ if (!(priv->p_data->quirks & DDR_ECC_INTR_SELF_CLEAR)) writel(regval, priv->baseaddr + DDR_QOS_IRQ_STAT_OFST); + else + enable_intr(priv); + return IRQ_HANDLED; } @@ -837,25 +862,6 @@ static void mc_init(struct mem_ctl_info *mci, struct platform_device *pdev) init_csrows(mci); } -static void enable_intr(struct synps_edac_priv *priv) -{ - /* Enable UE/CE Interrupts */ - if (priv->p_data->quirks & DDR_ECC_INTR_SELF_CLEAR) - writel(DDR_UE_MASK | DDR_CE_MASK, - priv->baseaddr + ECC_CLR_OFST); - else - writel(DDR_QOSUE_MASK | DDR_QOSCE_MASK, - priv->baseaddr + DDR_QOS_IRQ_EN_OFST); - -} - -static void disable_intr(struct synps_edac_priv *priv) -{ - /* Disable UE/CE Interrupts */ - writel(DDR_QOSUE_MASK | DDR_QOSCE_MASK, - priv->baseaddr + DDR_QOS_IRQ_DB_OFST); -} - static int setup_irq(struct mem_ctl_info *mci, struct platform_device *pdev) { diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index 08bc52c3cdcb..ecd7d169470b 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -351,6 +351,9 @@ static const struct regmap_config pca953x_i2c_regmap = { .reg_bits = 8, .val_bits = 8, + .use_single_read = true, + .use_single_write = true, + .readable_reg = pca953x_readable_register, .writeable_reg = pca953x_writeable_register, .volatile_reg = pca953x_volatile_register, @@ -906,15 +909,18 @@ static int pca953x_irq_setup(struct pca953x_chip *chip, static int device_pca95xx_init(struct pca953x_chip *chip, u32 invert) { DECLARE_BITMAP(val, MAX_LINE); + u8 regaddr; int ret; - ret = regcache_sync_region(chip->regmap, chip->regs->output, - chip->regs->output + NBANK(chip)); + regaddr = pca953x_recalc_addr(chip, chip->regs->output, 0); + ret = regcache_sync_region(chip->regmap, regaddr, + regaddr + NBANK(chip) - 1); if (ret) goto out; - ret = regcache_sync_region(chip->regmap, chip->regs->direction, - chip->regs->direction + NBANK(chip)); + regaddr = pca953x_recalc_addr(chip, chip->regs->direction, 0); + ret = regcache_sync_region(chip->regmap, regaddr, + regaddr + NBANK(chip) - 1); if (ret) goto out; @@ -1127,14 +1133,14 @@ static int pca953x_regcache_sync(struct device *dev) * sync these registers first and only then sync the rest. */ regaddr = pca953x_recalc_addr(chip, chip->regs->direction, 0); - ret = regcache_sync_region(chip->regmap, regaddr, regaddr + NBANK(chip)); + ret = regcache_sync_region(chip->regmap, regaddr, regaddr + NBANK(chip) - 1); if (ret) { dev_err(dev, "Failed to sync GPIO dir registers: %d\n", ret); return ret; } regaddr = pca953x_recalc_addr(chip, chip->regs->output, 0); - ret = regcache_sync_region(chip->regmap, regaddr, regaddr + NBANK(chip)); + ret = regcache_sync_region(chip->regmap, regaddr, regaddr + NBANK(chip) - 1); if (ret) { dev_err(dev, "Failed to sync GPIO out registers: %d\n", ret); return ret; @@ -1144,7 +1150,7 @@ static int pca953x_regcache_sync(struct device *dev) if (chip->driver_data & PCA_PCAL) { regaddr = pca953x_recalc_addr(chip, PCAL953X_IN_LATCH, 0); ret = regcache_sync_region(chip->regmap, regaddr, - regaddr + NBANK(chip)); + regaddr + NBANK(chip) - 1); if (ret) { dev_err(dev, "Failed to sync INT latch registers: %d\n", ret); @@ -1153,7 +1159,7 @@ static int pca953x_regcache_sync(struct device *dev) regaddr = pca953x_recalc_addr(chip, PCAL953X_INT_MASK, 0); ret = regcache_sync_region(chip->regmap, regaddr, - regaddr + NBANK(chip)); + regaddr + NBANK(chip) - 1); if (ret) { dev_err(dev, "Failed to sync INT mask registers: %d\n", ret); diff --git a/drivers/gpio/gpio-xilinx.c b/drivers/gpio/gpio-xilinx.c index b6d3a57e27ed..7f8e2fed2988 100644 --- a/drivers/gpio/gpio-xilinx.c +++ b/drivers/gpio/gpio-xilinx.c @@ -99,7 +99,7 @@ static inline void xgpio_set_value32(unsigned long *map, int bit, u32 v) const unsigned long offset = (bit % BITS_PER_LONG) & BIT(5); map[index] &= ~(0xFFFFFFFFul << offset); - map[index] |= v << offset; + map[index] |= (unsigned long)v << offset; } static inline int xgpio_regoffset(struct xgpio_instance *chip, int ch) diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index 0c9a63becfef..b26e64338376 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -421,6 +421,10 @@ out_free_lh: * @work: the worker that implements software debouncing * @sw_debounced: flag indicating if the software debouncer is active * @level: the current debounced physical level of the line + * @hdesc: the Hardware Timestamp Engine (HTE) descriptor + * @raw_level: the line level at the time of event + * @total_discard_seq: the running counter of the discarded events + * @last_seqno: the last sequence number before debounce period expires */ struct line { struct gpio_desc *desc; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 6b6d46e29e6e..4608599ba6bb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1364,16 +1364,10 @@ void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, struct amdgpu_vm *vm) { struct amdkfd_process_info *process_info = vm->process_info; - struct amdgpu_bo *pd = vm->root.bo; if (!process_info) return; - /* Release eviction fence from PD */ - amdgpu_bo_reserve(pd, false); - amdgpu_bo_fence(pd, NULL, false); - amdgpu_bo_unreserve(pd); - /* Update process info */ mutex_lock(&process_info->lock); process_info->n_vms--; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c index 714178f1b6c6..2168163aad2d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c @@ -40,7 +40,7 @@ static void amdgpu_bo_list_free_rcu(struct rcu_head *rcu) { struct amdgpu_bo_list *list = container_of(rcu, struct amdgpu_bo_list, rhead); - + mutex_destroy(&list->bo_list_mutex); kvfree(list); } @@ -136,6 +136,7 @@ int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp, trace_amdgpu_cs_bo_status(list->num_entries, total_size); + mutex_init(&list->bo_list_mutex); *result = list; return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h index 529d52a204cf..9caea1688fc3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h @@ -47,6 +47,10 @@ struct amdgpu_bo_list { struct amdgpu_bo *oa_obj; unsigned first_userptr; unsigned num_entries; + + /* Protect access during command submission. + */ + struct mutex bo_list_mutex; }; int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index b28af04b0c3e..d8f1335bc68f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -519,6 +519,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, return r; } + mutex_lock(&p->bo_list->bo_list_mutex); + /* One for TTM and one for the CS job */ amdgpu_bo_list_for_each_entry(e, p->bo_list) e->tv.num_shared = 2; @@ -651,6 +653,7 @@ out_free_user_pages: kvfree(e->user_pages); e->user_pages = NULL; } + mutex_unlock(&p->bo_list->bo_list_mutex); } return r; } @@ -690,9 +693,11 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, { unsigned i; - if (error && backoff) + if (error && backoff) { ttm_eu_backoff_reservation(&parser->ticket, &parser->validated); + mutex_unlock(&parser->bo_list->bo_list_mutex); + } for (i = 0; i < parser->num_post_deps; i++) { drm_syncobj_put(parser->post_deps[i].syncobj); @@ -832,12 +837,16 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) continue; r = amdgpu_vm_bo_update(adev, bo_va, false); - if (r) + if (r) { + mutex_unlock(&p->bo_list->bo_list_mutex); return r; + } r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update); - if (r) + if (r) { + mutex_unlock(&p->bo_list->bo_list_mutex); return r; + } } r = amdgpu_vm_handle_moved(adev, vm); @@ -1278,6 +1287,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence); mutex_unlock(&p->adev->notifier_lock); + mutex_unlock(&p->bo_list->bo_list_mutex); return 0; diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig index 0ba0598eba20..ec6771e87e73 100644 --- a/drivers/gpu/drm/amd/display/Kconfig +++ b/drivers/gpu/drm/amd/display/Kconfig @@ -6,7 +6,7 @@ config DRM_AMD_DC bool "AMD DC - Enable new display engine" default y select SND_HDA_COMPONENT if SND_HDA_CORE - select DRM_AMD_DC_DCN if X86 && !(KCOV_INSTRUMENT_ALL && KCOV_ENABLE_COMPARISONS) + select DRM_AMD_DC_DCN if (X86 || PPC_LONG_DOUBLE_128) && !(KCOV_INSTRUMENT_ALL && KCOV_ENABLE_COMPARISONS) help Choose this option if you want to use the new display engine support for AMDGPU. This adds required support for Vega and diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 93ac33a8de9a..3087dd1a1856 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1653,7 +1653,7 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) #if defined(CONFIG_DRM_AMD_SECURE_DISPLAY) adev->dm.crc_rd_wrk = amdgpu_dm_crtc_secure_display_create_work(); #endif - if (dc_enable_dmub_notifications(adev->dm.dc)) { + if (dc_is_dmub_outbox_supported(adev->dm.dc)) { init_completion(&adev->dm.dmub_aux_transfer_done); adev->dm.dmub_notify = kzalloc(sizeof(struct dmub_notification), GFP_KERNEL); if (!adev->dm.dmub_notify) { @@ -1689,6 +1689,13 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) goto error; } + /* Enable outbox notification only after IRQ handlers are registered and DMUB is alive. + * It is expected that DMUB will resend any pending notifications at this point, for + * example HPD from DPIA. + */ + if (dc_is_dmub_outbox_supported(adev->dm.dc)) + dc_enable_dmub_outbox(adev->dm.dc); + /* create fake encoders for MST */ dm_dp_create_fake_mst_encoders(adev); @@ -2678,9 +2685,6 @@ static int dm_resume(void *handle) */ link_enc_cfg_copy(adev->dm.dc->current_state, dc_state); - if (dc_enable_dmub_notifications(adev->dm.dc)) - amdgpu_dm_outbox_init(adev); - r = dm_dmub_hw_init(adev); if (r) DRM_ERROR("DMUB interface failed to initialize: status=%d\n", r); @@ -2698,6 +2702,11 @@ static int dm_resume(void *handle) } } + if (dc_is_dmub_outbox_supported(adev->dm.dc)) { + amdgpu_dm_outbox_init(adev); + dc_enable_dmub_outbox(adev->dm.dc); + } + WARN_ON(!dc_commit_state(dm->dc, dc_state)); dm_gpureset_commit_state(dm->cached_dc_state, dm); @@ -2719,13 +2728,15 @@ static int dm_resume(void *handle) /* TODO: Remove dc_state->dccg, use dc->dccg directly. */ dc_resource_state_construct(dm->dc, dm_state->context); - /* Re-enable outbox interrupts for DPIA. */ - if (dc_enable_dmub_notifications(adev->dm.dc)) - amdgpu_dm_outbox_init(adev); - /* Before powering on DC we need to re-initialize DMUB. */ dm_dmub_hw_resume(adev); + /* Re-enable outbox interrupts for DPIA. */ + if (dc_is_dmub_outbox_supported(adev->dm.dc)) { + amdgpu_dm_outbox_init(adev); + dc_enable_dmub_outbox(adev->dm.dc); + } + /* power on hardware */ dc_set_power_state(dm->dc, DC_ACPI_CM_POWER_STATE_D0); diff --git a/drivers/gpu/drm/drm_gem_ttm_helper.c b/drivers/gpu/drm/drm_gem_ttm_helper.c index d5962a34c01d..e5fc875990c4 100644 --- a/drivers/gpu/drm/drm_gem_ttm_helper.c +++ b/drivers/gpu/drm/drm_gem_ttm_helper.c @@ -64,8 +64,13 @@ int drm_gem_ttm_vmap(struct drm_gem_object *gem, struct iosys_map *map) { struct ttm_buffer_object *bo = drm_gem_ttm_of_gem(gem); + int ret; + + dma_resv_lock(gem->resv, NULL); + ret = ttm_bo_vmap(bo, map); + dma_resv_unlock(gem->resv); - return ttm_bo_vmap(bo, map); + return ret; } EXPORT_SYMBOL(drm_gem_ttm_vmap); @@ -82,7 +87,9 @@ void drm_gem_ttm_vunmap(struct drm_gem_object *gem, { struct ttm_buffer_object *bo = drm_gem_ttm_of_gem(gem); + dma_resv_lock(gem->resv, NULL); ttm_bo_vunmap(bo, map); + dma_resv_unlock(gem->resv); } EXPORT_SYMBOL(drm_gem_ttm_vunmap); diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index 09f82545789f..44e7339e7a4a 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -273,10 +273,17 @@ struct intel_context { u8 child_index; /** @guc: GuC specific members for parallel submission */ struct { - /** @wqi_head: head pointer in work queue */ + /** @wqi_head: cached head pointer in work queue */ u16 wqi_head; - /** @wqi_tail: tail pointer in work queue */ + /** @wqi_tail: cached tail pointer in work queue */ u16 wqi_tail; + /** @wq_head: pointer to the actual head in work queue */ + u32 *wq_head; + /** @wq_tail: pointer to the actual head in work queue */ + u32 *wq_tail; + /** @wq_status: pointer to the status in work queue */ + u32 *wq_status; + /** * @parent_page: page in context state (ce->state) used * by parent for work queue, process descriptor diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index 1431f1e9dbee..04e435bce79b 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -201,6 +201,8 @@ int intel_ring_submission_setup(struct intel_engine_cs *engine); int intel_engine_stop_cs(struct intel_engine_cs *engine); void intel_engine_cancel_stop_cs(struct intel_engine_cs *engine); +void intel_engine_wait_for_pending_mi_fw(struct intel_engine_cs *engine); + void intel_engine_set_hwsp_writemask(struct intel_engine_cs *engine, u32 mask); u64 intel_engine_get_active_head(const struct intel_engine_cs *engine); diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 14c6ddbbfde8..5b6ce10cb158 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -1282,10 +1282,10 @@ static int __intel_engine_stop_cs(struct intel_engine_cs *engine, intel_uncore_write_fw(uncore, mode, _MASKED_BIT_ENABLE(STOP_RING)); /* - * Wa_22011802037 : gen12, Prior to doing a reset, ensure CS is + * Wa_22011802037 : gen11, gen12, Prior to doing a reset, ensure CS is * stopped, set ring stop bit and prefetch disable bit to halt CS */ - if (GRAPHICS_VER(engine->i915) == 12) + if (IS_GRAPHICS_VER(engine->i915, 11, 12)) intel_uncore_write_fw(uncore, RING_MODE_GEN7(engine->mmio_base), _MASKED_BIT_ENABLE(GEN12_GFX_PREFETCH_DISABLE)); @@ -1308,6 +1308,18 @@ int intel_engine_stop_cs(struct intel_engine_cs *engine) return -ENODEV; ENGINE_TRACE(engine, "\n"); + /* + * TODO: Find out why occasionally stopping the CS times out. Seen + * especially with gem_eio tests. + * + * Occasionally trying to stop the cs times out, but does not adversely + * affect functionality. The timeout is set as a config parameter that + * defaults to 100ms. In most cases the follow up operation is to wait + * for pending MI_FORCE_WAKES. The assumption is that this timeout is + * sufficient for any pending MI_FORCEWAKEs to complete. Once root + * caused, the caller must check and handle the return from this + * function. + */ if (__intel_engine_stop_cs(engine, 1000, stop_timeout(engine))) { ENGINE_TRACE(engine, "timed out on STOP_RING -> IDLE; HEAD:%04x, TAIL:%04x\n", @@ -1334,6 +1346,78 @@ void intel_engine_cancel_stop_cs(struct intel_engine_cs *engine) ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); } +static u32 __cs_pending_mi_force_wakes(struct intel_engine_cs *engine) +{ + static const i915_reg_t _reg[I915_NUM_ENGINES] = { + [RCS0] = MSG_IDLE_CS, + [BCS0] = MSG_IDLE_BCS, + [VCS0] = MSG_IDLE_VCS0, + [VCS1] = MSG_IDLE_VCS1, + [VCS2] = MSG_IDLE_VCS2, + [VCS3] = MSG_IDLE_VCS3, + [VCS4] = MSG_IDLE_VCS4, + [VCS5] = MSG_IDLE_VCS5, + [VCS6] = MSG_IDLE_VCS6, + [VCS7] = MSG_IDLE_VCS7, + [VECS0] = MSG_IDLE_VECS0, + [VECS1] = MSG_IDLE_VECS1, + [VECS2] = MSG_IDLE_VECS2, + [VECS3] = MSG_IDLE_VECS3, + [CCS0] = MSG_IDLE_CS, + [CCS1] = MSG_IDLE_CS, + [CCS2] = MSG_IDLE_CS, + [CCS3] = MSG_IDLE_CS, + }; + u32 val; + + if (!_reg[engine->id].reg) { + drm_err(&engine->i915->drm, + "MSG IDLE undefined for engine id %u\n", engine->id); + return 0; + } + + val = intel_uncore_read(engine->uncore, _reg[engine->id]); + + /* bits[29:25] & bits[13:9] >> shift */ + return (val & (val >> 16) & MSG_IDLE_FW_MASK) >> MSG_IDLE_FW_SHIFT; +} + +static void __gpm_wait_for_fw_complete(struct intel_gt *gt, u32 fw_mask) +{ + int ret; + + /* Ensure GPM receives fw up/down after CS is stopped */ + udelay(1); + + /* Wait for forcewake request to complete in GPM */ + ret = __intel_wait_for_register_fw(gt->uncore, + GEN9_PWRGT_DOMAIN_STATUS, + fw_mask, fw_mask, 5000, 0, NULL); + + /* Ensure CS receives fw ack from GPM */ + udelay(1); + + if (ret) + GT_TRACE(gt, "Failed to complete pending forcewake %d\n", ret); +} + +/* + * Wa_22011802037:gen12: In addition to stopping the cs, we need to wait for any + * pending MI_FORCE_WAKEUP requests that the CS has initiated to complete. The + * pending status is indicated by bits[13:9] (masked by bits[29:25]) in the + * MSG_IDLE register. There's one MSG_IDLE register per reset domain. Since we + * are concerned only with the gt reset here, we use a logical OR of pending + * forcewakeups from all reset domains and then wait for them to complete by + * querying PWRGT_DOMAIN_STATUS. + */ +void intel_engine_wait_for_pending_mi_fw(struct intel_engine_cs *engine) +{ + u32 fw_pending = __cs_pending_mi_force_wakes(engine); + + if (fw_pending) + __gpm_wait_for_fw_complete(engine->gt, fw_pending); +} + static u32 read_subslice_reg(const struct intel_engine_cs *engine, int slice, int subslice, i915_reg_t reg) diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c index 86f7a9ac1c39..0627fa10d2dc 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@ -661,6 +661,16 @@ static inline void execlists_schedule_out(struct i915_request *rq) i915_request_put(rq); } +static u32 map_i915_prio_to_lrc_desc_prio(int prio) +{ + if (prio > I915_PRIORITY_NORMAL) + return GEN12_CTX_PRIORITY_HIGH; + else if (prio < I915_PRIORITY_NORMAL) + return GEN12_CTX_PRIORITY_LOW; + else + return GEN12_CTX_PRIORITY_NORMAL; +} + static u64 execlists_update_context(struct i915_request *rq) { struct intel_context *ce = rq->context; @@ -669,7 +679,7 @@ static u64 execlists_update_context(struct i915_request *rq) desc = ce->lrc.desc; if (rq->engine->flags & I915_ENGINE_HAS_EU_PRIORITY) - desc |= lrc_desc_priority(rq_prio(rq)); + desc |= map_i915_prio_to_lrc_desc_prio(rq_prio(rq)); /* * WaIdleLiteRestore:bdw,skl @@ -2958,6 +2968,13 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine) ring_set_paused(engine, 1); intel_engine_stop_cs(engine); + /* + * Wa_22011802037:gen11/gen12: In addition to stopping the cs, we need + * to wait for any pending mi force wakeups + */ + if (IS_GRAPHICS_VER(engine->i915, 11, 12)) + intel_engine_wait_for_pending_mi_fw(engine); + engine->execlists.reset_ccid = active_ccid(engine); } diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.h b/drivers/gpu/drm/i915/gt/intel_lrc.h index 31be734010db..a390f0813c8b 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.h +++ b/drivers/gpu/drm/i915/gt/intel_lrc.h @@ -111,16 +111,6 @@ enum { #define XEHP_SW_COUNTER_SHIFT 58 #define XEHP_SW_COUNTER_WIDTH 6 -static inline u32 lrc_desc_priority(int prio) -{ - if (prio > I915_PRIORITY_NORMAL) - return GEN12_CTX_PRIORITY_HIGH; - else if (prio < I915_PRIORITY_NORMAL) - return GEN12_CTX_PRIORITY_LOW; - else - return GEN12_CTX_PRIORITY_NORMAL; -} - static inline void lrc_runtime_start(struct intel_context *ce) { struct intel_context_stats *stats = &ce->stats; diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h index 4ef9990ed7f8..29ef8afc8c2e 100644 --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h @@ -122,6 +122,9 @@ enum intel_guc_action { INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_DONE = 0x1002, INTEL_GUC_ACTION_SCHED_ENGINE_MODE_SET = 0x1003, INTEL_GUC_ACTION_SCHED_ENGINE_MODE_DONE = 0x1004, + INTEL_GUC_ACTION_V69_SET_CONTEXT_PRIORITY = 0x1005, + INTEL_GUC_ACTION_V69_SET_CONTEXT_EXECUTION_QUANTUM = 0x1006, + INTEL_GUC_ACTION_V69_SET_CONTEXT_PREEMPTION_TIMEOUT = 0x1007, INTEL_GUC_ACTION_CONTEXT_RESET_NOTIFICATION = 0x1008, INTEL_GUC_ACTION_ENGINE_FAILURE_NOTIFICATION = 0x1009, INTEL_GUC_ACTION_HOST2GUC_UPDATE_CONTEXT_POLICIES = 0x100B, diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c index 2c4ad4a65089..8c6885f43d1a 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c @@ -310,8 +310,8 @@ static u32 guc_ctl_wa_flags(struct intel_guc *guc) if (IS_DG2(gt->i915)) flags |= GUC_WA_DUAL_QUEUE; - /* Wa_22011802037: graphics version 12 */ - if (GRAPHICS_VER(gt->i915) == 12) + /* Wa_22011802037: graphics version 11/12 */ + if (IS_GRAPHICS_VER(gt->i915, 11, 12)) flags |= GUC_WA_PRE_PARSER; /* Wa_16011777198:dg2 */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h index 966e69a8b1c1..9feda105f913 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h @@ -170,6 +170,11 @@ struct intel_guc { /** @ads_engine_usage_size: size of engine usage in the ADS */ u32 ads_engine_usage_size; + /** @lrc_desc_pool_v69: object allocated to hold the GuC LRC descriptor pool */ + struct i915_vma *lrc_desc_pool_v69; + /** @lrc_desc_pool_vaddr_v69: contents of the GuC LRC descriptor pool */ + void *lrc_desc_pool_vaddr_v69; + /** * @context_lookup: used to resolve intel_context from guc_id, if a * context is present in this structure it is registered with the GuC diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h index 42cb7a9a6199..89a7e5ec0614 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h @@ -203,6 +203,20 @@ struct guc_wq_item { u32 fence_id; } __packed; +struct guc_process_desc_v69 { + u32 stage_id; + u64 db_base_addr; + u32 head; + u32 tail; + u32 error_offset; + u64 wq_base_addr; + u32 wq_size_bytes; + u32 wq_status; + u32 engine_presence; + u32 priority; + u32 reserved[36]; +} __packed; + struct guc_sched_wq_desc { u32 head; u32 tail; @@ -227,6 +241,37 @@ struct guc_ctxt_registration_info { }; #define CONTEXT_REGISTRATION_FLAG_KMD BIT(0) +/* Preempt to idle on quantum expiry */ +#define CONTEXT_POLICY_FLAG_PREEMPT_TO_IDLE_V69 BIT(0) + +/* + * GuC Context registration descriptor. + * FIXME: This is only required to exist during context registration. + * The current 1:1 between guc_lrc_desc and LRCs for the lifetime of the LRC + * is not required. + */ +struct guc_lrc_desc_v69 { + u32 hw_context_desc; + u32 slpm_perf_mode_hint; /* SPLC v1 only */ + u32 slpm_freq_hint; + u32 engine_submit_mask; /* In logical space */ + u8 engine_class; + u8 reserved0[3]; + u32 priority; + u32 process_desc; + u32 wq_addr; + u32 wq_size; + u32 context_flags; /* CONTEXT_REGISTRATION_* */ + /* Time for one workload to execute. (in micro seconds) */ + u32 execution_quantum; + /* Time to wait for a preemption request to complete before issuing a + * reset. (in micro seconds). + */ + u32 preemption_timeout; + u32 policy_flags; /* CONTEXT_POLICY_* */ + u32 reserved1[19]; +} __packed; + /* 32-bit KLV structure as used by policy updates and others */ struct guc_klv_generic_dw_t { u32 kl; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 1726f0f19901..2d9f5f1c79d3 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -414,12 +414,15 @@ struct sync_semaphore { }; struct parent_scratch { - struct guc_sched_wq_desc wq_desc; + union guc_descs { + struct guc_sched_wq_desc wq_desc; + struct guc_process_desc_v69 pdesc; + } descs; struct sync_semaphore go; struct sync_semaphore join[MAX_ENGINE_INSTANCE + 1]; - u8 unused[WQ_OFFSET - sizeof(struct guc_sched_wq_desc) - + u8 unused[WQ_OFFSET - sizeof(union guc_descs) - sizeof(struct sync_semaphore) * (MAX_ENGINE_INSTANCE + 2)]; u32 wq[WQ_SIZE / sizeof(u32)]; @@ -456,17 +459,23 @@ __get_parent_scratch(struct intel_context *ce) LRC_STATE_OFFSET) / sizeof(u32))); } +static struct guc_process_desc_v69 * +__get_process_desc_v69(struct intel_context *ce) +{ + struct parent_scratch *ps = __get_parent_scratch(ce); + + return &ps->descs.pdesc; +} + static struct guc_sched_wq_desc * -__get_wq_desc(struct intel_context *ce) +__get_wq_desc_v70(struct intel_context *ce) { struct parent_scratch *ps = __get_parent_scratch(ce); - return &ps->wq_desc; + return &ps->descs.wq_desc; } -static u32 *get_wq_pointer(struct guc_sched_wq_desc *wq_desc, - struct intel_context *ce, - u32 wqi_size) +static u32 *get_wq_pointer(struct intel_context *ce, u32 wqi_size) { /* * Check for space in work queue. Caching a value of head pointer in @@ -476,7 +485,7 @@ static u32 *get_wq_pointer(struct guc_sched_wq_desc *wq_desc, #define AVAILABLE_SPACE \ CIRC_SPACE(ce->parallel.guc.wqi_tail, ce->parallel.guc.wqi_head, WQ_SIZE) if (wqi_size > AVAILABLE_SPACE) { - ce->parallel.guc.wqi_head = READ_ONCE(wq_desc->head); + ce->parallel.guc.wqi_head = READ_ONCE(*ce->parallel.guc.wq_head); if (wqi_size > AVAILABLE_SPACE) return NULL; @@ -495,11 +504,55 @@ static inline struct intel_context *__get_context(struct intel_guc *guc, u32 id) return ce; } +static struct guc_lrc_desc_v69 *__get_lrc_desc_v69(struct intel_guc *guc, u32 index) +{ + struct guc_lrc_desc_v69 *base = guc->lrc_desc_pool_vaddr_v69; + + if (!base) + return NULL; + + GEM_BUG_ON(index >= GUC_MAX_CONTEXT_ID); + + return &base[index]; +} + +static int guc_lrc_desc_pool_create_v69(struct intel_guc *guc) +{ + u32 size; + int ret; + + size = PAGE_ALIGN(sizeof(struct guc_lrc_desc_v69) * + GUC_MAX_CONTEXT_ID); + ret = intel_guc_allocate_and_map_vma(guc, size, &guc->lrc_desc_pool_v69, + (void **)&guc->lrc_desc_pool_vaddr_v69); + if (ret) + return ret; + + return 0; +} + +static void guc_lrc_desc_pool_destroy_v69(struct intel_guc *guc) +{ + if (!guc->lrc_desc_pool_vaddr_v69) + return; + + guc->lrc_desc_pool_vaddr_v69 = NULL; + i915_vma_unpin_and_release(&guc->lrc_desc_pool_v69, I915_VMA_RELEASE_MAP); +} + static inline bool guc_submission_initialized(struct intel_guc *guc) { return guc->submission_initialized; } +static inline void _reset_lrc_desc_v69(struct intel_guc *guc, u32 id) +{ + struct guc_lrc_desc_v69 *desc = __get_lrc_desc_v69(guc, id); + + if (desc) + memset(desc, 0, sizeof(*desc)); +} + static inline bool ctx_id_mapped(struct intel_guc *guc, u32 id) { return __get_context(guc, id); @@ -526,6 +579,8 @@ static inline void clr_ctx_id_mapping(struct intel_guc *guc, u32 id) if (unlikely(!guc_submission_initialized(guc))) return; + _reset_lrc_desc_v69(guc, id); + /* * xarray API doesn't have xa_erase_irqsave wrapper, so calling * the lower level functions directly. @@ -611,7 +666,7 @@ int intel_guc_wait_for_idle(struct intel_guc *guc, long timeout) true, timeout); } -static int guc_context_policy_init(struct intel_context *ce, bool loop); +static int guc_context_policy_init_v70(struct intel_context *ce, bool loop); static int try_context_registration(struct intel_context *ce, bool loop); static int __guc_add_request(struct intel_guc *guc, struct i915_request *rq) @@ -639,7 +694,7 @@ static int __guc_add_request(struct intel_guc *guc, struct i915_request *rq) GEM_BUG_ON(context_guc_id_invalid(ce)); if (context_policy_required(ce)) { - err = guc_context_policy_init(ce, false); + err = guc_context_policy_init_v70(ce, false); if (err) return err; } @@ -737,9 +792,7 @@ static u32 wq_space_until_wrap(struct intel_context *ce) return (WQ_SIZE - ce->parallel.guc.wqi_tail); } -static void write_wqi(struct guc_sched_wq_desc *wq_desc, - struct intel_context *ce, - u32 wqi_size) +static void write_wqi(struct intel_context *ce, u32 wqi_size) { BUILD_BUG_ON(!is_power_of_2(WQ_SIZE)); @@ -750,13 +803,12 @@ static void write_wqi(struct guc_sched_wq_desc *wq_desc, ce->parallel.guc.wqi_tail = (ce->parallel.guc.wqi_tail + wqi_size) & (WQ_SIZE - 1); - WRITE_ONCE(wq_desc->tail, ce->parallel.guc.wqi_tail); + WRITE_ONCE(*ce->parallel.guc.wq_tail, ce->parallel.guc.wqi_tail); } static int guc_wq_noop_append(struct intel_context *ce) { - struct guc_sched_wq_desc *wq_desc = __get_wq_desc(ce); - u32 *wqi = get_wq_pointer(wq_desc, ce, wq_space_until_wrap(ce)); + u32 *wqi = get_wq_pointer(ce, wq_space_until_wrap(ce)); u32 len_dw = wq_space_until_wrap(ce) / sizeof(u32) - 1; if (!wqi) @@ -775,7 +827,6 @@ static int __guc_wq_item_append(struct i915_request *rq) { struct intel_context *ce = request_to_scheduling_context(rq); struct intel_context *child; - struct guc_sched_wq_desc *wq_desc = __get_wq_desc(ce); unsigned int wqi_size = (ce->parallel.number_children + 4) * sizeof(u32); u32 *wqi; @@ -795,7 +846,7 @@ static int __guc_wq_item_append(struct i915_request *rq) return ret; } - wqi = get_wq_pointer(wq_desc, ce, wqi_size); + wqi = get_wq_pointer(ce, wqi_size); if (!wqi) return -EBUSY; @@ -810,7 +861,7 @@ static int __guc_wq_item_append(struct i915_request *rq) for_each_child(ce, child) *wqi++ = child->ring->tail / sizeof(u64); - write_wqi(wq_desc, ce, wqi_size); + write_wqi(ce, wqi_size); return 0; } @@ -1527,87 +1578,18 @@ static void guc_reset_state(struct intel_context *ce, u32 head, bool scrub) lrc_update_regs(ce, engine, head); } -static u32 __cs_pending_mi_force_wakes(struct intel_engine_cs *engine) -{ - static const i915_reg_t _reg[I915_NUM_ENGINES] = { - [RCS0] = MSG_IDLE_CS, - [BCS0] = MSG_IDLE_BCS, - [VCS0] = MSG_IDLE_VCS0, - [VCS1] = MSG_IDLE_VCS1, - [VCS2] = MSG_IDLE_VCS2, - [VCS3] = MSG_IDLE_VCS3, - [VCS4] = MSG_IDLE_VCS4, - [VCS5] = MSG_IDLE_VCS5, - [VCS6] = MSG_IDLE_VCS6, - [VCS7] = MSG_IDLE_VCS7, - [VECS0] = MSG_IDLE_VECS0, - [VECS1] = MSG_IDLE_VECS1, - [VECS2] = MSG_IDLE_VECS2, - [VECS3] = MSG_IDLE_VECS3, - [CCS0] = MSG_IDLE_CS, - [CCS1] = MSG_IDLE_CS, - [CCS2] = MSG_IDLE_CS, - [CCS3] = MSG_IDLE_CS, - }; - u32 val; - - if (!_reg[engine->id].reg) - return 0; - - val = intel_uncore_read(engine->uncore, _reg[engine->id]); - - /* bits[29:25] & bits[13:9] >> shift */ - return (val & (val >> 16) & MSG_IDLE_FW_MASK) >> MSG_IDLE_FW_SHIFT; -} - -static void __gpm_wait_for_fw_complete(struct intel_gt *gt, u32 fw_mask) -{ - int ret; - - /* Ensure GPM receives fw up/down after CS is stopped */ - udelay(1); - - /* Wait for forcewake request to complete in GPM */ - ret = __intel_wait_for_register_fw(gt->uncore, - GEN9_PWRGT_DOMAIN_STATUS, - fw_mask, fw_mask, 5000, 0, NULL); - - /* Ensure CS receives fw ack from GPM */ - udelay(1); - - if (ret) - GT_TRACE(gt, "Failed to complete pending forcewake %d\n", ret); -} - -/* - * Wa_22011802037:gen12: In addition to stopping the cs, we need to wait for any - * pending MI_FORCE_WAKEUP requests that the CS has initiated to complete. The - * pending status is indicated by bits[13:9] (masked by bits[ 29:25]) in the - * MSG_IDLE register. There's one MSG_IDLE register per reset domain. Since we - * are concerned only with the gt reset here, we use a logical OR of pending - * forcewakeups from all reset domains and then wait for them to complete by - * querying PWRGT_DOMAIN_STATUS. - */ static void guc_engine_reset_prepare(struct intel_engine_cs *engine) { - u32 fw_pending; - - if (GRAPHICS_VER(engine->i915) != 12) + if (!IS_GRAPHICS_VER(engine->i915, 11, 12)) return; - /* - * Wa_22011802037 - * TODO: Occasionally trying to stop the cs times out, but does not - * adversely affect functionality. The timeout is set as a config - * parameter that defaults to 100ms. Assuming that this timeout is - * sufficient for any pending MI_FORCEWAKEs to complete, ignore the - * timeout returned here until it is root caused. - */ intel_engine_stop_cs(engine); - fw_pending = __cs_pending_mi_force_wakes(engine); - if (fw_pending) - __gpm_wait_for_fw_complete(engine->gt, fw_pending); + /* + * Wa_22011802037:gen11/gen12: In addition to stopping the cs, we need + * to wait for any pending mi force wakeups + */ + intel_engine_wait_for_pending_mi_fw(engine); } static void guc_reset_nop(struct intel_engine_cs *engine) @@ -1868,20 +1850,34 @@ static void reset_fail_worker_func(struct work_struct *w); int intel_guc_submission_init(struct intel_guc *guc) { struct intel_gt *gt = guc_to_gt(guc); + int ret; if (guc->submission_initialized) return 0; + if (guc->fw.major_ver_found < 70) { + ret = guc_lrc_desc_pool_create_v69(guc); + if (ret) + return ret; + } + guc->submission_state.guc_ids_bitmap = bitmap_zalloc(NUMBER_MULTI_LRC_GUC_ID(guc), GFP_KERNEL); - if (!guc->submission_state.guc_ids_bitmap) - return -ENOMEM; + if (!guc->submission_state.guc_ids_bitmap) { + ret = -ENOMEM; + goto destroy_pool; + } guc->timestamp.ping_delay = (POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ; guc->timestamp.shift = gpm_timestamp_shift(gt); guc->submission_initialized = true; return 0; + +destroy_pool: + guc_lrc_desc_pool_destroy_v69(guc); + + return ret; } void intel_guc_submission_fini(struct intel_guc *guc) @@ -1890,6 +1886,7 @@ void intel_guc_submission_fini(struct intel_guc *guc) return; guc_flush_destroyed_contexts(guc); + guc_lrc_desc_pool_destroy_v69(guc); i915_sched_engine_put(guc->sched_engine); bitmap_free(guc->submission_state.guc_ids_bitmap); guc->submission_initialized = false; @@ -2147,10 +2144,34 @@ static void unpin_guc_id(struct intel_guc *guc, struct intel_context *ce) spin_unlock_irqrestore(&guc->submission_state.lock, flags); } -static int __guc_action_register_multi_lrc(struct intel_guc *guc, - struct intel_context *ce, - struct guc_ctxt_registration_info *info, - bool loop) +static int __guc_action_register_multi_lrc_v69(struct intel_guc *guc, + struct intel_context *ce, + u32 guc_id, + u32 offset, + bool loop) +{ + struct intel_context *child; + u32 action[4 + MAX_ENGINE_INSTANCE]; + int len = 0; + + GEM_BUG_ON(ce->parallel.number_children > MAX_ENGINE_INSTANCE); + + action[len++] = INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC; + action[len++] = guc_id; + action[len++] = ce->parallel.number_children + 1; + action[len++] = offset; + for_each_child(ce, child) { + offset += sizeof(struct guc_lrc_desc_v69); + action[len++] = offset; + } + + return guc_submission_send_busy_loop(guc, action, len, 0, loop); +} + +static int __guc_action_register_multi_lrc_v70(struct intel_guc *guc, + struct intel_context *ce, + struct guc_ctxt_registration_info *info, + bool loop) { struct intel_context *child; u32 action[13 + (MAX_ENGINE_INSTANCE * 2)]; @@ -2190,9 +2211,24 @@ static int __guc_action_register_multi_lrc(struct intel_guc *guc, return guc_submission_send_busy_loop(guc, action, len, 0, loop); } -static int __guc_action_register_context(struct intel_guc *guc, - struct guc_ctxt_registration_info *info, - bool loop) +static int __guc_action_register_context_v69(struct intel_guc *guc, + u32 guc_id, + u32 offset, + bool loop) +{ + u32 action[] = { + INTEL_GUC_ACTION_REGISTER_CONTEXT, + guc_id, + offset, + }; + + return guc_submission_send_busy_loop(guc, action, ARRAY_SIZE(action), + 0, loop); +} + +static int __guc_action_register_context_v70(struct intel_guc *guc, + struct guc_ctxt_registration_info *info, + bool loop) { u32 action[] = { INTEL_GUC_ACTION_REGISTER_CONTEXT, @@ -2213,24 +2249,52 @@ static int __guc_action_register_context(struct intel_guc *guc, 0, loop); } -static void prepare_context_registration_info(struct intel_context *ce, - struct guc_ctxt_registration_info *info); +static void prepare_context_registration_info_v69(struct intel_context *ce); +static void prepare_context_registration_info_v70(struct intel_context *ce, + struct guc_ctxt_registration_info *info); -static int register_context(struct intel_context *ce, bool loop) +static int +register_context_v69(struct intel_guc *guc, struct intel_context *ce, bool loop) +{ + u32 offset = intel_guc_ggtt_offset(guc, guc->lrc_desc_pool_v69) + + ce->guc_id.id * sizeof(struct guc_lrc_desc_v69); + + prepare_context_registration_info_v69(ce); + + if (intel_context_is_parent(ce)) + return __guc_action_register_multi_lrc_v69(guc, ce, ce->guc_id.id, + offset, loop); + else + return __guc_action_register_context_v69(guc, ce->guc_id.id, + offset, loop); +} + +static int +register_context_v70(struct intel_guc *guc, struct intel_context *ce, bool loop) { struct guc_ctxt_registration_info info; + + prepare_context_registration_info_v70(ce, &info); + + if (intel_context_is_parent(ce)) + return __guc_action_register_multi_lrc_v70(guc, ce, &info, loop); + else + return __guc_action_register_context_v70(guc, &info, loop); +} + +static int register_context(struct intel_context *ce, bool loop) +{ struct intel_guc *guc = ce_to_guc(ce); int ret; GEM_BUG_ON(intel_context_is_child(ce)); trace_intel_context_register(ce); - prepare_context_registration_info(ce, &info); - - if (intel_context_is_parent(ce)) - ret = __guc_action_register_multi_lrc(guc, ce, &info, loop); + if (guc->fw.major_ver_found >= 70) + ret = register_context_v70(guc, ce, loop); else - ret = __guc_action_register_context(guc, &info, loop); + ret = register_context_v69(guc, ce, loop); + if (likely(!ret)) { unsigned long flags; @@ -2238,7 +2302,8 @@ static int register_context(struct intel_context *ce, bool loop) set_context_registered(ce); spin_unlock_irqrestore(&ce->guc_state.lock, flags); - guc_context_policy_init(ce, loop); + if (guc->fw.major_ver_found >= 70) + guc_context_policy_init_v70(ce, loop); } return ret; @@ -2335,7 +2400,7 @@ static int __guc_context_set_context_policies(struct intel_guc *guc, 0, loop); } -static int guc_context_policy_init(struct intel_context *ce, bool loop) +static int guc_context_policy_init_v70(struct intel_context *ce, bool loop) { struct intel_engine_cs *engine = ce->engine; struct intel_guc *guc = &engine->gt->uc.guc; @@ -2394,8 +2459,108 @@ static int guc_context_policy_init(struct intel_context *ce, bool loop) return ret; } -static void prepare_context_registration_info(struct intel_context *ce, - struct guc_ctxt_registration_info *info) +static void guc_context_policy_init_v69(struct intel_engine_cs *engine, + struct guc_lrc_desc_v69 *desc) +{ + desc->policy_flags = 0; + + if (engine->flags & I915_ENGINE_WANT_FORCED_PREEMPTION) + desc->policy_flags |= CONTEXT_POLICY_FLAG_PREEMPT_TO_IDLE_V69; + + /* NB: For both of these, zero means disabled. */ + desc->execution_quantum = engine->props.timeslice_duration_ms * 1000; + desc->preemption_timeout = engine->props.preempt_timeout_ms * 1000; +} + +static u32 map_guc_prio_to_lrc_desc_prio(u8 prio) +{ + /* + * this matches the mapping we do in map_i915_prio_to_guc_prio() + * (e.g. prio < I915_PRIORITY_NORMAL maps to GUC_CLIENT_PRIORITY_NORMAL) + */ + switch (prio) { + default: + MISSING_CASE(prio); + fallthrough; + case GUC_CLIENT_PRIORITY_KMD_NORMAL: + return GEN12_CTX_PRIORITY_NORMAL; + case GUC_CLIENT_PRIORITY_NORMAL: + return GEN12_CTX_PRIORITY_LOW; + case GUC_CLIENT_PRIORITY_HIGH: + case GUC_CLIENT_PRIORITY_KMD_HIGH: + return GEN12_CTX_PRIORITY_HIGH; + } +} + +static void prepare_context_registration_info_v69(struct intel_context *ce) +{ + struct intel_engine_cs *engine = ce->engine; + struct intel_guc *guc = &engine->gt->uc.guc; + u32 ctx_id = ce->guc_id.id; + struct guc_lrc_desc_v69 *desc; + struct intel_context *child; + + GEM_BUG_ON(!engine->mask); + + /* + * Ensure LRC + CT vmas are is same region as write barrier is done + * based on CT vma region. + */ + GEM_BUG_ON(i915_gem_object_is_lmem(guc->ct.vma->obj) != + i915_gem_object_is_lmem(ce->ring->vma->obj)); + + desc = __get_lrc_desc_v69(guc, ctx_id); + desc->engine_class = engine_class_to_guc_class(engine->class); + desc->engine_submit_mask = engine->logical_mask; + desc->hw_context_desc = ce->lrc.lrca; + desc->priority = ce->guc_state.prio; + desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD; + guc_context_policy_init_v69(engine, desc); + + /* + * If context is a parent, we need to register a process descriptor + * describing a work queue and register all child contexts. + */ + if (intel_context_is_parent(ce)) { + struct guc_process_desc_v69 *pdesc; + + ce->parallel.guc.wqi_tail = 0; + ce->parallel.guc.wqi_head = 0; + + desc->process_desc = i915_ggtt_offset(ce->state) + + __get_parent_scratch_offset(ce); + desc->wq_addr = i915_ggtt_offset(ce->state) + + __get_wq_offset(ce); + desc->wq_size = WQ_SIZE; + + pdesc = __get_process_desc_v69(ce); + memset(pdesc, 0, sizeof(*(pdesc))); + pdesc->stage_id = ce->guc_id.id; + pdesc->wq_base_addr = desc->wq_addr; + pdesc->wq_size_bytes = desc->wq_size; + pdesc->wq_status = WQ_STATUS_ACTIVE; + + ce->parallel.guc.wq_head = &pdesc->head; + ce->parallel.guc.wq_tail = &pdesc->tail; + ce->parallel.guc.wq_status = &pdesc->wq_status; + + for_each_child(ce, child) { + desc = __get_lrc_desc_v69(guc, child->guc_id.id); + + desc->engine_class = + engine_class_to_guc_class(engine->class); + desc->hw_context_desc = child->lrc.lrca; + desc->priority = ce->guc_state.prio; + desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD; + guc_context_policy_init_v69(engine, desc); + } + + clear_children_join_go_memory(ce); + } +} + +static void prepare_context_registration_info_v70(struct intel_context *ce, + struct guc_ctxt_registration_info *info) { struct intel_engine_cs *engine = ce->engine; struct intel_guc *guc = &engine->gt->uc.guc; @@ -2420,6 +2585,8 @@ static void prepare_context_registration_info(struct intel_context *ce, */ info->hwlrca_lo = lower_32_bits(ce->lrc.lrca); info->hwlrca_hi = upper_32_bits(ce->lrc.lrca); + if (engine->flags & I915_ENGINE_HAS_EU_PRIORITY) + info->hwlrca_lo |= map_guc_prio_to_lrc_desc_prio(ce->guc_state.prio); info->flags = CONTEXT_REGISTRATION_FLAG_KMD; /* @@ -2443,10 +2610,14 @@ static void prepare_context_registration_info(struct intel_context *ce, info->wq_base_hi = upper_32_bits(wq_base_offset); info->wq_size = WQ_SIZE; - wq_desc = __get_wq_desc(ce); + wq_desc = __get_wq_desc_v70(ce); memset(wq_desc, 0, sizeof(*wq_desc)); wq_desc->wq_status = WQ_STATUS_ACTIVE; + ce->parallel.guc.wq_head = &wq_desc->head; + ce->parallel.guc.wq_tail = &wq_desc->tail; + ce->parallel.guc.wq_status = &wq_desc->wq_status; + clear_children_join_go_memory(ce); } } @@ -2761,11 +2932,21 @@ static void __guc_context_set_preemption_timeout(struct intel_guc *guc, u16 guc_id, u32 preemption_timeout) { - struct context_policy policy; + if (guc->fw.major_ver_found >= 70) { + struct context_policy policy; - __guc_context_policy_start_klv(&policy, guc_id); - __guc_context_policy_add_preemption_timeout(&policy, preemption_timeout); - __guc_context_set_context_policies(guc, &policy, true); + __guc_context_policy_start_klv(&policy, guc_id); + __guc_context_policy_add_preemption_timeout(&policy, preemption_timeout); + __guc_context_set_context_policies(guc, &policy, true); + } else { + u32 action[] = { + INTEL_GUC_ACTION_V69_SET_CONTEXT_PREEMPTION_TIMEOUT, + guc_id, + preemption_timeout + }; + + intel_guc_send_busy_loop(guc, action, ARRAY_SIZE(action), 0, true); + } } static void guc_context_ban(struct intel_context *ce, struct i915_request *rq) @@ -3013,11 +3194,21 @@ static int guc_context_alloc(struct intel_context *ce) static void __guc_context_set_prio(struct intel_guc *guc, struct intel_context *ce) { - struct context_policy policy; + if (guc->fw.major_ver_found >= 70) { + struct context_policy policy; - __guc_context_policy_start_klv(&policy, ce->guc_id.id); - __guc_context_policy_add_priority(&policy, ce->guc_state.prio); - __guc_context_set_context_policies(guc, &policy, true); + __guc_context_policy_start_klv(&policy, ce->guc_id.id); + __guc_context_policy_add_priority(&policy, ce->guc_state.prio); + __guc_context_set_context_policies(guc, &policy, true); + } else { + u32 action[] = { + INTEL_GUC_ACTION_V69_SET_CONTEXT_PRIORITY, + ce->guc_id.id, + ce->guc_state.prio, + }; + + guc_submission_send_busy_loop(guc, action, ARRAY_SIZE(action), 0, true); + } } static void guc_context_set_prio(struct intel_guc *guc, @@ -4527,17 +4718,19 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc, guc_log_context_priority(p, ce); if (intel_context_is_parent(ce)) { - struct guc_sched_wq_desc *wq_desc = __get_wq_desc(ce); struct intel_context *child; drm_printf(p, "\t\tNumber children: %u\n", ce->parallel.number_children); - drm_printf(p, "\t\tWQI Head: %u\n", - READ_ONCE(wq_desc->head)); - drm_printf(p, "\t\tWQI Tail: %u\n", - READ_ONCE(wq_desc->tail)); - drm_printf(p, "\t\tWQI Status: %u\n\n", - READ_ONCE(wq_desc->wq_status)); + + if (ce->parallel.guc.wq_status) { + drm_printf(p, "\t\tWQI Head: %u\n", + READ_ONCE(*ce->parallel.guc.wq_head)); + drm_printf(p, "\t\tWQI Tail: %u\n", + READ_ONCE(*ce->parallel.guc.wq_tail)); + drm_printf(p, "\t\tWQI Status: %u\n\n", + READ_ONCE(*ce->parallel.guc.wq_status)); + } if (ce->engine->emit_bb_start == emit_bb_start_parent_no_preempt_mid_batch) { diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c index 2ff55b9994bc..703f42ba5ddd 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c @@ -70,6 +70,10 @@ void intel_uc_fw_change_status(struct intel_uc_fw *uc_fw, fw_def(BROXTON, 0, guc_def(bxt, 70, 1, 1)) \ fw_def(SKYLAKE, 0, guc_def(skl, 70, 1, 1)) +#define INTEL_GUC_FIRMWARE_DEFS_FALLBACK(fw_def, guc_def) \ + fw_def(ALDERLAKE_P, 0, guc_def(adlp, 69, 0, 3)) \ + fw_def(ALDERLAKE_S, 0, guc_def(tgl, 69, 0, 3)) + #define INTEL_HUC_FIRMWARE_DEFS(fw_def, huc_def) \ fw_def(ALDERLAKE_P, 0, huc_def(tgl, 7, 9, 3)) \ fw_def(ALDERLAKE_S, 0, huc_def(tgl, 7, 9, 3)) \ @@ -105,6 +109,7 @@ void intel_uc_fw_change_status(struct intel_uc_fw *uc_fw, MODULE_FIRMWARE(uc_); INTEL_GUC_FIRMWARE_DEFS(INTEL_UC_MODULE_FW, MAKE_GUC_FW_PATH) +INTEL_GUC_FIRMWARE_DEFS_FALLBACK(INTEL_UC_MODULE_FW, MAKE_GUC_FW_PATH) INTEL_HUC_FIRMWARE_DEFS(INTEL_UC_MODULE_FW, MAKE_HUC_FW_PATH) /* The below structs and macros are used to iterate across the list of blobs */ @@ -149,6 +154,9 @@ __uc_fw_auto_select(struct drm_i915_private *i915, struct intel_uc_fw *uc_fw) static const struct uc_fw_platform_requirement blobs_guc[] = { INTEL_GUC_FIRMWARE_DEFS(MAKE_FW_LIST, GUC_FW_BLOB) }; + static const struct uc_fw_platform_requirement blobs_guc_fallback[] = { + INTEL_GUC_FIRMWARE_DEFS_FALLBACK(MAKE_FW_LIST, GUC_FW_BLOB) + }; static const struct uc_fw_platform_requirement blobs_huc[] = { INTEL_HUC_FIRMWARE_DEFS(MAKE_FW_LIST, HUC_FW_BLOB) }; @@ -179,12 +187,29 @@ __uc_fw_auto_select(struct drm_i915_private *i915, struct intel_uc_fw *uc_fw) if (p == fw_blobs[i].p && rev >= fw_blobs[i].rev) { const struct uc_fw_blob *blob = &fw_blobs[i].blob; uc_fw->path = blob->path; + uc_fw->wanted_path = blob->path; uc_fw->major_ver_wanted = blob->major; uc_fw->minor_ver_wanted = blob->minor; break; } } + if (uc_fw->type == INTEL_UC_FW_TYPE_GUC) { + const struct uc_fw_platform_requirement *blobs = blobs_guc_fallback; + u32 count = ARRAY_SIZE(blobs_guc_fallback); + + for (i = 0; i < count && p <= blobs[i].p; i++) { + if (p == blobs[i].p && rev >= blobs[i].rev) { + const struct uc_fw_blob *blob = &blobs[i].blob; + + uc_fw->fallback.path = blob->path; + uc_fw->fallback.major_ver = blob->major; + uc_fw->fallback.minor_ver = blob->minor; + break; + } + } + } + /* make sure the list is ordered as expected */ if (IS_ENABLED(CONFIG_DRM_I915_SELFTEST)) { for (i = 1; i < fw_count; i++) { @@ -338,7 +363,24 @@ int intel_uc_fw_fetch(struct intel_uc_fw *uc_fw) __force_fw_fetch_failures(uc_fw, -EINVAL); __force_fw_fetch_failures(uc_fw, -ESTALE); - err = request_firmware(&fw, uc_fw->path, dev); + err = firmware_request_nowarn(&fw, uc_fw->path, dev); + if (err && !intel_uc_fw_is_overridden(uc_fw) && uc_fw->fallback.path) { + err = firmware_request_nowarn(&fw, uc_fw->fallback.path, dev); + if (!err) { + drm_notice(&i915->drm, + "%s firmware %s is recommended, but only %s was found\n", + intel_uc_fw_type_repr(uc_fw->type), + uc_fw->wanted_path, + uc_fw->fallback.path); + drm_info(&i915->drm, + "Consider updating your linux-firmware pkg or downloading from %s\n", + INTEL_UC_FIRMWARE_URL); + + uc_fw->path = uc_fw->fallback.path; + uc_fw->major_ver_wanted = uc_fw->fallback.major_ver; + uc_fw->minor_ver_wanted = uc_fw->fallback.minor_ver; + } + } if (err) goto fail; @@ -437,8 +479,8 @@ fail: INTEL_UC_FIRMWARE_MISSING : INTEL_UC_FIRMWARE_ERROR); - drm_notice(&i915->drm, "%s firmware %s: fetch failed with error %d\n", - intel_uc_fw_type_repr(uc_fw->type), uc_fw->path, err); + i915_probe_error(i915, "%s firmware %s: fetch failed with error %d\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->path, err); drm_info(&i915->drm, "%s firmware(s) can be downloaded from %s\n", intel_uc_fw_type_repr(uc_fw->type), INTEL_UC_FIRMWARE_URL); @@ -796,7 +838,13 @@ size_t intel_uc_fw_copy_rsa(struct intel_uc_fw *uc_fw, void *dst, u32 max_len) void intel_uc_fw_dump(const struct intel_uc_fw *uc_fw, struct drm_printer *p) { drm_printf(p, "%s firmware: %s\n", - intel_uc_fw_type_repr(uc_fw->type), uc_fw->path); + intel_uc_fw_type_repr(uc_fw->type), uc_fw->wanted_path); + if (uc_fw->fallback.path) { + drm_printf(p, "%s firmware fallback: %s\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->fallback.path); + drm_printf(p, "fallback selected: %s\n", + str_yes_no(uc_fw->path == uc_fw->fallback.path)); + } drm_printf(p, "\tstatus: %s\n", intel_uc_fw_status_repr(uc_fw->status)); drm_printf(p, "\tversion: wanted %u.%u, found %u.%u\n", diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h index 3229018877d3..562acdf88adb 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h @@ -74,6 +74,7 @@ struct intel_uc_fw { const enum intel_uc_fw_status status; enum intel_uc_fw_status __status; /* no accidental overwrites */ }; + const char *wanted_path; const char *path; bool user_overridden; size_t size; @@ -98,6 +99,12 @@ struct intel_uc_fw { u16 major_ver_found; u16 minor_ver_found; + struct { + const char *path; + u16 major_ver; + u16 minor_ver; + } fallback; + u32 rsa_size; u32 ucode_size; diff --git a/drivers/gpu/drm/imx/dcss/dcss-dev.c b/drivers/gpu/drm/imx/dcss/dcss-dev.c index c849533ca83e..3f5750cc2673 100644 --- a/drivers/gpu/drm/imx/dcss/dcss-dev.c +++ b/drivers/gpu/drm/imx/dcss/dcss-dev.c @@ -207,6 +207,7 @@ struct dcss_dev *dcss_dev_create(struct device *dev, bool hdmi_output) ret = dcss_submodules_init(dcss); if (ret) { + of_node_put(dcss->of_port); dev_err(dev, "submodules initialization failed\n"); goto clks_err; } @@ -237,6 +238,8 @@ void dcss_dev_destroy(struct dcss_dev *dcss) dcss_clocks_disable(dcss); } + of_node_put(dcss->of_port); + pm_runtime_disable(dcss->dev); dcss_submodules_stop(dcss); diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 7ba66ad68a8a..16356611b5b9 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -680,7 +680,11 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm, goto out_free_dma; for (i = 0; i < npages; i += max) { - args.end = start + (max << PAGE_SHIFT); + if (args.start + (max << PAGE_SHIFT) > end) + args.end = end; + else + args.end = args.start + (max << PAGE_SHIFT); + ret = migrate_vma_setup(&args); if (ret) goto out_free_pfns; diff --git a/drivers/gpu/drm/panel/panel-edp.c b/drivers/gpu/drm/panel/panel-edp.c index c96014464355..a189982601a4 100644 --- a/drivers/gpu/drm/panel/panel-edp.c +++ b/drivers/gpu/drm/panel/panel-edp.c @@ -713,7 +713,7 @@ static int generic_edp_panel_probe(struct device *dev, struct panel_edp *panel) of_property_read_u32(dev->of_node, "hpd-reliable-delay-ms", &reliable_ms); desc->delay.hpd_reliable = reliable_ms; of_property_read_u32(dev->of_node, "hpd-absent-delay-ms", &absent_ms); - desc->delay.hpd_reliable = absent_ms; + desc->delay.hpd_absent = absent_ms; /* Power the panel on so we can read the EDID */ ret = pm_runtime_get_sync(dev); diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index 191c56064f19..6b25b2f4f5a3 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -190,7 +190,7 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout) } EXPORT_SYMBOL(drm_sched_entity_flush); -static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk) +static void drm_sched_entity_kill_jobs_work(struct work_struct *wrk) { struct drm_sched_job *job = container_of(wrk, typeof(*job), work); @@ -207,8 +207,8 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f, struct drm_sched_job *job = container_of(cb, struct drm_sched_job, finish_cb); - init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work); - irq_work_queue(&job->work); + INIT_WORK(&job->work, drm_sched_entity_kill_jobs_work); + schedule_work(&job->work); } static struct dma_fence * diff --git a/drivers/gpu/drm/tiny/simpledrm.c b/drivers/gpu/drm/tiny/simpledrm.c index 768242a78e2b..5422363690e7 100644 --- a/drivers/gpu/drm/tiny/simpledrm.c +++ b/drivers/gpu/drm/tiny/simpledrm.c @@ -627,7 +627,7 @@ static const struct drm_connector_funcs simpledrm_connector_funcs = { .atomic_destroy_state = drm_atomic_helper_connector_destroy_state, }; -static int +static enum drm_mode_status simpledrm_simple_display_pipe_mode_valid(struct drm_simple_display_pipe *pipe, const struct drm_display_mode *mode) { diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c index 4e239bd75b1d..5a9d47a229e4 100644 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c @@ -428,6 +428,10 @@ static int k10temp_probe(struct pci_dev *pdev, const struct pci_device_id *id) data->ccd_offset = 0x154; k10temp_get_ccd_support(pdev, data, 8); break; + case 0xa0 ... 0xaf: + data->ccd_offset = 0x300; + k10temp_get_ccd_support(pdev, data, 8); + break; } } else if (boot_cpu_data.x86 == 0x19) { data->temp_adjust_mask = ZEN_CUR_TEMP_RANGE_SEL_MASK; @@ -445,6 +449,11 @@ static int k10temp_probe(struct pci_dev *pdev, const struct pci_device_id *id) data->ccd_offset = 0x300; k10temp_get_ccd_support(pdev, data, 8); break; + case 0x60 ... 0x6f: + case 0x70 ... 0x7f: + data->ccd_offset = 0x308; + k10temp_get_ccd_support(pdev, data, 8); + break; case 0x10 ... 0x1f: case 0xa0 ... 0xaf: data->ccd_offset = 0x300; @@ -489,10 +498,13 @@ static const struct pci_device_id k10temp_id_table[] = { { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) }, { PCI_VDEVICE(HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, {} }; diff --git a/drivers/i2c/busses/i2c-cadence.c b/drivers/i2c/busses/i2c-cadence.c index 3d6f8ee355bf..630cfa4ddd46 100644 --- a/drivers/i2c/busses/i2c-cadence.c +++ b/drivers/i2c/busses/i2c-cadence.c @@ -388,9 +388,9 @@ static irqreturn_t cdns_i2c_slave_isr(void *ptr) */ static irqreturn_t cdns_i2c_master_isr(void *ptr) { - unsigned int isr_status, avail_bytes, updatetx; + unsigned int isr_status, avail_bytes; unsigned int bytes_to_send; - bool hold_quirk; + bool updatetx; struct cdns_i2c *id = ptr; /* Signal completion only after everything is updated */ int done_flag = 0; @@ -410,11 +410,7 @@ static irqreturn_t cdns_i2c_master_isr(void *ptr) * Check if transfer size register needs to be updated again for a * large data receive operation. */ - updatetx = 0; - if (id->recv_count > id->curr_recv_count) - updatetx = 1; - - hold_quirk = (id->quirks & CDNS_I2C_BROKEN_HOLD_BIT) && updatetx; + updatetx = id->recv_count > id->curr_recv_count; /* When receiving, handle data interrupt and completion interrupt */ if (id->p_recv_buf && @@ -445,7 +441,7 @@ static irqreturn_t cdns_i2c_master_isr(void *ptr) break; } - if (cdns_is_holdquirk(id, hold_quirk)) + if (cdns_is_holdquirk(id, updatetx)) break; } @@ -456,7 +452,7 @@ static irqreturn_t cdns_i2c_master_isr(void *ptr) * maintain transfer size non-zero while performing a large * receive operation. */ - if (cdns_is_holdquirk(id, hold_quirk)) { + if (cdns_is_holdquirk(id, updatetx)) { /* wait while fifo is full */ while (cdns_i2c_readreg(CDNS_I2C_XFER_SIZE_OFFSET) != (id->curr_recv_count - CDNS_I2C_FIFO_DEPTH)) @@ -478,22 +474,6 @@ static irqreturn_t cdns_i2c_master_isr(void *ptr) CDNS_I2C_XFER_SIZE_OFFSET); id->curr_recv_count = id->recv_count; } - } else if (id->recv_count && !hold_quirk && - !id->curr_recv_count) { - - /* Set the slave address in address register*/ - cdns_i2c_writereg(id->p_msg->addr & CDNS_I2C_ADDR_MASK, - CDNS_I2C_ADDR_OFFSET); - - if (id->recv_count > CDNS_I2C_TRANSFER_SIZE) { - cdns_i2c_writereg(CDNS_I2C_TRANSFER_SIZE, - CDNS_I2C_XFER_SIZE_OFFSET); - id->curr_recv_count = CDNS_I2C_TRANSFER_SIZE; - } else { - cdns_i2c_writereg(id->recv_count, - CDNS_I2C_XFER_SIZE_OFFSET); - id->curr_recv_count = id->recv_count; - } } /* Clear hold (if not repeated start) and signal completion */ diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index e9e2db68b9fb..78fb1a4274a6 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -66,7 +66,7 @@ /* IMX I2C registers: * the I2C register offset is different between SoCs, - * to provid support for all these chips, split the + * to provide support for all these chips, split the * register offset into a fixed base address and a * variable shift value, then the full register offset * will be calculated by diff --git a/drivers/i2c/busses/i2c-mlxcpld.c b/drivers/i2c/busses/i2c-mlxcpld.c index 56aa424fd71d..815cc561386b 100644 --- a/drivers/i2c/busses/i2c-mlxcpld.c +++ b/drivers/i2c/busses/i2c-mlxcpld.c @@ -49,7 +49,7 @@ #define MLXCPLD_LPCI2C_NACK_IND 2 #define MLXCPLD_I2C_FREQ_1000KHZ_SET 0x04 -#define MLXCPLD_I2C_FREQ_400KHZ_SET 0x0c +#define MLXCPLD_I2C_FREQ_400KHZ_SET 0x0e #define MLXCPLD_I2C_FREQ_100KHZ_SET 0x42 enum mlxcpld_i2c_frequency { diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index f5c6802aa6c3..445b19d20b9a 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -56,6 +56,7 @@ #include <asm/nospec-branch.h> #include <asm/mwait.h> #include <asm/msr.h> +#include <asm/fpu/api.h> #define INTEL_IDLE_VERSION "0.5.1" @@ -114,6 +115,11 @@ static unsigned int mwait_substates __initdata; #define CPUIDLE_FLAG_IBRS BIT(16) /* + * Initialize large xstate for the C6-state entrance. + */ +#define CPUIDLE_FLAG_INIT_XSTATE BIT(17) + +/* * MWAIT takes an 8-bit "hint" in EAX "suggesting" * the C-state (top nibble) and sub-state (bottom nibble) * 0x00 means "MWAIT(C1)", 0x10 means "MWAIT(C2)" etc. @@ -162,7 +168,13 @@ static __cpuidle int intel_idle_irq(struct cpuidle_device *dev, raw_local_irq_enable(); ret = __intel_idle(dev, drv, index); - raw_local_irq_disable(); + + /* + * The lockdep hardirqs state may be changed to 'on' with timer + * tick interrupt followed by __do_softirq(). Use local_irq_disable() + * to keep the hardirqs state correct. + */ + local_irq_disable(); return ret; } @@ -185,6 +197,13 @@ static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev, return ret; } +static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + fpu_idle_fpregs(); + return __intel_idle(dev, drv, index); +} + /** * intel_idle_s2idle - Ask the processor to enter the given idle state. * @dev: cpuidle device of the target CPU. @@ -200,8 +219,12 @@ static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev, static __cpuidle int intel_idle_s2idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - unsigned long eax = flg2MWAIT(drv->states[index].flags); unsigned long ecx = 1; /* break on interrupt flag */ + struct cpuidle_state *state = &drv->states[index]; + unsigned long eax = flg2MWAIT(state->flags); + + if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) + fpu_idle_fpregs(); mwait_idle_with_hints(eax, ecx); @@ -936,7 +959,8 @@ static struct cpuidle_state spr_cstates[] __initdata = { { .name = "C6", .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | + CPUIDLE_FLAG_INIT_XSTATE, .exit_latency = 290, .target_residency = 800, .enter = &intel_idle, @@ -1851,6 +1875,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) drv->states[drv->state_count].enter = intel_idle_ibrs; } + if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_INIT_XSTATE) + drv->states[drv->state_count].enter = intel_idle_xstate; + if ((disabled_states_mask & BIT(drv->state_count)) || ((icpu->use_acpi || force_use_acpi) && intel_idle_off_by_default(mwait_hint) && diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 638bf4a1ed94..646fa8677490 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -4231,10 +4231,6 @@ void irdma_cm_teardown_connections(struct irdma_device *iwdev, u32 *ipaddr, struct irdma_cm_node *cm_node; struct list_head teardown_list; struct ib_qp_attr attr; - struct irdma_sc_vsi *vsi = &iwdev->vsi; - struct irdma_sc_qp *sc_qp; - struct irdma_qp *qp; - int i; INIT_LIST_HEAD(&teardown_list); @@ -4251,52 +4247,6 @@ void irdma_cm_teardown_connections(struct irdma_device *iwdev, u32 *ipaddr, irdma_cm_disconn(cm_node->iwqp); irdma_rem_ref_cm_node(cm_node); } - if (!iwdev->roce_mode) - return; - - INIT_LIST_HEAD(&teardown_list); - for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) { - mutex_lock(&vsi->qos[i].qos_mutex); - list_for_each_safe (list_node, list_core_temp, - &vsi->qos[i].qplist) { - u32 qp_ip[4]; - - sc_qp = container_of(list_node, struct irdma_sc_qp, - list); - if (sc_qp->qp_uk.qp_type != IRDMA_QP_TYPE_ROCE_RC) - continue; - - qp = sc_qp->qp_uk.back_qp; - if (!disconnect_all) { - if (nfo->ipv4) - qp_ip[0] = qp->udp_info.local_ipaddr[3]; - else - memcpy(qp_ip, - &qp->udp_info.local_ipaddr[0], - sizeof(qp_ip)); - } - - if (disconnect_all || - (nfo->vlan_id == (qp->udp_info.vlan_tag & VLAN_VID_MASK) && - !memcmp(qp_ip, ipaddr, nfo->ipv4 ? 4 : 16))) { - spin_lock(&iwdev->rf->qptable_lock); - if (iwdev->rf->qp_table[sc_qp->qp_uk.qp_id]) { - irdma_qp_add_ref(&qp->ibqp); - list_add(&qp->teardown_entry, - &teardown_list); - } - spin_unlock(&iwdev->rf->qptable_lock); - } - } - mutex_unlock(&vsi->qos[i].qos_mutex); - } - - list_for_each_safe (list_node, list_core_temp, &teardown_list) { - qp = container_of(list_node, struct irdma_qp, teardown_entry); - attr.qp_state = IB_QPS_ERR; - irdma_modify_qp_roce(&qp->ibqp, &attr, IB_QP_STATE, NULL); - irdma_qp_rem_ref(&qp->ibqp); - } } /** diff --git a/drivers/infiniband/hw/irdma/i40iw_hw.c b/drivers/infiniband/hw/irdma/i40iw_hw.c index e46fc110004d..50299f58b6b3 100644 --- a/drivers/infiniband/hw/irdma/i40iw_hw.c +++ b/drivers/infiniband/hw/irdma/i40iw_hw.c @@ -201,6 +201,7 @@ void i40iw_init_hw(struct irdma_sc_dev *dev) dev->hw_attrs.uk_attrs.max_hw_read_sges = I40IW_MAX_SGE_RD; dev->hw_attrs.max_hw_device_pages = I40IW_MAX_PUSH_PAGE_COUNT; dev->hw_attrs.uk_attrs.max_hw_inline = I40IW_MAX_INLINE_DATA_SIZE; + dev->hw_attrs.page_size_cap = SZ_4K | SZ_2M; dev->hw_attrs.max_hw_ird = I40IW_MAX_IRD_SIZE; dev->hw_attrs.max_hw_ord = I40IW_MAX_ORD_SIZE; dev->hw_attrs.max_hw_wqes = I40IW_MAX_WQ_ENTRIES; diff --git a/drivers/infiniband/hw/irdma/icrdma_hw.c b/drivers/infiniband/hw/irdma/icrdma_hw.c index cf53b17510cd..5986fd906308 100644 --- a/drivers/infiniband/hw/irdma/icrdma_hw.c +++ b/drivers/infiniband/hw/irdma/icrdma_hw.c @@ -139,6 +139,7 @@ void icrdma_init_hw(struct irdma_sc_dev *dev) dev->cqp_db = dev->hw_regs[IRDMA_CQPDB]; dev->cq_ack_db = dev->hw_regs[IRDMA_CQACK]; dev->irq_ops = &icrdma_irq_ops; + dev->hw_attrs.page_size_cap = SZ_4K | SZ_2M | SZ_1G; dev->hw_attrs.max_hw_ird = ICRDMA_MAX_IRD_SIZE; dev->hw_attrs.max_hw_ord = ICRDMA_MAX_ORD_SIZE; dev->hw_attrs.max_stat_inst = ICRDMA_MAX_STATS_COUNT; diff --git a/drivers/infiniband/hw/irdma/irdma.h b/drivers/infiniband/hw/irdma/irdma.h index 46c12334c735..4789e85d717b 100644 --- a/drivers/infiniband/hw/irdma/irdma.h +++ b/drivers/infiniband/hw/irdma/irdma.h @@ -127,6 +127,7 @@ struct irdma_hw_attrs { u64 max_hw_outbound_msg_size; u64 max_hw_inbound_msg_size; u64 max_mr_size; + u64 page_size_cap; u32 min_hw_qp_id; u32 min_hw_aeq_size; u32 max_hw_aeq_size; diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index c4412ece5a6d..96135a228f26 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -32,7 +32,7 @@ static int irdma_query_device(struct ib_device *ibdev, props->vendor_part_id = pcidev->device; props->hw_ver = rf->pcidev->revision; - props->page_size_cap = SZ_4K | SZ_2M | SZ_1G; + props->page_size_cap = hw_attrs->page_size_cap; props->max_mr_size = hw_attrs->max_mr_size; props->max_qp = rf->max_qp - rf->used_qps; props->max_qp_wr = hw_attrs->max_qp_wr; @@ -2781,7 +2781,7 @@ static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, if (req.reg_type == IRDMA_MEMREG_TYPE_MEM) { iwmr->page_size = ib_umem_find_best_pgsz(region, - SZ_4K | SZ_2M | SZ_1G, + iwdev->rf->sc_dev.hw_attrs.page_size_cap, virt); if (unlikely(!iwmr->page_size)) { kfree(iwmr); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 20e53b167f81..c8539d0e12dd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7304,7 +7304,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; conf->mddev = mddev; - if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) + ret = -ENOMEM; + conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!conf->stripe_hashtbl) goto abort; /* We init hash_locks[0] separately to that it can be used diff --git a/drivers/misc/lkdtm/Makefile b/drivers/misc/lkdtm/Makefile index 2e0aa74ac185..95ef971b5e1c 100644 --- a/drivers/misc/lkdtm/Makefile +++ b/drivers/misc/lkdtm/Makefile @@ -13,10 +13,13 @@ lkdtm-$(CONFIG_LKDTM) += cfi.o lkdtm-$(CONFIG_LKDTM) += fortify.o lkdtm-$(CONFIG_PPC_64S_HASH_MMU) += powerpc.o -KASAN_SANITIZE_rodata.o := n KASAN_SANITIZE_stackleak.o := n -KCOV_INSTRUMENT_rodata.o := n -CFLAGS_REMOVE_rodata.o += $(CC_FLAGS_LTO) + +KASAN_SANITIZE_rodata.o := n +KCSAN_SANITIZE_rodata.o := n +KCOV_INSTRUMENT_rodata.o := n +OBJECT_FILES_NON_STANDARD_rodata.o := y +CFLAGS_REMOVE_rodata.o += $(CC_FLAGS_LTO) $(RETHUNK_CFLAGS) OBJCOPYFLAGS := OBJCOPYFLAGS_rodata_objcopy.o := \ diff --git a/drivers/mmc/host/sdhci-omap.c b/drivers/mmc/host/sdhci-omap.c index 86e867ffbb10..033be559a730 100644 --- a/drivers/mmc/host/sdhci-omap.c +++ b/drivers/mmc/host/sdhci-omap.c @@ -1298,8 +1298,9 @@ static int sdhci_omap_probe(struct platform_device *pdev) /* * omap_device_pm_domain has callbacks to enable the main * functional clock, interface clock and also configure the - * SYSCONFIG register of omap devices. The callback will be invoked - * as part of pm_runtime_get_sync. + * SYSCONFIG register to clear any boot loader set voltage + * capabilities before calling sdhci_setup_host(). The + * callback will be invoked as part of pm_runtime_get_sync. */ pm_runtime_use_autosuspend(dev); pm_runtime_set_autosuspend_delay(dev, 50); @@ -1441,7 +1442,8 @@ static int __maybe_unused sdhci_omap_runtime_suspend(struct device *dev) struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct sdhci_omap_host *omap_host = sdhci_pltfm_priv(pltfm_host); - sdhci_runtime_suspend_host(host); + if (omap_host->con != -EINVAL) + sdhci_runtime_suspend_host(host); sdhci_omap_context_save(omap_host); @@ -1458,10 +1460,10 @@ static int __maybe_unused sdhci_omap_runtime_resume(struct device *dev) pinctrl_pm_select_default_state(dev); - if (omap_host->con != -EINVAL) + if (omap_host->con != -EINVAL) { sdhci_omap_context_restore(omap_host); - - sdhci_runtime_resume_host(host, 0); + sdhci_runtime_resume_host(host, 0); + } return 0; } diff --git a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c index 889e40329956..93da23682d86 100644 --- a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c +++ b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c @@ -850,9 +850,10 @@ static int gpmi_nfc_compute_timings(struct gpmi_nand_data *this, unsigned int tRP_ps; bool use_half_period; int sample_delay_ps, sample_delay_factor; - u16 busy_timeout_cycles; + unsigned int busy_timeout_cycles; u8 wrn_dly_sel; unsigned long clk_rate, min_rate; + u64 busy_timeout_ps; if (sdr->tRC_min >= 30000) { /* ONFI non-EDO modes [0-3] */ @@ -885,7 +886,8 @@ static int gpmi_nfc_compute_timings(struct gpmi_nand_data *this, addr_setup_cycles = TO_CYCLES(sdr->tALS_min, period_ps); data_setup_cycles = TO_CYCLES(sdr->tDS_min, period_ps); data_hold_cycles = TO_CYCLES(sdr->tDH_min, period_ps); - busy_timeout_cycles = TO_CYCLES(sdr->tWB_max + sdr->tR_max, period_ps); + busy_timeout_ps = max(sdr->tBERS_max, sdr->tPROG_max); + busy_timeout_cycles = TO_CYCLES(busy_timeout_ps, period_ps); hw->timing0 = BF_GPMI_TIMING0_ADDRESS_SETUP(addr_setup_cycles) | BF_GPMI_TIMING0_DATA_HOLD(data_hold_cycles) | diff --git a/drivers/net/amt.c b/drivers/net/amt.c index be2719a3ba70..e019526e1df6 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -563,7 +563,7 @@ static struct sk_buff *amt_build_igmp_gq(struct amt_dev *amt) ihv3->nsrcs = 0; ihv3->resv = 0; ihv3->suppress = false; - ihv3->qrv = amt->net->ipv4.sysctl_igmp_qrv; + ihv3->qrv = READ_ONCE(amt->net->ipv4.sysctl_igmp_qrv); ihv3->csum = 0; csum = &ihv3->csum; csum_start = (void *)ihv3; @@ -577,14 +577,14 @@ static struct sk_buff *amt_build_igmp_gq(struct amt_dev *amt) return skb; } -static void __amt_update_gw_status(struct amt_dev *amt, enum amt_status status, - bool validate) +static void amt_update_gw_status(struct amt_dev *amt, enum amt_status status, + bool validate) { if (validate && amt->status >= status) return; netdev_dbg(amt->dev, "Update GW status %s -> %s", status_str[amt->status], status_str[status]); - amt->status = status; + WRITE_ONCE(amt->status, status); } static void __amt_update_relay_status(struct amt_tunnel_list *tunnel, @@ -600,14 +600,6 @@ static void __amt_update_relay_status(struct amt_tunnel_list *tunnel, tunnel->status = status; } -static void amt_update_gw_status(struct amt_dev *amt, enum amt_status status, - bool validate) -{ - spin_lock_bh(&amt->lock); - __amt_update_gw_status(amt, status, validate); - spin_unlock_bh(&amt->lock); -} - static void amt_update_relay_status(struct amt_tunnel_list *tunnel, enum amt_status status, bool validate) { @@ -700,9 +692,7 @@ static void amt_send_discovery(struct amt_dev *amt) if (unlikely(net_xmit_eval(err))) amt->dev->stats.tx_errors++; - spin_lock_bh(&amt->lock); - __amt_update_gw_status(amt, AMT_STATUS_SENT_DISCOVERY, true); - spin_unlock_bh(&amt->lock); + amt_update_gw_status(amt, AMT_STATUS_SENT_DISCOVERY, true); out: rcu_read_unlock(); } @@ -900,6 +890,28 @@ static void amt_send_mld_gq(struct amt_dev *amt, struct amt_tunnel_list *tunnel) } #endif +static bool amt_queue_event(struct amt_dev *amt, enum amt_event event, + struct sk_buff *skb) +{ + int index; + + spin_lock_bh(&amt->lock); + if (amt->nr_events >= AMT_MAX_EVENTS) { + spin_unlock_bh(&amt->lock); + return 1; + } + + index = (amt->event_idx + amt->nr_events) % AMT_MAX_EVENTS; + amt->events[index].event = event; + amt->events[index].skb = skb; + amt->nr_events++; + amt->event_idx %= AMT_MAX_EVENTS; + queue_work(amt_wq, &amt->event_wq); + spin_unlock_bh(&amt->lock); + + return 0; +} + static void amt_secret_work(struct work_struct *work) { struct amt_dev *amt = container_of(to_delayed_work(work), @@ -913,58 +925,72 @@ static void amt_secret_work(struct work_struct *work) msecs_to_jiffies(AMT_SECRET_TIMEOUT)); } -static void amt_discovery_work(struct work_struct *work) +static void amt_event_send_discovery(struct amt_dev *amt) { - struct amt_dev *amt = container_of(to_delayed_work(work), - struct amt_dev, - discovery_wq); - - spin_lock_bh(&amt->lock); if (amt->status > AMT_STATUS_SENT_DISCOVERY) goto out; get_random_bytes(&amt->nonce, sizeof(__be32)); - spin_unlock_bh(&amt->lock); amt_send_discovery(amt); - spin_lock_bh(&amt->lock); out: mod_delayed_work(amt_wq, &amt->discovery_wq, msecs_to_jiffies(AMT_DISCOVERY_TIMEOUT)); - spin_unlock_bh(&amt->lock); } -static void amt_req_work(struct work_struct *work) +static void amt_discovery_work(struct work_struct *work) { struct amt_dev *amt = container_of(to_delayed_work(work), struct amt_dev, - req_wq); + discovery_wq); + + if (amt_queue_event(amt, AMT_EVENT_SEND_DISCOVERY, NULL)) + mod_delayed_work(amt_wq, &amt->discovery_wq, + msecs_to_jiffies(AMT_DISCOVERY_TIMEOUT)); +} + +static void amt_event_send_request(struct amt_dev *amt) +{ u32 exp; - spin_lock_bh(&amt->lock); if (amt->status < AMT_STATUS_RECEIVED_ADVERTISEMENT) goto out; if (amt->req_cnt > AMT_MAX_REQ_COUNT) { netdev_dbg(amt->dev, "Gateway is not ready"); amt->qi = AMT_INIT_REQ_TIMEOUT; - amt->ready4 = false; - amt->ready6 = false; + WRITE_ONCE(amt->ready4, false); + WRITE_ONCE(amt->ready6, false); amt->remote_ip = 0; - __amt_update_gw_status(amt, AMT_STATUS_INIT, false); + amt_update_gw_status(amt, AMT_STATUS_INIT, false); amt->req_cnt = 0; + amt->nonce = 0; goto out; } - spin_unlock_bh(&amt->lock); + + if (!amt->req_cnt) { + WRITE_ONCE(amt->ready4, false); + WRITE_ONCE(amt->ready6, false); + get_random_bytes(&amt->nonce, sizeof(__be32)); + } amt_send_request(amt, false); amt_send_request(amt, true); - spin_lock_bh(&amt->lock); - __amt_update_gw_status(amt, AMT_STATUS_SENT_REQUEST, true); + amt_update_gw_status(amt, AMT_STATUS_SENT_REQUEST, true); amt->req_cnt++; out: exp = min_t(u32, (1 * (1 << amt->req_cnt)), AMT_MAX_REQ_TIMEOUT); mod_delayed_work(amt_wq, &amt->req_wq, msecs_to_jiffies(exp * 1000)); - spin_unlock_bh(&amt->lock); +} + +static void amt_req_work(struct work_struct *work) +{ + struct amt_dev *amt = container_of(to_delayed_work(work), + struct amt_dev, + req_wq); + + if (amt_queue_event(amt, AMT_EVENT_SEND_REQUEST, NULL)) + mod_delayed_work(amt_wq, &amt->req_wq, + msecs_to_jiffies(100)); } static bool amt_send_membership_update(struct amt_dev *amt, @@ -1220,7 +1246,8 @@ static netdev_tx_t amt_dev_xmit(struct sk_buff *skb, struct net_device *dev) /* Gateway only passes IGMP/MLD packets */ if (!report) goto free; - if ((!v6 && !amt->ready4) || (v6 && !amt->ready6)) + if ((!v6 && !READ_ONCE(amt->ready4)) || + (v6 && !READ_ONCE(amt->ready6))) goto free; if (amt_send_membership_update(amt, skb, v6)) goto free; @@ -2236,6 +2263,10 @@ static bool amt_advertisement_handler(struct amt_dev *amt, struct sk_buff *skb) ipv4_is_zeronet(amta->ip4)) return true; + if (amt->status != AMT_STATUS_SENT_DISCOVERY || + amt->nonce != amta->nonce) + return true; + amt->remote_ip = amta->ip4; netdev_dbg(amt->dev, "advertised remote ip = %pI4\n", &amt->remote_ip); mod_delayed_work(amt_wq, &amt->req_wq, 0); @@ -2251,6 +2282,9 @@ static bool amt_multicast_data_handler(struct amt_dev *amt, struct sk_buff *skb) struct ethhdr *eth; struct iphdr *iph; + if (READ_ONCE(amt->status) != AMT_STATUS_SENT_UPDATE) + return true; + hdr_size = sizeof(*amtmd) + sizeof(struct udphdr); if (!pskb_may_pull(skb, hdr_size)) return true; @@ -2325,6 +2359,9 @@ static bool amt_membership_query_handler(struct amt_dev *amt, if (amtmq->reserved || amtmq->version) return true; + if (amtmq->nonce != amt->nonce) + return true; + hdr_size -= sizeof(*eth); if (iptunnel_pull_header(skb, hdr_size, htons(ETH_P_TEB), false)) return true; @@ -2339,6 +2376,9 @@ static bool amt_membership_query_handler(struct amt_dev *amt, iph = ip_hdr(skb); if (iph->version == 4) { + if (READ_ONCE(amt->ready4)) + return true; + if (!pskb_may_pull(skb, sizeof(*iph) + AMT_IPHDR_OPTS + sizeof(*ihv3))) return true; @@ -2349,12 +2389,10 @@ static bool amt_membership_query_handler(struct amt_dev *amt, ihv3 = skb_pull(skb, sizeof(*iph) + AMT_IPHDR_OPTS); skb_reset_transport_header(skb); skb_push(skb, sizeof(*iph) + AMT_IPHDR_OPTS); - spin_lock_bh(&amt->lock); - amt->ready4 = true; + WRITE_ONCE(amt->ready4, true); amt->mac = amtmq->response_mac; amt->req_cnt = 0; amt->qi = ihv3->qqic; - spin_unlock_bh(&amt->lock); skb->protocol = htons(ETH_P_IP); eth->h_proto = htons(ETH_P_IP); ip_eth_mc_map(iph->daddr, eth->h_dest); @@ -2363,6 +2401,9 @@ static bool amt_membership_query_handler(struct amt_dev *amt, struct mld2_query *mld2q; struct ipv6hdr *ip6h; + if (READ_ONCE(amt->ready6)) + return true; + if (!pskb_may_pull(skb, sizeof(*ip6h) + AMT_IP6HDR_OPTS + sizeof(*mld2q))) return true; @@ -2374,12 +2415,10 @@ static bool amt_membership_query_handler(struct amt_dev *amt, mld2q = skb_pull(skb, sizeof(*ip6h) + AMT_IP6HDR_OPTS); skb_reset_transport_header(skb); skb_push(skb, sizeof(*ip6h) + AMT_IP6HDR_OPTS); - spin_lock_bh(&amt->lock); - amt->ready6 = true; + WRITE_ONCE(amt->ready6, true); amt->mac = amtmq->response_mac; amt->req_cnt = 0; amt->qi = mld2q->mld2q_qqic; - spin_unlock_bh(&amt->lock); skb->protocol = htons(ETH_P_IPV6); eth->h_proto = htons(ETH_P_IPV6); ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); @@ -2392,12 +2431,14 @@ static bool amt_membership_query_handler(struct amt_dev *amt, skb->pkt_type = PACKET_MULTICAST; skb->ip_summed = CHECKSUM_NONE; len = skb->len; + local_bh_disable(); if (__netif_rx(skb) == NET_RX_SUCCESS) { amt_update_gw_status(amt, AMT_STATUS_RECEIVED_QUERY, true); dev_sw_netstats_rx_add(amt->dev, len); } else { amt->dev->stats.rx_dropped++; } + local_bh_enable(); return false; } @@ -2638,7 +2679,9 @@ static bool amt_request_handler(struct amt_dev *amt, struct sk_buff *skb) if (tunnel->ip4 == iph->saddr) goto send; + spin_lock_bh(&amt->lock); if (amt->nr_tunnels >= amt->max_tunnels) { + spin_unlock_bh(&amt->lock); icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); return true; } @@ -2646,8 +2689,10 @@ static bool amt_request_handler(struct amt_dev *amt, struct sk_buff *skb) tunnel = kzalloc(sizeof(*tunnel) + (sizeof(struct hlist_head) * amt->hash_buckets), GFP_ATOMIC); - if (!tunnel) + if (!tunnel) { + spin_unlock_bh(&amt->lock); return true; + } tunnel->source_port = udph->source; tunnel->ip4 = iph->saddr; @@ -2660,10 +2705,9 @@ static bool amt_request_handler(struct amt_dev *amt, struct sk_buff *skb) INIT_DELAYED_WORK(&tunnel->gc_wq, amt_tunnel_expire); - spin_lock_bh(&amt->lock); list_add_tail_rcu(&tunnel->list, &amt->tunnel_list); tunnel->key = amt->key; - amt_update_relay_status(tunnel, AMT_STATUS_RECEIVED_REQUEST, true); + __amt_update_relay_status(tunnel, AMT_STATUS_RECEIVED_REQUEST, true); amt->nr_tunnels++; mod_delayed_work(amt_wq, &tunnel->gc_wq, msecs_to_jiffies(amt_gmi(amt))); @@ -2688,6 +2732,38 @@ send: return false; } +static void amt_gw_rcv(struct amt_dev *amt, struct sk_buff *skb) +{ + int type = amt_parse_type(skb); + int err = 1; + + if (type == -1) + goto drop; + + if (amt->mode == AMT_MODE_GATEWAY) { + switch (type) { + case AMT_MSG_ADVERTISEMENT: + err = amt_advertisement_handler(amt, skb); + break; + case AMT_MSG_MEMBERSHIP_QUERY: + err = amt_membership_query_handler(amt, skb); + if (!err) + return; + break; + default: + netdev_dbg(amt->dev, "Invalid type of Gateway\n"); + break; + } + } +drop: + if (err) { + amt->dev->stats.rx_dropped++; + kfree_skb(skb); + } else { + consume_skb(skb); + } +} + static int amt_rcv(struct sock *sk, struct sk_buff *skb) { struct amt_dev *amt; @@ -2719,8 +2795,12 @@ static int amt_rcv(struct sock *sk, struct sk_buff *skb) err = true; goto drop; } - err = amt_advertisement_handler(amt, skb); - break; + if (amt_queue_event(amt, AMT_EVENT_RECEIVE, skb)) { + netdev_dbg(amt->dev, "AMT Event queue full\n"); + err = true; + goto drop; + } + goto out; case AMT_MSG_MULTICAST_DATA: if (iph->saddr != amt->remote_ip) { netdev_dbg(amt->dev, "Invalid Relay IP\n"); @@ -2738,11 +2818,12 @@ static int amt_rcv(struct sock *sk, struct sk_buff *skb) err = true; goto drop; } - err = amt_membership_query_handler(amt, skb); - if (err) + if (amt_queue_event(amt, AMT_EVENT_RECEIVE, skb)) { + netdev_dbg(amt->dev, "AMT Event queue full\n"); + err = true; goto drop; - else - goto out; + } + goto out; default: err = true; netdev_dbg(amt->dev, "Invalid type of Gateway\n"); @@ -2780,6 +2861,46 @@ out: return 0; } +static void amt_event_work(struct work_struct *work) +{ + struct amt_dev *amt = container_of(work, struct amt_dev, event_wq); + struct sk_buff *skb; + u8 event; + int i; + + for (i = 0; i < AMT_MAX_EVENTS; i++) { + spin_lock_bh(&amt->lock); + if (amt->nr_events == 0) { + spin_unlock_bh(&amt->lock); + return; + } + event = amt->events[amt->event_idx].event; + skb = amt->events[amt->event_idx].skb; + amt->events[amt->event_idx].event = AMT_EVENT_NONE; + amt->events[amt->event_idx].skb = NULL; + amt->nr_events--; + amt->event_idx++; + amt->event_idx %= AMT_MAX_EVENTS; + spin_unlock_bh(&amt->lock); + + switch (event) { + case AMT_EVENT_RECEIVE: + amt_gw_rcv(amt, skb); + break; + case AMT_EVENT_SEND_DISCOVERY: + amt_event_send_discovery(amt); + break; + case AMT_EVENT_SEND_REQUEST: + amt_event_send_request(amt); + break; + default: + if (skb) + kfree_skb(skb); + break; + } + } +} + static int amt_err_lookup(struct sock *sk, struct sk_buff *skb) { struct amt_dev *amt; @@ -2804,7 +2925,7 @@ static int amt_err_lookup(struct sock *sk, struct sk_buff *skb) break; case AMT_MSG_REQUEST: case AMT_MSG_MEMBERSHIP_UPDATE: - if (amt->status >= AMT_STATUS_RECEIVED_ADVERTISEMENT) + if (READ_ONCE(amt->status) >= AMT_STATUS_RECEIVED_ADVERTISEMENT) mod_delayed_work(amt_wq, &amt->req_wq, 0); break; default: @@ -2867,6 +2988,8 @@ static int amt_dev_open(struct net_device *dev) amt->ready4 = false; amt->ready6 = false; + amt->event_idx = 0; + amt->nr_events = 0; err = amt_socket_create(amt); if (err) @@ -2874,6 +2997,7 @@ static int amt_dev_open(struct net_device *dev) amt->req_cnt = 0; amt->remote_ip = 0; + amt->nonce = 0; get_random_bytes(&amt->key, sizeof(siphash_key_t)); amt->status = AMT_STATUS_INIT; @@ -2892,6 +3016,8 @@ static int amt_dev_stop(struct net_device *dev) struct amt_dev *amt = netdev_priv(dev); struct amt_tunnel_list *tunnel, *tmp; struct socket *sock; + struct sk_buff *skb; + int i; cancel_delayed_work_sync(&amt->req_wq); cancel_delayed_work_sync(&amt->discovery_wq); @@ -2904,6 +3030,15 @@ static int amt_dev_stop(struct net_device *dev) if (sock) udp_tunnel_sock_release(sock); + cancel_work_sync(&amt->event_wq); + for (i = 0; i < AMT_MAX_EVENTS; i++) { + skb = amt->events[i].skb; + if (skb) + kfree_skb(skb); + amt->events[i].event = AMT_EVENT_NONE; + amt->events[i].skb = NULL; + } + amt->ready4 = false; amt->ready6 = false; amt->req_cnt = 0; @@ -3095,7 +3230,7 @@ static int amt_newlink(struct net *net, struct net_device *dev, goto err; } if (amt->mode == AMT_MODE_RELAY) { - amt->qrv = amt->net->ipv4.sysctl_igmp_qrv; + amt->qrv = READ_ONCE(amt->net->ipv4.sysctl_igmp_qrv); amt->qri = 10; dev->needed_headroom = amt->stream_dev->needed_headroom + AMT_RELAY_HLEN; @@ -3146,8 +3281,8 @@ static int amt_newlink(struct net *net, struct net_device *dev, INIT_DELAYED_WORK(&amt->discovery_wq, amt_discovery_work); INIT_DELAYED_WORK(&amt->req_wq, amt_req_work); INIT_DELAYED_WORK(&amt->secret_wq, amt_secret_work); + INIT_WORK(&amt->event_wq, amt_event_work); INIT_LIST_HEAD(&amt->tunnel_list); - return 0; err: dev_put(amt->stream_dev); @@ -3280,7 +3415,7 @@ static int __init amt_init(void) if (err < 0) goto unregister_notifier; - amt_wq = alloc_workqueue("amt", WQ_UNBOUND, 1); + amt_wq = alloc_workqueue("amt", WQ_UNBOUND, 0); if (!amt_wq) { err = -ENOMEM; goto rtnl_unregister; diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index ba42cef10a53..cb0321ea853c 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -1843,6 +1843,7 @@ static int rcar_canfd_probe(struct platform_device *pdev) of_child = of_get_child_by_name(pdev->dev.of_node, name); if (of_child && of_device_is_available(of_child)) channels_mask |= BIT(i); + of_node_put(of_child); } if (chip_id != RENESAS_RZG2L) { diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 9b47b07162fe..bc6518504fd4 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -1690,8 +1690,8 @@ static int mcp251xfd_register_chip_detect(struct mcp251xfd_priv *priv) u32 osc; int err; - /* The OSC_LPMEN is only supported on MCP2518FD, so use it to - * autodetect the model. + /* The OSC_LPMEN is only supported on MCP2518FD and MCP251863, + * so use it to autodetect the model. */ err = regmap_update_bits(priv->map_reg, MCP251XFD_REG_OSC, MCP251XFD_REG_OSC_LPMEN, @@ -1703,10 +1703,18 @@ static int mcp251xfd_register_chip_detect(struct mcp251xfd_priv *priv) if (err) return err; - if (osc & MCP251XFD_REG_OSC_LPMEN) - devtype_data = &mcp251xfd_devtype_data_mcp2518fd; - else + if (osc & MCP251XFD_REG_OSC_LPMEN) { + /* We cannot distinguish between MCP2518FD and + * MCP251863. If firmware specifies MCP251863, keep + * it, otherwise set to MCP2518FD. + */ + if (mcp251xfd_is_251863(priv)) + devtype_data = &mcp251xfd_devtype_data_mcp251863; + else + devtype_data = &mcp251xfd_devtype_data_mcp2518fd; + } else { devtype_data = &mcp251xfd_devtype_data_mcp2517fd; + } if (!mcp251xfd_is_251XFD(priv) && priv->devtype_data.model != devtype_data->model) { diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 9ca8c8d7740f..92a500e1ccd2 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -1038,18 +1038,21 @@ int ksz_switch_register(struct ksz_device *dev, ports = of_get_child_by_name(dev->dev->of_node, "ethernet-ports"); if (!ports) ports = of_get_child_by_name(dev->dev->of_node, "ports"); - if (ports) + if (ports) { for_each_available_child_of_node(ports, port) { if (of_property_read_u32(port, "reg", &port_num)) continue; if (!(dev->port_mask & BIT(port_num))) { of_node_put(port); + of_node_put(ports); return -EINVAL; } of_get_phy_mode(port, &dev->ports[port_num].interface); } + of_node_put(ports); + } dev->synclko_125 = of_property_read_bool(dev->dev->of_node, "microchip,synclko-125"); dev->synclko_disable = of_property_read_bool(dev->dev->of_node, diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 72b6fc1932b5..698c7d1fb45c 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -3382,12 +3382,28 @@ static const struct of_device_id sja1105_dt_ids[] = { }; MODULE_DEVICE_TABLE(of, sja1105_dt_ids); +static const struct spi_device_id sja1105_spi_ids[] = { + { "sja1105e" }, + { "sja1105t" }, + { "sja1105p" }, + { "sja1105q" }, + { "sja1105r" }, + { "sja1105s" }, + { "sja1110a" }, + { "sja1110b" }, + { "sja1110c" }, + { "sja1110d" }, + { }, +}; +MODULE_DEVICE_TABLE(spi, sja1105_spi_ids); + static struct spi_driver sja1105_driver = { .driver = { .name = "sja1105", .owner = THIS_MODULE, .of_match_table = of_match_ptr(sja1105_dt_ids), }, + .id_table = sja1105_spi_ids, .probe = sja1105_probe, .remove = sja1105_remove, .shutdown = sja1105_shutdown, diff --git a/drivers/net/dsa/vitesse-vsc73xx-spi.c b/drivers/net/dsa/vitesse-vsc73xx-spi.c index 3110895358d8..97a92e6da60d 100644 --- a/drivers/net/dsa/vitesse-vsc73xx-spi.c +++ b/drivers/net/dsa/vitesse-vsc73xx-spi.c @@ -205,10 +205,20 @@ static const struct of_device_id vsc73xx_of_match[] = { }; MODULE_DEVICE_TABLE(of, vsc73xx_of_match); +static const struct spi_device_id vsc73xx_spi_ids[] = { + { "vsc7385" }, + { "vsc7388" }, + { "vsc7395" }, + { "vsc7398" }, + { }, +}; +MODULE_DEVICE_TABLE(spi, vsc73xx_spi_ids); + static struct spi_driver vsc73xx_spi_driver = { .probe = vsc73xx_spi_probe, .remove = vsc73xx_spi_remove, .shutdown = vsc73xx_spi_shutdown, + .id_table = vsc73xx_spi_ids, .driver = { .name = "vsc73xx-spi", .of_match_table = vsc73xx_of_match, diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index 7c760aa65540..ddfe9208529a 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -1236,8 +1236,8 @@ static struct sock *chtls_recv_sock(struct sock *lsk, csk->sndbuf = newsk->sk_sndbuf; csk->smac_idx = ((struct port_info *)netdev_priv(ndev))->smt_idx; RCV_WSCALE(tp) = select_rcv_wscale(tcp_full_space(newsk), - sock_net(newsk)-> - ipv4.sysctl_tcp_window_scaling, + READ_ONCE(sock_net(newsk)-> + ipv4.sysctl_tcp_window_scaling), tp->window_clamp); neigh_release(n); inet_inherit_port(&tcp_hashinfo, lsk, newsk); @@ -1384,7 +1384,7 @@ static void chtls_pass_accept_request(struct sock *sk, #endif } if (req->tcpopt.wsf <= 14 && - sock_net(sk)->ipv4.sysctl_tcp_window_scaling) { + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) { inet_rsk(oreq)->wscale_ok = 1; inet_rsk(oreq)->snd_wscale = req->tcpopt.wsf; } diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c index 528eb0f223b1..b4f5e57d0285 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.c +++ b/drivers/net/ethernet/emulex/benet/be_cmds.c @@ -2287,7 +2287,7 @@ err: /* Uses sync mcc */ int be_cmd_read_port_transceiver_data(struct be_adapter *adapter, - u8 page_num, u8 *data) + u8 page_num, u32 off, u32 len, u8 *data) { struct be_dma_mem cmd; struct be_mcc_wrb *wrb; @@ -2321,10 +2321,10 @@ int be_cmd_read_port_transceiver_data(struct be_adapter *adapter, req->port = cpu_to_le32(adapter->hba_port_num); req->page_num = cpu_to_le32(page_num); status = be_mcc_notify_wait(adapter); - if (!status) { + if (!status && len > 0) { struct be_cmd_resp_port_type *resp = cmd.va; - memcpy(data, resp->page_data, PAGE_DATA_LEN); + memcpy(data, resp->page_data + off, len); } err: mutex_unlock(&adapter->mcc_lock); @@ -2415,7 +2415,7 @@ int be_cmd_query_cable_type(struct be_adapter *adapter) int status; status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, - page_data); + 0, PAGE_DATA_LEN, page_data); if (!status) { switch (adapter->phy.interface_type) { case PHY_TYPE_QSFP: @@ -2440,7 +2440,7 @@ int be_cmd_query_sfp_info(struct be_adapter *adapter) int status; status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, - page_data); + 0, PAGE_DATA_LEN, page_data); if (!status) { strlcpy(adapter->phy.vendor_name, page_data + SFP_VENDOR_NAME_OFFSET, SFP_VENDOR_NAME_LEN - 1); diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h index db1f3b908582..e2085c68c0ee 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.h +++ b/drivers/net/ethernet/emulex/benet/be_cmds.h @@ -2427,7 +2427,7 @@ int be_cmd_set_beacon_state(struct be_adapter *adapter, u8 port_num, u8 beacon, int be_cmd_get_beacon_state(struct be_adapter *adapter, u8 port_num, u32 *state); int be_cmd_read_port_transceiver_data(struct be_adapter *adapter, - u8 page_num, u8 *data); + u8 page_num, u32 off, u32 len, u8 *data); int be_cmd_query_cable_type(struct be_adapter *adapter); int be_cmd_query_sfp_info(struct be_adapter *adapter); int lancer_cmd_read_object(struct be_adapter *adapter, struct be_dma_mem *cmd, diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c index dfa784339781..bd0df189d871 100644 --- a/drivers/net/ethernet/emulex/benet/be_ethtool.c +++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c @@ -1344,7 +1344,7 @@ static int be_get_module_info(struct net_device *netdev, return -EOPNOTSUPP; status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, - page_data); + 0, PAGE_DATA_LEN, page_data); if (!status) { if (!page_data[SFP_PLUS_SFF_8472_COMP]) { modinfo->type = ETH_MODULE_SFF_8079; @@ -1362,25 +1362,32 @@ static int be_get_module_eeprom(struct net_device *netdev, { struct be_adapter *adapter = netdev_priv(netdev); int status; + u32 begin, end; if (!check_privilege(adapter, MAX_PRIVILEGES)) return -EOPNOTSUPP; - status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, - data); - if (status) - goto err; + begin = eeprom->offset; + end = eeprom->offset + eeprom->len; + + if (begin < PAGE_DATA_LEN) { + status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, begin, + min_t(u32, end, PAGE_DATA_LEN) - begin, + data); + if (status) + goto err; + + data += PAGE_DATA_LEN - begin; + begin = PAGE_DATA_LEN; + } - if (eeprom->offset + eeprom->len > PAGE_DATA_LEN) { - status = be_cmd_read_port_transceiver_data(adapter, - TR_PAGE_A2, - data + - PAGE_DATA_LEN); + if (end > PAGE_DATA_LEN) { + status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A2, + begin - PAGE_DATA_LEN, + end - begin, data); if (status) goto err; } - if (eeprom->offset) - memcpy(data, data + eeprom->offset, eeprom->len); err: return be_cmd_status(status); } diff --git a/drivers/net/ethernet/fungible/funeth/funeth_rx.c b/drivers/net/ethernet/fungible/funeth/funeth_rx.c index 0f6a549b9f67..29a6c2ede43a 100644 --- a/drivers/net/ethernet/fungible/funeth/funeth_rx.c +++ b/drivers/net/ethernet/fungible/funeth/funeth_rx.c @@ -142,6 +142,7 @@ static void *fun_run_xdp(struct funeth_rxq *q, skb_frag_t *frags, void *buf_va, int ref_ok, struct funeth_txq *xdp_q) { struct bpf_prog *xdp_prog; + struct xdp_frame *xdpf; struct xdp_buff xdp; u32 act; @@ -163,7 +164,9 @@ static void *fun_run_xdp(struct funeth_rxq *q, skb_frag_t *frags, void *buf_va, case XDP_TX: if (unlikely(!ref_ok)) goto pass; - if (!fun_xdp_tx(xdp_q, xdp.data, xdp.data_end - xdp.data)) + + xdpf = xdp_convert_buff_to_frame(&xdp); + if (!xdpf || !fun_xdp_tx(xdp_q, xdpf)) goto xdp_error; FUN_QSTAT_INC(q, xdp_tx); q->xdp_flush |= FUN_XDP_FLUSH_TX; diff --git a/drivers/net/ethernet/fungible/funeth/funeth_tx.c b/drivers/net/ethernet/fungible/funeth/funeth_tx.c index ff6e29237253..2f6698b98b03 100644 --- a/drivers/net/ethernet/fungible/funeth/funeth_tx.c +++ b/drivers/net/ethernet/fungible/funeth/funeth_tx.c @@ -466,7 +466,7 @@ static unsigned int fun_xdpq_clean(struct funeth_txq *q, unsigned int budget) do { fun_xdp_unmap(q, reclaim_idx); - page_frag_free(q->info[reclaim_idx].vaddr); + xdp_return_frame(q->info[reclaim_idx].xdpf); trace_funeth_tx_free(q, reclaim_idx, 1, head); @@ -479,11 +479,11 @@ static unsigned int fun_xdpq_clean(struct funeth_txq *q, unsigned int budget) return npkts; } -bool fun_xdp_tx(struct funeth_txq *q, void *data, unsigned int len) +bool fun_xdp_tx(struct funeth_txq *q, struct xdp_frame *xdpf) { struct fun_eth_tx_req *req; struct fun_dataop_gl *gle; - unsigned int idx; + unsigned int idx, len; dma_addr_t dma; if (fun_txq_avail(q) < FUN_XDP_CLEAN_THRES) @@ -494,7 +494,8 @@ bool fun_xdp_tx(struct funeth_txq *q, void *data, unsigned int len) return false; } - dma = dma_map_single(q->dma_dev, data, len, DMA_TO_DEVICE); + len = xdpf->len; + dma = dma_map_single(q->dma_dev, xdpf->data, len, DMA_TO_DEVICE); if (unlikely(dma_mapping_error(q->dma_dev, dma))) { FUN_QSTAT_INC(q, tx_map_err); return false; @@ -514,7 +515,7 @@ bool fun_xdp_tx(struct funeth_txq *q, void *data, unsigned int len) gle = (struct fun_dataop_gl *)req->dataop.imm; fun_dataop_gl_init(gle, 0, 0, len, dma); - q->info[idx].vaddr = data; + q->info[idx].xdpf = xdpf; u64_stats_update_begin(&q->syncp); q->stats.tx_bytes += len; @@ -545,12 +546,9 @@ int fun_xdp_xmit_frames(struct net_device *dev, int n, if (unlikely(q_idx >= fp->num_xdpqs)) return -ENXIO; - for (q = xdpqs[q_idx], i = 0; i < n; i++) { - const struct xdp_frame *xdpf = frames[i]; - - if (!fun_xdp_tx(q, xdpf->data, xdpf->len)) + for (q = xdpqs[q_idx], i = 0; i < n; i++) + if (!fun_xdp_tx(q, frames[i])) break; - } if (unlikely(flags & XDP_XMIT_FLUSH)) fun_txq_wr_db(q); @@ -577,7 +575,7 @@ static void fun_xdpq_purge(struct funeth_txq *q) unsigned int idx = q->cons_cnt & q->mask; fun_xdp_unmap(q, idx); - page_frag_free(q->info[idx].vaddr); + xdp_return_frame(q->info[idx].xdpf); q->cons_cnt++; } } diff --git a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h index 04c9f91b7489..8708e2895946 100644 --- a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h +++ b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h @@ -95,8 +95,8 @@ struct funeth_txq_stats { /* per Tx queue SW counters */ struct funeth_tx_info { /* per Tx descriptor state */ union { - struct sk_buff *skb; /* associated packet */ - void *vaddr; /* start address for XDP */ + struct sk_buff *skb; /* associated packet (sk_buff path) */ + struct xdp_frame *xdpf; /* associated XDP frame (XDP path) */ }; }; @@ -245,7 +245,7 @@ static inline int fun_irq_node(const struct fun_irq *p) int fun_rxq_napi_poll(struct napi_struct *napi, int budget); int fun_txq_napi_poll(struct napi_struct *napi, int budget); netdev_tx_t fun_start_xmit(struct sk_buff *skb, struct net_device *netdev); -bool fun_xdp_tx(struct funeth_txq *q, void *data, unsigned int len); +bool fun_xdp_tx(struct funeth_txq *q, struct xdp_frame *xdpf); int fun_xdp_xmit_frames(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); diff --git a/drivers/net/ethernet/intel/e1000e/hw.h b/drivers/net/ethernet/intel/e1000e/hw.h index 13382df2f2ef..bcf680e83811 100644 --- a/drivers/net/ethernet/intel/e1000e/hw.h +++ b/drivers/net/ethernet/intel/e1000e/hw.h @@ -630,7 +630,6 @@ struct e1000_phy_info { bool disable_polarity_correction; bool is_mdix; bool polarity_correction; - bool reset_disable; bool speed_downgraded; bool autoneg_wait_to_complete; }; diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index e6c8e6d5234f..9466f65a6da7 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -2050,10 +2050,6 @@ static s32 e1000_check_reset_block_ich8lan(struct e1000_hw *hw) bool blocked = false; int i = 0; - /* Check the PHY (LCD) reset flag */ - if (hw->phy.reset_disable) - return true; - while ((blocked = !(er32(FWSM) & E1000_ICH_FWSM_RSPCIPHY)) && (i++ < 30)) usleep_range(10000, 11000); diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.h b/drivers/net/ethernet/intel/e1000e/ich8lan.h index 638a3ddd7ada..2504b11c3169 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.h +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.h @@ -271,7 +271,6 @@ #define I217_CGFREG_ENABLE_MTA_RESET 0x0002 #define I217_MEMPWR PHY_REG(772, 26) #define I217_MEMPWR_DISABLE_SMB_RELEASE 0x0010 -#define I217_MEMPWR_MOEM 0x1000 /* Receive Address Initial CRC Calculation */ #define E1000_PCH_RAICC(_n) (0x05F50 + ((_n) * 4)) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index fa06f68c8c80..f1729940e46c 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6494,6 +6494,10 @@ static void e1000e_s0ix_exit_flow(struct e1000_adapter *adapter) if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID && hw->mac.type >= e1000_pch_adp) { + /* Keep the GPT clock enabled for CSME */ + mac_data = er32(FEXTNVM); + mac_data |= BIT(3); + ew32(FEXTNVM, mac_data); /* Request ME unconfigure the device from S0ix */ mac_data = er32(H2ME); mac_data &= ~E1000_H2ME_START_DPG; @@ -6987,21 +6991,8 @@ static __maybe_unused int e1000e_pm_suspend(struct device *dev) struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev)); struct e1000_adapter *adapter = netdev_priv(netdev); struct pci_dev *pdev = to_pci_dev(dev); - struct e1000_hw *hw = &adapter->hw; - u16 phy_data; int rc; - if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID && - hw->mac.type >= e1000_pch_adp) { - /* Mask OEM Bits / Gig Disable / Restart AN (772_26[12] = 1) */ - e1e_rphy(hw, I217_MEMPWR, &phy_data); - phy_data |= I217_MEMPWR_MOEM; - e1e_wphy(hw, I217_MEMPWR, phy_data); - - /* Disable LCD reset */ - hw->phy.reset_disable = true; - } - e1000e_flush_lpic(pdev); e1000e_pm_freeze(dev); @@ -7023,8 +7014,6 @@ static __maybe_unused int e1000e_pm_resume(struct device *dev) struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev)); struct e1000_adapter *adapter = netdev_priv(netdev); struct pci_dev *pdev = to_pci_dev(dev); - struct e1000_hw *hw = &adapter->hw; - u16 phy_data; int rc; /* Introduce S0ix implementation */ @@ -7035,17 +7024,6 @@ static __maybe_unused int e1000e_pm_resume(struct device *dev) if (rc) return rc; - if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID && - hw->mac.type >= e1000_pch_adp) { - /* Unmask OEM Bits / Gig Disable / Restart AN 772_26[12] = 0 */ - e1e_rphy(hw, I217_MEMPWR, &phy_data); - phy_data &= ~I217_MEMPWR_MOEM; - e1e_wphy(hw, I217_MEMPWR, phy_data); - - /* Enable LCD reset */ - hw->phy.reset_disable = false; - } - return e1000e_pm_thaw(dev); } diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index aa786fd55951..685556e968f2 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1925,11 +1925,15 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, * non-zero req_queue_pairs says that user requested a new * queue count via ethtool's set_channels, so use this * value for queues distribution across traffic classes + * We need at least one queue pair for the interface + * to be usable as we see in else statement. */ if (vsi->req_queue_pairs > 0) vsi->num_queue_pairs = vsi->req_queue_pairs; else if (pf->flags & I40E_FLAG_MSIX_ENABLED) vsi->num_queue_pairs = pf->num_lan_msix; + else + vsi->num_queue_pairs = 1; } /* Number of queues per enabled TC */ @@ -10650,7 +10654,7 @@ static int i40e_reset(struct i40e_pf *pf) **/ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) { - int old_recovery_mode_bit = test_bit(__I40E_RECOVERY_MODE, pf->state); + const bool is_recovery_mode_reported = i40e_check_recovery_mode(pf); struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi]; struct i40e_hw *hw = &pf->hw; i40e_status ret; @@ -10658,13 +10662,11 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) int v; if (test_bit(__I40E_EMP_RESET_INTR_RECEIVED, pf->state) && - i40e_check_recovery_mode(pf)) { + is_recovery_mode_reported) i40e_set_ethtool_ops(pf->vsi[pf->lan_vsi]->netdev); - } if (test_bit(__I40E_DOWN, pf->state) && - !test_bit(__I40E_RECOVERY_MODE, pf->state) && - !old_recovery_mode_bit) + !test_bit(__I40E_RECOVERY_MODE, pf->state)) goto clear_recovery; dev_dbg(&pf->pdev->dev, "Rebuilding internal switch\n"); @@ -10691,13 +10693,12 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) * accordingly with regard to resources initialization * and deinitialization */ - if (test_bit(__I40E_RECOVERY_MODE, pf->state) || - old_recovery_mode_bit) { + if (test_bit(__I40E_RECOVERY_MODE, pf->state)) { if (i40e_get_capabilities(pf, i40e_aqc_opc_list_func_capabilities)) goto end_unlock; - if (test_bit(__I40E_RECOVERY_MODE, pf->state)) { + if (is_recovery_mode_reported) { /* we're staying in recovery mode so we'll reinitialize * misc vector here */ diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 49aed3e506a6..0ea0361cd86b 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -64,7 +64,6 @@ struct iavf_vsi { u16 id; DECLARE_BITMAP(state, __IAVF_VSI_STATE_SIZE__); int base_vector; - u16 work_limit; u16 qs_handle; void *priv; /* client driver data reference. */ }; @@ -159,8 +158,12 @@ struct iavf_vlan { struct iavf_vlan_filter { struct list_head list; struct iavf_vlan vlan; - bool remove; /* filter needs to be removed */ - bool add; /* filter needs to be added */ + struct { + u8 is_new_vlan:1; /* filter is new, wait for PF answer */ + u8 remove:1; /* filter needs to be removed */ + u8 add:1; /* filter needs to be added */ + u8 padding:5; + }; }; #define IAVF_MAX_TRAFFIC_CLASS 4 @@ -461,6 +464,10 @@ static inline const char *iavf_state_str(enum iavf_state_t state) return "__IAVF_INIT_VERSION_CHECK"; case __IAVF_INIT_GET_RESOURCES: return "__IAVF_INIT_GET_RESOURCES"; + case __IAVF_INIT_EXTENDED_CAPS: + return "__IAVF_INIT_EXTENDED_CAPS"; + case __IAVF_INIT_CONFIG_ADAPTER: + return "__IAVF_INIT_CONFIG_ADAPTER"; case __IAVF_INIT_SW: return "__IAVF_INIT_SW"; case __IAVF_INIT_FAILED: @@ -520,6 +527,7 @@ int iavf_get_vf_config(struct iavf_adapter *adapter); int iavf_get_vf_vlan_v2_caps(struct iavf_adapter *adapter); int iavf_send_vf_offload_vlan_v2_msg(struct iavf_adapter *adapter); void iavf_set_queue_vlan_tag_loc(struct iavf_adapter *adapter); +u16 iavf_get_num_vlans_added(struct iavf_adapter *adapter); void iavf_irq_enable(struct iavf_adapter *adapter, bool flush); void iavf_configure_queues(struct iavf_adapter *adapter); void iavf_deconfigure_queues(struct iavf_adapter *adapter); diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 3bb56714beb0..e535d4c3da49 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -692,12 +692,8 @@ static int __iavf_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec, int queue) { struct iavf_adapter *adapter = netdev_priv(netdev); - struct iavf_vsi *vsi = &adapter->vsi; struct iavf_ring *rx_ring, *tx_ring; - ec->tx_max_coalesced_frames = vsi->work_limit; - ec->rx_max_coalesced_frames = vsi->work_limit; - /* Rx and Tx usecs per queue value. If user doesn't specify the * queue, return queue 0's value to represent. */ @@ -825,12 +821,8 @@ static int __iavf_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec, int queue) { struct iavf_adapter *adapter = netdev_priv(netdev); - struct iavf_vsi *vsi = &adapter->vsi; int i; - if (ec->tx_max_coalesced_frames_irq || ec->rx_max_coalesced_frames_irq) - vsi->work_limit = ec->tx_max_coalesced_frames_irq; - if (ec->rx_coalesce_usecs == 0) { if (ec->use_adaptive_rx_coalesce) netif_info(adapter, drv, netdev, "rx-usecs=0, need to disable adaptive-rx for a complete disable\n"); @@ -1969,8 +1961,6 @@ static int iavf_set_rxfh(struct net_device *netdev, const u32 *indir, static const struct ethtool_ops iavf_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_USECS | - ETHTOOL_COALESCE_MAX_FRAMES | - ETHTOOL_COALESCE_MAX_FRAMES_IRQ | ETHTOOL_COALESCE_USE_ADAPTIVE, .get_drvinfo = iavf_get_drvinfo, .get_link = ethtool_op_get_link, diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index f3ecb3bca33d..2e2c153ce46a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -843,7 +843,7 @@ static void iavf_restore_filters(struct iavf_adapter *adapter) * iavf_get_num_vlans_added - get number of VLANs added * @adapter: board private structure */ -static u16 iavf_get_num_vlans_added(struct iavf_adapter *adapter) +u16 iavf_get_num_vlans_added(struct iavf_adapter *adapter) { return bitmap_weight(adapter->vsi.active_cvlans, VLAN_N_VID) + bitmap_weight(adapter->vsi.active_svlans, VLAN_N_VID); @@ -906,11 +906,6 @@ static int iavf_vlan_rx_add_vid(struct net_device *netdev, if (!iavf_add_vlan(adapter, IAVF_VLAN(vid, be16_to_cpu(proto)))) return -ENOMEM; - if (proto == cpu_to_be16(ETH_P_8021Q)) - set_bit(vid, adapter->vsi.active_cvlans); - else - set_bit(vid, adapter->vsi.active_svlans); - return 0; } @@ -2245,7 +2240,6 @@ int iavf_parse_vf_resource_msg(struct iavf_adapter *adapter) adapter->vsi.back = adapter; adapter->vsi.base_vector = 1; - adapter->vsi.work_limit = IAVF_DEFAULT_IRQ_WORK; vsi->netdev = adapter->netdev; vsi->qs_handle = adapter->vsi_res->qset_handle; if (adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_RSS_PF) { @@ -2956,6 +2950,9 @@ continue_reset: adapter->aq_required |= IAVF_FLAG_AQ_ADD_CLOUD_FILTER; iavf_misc_irq_enable(adapter); + bitmap_clear(adapter->vsi.active_cvlans, 0, VLAN_N_VID); + bitmap_clear(adapter->vsi.active_svlans, 0, VLAN_N_VID); + mod_delayed_work(iavf_wq, &adapter->watchdog_task, 2); /* We were running when the reset started, so we need to restore some diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 978f651c6b09..06d18797d25a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -194,7 +194,7 @@ static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, struct iavf_tx_buffer *tx_buf; struct iavf_tx_desc *tx_desc; unsigned int total_bytes = 0, total_packets = 0; - unsigned int budget = vsi->work_limit; + unsigned int budget = IAVF_DEFAULT_IRQ_WORK; tx_buf = &tx_ring->tx_bi[i]; tx_desc = IAVF_TX_DESC(tx_ring, i); @@ -1285,11 +1285,10 @@ static struct iavf_rx_buffer *iavf_get_rx_buffer(struct iavf_ring *rx_ring, { struct iavf_rx_buffer *rx_buffer; - if (!size) - return NULL; - rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean]; prefetchw(rx_buffer->page); + if (!size) + return rx_buffer; /* we are reusing so sync this buffer for CPU use */ dma_sync_single_range_for_cpu(rx_ring->dev, diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 782450d5c12f..1603e99bae4a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -627,6 +627,33 @@ static void iavf_mac_add_reject(struct iavf_adapter *adapter) } /** + * iavf_vlan_add_reject + * @adapter: adapter structure + * + * Remove VLAN filters from list based on PF response. + **/ +static void iavf_vlan_add_reject(struct iavf_adapter *adapter) +{ + struct iavf_vlan_filter *f, *ftmp; + + spin_lock_bh(&adapter->mac_vlan_list_lock); + list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { + if (f->is_new_vlan) { + if (f->vlan.tpid == ETH_P_8021Q) + clear_bit(f->vlan.vid, + adapter->vsi.active_cvlans); + else + clear_bit(f->vlan.vid, + adapter->vsi.active_svlans); + + list_del(&f->list); + kfree(f); + } + } + spin_unlock_bh(&adapter->mac_vlan_list_lock); +} + +/** * iavf_add_vlans * @adapter: adapter structure * @@ -683,6 +710,7 @@ void iavf_add_vlans(struct iavf_adapter *adapter) vvfl->vlan_id[i] = f->vlan.vid; i++; f->add = false; + f->is_new_vlan = true; if (i == count) break; } @@ -695,10 +723,18 @@ void iavf_add_vlans(struct iavf_adapter *adapter) iavf_send_pf_msg(adapter, VIRTCHNL_OP_ADD_VLAN, (u8 *)vvfl, len); kfree(vvfl); } else { + u16 max_vlans = adapter->vlan_v2_caps.filtering.max_filters; + u16 current_vlans = iavf_get_num_vlans_added(adapter); struct virtchnl_vlan_filter_list_v2 *vvfl_v2; adapter->current_op = VIRTCHNL_OP_ADD_VLAN_V2; + if ((count + current_vlans) > max_vlans && + current_vlans < max_vlans) { + count = max_vlans - iavf_get_num_vlans_added(adapter); + more = true; + } + len = sizeof(*vvfl_v2) + ((count - 1) * sizeof(struct virtchnl_vlan_filter)); if (len > IAVF_MAX_AQ_BUF_SIZE) { @@ -725,6 +761,9 @@ void iavf_add_vlans(struct iavf_adapter *adapter) &adapter->vlan_v2_caps.filtering.filtering_support; struct virtchnl_vlan *vlan; + if (i == count) + break; + /* give priority over outer if it's enabled */ if (filtering_support->outer) vlan = &vvfl_v2->filters[i].outer; @@ -736,8 +775,7 @@ void iavf_add_vlans(struct iavf_adapter *adapter) i++; f->add = false; - if (i == count) - break; + f->is_new_vlan = true; } } @@ -2080,6 +2118,11 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, */ iavf_netdev_features_vlan_strip_set(netdev, true); break; + case VIRTCHNL_OP_ADD_VLAN_V2: + iavf_vlan_add_reject(adapter); + dev_warn(&adapter->pdev->dev, "Failed to add VLAN filter, error %s\n", + iavf_stat_str(&adapter->hw, v_retval)); + break; default: dev_err(&adapter->pdev->dev, "PF returned error %d (%s) to our request %d\n", v_retval, iavf_stat_str(&adapter->hw, v_retval), @@ -2332,6 +2375,24 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, spin_unlock_bh(&adapter->adv_rss_lock); } break; + case VIRTCHNL_OP_ADD_VLAN_V2: { + struct iavf_vlan_filter *f; + + spin_lock_bh(&adapter->mac_vlan_list_lock); + list_for_each_entry(f, &adapter->vlan_filter_list, list) { + if (f->is_new_vlan) { + f->is_new_vlan = false; + if (f->vlan.tpid == ETH_P_8021Q) + set_bit(f->vlan.vid, + adapter->vsi.active_cvlans); + else + set_bit(f->vlan.vid, + adapter->vsi.active_svlans); + } + } + spin_unlock_bh(&adapter->mac_vlan_list_lock); + } + break; case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: /* PF enabled vlan strip on this VF. * Update netdev->features if needed to be in sync with ethtool. diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 70335f6e8524..4efa5e5846e0 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -658,7 +658,8 @@ static int ice_lbtest_receive_frames(struct ice_rx_ring *rx_ring) rx_desc = ICE_RX_DESC(rx_ring, i); if (!(rx_desc->wb.status_error0 & - cpu_to_le16(ICE_TX_DESC_CMD_EOP | ICE_TX_DESC_CMD_RS))) + (cpu_to_le16(BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S)) | + cpu_to_le16(BIT(ICE_RX_FLEX_DESC_STATUS0_EOF_S))))) continue; rx_buf = &rx_ring->rx_buf[i]; diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index ff2eac2f8c64..9f02b60459f1 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -4656,6 +4656,8 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) ice_set_safe_mode_caps(hw); } + hw->ucast_shared = true; + err = ice_init_pf(pf); if (err) { dev_err(dev, "ice_init_pf failed: %d\n", err); @@ -6011,10 +6013,12 @@ int ice_vsi_cfg(struct ice_vsi *vsi) if (vsi->netdev) { ice_set_rx_mode(vsi->netdev); - err = ice_vsi_vlan_setup(vsi); + if (vsi->type != ICE_VSI_LB) { + err = ice_vsi_vlan_setup(vsi); - if (err) - return err; + if (err) + return err; + } } ice_vsi_cfg_dcb_rings(vsi); diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c b/drivers/net/ethernet/intel/ice/ice_sriov.c index bb1721f1321d..f4907a3c2d19 100644 --- a/drivers/net/ethernet/intel/ice/ice_sriov.c +++ b/drivers/net/ethernet/intel/ice/ice_sriov.c @@ -1310,39 +1310,6 @@ out_put_vf: } /** - * ice_unicast_mac_exists - check if the unicast MAC exists on the PF's switch - * @pf: PF used to reference the switch's rules - * @umac: unicast MAC to compare against existing switch rules - * - * Return true on the first/any match, else return false - */ -static bool ice_unicast_mac_exists(struct ice_pf *pf, u8 *umac) -{ - struct ice_sw_recipe *mac_recipe_list = - &pf->hw.switch_info->recp_list[ICE_SW_LKUP_MAC]; - struct ice_fltr_mgmt_list_entry *list_itr; - struct list_head *rule_head; - struct mutex *rule_lock; /* protect MAC filter list access */ - - rule_head = &mac_recipe_list->filt_rules; - rule_lock = &mac_recipe_list->filt_rule_lock; - - mutex_lock(rule_lock); - list_for_each_entry(list_itr, rule_head, list_entry) { - u8 *existing_mac = &list_itr->fltr_info.l_data.mac.mac_addr[0]; - - if (ether_addr_equal(existing_mac, umac)) { - mutex_unlock(rule_lock); - return true; - } - } - - mutex_unlock(rule_lock); - - return false; -} - -/** * ice_set_vf_mac * @netdev: network interface device structure * @vf_id: VF identifier @@ -1376,13 +1343,6 @@ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) if (ret) goto out_put_vf; - if (ice_unicast_mac_exists(pf, mac)) { - netdev_err(netdev, "Unicast MAC %pM already exists on this PF. Preventing setting VF %u unicast MAC address to %pM\n", - mac, vf_id, mac); - ret = -EINVAL; - goto out_put_vf; - } - mutex_lock(&vf->cfg_lock); /* VF is notified of its new MAC via the PF's response to the diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 3f8b7274ed2f..836dce840712 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1751,11 +1751,13 @@ int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off) protocol = vlan_get_protocol(skb); - if (eth_p_mpls(protocol)) + if (eth_p_mpls(protocol)) { ip.hdr = skb_inner_network_header(skb); - else + l4.hdr = skb_checksum_start(skb); + } else { ip.hdr = skb_network_header(skb); - l4.hdr = skb_checksum_start(skb); + l4.hdr = skb_transport_header(skb); + } /* compute outer L2 header size */ l2_len = ip.hdr - skb->data; diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c index 4547bc1f7cee..24188ec594d5 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c @@ -2948,7 +2948,8 @@ ice_vc_validate_add_vlan_filter_list(struct ice_vsi *vsi, struct virtchnl_vlan_filtering_caps *vfc, struct virtchnl_vlan_filter_list_v2 *vfl) { - u16 num_requested_filters = vsi->num_vlan + vfl->num_elements; + u16 num_requested_filters = ice_vsi_num_non_zero_vlans(vsi) + + vfl->num_elements; if (num_requested_filters > vfc->max_filters) return false; diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index ae17af44fe02..a5ebee7df4a8 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6171,6 +6171,9 @@ u32 igc_rd32(struct igc_hw *hw, u32 reg) u8 __iomem *hw_addr = READ_ONCE(hw->hw_addr); u32 value = 0; + if (IGC_REMOVED(hw_addr)) + return ~value; + value = readl(&hw_addr[reg]); /* reads should not return all F's */ diff --git a/drivers/net/ethernet/intel/igc/igc_regs.h b/drivers/net/ethernet/intel/igc/igc_regs.h index e197a33d93a0..026c3b65fc37 100644 --- a/drivers/net/ethernet/intel/igc/igc_regs.h +++ b/drivers/net/ethernet/intel/igc/igc_regs.h @@ -306,7 +306,8 @@ u32 igc_rd32(struct igc_hw *hw, u32 reg); #define wr32(reg, val) \ do { \ u8 __iomem *hw_addr = READ_ONCE((hw)->hw_addr); \ - writel((val), &hw_addr[(reg)]); \ + if (!IGC_REMOVED(hw_addr)) \ + writel((val), &hw_addr[(reg)]); \ } while (0) #define rd32(reg) (igc_rd32(hw, reg)) @@ -318,4 +319,6 @@ do { \ #define array_rd32(reg, offset) (igc_rd32(hw, (reg) + ((offset) << 2))) +#define IGC_REMOVED(h) unlikely(!(h)) + #endif diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 921a4d977d65..8813b4dd6872 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -779,6 +779,7 @@ struct ixgbe_adapter { #ifdef CONFIG_IXGBE_IPSEC struct ixgbe_ipsec *ipsec; #endif /* CONFIG_IXGBE_IPSEC */ + spinlock_t vfs_lock; }; static inline int ixgbe_determine_xdp_q_idx(int cpu) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 77c2e70b0860..55f91c9ff047 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6403,6 +6403,9 @@ static int ixgbe_sw_init(struct ixgbe_adapter *adapter, /* n-tuple support exists, always init our spinlock */ spin_lock_init(&adapter->fdir_perfect_lock); + /* init spinlock to avoid concurrency of VF resources */ + spin_lock_init(&adapter->vfs_lock); + #ifdef CONFIG_IXGBE_DCB ixgbe_init_dcb(adapter); #endif diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c index d4e63f0644c3..a1e69c734863 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c @@ -205,10 +205,13 @@ void ixgbe_enable_sriov(struct ixgbe_adapter *adapter, unsigned int max_vfs) int ixgbe_disable_sriov(struct ixgbe_adapter *adapter) { unsigned int num_vfs = adapter->num_vfs, vf; + unsigned long flags; int rss; + spin_lock_irqsave(&adapter->vfs_lock, flags); /* set num VFs to 0 to prevent access to vfinfo */ adapter->num_vfs = 0; + spin_unlock_irqrestore(&adapter->vfs_lock, flags); /* put the reference to all of the vf devices */ for (vf = 0; vf < num_vfs; ++vf) { @@ -1355,8 +1358,10 @@ static void ixgbe_rcv_ack_from_vf(struct ixgbe_adapter *adapter, u32 vf) void ixgbe_msg_task(struct ixgbe_adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; + unsigned long flags; u32 vf; + spin_lock_irqsave(&adapter->vfs_lock, flags); for (vf = 0; vf < adapter->num_vfs; vf++) { /* process any reset requests */ if (!ixgbe_check_for_rst(hw, vf)) @@ -1370,6 +1375,7 @@ void ixgbe_msg_task(struct ixgbe_adapter *adapter) if (!ixgbe_check_for_ack(hw, vf)) ixgbe_rcv_ack_from_vf(adapter, vf); } + spin_unlock_irqrestore(&adapter->vfs_lock, flags); } static inline void ixgbe_ping_vf(struct ixgbe_adapter *adapter, int vf) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c index 28b19945d716..e64318c110fd 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c @@ -28,6 +28,9 @@ #define MAX_RATE_EXPONENT 0x0FULL #define MAX_RATE_MANTISSA 0xFFULL +#define CN10K_MAX_BURST_MANTISSA 0x7FFFULL +#define CN10K_MAX_BURST_SIZE 8453888ULL + /* Bitfields in NIX_TLX_PIR register */ #define TLX_RATE_MANTISSA GENMASK_ULL(8, 1) #define TLX_RATE_EXPONENT GENMASK_ULL(12, 9) @@ -35,6 +38,9 @@ #define TLX_BURST_MANTISSA GENMASK_ULL(36, 29) #define TLX_BURST_EXPONENT GENMASK_ULL(40, 37) +#define CN10K_TLX_BURST_MANTISSA GENMASK_ULL(43, 29) +#define CN10K_TLX_BURST_EXPONENT GENMASK_ULL(47, 44) + struct otx2_tc_flow_stats { u64 bytes; u64 pkts; @@ -77,33 +83,42 @@ int otx2_tc_alloc_ent_bitmap(struct otx2_nic *nic) } EXPORT_SYMBOL(otx2_tc_alloc_ent_bitmap); -static void otx2_get_egress_burst_cfg(u32 burst, u32 *burst_exp, - u32 *burst_mantissa) +static void otx2_get_egress_burst_cfg(struct otx2_nic *nic, u32 burst, + u32 *burst_exp, u32 *burst_mantissa) { + int max_burst, max_mantissa; unsigned int tmp; + if (is_dev_otx2(nic->pdev)) { + max_burst = MAX_BURST_SIZE; + max_mantissa = MAX_BURST_MANTISSA; + } else { + max_burst = CN10K_MAX_BURST_SIZE; + max_mantissa = CN10K_MAX_BURST_MANTISSA; + } + /* Burst is calculated as * ((256 + BURST_MANTISSA) << (1 + BURST_EXPONENT)) / 256 * Max supported burst size is 130,816 bytes. */ - burst = min_t(u32, burst, MAX_BURST_SIZE); + burst = min_t(u32, burst, max_burst); if (burst) { *burst_exp = ilog2(burst) ? ilog2(burst) - 1 : 0; tmp = burst - rounddown_pow_of_two(burst); - if (burst < MAX_BURST_MANTISSA) + if (burst < max_mantissa) *burst_mantissa = tmp * 2; else *burst_mantissa = tmp / (1ULL << (*burst_exp - 7)); } else { *burst_exp = MAX_BURST_EXPONENT; - *burst_mantissa = MAX_BURST_MANTISSA; + *burst_mantissa = max_mantissa; } } -static void otx2_get_egress_rate_cfg(u32 maxrate, u32 *exp, +static void otx2_get_egress_rate_cfg(u64 maxrate, u32 *exp, u32 *mantissa, u32 *div_exp) { - unsigned int tmp; + u64 tmp; /* Rate calculation by hardware * @@ -132,21 +147,44 @@ static void otx2_get_egress_rate_cfg(u32 maxrate, u32 *exp, } } -static int otx2_set_matchall_egress_rate(struct otx2_nic *nic, u32 burst, u32 maxrate) +static u64 otx2_get_txschq_rate_regval(struct otx2_nic *nic, + u64 maxrate, u32 burst) { - struct otx2_hw *hw = &nic->hw; - struct nix_txschq_config *req; u32 burst_exp, burst_mantissa; u32 exp, mantissa, div_exp; + u64 regval = 0; + + /* Get exponent and mantissa values from the desired rate */ + otx2_get_egress_burst_cfg(nic, burst, &burst_exp, &burst_mantissa); + otx2_get_egress_rate_cfg(maxrate, &exp, &mantissa, &div_exp); + + if (is_dev_otx2(nic->pdev)) { + regval = FIELD_PREP(TLX_BURST_EXPONENT, (u64)burst_exp) | + FIELD_PREP(TLX_BURST_MANTISSA, (u64)burst_mantissa) | + FIELD_PREP(TLX_RATE_DIVIDER_EXPONENT, div_exp) | + FIELD_PREP(TLX_RATE_EXPONENT, exp) | + FIELD_PREP(TLX_RATE_MANTISSA, mantissa) | BIT_ULL(0); + } else { + regval = FIELD_PREP(CN10K_TLX_BURST_EXPONENT, (u64)burst_exp) | + FIELD_PREP(CN10K_TLX_BURST_MANTISSA, (u64)burst_mantissa) | + FIELD_PREP(TLX_RATE_DIVIDER_EXPONENT, div_exp) | + FIELD_PREP(TLX_RATE_EXPONENT, exp) | + FIELD_PREP(TLX_RATE_MANTISSA, mantissa) | BIT_ULL(0); + } + + return regval; +} + +static int otx2_set_matchall_egress_rate(struct otx2_nic *nic, + u32 burst, u64 maxrate) +{ + struct otx2_hw *hw = &nic->hw; + struct nix_txschq_config *req; int txschq, err; /* All SQs share the same TL4, so pick the first scheduler */ txschq = hw->txschq_list[NIX_TXSCH_LVL_TL4][0]; - /* Get exponent and mantissa values from the desired rate */ - otx2_get_egress_burst_cfg(burst, &burst_exp, &burst_mantissa); - otx2_get_egress_rate_cfg(maxrate, &exp, &mantissa, &div_exp); - mutex_lock(&nic->mbox.lock); req = otx2_mbox_alloc_msg_nix_txschq_cfg(&nic->mbox); if (!req) { @@ -157,11 +195,7 @@ static int otx2_set_matchall_egress_rate(struct otx2_nic *nic, u32 burst, u32 ma req->lvl = NIX_TXSCH_LVL_TL4; req->num_regs = 1; req->reg[0] = NIX_AF_TL4X_PIR(txschq); - req->regval[0] = FIELD_PREP(TLX_BURST_EXPONENT, burst_exp) | - FIELD_PREP(TLX_BURST_MANTISSA, burst_mantissa) | - FIELD_PREP(TLX_RATE_DIVIDER_EXPONENT, div_exp) | - FIELD_PREP(TLX_RATE_EXPONENT, exp) | - FIELD_PREP(TLX_RATE_MANTISSA, mantissa) | BIT_ULL(0); + req->regval[0] = otx2_get_txschq_rate_regval(nic, maxrate, burst); err = otx2_sync_mbox_msg(&nic->mbox); mutex_unlock(&nic->mbox.lock); @@ -230,7 +264,7 @@ static int otx2_tc_egress_matchall_install(struct otx2_nic *nic, struct netlink_ext_ack *extack = cls->common.extack; struct flow_action *actions = &cls->rule->action; struct flow_action_entry *entry; - u32 rate; + u64 rate; int err; err = otx2_tc_validate_flow(nic, actions, extack); @@ -256,7 +290,7 @@ static int otx2_tc_egress_matchall_install(struct otx2_nic *nic, } /* Convert bytes per second to Mbps */ rate = entry->police.rate_bytes_ps * 8; - rate = max_t(u32, rate / 1000000, 1); + rate = max_t(u64, rate / 1000000, 1); err = otx2_set_matchall_egress_rate(nic, entry->police.burst, rate); if (err) return err; @@ -614,21 +648,27 @@ static int otx2_tc_prepare_flow(struct otx2_nic *nic, struct otx2_tc_flow *node, flow_spec->dport = match.key->dst; flow_mask->dport = match.mask->dst; - if (ip_proto == IPPROTO_UDP) - req->features |= BIT_ULL(NPC_DPORT_UDP); - else if (ip_proto == IPPROTO_TCP) - req->features |= BIT_ULL(NPC_DPORT_TCP); - else if (ip_proto == IPPROTO_SCTP) - req->features |= BIT_ULL(NPC_DPORT_SCTP); + + if (flow_mask->dport) { + if (ip_proto == IPPROTO_UDP) + req->features |= BIT_ULL(NPC_DPORT_UDP); + else if (ip_proto == IPPROTO_TCP) + req->features |= BIT_ULL(NPC_DPORT_TCP); + else if (ip_proto == IPPROTO_SCTP) + req->features |= BIT_ULL(NPC_DPORT_SCTP); + } flow_spec->sport = match.key->src; flow_mask->sport = match.mask->src; - if (ip_proto == IPPROTO_UDP) - req->features |= BIT_ULL(NPC_SPORT_UDP); - else if (ip_proto == IPPROTO_TCP) - req->features |= BIT_ULL(NPC_SPORT_TCP); - else if (ip_proto == IPPROTO_SCTP) - req->features |= BIT_ULL(NPC_SPORT_SCTP); + + if (flow_mask->sport) { + if (ip_proto == IPPROTO_UDP) + req->features |= BIT_ULL(NPC_SPORT_UDP); + else if (ip_proto == IPPROTO_TCP) + req->features |= BIT_ULL(NPC_SPORT_TCP); + else if (ip_proto == IPPROTO_SCTP) + req->features |= BIT_ULL(NPC_SPORT_SCTP); + } } return otx2_tc_parse_actions(nic, &rule->action, req, f, node); diff --git a/drivers/net/ethernet/marvell/prestera/prestera_flower.c b/drivers/net/ethernet/marvell/prestera/prestera_flower.c index d43e503c644f..4d93ad6a284c 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_flower.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_flower.c @@ -167,12 +167,12 @@ static int prestera_flower_parse_meta(struct prestera_acl_rule *rule, } port = netdev_priv(ingress_dev); - mask = htons(0x1FFF); - key = htons(port->hw_id); + mask = htons(0x1FFF << 3); + key = htons(port->hw_id << 3); rule_match_set(r_match->key, SYS_PORT, key); rule_match_set(r_match->mask, SYS_PORT, mask); - mask = htons(0x1FF); + mask = htons(0x3FF); key = htons(port->dev_id); rule_match_set(r_match->key, SYS_DEV, key); rule_match_set(r_match->mask, SYS_DEV, mask); diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c index 90e7dfd011c9..5d457bc9acc1 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c @@ -93,6 +93,9 @@ mtk_flow_get_wdma_info(struct net_device *dev, const u8 *addr, struct mtk_wdma_i }; struct net_device_path path = {}; + if (!ctx.dev) + return -ENODEV; + memcpy(ctx.daddr, addr, sizeof(ctx.daddr)); if (!IS_ENABLED(CONFIG_NET_MEDIATEK_SOC_WED)) diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index 8f0cd3196aac..29be2fcafea3 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -651,7 +651,7 @@ mtk_wed_tx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs) * WDMA RX. */ - BUG_ON(idx > ARRAY_SIZE(dev->tx_ring)); + BUG_ON(idx >= ARRAY_SIZE(dev->tx_ring)); if (mtk_wed_ring_alloc(dev, ring, MTK_WED_TX_RING_SIZE)) return -ENOMEM; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 0d8a0068e4ca..ce33dbde124d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -5384,7 +5384,7 @@ static bool mlxsw_sp_fi_is_gateway(const struct mlxsw_sp *mlxsw_sp, { const struct fib_nh *nh = fib_info_nh(fi, 0); - return nh->fib_nh_scope == RT_SCOPE_LINK || + return nh->fib_nh_gw_family || mlxsw_sp_nexthop4_ipip_type(mlxsw_sp, nh, NULL); } @@ -10324,7 +10324,7 @@ static void mlxsw_sp_mp4_hash_init(struct mlxsw_sp *mlxsw_sp, unsigned long *fields = config->fields; u32 hash_fields; - switch (net->ipv4.sysctl_fib_multipath_hash_policy) { + switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) { case 0: mlxsw_sp_mp4_hash_outer_addr(config); break; @@ -10342,7 +10342,7 @@ static void mlxsw_sp_mp4_hash_init(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_mp_hash_inner_l3(config); break; case 3: - hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); /* Outer */ MLXSW_SP_MP_HASH_HEADER_SET(headers, IPV4_EN_NOT_TCP_NOT_UDP); MLXSW_SP_MP_HASH_HEADER_SET(headers, IPV4_EN_TCP_UDP); @@ -10523,13 +10523,14 @@ static int mlxsw_sp_dscp_init(struct mlxsw_sp *mlxsw_sp) static int __mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp) { struct net *net = mlxsw_sp_net(mlxsw_sp); - bool usp = net->ipv4.sysctl_ip_fwd_update_priority; char rgcr_pl[MLXSW_REG_RGCR_LEN]; u64 max_rifs; + bool usp; if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_RIFS)) return -EIO; max_rifs = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); + usp = READ_ONCE(net->ipv4.sysctl_ip_fwd_update_priority); mlxsw_reg_rgcr_pack(rgcr_pl, true, true); mlxsw_reg_rgcr_max_router_interfaces_set(rgcr_pl, max_rifs); diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c b/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c index 005e56ea5da1..5893770bfd94 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c @@ -75,6 +75,9 @@ static int __lan966x_mac_learn(struct lan966x *lan966x, int pgid, unsigned int vid, enum macaccess_entry_type type) { + int ret; + + spin_lock(&lan966x->mac_lock); lan966x_mac_select(lan966x, mac, vid); /* Issue a write command */ @@ -86,7 +89,10 @@ static int __lan966x_mac_learn(struct lan966x *lan966x, int pgid, ANA_MACACCESS_MAC_TABLE_CMD_SET(MACACCESS_CMD_LEARN), lan966x, ANA_MACACCESS); - return lan966x_mac_wait_for_completion(lan966x); + ret = lan966x_mac_wait_for_completion(lan966x); + spin_unlock(&lan966x->mac_lock); + + return ret; } /* The mask of the front ports is encoded inside the mac parameter via a call @@ -113,11 +119,13 @@ int lan966x_mac_learn(struct lan966x *lan966x, int port, return __lan966x_mac_learn(lan966x, port, false, mac, vid, type); } -int lan966x_mac_forget(struct lan966x *lan966x, - const unsigned char mac[ETH_ALEN], - unsigned int vid, - enum macaccess_entry_type type) +static int lan966x_mac_forget_locked(struct lan966x *lan966x, + const unsigned char mac[ETH_ALEN], + unsigned int vid, + enum macaccess_entry_type type) { + lockdep_assert_held(&lan966x->mac_lock); + lan966x_mac_select(lan966x, mac, vid); /* Issue a forget command */ @@ -128,6 +136,20 @@ int lan966x_mac_forget(struct lan966x *lan966x, return lan966x_mac_wait_for_completion(lan966x); } +int lan966x_mac_forget(struct lan966x *lan966x, + const unsigned char mac[ETH_ALEN], + unsigned int vid, + enum macaccess_entry_type type) +{ + int ret; + + spin_lock(&lan966x->mac_lock); + ret = lan966x_mac_forget_locked(lan966x, mac, vid, type); + spin_unlock(&lan966x->mac_lock); + + return ret; +} + int lan966x_mac_cpu_learn(struct lan966x *lan966x, const char *addr, u16 vid) { return lan966x_mac_learn(lan966x, PGID_CPU, addr, vid, ENTRYTYPE_LOCKED); @@ -161,7 +183,7 @@ static struct lan966x_mac_entry *lan966x_mac_alloc_entry(const unsigned char *ma { struct lan966x_mac_entry *mac_entry; - mac_entry = kzalloc(sizeof(*mac_entry), GFP_KERNEL); + mac_entry = kzalloc(sizeof(*mac_entry), GFP_ATOMIC); if (!mac_entry) return NULL; @@ -179,7 +201,6 @@ static struct lan966x_mac_entry *lan966x_mac_find_entry(struct lan966x *lan966x, struct lan966x_mac_entry *res = NULL; struct lan966x_mac_entry *mac_entry; - spin_lock(&lan966x->mac_lock); list_for_each_entry(mac_entry, &lan966x->mac_entries, list) { if (mac_entry->vid == vid && ether_addr_equal(mac, mac_entry->mac) && @@ -188,7 +209,6 @@ static struct lan966x_mac_entry *lan966x_mac_find_entry(struct lan966x *lan966x, break; } } - spin_unlock(&lan966x->mac_lock); return res; } @@ -231,8 +251,11 @@ int lan966x_mac_add_entry(struct lan966x *lan966x, struct lan966x_port *port, { struct lan966x_mac_entry *mac_entry; - if (lan966x_mac_lookup(lan966x, addr, vid, ENTRYTYPE_NORMAL)) + spin_lock(&lan966x->mac_lock); + if (lan966x_mac_lookup(lan966x, addr, vid, ENTRYTYPE_NORMAL)) { + spin_unlock(&lan966x->mac_lock); return 0; + } /* In case the entry already exists, don't add it again to SW, * just update HW, but we need to look in the actual HW because @@ -241,21 +264,25 @@ int lan966x_mac_add_entry(struct lan966x *lan966x, struct lan966x_port *port, * add the entry but without the extern_learn flag. */ mac_entry = lan966x_mac_find_entry(lan966x, addr, vid, port->chip_port); - if (mac_entry) - return lan966x_mac_learn(lan966x, port->chip_port, - addr, vid, ENTRYTYPE_LOCKED); + if (mac_entry) { + spin_unlock(&lan966x->mac_lock); + goto mac_learn; + } mac_entry = lan966x_mac_alloc_entry(addr, vid, port->chip_port); - if (!mac_entry) + if (!mac_entry) { + spin_unlock(&lan966x->mac_lock); return -ENOMEM; + } - spin_lock(&lan966x->mac_lock); list_add_tail(&mac_entry->list, &lan966x->mac_entries); spin_unlock(&lan966x->mac_lock); - lan966x_mac_learn(lan966x, port->chip_port, addr, vid, ENTRYTYPE_LOCKED); lan966x_fdb_call_notifiers(SWITCHDEV_FDB_OFFLOADED, addr, vid, port->dev); +mac_learn: + lan966x_mac_learn(lan966x, port->chip_port, addr, vid, ENTRYTYPE_LOCKED); + return 0; } @@ -269,8 +296,9 @@ int lan966x_mac_del_entry(struct lan966x *lan966x, const unsigned char *addr, list) { if (mac_entry->vid == vid && ether_addr_equal(addr, mac_entry->mac)) { - lan966x_mac_forget(lan966x, mac_entry->mac, mac_entry->vid, - ENTRYTYPE_LOCKED); + lan966x_mac_forget_locked(lan966x, mac_entry->mac, + mac_entry->vid, + ENTRYTYPE_LOCKED); list_del(&mac_entry->list); kfree(mac_entry); @@ -288,8 +316,8 @@ void lan966x_mac_purge_entries(struct lan966x *lan966x) spin_lock(&lan966x->mac_lock); list_for_each_entry_safe(mac_entry, tmp, &lan966x->mac_entries, list) { - lan966x_mac_forget(lan966x, mac_entry->mac, mac_entry->vid, - ENTRYTYPE_LOCKED); + lan966x_mac_forget_locked(lan966x, mac_entry->mac, + mac_entry->vid, ENTRYTYPE_LOCKED); list_del(&mac_entry->list); kfree(mac_entry); @@ -325,10 +353,13 @@ static void lan966x_mac_irq_process(struct lan966x *lan966x, u32 row, { struct lan966x_mac_entry *mac_entry, *tmp; unsigned char mac[ETH_ALEN] __aligned(2); + struct list_head mac_deleted_entries; u32 dest_idx; u32 column; u16 vid; + INIT_LIST_HEAD(&mac_deleted_entries); + spin_lock(&lan966x->mac_lock); list_for_each_entry_safe(mac_entry, tmp, &lan966x->mac_entries, list) { bool found = false; @@ -362,20 +393,26 @@ static void lan966x_mac_irq_process(struct lan966x *lan966x, u32 row, } if (!found) { - /* Notify the bridge that the entry doesn't exist - * anymore in the HW and remove the entry from the SW - * list - */ - lan966x_mac_notifiers(SWITCHDEV_FDB_DEL_TO_BRIDGE, - mac_entry->mac, mac_entry->vid, - lan966x->ports[mac_entry->port_index]->dev); - list_del(&mac_entry->list); - kfree(mac_entry); + /* Move the entry from SW list to a tmp list such that + * it would be deleted later + */ + list_add_tail(&mac_entry->list, &mac_deleted_entries); } } spin_unlock(&lan966x->mac_lock); + list_for_each_entry_safe(mac_entry, tmp, &mac_deleted_entries, list) { + /* Notify the bridge that the entry doesn't exist + * anymore in the HW + */ + lan966x_mac_notifiers(SWITCHDEV_FDB_DEL_TO_BRIDGE, + mac_entry->mac, mac_entry->vid, + lan966x->ports[mac_entry->port_index]->dev); + list_del(&mac_entry->list); + kfree(mac_entry); + } + /* Now go to the list of columns and see if any entry was not in the SW * list, then that means that the entry is new so it needs to notify the * bridge. @@ -396,13 +433,20 @@ static void lan966x_mac_irq_process(struct lan966x *lan966x, u32 row, if (WARN_ON(dest_idx >= lan966x->num_phys_ports)) continue; + spin_lock(&lan966x->mac_lock); + mac_entry = lan966x_mac_find_entry(lan966x, mac, vid, dest_idx); + if (mac_entry) { + spin_unlock(&lan966x->mac_lock); + continue; + } + mac_entry = lan966x_mac_alloc_entry(mac, vid, dest_idx); - if (!mac_entry) + if (!mac_entry) { + spin_unlock(&lan966x->mac_lock); return; + } mac_entry->row = row; - - spin_lock(&lan966x->mac_lock); list_add_tail(&mac_entry->list, &lan966x->mac_entries); spin_unlock(&lan966x->mac_lock); @@ -424,6 +468,7 @@ irqreturn_t lan966x_mac_irq_handler(struct lan966x *lan966x) lan966x, ANA_MACTINDX); while (1) { + spin_lock(&lan966x->mac_lock); lan_rmw(ANA_MACACCESS_MAC_TABLE_CMD_SET(MACACCESS_CMD_SYNC_GET_NEXT), ANA_MACACCESS_MAC_TABLE_CMD, lan966x, ANA_MACACCESS); @@ -447,12 +492,15 @@ irqreturn_t lan966x_mac_irq_handler(struct lan966x *lan966x) stop = false; if (column == LAN966X_MAC_COLUMNS - 1 && - index == 0 && stop) + index == 0 && stop) { + spin_unlock(&lan966x->mac_lock); break; + } entry[column].mach = lan_rd(lan966x, ANA_MACHDATA); entry[column].macl = lan_rd(lan966x, ANA_MACLDATA); entry[column].maca = lan_rd(lan966x, ANA_MACACCESS); + spin_unlock(&lan966x->mac_lock); /* Once all the columns are read process them */ if (column == LAN966X_MAC_COLUMNS - 1) { diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index e31f8fbbc696..df2ab5cbd49b 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -4233,7 +4233,7 @@ static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog) } /* If the chain is ended by an load/store pair then this - * could serve as the new head of the the next chain. + * could serve as the new head of the next chain. */ if (curr_pair_is_memcpy(meta1, meta2)) { head_ld_meta = meta1; diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c index 0147de405365..ffb6f6d05a07 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/action.c +++ b/drivers/net/ethernet/netronome/nfp/flower/action.c @@ -474,7 +474,7 @@ nfp_fl_set_tun(struct nfp_app *app, struct nfp_fl_set_tun *set_tun, set_tun->ttl = ip4_dst_hoplimit(&rt->dst); ip_rt_put(rt); } else { - set_tun->ttl = net->ipv4.sysctl_ip_default_ttl; + set_tun->ttl = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); } } diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c index 4625f85acab2..10ad0b93d283 100644 --- a/drivers/net/ethernet/sfc/ptp.c +++ b/drivers/net/ethernet/sfc/ptp.c @@ -1100,7 +1100,29 @@ static void efx_ptp_xmit_skb_queue(struct efx_nic *efx, struct sk_buff *skb) tx_queue = efx_channel_get_tx_queue(ptp_data->channel, type); if (tx_queue && tx_queue->timestamping) { + /* This code invokes normal driver TX code which is always + * protected from softirqs when called from generic TX code, + * which in turn disables preemption. Look at __dev_queue_xmit + * which uses rcu_read_lock_bh disabling preemption for RCU + * plus disabling softirqs. We do not need RCU reader + * protection here. + * + * Although it is theoretically safe for current PTP TX/RX code + * running without disabling softirqs, there are three good + * reasond for doing so: + * + * 1) The code invoked is mainly implemented for non-PTP + * packets and it is always executed with softirqs + * disabled. + * 2) This being a single PTP packet, better to not + * interrupt its processing by softirqs which can lead + * to high latencies. + * 3) netdev_xmit_more checks preemption is disabled and + * triggers a BUG_ON if not. + */ + local_bh_disable(); efx_enqueue_skb(tx_queue, skb); + local_bh_enable(); } else { WARN_ONCE(1, "PTP channel has no timestamped tx queue\n"); dev_kfree_skb_any(skb); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 38fe77d1035e..3fe720c5dc9f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -298,6 +298,11 @@ static void get_arttime(struct mii_bus *mii, int intel_adhoc_addr, *art_time = ns; } +static int stmmac_cross_ts_isr(struct stmmac_priv *priv) +{ + return (readl(priv->ioaddr + GMAC_INT_STATUS) & GMAC_INT_TSIE); +} + static int intel_crosststamp(ktime_t *device, struct system_counterval_t *system, void *ctx) @@ -313,8 +318,6 @@ static int intel_crosststamp(ktime_t *device, u32 num_snapshot; u32 gpio_value; u32 acr_value; - int ret; - u32 v; int i; if (!boot_cpu_has(X86_FEATURE_ART)) @@ -328,6 +331,8 @@ static int intel_crosststamp(ktime_t *device, if (priv->plat->ext_snapshot_en) return -EBUSY; + priv->plat->int_snapshot_en = 1; + mutex_lock(&priv->aux_ts_lock); /* Enable Internal snapshot trigger */ acr_value = readl(ptpaddr + PTP_ACR); @@ -347,6 +352,7 @@ static int intel_crosststamp(ktime_t *device, break; default: mutex_unlock(&priv->aux_ts_lock); + priv->plat->int_snapshot_en = 0; return -EINVAL; } writel(acr_value, ptpaddr + PTP_ACR); @@ -368,13 +374,12 @@ static int intel_crosststamp(ktime_t *device, gpio_value |= GMAC_GPO1; writel(gpio_value, ioaddr + GMAC_GPIO_STATUS); - /* Poll for time sync operation done */ - ret = readl_poll_timeout(priv->ioaddr + GMAC_INT_STATUS, v, - (v & GMAC_INT_TSIE), 100, 10000); - - if (ret == -ETIMEDOUT) { - pr_err("%s: Wait for time sync operation timeout\n", __func__); - return ret; + /* Time sync done Indication - Interrupt method */ + if (!wait_event_interruptible_timeout(priv->tstamp_busy_wait, + stmmac_cross_ts_isr(priv), + HZ / 100)) { + priv->plat->int_snapshot_en = 0; + return -ETIMEDOUT; } num_snapshot = (readl(ioaddr + GMAC_TIMESTAMP_STATUS) & @@ -392,6 +397,7 @@ static int intel_crosststamp(ktime_t *device, } system->cycles *= intel_priv->crossts_adj; + priv->plat->int_snapshot_en = 0; return 0; } @@ -576,6 +582,7 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, plat->has_crossts = true; plat->crosststamp = intel_crosststamp; + plat->int_snapshot_en = 0; /* Setup MSI vector offset specific to Intel mGbE controller */ plat->msi_mac_vec = 29; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c index 6ff88df58767..d42e1afb6521 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c @@ -576,32 +576,7 @@ static int mediatek_dwmac_init(struct platform_device *pdev, void *priv) } } - ret = clk_bulk_prepare_enable(variant->num_clks, plat->clks); - if (ret) { - dev_err(plat->dev, "failed to enable clks, err = %d\n", ret); - return ret; - } - - ret = clk_prepare_enable(plat->rmii_internal_clk); - if (ret) { - dev_err(plat->dev, "failed to enable rmii internal clk, err = %d\n", ret); - goto err_clk; - } - return 0; - -err_clk: - clk_bulk_disable_unprepare(variant->num_clks, plat->clks); - return ret; -} - -static void mediatek_dwmac_exit(struct platform_device *pdev, void *priv) -{ - struct mediatek_dwmac_plat_data *plat = priv; - const struct mediatek_dwmac_variant *variant = plat->variant; - - clk_disable_unprepare(plat->rmii_internal_clk); - clk_bulk_disable_unprepare(variant->num_clks, plat->clks); } static int mediatek_dwmac_clks_config(void *priv, bool enabled) @@ -643,7 +618,6 @@ static int mediatek_dwmac_common_data(struct platform_device *pdev, plat->addr64 = priv_plat->variant->dma_bit_mask; plat->bsp_priv = priv_plat; plat->init = mediatek_dwmac_init; - plat->exit = mediatek_dwmac_exit; plat->clks_config = mediatek_dwmac_clks_config; if (priv_plat->variant->dwmac_fix_mac_speed) plat->fix_mac_speed = priv_plat->variant->dwmac_fix_mac_speed; @@ -712,13 +686,33 @@ static int mediatek_dwmac_probe(struct platform_device *pdev) mediatek_dwmac_common_data(pdev, plat_dat, priv_plat); mediatek_dwmac_init(pdev, priv_plat); + ret = mediatek_dwmac_clks_config(priv_plat, true); + if (ret) + goto err_remove_config_dt; + ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); - if (ret) { - stmmac_remove_config_dt(pdev, plat_dat); - return ret; - } + if (ret) + goto err_drv_probe; return 0; + +err_drv_probe: + mediatek_dwmac_clks_config(priv_plat, false); +err_remove_config_dt: + stmmac_remove_config_dt(pdev, plat_dat); + + return ret; +} + +static int mediatek_dwmac_remove(struct platform_device *pdev) +{ + struct mediatek_dwmac_plat_data *priv_plat = get_stmmac_bsp_priv(&pdev->dev); + int ret; + + ret = stmmac_pltfr_remove(pdev); + mediatek_dwmac_clks_config(priv_plat, false); + + return ret; } static const struct of_device_id mediatek_dwmac_match[] = { @@ -733,7 +727,7 @@ MODULE_DEVICE_TABLE(of, mediatek_dwmac_match); static struct platform_driver mediatek_dwmac_driver = { .probe = mediatek_dwmac_probe, - .remove = stmmac_pltfr_remove, + .remove = mediatek_dwmac_remove, .driver = { .name = "dwmac-mediatek", .pm = &stmmac_pltfr_pm_ops, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h index 462ca7ed095a..71dad409f78b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h @@ -150,7 +150,8 @@ #define GMAC_PCS_IRQ_DEFAULT (GMAC_INT_RGSMIIS | GMAC_INT_PCS_LINK | \ GMAC_INT_PCS_ANE) -#define GMAC_INT_DEFAULT_ENABLE (GMAC_INT_PMT_EN | GMAC_INT_LPI_EN) +#define GMAC_INT_DEFAULT_ENABLE (GMAC_INT_PMT_EN | GMAC_INT_LPI_EN | \ + GMAC_INT_TSIE) enum dwmac4_irq_status { time_stamp_irq = 0x00001000, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index fd41db65fe1d..d8f1fbc25bdd 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -23,6 +23,7 @@ static void dwmac4_core_init(struct mac_device_info *hw, struct net_device *dev) { + struct stmmac_priv *priv = netdev_priv(dev); void __iomem *ioaddr = hw->pcsr; u32 value = readl(ioaddr + GMAC_CONFIG); @@ -58,6 +59,9 @@ static void dwmac4_core_init(struct mac_device_info *hw, value |= GMAC_INT_FPE_EN; writel(value, ioaddr + GMAC_INT_EN); + + if (GMAC_INT_DEFAULT_ENABLE & GMAC_INT_TSIE) + init_waitqueue_head(&priv->tstamp_busy_wait); } static void dwmac4_rx_queue_enable(struct mac_device_info *hw, @@ -219,6 +223,9 @@ static void dwmac4_map_mtl_dma(struct mac_device_info *hw, u32 queue, u32 chan) if (queue == 0 || queue == 4) { value &= ~MTL_RXQ_DMA_Q04MDMACH_MASK; value |= MTL_RXQ_DMA_Q04MDMACH(chan); + } else if (queue > 4) { + value &= ~MTL_RXQ_DMA_QXMDMACH_MASK(queue - 4); + value |= MTL_RXQ_DMA_QXMDMACH(chan, queue - 4); } else { value &= ~MTL_RXQ_DMA_QXMDMACH_MASK(queue); value |= MTL_RXQ_DMA_QXMDMACH(chan, queue); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index 57970ae2178d..f9e83964aa7e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -266,6 +266,7 @@ struct stmmac_priv { rwlock_t ptp_lock; /* Protects auxiliary snapshot registers from concurrent access. */ struct mutex aux_ts_lock; + wait_queue_head_t tstamp_busy_wait; void __iomem *mmcaddr; void __iomem *ptpaddr; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index abfb3cd5958d..9c3055ee2608 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -803,14 +803,6 @@ static int stmmac_ethtool_op_set_eee(struct net_device *dev, netdev_warn(priv->dev, "Setting EEE tx-lpi is not supported\n"); - if (priv->hw->xpcs) { - ret = xpcs_config_eee(priv->hw->xpcs, - priv->plat->mult_fact_100ns, - edata->eee_enabled); - if (ret) - return ret; - } - if (!edata->eee_enabled) stmmac_disable_eee_mode(priv); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c index 92d32940aff0..764832f4dae1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c @@ -179,6 +179,11 @@ static void timestamp_interrupt(struct stmmac_priv *priv) u64 ptp_time; int i; + if (priv->plat->int_snapshot_en) { + wake_up(&priv->tstamp_busy_wait); + return; + } + tsync_int = readl(priv->ioaddr + GMAC_INT_STATUS) & GMAC_INT_TSIE; if (!tsync_int) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index d1a7cf4567bc..c5f33630e771 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -834,19 +834,10 @@ int stmmac_init_tstamp_counter(struct stmmac_priv *priv, u32 systime_flags) struct timespec64 now; u32 sec_inc = 0; u64 temp = 0; - int ret; if (!(priv->dma_cap.time_stamp || priv->dma_cap.atime_stamp)) return -EOPNOTSUPP; - ret = clk_prepare_enable(priv->plat->clk_ptp_ref); - if (ret < 0) { - netdev_warn(priv->dev, - "failed to enable PTP reference clock: %pe\n", - ERR_PTR(ret)); - return ret; - } - stmmac_config_hw_tstamping(priv, priv->ptpaddr, systime_flags); priv->systime_flags = systime_flags; @@ -3270,6 +3261,14 @@ static int stmmac_hw_setup(struct net_device *dev, bool ptp_register) stmmac_mmc_setup(priv); + if (ptp_register) { + ret = clk_prepare_enable(priv->plat->clk_ptp_ref); + if (ret < 0) + netdev_warn(priv->dev, + "failed to enable PTP reference clock: %pe\n", + ERR_PTR(ret)); + } + ret = stmmac_init_ptp(priv); if (ret == -EOPNOTSUPP) netdev_info(priv->dev, "PTP not supported by HW\n"); @@ -7213,8 +7212,6 @@ int stmmac_dvr_remove(struct device *dev) netdev_info(priv->dev, "%s: removing driver", __func__); pm_runtime_get_sync(dev); - pm_runtime_disable(dev); - pm_runtime_put_noidle(dev); stmmac_stop_all_dma(priv); stmmac_mac_set(priv, priv->ioaddr, false); @@ -7241,6 +7238,9 @@ int stmmac_dvr_remove(struct device *dev) mutex_destroy(&priv->lock); bitmap_free(priv->af_xdp_zc_qps); + pm_runtime_disable(dev); + pm_runtime_put_noidle(dev); + return 0; } EXPORT_SYMBOL_GPL(stmmac_dvr_remove); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 11e1055e8260..9f5cac4000da 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -815,7 +815,13 @@ static int __maybe_unused stmmac_pltfr_noirq_resume(struct device *dev) if (ret) return ret; - stmmac_init_tstamp_counter(priv, priv->systime_flags); + ret = clk_prepare_enable(priv->plat->clk_ptp_ref); + if (ret < 0) { + netdev_warn(priv->dev, + "failed to enable PTP reference clock: %pe\n", + ERR_PTR(ret)); + return ret; + } } return 0; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c index e45fb191d8e6..4d11980dcd64 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c @@ -175,11 +175,10 @@ static int stmmac_enable(struct ptp_clock_info *ptp, struct stmmac_priv *priv = container_of(ptp, struct stmmac_priv, ptp_clock_ops); void __iomem *ptpaddr = priv->ptpaddr; - void __iomem *ioaddr = priv->hw->pcsr; struct stmmac_pps_cfg *cfg; - u32 intr_value, acr_value; int ret = -EOPNOTSUPP; unsigned long flags; + u32 acr_value; switch (rq->type) { case PTP_CLK_REQ_PEROUT: @@ -213,19 +212,10 @@ static int stmmac_enable(struct ptp_clock_info *ptp, netdev_dbg(priv->dev, "Auxiliary Snapshot %d enabled.\n", priv->plat->ext_snapshot_num >> PTP_ACR_ATSEN_SHIFT); - /* Enable Timestamp Interrupt */ - intr_value = readl(ioaddr + GMAC_INT_EN); - intr_value |= GMAC_INT_TSIE; - writel(intr_value, ioaddr + GMAC_INT_EN); - } else { netdev_dbg(priv->dev, "Auxiliary Snapshot %d disabled.\n", priv->plat->ext_snapshot_num >> PTP_ACR_ATSEN_SHIFT); - /* Disable Timestamp Interrupt */ - intr_value = readl(ioaddr + GMAC_INT_EN); - intr_value &= ~GMAC_INT_TSIE; - writel(intr_value, ioaddr + GMAC_INT_EN); } writel(acr_value, ptpaddr + PTP_ACR); mutex_unlock(&priv->aux_ts_lock); diff --git a/drivers/net/ipa/ipa_qmi_msg.h b/drivers/net/ipa/ipa_qmi_msg.h index 3233d145fd87..495e85abe50b 100644 --- a/drivers/net/ipa/ipa_qmi_msg.h +++ b/drivers/net/ipa/ipa_qmi_msg.h @@ -214,7 +214,7 @@ struct ipa_init_modem_driver_req { /* The response to a IPA_QMI_INIT_DRIVER request begins with a standard * QMI response, but contains other information as well. Currently we - * simply wait for the the INIT_DRIVER transaction to complete and + * simply wait for the INIT_DRIVER transaction to complete and * ignore any other data that might be returned. */ struct ipa_init_modem_driver_rsp { diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 817577e713d7..f354fad05714 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -243,6 +243,7 @@ static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb) #define DEFAULT_SEND_SCI true #define DEFAULT_ENCRYPT false #define DEFAULT_ENCODING_SA 0 +#define MACSEC_XPN_MAX_REPLAY_WINDOW (((1 << 30) - 1)) static bool send_sci(const struct macsec_secy *secy) { @@ -1697,7 +1698,7 @@ static bool validate_add_rxsa(struct nlattr **attrs) return false; if (attrs[MACSEC_SA_ATTR_PN] && - *(u64 *)nla_data(attrs[MACSEC_SA_ATTR_PN]) == 0) + nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) return false; if (attrs[MACSEC_SA_ATTR_ACTIVE]) { @@ -1753,7 +1754,8 @@ static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info) } pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; - if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { + if (tb_sa[MACSEC_SA_ATTR_PN] && + nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { pr_notice("macsec: nl: add_rxsa: bad pn length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); rtnl_unlock(); @@ -1769,7 +1771,7 @@ static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info) if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) { pr_notice("macsec: nl: add_rxsa: bad salt length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_SALT]), - MACSEC_SA_ATTR_SALT); + MACSEC_SALT_LEN); rtnl_unlock(); return -EINVAL; } @@ -1842,7 +1844,7 @@ static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info) return 0; cleanup: - kfree(rx_sa); + macsec_rxsa_put(rx_sa); rtnl_unlock(); return err; } @@ -1939,7 +1941,7 @@ static bool validate_add_txsa(struct nlattr **attrs) if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) return false; - if (nla_get_u32(attrs[MACSEC_SA_ATTR_PN]) == 0) + if (nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) return false; if (attrs[MACSEC_SA_ATTR_ACTIVE]) { @@ -2011,7 +2013,7 @@ static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info) if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) { pr_notice("macsec: nl: add_txsa: bad salt length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_SALT]), - MACSEC_SA_ATTR_SALT); + MACSEC_SALT_LEN); rtnl_unlock(); return -EINVAL; } @@ -2085,7 +2087,7 @@ static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info) cleanup: secy->operational = was_operational; - kfree(tx_sa); + macsec_txsa_put(tx_sa); rtnl_unlock(); return err; } @@ -2293,7 +2295,7 @@ static bool validate_upd_sa(struct nlattr **attrs) if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) return false; - if (attrs[MACSEC_SA_ATTR_PN] && nla_get_u32(attrs[MACSEC_SA_ATTR_PN]) == 0) + if (attrs[MACSEC_SA_ATTR_PN] && nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) return false; if (attrs[MACSEC_SA_ATTR_ACTIVE]) { @@ -3745,9 +3747,6 @@ static int macsec_changelink_common(struct net_device *dev, secy->operational = tx_sa && tx_sa->active; } - if (data[IFLA_MACSEC_WINDOW]) - secy->replay_window = nla_get_u32(data[IFLA_MACSEC_WINDOW]); - if (data[IFLA_MACSEC_ENCRYPT]) tx_sc->encrypt = !!nla_get_u8(data[IFLA_MACSEC_ENCRYPT]); @@ -3793,6 +3792,16 @@ static int macsec_changelink_common(struct net_device *dev, } } + if (data[IFLA_MACSEC_WINDOW]) { + secy->replay_window = nla_get_u32(data[IFLA_MACSEC_WINDOW]); + + /* IEEE 802.1AEbw-2013 10.7.8 - maximum replay window + * for XPN cipher suites */ + if (secy->xpn && + secy->replay_window > MACSEC_XPN_MAX_REPLAY_WINDOW) + return -EINVAL; + } + return 0; } @@ -3822,7 +3831,7 @@ static int macsec_changelink(struct net_device *dev, struct nlattr *tb[], ret = macsec_changelink_common(dev, data); if (ret) - return ret; + goto cleanup; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c index 4cfd05c15aee..d25fbb9caeba 100644 --- a/drivers/net/pcs/pcs-xpcs.c +++ b/drivers/net/pcs/pcs-xpcs.c @@ -896,7 +896,7 @@ static int xpcs_get_state_c37_sgmii(struct dw_xpcs *xpcs, */ ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_AN_INTR_STS); if (ret < 0) - return false; + return ret; if (ret & DW_VR_MII_C37_ANSGM_SP_LNKSTS) { int speed_value; diff --git a/drivers/net/sungem_phy.c b/drivers/net/sungem_phy.c index ff22b6b1c686..36803d932dff 100644 --- a/drivers/net/sungem_phy.c +++ b/drivers/net/sungem_phy.c @@ -450,6 +450,7 @@ static int bcm5421_init(struct mii_phy* phy) int can_low_power = 1; if (np == NULL || of_get_property(np, "no-autolowpower", NULL)) can_low_power = 0; + of_node_put(np); if (can_low_power) { /* Enable automatic low-power */ sungem_phy_write(phy, 0x1c, 0x9002); diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index b082819509e1..0f6efaabaa32 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -32,7 +32,7 @@ #define NETNEXT_VERSION "12" /* Information for net */ -#define NET_VERSION "12" +#define NET_VERSION "13" #define DRIVER_VERSION "v1." NETNEXT_VERSION "." NET_VERSION #define DRIVER_AUTHOR "Realtek linux nic maintainers <nic_swsd@realtek.com>" @@ -5917,7 +5917,8 @@ static void r8153_enter_oob(struct r8152 *tp) wait_oob_link_list_ready(tp); - ocp_write_word(tp, MCU_TYPE_PLA, PLA_RMS, mtu_to_size(tp->netdev->mtu)); + ocp_write_word(tp, MCU_TYPE_PLA, PLA_RMS, 1522); + ocp_write_byte(tp, MCU_TYPE_PLA, PLA_MTPS, MTPS_DEFAULT); switch (tp->version) { case RTL_VER_03: @@ -5953,6 +5954,10 @@ static void r8153_enter_oob(struct r8152 *tp) ocp_data |= NOW_IS_OOB | DIS_MCU_CLROOB; ocp_write_byte(tp, MCU_TYPE_PLA, PLA_OOB_CTRL, ocp_data); + ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_SFF_STS_7); + ocp_data |= MCU_BORW_EN; + ocp_write_word(tp, MCU_TYPE_PLA, PLA_SFF_STS_7, ocp_data); + rxdy_gated_en(tp, false); ocp_data = ocp_read_dword(tp, MCU_TYPE_PLA, PLA_RCR); @@ -6555,6 +6560,9 @@ static void rtl8156_down(struct r8152 *tp) rtl_disable(tp); rtl_reset_bmu(tp); + ocp_write_word(tp, MCU_TYPE_PLA, PLA_RMS, 1522); + ocp_write_byte(tp, MCU_TYPE_PLA, PLA_MTPS, MTPS_DEFAULT); + /* Clear teredo wake event. bit[15:8] is the teredo wakeup * type. Set it to zero. bits[7:0] are the W1C bits about * the events. Set them to all 1 to clear them. @@ -6565,6 +6573,10 @@ static void rtl8156_down(struct r8152 *tp) ocp_data |= NOW_IS_OOB; ocp_write_byte(tp, MCU_TYPE_PLA, PLA_OOB_CTRL, ocp_data); + ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_SFF_STS_7); + ocp_data |= MCU_BORW_EN; + ocp_write_word(tp, MCU_TYPE_PLA, PLA_SFF_STS_7, ocp_data); + rtl_rx_vlan_en(tp, true); rxdy_gated_en(tp, false); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 356cf8dd4164..ec8e1b3108c3 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -242,9 +242,15 @@ struct virtnet_info { /* Packet virtio header size */ u8 hdr_len; - /* Work struct for refilling if we run low on memory. */ + /* Work struct for delayed refilling if we run low on memory. */ struct delayed_work refill; + /* Is delayed refill enabled? */ + bool refill_enabled; + + /* The lock to synchronize the access to refill_enabled */ + spinlock_t refill_lock; + /* Work struct for config space updates */ struct work_struct config_work; @@ -348,6 +354,20 @@ static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask) return p; } +static void enable_delayed_refill(struct virtnet_info *vi) +{ + spin_lock_bh(&vi->refill_lock); + vi->refill_enabled = true; + spin_unlock_bh(&vi->refill_lock); +} + +static void disable_delayed_refill(struct virtnet_info *vi) +{ + spin_lock_bh(&vi->refill_lock); + vi->refill_enabled = false; + spin_unlock_bh(&vi->refill_lock); +} + static void virtqueue_napi_schedule(struct napi_struct *napi, struct virtqueue *vq) { @@ -1527,8 +1547,12 @@ static int virtnet_receive(struct receive_queue *rq, int budget, } if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) { - if (!try_fill_recv(vi, rq, GFP_ATOMIC)) - schedule_delayed_work(&vi->refill, 0); + if (!try_fill_recv(vi, rq, GFP_ATOMIC)) { + spin_lock(&vi->refill_lock); + if (vi->refill_enabled) + schedule_delayed_work(&vi->refill, 0); + spin_unlock(&vi->refill_lock); + } } u64_stats_update_begin(&rq->stats.syncp); @@ -1651,6 +1675,8 @@ static int virtnet_open(struct net_device *dev) struct virtnet_info *vi = netdev_priv(dev); int i, err; + enable_delayed_refill(vi); + for (i = 0; i < vi->max_queue_pairs; i++) { if (i < vi->curr_queue_pairs) /* Make sure we have some buffers: if oom use wq. */ @@ -2033,6 +2059,8 @@ static int virtnet_close(struct net_device *dev) struct virtnet_info *vi = netdev_priv(dev); int i; + /* Make sure NAPI doesn't schedule refill work */ + disable_delayed_refill(vi); /* Make sure refill_work doesn't re-enable napi! */ cancel_delayed_work_sync(&vi->refill); @@ -2792,6 +2820,8 @@ static int virtnet_restore_up(struct virtio_device *vdev) virtio_device_ready(vdev); + enable_delayed_refill(vi); + if (netif_running(vi->dev)) { err = virtnet_open(vi->dev); if (err) @@ -3535,6 +3565,7 @@ static int virtnet_probe(struct virtio_device *vdev) vdev->priv = vi; INIT_WORK(&vi->config_work, virtnet_config_changed_work); + spin_lock_init(&vi->refill_lock); /* If we can receive ANY GSO packets, we must allocate large ones. */ if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 58c72d55769a..73d9fcba3b1c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3515,6 +3515,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1e49, 0x0041), /* ZHITAI TiPro7000 NVMe SSD */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, + { PCI_DEVICE(0xc0a9, 0x540a), /* Crucial P2 */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061), .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065), diff --git a/drivers/pinctrl/Kconfig b/drivers/pinctrl/Kconfig index f52960d2dfbe..bff144c97e66 100644 --- a/drivers/pinctrl/Kconfig +++ b/drivers/pinctrl/Kconfig @@ -32,7 +32,7 @@ config DEBUG_PINCTRL Say Y here to add some extra checks and diagnostics to PINCTRL calls. config PINCTRL_AMD - tristate "AMD GPIO pin control" + bool "AMD GPIO pin control" depends on HAS_IOMEM depends on ACPI || COMPILE_TEST select GPIOLIB diff --git a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c index a140b6bfbfaa..bcde042d29dc 100644 --- a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c +++ b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c @@ -102,7 +102,7 @@ struct armada_37xx_pinctrl { struct device *dev; struct gpio_chip gpio_chip; struct irq_chip irq_chip; - spinlock_t irq_lock; + raw_spinlock_t irq_lock; struct pinctrl_desc pctl; struct pinctrl_dev *pctl_dev; struct armada_37xx_pin_group *groups; @@ -523,9 +523,9 @@ static void armada_37xx_irq_ack(struct irq_data *d) unsigned long flags; armada_37xx_irq_update_reg(®, d); - spin_lock_irqsave(&info->irq_lock, flags); + raw_spin_lock_irqsave(&info->irq_lock, flags); writel(d->mask, info->base + reg); - spin_unlock_irqrestore(&info->irq_lock, flags); + raw_spin_unlock_irqrestore(&info->irq_lock, flags); } static void armada_37xx_irq_mask(struct irq_data *d) @@ -536,10 +536,10 @@ static void armada_37xx_irq_mask(struct irq_data *d) unsigned long flags; armada_37xx_irq_update_reg(®, d); - spin_lock_irqsave(&info->irq_lock, flags); + raw_spin_lock_irqsave(&info->irq_lock, flags); val = readl(info->base + reg); writel(val & ~d->mask, info->base + reg); - spin_unlock_irqrestore(&info->irq_lock, flags); + raw_spin_unlock_irqrestore(&info->irq_lock, flags); } static void armada_37xx_irq_unmask(struct irq_data *d) @@ -550,10 +550,10 @@ static void armada_37xx_irq_unmask(struct irq_data *d) unsigned long flags; armada_37xx_irq_update_reg(®, d); - spin_lock_irqsave(&info->irq_lock, flags); + raw_spin_lock_irqsave(&info->irq_lock, flags); val = readl(info->base + reg); writel(val | d->mask, info->base + reg); - spin_unlock_irqrestore(&info->irq_lock, flags); + raw_spin_unlock_irqrestore(&info->irq_lock, flags); } static int armada_37xx_irq_set_wake(struct irq_data *d, unsigned int on) @@ -564,14 +564,14 @@ static int armada_37xx_irq_set_wake(struct irq_data *d, unsigned int on) unsigned long flags; armada_37xx_irq_update_reg(®, d); - spin_lock_irqsave(&info->irq_lock, flags); + raw_spin_lock_irqsave(&info->irq_lock, flags); val = readl(info->base + reg); if (on) val |= (BIT(d->hwirq % GPIO_PER_REG)); else val &= ~(BIT(d->hwirq % GPIO_PER_REG)); writel(val, info->base + reg); - spin_unlock_irqrestore(&info->irq_lock, flags); + raw_spin_unlock_irqrestore(&info->irq_lock, flags); return 0; } @@ -583,7 +583,7 @@ static int armada_37xx_irq_set_type(struct irq_data *d, unsigned int type) u32 val, reg = IRQ_POL; unsigned long flags; - spin_lock_irqsave(&info->irq_lock, flags); + raw_spin_lock_irqsave(&info->irq_lock, flags); armada_37xx_irq_update_reg(®, d); val = readl(info->base + reg); switch (type) { @@ -607,11 +607,11 @@ static int armada_37xx_irq_set_type(struct irq_data *d, unsigned int type) break; } default: - spin_unlock_irqrestore(&info->irq_lock, flags); + raw_spin_unlock_irqrestore(&info->irq_lock, flags); return -EINVAL; } writel(val, info->base + reg); - spin_unlock_irqrestore(&info->irq_lock, flags); + raw_spin_unlock_irqrestore(&info->irq_lock, flags); return 0; } @@ -626,7 +626,7 @@ static int armada_37xx_edge_both_irq_swap_pol(struct armada_37xx_pinctrl *info, regmap_read(info->regmap, INPUT_VAL + 4*reg_idx, &l); - spin_lock_irqsave(&info->irq_lock, flags); + raw_spin_lock_irqsave(&info->irq_lock, flags); p = readl(info->base + IRQ_POL + 4 * reg_idx); if ((p ^ l) & (1 << bit_num)) { /* @@ -647,7 +647,7 @@ static int armada_37xx_edge_both_irq_swap_pol(struct armada_37xx_pinctrl *info, ret = -1; } - spin_unlock_irqrestore(&info->irq_lock, flags); + raw_spin_unlock_irqrestore(&info->irq_lock, flags); return ret; } @@ -664,11 +664,11 @@ static void armada_37xx_irq_handler(struct irq_desc *desc) u32 status; unsigned long flags; - spin_lock_irqsave(&info->irq_lock, flags); + raw_spin_lock_irqsave(&info->irq_lock, flags); status = readl_relaxed(info->base + IRQ_STATUS + 4 * i); /* Manage only the interrupt that was enabled */ status &= readl_relaxed(info->base + IRQ_EN + 4 * i); - spin_unlock_irqrestore(&info->irq_lock, flags); + raw_spin_unlock_irqrestore(&info->irq_lock, flags); while (status) { u32 hwirq = ffs(status) - 1; u32 virq = irq_find_mapping(d, hwirq + @@ -695,12 +695,12 @@ static void armada_37xx_irq_handler(struct irq_desc *desc) update_status: /* Update status in case a new IRQ appears */ - spin_lock_irqsave(&info->irq_lock, flags); + raw_spin_lock_irqsave(&info->irq_lock, flags); status = readl_relaxed(info->base + IRQ_STATUS + 4 * i); /* Manage only the interrupt that was enabled */ status &= readl_relaxed(info->base + IRQ_EN + 4 * i); - spin_unlock_irqrestore(&info->irq_lock, flags); + raw_spin_unlock_irqrestore(&info->irq_lock, flags); } } chained_irq_exit(chip, desc); @@ -731,7 +731,7 @@ static int armada_37xx_irqchip_register(struct platform_device *pdev, struct device *dev = &pdev->dev; unsigned int i, nr_irq_parent; - spin_lock_init(&info->irq_lock); + raw_spin_lock_init(&info->irq_lock); nr_irq_parent = of_irq_count(np); if (!nr_irq_parent) { @@ -1107,25 +1107,40 @@ static const struct of_device_id armada_37xx_pinctrl_of_match[] = { { }, }; +static const struct regmap_config armada_37xx_pinctrl_regmap_config = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, + .use_raw_spinlock = true, +}; + static int __init armada_37xx_pinctrl_probe(struct platform_device *pdev) { struct armada_37xx_pinctrl *info; struct device *dev = &pdev->dev; - struct device_node *np = dev->of_node; struct regmap *regmap; + void __iomem *base; int ret; + base = devm_platform_get_and_ioremap_resource(pdev, 0, NULL); + if (IS_ERR(base)) { + dev_err(dev, "failed to ioremap base address: %pe\n", base); + return PTR_ERR(base); + } + + regmap = devm_regmap_init_mmio(dev, base, + &armada_37xx_pinctrl_regmap_config); + if (IS_ERR(regmap)) { + dev_err(dev, "failed to create regmap: %pe\n", regmap); + return PTR_ERR(regmap); + } + info = devm_kzalloc(dev, sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; info->dev = dev; - - regmap = syscon_node_to_regmap(np); - if (IS_ERR(regmap)) - return dev_err_probe(dev, PTR_ERR(regmap), "cannot get regmap\n"); info->regmap = regmap; - info->data = of_device_get_match_data(dev); ret = armada_37xx_pinctrl_register(pdev, info); diff --git a/drivers/pinctrl/pinctrl-ocelot.c b/drivers/pinctrl/pinctrl-ocelot.c index 5f4a8c5c6650..dfc8ea9f3843 100644 --- a/drivers/pinctrl/pinctrl-ocelot.c +++ b/drivers/pinctrl/pinctrl-ocelot.c @@ -29,19 +29,12 @@ #define ocelot_clrsetbits(addr, clear, set) \ writel((readl(addr) & ~(clear)) | (set), (addr)) -/* PINCONFIG bits (sparx5 only) */ enum { PINCONF_BIAS, PINCONF_SCHMITT, PINCONF_DRIVE_STRENGTH, }; -#define BIAS_PD_BIT BIT(4) -#define BIAS_PU_BIT BIT(3) -#define BIAS_BITS (BIAS_PD_BIT|BIAS_PU_BIT) -#define SCHMITT_BIT BIT(2) -#define DRIVE_BITS GENMASK(1, 0) - /* GPIO standard registers */ #define OCELOT_GPIO_OUT_SET 0x0 #define OCELOT_GPIO_OUT_CLR 0x4 @@ -321,6 +314,13 @@ struct ocelot_pin_caps { unsigned char a_functions[OCELOT_FUNC_PER_PIN]; /* Additional functions */ }; +struct ocelot_pincfg_data { + u8 pd_bit; + u8 pu_bit; + u8 drive_bits; + u8 schmitt_bit; +}; + struct ocelot_pinctrl { struct device *dev; struct pinctrl_dev *pctl; @@ -328,10 +328,16 @@ struct ocelot_pinctrl { struct regmap *map; struct regmap *pincfg; struct pinctrl_desc *desc; + const struct ocelot_pincfg_data *pincfg_data; struct ocelot_pmx_func func[FUNC_MAX]; u8 stride; }; +struct ocelot_match_data { + struct pinctrl_desc desc; + struct ocelot_pincfg_data pincfg_data; +}; + #define LUTON_P(p, f0, f1) \ static struct ocelot_pin_caps luton_pin_##p = { \ .pin = p, \ @@ -1325,24 +1331,27 @@ static int ocelot_hw_get_value(struct ocelot_pinctrl *info, int ret = -EOPNOTSUPP; if (info->pincfg) { + const struct ocelot_pincfg_data *opd = info->pincfg_data; u32 regcfg; - ret = regmap_read(info->pincfg, pin, ®cfg); + ret = regmap_read(info->pincfg, + pin * regmap_get_reg_stride(info->pincfg), + ®cfg); if (ret) return ret; ret = 0; switch (reg) { case PINCONF_BIAS: - *val = regcfg & BIAS_BITS; + *val = regcfg & (opd->pd_bit | opd->pu_bit); break; case PINCONF_SCHMITT: - *val = regcfg & SCHMITT_BIT; + *val = regcfg & opd->schmitt_bit; break; case PINCONF_DRIVE_STRENGTH: - *val = regcfg & DRIVE_BITS; + *val = regcfg & opd->drive_bits; break; default: @@ -1359,14 +1368,18 @@ static int ocelot_pincfg_clrsetbits(struct ocelot_pinctrl *info, u32 regaddr, u32 val; int ret; - ret = regmap_read(info->pincfg, regaddr, &val); + ret = regmap_read(info->pincfg, + regaddr * regmap_get_reg_stride(info->pincfg), + &val); if (ret) return ret; val &= ~clrbits; val |= setbits; - ret = regmap_write(info->pincfg, regaddr, val); + ret = regmap_write(info->pincfg, + regaddr * regmap_get_reg_stride(info->pincfg), + val); return ret; } @@ -1379,23 +1392,27 @@ static int ocelot_hw_set_value(struct ocelot_pinctrl *info, int ret = -EOPNOTSUPP; if (info->pincfg) { + const struct ocelot_pincfg_data *opd = info->pincfg_data; ret = 0; switch (reg) { case PINCONF_BIAS: - ret = ocelot_pincfg_clrsetbits(info, pin, BIAS_BITS, + ret = ocelot_pincfg_clrsetbits(info, pin, + opd->pd_bit | opd->pu_bit, val); break; case PINCONF_SCHMITT: - ret = ocelot_pincfg_clrsetbits(info, pin, SCHMITT_BIT, + ret = ocelot_pincfg_clrsetbits(info, pin, + opd->schmitt_bit, val); break; case PINCONF_DRIVE_STRENGTH: if (val <= 3) ret = ocelot_pincfg_clrsetbits(info, pin, - DRIVE_BITS, val); + opd->drive_bits, + val); else ret = -EINVAL; break; @@ -1425,17 +1442,20 @@ static int ocelot_pinconf_get(struct pinctrl_dev *pctldev, if (param == PIN_CONFIG_BIAS_DISABLE) val = (val == 0); else if (param == PIN_CONFIG_BIAS_PULL_DOWN) - val = (val & BIAS_PD_BIT ? true : false); + val = !!(val & info->pincfg_data->pd_bit); else /* PIN_CONFIG_BIAS_PULL_UP */ - val = (val & BIAS_PU_BIT ? true : false); + val = !!(val & info->pincfg_data->pu_bit); break; case PIN_CONFIG_INPUT_SCHMITT_ENABLE: + if (!info->pincfg_data->schmitt_bit) + return -EOPNOTSUPP; + err = ocelot_hw_get_value(info, pin, PINCONF_SCHMITT, &val); if (err) return err; - val = (val & SCHMITT_BIT ? true : false); + val = !!(val & info->pincfg_data->schmitt_bit); break; case PIN_CONFIG_DRIVE_STRENGTH: @@ -1479,6 +1499,7 @@ static int ocelot_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, unsigned long *configs, unsigned int num_configs) { struct ocelot_pinctrl *info = pinctrl_dev_get_drvdata(pctldev); + const struct ocelot_pincfg_data *opd = info->pincfg_data; u32 param, arg, p; int cfg, err = 0; @@ -1491,8 +1512,8 @@ static int ocelot_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, case PIN_CONFIG_BIAS_PULL_UP: case PIN_CONFIG_BIAS_PULL_DOWN: arg = (param == PIN_CONFIG_BIAS_DISABLE) ? 0 : - (param == PIN_CONFIG_BIAS_PULL_UP) ? BIAS_PU_BIT : - BIAS_PD_BIT; + (param == PIN_CONFIG_BIAS_PULL_UP) ? + opd->pu_bit : opd->pd_bit; err = ocelot_hw_set_value(info, pin, PINCONF_BIAS, arg); if (err) @@ -1501,7 +1522,10 @@ static int ocelot_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, break; case PIN_CONFIG_INPUT_SCHMITT_ENABLE: - arg = arg ? SCHMITT_BIT : 0; + if (!opd->schmitt_bit) + return -EOPNOTSUPP; + + arg = arg ? opd->schmitt_bit : 0; err = ocelot_hw_set_value(info, pin, PINCONF_SCHMITT, arg); if (err) @@ -1562,69 +1586,94 @@ static const struct pinctrl_ops ocelot_pctl_ops = { .dt_free_map = pinconf_generic_dt_free_map, }; -static struct pinctrl_desc luton_desc = { - .name = "luton-pinctrl", - .pins = luton_pins, - .npins = ARRAY_SIZE(luton_pins), - .pctlops = &ocelot_pctl_ops, - .pmxops = &ocelot_pmx_ops, - .owner = THIS_MODULE, +static struct ocelot_match_data luton_desc = { + .desc = { + .name = "luton-pinctrl", + .pins = luton_pins, + .npins = ARRAY_SIZE(luton_pins), + .pctlops = &ocelot_pctl_ops, + .pmxops = &ocelot_pmx_ops, + .owner = THIS_MODULE, + }, }; -static struct pinctrl_desc serval_desc = { - .name = "serval-pinctrl", - .pins = serval_pins, - .npins = ARRAY_SIZE(serval_pins), - .pctlops = &ocelot_pctl_ops, - .pmxops = &ocelot_pmx_ops, - .owner = THIS_MODULE, +static struct ocelot_match_data serval_desc = { + .desc = { + .name = "serval-pinctrl", + .pins = serval_pins, + .npins = ARRAY_SIZE(serval_pins), + .pctlops = &ocelot_pctl_ops, + .pmxops = &ocelot_pmx_ops, + .owner = THIS_MODULE, + }, }; -static struct pinctrl_desc ocelot_desc = { - .name = "ocelot-pinctrl", - .pins = ocelot_pins, - .npins = ARRAY_SIZE(ocelot_pins), - .pctlops = &ocelot_pctl_ops, - .pmxops = &ocelot_pmx_ops, - .owner = THIS_MODULE, +static struct ocelot_match_data ocelot_desc = { + .desc = { + .name = "ocelot-pinctrl", + .pins = ocelot_pins, + .npins = ARRAY_SIZE(ocelot_pins), + .pctlops = &ocelot_pctl_ops, + .pmxops = &ocelot_pmx_ops, + .owner = THIS_MODULE, + }, }; -static struct pinctrl_desc jaguar2_desc = { - .name = "jaguar2-pinctrl", - .pins = jaguar2_pins, - .npins = ARRAY_SIZE(jaguar2_pins), - .pctlops = &ocelot_pctl_ops, - .pmxops = &ocelot_pmx_ops, - .owner = THIS_MODULE, +static struct ocelot_match_data jaguar2_desc = { + .desc = { + .name = "jaguar2-pinctrl", + .pins = jaguar2_pins, + .npins = ARRAY_SIZE(jaguar2_pins), + .pctlops = &ocelot_pctl_ops, + .pmxops = &ocelot_pmx_ops, + .owner = THIS_MODULE, + }, }; -static struct pinctrl_desc servalt_desc = { - .name = "servalt-pinctrl", - .pins = servalt_pins, - .npins = ARRAY_SIZE(servalt_pins), - .pctlops = &ocelot_pctl_ops, - .pmxops = &ocelot_pmx_ops, - .owner = THIS_MODULE, +static struct ocelot_match_data servalt_desc = { + .desc = { + .name = "servalt-pinctrl", + .pins = servalt_pins, + .npins = ARRAY_SIZE(servalt_pins), + .pctlops = &ocelot_pctl_ops, + .pmxops = &ocelot_pmx_ops, + .owner = THIS_MODULE, + }, }; -static struct pinctrl_desc sparx5_desc = { - .name = "sparx5-pinctrl", - .pins = sparx5_pins, - .npins = ARRAY_SIZE(sparx5_pins), - .pctlops = &ocelot_pctl_ops, - .pmxops = &ocelot_pmx_ops, - .confops = &ocelot_confops, - .owner = THIS_MODULE, +static struct ocelot_match_data sparx5_desc = { + .desc = { + .name = "sparx5-pinctrl", + .pins = sparx5_pins, + .npins = ARRAY_SIZE(sparx5_pins), + .pctlops = &ocelot_pctl_ops, + .pmxops = &ocelot_pmx_ops, + .confops = &ocelot_confops, + .owner = THIS_MODULE, + }, + .pincfg_data = { + .pd_bit = BIT(4), + .pu_bit = BIT(3), + .drive_bits = GENMASK(1, 0), + .schmitt_bit = BIT(2), + }, }; -static struct pinctrl_desc lan966x_desc = { - .name = "lan966x-pinctrl", - .pins = lan966x_pins, - .npins = ARRAY_SIZE(lan966x_pins), - .pctlops = &ocelot_pctl_ops, - .pmxops = &lan966x_pmx_ops, - .confops = &ocelot_confops, - .owner = THIS_MODULE, +static struct ocelot_match_data lan966x_desc = { + .desc = { + .name = "lan966x-pinctrl", + .pins = lan966x_pins, + .npins = ARRAY_SIZE(lan966x_pins), + .pctlops = &ocelot_pctl_ops, + .pmxops = &lan966x_pmx_ops, + .confops = &ocelot_confops, + .owner = THIS_MODULE, + }, + .pincfg_data = { + .pd_bit = BIT(3), + .pu_bit = BIT(2), + .drive_bits = GENMASK(1, 0), + }, }; static int ocelot_create_group_func_map(struct device *dev, @@ -1890,7 +1939,8 @@ static const struct of_device_id ocelot_pinctrl_of_match[] = { {}, }; -static struct regmap *ocelot_pinctrl_create_pincfg(struct platform_device *pdev) +static struct regmap *ocelot_pinctrl_create_pincfg(struct platform_device *pdev, + const struct ocelot_pinctrl *info) { void __iomem *base; @@ -1898,7 +1948,7 @@ static struct regmap *ocelot_pinctrl_create_pincfg(struct platform_device *pdev) .reg_bits = 32, .val_bits = 32, .reg_stride = 4, - .max_register = 32, + .max_register = info->desc->npins * 4, .name = "pincfg", }; @@ -1913,6 +1963,7 @@ static struct regmap *ocelot_pinctrl_create_pincfg(struct platform_device *pdev) static int ocelot_pinctrl_probe(struct platform_device *pdev) { + const struct ocelot_match_data *data; struct device *dev = &pdev->dev; struct ocelot_pinctrl *info; struct reset_control *reset; @@ -1929,7 +1980,16 @@ static int ocelot_pinctrl_probe(struct platform_device *pdev) if (!info) return -ENOMEM; - info->desc = (struct pinctrl_desc *)device_get_match_data(dev); + data = device_get_match_data(dev); + if (!data) + return -EINVAL; + + info->desc = devm_kmemdup(dev, &data->desc, sizeof(*info->desc), + GFP_KERNEL); + if (!info->desc) + return -ENOMEM; + + info->pincfg_data = &data->pincfg_data; reset = devm_reset_control_get_optional_shared(dev, "switch"); if (IS_ERR(reset)) @@ -1956,7 +2016,7 @@ static int ocelot_pinctrl_probe(struct platform_device *pdev) /* Pinconf registers */ if (info->desc->confops) { - pincfg = ocelot_pinctrl_create_pincfg(pdev); + pincfg = ocelot_pinctrl_create_pincfg(pdev, info); if (IS_ERR(pincfg)) dev_dbg(dev, "Failed to create pincfg regmap\n"); else diff --git a/drivers/pinctrl/ralink/pinctrl-ralink.c b/drivers/pinctrl/ralink/pinctrl-ralink.c index 63429a287434..770862f45b3f 100644 --- a/drivers/pinctrl/ralink/pinctrl-ralink.c +++ b/drivers/pinctrl/ralink/pinctrl-ralink.c @@ -266,6 +266,8 @@ static int ralink_pinctrl_pins(struct ralink_priv *p) p->func[i]->pin_count, sizeof(int), GFP_KERNEL); + if (!p->func[i]->pins) + return -ENOMEM; for (j = 0; j < p->func[i]->pin_count; j++) p->func[i]->pins[j] = p->func[i]->pin_first + j; diff --git a/drivers/pinctrl/sunplus/sppctl.c b/drivers/pinctrl/sunplus/sppctl.c index 3ba47040ac42..2b3335ab56c6 100644 --- a/drivers/pinctrl/sunplus/sppctl.c +++ b/drivers/pinctrl/sunplus/sppctl.c @@ -871,6 +871,9 @@ static int sppctl_dt_node_to_map(struct pinctrl_dev *pctldev, struct device_node } *map = kcalloc(*num_maps + nmG, sizeof(**map), GFP_KERNEL); + if (*map == NULL) + return -ENOMEM; + for (i = 0; i < (*num_maps); i++) { dt_pin = be32_to_cpu(list[i]); pin_num = FIELD_GET(GENMASK(31, 24), dt_pin); diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index 458218f88c5e..fe4971b65c64 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -176,6 +176,7 @@ config PTP_1588_CLOCK_OCP depends on !S390 depends on COMMON_CLK select NET_DEVLINK + select CRC16 help This driver adds support for an OpenCompute time card. diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 9e54fe76a9b2..35d4b398c197 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -3565,7 +3565,7 @@ static void qeth_flush_buffers(struct qeth_qdio_out_q *queue, int index, if (!atomic_read(&queue->set_pci_flags_count)) { /* * there's no outstanding PCI any more, so we - * have to request a PCI to be sure the the PCI + * have to request a PCI to be sure the PCI * will wake at some time in the future then we * can flush packed buffers that might still be * hanging around, which can happen if no diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index b519f4b59d30..5e8887fa02c8 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -11386,6 +11386,7 @@ scsih_shutdown(struct pci_dev *pdev) _scsih_ir_shutdown(ioc); _scsih_nvme_shutdown(ioc); mpt3sas_base_mask_interrupts(ioc); + mpt3sas_base_stop_watchdog(ioc); ioc->shost_recovery = 1; mpt3sas_base_make_ioc_ready(ioc, SOFT_RESET); ioc->shost_recovery = 0; diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index a480c4d589f5..729e309e6034 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -450,7 +450,7 @@ static int sg_io(struct scsi_device *sdev, struct sg_io_hdr *hdr, fmode_t mode) goto out_put_request; ret = 0; - if (hdr->iovec_count) { + if (hdr->iovec_count && hdr->dxfer_len) { struct iov_iter i; struct iovec *iov = NULL; diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c index 775c0bf2f923..0933948d7df3 100644 --- a/drivers/spi/spi-bcm2835.c +++ b/drivers/spi/spi-bcm2835.c @@ -1138,10 +1138,14 @@ static void bcm2835_spi_handle_err(struct spi_controller *ctlr, struct bcm2835_spi *bs = spi_controller_get_devdata(ctlr); /* if an error occurred and we have an active dma, then terminate */ - dmaengine_terminate_sync(ctlr->dma_tx); - bs->tx_dma_active = false; - dmaengine_terminate_sync(ctlr->dma_rx); - bs->rx_dma_active = false; + if (ctlr->dma_tx) { + dmaengine_terminate_sync(ctlr->dma_tx); + bs->tx_dma_active = false; + } + if (ctlr->dma_rx) { + dmaengine_terminate_sync(ctlr->dma_rx); + bs->rx_dma_active = false; + } bcm2835_spi_undo_prologue(bs); /* and reset */ diff --git a/drivers/spi/spi-cadence.c b/drivers/spi/spi-cadence.c index 31d778e9d255..6a7f7df1e776 100644 --- a/drivers/spi/spi-cadence.c +++ b/drivers/spi/spi-cadence.c @@ -69,7 +69,7 @@ #define CDNS_SPI_BAUD_DIV_SHIFT 3 /* Baud rate divisor shift in CR */ #define CDNS_SPI_SS_SHIFT 10 /* Slave Select field shift in CR */ #define CDNS_SPI_SS0 0x1 /* Slave Select zero */ -#define CDNS_SPI_NOSS 0x3C /* No Slave select */ +#define CDNS_SPI_NOSS 0xF /* No Slave select */ /* * SPI Interrupt Registers bit Masks diff --git a/drivers/spi/spi-rspi.c b/drivers/spi/spi-rspi.c index 7a014eeec2d0..411b1307b7fd 100644 --- a/drivers/spi/spi-rspi.c +++ b/drivers/spi/spi-rspi.c @@ -613,6 +613,10 @@ static int rspi_dma_transfer(struct rspi_data *rspi, struct sg_table *tx, rspi->dma_callbacked, HZ); if (ret > 0 && rspi->dma_callbacked) { ret = 0; + if (tx) + dmaengine_synchronize(rspi->ctlr->dma_tx); + if (rx) + dmaengine_synchronize(rspi->ctlr->dma_rx); } else { if (!ret) { dev_err(&rspi->ctlr->dev, "DMA timeout\n"); diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index c7b337480e3e..3d367be71728 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -2953,37 +2953,59 @@ ufshcd_dev_cmd_completion(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) static int ufshcd_wait_for_dev_cmd(struct ufs_hba *hba, struct ufshcd_lrb *lrbp, int max_timeout) { - int err = 0; - unsigned long time_left; + unsigned long time_left = msecs_to_jiffies(max_timeout); unsigned long flags; + bool pending; + int err; +retry: time_left = wait_for_completion_timeout(hba->dev_cmd.complete, - msecs_to_jiffies(max_timeout)); + time_left); - spin_lock_irqsave(hba->host->host_lock, flags); - hba->dev_cmd.complete = NULL; if (likely(time_left)) { + /* + * The completion handler called complete() and the caller of + * this function still owns the @lrbp tag so the code below does + * not trigger any race conditions. + */ + hba->dev_cmd.complete = NULL; err = ufshcd_get_tr_ocs(lrbp); if (!err) err = ufshcd_dev_cmd_completion(hba, lrbp); - } - spin_unlock_irqrestore(hba->host->host_lock, flags); - - if (!time_left) { + } else { err = -ETIMEDOUT; dev_dbg(hba->dev, "%s: dev_cmd request timedout, tag %d\n", __func__, lrbp->task_tag); - if (!ufshcd_clear_cmds(hba, 1U << lrbp->task_tag)) + if (ufshcd_clear_cmds(hba, 1U << lrbp->task_tag) == 0) { /* successfully cleared the command, retry if needed */ err = -EAGAIN; - /* - * in case of an error, after clearing the doorbell, - * we also need to clear the outstanding_request - * field in hba - */ - spin_lock_irqsave(&hba->outstanding_lock, flags); - __clear_bit(lrbp->task_tag, &hba->outstanding_reqs); - spin_unlock_irqrestore(&hba->outstanding_lock, flags); + /* + * Since clearing the command succeeded we also need to + * clear the task tag bit from the outstanding_reqs + * variable. + */ + spin_lock_irqsave(&hba->outstanding_lock, flags); + pending = test_bit(lrbp->task_tag, + &hba->outstanding_reqs); + if (pending) { + hba->dev_cmd.complete = NULL; + __clear_bit(lrbp->task_tag, + &hba->outstanding_reqs); + } + spin_unlock_irqrestore(&hba->outstanding_lock, flags); + + if (!pending) { + /* + * The completion handler ran while we tried to + * clear the command. + */ + time_left = 1; + goto retry; + } + } else { + dev_err(hba->dev, "%s: failed to clear tag %d\n", + __func__, lrbp->task_tag); + } } return err; diff --git a/drivers/ufs/host/ufshcd-pltfrm.c b/drivers/ufs/host/ufshcd-pltfrm.c index e7332cc65b1f..173aea8e9997 100644 --- a/drivers/ufs/host/ufshcd-pltfrm.c +++ b/drivers/ufs/host/ufshcd-pltfrm.c @@ -108,9 +108,20 @@ out: return ret; } +static bool phandle_exists(const struct device_node *np, + const char *phandle_name, int index) +{ + struct device_node *parse_np = of_parse_phandle(np, phandle_name, index); + + if (parse_np) + of_node_put(parse_np); + + return parse_np != NULL; +} + #define MAX_PROP_SIZE 32 static int ufshcd_populate_vreg(struct device *dev, const char *name, - struct ufs_vreg **out_vreg) + struct ufs_vreg **out_vreg) { char prop_name[MAX_PROP_SIZE]; struct ufs_vreg *vreg = NULL; @@ -122,7 +133,7 @@ static int ufshcd_populate_vreg(struct device *dev, const char *name, } snprintf(prop_name, MAX_PROP_SIZE, "%s-supply", name); - if (!of_parse_phandle(np, prop_name, 0)) { + if (!phandle_exists(np, prop_name, 0)) { dev_info(dev, "%s: Unable to find %s regulator, assuming enabled\n", __func__, prop_name); goto out; diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c index 90ce16b6e05f..f422f9c58ba7 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.c +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -632,16 +632,19 @@ static int __init sev_guest_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct snp_guest_dev *snp_dev; struct miscdevice *misc; + void __iomem *mapping; int ret; if (!dev->platform_data) return -ENODEV; data = (struct sev_guest_platform_data *)dev->platform_data; - layout = (__force void *)ioremap_encrypted(data->secrets_gpa, PAGE_SIZE); - if (!layout) + mapping = ioremap_encrypted(data->secrets_gpa, PAGE_SIZE); + if (!mapping) return -ENODEV; + layout = (__force void *)mapping; + ret = -ENOMEM; snp_dev = devm_kzalloc(&pdev->dev, sizeof(struct snp_guest_dev), GFP_KERNEL); if (!snp_dev) @@ -706,7 +709,7 @@ e_free_response: e_free_request: free_shared_pages(snp_dev->request, sizeof(struct snp_guest_msg)); e_unmap: - iounmap(layout); + iounmap(mapping); return ret; } diff --git a/fs/attr.c b/fs/attr.c index dbe996b0dedf..b5b8835ddf15 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -22,7 +22,7 @@ * chown_ok - verify permissions to chown inode * @mnt_userns: user namespace of the mount @inode was found from * @inode: inode to check permissions on - * @uid: uid to chown @inode to + * @ia_vfsuid: uid to chown @inode to * * If the inode has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then @@ -31,15 +31,15 @@ * performed on the raw inode simply passs init_user_ns. */ static bool chown_ok(struct user_namespace *mnt_userns, - const struct inode *inode, - kuid_t uid) + const struct inode *inode, vfsuid_t ia_vfsuid) { - kuid_t kuid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, inode->i_uid)) + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid()) && + vfsuid_eq(ia_vfsuid, vfsuid)) return true; if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; - if (uid_eq(kuid, INVALID_UID) && + if (!vfsuid_valid(vfsuid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; @@ -49,7 +49,7 @@ static bool chown_ok(struct user_namespace *mnt_userns, * chgrp_ok - verify permissions to chgrp inode * @mnt_userns: user namespace of the mount @inode was found from * @inode: inode to check permissions on - * @gid: gid to chown @inode to + * @ia_vfsgid: gid to chown @inode to * * If the inode has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then @@ -58,21 +58,19 @@ static bool chown_ok(struct user_namespace *mnt_userns, * performed on the raw inode simply passs init_user_ns. */ static bool chgrp_ok(struct user_namespace *mnt_userns, - const struct inode *inode, kgid_t gid) + const struct inode *inode, vfsgid_t ia_vfsgid) { - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) { - kgid_t mapped_gid; - - if (gid_eq(gid, inode->i_gid)) + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) { + if (vfsgid_eq(ia_vfsgid, vfsgid)) return true; - mapped_gid = mapped_kgid_fs(mnt_userns, i_user_ns(inode), gid); - if (in_group_p(mapped_gid)) + if (vfsgid_in_group_p(ia_vfsgid)) return true; } if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; - if (gid_eq(kgid, INVALID_GID) && + if (!vfsgid_valid(vfsgid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; @@ -120,28 +118,29 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, goto kill_priv; /* Make sure a caller can chown. */ - if ((ia_valid & ATTR_UID) && !chown_ok(mnt_userns, inode, attr->ia_uid)) + if ((ia_valid & ATTR_UID) && + !chown_ok(mnt_userns, inode, attr->ia_vfsuid)) return -EPERM; /* Make sure caller can chgrp. */ - if ((ia_valid & ATTR_GID) && !chgrp_ok(mnt_userns, inode, attr->ia_gid)) + if ((ia_valid & ATTR_GID) && + !chgrp_ok(mnt_userns, inode, attr->ia_vfsgid)) return -EPERM; /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { - kgid_t mapped_gid; + vfsgid_t vfsgid; if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; if (ia_valid & ATTR_GID) - mapped_gid = mapped_kgid_fs(mnt_userns, - i_user_ns(inode), attr->ia_gid); + vfsgid = attr->ia_vfsgid; else - mapped_gid = i_gid_into_mnt(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); /* Also check the setgid bit! */ - if (!in_group_p(mapped_gid) && + if (!vfsgid_in_group_p(vfsgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } @@ -219,9 +218,7 @@ EXPORT_SYMBOL(inode_newsize_ok); * setattr_copy must be called with i_mutex held. * * setattr_copy updates the inode's metadata with that specified - * in attr on idmapped mounts. If file ownership is changed setattr_copy - * doesn't map ia_uid and ia_gid. It will asssume the caller has already - * provided the intended values. Necessary permission checks to determine + * in attr on idmapped mounts. Necessary permission checks to determine * whether or not the S_ISGID property needs to be removed are performed with * the correct idmapped mount permission helpers. * Noticeably missing is inode size update, which is more complex @@ -242,10 +239,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, { unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -254,8 +249,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (!in_group_p(kgid) && + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + if (!vfsgid_in_group_p(vfsgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; @@ -306,9 +301,6 @@ EXPORT_SYMBOL(may_setattr); * retry. Because breaking a delegation may take a long time, the * caller should drop the i_mutex before doing so. * - * If file ownership is changed notify_change() doesn't map ia_uid and - * ia_gid. It will asssume the caller has already provided the intended values. - * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. Also, passing NULL is fine for callers holding @@ -397,23 +389,25 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, * namespace of the superblock. */ if (ia_valid & ATTR_UID && - !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid)) + !vfsuid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns, + attr->ia_vfsuid)) return -EOVERFLOW; if (ia_valid & ATTR_GID && - !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid)) + !vfsgid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns, + attr->ia_vfsgid)) return -EOVERFLOW; /* Don't allow modifications of files with invalid uids or * gids unless those uids & gids are being made valid. */ if (!(ia_valid & ATTR_UID) && - !uid_valid(i_uid_into_mnt(mnt_userns, inode))) + !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode))) return -EOVERFLOW; if (!(ia_valid & ATTR_GID) && - !gid_valid(i_gid_into_mnt(mnt_userns, inode))) + !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) return -EOVERFLOW; - error = security_inode_setattr(dentry, attr); + error = security_inode_setattr(mnt_userns, dentry, attr); if (error) return error; error = try_break_deleg(inode, delegated_inode); diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index ee92634196a8..1105ce3c80cb 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -9,6 +9,15 @@ menuconfig DLM A general purpose distributed lock manager for kernel or userspace applications. +config DLM_DEPRECATED_API + bool "DLM deprecated API" + depends on DLM + help + Enables deprecated DLM timeout features that will be removed in + later Linux kernel releases. + + If you are unsure, say N. + config DLM_DEBUG bool "DLM debugging" depends on DLM diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile index 3545fdafc6fb..71dab733cf9a 100644 --- a/fs/dlm/Makefile +++ b/fs/dlm/Makefile @@ -9,7 +9,6 @@ dlm-y := ast.o \ member.o \ memory.o \ midcomms.o \ - netlink.o \ lowcomms.o \ plock.o \ rcom.o \ @@ -18,5 +17,6 @@ dlm-y := ast.o \ requestqueue.o \ user.o \ util.o +dlm-$(CONFIG_DLM_DEPRECATED_API) += netlink.o dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index bfac462dd3e8..19ef136f9e4f 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -255,13 +255,13 @@ void dlm_callback_work(struct work_struct *work) if (callbacks[i].flags & DLM_CB_SKIP) { continue; } else if (callbacks[i].flags & DLM_CB_BAST) { - bastfn(lkb->lkb_astparam, callbacks[i].mode); trace_dlm_bast(ls, lkb, callbacks[i].mode); + bastfn(lkb->lkb_astparam, callbacks[i].mode); } else if (callbacks[i].flags & DLM_CB_CAST) { lkb->lkb_lksb->sb_status = callbacks[i].sb_status; lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; + trace_dlm_ast(ls, lkb); castfn(lkb->lkb_astparam); - trace_dlm_ast(ls, lkb, lkb->lkb_lksb); } } diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 42eee2783756..ac8b62106ce0 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -75,8 +75,9 @@ struct dlm_cluster { unsigned int cl_log_info; unsigned int cl_protocol; unsigned int cl_mark; +#ifdef CONFIG_DLM_DEPRECATED_API unsigned int cl_timewarn_cs; - unsigned int cl_waitwarn_us; +#endif unsigned int cl_new_rsb_count; unsigned int cl_recover_callbacks; char cl_cluster_name[DLM_LOCKSPACE_LEN]; @@ -102,8 +103,9 @@ enum { CLUSTER_ATTR_LOG_INFO, CLUSTER_ATTR_PROTOCOL, CLUSTER_ATTR_MARK, +#ifdef CONFIG_DLM_DEPRECATED_API CLUSTER_ATTR_TIMEWARN_CS, - CLUSTER_ATTR_WAITWARN_US, +#endif CLUSTER_ATTR_NEW_RSB_COUNT, CLUSTER_ATTR_RECOVER_CALLBACKS, CLUSTER_ATTR_CLUSTER_NAME, @@ -224,8 +226,9 @@ CLUSTER_ATTR(log_debug, NULL); CLUSTER_ATTR(log_info, NULL); CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running); CLUSTER_ATTR(mark, NULL); +#ifdef CONFIG_DLM_DEPRECATED_API CLUSTER_ATTR(timewarn_cs, dlm_check_zero); -CLUSTER_ATTR(waitwarn_us, NULL); +#endif CLUSTER_ATTR(new_rsb_count, NULL); CLUSTER_ATTR(recover_callbacks, NULL); @@ -240,8 +243,9 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info, [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol, [CLUSTER_ATTR_MARK] = &cluster_attr_mark, +#ifdef CONFIG_DLM_DEPRECATED_API [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs, - [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us, +#endif [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count, [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks, [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name, @@ -432,8 +436,9 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_log_debug = dlm_config.ci_log_debug; cl->cl_log_info = dlm_config.ci_log_info; cl->cl_protocol = dlm_config.ci_protocol; +#ifdef CONFIG_DLM_DEPRECATED_API cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; - cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; +#endif cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks; memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name, @@ -954,8 +959,9 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_LOG_INFO 1 #define DEFAULT_PROTOCOL DLM_PROTO_TCP #define DEFAULT_MARK 0 +#ifdef CONFIG_DLM_DEPRECATED_API #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ -#define DEFAULT_WAITWARN_US 0 +#endif #define DEFAULT_NEW_RSB_COUNT 128 #define DEFAULT_RECOVER_CALLBACKS 0 #define DEFAULT_CLUSTER_NAME "" @@ -971,8 +977,9 @@ struct dlm_config_info dlm_config = { .ci_log_info = DEFAULT_LOG_INFO, .ci_protocol = DEFAULT_PROTOCOL, .ci_mark = DEFAULT_MARK, +#ifdef CONFIG_DLM_DEPRECATED_API .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, - .ci_waitwarn_us = DEFAULT_WAITWARN_US, +#endif .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT, .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS, .ci_cluster_name = DEFAULT_CLUSTER_NAME diff --git a/fs/dlm/config.h b/fs/dlm/config.h index df92b0a07fc6..55c5f2c13ebd 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -37,8 +37,9 @@ struct dlm_config_info { int ci_log_info; int ci_protocol; int ci_mark; +#ifdef CONFIG_DLM_DEPRECATED_API int ci_timewarn_cs; - int ci_waitwarn_us; +#endif int ci_new_rsb_count; int ci_recover_callbacks; char ci_cluster_name[DLM_LOCKSPACE_LEN]; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 776c3ed519f0..8aca8085d24e 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -145,7 +145,9 @@ struct dlm_args { void (*bastfn) (void *astparam, int mode); int mode; struct dlm_lksb *lksb; +#ifdef CONFIG_DLM_DEPRECATED_API unsigned long timeout; +#endif }; @@ -203,10 +205,20 @@ struct dlm_args { #define DLM_IFL_OVERLAP_UNLOCK 0x00080000 #define DLM_IFL_OVERLAP_CANCEL 0x00100000 #define DLM_IFL_ENDOFLIFE 0x00200000 +#ifdef CONFIG_DLM_DEPRECATED_API #define DLM_IFL_WATCH_TIMEWARN 0x00400000 #define DLM_IFL_TIMEOUT_CANCEL 0x00800000 +#endif #define DLM_IFL_DEADLOCK_CANCEL 0x01000000 #define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ +/* least significant 2 bytes are message changed, they are full transmitted + * but at receive side only the 2 bytes LSB will be set. + * + * Even wireshark dlm dissector does only evaluate the lower bytes and note + * that they may not be used on transceiver side, we assume the higher bytes + * are for internal use or reserved so long they are not parsed on receiver + * side. + */ #define DLM_IFL_USER 0x00000001 #define DLM_IFL_ORPHAN 0x00000002 @@ -249,10 +261,12 @@ struct dlm_lkb { struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */ struct list_head lkb_wait_reply; /* waiting for remote reply */ struct list_head lkb_ownqueue; /* list of locks for a process */ - struct list_head lkb_time_list; ktime_t lkb_timestamp; - ktime_t lkb_wait_time; + +#ifdef CONFIG_DLM_DEPRECATED_API + struct list_head lkb_time_list; unsigned long lkb_timeout_cs; +#endif struct mutex lkb_cb_mutex; struct work_struct lkb_cb_work; @@ -568,8 +582,10 @@ struct dlm_ls { struct mutex ls_orphans_mutex; struct list_head ls_orphans; +#ifdef CONFIG_DLM_DEPRECATED_API struct mutex ls_timeout_mutex; struct list_head ls_timeout; +#endif spinlock_t ls_new_rsb_spin; int ls_new_rsb_count; @@ -606,8 +622,8 @@ struct dlm_ls { wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ int ls_uevent_result; - struct completion ls_members_done; - int ls_members_result; + struct completion ls_recovery_done; + int ls_recovery_result; struct miscdevice ls_device; @@ -688,7 +704,9 @@ struct dlm_ls { #define LSFL_RCOM_READY 5 #define LSFL_RCOM_WAIT 6 #define LSFL_UEVENT_WAIT 7 +#ifdef CONFIG_DLM_DEPRECATED_API #define LSFL_TIMEWARN 8 +#endif #define LSFL_CB_DELAY 9 #define LSFL_NODIR 10 @@ -741,9 +759,15 @@ static inline int dlm_no_directory(struct dlm_ls *ls) return test_bit(LSFL_NODIR, &ls->ls_flags); } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_netlink_init(void); void dlm_netlink_exit(void); void dlm_timeout_warn(struct dlm_lkb *lkb); +#else +static inline int dlm_netlink_init(void) { return 0; } +static inline void dlm_netlink_exit(void) { }; +static inline void dlm_timeout_warn(struct dlm_lkb *lkb) { }; +#endif int dlm_plock_init(void); void dlm_plock_exit(void); diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 226822f49d30..dac7eb75dba9 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -296,12 +296,14 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb);); +#ifdef CONFIG_DLM_DEPRECATED_API /* if the operation was a cancel, then return -DLM_ECANCEL, if a timeout caused the cancel then return -ETIMEDOUT */ if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) { lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL; rv = -ETIMEDOUT; } +#endif if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) { lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL; @@ -1210,7 +1212,9 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, kref_init(&lkb->lkb_ref); INIT_LIST_HEAD(&lkb->lkb_ownqueue); INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); +#ifdef CONFIG_DLM_DEPRECATED_API INIT_LIST_HEAD(&lkb->lkb_time_list); +#endif INIT_LIST_HEAD(&lkb->lkb_cb_list); mutex_init(&lkb->lkb_cb_mutex); INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); @@ -1306,6 +1310,13 @@ static inline void hold_lkb(struct dlm_lkb *lkb) kref_get(&lkb->lkb_ref); } +static void unhold_lkb_assert(struct kref *kref) +{ + struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref); + + DLM_ASSERT(false, dlm_print_lkb(lkb);); +} + /* This is called when we need to remove a reference and are certain it's not the last ref. e.g. del_lkb is always called between a find_lkb/put_lkb and is always the inverse of a previous add_lkb. @@ -1313,9 +1324,7 @@ static inline void hold_lkb(struct dlm_lkb *lkb) static inline void unhold_lkb(struct dlm_lkb *lkb) { - int rv; - rv = kref_put(&lkb->lkb_ref, kill_lkb); - DLM_ASSERT(!rv, dlm_print_lkb(lkb);); + kref_put(&lkb->lkb_ref, unhold_lkb_assert); } static void lkb_add_ordered(struct list_head *new, struct list_head *head, @@ -1402,75 +1411,6 @@ static int msg_reply_type(int mstype) return -1; } -static int nodeid_warned(int nodeid, int num_nodes, int *warned) -{ - int i; - - for (i = 0; i < num_nodes; i++) { - if (!warned[i]) { - warned[i] = nodeid; - return 0; - } - if (warned[i] == nodeid) - return 1; - } - return 0; -} - -void dlm_scan_waiters(struct dlm_ls *ls) -{ - struct dlm_lkb *lkb; - s64 us; - s64 debug_maxus = 0; - u32 debug_scanned = 0; - u32 debug_expired = 0; - int num_nodes = 0; - int *warned = NULL; - - if (!dlm_config.ci_waitwarn_us) - return; - - mutex_lock(&ls->ls_waiters_mutex); - - list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { - if (!lkb->lkb_wait_time) - continue; - - debug_scanned++; - - us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time)); - - if (us < dlm_config.ci_waitwarn_us) - continue; - - lkb->lkb_wait_time = 0; - - debug_expired++; - if (us > debug_maxus) - debug_maxus = us; - - if (!num_nodes) { - num_nodes = ls->ls_num_nodes; - warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL); - } - if (!warned) - continue; - if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned)) - continue; - - log_error(ls, "waitwarn %x %lld %d us check connection to " - "node %d", lkb->lkb_id, (long long)us, - dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid); - } - mutex_unlock(&ls->ls_waiters_mutex); - kfree(warned); - - if (debug_expired) - log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us", - debug_scanned, debug_expired, - dlm_config.ci_waitwarn_us, (long long)debug_maxus); -} - /* add/remove lkb from global waiters list of lkb's waiting for a reply from a remote node */ @@ -1514,7 +1454,6 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) lkb->lkb_wait_count++; lkb->lkb_wait_type = mstype; - lkb->lkb_wait_time = ktime_get(); lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */ hold_lkb(lkb); list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); @@ -1842,6 +1781,7 @@ void dlm_scan_rsbs(struct dlm_ls *ls) } } +#ifdef CONFIG_DLM_DEPRECATED_API static void add_timeout(struct dlm_lkb *lkb) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; @@ -1962,17 +1902,11 @@ void dlm_adjust_timeouts(struct dlm_ls *ls) list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); mutex_unlock(&ls->ls_timeout_mutex); - - if (!dlm_config.ci_waitwarn_us) - return; - - mutex_lock(&ls->ls_waiters_mutex); - list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { - if (ktime_to_us(lkb->lkb_wait_time)) - lkb->lkb_wait_time = ktime_get(); - } - mutex_unlock(&ls->ls_waiters_mutex); } +#else +static void add_timeout(struct dlm_lkb *lkb) { } +static void del_timeout(struct dlm_lkb *lkb) { } +#endif /* lkb is master or local copy */ @@ -2837,12 +2771,20 @@ static void confirm_master(struct dlm_rsb *r, int error) } } +#ifdef CONFIG_DLM_DEPRECATED_API static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, int namelen, unsigned long timeout_cs, void (*ast) (void *astparam), void *astparam, void (*bast) (void *astparam, int mode), struct dlm_args *args) +#else +static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, + int namelen, void (*ast)(void *astparam), + void *astparam, + void (*bast)(void *astparam, int mode), + struct dlm_args *args) +#endif { int rv = -EINVAL; @@ -2895,7 +2837,9 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, args->astfn = ast; args->astparam = astparam; args->bastfn = bast; +#ifdef CONFIG_DLM_DEPRECATED_API args->timeout = timeout_cs; +#endif args->mode = mode; args->lksb = lksb; rv = 0; @@ -2951,7 +2895,9 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_lksb = args->lksb; lkb->lkb_lvbptr = args->lksb->sb_lvbptr; lkb->lkb_ownpid = (int) current->pid; +#ifdef CONFIG_DLM_DEPRECATED_API lkb->lkb_timeout_cs = args->timeout; +#endif rv = 0; out: if (rv) @@ -3472,10 +3418,15 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error) goto out; - trace_dlm_lock_start(ls, lkb, mode, flags); + trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags); +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, lksb, flags, namelen, 0, ast, astarg, bast, &args); +#else + error = set_lock_args(mode, lksb, flags, namelen, ast, astarg, bast, + &args); +#endif if (error) goto out_put; @@ -3487,7 +3438,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error == -EINPROGRESS) error = 0; out_put: - trace_dlm_lock_end(ls, lkb, mode, flags, error); + trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error); if (convert || error) __put_lkb(ls, lkb); @@ -5839,9 +5790,14 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) return 0; } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen, unsigned long timeout_cs) +#else +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, + int mode, uint32_t flags, void *name, unsigned int namelen) +#endif { struct dlm_lkb *lkb; struct dlm_args args; @@ -5864,8 +5820,13 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, goto out; } } +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, fake_astfn, ua, fake_bastfn, &args); +#else + error = set_lock_args(mode, &ua->lksb, flags, namelen, fake_astfn, ua, + fake_bastfn, &args); +#endif if (error) { kfree(ua->lksb.sb_lvbptr); ua->lksb.sb_lvbptr = NULL; @@ -5904,9 +5865,14 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, return error; } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, uint32_t lkid, char *lvb_in, unsigned long timeout_cs) +#else +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in) +#endif { struct dlm_lkb *lkb; struct dlm_args args; @@ -5941,8 +5907,13 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, ua->bastaddr = ua_tmp->bastaddr; ua->user_lksb = ua_tmp->user_lksb; +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs, fake_astfn, ua, fake_bastfn, &args); +#else + error = set_lock_args(mode, &ua->lksb, flags, 0, fake_astfn, ua, + fake_bastfn, &args); +#endif if (error) goto out_put; @@ -5966,7 +5937,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, void *name, unsigned int namelen, - unsigned long timeout_cs, uint32_t *lkid) + uint32_t *lkid) { struct dlm_lkb *lkb = NULL, *iter; struct dlm_user_args *ua; diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 252a5898f908..a7b6474f009d 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -24,9 +24,15 @@ int dlm_put_lkb(struct dlm_lkb *lkb); void dlm_scan_rsbs(struct dlm_ls *ls); int dlm_lock_recovery_try(struct dlm_ls *ls); void dlm_unlock_recovery(struct dlm_ls *ls); -void dlm_scan_waiters(struct dlm_ls *ls); + +#ifdef CONFIG_DLM_DEPRECATED_API void dlm_scan_timeout(struct dlm_ls *ls); void dlm_adjust_timeouts(struct dlm_ls *ls); +#else +static inline void dlm_scan_timeout(struct dlm_ls *ls) { } +static inline void dlm_adjust_timeouts(struct dlm_ls *ls) { } +#endif + int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len, unsigned int flags, int *r_nodeid, int *result); @@ -41,15 +47,22 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls); int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc); +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen, unsigned long timeout_cs); int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, uint32_t lkid, char *lvb_in, unsigned long timeout_cs); +#else +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, + uint32_t flags, void *name, unsigned int namelen); +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in); +#endif int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, void *name, unsigned int namelen, - unsigned long timeout_cs, uint32_t *lkid); + uint32_t *lkid); int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, uint32_t flags, uint32_t lkid, char *lvb_in); int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 19ed41a5da93..3972f4d86c75 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -275,7 +275,6 @@ static int dlm_scand(void *data) ls->ls_scan_time = jiffies; dlm_scan_rsbs(ls); dlm_scan_timeout(ls); - dlm_scan_waiters(ls); dlm_unlock_recovery(ls); } else { ls->ls_scan_time += HZ; @@ -490,13 +489,28 @@ static int new_lockspace(const char *name, const char *cluster, ls->ls_ops_arg = ops_arg; } - if (flags & DLM_LSFL_TIMEWARN) +#ifdef CONFIG_DLM_DEPRECATED_API + if (flags & DLM_LSFL_TIMEWARN) { + pr_warn_once("===============================================================\n" + "WARNING: the dlm DLM_LSFL_TIMEWARN flag is being deprecated and\n" + " will be removed in v6.2!\n" + " Inclusive DLM_LSFL_TIMEWARN define in UAPI header!\n" + "===============================================================\n"); + set_bit(LSFL_TIMEWARN, &ls->ls_flags); + } /* ls_exflags are forced to match among nodes, and we don't - need to require all nodes to have some flags set */ + * need to require all nodes to have some flags set + */ ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | DLM_LSFL_NEWEXCL)); +#else + /* ls_exflags are forced to match among nodes, and we don't + * need to require all nodes to have some flags set + */ + ls->ls_exflags = (flags & ~(DLM_LSFL_FS | DLM_LSFL_NEWEXCL)); +#endif size = READ_ONCE(dlm_config.ci_rsbtbl_size); ls->ls_rsbtbl_size = size; @@ -527,8 +541,10 @@ static int new_lockspace(const char *name, const char *cluster, mutex_init(&ls->ls_waiters_mutex); INIT_LIST_HEAD(&ls->ls_orphans); mutex_init(&ls->ls_orphans_mutex); +#ifdef CONFIG_DLM_DEPRECATED_API INIT_LIST_HEAD(&ls->ls_timeout); mutex_init(&ls->ls_timeout_mutex); +#endif INIT_LIST_HEAD(&ls->ls_new_rsb); spin_lock_init(&ls->ls_new_rsb_spin); @@ -548,8 +564,8 @@ static int new_lockspace(const char *name, const char *cluster, init_waitqueue_head(&ls->ls_uevent_wait); ls->ls_uevent_result = 0; - init_completion(&ls->ls_members_done); - ls->ls_members_result = -1; + init_completion(&ls->ls_recovery_done); + ls->ls_recovery_result = -1; mutex_init(&ls->ls_cb_mutex); INIT_LIST_HEAD(&ls->ls_cb_delay); @@ -645,8 +661,9 @@ static int new_lockspace(const char *name, const char *cluster, if (error) goto out_recoverd; - wait_for_completion(&ls->ls_members_done); - error = ls->ls_members_result; + /* wait until recovery is successful or failed */ + wait_for_completion(&ls->ls_recovery_done); + error = ls->ls_recovery_result; if (error) goto out_members; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 19e82f08c0e0..a4e84e8d94c8 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -529,7 +529,7 @@ static void lowcomms_write_space(struct sock *sk) return; if (!test_and_set_bit(CF_CONNECTED, &con->flags)) { - log_print("successful connected to node %d", con->nodeid); + log_print("connected to node %d", con->nodeid); queue_work(send_workqueue, &con->swork); return; } @@ -1931,7 +1931,7 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock, return ret; if (!test_and_set_bit(CF_CONNECTED, &con->flags)) - log_print("successful connected to node %d", con->nodeid); + log_print("connected to node %d", con->nodeid); return 0; } diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 98084e0cfccf..2af2ccfe43a9 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -534,7 +534,11 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) int i, error, neg = 0, low = -1; /* previously removed members that we've not finished removing need to - count as a negative change so the "neg" recovery steps will happen */ + * count as a negative change so the "neg" recovery steps will happen + * + * This functionality must report all member changes to lsops or + * midcomms layer and must never return before. + */ list_for_each_entry(memb, &ls->ls_nodes_gone, list) { log_rinfo(ls, "prev removed member %d", memb->nodeid); @@ -583,19 +587,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) *neg_out = neg; error = ping_members(ls); - /* error -EINTR means that a new recovery action is triggered. - * We ignore this recovery action and let run the new one which might - * have new member configuration. - */ - if (error == -EINTR) - error = 0; - - /* new_lockspace() may be waiting to know if the config - * is good or bad - */ - ls->ls_members_result = error; - complete(&ls->ls_members_done); - log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; } @@ -675,7 +666,16 @@ int dlm_ls_stop(struct dlm_ls *ls) if (!ls->ls_recover_begin) ls->ls_recover_begin = jiffies; - dlm_lsop_recover_prep(ls); + /* call recover_prep ops only once and not multiple times + * for each possible dlm_ls_stop() when recovery is already + * stopped. + * + * If we successful was able to clear LSFL_RUNNING bit and + * it was set we know it is the first dlm_ls_stop() call. + */ + if (new) + dlm_lsop_recover_prep(ls); + return 0; } diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 0993eebf2060..737f185aad8d 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -29,6 +29,8 @@ struct plock_async_data { struct plock_op { struct list_head list; int done; + /* if lock op got interrupted while waiting dlm_controld reply */ + bool sigint; struct dlm_plock_info info; /* if set indicates async handling */ struct plock_async_data *data; @@ -79,8 +81,7 @@ static void send_op(struct plock_op *op) abandoned waiter. So, we have to insert the unlock-close when the lock call is interrupted. */ -static void do_unlock_close(struct dlm_ls *ls, u64 number, - struct file *file, struct file_lock *fl) +static void do_unlock_close(const struct dlm_plock_info *info) { struct plock_op *op; @@ -89,15 +90,12 @@ static void do_unlock_close(struct dlm_ls *ls, u64 number, return; op->info.optype = DLM_PLOCK_OP_UNLOCK; - op->info.pid = fl->fl_pid; - op->info.fsid = ls->ls_global_id; - op->info.number = number; + op->info.pid = info->pid; + op->info.fsid = info->fsid; + op->info.number = info->number; op->info.start = 0; op->info.end = OFFSET_MAX; - if (fl->fl_lmops && fl->fl_lmops->lm_grant) - op->info.owner = (__u64) fl->fl_pid; - else - op->info.owner = (__u64)(long) fl->fl_owner; + op->info.owner = info->owner; op->info.flags |= DLM_PLOCK_FL_CLOSE; send_op(op); @@ -161,16 +159,24 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, rv = wait_event_interruptible(recv_wq, (op->done != 0)); if (rv == -ERESTARTSYS) { spin_lock(&ops_lock); - list_del(&op->list); + /* recheck under ops_lock if we got a done != 0, + * if so this interrupt case should be ignored + */ + if (op->done != 0) { + spin_unlock(&ops_lock); + goto do_lock_wait; + } + + op->sigint = true; spin_unlock(&ops_lock); - log_print("%s: wait interrupted %x %llx, op removed", + log_debug(ls, "%s: wait interrupted %x %llx pid %d", __func__, ls->ls_global_id, - (unsigned long long)number); - dlm_release_plock_op(op); - do_unlock_close(ls, number, file, fl); + (unsigned long long)number, op->info.pid); goto out; } +do_lock_wait: + WARN_ON(!list_empty(&op->list)); rv = op->info.rv; @@ -378,7 +384,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, spin_lock(&ops_lock); if (!list_empty(&send_list)) { - op = list_entry(send_list.next, struct plock_op, list); + op = list_first_entry(&send_list, struct plock_op, list); if (op->info.flags & DLM_PLOCK_FL_CLOSE) list_del(&op->list); else @@ -425,6 +431,19 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, if (iter->info.fsid == info.fsid && iter->info.number == info.number && iter->info.owner == info.owner) { + if (iter->sigint) { + list_del(&iter->list); + spin_unlock(&ops_lock); + + pr_debug("%s: sigint cleanup %x %llx pid %d", + __func__, iter->info.fsid, + (unsigned long long)iter->info.number, + iter->info.pid); + do_unlock_close(&iter->info); + memcpy(&iter->info, &info, sizeof(info)); + dlm_release_plock_op(iter); + return count; + } list_del_init(&iter->list); memcpy(&iter->info, &info, sizeof(info)); if (iter->data) @@ -443,7 +462,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, else wake_up(&recv_wq); } else - log_print("%s: no op %x %llx - may got interrupted?", __func__, + log_print("%s: no op %x %llx", __func__, info.fsid, (unsigned long long)info.number); return count; } diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index a55dfce705dd..e15eb511b04b 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -70,6 +70,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) /* * Add or remove nodes from the lockspace's ls_nodes list. + * + * Due to the fact that we must report all membership changes to lsops + * or midcomms layer, it is not permitted to abort ls_recover() until + * this is done. */ error = dlm_recover_members(ls, rv, &neg); @@ -239,14 +243,12 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) jiffies_to_msecs(jiffies - start)); mutex_unlock(&ls->ls_recoverd_active); - dlm_lsop_recover_done(ls); return 0; fail: dlm_release_root_list(ls); - log_rinfo(ls, "dlm_recover %llu error %d", - (unsigned long long)rv->seq, error); mutex_unlock(&ls->ls_recoverd_active); + return error; } @@ -257,6 +259,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) static void do_ls_recovery(struct dlm_ls *ls) { struct dlm_recover *rv = NULL; + int error; spin_lock(&ls->ls_recover_lock); rv = ls->ls_recover_args; @@ -266,7 +269,31 @@ static void do_ls_recovery(struct dlm_ls *ls) spin_unlock(&ls->ls_recover_lock); if (rv) { - ls_recover(ls, rv); + error = ls_recover(ls, rv); + switch (error) { + case 0: + ls->ls_recovery_result = 0; + complete(&ls->ls_recovery_done); + + dlm_lsop_recover_done(ls); + break; + case -EINTR: + /* if recovery was interrupted -EINTR we wait for the next + * ls_recover() iteration until it hopefully succeeds. + */ + log_rinfo(ls, "%s %llu interrupted and should be queued to run again", + __func__, (unsigned long long)rv->seq); + break; + default: + log_rinfo(ls, "%s %llu error %d", __func__, + (unsigned long long)rv->seq, error); + + /* let new_lockspace() get aware of critical error */ + ls->ls_recovery_result = error; + complete(&ls->ls_recovery_done); + break; + } + kfree(rv->nodes); kfree(rv); } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 1060b24f18d4..99e8f0744513 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -250,6 +250,14 @@ static int device_user_lock(struct dlm_user_proc *proc, goto out; } +#ifdef CONFIG_DLM_DEPRECATED_API + if (params->timeout) + pr_warn_once("========================================================\n" + "WARNING: the lkb timeout feature is being deprecated and\n" + " will be removed in v6.2!\n" + "========================================================\n"); +#endif + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); if (!ua) goto out; @@ -262,23 +270,34 @@ static int device_user_lock(struct dlm_user_proc *proc, ua->xid = params->xid; if (params->flags & DLM_LKF_CONVERT) { +#ifdef CONFIG_DLM_DEPRECATED_API error = dlm_user_convert(ls, ua, params->mode, params->flags, params->lkid, params->lvb, (unsigned long) params->timeout); +#else + error = dlm_user_convert(ls, ua, + params->mode, params->flags, + params->lkid, params->lvb); +#endif } else if (params->flags & DLM_LKF_ORPHAN) { error = dlm_user_adopt_orphan(ls, ua, params->mode, params->flags, params->name, params->namelen, - (unsigned long) params->timeout, &lkid); if (!error) error = lkid; } else { +#ifdef CONFIG_DLM_DEPRECATED_API error = dlm_user_request(ls, ua, params->mode, params->flags, params->name, params->namelen, (unsigned long) params->timeout); +#else + error = dlm_user_request(ls, ua, + params->mode, params->flags, + params->name, params->namelen); +#endif if (!error) error = ua->lksb.sb_lkid; } diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 19e6c56a9f47..26fa170090b8 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -17,7 +17,7 @@ struct z_erofs_decompress_req { /* indicate the algorithm will be used for decompression */ unsigned int alg; - bool inplace_io, partial_decoding; + bool inplace_io, partial_decoding, fillgaps; }; struct z_erofs_decompressor { diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fbb037ba326e..fe8ac0e163f7 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -366,42 +366,33 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) return iomap_bmap(mapping, block, &erofs_iomap_ops); } -static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to) +static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); - loff_t align = iocb->ki_pos | iov_iter_count(to) | - iov_iter_alignment(to); - struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int blksize_mask; - - if (bdev) - blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1; - else - blksize_mask = (1 << inode->i_blkbits) - 1; - if (align & blksize_mask) - return -EINVAL; - return 0; -} - -static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ /* no need taking (shared) inode lock since it's a ro filesystem */ if (!iov_iter_count(to)) return 0; #ifdef CONFIG_FS_DAX - if (IS_DAX(iocb->ki_filp->f_mapping->host)) + if (IS_DAX(inode)) return dax_iomap_rw(iocb, to, &erofs_iomap_ops); #endif if (iocb->ki_flags & IOCB_DIRECT) { - int err = erofs_prepare_dio(iocb, to); + struct block_device *bdev = inode->i_sb->s_bdev; + unsigned int blksize_mask; + + if (bdev) + blksize_mask = bdev_logical_block_size(bdev) - 1; + else + blksize_mask = (1 << inode->i_blkbits) - 1; + + if ((iocb->ki_pos | iov_iter_count(to) | + iov_iter_alignment(to)) & blksize_mask) + return -EINVAL; - if (!err) - return iomap_dio_rw(iocb, to, &erofs_iomap_ops, - NULL, 0, NULL, 0); - if (err < 0) - return err; + return iomap_dio_rw(iocb, to, &erofs_iomap_ops, + NULL, 0, NULL, 0); } return filemap_read(iocb, to, 0); } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 6dca1900c733..2d55569f96ac 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -83,7 +83,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, j = 0; /* 'valid' bounced can only be tested after a complete round */ - if (test_bit(j, bounced)) { + if (!rq->fillgaps && test_bit(j, bounced)) { DBG_BUGON(i < lz4_max_distance_pages); DBG_BUGON(top >= lz4_max_distance_pages); availables[top++] = rq->out[i - lz4_max_distance_pages]; @@ -91,14 +91,18 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, if (page) { __clear_bit(j, bounced); - if (kaddr) { - if (kaddr + PAGE_SIZE == page_address(page)) + if (!PageHighMem(page)) { + if (!i) { + kaddr = page_address(page); + continue; + } + if (kaddr && + kaddr + PAGE_SIZE == page_address(page)) { kaddr += PAGE_SIZE; - else - kaddr = NULL; - } else if (!i) { - kaddr = page_address(page); + continue; + } } + kaddr = NULL; continue; } kaddr = NULL; diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 05a3063cf2bc..5e59b3f523eb 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -143,6 +143,7 @@ again: DBG_BUGON(z_erofs_lzma_head); z_erofs_lzma_head = head; spin_unlock(&z_erofs_lzma_lock); + wake_up_all(&z_erofs_lzma_wq); z_erofs_lzma_max_dictsize = dict_size; mutex_unlock(&lzma_resize_mutex); diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 18e59821c597..ecf28f66b97d 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -22,10 +22,9 @@ static void debug_one_dentry(unsigned char d_type, const char *de_name, } static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, - void *dentry_blk, unsigned int *ofs, + void *dentry_blk, struct erofs_dirent *de, unsigned int nameoff, unsigned int maxsize) { - struct erofs_dirent *de = dentry_blk + *ofs; const struct erofs_dirent *end = dentry_blk + nameoff; while (de < end) { @@ -59,9 +58,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, /* stopped by some reason */ return 1; ++de; - *ofs += sizeof(struct erofs_dirent); + ctx->pos += sizeof(struct erofs_dirent); } - *ofs = maxsize; return 0; } @@ -90,33 +88,33 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) nameoff = le16_to_cpu(de->nameoff); if (nameoff < sizeof(struct erofs_dirent) || - nameoff >= PAGE_SIZE) { + nameoff >= EROFS_BLKSIZ) { erofs_err(dir->i_sb, "invalid de[0].nameoff %u @ nid %llu", nameoff, EROFS_I(dir)->nid); err = -EFSCORRUPTED; - goto skip_this; + break; } maxsize = min_t(unsigned int, - dirsize - ctx->pos + ofs, PAGE_SIZE); + dirsize - ctx->pos + ofs, EROFS_BLKSIZ); /* search dirents at the arbitrary position */ if (initial) { initial = false; ofs = roundup(ofs, sizeof(struct erofs_dirent)); + ctx->pos = blknr_to_addr(i) + ofs; if (ofs >= nameoff) goto skip_this; } - err = erofs_fill_dentries(dir, ctx, de, &ofs, + err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs, nameoff, maxsize); -skip_this: - ctx->pos = blknr_to_addr(i) + ofs; - if (err) break; +skip_this: + ctx->pos = blknr_to_addr(i) + maxsize; ++i; ofs = 0; } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 724bb57075f6..5792ca9e0d5e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ + * Copyright (C) 2022 Alibaba Cloud */ #include "zdata.h" #include "compress.h" @@ -26,6 +27,82 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) }; +struct z_erofs_bvec_iter { + struct page *bvpage; + struct z_erofs_bvset *bvset; + unsigned int nr, cur; +}; + +static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter) +{ + if (iter->bvpage) + kunmap_local(iter->bvset); + return iter->bvpage; +} + +static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter) +{ + unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec; + /* have to access nextpage in advance, otherwise it will be unmapped */ + struct page *nextpage = iter->bvset->nextpage; + struct page *oldpage; + + DBG_BUGON(!nextpage); + oldpage = z_erofs_bvec_iter_end(iter); + iter->bvpage = nextpage; + iter->bvset = kmap_local_page(nextpage); + iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec); + iter->cur = 0; + return oldpage; +} + +static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvset_inline *bvset, + unsigned int bootstrap_nr, + unsigned int cur) +{ + *iter = (struct z_erofs_bvec_iter) { + .nr = bootstrap_nr, + .bvset = (struct z_erofs_bvset *)bvset, + }; + + while (cur > iter->nr) { + cur -= iter->nr; + z_erofs_bvset_flip(iter); + } + iter->cur = cur; +} + +static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **candidate_bvpage) +{ + if (iter->cur == iter->nr) { + if (!*candidate_bvpage) + return -EAGAIN; + + DBG_BUGON(iter->bvset->nextpage); + iter->bvset->nextpage = *candidate_bvpage; + z_erofs_bvset_flip(iter); + + iter->bvset->nextpage = NULL; + *candidate_bvpage = NULL; + } + iter->bvset->bvec[iter->cur++] = *bvec; + return 0; +} + +static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **old_bvpage) +{ + if (iter->cur == iter->nr) + *old_bvpage = z_erofs_bvset_flip(iter); + else + *old_bvpage = NULL; + *bvec = iter->bvset->bvec[iter->cur++]; +} + static void z_erofs_destroy_pcluster_pool(void) { int i; @@ -46,7 +123,7 @@ static int z_erofs_create_pcluster_pool(void) for (pcs = pcluster_pool; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { - size = struct_size(a, compressed_pages, pcs->maxpages); + size = struct_size(a, compressed_bvecs, pcs->maxpages); sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); pcs->slab = kmem_cache_create(pcs->name, size, 0, @@ -150,30 +227,29 @@ int __init z_erofs_init_zip_subsystem(void) return err; } -enum z_erofs_collectmode { - COLLECT_SECONDARY, - COLLECT_PRIMARY, +enum z_erofs_pclustermode { + Z_EROFS_PCLUSTER_INFLIGHT, /* - * The current collection was the tail of an exist chain, in addition - * that the previous processed chained collections are all decided to + * The current pclusters was the tail of an exist chain, in addition + * that the previous processed chained pclusters are all decided to * be hooked up to it. - * A new chain will be created for the remaining collections which are - * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED, - * the next collection cannot reuse the whole page safely in - * the following scenario: + * A new chain will be created for the remaining pclusters which are + * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED, + * the next pcluster cannot reuse the whole page safely for inplace I/O + * in the following scenario: * ________________________________________________________________ * | tail (partial) page | head (partial) page | - * | (belongs to the next cl) | (belongs to the current cl) | - * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| + * | (belongs to the next pcl) | (belongs to the current pcl) | + * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________| */ - COLLECT_PRIMARY_HOOKED, + Z_EROFS_PCLUSTER_HOOKED, /* - * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it + * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it * could be dispatched into bypass queue later due to uptodated managed * pages. All related online pages cannot be reused for inplace I/O (or - * pagevec) since it can be directly decoded without I/O submission. + * bvpage) since it can be directly decoded without I/O submission. */ - COLLECT_PRIMARY_FOLLOWED_NOINPLACE, + Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* * The current collection has been linked with the owned chain, and * could also be linked with the remaining collections, which means @@ -184,39 +260,36 @@ enum z_erofs_collectmode { * ________________________________________________________________ * | tail (partial) page | head (partial) page | * | (of the current cl) | (of the previous collection) | - * | PRIMARY_FOLLOWED or | | - * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________| + * | PCLUSTER_FOLLOWED or | | + * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________| * * [ (*) the above page can be used as inplace I/O. ] */ - COLLECT_PRIMARY_FOLLOWED, + Z_EROFS_PCLUSTER_FOLLOWED, }; struct z_erofs_decompress_frontend { struct inode *const inode; struct erofs_map_blocks map; + struct z_erofs_bvec_iter biter; - struct z_erofs_pagevec_ctor vector; - + struct page *candidate_bvpage; struct z_erofs_pcluster *pcl, *tailpcl; - /* a pointer used to pick up inplace I/O pages */ - struct page **icpage_ptr; z_erofs_next_pcluster_t owned_head; - - enum z_erofs_collectmode mode; + enum z_erofs_pclustermode mode; bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; + + /* a pointer used to pick up inplace I/O pages */ + unsigned int icur; }; #define DECOMPRESS_FRONTEND_INIT(__i) { \ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = COLLECT_PRIMARY_FOLLOWED, .backmost = true } - -static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; -static DEFINE_MUTEX(z_pagemap_global_lock); + .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, enum z_erofs_cache_alloctype type, @@ -231,24 +304,21 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, */ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; - struct page **pages; - pgoff_t index; + unsigned int i; - if (fe->mode < COLLECT_PRIMARY_FOLLOWED) + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; - pages = pcl->compressed_pages; - index = pcl->obj.index; - for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) { + for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page; compressed_page_t t; struct page *newpage = NULL; /* the compressed page was loaded before */ - if (READ_ONCE(*pages)) + if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; - page = find_get_page(mc, index); + page = find_get_page(mc, pcl->obj.index + i); if (page) { t = tag_compressed_page_justfound(page); @@ -269,7 +339,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, } } - if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) + if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, + tagptr_cast_ptr(t))) continue; if (page) @@ -283,7 +354,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, * managed cache since it can be moved to the bypass queue instead. */ if (standalone) - fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* called by erofs_shrinker to get rid of all compressed_pages */ @@ -300,7 +371,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, * therefore no need to worry about available decompression users. */ for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page = pcl->compressed_pages[i]; + struct page *page = pcl->compressed_bvecs[i].page; if (!page) continue; @@ -313,7 +384,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, continue; /* barrier is implied in the following 'unlock_page' */ - WRITE_ONCE(pcl->compressed_pages[i], NULL); + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); detach_page_private(page); unlock_page(page); } @@ -323,56 +394,59 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, int erofs_try_to_free_cached_page(struct page *page) { struct z_erofs_pcluster *const pcl = (void *)page_private(page); - int ret = 0; /* 0 - busy */ + int ret, i; - if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { - unsigned int i; + if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) + return 0; - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - for (i = 0; i < pcl->pclusterpages; ++i) { - if (pcl->compressed_pages[i] == page) { - WRITE_ONCE(pcl->compressed_pages[i], NULL); - ret = 1; - break; - } + ret = 0; + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + for (i = 0; i < pcl->pclusterpages; ++i) { + if (pcl->compressed_bvecs[i].page == page) { + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); + ret = 1; + break; } - erofs_workgroup_unfreeze(&pcl->obj, 1); - - if (ret) - detach_page_private(page); } + erofs_workgroup_unfreeze(&pcl->obj, 1); + if (ret) + detach_page_private(page); return ret; } -/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, - struct page *page) + struct z_erofs_bvec *bvec) { struct z_erofs_pcluster *const pcl = fe->pcl; - while (fe->icpage_ptr > pcl->compressed_pages) - if (!cmpxchg(--fe->icpage_ptr, NULL, page)) + while (fe->icur > 0) { + if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, + NULL, bvec->page)) { + pcl->compressed_bvecs[fe->icur] = *bvec; return true; + } + } return false; } /* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, - struct page *page, enum z_erofs_page_type type, - bool pvec_safereuse) + struct z_erofs_bvec *bvec, bool exclusive) { int ret; - /* give priority for inplaceio */ - if (fe->mode >= COLLECT_PRIMARY && - type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && - z_erofs_try_inplace_io(fe, page)) - return 0; - - ret = z_erofs_pagevec_enqueue(&fe->vector, page, type, - pvec_safereuse); - fe->pcl->vcnt += (unsigned int)ret; - return ret ? 0 : -EAGAIN; + if (exclusive) { + /* give priority for inplaceio to use file pages first */ + if (z_erofs_try_inplace_io(fe, bvec)) + return 0; + /* otherwise, check if it can be used as a bvpage */ + if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && + !fe->candidate_bvpage) + fe->candidate_bvpage = bvec->page; + } + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); + fe->pcl->vcnt += (ret >= 0); + return ret; } static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) @@ -385,7 +459,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) *owned_head) == Z_EROFS_PCLUSTER_NIL) { *owned_head = &pcl->next; /* so we can attach this pcluster to our submission chain. */ - f->mode = COLLECT_PRIMARY_FOLLOWED; + f->mode = Z_EROFS_PCLUSTER_FOLLOWED; return; } @@ -393,66 +467,21 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) * type 2, link to the end of an existing open chain, be careful * that its submission is controlled by the original attached chain. */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + if (*owned_head != &pcl->next && pcl != f->tailpcl && + cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, *owned_head) == Z_EROFS_PCLUSTER_TAIL) { *owned_head = Z_EROFS_PCLUSTER_TAIL; - f->mode = COLLECT_PRIMARY_HOOKED; + f->mode = Z_EROFS_PCLUSTER_HOOKED; f->tailpcl = NULL; return; } /* type 3, it belongs to a chain, but it isn't the end of the chain */ - f->mode = COLLECT_PRIMARY; + f->mode = Z_EROFS_PCLUSTER_INFLIGHT; } -static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) -{ - struct z_erofs_pcluster *pcl = fe->pcl; - unsigned int length; - - /* to avoid unexpected loop formed by corrupted images */ - if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - if (pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - length = READ_ONCE(pcl->length); - if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) { - if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - } else { - unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT; - - if (map->m_flags & EROFS_MAP_FULL_MAPPED) - llen |= Z_EROFS_PCLUSTER_FULL_LENGTH; - - while (llen > length && - length != cmpxchg_relaxed(&pcl->length, length, llen)) { - cpu_relax(); - length = READ_ONCE(pcl->length); - } - } - mutex_lock(&pcl->lock); - /* used to check tail merging loop due to corrupted images */ - if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) - fe->tailpcl = pcl; - - z_erofs_try_to_claim_pcluster(fe); - return 0; -} - -static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { + struct erofs_map_blocks *map = &fe->map; bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; struct erofs_workgroup *grp; @@ -471,14 +500,13 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, atomic_set(&pcl->obj.refcount, 1); pcl->algorithmformat = map->m_algorithmformat; - pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | - (map->m_flags & EROFS_MAP_FULL_MAPPED ? - Z_EROFS_PCLUSTER_FULL_LENGTH : 0); + pcl->length = 0; + pcl->partial = true; /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = fe->owned_head; pcl->pageofs_out = map->m_la & ~PAGE_MASK; - fe->mode = COLLECT_PRIMARY_FOLLOWED; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; /* * lock all primary followed works before visible to others @@ -494,7 +522,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, } else { pcl->obj.index = map->m_pa >> PAGE_SHIFT; - grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj); + grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); if (IS_ERR(grp)) { err = PTR_ERR(grp); goto err_out; @@ -520,11 +548,10 @@ err_out: return err; } -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) { - struct erofs_workgroup *grp; + struct erofs_map_blocks *map = &fe->map; + struct erofs_workgroup *grp = NULL; int ret; DBG_BUGON(fe->pcl); @@ -533,38 +560,35 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - if (map->m_flags & EROFS_MAP_META) { - if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - goto tailpacking; + if (!(map->m_flags & EROFS_MAP_META)) { + grp = erofs_find_workgroup(fe->inode->i_sb, + map->m_pa >> PAGE_SHIFT); + } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { + DBG_BUGON(1); + return -EFSCORRUPTED; } - grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); if (grp) { fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); + ret = -EEXIST; } else { -tailpacking: - ret = z_erofs_register_pcluster(fe, inode, map); - if (!ret) - goto out; - if (ret != -EEXIST) - return ret; + ret = z_erofs_register_pcluster(fe); } - ret = z_erofs_lookup_pcluster(fe, inode, map); - if (ret) { - erofs_workgroup_put(&fe->pcl->obj); + if (ret == -EEXIST) { + mutex_lock(&fe->pcl->lock); + /* used to check tail merging loop due to corrupted images */ + if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) + fe->tailpcl = fe->pcl; + + z_erofs_try_to_claim_pcluster(fe); + } else if (ret) { return ret; } - -out: - z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS, - fe->pcl->pagevec, fe->pcl->vcnt); + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, + Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order */ - fe->icpage_ptr = fe->pcl->compressed_pages + - z_erofs_pclusterpages(fe->pcl); + fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } @@ -593,14 +617,19 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) if (!pcl) return false; - z_erofs_pagevec_ctor_exit(&fe->vector, false); + z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); + if (fe->candidate_bvpage) { + DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); + fe->candidate_bvpage = NULL; + } + /* * if all pending pages are added, don't hold its reference * any longer if the pcluster isn't hosted by ourselves. */ - if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; @@ -628,11 +657,10 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); - bool tight = true; + bool tight = true, exclusive; enum z_erofs_cache_alloctype cache_strategy; - enum z_erofs_page_type page_type; - unsigned int cur, end, spiltted, index; + unsigned int cur, end, spiltted; int err = 0; /* register locked file pages as online pages in pack */ @@ -653,7 +681,7 @@ repeat: map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) - goto err_out; + goto out; } else { if (fe->pcl) goto hitted; @@ -663,9 +691,9 @@ repeat: if (!(map->m_flags & EROFS_MAP_MAPPED)) goto hitted; - err = z_erofs_collector_begin(fe, inode, map); + err = z_erofs_collector_begin(fe); if (err) - goto err_out; + goto out; if (z_erofs_is_inline_pcluster(fe->pcl)) { void *mp; @@ -676,11 +704,12 @@ repeat: err = PTR_ERR(mp); erofs_err(inode->i_sb, "failed to get inline page, err %d", err); - goto err_out; + goto out; } get_page(fe->map.buf.page); - WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page); - fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, + fe->map.buf.page); + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, @@ -696,10 +725,10 @@ hitted: * Ensure the current partial page belongs to this submit chain rather * than other concurrent submit chains or the noio(bypass) chain since * those chains are handled asynchronously thus the page cannot be used - * for inplace I/O or pagevec (should be processed in strict order.) + * for inplace I/O or bvpage (should be processed in a strict order.) */ - tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED && - fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE); + tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED && + fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); cur = end - min_t(unsigned int, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { @@ -707,60 +736,59 @@ hitted: goto next_part; } - /* let's derive page type */ - page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD : - (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : - (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); - + exclusive = (!cur && (!spiltted || tight)); if (cur) - tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED); + tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); retry: - err = z_erofs_attach_page(fe, page, page_type, - fe->mode >= COLLECT_PRIMARY_FOLLOWED); - /* should allocate an additional short-lived page for pagevec */ - if (err == -EAGAIN) { - struct page *const newpage = - alloc_page(GFP_NOFS | __GFP_NOFAIL); - - set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE); - err = z_erofs_attach_page(fe, newpage, - Z_EROFS_PAGE_TYPE_EXCLUSIVE, true); - if (!err) - goto retry; + err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { + .page = page, + .offset = offset - map->m_la, + .end = end, + }), exclusive); + /* should allocate an additional short-lived page for bvset */ + if (err == -EAGAIN && !fe->candidate_bvpage) { + fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); + set_page_private(fe->candidate_bvpage, + Z_EROFS_SHORTLIVED_PAGE); + goto retry; } - if (err) - goto err_out; - - index = page->index - (map->m_la >> PAGE_SHIFT); - - z_erofs_onlinepage_fixup(page, index, true); + if (err) { + DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); + goto out; + } + z_erofs_onlinepage_split(page); /* bump up the number of spiltted parts of a page */ ++spiltted; - /* also update nr_pages */ - fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1); + if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) + fe->pcl->multibases = true; + + if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && + fe->pcl->length == map->m_llen) + fe->pcl->partial = false; + if (fe->pcl->length < offset + end - map->m_la) { + fe->pcl->length = offset + end - map->m_la; + fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; + } next_part: - /* can be used for verification */ + /* shorten the remaining extent to update progress */ map->m_llen = offset + cur - map->m_la; + map->m_flags &= ~EROFS_MAP_FULL_MAPPED; end = cur; if (end > 0) goto repeat; out: + if (err) + z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", __func__, page, spiltted, map->m_llen); return err; - - /* if some error occurred while processing this page */ -err_out: - SetPageError(page); - goto out; } static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, @@ -783,97 +811,137 @@ static bool z_erofs_page_is_invalidated(struct page *page) return !page->mapping && !z_erofs_is_shortlived_page(page); } -static int z_erofs_decompress_pcluster(struct super_block *sb, - struct z_erofs_pcluster *pcl, - struct page **pagepool) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - struct z_erofs_pagevec_ctor ctor; - unsigned int i, inputsize, outputsize, llen, nr_pages; - struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; - struct page **pages, **compressed_pages, *page; +struct z_erofs_decompress_backend { + struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; + struct super_block *sb; + struct z_erofs_pcluster *pcl; - enum z_erofs_page_type page_type; - bool overlapped, partial; - int err; + /* pages with the longest decompressed length for deduplication */ + struct page **decompressed_pages; + /* pages to keep the compressed data */ + struct page **compressed_pages; - might_sleep(); - DBG_BUGON(!READ_ONCE(pcl->nr_pages)); + struct list_head decompressed_secondary_bvecs; + struct page **pagepool; + unsigned int onstack_used, nr_pages; +}; - mutex_lock(&pcl->lock); - nr_pages = pcl->nr_pages; +struct z_erofs_bvec_item { + struct z_erofs_bvec bvec; + struct list_head list; +}; - if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) { - pages = pages_onstack; - } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES && - mutex_trylock(&z_pagemap_global_lock)) { - pages = z_pagemap_global; - } else { - gfp_t gfp_flags = GFP_KERNEL; +static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, + struct z_erofs_bvec *bvec) +{ + struct z_erofs_bvec_item *item; - if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES) - gfp_flags |= __GFP_NOFAIL; + if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) { + unsigned int pgnr; + struct page *oldpage; - pages = kvmalloc_array(nr_pages, sizeof(struct page *), - gfp_flags); + pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + oldpage = be->decompressed_pages[pgnr]; + be->decompressed_pages[pgnr] = bvec->page; - /* fallback to global pagemap for the lowmem scenario */ - if (!pages) { - mutex_lock(&z_pagemap_global_lock); - pages = z_pagemap_global; - } + if (!oldpage) + return; } - for (i = 0; i < nr_pages; ++i) - pages[i] = NULL; - - err = 0; - z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, - pcl->pagevec, 0); - - for (i = 0; i < pcl->vcnt; ++i) { - unsigned int pagenr; + /* (cold path) one pcluster is requested multiple times */ + item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL); + item->bvec = *bvec; + list_add(&item->list, &be->decompressed_secondary_bvecs); +} - page = z_erofs_pagevec_dequeue(&ctor, &page_type); +static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, + int err) +{ + unsigned int off0 = be->pcl->pageofs_out; + struct list_head *p, *n; + + list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) { + struct z_erofs_bvec_item *bvi; + unsigned int end, cur; + void *dst, *src; + + bvi = container_of(p, struct z_erofs_bvec_item, list); + cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0; + end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset, + bvi->bvec.end); + dst = kmap_local_page(bvi->bvec.page); + while (cur < end) { + unsigned int pgnr, scur, len; + + pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + + scur = bvi->bvec.offset + cur - + ((pgnr << PAGE_SHIFT) - off0); + len = min_t(unsigned int, end - cur, PAGE_SIZE - scur); + if (!be->decompressed_pages[pgnr]) { + err = -EFSCORRUPTED; + cur += len; + continue; + } + src = kmap_local_page(be->decompressed_pages[pgnr]); + memcpy(dst + cur, src + scur, len); + kunmap_local(src); + cur += len; + } + kunmap_local(dst); + if (err) + z_erofs_page_mark_eio(bvi->bvec.page); + z_erofs_onlinepage_endio(bvi->bvec.page); + list_del(p); + kfree(bvi); + } +} - /* all pages in pagevec ought to be valid */ - DBG_BUGON(!page); - DBG_BUGON(z_erofs_page_is_invalidated(page)); +static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) +{ + struct z_erofs_pcluster *pcl = be->pcl; + struct z_erofs_bvec_iter biter; + struct page *old_bvpage; + int i; - if (z_erofs_put_shortlivedpage(pagepool, page)) - continue; + z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); + for (i = 0; i < pcl->vcnt; ++i) { + struct z_erofs_bvec bvec; - if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) - pagenr = 0; - else - pagenr = z_erofs_onlinepage_index(page); + z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); - DBG_BUGON(pagenr >= nr_pages); + if (old_bvpage) + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); - /* - * currently EROFS doesn't support multiref(dedup), - * so here erroring out one multiref page. - */ - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = -EFSCORRUPTED; - } - pages[pagenr] = page; + DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); + z_erofs_do_decompressed_bvec(be, &bvec); } - z_erofs_pagevec_ctor_exit(&ctor, true); - overlapped = false; - compressed_pages = pcl->compressed_pages; + old_bvpage = z_erofs_bvec_iter_end(&biter); + if (old_bvpage) + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); +} +static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, + bool *overlapped) +{ + struct z_erofs_pcluster *pcl = be->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + int i, err = 0; + + *overlapped = false; for (i = 0; i < pclusterpages; ++i) { - unsigned int pagenr; + struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; + struct page *page = bvec->page; - page = compressed_pages[i]; - /* all compressed pages ought to be valid */ - DBG_BUGON(!page); + /* compressed pages ought to be present before decompressing */ + if (!page) { + DBG_BUGON(1); + continue; + } + be->compressed_pages[i] = page; if (z_erofs_is_inline_pcluster(pcl)) { if (!PageUptodate(page)) @@ -883,109 +951,129 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, DBG_BUGON(z_erofs_page_is_invalidated(page)); if (!z_erofs_is_shortlived_page(page)) { - if (erofs_page_is_managed(sbi, page)) { + if (erofs_page_is_managed(EROFS_SB(be->sb), page)) { if (!PageUptodate(page)) err = -EIO; continue; } + z_erofs_do_decompressed_bvec(be, bvec); + *overlapped = true; + } + } - /* - * only if non-head page can be selected - * for inplace decompression - */ - pagenr = z_erofs_onlinepage_index(page); - - DBG_BUGON(pagenr >= nr_pages); - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = -EFSCORRUPTED; - } - pages[pagenr] = page; + if (err) + return err; + return 0; +} - overlapped = true; - } +static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, + int err) +{ + struct erofs_sb_info *const sbi = EROFS_SB(be->sb); + struct z_erofs_pcluster *pcl = be->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + unsigned int i, inputsize; + int err2; + struct page *page; + bool overlapped; - /* PG_error needs checking for all non-managed pages */ - if (PageError(page)) { - DBG_BUGON(PageUptodate(page)); - err = -EIO; - } + mutex_lock(&pcl->lock); + be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; + + /* allocate (de)compressed page arrays if cannot be kept on stack */ + be->decompressed_pages = NULL; + be->compressed_pages = NULL; + be->onstack_used = 0; + if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) { + be->decompressed_pages = be->onstack_pages; + be->onstack_used = be->nr_pages; + memset(be->decompressed_pages, 0, + sizeof(struct page *) * be->nr_pages); } + if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES) + be->compressed_pages = be->onstack_pages + be->onstack_used; + + if (!be->decompressed_pages) + be->decompressed_pages = + kvcalloc(be->nr_pages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + if (!be->compressed_pages) + be->compressed_pages = + kvcalloc(pclusterpages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + + z_erofs_parse_out_bvecs(be); + err2 = z_erofs_parse_in_bvecs(be, &overlapped); + if (err2) + err = err2; if (err) goto out; - llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; - if (nr_pages << PAGE_SHIFT >= pcl->pageofs_out + llen) { - outputsize = llen; - partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); - } else { - outputsize = (nr_pages << PAGE_SHIFT) - pcl->pageofs_out; - partial = true; - } - if (z_erofs_is_inline_pcluster(pcl)) inputsize = pcl->tailpacking_size; else inputsize = pclusterpages * PAGE_SIZE; err = z_erofs_decompress(&(struct z_erofs_decompress_req) { - .sb = sb, - .in = compressed_pages, - .out = pages, + .sb = be->sb, + .in = be->compressed_pages, + .out = be->decompressed_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, .inputsize = inputsize, - .outputsize = outputsize, + .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, - .partial_decoding = partial - }, pagepool); + .partial_decoding = pcl->partial, + .fillgaps = pcl->multibases, + }, be->pagepool); out: /* must handle all compressed pages before actual file pages */ if (z_erofs_is_inline_pcluster(pcl)) { - page = compressed_pages[0]; - WRITE_ONCE(compressed_pages[0], NULL); + page = pcl->compressed_bvecs[0].page; + WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); put_page(page); } else { for (i = 0; i < pclusterpages; ++i) { - page = compressed_pages[i]; + page = pcl->compressed_bvecs[i].page; if (erofs_page_is_managed(sbi, page)) continue; /* recycle all individual short-lived pages */ - (void)z_erofs_put_shortlivedpage(pagepool, page); - WRITE_ONCE(compressed_pages[i], NULL); + (void)z_erofs_put_shortlivedpage(be->pagepool, page); + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } } + if (be->compressed_pages < be->onstack_pages || + be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) + kvfree(be->compressed_pages); + z_erofs_fill_other_copies(be, err); - for (i = 0; i < nr_pages; ++i) { - page = pages[i]; + for (i = 0; i < be->nr_pages; ++i) { + page = be->decompressed_pages[i]; if (!page) continue; DBG_BUGON(z_erofs_page_is_invalidated(page)); /* recycle all individual short-lived pages */ - if (z_erofs_put_shortlivedpage(pagepool, page)) + if (z_erofs_put_shortlivedpage(be->pagepool, page)) continue; - - if (err < 0) - SetPageError(page); - + if (err) + z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); } - if (pages == z_pagemap_global) - mutex_unlock(&z_pagemap_global_lock); - else if (pages != pages_onstack) - kvfree(pages); + if (be->decompressed_pages != be->onstack_pages) + kvfree(be->decompressed_pages); - pcl->nr_pages = 0; + pcl->length = 0; + pcl->partial = true; + pcl->multibases = false; + pcl->bvset.nextpage = NULL; pcl->vcnt = 0; /* pcluster lock MUST be taken before the following line */ @@ -997,22 +1085,25 @@ out: static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, struct page **pagepool) { + struct z_erofs_decompress_backend be = { + .sb = io->sb, + .pagepool = pagepool, + .decompressed_secondary_bvecs = + LIST_HEAD_INIT(be.decompressed_secondary_bvecs), + }; z_erofs_next_pcluster_t owned = io->head; while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { - struct z_erofs_pcluster *pcl; - - /* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ + /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); - - /* no possible that 'owned' equals NULL */ + /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */ DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); - pcl = container_of(owned, struct z_erofs_pcluster, next); - owned = READ_ONCE(pcl->next); + be.pcl = container_of(owned, struct z_erofs_pcluster, next); + owned = READ_ONCE(be.pcl->next); - z_erofs_decompress_pcluster(io->sb, pcl, pagepool); - erofs_workgroup_put(&pcl->obj); + z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); + erofs_workgroup_put(&be.pcl->obj); } } @@ -1038,7 +1129,6 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (sync) { if (!atomic_add_return(bios, &io->pending_bios)) complete(&io->u.done); - return; } @@ -1071,7 +1161,7 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, int justfound; repeat: - page = READ_ONCE(pcl->compressed_pages[nr]); + page = READ_ONCE(pcl->compressed_bvecs[nr].page); oldpage = page; if (!page) @@ -1087,7 +1177,7 @@ repeat: * otherwise, it will go inplace I/O path instead. */ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { - WRITE_ONCE(pcl->compressed_pages[nr], page); + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); set_page_private(page, 0); tocache = true; goto out_tocache; @@ -1113,14 +1203,13 @@ repeat: /* the page is still in manage cache */ if (page->mapping == mc) { - WRITE_ONCE(pcl->compressed_pages[nr], page); + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - ClearPageError(page); if (!PagePrivate(page)) { /* * impossible to be !PagePrivate(page) for * the current restriction as well if - * the page is already in compressed_pages[]. + * the page is already in compressed_bvecs[]. */ DBG_BUGON(!justfound); @@ -1149,7 +1238,8 @@ repeat: put_page(page); out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { + if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, + oldpage, page)) { erofs_pagepool_add(pagepool, page); cond_resched(); goto repeat; @@ -1186,6 +1276,7 @@ fg_out: q = fgq; init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); + q->eio = false; } q->sb = sb; q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; @@ -1246,26 +1337,25 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) DBG_BUGON(PageUptodate(page)); DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (err) - SetPageError(page); - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { if (!err) SetPageUptodate(page); unlock_page(page); } } + if (err) + q->eio = true; z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); bio_put(bio); } -static void z_erofs_submit_queue(struct super_block *sb, - struct z_erofs_decompress_frontend *f, +static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct page **pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) { - struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct super_block *sb = f->inode->i_sb; + struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; @@ -1317,7 +1407,7 @@ static void z_erofs_submit_queue(struct super_block *sb, struct page *page; page = pickup_page_for_submission(pcl, i++, pagepool, - MNGD_MAPPING(sbi)); + mc); if (!page) continue; @@ -1369,15 +1459,14 @@ submit_bio_retry: z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); } -static void z_erofs_runqueue(struct super_block *sb, - struct z_erofs_decompress_frontend *f, +static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, struct page **pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(sb, f, pagepool, io, &force_fg); + z_erofs_submit_queue(f, pagepool, io, &force_fg); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); @@ -1475,7 +1564,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) (void)z_erofs_collector_end(&f); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(inode->i_sb, &f, &pagepool, + z_erofs_runqueue(&f, &pagepool, z_erofs_get_sync_decompress_policy(sbi, 0)); if (err) @@ -1524,7 +1613,7 @@ static void z_erofs_readahead(struct readahead_control *rac) z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); (void)z_erofs_collector_end(&f); - z_erofs_runqueue(inode->i_sb, &f, &pagepool, + z_erofs_runqueue(&f, &pagepool, z_erofs_get_sync_decompress_policy(sbi, nr_pages)); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&pagepool); diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 58053bb5066f..e7f04c4fbb81 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -7,13 +7,10 @@ #define __EROFS_FS_ZDATA_H #include "internal.h" -#include "zpvec.h" +#include "tagptr.h" #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) -#define Z_EROFS_NR_INLINE_PAGEVECS 3 - -#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 -#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 +#define Z_EROFS_INLINE_BVECS 2 /* * let's leave a type here in case of introducing @@ -21,6 +18,21 @@ */ typedef void *z_erofs_next_pcluster_t; +struct z_erofs_bvec { + struct page *page; + int offset; + unsigned int end; +}; + +#define __Z_EROFS_BVSET(name, total) \ +struct name { \ + /* point to the next page which contains the following bvecs */ \ + struct page *nextpage; \ + struct z_erofs_bvec bvec[total]; \ +} +__Z_EROFS_BVSET(z_erofs_bvset,); +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); + /* * Structure fields follow one of the following exclusion rules. * @@ -38,24 +50,21 @@ struct z_erofs_pcluster { /* A: point to next chained pcluster or TAILs */ z_erofs_next_pcluster_t next; - /* A: lower limit of decompressed length and if full length or not */ + /* L: the maximum decompression size of this round */ unsigned int length; + /* L: total number of bvecs */ + unsigned int vcnt; + /* I: page offset of start position of decompression */ unsigned short pageofs_out; /* I: page offset of inline compressed data */ unsigned short pageofs_in; - /* L: maximum relative page index in pagevec[] */ - unsigned short nr_pages; - - /* L: total number of pages in pagevec[] */ - unsigned int vcnt; - union { - /* L: inline a certain number of pagevecs for bootstrap */ - erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS]; + /* L: inline a certain number of bvec for bootstrap */ + struct z_erofs_bvset_inline bvset; /* I: can be used to free the pcluster by RCU. */ struct rcu_head rcu; @@ -72,8 +81,14 @@ struct z_erofs_pcluster { /* I: compression algorithm format */ unsigned char algorithmformat; - /* A: compressed pages (can be cached or inplaced pages) */ - struct page *compressed_pages[]; + /* L: whether partial decompression or not */ + bool partial; + + /* L: indicate several pageofs_outs or not */ + bool multibases; + + /* A: compressed bvecs (can be cached or inplaced pages) */ + struct z_erofs_bvec compressed_bvecs[]; }; /* let's avoid the valid 32-bit kernel addresses */ @@ -94,6 +109,8 @@ struct z_erofs_decompressqueue { struct completion done; struct work_struct work; } u; + + bool eio; }; static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) @@ -108,38 +125,17 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) return pcl->pclusterpages; } -#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 -#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) -#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) - /* - * waiters (aka. ongoing_packs): # to unlock the page - * sub-index: 0 - for partial page, >= 1 full page sub-index + * bit 31: I/O error occurred on this page + * bit 0 - 30: remaining parts to complete this page */ -typedef atomic_t z_erofs_onlinepage_t; - -/* type punning */ -union z_erofs_onlinepage_converter { - z_erofs_onlinepage_t *o; - unsigned long *v; -}; - -static inline unsigned int z_erofs_onlinepage_index(struct page *page) -{ - union z_erofs_onlinepage_converter u; - - DBG_BUGON(!PagePrivate(page)); - u.v = &page_private(page); - - return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; -} +#define Z_EROFS_PAGE_EIO (1 << 31) static inline void z_erofs_onlinepage_init(struct page *page) { union { - z_erofs_onlinepage_t o; + atomic_t o; unsigned long v; - /* keep from being unlocked in advance */ } u = { .o = ATOMIC_INIT(1) }; set_page_private(page, u.v); @@ -147,49 +143,36 @@ static inline void z_erofs_onlinepage_init(struct page *page) SetPagePrivate(page); } -static inline void z_erofs_onlinepage_fixup(struct page *page, - uintptr_t index, bool down) +static inline void z_erofs_onlinepage_split(struct page *page) { - union z_erofs_onlinepage_converter u = { .v = &page_private(page) }; - int orig, orig_index, val; - -repeat: - orig = atomic_read(u.o); - orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; - if (orig_index) { - if (!index) - return; + atomic_inc((atomic_t *)&page->private); +} - DBG_BUGON(orig_index != index); - } +static inline void z_erofs_page_mark_eio(struct page *page) +{ + int orig; - val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | - ((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down); - if (atomic_cmpxchg(u.o, orig, val) != orig) - goto repeat; + do { + orig = atomic_read((atomic_t *)&page->private); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, + orig | Z_EROFS_PAGE_EIO) != orig); } static inline void z_erofs_onlinepage_endio(struct page *page) { - union z_erofs_onlinepage_converter u; unsigned int v; DBG_BUGON(!PagePrivate(page)); - u.v = &page_private(page); - - v = atomic_dec_return(u.o); - if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) { + v = atomic_dec_return((atomic_t *)&page->private); + if (!(v & ~Z_EROFS_PAGE_EIO)) { set_page_private(page, 0); ClearPagePrivate(page); - if (!PageError(page)) + if (!(v & Z_EROFS_PAGE_EIO)) SetPageUptodate(page); unlock_page(page); } - erofs_dbg("%s, page %p value %x", __func__, page, atomic_read(u.o)); } -#define Z_EROFS_VMAP_ONSTACK_PAGES \ - min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U) -#define Z_EROFS_VMAP_GLOBAL_PAGES 2048 +#define Z_EROFS_ONSTACK_PAGES 32 #endif diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h deleted file mode 100644 index b05464f4a808..000000000000 --- a/fs/erofs/zpvec.h +++ /dev/null @@ -1,159 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2018 HUAWEI, Inc. - * https://www.huawei.com/ - */ -#ifndef __EROFS_FS_ZPVEC_H -#define __EROFS_FS_ZPVEC_H - -#include "tagptr.h" - -/* page type in pagevec for decompress subsystem */ -enum z_erofs_page_type { - /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ - Z_EROFS_PAGE_TYPE_EXCLUSIVE, - - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, - - Z_EROFS_VLE_PAGE_TYPE_HEAD, - Z_EROFS_VLE_PAGE_TYPE_MAX -}; - -extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0") - __bad_page_type_exclusive(void); - -/* pagevec tagged pointer */ -typedef tagptr2_t erofs_vtptr_t; - -/* pagevec collector */ -struct z_erofs_pagevec_ctor { - struct page *curr, *next; - erofs_vtptr_t *pages; - - unsigned int nr, index; -}; - -static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor, - bool atomic) -{ - if (!ctor->curr) - return; - - if (atomic) - kunmap_atomic(ctor->pages); - else - kunmap(ctor->curr); -} - -static inline struct page * -z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor, - unsigned int nr) -{ - unsigned int index; - - /* keep away from occupied pages */ - if (ctor->next) - return ctor->next; - - for (index = 0; index < nr; ++index) { - const erofs_vtptr_t t = ctor->pages[index]; - const unsigned int tags = tagptr_unfold_tags(t); - - if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE) - return tagptr_unfold_ptr(t); - } - DBG_BUGON(nr >= ctor->nr); - return NULL; -} - -static inline void -z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor, - bool atomic) -{ - struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr); - - z_erofs_pagevec_ctor_exit(ctor, atomic); - - ctor->curr = next; - ctor->next = NULL; - ctor->pages = atomic ? - kmap_atomic(ctor->curr) : kmap(ctor->curr); - - ctor->nr = PAGE_SIZE / sizeof(struct page *); - ctor->index = 0; -} - -static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, - unsigned int nr, - erofs_vtptr_t *pages, - unsigned int i) -{ - ctor->nr = nr; - ctor->curr = ctor->next = NULL; - ctor->pages = pages; - - if (i >= nr) { - i -= nr; - z_erofs_pagevec_ctor_pagedown(ctor, false); - while (i > ctor->nr) { - i -= ctor->nr; - z_erofs_pagevec_ctor_pagedown(ctor, false); - } - } - ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i); - ctor->index = i; -} - -static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, - struct page *page, - enum z_erofs_page_type type, - bool pvec_safereuse) -{ - if (!ctor->next) { - /* some pages cannot be reused as pvec safely without I/O */ - if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse) - type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED; - - if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE && - ctor->index + 1 == ctor->nr) - return false; - } - - if (ctor->index >= ctor->nr) - z_erofs_pagevec_ctor_pagedown(ctor, false); - - /* exclusive page type must be 0 */ - if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL) - __bad_page_type_exclusive(); - - /* should remind that collector->next never equal to 1, 2 */ - if (type == (uintptr_t)ctor->next) { - ctor->next = page; - } - ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); - return true; -} - -static inline struct page * -z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, - enum z_erofs_page_type *type) -{ - erofs_vtptr_t t; - - if (ctor->index >= ctor->nr) { - DBG_BUGON(!ctor->next); - z_erofs_pagevec_ctor_pagedown(ctor, true); - } - - t = ctor->pages[ctor->index]; - - *type = tagptr_unfold_tags(t); - - /* should remind that collector->next never equal to 1, 2 */ - if (*type == (uintptr_t)ctor->next) - ctor->next = tagptr_unfold_ptr(t); - - ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0); - return tagptr_unfold_ptr(t); -} -#endif diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e6b932219803..7a192e4e7fa9 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1679,14 +1679,14 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; - if (is_quota_modification(inode, iattr)) { + if (is_quota_modification(mnt_userns, inode, iattr)) { error = dquot_initialize(inode); if (error) return error; } - if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || - (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { - error = dquot_transfer(inode, iattr); + if (i_uid_needs_update(mnt_userns, iattr, inode) || + i_gid_needs_update(mnt_userns, iattr, inode)) { + error = dquot_transfer(mnt_userns, inode, iattr); if (error) return error; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f6a19f6d9f6d..6f475d2e3b18 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1059,9 +1059,10 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_frags_per_group); goto failed_mount; } - if (sbi->s_inodes_per_group > sb->s_blocksize * 8) { + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || + sbi->s_inodes_per_group > sb->s_blocksize * 8) { ext2_msg(sb, KERN_ERR, - "error: #inodes per group too big: %lu", + "error: invalid #inodes per group: %lu", sbi->s_inodes_per_group); goto failed_mount; } @@ -1071,6 +1072,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - le32_to_cpu(es->s_first_data_block) - 1) / EXT2_BLOCKS_PER_GROUP(sb)) + 1; + if ((u64)sbi->s_groups_count * sbi->s_inodes_per_group != + le32_to_cpu(es->s_inodes_count)) { + ext2_msg(sb, KERN_ERR, "error: invalid #inodes: %u vs computed %llu", + le32_to_cpu(es->s_inodes_count), + (u64)sbi->s_groups_count * sbi->s_inodes_per_group); + goto failed_mount; + } db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / EXT2_DESC_PER_BLOCK(sb); sbi->s_group_desc = kmalloc_array(db_count, @@ -1490,8 +1498,7 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, len = i_size-off; toread = len; while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? - sb->s_blocksize - offset : toread; + tocopy = min_t(size_t, sb->s_blocksize - offset, toread); tmp_bh.b_state = 0; tmp_bh.b_size = sb->s_blocksize; @@ -1529,8 +1536,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, struct buffer_head *bh; while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; + tocopy = min_t(size_t, sb->s_blocksize - offset, towrite); tmp_bh.b_state = 0; tmp_bh.b_size = sb->s_blocksize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 84c0eb55071d..3dcc1dd1f179 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5350,14 +5350,14 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } - if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || - (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { + if (i_uid_needs_update(mnt_userns, attr, inode) || + i_gid_needs_update(mnt_userns, attr, inode)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, @@ -5374,7 +5374,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * counts xattr inode references. */ down_read(&EXT4_I(inode)->xattr_sem); - error = dquot_transfer(inode, attr); + error = dquot_transfer(mnt_userns, inode, attr); up_read(&EXT4_I(inode)->xattr_sem); if (error) { @@ -5383,10 +5383,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Update corresponding info in inode so that everything is in * one transaction */ - if (attr->ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (attr->ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bd14cef1b08f..d66e37d80a2d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -861,10 +861,8 @@ static void __setattr_copy(struct user_namespace *mnt_userns, { unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -917,17 +915,15 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (err) return err; - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { err = f2fs_dquot_initialize(inode); if (err) return err; } - if ((attr->ia_valid & ATTR_UID && - !uid_eq(attr->ia_uid, inode->i_uid)) || - (attr->ia_valid & ATTR_GID && - !gid_eq(attr->ia_gid, inode->i_gid))) { + if (i_uid_needs_update(mnt_userns, attr, inode) || + i_gid_needs_update(mnt_userns, attr, inode)) { f2fs_lock_op(F2FS_I_SB(inode)); - err = dquot_transfer(inode, attr); + err = dquot_transfer(mnt_userns, inode, attr); if (err) { set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); @@ -938,10 +934,8 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * update uid/gid under lock_op(), so that dquot and inode can * be updated atomically. */ - if (attr->ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (attr->ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); f2fs_mark_inode_dirty_sync(inode, true); f2fs_unlock_op(F2FS_I_SB(inode)); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 3cb7f8a43b4d..dcd0a1e35095 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -255,18 +255,18 @@ static int recover_quota_data(struct inode *inode, struct page *page) memset(&attr, 0, sizeof(attr)); - attr.ia_uid = make_kuid(inode->i_sb->s_user_ns, i_uid); - attr.ia_gid = make_kgid(inode->i_sb->s_user_ns, i_gid); + attr.ia_vfsuid = VFSUIDT_INIT(make_kuid(inode->i_sb->s_user_ns, i_uid)); + attr.ia_vfsgid = VFSGIDT_INIT(make_kgid(inode->i_sb->s_user_ns, i_gid)); - if (!uid_eq(attr.ia_uid, inode->i_uid)) + if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&init_user_ns, inode))) attr.ia_valid |= ATTR_UID; - if (!gid_eq(attr.ia_gid, inode->i_gid)) + if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&init_user_ns, inode))) attr.ia_valid |= ATTR_GID; if (!attr.ia_valid) return 0; - err = dquot_transfer(inode, &attr); + err = dquot_transfer(&init_user_ns, inode, &attr); if (err) set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); return err; diff --git a/fs/fat/file.c b/fs/fat/file.c index 3dae3ed60f3a..3e4eb3467cb4 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -90,7 +90,8 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) * out the RO attribute for checking by the security * module, just because it maps to a file mode. */ - err = security_inode_setattr(file->f_path.dentry, &ia); + err = security_inode_setattr(file_mnt_user_ns(file), + file->f_path.dentry, &ia); if (err) goto out_unlock_inode; @@ -516,9 +517,11 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } if (((attr->ia_valid & ATTR_UID) && - (!uid_eq(attr->ia_uid, sbi->options.fs_uid))) || + (!uid_eq(from_vfsuid(mnt_userns, i_user_ns(inode), attr->ia_vfsuid), + sbi->options.fs_uid))) || ((attr->ia_valid & ATTR_GID) && - (!gid_eq(attr->ia_gid, sbi->options.fs_gid))) || + (!gid_eq(from_vfsgid(mnt_userns, i_user_ns(inode), attr->ia_vfsgid), + sbi->options.fs_gid))) || ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~FAT_VALID_MODE))) error = -EPERM; diff --git a/fs/io_uring.c b/fs/io_uring.c index a01ea49f3017..e8e769be9ed0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1738,6 +1738,14 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) return; /* + * READV uses fields in `struct io_rw` (len/addr) to stash the selected + * buffer data. However if that buffer is recycled the original request + * data stored in addr is lost. Therefore forbid recycling for now. + */ + if (req->opcode == IORING_OP_READV) + return; + + /* * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear * the flag and hence ensure that bl->head doesn't get incremented. * If the tail has already been incremented, hang on to it. @@ -12931,7 +12939,7 @@ static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_ring *br; struct io_uring_buf_reg reg; - struct io_buffer_list *bl; + struct io_buffer_list *bl, *free_bl = NULL; struct page **pages; int nr_pages; @@ -12963,7 +12971,7 @@ static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) return -EEXIST; } else { - bl = kzalloc(sizeof(*bl), GFP_KERNEL); + free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); if (!bl) return -ENOMEM; } @@ -12972,7 +12980,7 @@ static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) struct_size(br, bufs, reg.ring_entries), &nr_pages); if (IS_ERR(pages)) { - kfree(bl); + kfree(free_bl); return PTR_ERR(pages); } diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 1d732fd223d4..332dc9ac47a9 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -95,14 +95,14 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (rc) return rc; - if (is_quota_modification(inode, iattr)) { + if (is_quota_modification(mnt_userns, inode, iattr)) { rc = dquot_initialize(inode); if (rc) return rc; } if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { - rc = dquot_transfer(inode, iattr); + rc = dquot_transfer(mnt_userns, inode, iattr); if (rc) return rc; } diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 05efcdf7a4a7..7c849024999f 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -963,7 +963,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, */ int ksmbd_vfs_setxattr(struct user_namespace *user_ns, struct dentry *dentry, const char *attr_name, - const void *attr_value, size_t attr_size, int flags) + void *attr_value, size_t attr_size, int flags) { int err; diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 8c37aaf936ab..70da4c0ba7ad 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -109,7 +109,7 @@ ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, int attr_name_len); int ksmbd_vfs_setxattr(struct user_namespace *user_ns, struct dentry *dentry, const char *attr_name, - const void *attr_value, size_t attr_size, int flags); + void *attr_value, size_t attr_size, int flags); int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns, diff --git a/fs/locks.c b/fs/locks.c index ca28e0e50e56..c266cfdc3291 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -425,21 +425,9 @@ static inline int flock_translate_cmd(int cmd) { } /* Fill in a file_lock structure with an appropriate FLOCK lock. */ -static struct file_lock * -flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl) +static void flock_make_lock(struct file *filp, struct file_lock *fl, int type) { - int type = flock_translate_cmd(cmd); - - if (type < 0) - return ERR_PTR(type); - - if (fl == NULL) { - fl = locks_alloc_lock(); - if (fl == NULL) - return ERR_PTR(-ENOMEM); - } else { - locks_init_lock(fl); - } + locks_init_lock(fl); fl->fl_file = filp; fl->fl_owner = filp; @@ -447,8 +435,6 @@ flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl) fl->fl_flags = FL_FLOCK; fl->fl_type = type; fl->fl_end = OFFSET_MAX; - - return fl; } static int assign_type(struct file_lock *fl, long type) @@ -2097,21 +2083,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait); */ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) { - struct fd f = fdget(fd); - struct file_lock *lock; - int can_sleep, unlock; - int error; - - error = -EBADF; - if (!f.file) - goto out; - - can_sleep = !(cmd & LOCK_NB); - cmd &= ~LOCK_NB; - unlock = (cmd == LOCK_UN); - - if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) - goto out_putf; + int can_sleep, error, type; + struct file_lock fl; + struct fd f; /* * LOCK_MAND locks were broken for a long time in that they never @@ -2123,36 +2097,41 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ if (cmd & LOCK_MAND) { pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n"); - error = 0; - goto out_putf; + return 0; } - lock = flock_make_lock(f.file, cmd, NULL); - if (IS_ERR(lock)) { - error = PTR_ERR(lock); + type = flock_translate_cmd(cmd & ~LOCK_NB); + if (type < 0) + return type; + + error = -EBADF; + f = fdget(fd); + if (!f.file) + return error; + + if (type != F_UNLCK && !(f.file->f_mode & (FMODE_READ | FMODE_WRITE))) goto out_putf; - } - if (can_sleep) - lock->fl_flags |= FL_SLEEP; + flock_make_lock(f.file, &fl, type); - error = security_file_lock(f.file, lock->fl_type); + error = security_file_lock(f.file, fl.fl_type); if (error) - goto out_free; + goto out_putf; + + can_sleep = !(cmd & LOCK_NB); + if (can_sleep) + fl.fl_flags |= FL_SLEEP; if (f.file->f_op->flock) error = f.file->f_op->flock(f.file, - (can_sleep) ? F_SETLKW : F_SETLK, - lock); + (can_sleep) ? F_SETLKW : F_SETLK, + &fl); else - error = locks_lock_file_wait(f.file, lock); - - out_free: - locks_free_lock(lock); + error = locks_lock_file_wait(f.file, &fl); out_putf: fdput(f); - out: + return error; } @@ -2614,7 +2593,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) if (list_empty(&flctx->flc_flock)) return; - flock_make_lock(filp, LOCK_UN, &fl); + flock_make_lock(filp, &fl, F_UNLCK); fl.fl_flags |= FL_CLOSE; if (filp->f_op->flock) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 4f897e109547..cd7d09a569ff 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -295,12 +295,13 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, const void *data, int data_type, struct inode *dir) { - __u32 marks_mask = 0, marks_ignored_mask = 0; + __u32 marks_mask = 0, marks_ignore_mask = 0; __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS | FANOTIFY_EVENT_FLAGS; const struct path *path = fsnotify_data_path(data, data_type); unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); struct fsnotify_mark *mark; + bool ondir = event_mask & FAN_ONDIR; int type; pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n", @@ -315,19 +316,21 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, return 0; } else if (!(fid_mode & FAN_REPORT_FID)) { /* Do we have a directory inode to report? */ - if (!dir && !(event_mask & FS_ISDIR)) + if (!dir && !ondir) return 0; } fsnotify_foreach_iter_mark_type(iter_info, mark, type) { - /* Apply ignore mask regardless of mark's ISDIR flag */ - marks_ignored_mask |= mark->ignored_mask; + /* + * Apply ignore mask depending on event flags in ignore mask. + */ + marks_ignore_mask |= + fsnotify_effective_ignore_mask(mark, ondir, type); /* - * If the event is on dir and this mark doesn't care about - * events on dir, don't send it! + * Send the event depending on event flags in mark mask. */ - if (event_mask & FS_ISDIR && !(mark->mask & FS_ISDIR)) + if (!fsnotify_mask_applicable(mark->mask, ondir, type)) continue; marks_mask |= mark->mask; @@ -336,7 +339,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, *match_mask |= 1U << type; } - test_mask = event_mask & marks_mask & ~marks_ignored_mask; + test_mask = event_mask & marks_mask & ~marks_ignore_mask; /* * For dirent modification events (create/delete/move) that do not carry diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 80e0ec95b113..1d9f11255c64 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -499,6 +499,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) mflags |= FAN_MARK_IGNORED_SURV_MODIFY; if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF) mflags |= FAN_MARK_EVICTABLE; + if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) + mflags |= FAN_MARK_IGNORE; return mflags; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index b08ce0d821a7..f0e49a406ffa 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1009,10 +1009,10 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, mask &= ~umask; spin_lock(&fsn_mark->lock); oldmask = fsnotify_calc_mask(fsn_mark); - if (!(flags & FAN_MARK_IGNORED_MASK)) { + if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) { fsn_mark->mask &= ~mask; } else { - fsn_mark->ignored_mask &= ~mask; + fsn_mark->ignore_mask &= ~mask; } newmask = fsnotify_calc_mask(fsn_mark); /* @@ -1021,7 +1021,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, * changes to the mask. * Destroy mark when only umask bits remain. */ - *destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask); + *destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask); spin_unlock(&fsn_mark->lock); return oldmask & ~newmask; @@ -1085,15 +1085,24 @@ static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark, unsigned int fan_flags) { bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE); + unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS; bool recalc = false; /* + * When using FAN_MARK_IGNORE for the first time, mark starts using + * independent event flags in ignore mask. After that, trying to + * update the ignore mask with the old FAN_MARK_IGNORED_MASK API + * will result in EEXIST error. + */ + if (ignore == FAN_MARK_IGNORE) + fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS; + + /* * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to * the removal of the FS_MODIFY bit in calculated mask if it was set - * because of an ignored mask that is now going to survive FS_MODIFY. + * because of an ignore mask that is now going to survive FS_MODIFY. */ - if ((fan_flags & FAN_MARK_IGNORED_MASK) && - (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && + if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) { fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; if (!(fsn_mark->mask & FS_MODIFY)) @@ -1120,10 +1129,10 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, bool recalc; spin_lock(&fsn_mark->lock); - if (!(fan_flags & FAN_MARK_IGNORED_MASK)) + if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS)) fsn_mark->mask |= mask; else - fsn_mark->ignored_mask |= mask; + fsn_mark->ignore_mask |= mask; recalc = fsnotify_calc_mask(fsn_mark) & ~fsnotify_conn_mask(fsn_mark->connector); @@ -1187,6 +1196,37 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group) sizeof(struct fanotify_error_event)); } +static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, + unsigned int fan_flags) +{ + /* + * Non evictable mark cannot be downgraded to evictable mark. + */ + if (fan_flags & FAN_MARK_EVICTABLE && + !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) + return -EEXIST; + + /* + * New ignore mask semantics cannot be downgraded to old semantics. + */ + if (fan_flags & FAN_MARK_IGNORED_MASK && + fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) + return -EEXIST; + + /* + * An ignore mask that survives modify could never be downgraded to not + * survive modify. With new FAN_MARK_IGNORE semantics we make that rule + * explicit and return an error when trying to update the ignore mask + * without the original FAN_MARK_IGNORED_SURV_MODIFY value. + */ + if (fan_flags & FAN_MARK_IGNORE && + !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && + fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) + return -EEXIST; + + return 0; +} + static int fanotify_add_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int obj_type, __u32 mask, unsigned int fan_flags, @@ -1208,19 +1248,18 @@ static int fanotify_add_mark(struct fsnotify_group *group, } /* - * Non evictable mark cannot be downgraded to evictable mark. + * Check if requested mark flags conflict with an existing mark flags. */ - if (fan_flags & FAN_MARK_EVICTABLE && - !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) { - ret = -EEXIST; + ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags); + if (ret) goto out; - } /* * Error events are pre-allocated per group, only if strictly * needed (i.e. FAN_FS_ERROR was requested). */ - if (!(fan_flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) { + if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) && + (mask & FAN_FS_ERROR)) { ret = fanotify_group_init_error_pool(group); if (ret) goto out; @@ -1261,10 +1300,10 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, /* * If some other task has this inode open for write we should not add - * an ignored mark, unless that ignored mark is supposed to survive + * an ignore mask, unless that ignore mask is supposed to survive * modification changes anyway. */ - if ((flags & FAN_MARK_IGNORED_MASK) && + if ((flags & FANOTIFY_MARK_IGNORE_BITS) && !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && inode_is_open_for_write(inode)) return 0; @@ -1520,7 +1559,8 @@ static int fanotify_events_supported(struct fsnotify_group *group, unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; /* Strict validation of events in non-dir inode mask with v5.17+ APIs */ bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) || - (mask & FAN_RENAME); + (mask & FAN_RENAME) || + (flags & FAN_MARK_IGNORE); /* * Some filesystems such as 'proc' acquire unusual locks when opening @@ -1557,7 +1597,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, __kernel_fsid_t __fsid, *fsid = NULL; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; - bool ignored = flags & FAN_MARK_IGNORED_MASK; + unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; + unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; unsigned int obj_type, fid_mode; u32 umask = 0; int ret; @@ -1586,7 +1627,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, return -EINVAL; } - switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { + switch (mark_cmd) { case FAN_MARK_ADD: case FAN_MARK_REMOVE: if (!mask) @@ -1606,9 +1647,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & ~valid_mask) return -EINVAL; - /* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */ - if (ignored) + + /* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */ + if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK)) + return -EINVAL; + + /* + * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with + * FAN_MARK_IGNORED_MASK. + */ + if (ignore == FAN_MARK_IGNORED_MASK) { mask &= ~FANOTIFY_EVENT_FLAGS; + umask = FANOTIFY_EVENT_FLAGS; + } f = fdget(fanotify_fd); if (unlikely(!f.file)) @@ -1672,7 +1723,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) goto fput_and_out; - if (flags & FAN_MARK_FLUSH) { + if (mark_cmd == FAN_MARK_FLUSH) { ret = 0; if (mark_type == FAN_MARK_MOUNT) fsnotify_clear_vfsmount_marks_by_group(group); @@ -1688,7 +1739,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (ret) goto fput_and_out; - if (flags & FAN_MARK_ADD) { + if (mark_cmd == FAN_MARK_ADD) { ret = fanotify_events_supported(group, &path, mask, flags); if (ret) goto path_put_and_out; @@ -1712,6 +1763,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, else mnt = path.mnt; + ret = mnt ? -EINVAL : -EISDIR; + /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ + if (mark_cmd == FAN_MARK_ADD && ignore == FAN_MARK_IGNORE && + (mnt || S_ISDIR(inode->i_mode)) && + !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) + goto path_put_and_out; + /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ if (mnt || !S_ISDIR(inode->i_mode)) { mask &= ~FAN_EVENT_ON_CHILD; @@ -1721,12 +1779,12 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, * events with parent/name info for non-directory. */ if ((fid_mode & FAN_REPORT_DIR_FID) && - (flags & FAN_MARK_ADD) && !ignored) + (flags & FAN_MARK_ADD) && !ignore) mask |= FAN_EVENT_ON_CHILD; } /* create/update an inode mark */ - switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { + switch (mark_cmd) { case FAN_MARK_ADD: if (mark_type == FAN_MARK_MOUNT) ret = fanotify_add_vfsmount_mark(group, mnt, mask, @@ -1804,7 +1862,7 @@ static int __init fanotify_user_setup(void) BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 10); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 59fb40abe33d..55081ae3a6ec 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -113,7 +113,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) return; seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", inode->i_ino, inode->i_sb->s_dev, - mflags, mark->mask, mark->ignored_mask); + mflags, mark->mask, mark->ignore_mask); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); @@ -121,12 +121,12 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) struct mount *mnt = fsnotify_conn_mount(mark->connector); seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", - mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); + mnt->mnt_id, mflags, mark->mask, mark->ignore_mask); } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_SB) { struct super_block *sb = fsnotify_conn_sb(mark->connector); seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n", - sb->s_dev, mflags, mark->mask, mark->ignored_mask); + sb->s_dev, mflags, mark->mask, mark->ignore_mask); } } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 0b3e74935cb4..7974e91ffe13 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -100,7 +100,7 @@ void fsnotify_sb_delete(struct super_block *sb) * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. If we care about any event * on a child we run all of our children and set a dentry flag saying that the - * parent cares. Thus when an event happens on a child it can quickly tell if + * parent cares. Thus when an event happens on a child it can quickly tell * if there is a need to find a parent and send the event to the parent. */ void __fsnotify_update_child_dentry_flags(struct inode *inode) @@ -324,7 +324,8 @@ static int send_to_group(__u32 mask, const void *data, int data_type, struct fsnotify_group *group = NULL; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); __u32 marks_mask = 0; - __u32 marks_ignored_mask = 0; + __u32 marks_ignore_mask = 0; + bool is_dir = mask & FS_ISDIR; struct fsnotify_mark *mark; int type; @@ -336,7 +337,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type, fsnotify_foreach_iter_mark_type(iter_info, mark, type) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) - mark->ignored_mask = 0; + mark->ignore_mask = 0; } } @@ -344,14 +345,15 @@ static int send_to_group(__u32 mask, const void *data, int data_type, fsnotify_foreach_iter_mark_type(iter_info, mark, type) { group = mark->group; marks_mask |= mark->mask; - marks_ignored_mask |= mark->ignored_mask; + marks_ignore_mask |= + fsnotify_effective_ignore_mask(mark, is_dir, type); } - pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignored_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", - __func__, group, mask, marks_mask, marks_ignored_mask, + pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", + __func__, group, mask, marks_mask, marks_ignore_mask, data, data_type, dir, cookie); - if (!(test_mask & marks_mask & ~marks_ignored_mask)) + if (!(test_mask & marks_mask & ~marks_ignore_mask)) return 0; if (group->ops->handle_event) { @@ -423,7 +425,8 @@ static bool fsnotify_iter_select_report_types( * But is *this mark* watching children? */ if (type == FSNOTIFY_ITER_TYPE_PARENT && - !(mark->mask & FS_EVENT_ON_CHILD)) + !(mark->mask & FS_EVENT_ON_CHILD) && + !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD)) continue; fsnotify_iter_set_report_type(iter_info, type); @@ -532,8 +535,8 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, /* - * If this is a modify event we may need to clear some ignored masks. - * In that case, the object with ignored masks will have the FS_MODIFY + * If this is a modify event we may need to clear some ignore masks. + * In that case, the object with ignore masks will have the FS_MODIFY * event in its mask. * Otherwise, return if none of the marks care about this type of event. */ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ed42a189faa2..1c4bfdab008d 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -136,7 +136,7 @@ static inline u32 inotify_mask_to_arg(__u32 mask) IN_Q_OVERFLOW); } -/* intofiy userspace file descriptor functions */ +/* inotify userspace file descriptor functions */ static __poll_t inotify_poll(struct file *file, poll_table *wait) { struct fsnotify_group *group = file->private_data; diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 4de597a83b88..52615e6090e1 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -592,8 +592,12 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, a = (ATTR_RECORD*)((u8*)ctx->attr + le32_to_cpu(ctx->attr->length)); for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + - le32_to_cpu(ctx->mrec->bytes_allocated)) + u8 *mrec_end = (u8 *)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated); + u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) + + a->name_length * sizeof(ntfschar); + if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end || + name_end > mrec_end) break; ctx->attr = a; if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7497cd592258..9c67edd215d5 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1146,7 +1146,7 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (status) return status; - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { status = dquot_initialize(inode); if (status) return status; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 337527571461..740b64238312 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -277,7 +277,6 @@ enum ocfs2_mount_options OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ - OCFS2_MOUNT_NOCLUSTER = 1 << 18, /* No cluster aware filesystem mount */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -673,8 +672,7 @@ static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) static inline int ocfs2_mount_local(struct ocfs2_super *osb) { - return ((osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT) - || (osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER)); + return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); } static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 0b0ae3ebb0cf..da7718cef735 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -252,16 +252,14 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, int i, ret = -ENOSPC; if ((preferred >= 0) && (preferred < si->si_num_slots)) { - if (!si->si_slots[preferred].sl_valid || - !si->si_slots[preferred].sl_node_num) { + if (!si->si_slots[preferred].sl_valid) { ret = preferred; goto out; } } for(i = 0; i < si->si_num_slots; i++) { - if (!si->si_slots[i].sl_valid || - !si->si_slots[i].sl_node_num) { + if (!si->si_slots[i].sl_valid) { ret = i; break; } @@ -456,30 +454,24 @@ int ocfs2_find_slot(struct ocfs2_super *osb) spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); - if (ocfs2_mount_local(osb)) - /* use slot 0 directly in local mode */ - slot = 0; - else { - /* search for ourselves first and take the slot if it already - * exists. Perhaps we need to mark this in a variable for our - * own journal recovery? Possibly not, though we certainly - * need to warn to the user */ - slot = __ocfs2_node_num_to_slot(si, osb->node_num); + /* search for ourselves first and take the slot if it already + * exists. Perhaps we need to mark this in a variable for our + * own journal recovery? Possibly not, though we certainly + * need to warn to the user */ + slot = __ocfs2_node_num_to_slot(si, osb->node_num); + if (slot < 0) { + /* if no slot yet, then just take 1st available + * one. */ + slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); if (slot < 0) { - /* if no slot yet, then just take 1st available - * one. */ - slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); - if (slot < 0) { - spin_unlock(&osb->osb_lock); - mlog(ML_ERROR, "no free slots available!\n"); - status = -EINVAL; - goto bail; - } - } else - printk(KERN_INFO "ocfs2: Slot %d on device (%s) was " - "already allocated to this node!\n", - slot, osb->dev_str); - } + spin_unlock(&osb->osb_lock); + mlog(ML_ERROR, "no free slots available!\n"); + status = -EINVAL; + goto bail; + } + } else + printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already " + "allocated to this node!\n", slot, osb->dev_str); ocfs2_set_slot(si, slot, osb->node_num); osb->slot_num = slot; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f7298816d8d9..438be028935d 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -172,7 +172,6 @@ enum { Opt_dir_resv_level, Opt_journal_async_commit, Opt_err_cont, - Opt_nocluster, Opt_err, }; @@ -206,7 +205,6 @@ static const match_table_t tokens = { {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_err_cont, "errors=continue"}, - {Opt_nocluster, "nocluster"}, {Opt_err, NULL} }; @@ -618,13 +616,6 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto out; } - tmp = OCFS2_MOUNT_NOCLUSTER; - if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { - ret = -EINVAL; - mlog(ML_ERROR, "Cannot change nocluster option on remount\n"); - goto out; - } - tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | OCFS2_MOUNT_HB_NONE; if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { @@ -865,7 +856,6 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, } if (ocfs2_userspace_stack(osb) && - !(osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && strncmp(osb->osb_cluster_stack, mopt->cluster_stack, OCFS2_STACK_LABEL_LEN)) { mlog(ML_ERROR, @@ -1137,11 +1127,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : "ordered"); - if ((osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && - !(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)) - printk(KERN_NOTICE "ocfs2: The shared device (%s) is mounted " - "without cluster aware mode.\n", osb->dev_str); - atomic_set(&osb->vol_state, VOLUME_MOUNTED); wake_up(&osb->osb_mount_event); @@ -1452,9 +1437,6 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_journal_async_commit: mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; break; - case Opt_nocluster: - mopt->mount_opt |= OCFS2_MOUNT_NOCLUSTER; - break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1566,9 +1548,6 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) seq_printf(s, ",journal_async_commit"); - if (opts & OCFS2_MOUNT_NOCLUSTER) - seq_printf(s, ",nocluster"); - return 0; } diff --git a/fs/open.c b/fs/open.c index 1d57fbde2feb..2790aac66e58 100644 --- a/fs/open.c +++ b/fs/open.c @@ -663,6 +663,42 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) return do_fchmodat(AT_FDCWD, filename, mode); } +/** + * setattr_vfsuid - check and set ia_fsuid attribute + * @kuid: new inode owner + * + * Check whether @kuid is valid and if so generate and set vfsuid_t in + * ia_vfsuid. + * + * Return: true if @kuid is valid, false if not. + */ +static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) +{ + if (!uid_valid(kuid)) + return false; + attr->ia_valid |= ATTR_UID; + attr->ia_vfsuid = VFSUIDT_INIT(kuid); + return true; +} + +/** + * setattr_vfsgid - check and set ia_fsgid attribute + * @kgid: new inode owner + * + * Check whether @kgid is valid and if so generate and set vfsgid_t in + * ia_vfsgid. + * + * Return: true if @kgid is valid, false if not. + */ +static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) +{ + if (!gid_valid(kgid)) + return false; + attr->ia_valid |= ATTR_GID; + attr->ia_vfsgid = VFSGIDT_INIT(kgid); + return true; +} + int chown_common(const struct path *path, uid_t user, gid_t group) { struct user_namespace *mnt_userns, *fs_userns; @@ -678,28 +714,22 @@ int chown_common(const struct path *path, uid_t user, gid_t group) mnt_userns = mnt_user_ns(path->mnt); fs_userns = i_user_ns(inode); - uid = mapped_kuid_user(mnt_userns, fs_userns, uid); - gid = mapped_kgid_user(mnt_userns, fs_userns, gid); retry_deleg: newattrs.ia_valid = ATTR_CTIME; - if (user != (uid_t) -1) { - if (!uid_valid(uid)) - return -EINVAL; - newattrs.ia_valid |= ATTR_UID; - newattrs.ia_uid = uid; - } - if (group != (gid_t) -1) { - if (!gid_valid(gid)) - return -EINVAL; - newattrs.ia_valid |= ATTR_GID; - newattrs.ia_gid = gid; - } + if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) + return -EINVAL; + if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) + return -EINVAL; if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; inode_lock(inode); - error = security_path_chown(path, uid, gid); + /* Continue to send actual fs values, not the mount values. */ + error = security_path_chown( + path, + from_vfsuid(mnt_userns, fs_userns, newattrs.ia_vfsuid), + from_vfsgid(mnt_userns, fs_userns, newattrs.ia_vfsgid)); if (!error) error = notify_change(mnt_userns, path->dentry, &newattrs, &delegated_inode); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 714ec569d25b..245e2cb62708 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -331,8 +331,8 @@ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry, if (!err) { struct iattr attr = { .ia_valid = ATTR_UID | ATTR_GID, - .ia_uid = stat->uid, - .ia_gid = stat->gid, + .ia_vfsuid = VFSUIDT_INIT(stat->uid), + .ia_vfsgid = VFSGIDT_INIT(stat->gid), }; err = ovl_do_notify_change(ofs, upperdentry, &attr); } diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 492eddeb481f..7922b619f6c8 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -454,23 +454,94 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return res; } +/* + * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone + * of the POSIX ACLs retrieved from the lower layer to this function to not + * alter the POSIX ACLs for the underlying filesystem. + */ +static void ovl_idmap_posix_acl(struct user_namespace *mnt_userns, + struct posix_acl *acl) +{ + for (unsigned int i = 0; i < acl->a_count; i++) { + vfsuid_t vfsuid; + vfsgid_t vfsgid; + + struct posix_acl_entry *e = &acl->a_entries[i]; + switch (e->e_tag) { + case ACL_USER: + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, e->e_uid); + e->e_uid = vfsuid_into_kuid(vfsuid); + break; + case ACL_GROUP: + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, e->e_gid); + e->e_gid = vfsgid_into_kgid(vfsgid); + break; + } + } +} + +/* + * When the relevant layer is an idmapped mount we need to take the idmapping + * of the layer into account and translate any ACL_{GROUP,USER} values + * according to the idmapped mount. + * + * We cannot alter the ACLs returned from the relevant layer as that would + * alter the cached values filesystem wide for the lower filesystem. Instead we + * can clone the ACLs and then apply the relevant idmapping of the layer. + * + * This is obviously only relevant when idmapped layers are used. + */ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) { struct inode *realinode = ovl_inode_real(inode); - const struct cred *old_cred; - struct posix_acl *acl; + struct posix_acl *acl, *clone; + struct path realpath; if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) return NULL; - if (rcu) - return get_cached_acl_rcu(realinode, type); + /* Careful in RCU walk mode */ + ovl_i_path_real(inode, &realpath); + if (!realpath.dentry) { + WARN_ON(!rcu); + return ERR_PTR(-ECHILD); + } - old_cred = ovl_override_creds(inode->i_sb); - acl = get_acl(realinode, type); - revert_creds(old_cred); + if (rcu) { + acl = get_cached_acl_rcu(realinode, type); + } else { + const struct cred *old_cred; + + old_cred = ovl_override_creds(inode->i_sb); + acl = get_acl(realinode, type); + revert_creds(old_cred); + } + /* + * If there are no POSIX ACLs, or we encountered an error, + * or the layer isn't idmapped we don't need to do anything. + */ + if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl)) + return acl; - return acl; + /* + * We only get here if the layer is idmapped. So drop out of RCU path + * walk so we can clone the ACLs. There's no need to release the ACLs + * since get_cached_acl_rcu() doesn't take a reference on the ACLs. + */ + if (rcu) + return ERR_PTR(-ECHILD); + + clone = posix_acl_clone(acl, GFP_KERNEL); + if (!clone) + clone = ERR_PTR(-ENOMEM); + else + ovl_idmap_posix_acl(mnt_user_ns(realpath.mnt), clone); + /* + * Since we're not in RCU path walk we always need to release the + * original ACLs. + */ + posix_acl_release(acl); + return clone; } int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 4f34b7e02eee..6ec815b84d48 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -139,17 +139,7 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs, struct dentry *upperdentry, struct iattr *attr) { - struct user_namespace *upper_mnt_userns = ovl_upper_mnt_userns(ofs); - struct user_namespace *fs_userns = i_user_ns(d_inode(upperdentry)); - - if (attr->ia_valid & ATTR_UID) - attr->ia_uid = mapped_kuid_user(upper_mnt_userns, - fs_userns, attr->ia_uid); - if (attr->ia_valid & ATTR_GID) - attr->ia_gid = mapped_kgid_user(upper_mnt_userns, - fs_userns, attr->ia_gid); - - return notify_change(upper_mnt_userns, upperdentry, attr, NULL); + return notify_change(ovl_upper_mnt_userns(ofs), upperdentry, attr, NULL); } static inline int ovl_do_rmdir(struct ovl_fs *ofs, @@ -259,7 +249,8 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, value, size, flags); + int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, + (void *)value, size, flags); pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n", dentry, name, min((int)size, 48), value, size, flags, err); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 1ce5c9698393..e0a2e0468ee7 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1003,9 +1003,6 @@ ovl_posix_acl_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { - if (!IS_POSIXACL(inode)) - return -EOPNOTSUPP; - return ovl_xattr_get(dentry, inode, handler->name, buffer, size); } @@ -1021,9 +1018,6 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler, struct posix_acl *acl = NULL; int err; - if (!IS_POSIXACL(inode)) - return -EOPNOTSUPP; - /* Check that everything is OK before copy-up */ if (value) { acl = posix_acl_from_xattr(&init_user_ns, value, size); @@ -1966,20 +1960,6 @@ static struct dentry *ovl_get_root(struct super_block *sb, return root; } -static bool ovl_has_idmapped_layers(struct ovl_fs *ofs) -{ - - unsigned int i; - const struct vfsmount *mnt; - - for (i = 0; i < ofs->numlayer; i++) { - mnt = ofs->layers[i].mnt; - if (mnt && is_idmapped_mnt(mnt)) - return true; - } - return false; -} - static int ovl_fill_super(struct super_block *sb, void *data, int silent) { struct path upperpath = { }; @@ -2149,10 +2129,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers : ovl_trusted_xattr_handlers; sb->s_fs_info = ofs; - if (ovl_has_idmapped_layers(ofs)) - pr_warn("POSIX ACLs are not yet supported with idmapped layers, mounting without ACL support.\n"); - else - sb->s_flags |= SB_POSIXACL; + sb->s_flags |= SB_POSIXACL; sb->s_iflags |= SB_I_SKIP_SYNC; err = -ENOMEM; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 962d32468eb4..1d17d7b13dcd 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -199,7 +199,7 @@ EXPORT_SYMBOL(posix_acl_alloc); /* * Clone an ACL. */ -static struct posix_acl * +struct posix_acl * posix_acl_clone(const struct posix_acl *acl, gfp_t flags) { struct posix_acl *clone = NULL; @@ -213,6 +213,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) } return clone; } +EXPORT_SYMBOL_GPL(posix_acl_clone); /* * Check if an acl is valid. Returns 0 if it is, or -E... otherwise. @@ -361,8 +362,8 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, { const struct posix_acl_entry *pa, *pe, *mask_obj; int found = 0; - kuid_t uid; - kgid_t gid; + vfsuid_t vfsuid; + vfsgid_t vfsgid; want &= MAY_READ | MAY_WRITE | MAY_EXEC; @@ -370,30 +371,28 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, switch(pa->e_tag) { case ACL_USER_OBJ: /* (May have been checked already) */ - uid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(uid, current_fsuid())) + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto check_perm; break; case ACL_USER: - uid = mapped_kuid_fs(mnt_userns, - i_user_ns(inode), + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, pa->e_uid); - if (uid_eq(uid, current_fsuid())) + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto mask; break; case ACL_GROUP_OBJ: - gid = i_gid_into_mnt(mnt_userns, inode); - if (in_group_p(gid)) { + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; } break; case ACL_GROUP: - gid = mapped_kgid_fs(mnt_userns, - i_user_ns(inode), + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, pa->e_gid); - if (in_group_p(gid)) { + if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; @@ -699,7 +698,7 @@ int posix_acl_update_mode(struct user_namespace *mnt_userns, return error; if (error == 0) *acl = NULL; - if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; @@ -710,46 +709,127 @@ EXPORT_SYMBOL(posix_acl_update_mode); /* * Fix up the uids and gids in posix acl extended attributes in place. */ -static void posix_acl_fix_xattr_userns( - struct user_namespace *to, struct user_namespace *from, - struct user_namespace *mnt_userns, - void *value, size_t size, bool from_user) +static int posix_acl_fix_xattr_common(void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + int count; + + if (!header) + return -EINVAL; + if (size < sizeof(struct posix_acl_xattr_header)) + return -EINVAL; + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return -EINVAL; + + count = posix_acl_xattr_count(size); + if (count < 0) + return -EINVAL; + if (count == 0) + return -EINVAL; + + return count; +} + +void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, + void *value, size_t size) { struct posix_acl_xattr_header *header = value; struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; int count; + vfsuid_t vfsuid; + vfsgid_t vfsgid; kuid_t uid; kgid_t gid; - if (!value) + if (no_idmapping(mnt_userns, i_user_ns(inode))) return; - if (size < sizeof(struct posix_acl_xattr_header)) + + count = posix_acl_fix_xattr_common(value, size); + if (count < 0) return; - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + + for (end = entry + count; entry != end; entry++) { + switch (le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, uid); + entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, + vfsuid_into_kuid(vfsuid))); + break; + case ACL_GROUP: + gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, gid); + entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, + vfsgid_into_kgid(vfsgid))); + break; + default: + break; + } + } +} + +void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, + void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; + int count; + vfsuid_t vfsuid; + vfsgid_t vfsgid; + kuid_t uid; + kgid_t gid; + + if (no_idmapping(mnt_userns, i_user_ns(inode))) return; - count = posix_acl_xattr_count(size); + count = posix_acl_fix_xattr_common(value, size); if (count < 0) return; - if (count == 0) + + for (end = entry + count; entry != end; entry++) { + switch (le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsuid = VFSUIDT_INIT(uid); + uid = from_vfsuid(mnt_userns, &init_user_ns, vfsuid); + entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, uid)); + break; + case ACL_GROUP: + gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsgid = VFSGIDT_INIT(gid); + gid = from_vfsgid(mnt_userns, &init_user_ns, vfsgid); + entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, gid)); + break; + default: + break; + } + } +} + +static void posix_acl_fix_xattr_userns( + struct user_namespace *to, struct user_namespace *from, + void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; + int count; + kuid_t uid; + kgid_t gid; + + count = posix_acl_fix_xattr_common(value, size); + if (count < 0) return; for (end = entry + count; entry != end; entry++) { switch(le16_to_cpu(entry->e_tag)) { case ACL_USER: uid = make_kuid(from, le32_to_cpu(entry->e_id)); - if (from_user) - uid = mapped_kuid_user(mnt_userns, &init_user_ns, uid); - else - uid = mapped_kuid_fs(mnt_userns, &init_user_ns, uid); entry->e_id = cpu_to_le32(from_kuid(to, uid)); break; case ACL_GROUP: gid = make_kgid(from, le32_to_cpu(entry->e_id)); - if (from_user) - gid = mapped_kgid_user(mnt_userns, &init_user_ns, gid); - else - gid = mapped_kgid_fs(mnt_userns, &init_user_ns, gid); entry->e_id = cpu_to_le32(from_kgid(to, gid)); break; default: @@ -758,34 +838,20 @@ static void posix_acl_fix_xattr_userns( } } -void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size) +void posix_acl_fix_xattr_from_user(void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); - - /* Leave ids untouched on non-idmapped mounts. */ - if (no_idmapping(mnt_userns, i_user_ns(inode))) - mnt_userns = &init_user_ns; - if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) + if (user_ns == &init_user_ns) return; - posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value, - size, true); + posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); } -void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size) +void posix_acl_fix_xattr_to_user(void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); - - /* Leave ids untouched on non-idmapped mounts. */ - if (no_idmapping(mnt_userns, i_user_ns(inode))) - mnt_userns = &init_user_ns; - if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) + if (user_ns == &init_user_ns) return; - posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value, - size, false); + posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); } /* diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 09d1307959d0..28966da7834e 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2085,7 +2085,8 @@ EXPORT_SYMBOL(__dquot_transfer); /* Wrapper for transferring ownership of an inode for uid/gid only * Called from FSXXX_setattr() */ -int dquot_transfer(struct inode *inode, struct iattr *iattr) +int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, + struct iattr *iattr) { struct dquot *transfer_to[MAXQUOTAS] = {}; struct dquot *dquot; @@ -2095,8 +2096,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (!dquot_active(inode)) return 0; - if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)){ - dquot = dqget(sb, make_kqid_uid(iattr->ia_uid)); + if (i_uid_needs_update(mnt_userns, iattr, inode)) { + kuid_t kuid = from_vfsuid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsuid); + + dquot = dqget(sb, make_kqid_uid(kuid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); @@ -2106,8 +2110,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) } transfer_to[USRQUOTA] = dquot; } - if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)){ - dquot = dqget(sb, make_kqid_gid(iattr->ia_gid)); + if (i_gid_needs_update(mnt_userns, iattr, inode)) { + kgid_t kgid = from_vfsgid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsgid); + + dquot = dqget(sb, make_kqid_gid(kgid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); diff --git a/fs/read_write.c b/fs/read_write.c index e0777eefd846..397da0236607 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1263,6 +1263,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, count, fl); file_end_write(out.file); } else { + if (out.file->f_flags & O_NONBLOCK) + fl |= SPLICE_F_NONBLOCK; + retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl); } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 0cffe054b78e..0df48d176732 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -290,7 +290,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block, struct buffer_head *bh; struct item_head *ih, tmp_ih; b_blocknr_t blocknr; - char *p = NULL; + char *p; int chars; int ret; int result; @@ -305,8 +305,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block, result = search_for_position_by_key(inode->i_sb, &key, &path); if (result != POSITION_FOUND) { pathrelse(&path); - if (p) - kunmap(bh_result->b_page); if (result == IO_ERROR) return -EIO; /* @@ -352,8 +350,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block, } pathrelse(&path); - if (p) - kunmap(bh_result->b_page); return ret; } /* requested data are in direct item(s) */ @@ -363,8 +359,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block, * when it is stored in direct item(s) */ pathrelse(&path); - if (p) - kunmap(bh_result->b_page); return -ENOENT; } @@ -396,9 +390,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block, * sure we need to. But, this means the item might move if * kmap schedules */ - if (!p) - p = (char *)kmap(bh_result->b_page); - + p = (char *)kmap(bh_result->b_page); p += offset; memset(p, 0, inode->i_sb->s_blocksize); do { @@ -3284,7 +3276,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, /* must be turned off for recursive notify_change calls */ ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { error = dquot_initialize(inode); if (error) return error; @@ -3367,7 +3359,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, reiserfs_write_unlock(inode->i_sb); if (error) goto out; - error = dquot_transfer(inode, attr); + error = dquot_transfer(mnt_userns, inode, attr); reiserfs_write_lock(inode->i_sb); if (error) { journal_end(&th); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e943370107d0..de86f5b2859f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -192,17 +192,19 @@ static inline void msg_init(struct uffd_msg *msg) } static inline struct uffd_msg userfault_msg(unsigned long address, + unsigned long real_address, unsigned int flags, unsigned long reason, unsigned int features) { struct uffd_msg msg; + msg_init(&msg); msg.event = UFFD_EVENT_PAGEFAULT; - if (!(features & UFFD_FEATURE_EXACT_ADDRESS)) - address &= PAGE_MASK; - msg.arg.pagefault.address = address; + msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ? + real_address : address; + /* * These flags indicate why the userfault occurred: * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault. @@ -488,8 +490,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason, - ctx->features); + uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags, + reason, ctx->features); uwq.ctx = ctx; uwq.waken = false; diff --git a/fs/xattr.c b/fs/xattr.c index e8dd03e4561e..a1f4998bc6be 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -282,9 +282,15 @@ out: } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); +static inline bool is_posix_acl_xattr(const char *name) +{ + return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || + (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); +} + int vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, - const char *name, const void *value, size_t size, int flags) + const char *name, void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; @@ -292,12 +298,16 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { - error = cap_convert_nscap(mnt_userns, dentry, &value, size); + error = cap_convert_nscap(mnt_userns, dentry, + (const void **)&value, size); if (error < 0) return error; size = error; } + if (size && is_posix_acl_xattr(name)) + posix_acl_setxattr_idmapped_mnt(mnt_userns, inode, value, size); + retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size, @@ -431,7 +441,10 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, return ret; } nolsm: - return __vfs_getxattr(dentry, inode, name, value, size); + error = __vfs_getxattr(dentry, inode, name, value, size); + if (error > 0 && is_posix_acl_xattr(name)) + posix_acl_getxattr_idmapped_mnt(mnt_userns, inode, value, size); + return error; } EXPORT_SYMBOL_GPL(vfs_getxattr); @@ -577,8 +590,7 @@ static void setxattr_convert(struct user_namespace *mnt_userns, if (ctx->size && ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))) - posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d), - ctx->kvalue, ctx->size); + posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size); } int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, @@ -695,8 +707,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d), - ctx->kvalue, error); + posix_acl_fix_xattr_to_user(ctx->kvalue, error); if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 29f5b8b8aca6..a7402f6ea510 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -667,13 +667,15 @@ xfs_setattr_nonsize( uint qflags = 0; if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { - uid = iattr->ia_uid; + uid = from_vfsuid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsuid); qflags |= XFS_QMOPT_UQUOTA; } else { uid = inode->i_uid; } if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { - gid = iattr->ia_gid; + gid = from_vfsgid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsgid); qflags |= XFS_QMOPT_GQUOTA; } else { gid = inode->i_gid; @@ -704,13 +706,13 @@ xfs_setattr_nonsize( * didn't have the inode locked, inode's dquot(s) would have changed * also. */ - if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp) && - !uid_eq(inode->i_uid, iattr->ia_uid)) { + if (XFS_IS_UQUOTA_ON(mp) && + i_uid_needs_update(mnt_userns, iattr, inode)) { ASSERT(udqp); old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } - if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp) && - !gid_eq(inode->i_gid, iattr->ia_gid)) { + if (XFS_IS_GQUOTA_ON(mp) && + i_gid_needs_update(mnt_userns, iattr, inode)) { ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp)); ASSERT(gdqp); old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 053299758deb..f5d8338967cb 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -616,7 +616,7 @@ static int zonefs_inode_setattr(struct user_namespace *mnt_userns, !uid_eq(iattr->ia_uid, inode->i_uid)) || ((iattr->ia_valid & ATTR_GID) && !gid_eq(iattr->ia_gid, inode->i_gid))) { - ret = dquot_transfer(inode, iattr); + ret = dquot_transfer(mnt_userns, inode, iattr); if (ret) return ret; } diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index 7ce93aaf69f8..98954dda5734 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1125,9 +1125,7 @@ static inline void memcpy_toio(volatile void __iomem *addr, const void *buffer, } #endif -#ifndef CONFIG_GENERIC_DEVMEM_IS_ALLOWED extern int devmem_is_allowed(unsigned long pfn); -#endif #endif /* __KERNEL__ */ diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index ff3e82553a76..492dce43236e 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -158,9 +158,24 @@ * Useful if your architecture doesn't use IPIs for remote TLB invalidates * and therefore doesn't naturally serialize with software page-table walkers. * + * MMU_GATHER_NO_FLUSH_CACHE + * + * Indicates the architecture has flush_cache_range() but it needs *NOT* be called + * before unmapping a VMA. + * + * NOTE: strictly speaking we shouldn't have this knob and instead rely on + * flush_cache_range() being a NOP, except Sparc64 seems to be + * different here. + * + * MMU_GATHER_MERGE_VMAS + * + * Indicates the architecture wants to merge ranges over VMAs; typical when + * multiple range invalidates are more expensive than a full invalidate. + * * MMU_GATHER_NO_RANGE * - * Use this if your architecture lacks an efficient flush_tlb_range(). + * Use this if your architecture lacks an efficient flush_tlb_range(). This + * option implies MMU_GATHER_MERGE_VMAS above. * * MMU_GATHER_NO_GATHER * @@ -288,6 +303,7 @@ struct mmu_gather { */ unsigned int vma_exec : 1; unsigned int vma_huge : 1; + unsigned int vma_pfn : 1; unsigned int batch_count; @@ -334,8 +350,8 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb) #ifdef CONFIG_MMU_GATHER_NO_RANGE -#if defined(tlb_flush) || defined(tlb_start_vma) || defined(tlb_end_vma) -#error MMU_GATHER_NO_RANGE relies on default tlb_flush(), tlb_start_vma() and tlb_end_vma() +#if defined(tlb_flush) +#error MMU_GATHER_NO_RANGE relies on default tlb_flush() #endif /* @@ -352,20 +368,9 @@ static inline void tlb_flush(struct mmu_gather *tlb) flush_tlb_mm(tlb->mm); } -static inline void -tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { } - -#define tlb_end_vma tlb_end_vma -static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { } - #else /* CONFIG_MMU_GATHER_NO_RANGE */ #ifndef tlb_flush - -#if defined(tlb_start_vma) || defined(tlb_end_vma) -#error Default tlb_flush() relies on default tlb_start_vma() and tlb_end_vma() -#endif - /* * When an architecture does not provide its own tlb_flush() implementation * but does have a reasonably efficient flush_vma_range() implementation @@ -385,6 +390,9 @@ static inline void tlb_flush(struct mmu_gather *tlb) flush_tlb_range(&vma, tlb->start, tlb->end); } } +#endif + +#endif /* CONFIG_MMU_GATHER_NO_RANGE */ static inline void tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) @@ -402,17 +410,9 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) */ tlb->vma_huge = is_vm_hugetlb_page(vma); tlb->vma_exec = !!(vma->vm_flags & VM_EXEC); + tlb->vma_pfn = !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)); } -#else - -static inline void -tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { } - -#endif - -#endif /* CONFIG_MMU_GATHER_NO_RANGE */ - static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { /* @@ -486,32 +486,36 @@ static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb) * case where we're doing a full MM flush. When we're doing a munmap, * the vmas are adjusted to only cover the region to be torn down. */ -#ifndef tlb_start_vma static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm) return; tlb_update_vma_flags(tlb, vma); +#ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE flush_cache_range(vma, vma->vm_start, vma->vm_end); -} #endif +} -#ifndef tlb_end_vma static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm) return; /* - * Do a TLB flush and reset the range at VMA boundaries; this avoids - * the ranges growing with the unused space between consecutive VMAs, - * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on - * this. + * VM_PFNMAP is more fragile because the core mm will not track the + * page mapcount -- there might not be page-frames for these PFNs after + * all. Force flush TLBs for such ranges to avoid munmap() vs + * unmap_mapping_range() races. */ - tlb_flush_mmu_tlbonly(tlb); + if (tlb->vma_pfn || !IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) { + /* + * Do a TLB flush and reset the range at VMA boundaries; this avoids + * the ranges growing with the unused space between consecutive VMAs. + */ + tlb_flush_mmu_tlbonly(tlb); + } } -#endif /* * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end, diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 0fca8f38bee4..addb135eeea6 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -28,7 +28,7 @@ #include <linux/dma-fence.h> #include <linux/completion.h> #include <linux/xarray.h> -#include <linux/irq_work.h> +#include <linux/workqueue.h> #define MAX_WAIT_SCHED_ENTITY_Q_EMPTY msecs_to_jiffies(1000) @@ -295,7 +295,7 @@ struct drm_sched_job { */ union { struct dma_fence_cb finish_cb; - struct irq_work work; + struct work_struct work; }; uint64_t id; diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 19f0dbfdd7fe..b66c5f389159 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -130,7 +130,6 @@ enum cpuhp_state { CPUHP_ZCOMP_PREPARE, CPUHP_TIMERS_PREPARE, CPUHP_MIPS_SOC_PREPARE, - CPUHP_LOONGARCH_SOC_PREPARE, CPUHP_BP_PREPARE_DYN, CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20, CPUHP_BRINGUP_CPU, diff --git a/include/linux/evm.h b/include/linux/evm.h index 4c374be70247..aa63e0b3c0a2 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -21,7 +21,8 @@ extern enum integrity_status evm_verifyxattr(struct dentry *dentry, void *xattr_value, size_t xattr_value_len, struct integrity_iint_cache *iint); -extern int evm_inode_setattr(struct dentry *dentry, struct iattr *attr); +extern int evm_inode_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr); extern void evm_inode_post_setattr(struct dentry *dentry, int ia_valid); extern int evm_inode_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, @@ -68,7 +69,8 @@ static inline enum integrity_status evm_verifyxattr(struct dentry *dentry, } #endif -static inline int evm_inode_setattr(struct dentry *dentry, struct iattr *attr) +static inline int evm_inode_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { return 0; } diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index e517dbcf74ed..8ad743def6f3 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -59,15 +59,19 @@ #define FANOTIFY_MARK_TYPE_BITS (FAN_MARK_INODE | FAN_MARK_MOUNT | \ FAN_MARK_FILESYSTEM) +#define FANOTIFY_MARK_CMD_BITS (FAN_MARK_ADD | FAN_MARK_REMOVE | \ + FAN_MARK_FLUSH) + +#define FANOTIFY_MARK_IGNORE_BITS (FAN_MARK_IGNORED_MASK | \ + FAN_MARK_IGNORE) + #define FANOTIFY_MARK_FLAGS (FANOTIFY_MARK_TYPE_BITS | \ - FAN_MARK_ADD | \ - FAN_MARK_REMOVE | \ + FANOTIFY_MARK_CMD_BITS | \ + FANOTIFY_MARK_IGNORE_BITS | \ FAN_MARK_DONT_FOLLOW | \ FAN_MARK_ONLYDIR | \ - FAN_MARK_IGNORED_MASK | \ FAN_MARK_IGNORED_SURV_MODIFY | \ - FAN_MARK_EVICTABLE | \ - FAN_MARK_FLUSH) + FAN_MARK_EVICTABLE) /* * Events that can be reported with data type FSNOTIFY_EVENT_PATH. diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae..ec2e35886779 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -221,8 +221,26 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, struct iattr { unsigned int ia_valid; umode_t ia_mode; - kuid_t ia_uid; - kgid_t ia_gid; + /* + * The two anonymous unions wrap structures with the same member. + * + * Filesystems raising FS_ALLOW_IDMAP need to use ia_vfs{g,u}id which + * are a dedicated type requiring the filesystem to use the dedicated + * helpers. Other filesystem can continue to use ia_{g,u}id until they + * have been ported. + * + * They always contain the same value. In other words FS_ALLOW_IDMAP + * pass down the same value on idmapped mounts as they would on regular + * mounts. + */ + union { + kuid_t ia_uid; + vfsuid_t ia_vfsuid; + }; + union { + kgid_t ia_gid; + vfsgid_t ia_vfsgid; + }; loff_t ia_size; struct timespec64 ia_atime; struct timespec64 ia_mtime; @@ -1600,13 +1618,68 @@ static inline void i_gid_write(struct inode *inode, gid_t gid) * @mnt_userns: user namespace of the mount the inode was found from * @inode: inode to map * + * Note, this will eventually be removed completely in favor of the type-safe + * i_uid_into_vfsuid(). + * * Return: the inode's i_uid mapped down according to @mnt_userns. * If the inode's i_uid has no mapping INVALID_UID is returned. */ static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns, const struct inode *inode) { - return mapped_kuid_fs(mnt_userns, i_user_ns(inode), inode->i_uid); + return AS_KUIDT(make_vfsuid(mnt_userns, i_user_ns(inode), inode->i_uid)); +} + +/** + * i_uid_into_vfsuid - map an inode's i_uid down into a mnt_userns + * @mnt_userns: user namespace of the mount the inode was found from + * @inode: inode to map + * + * Return: whe inode's i_uid mapped down according to @mnt_userns. + * If the inode's i_uid has no mapping INVALID_VFSUID is returned. + */ +static inline vfsuid_t i_uid_into_vfsuid(struct user_namespace *mnt_userns, + const struct inode *inode) +{ + return make_vfsuid(mnt_userns, i_user_ns(inode), inode->i_uid); +} + +/** + * i_uid_needs_update - check whether inode's i_uid needs to be updated + * @mnt_userns: user namespace of the mount the inode was found from + * @attr: the new attributes of @inode + * @inode: the inode to update + * + * Check whether the $inode's i_uid field needs to be updated taking idmapped + * mounts into account if the filesystem supports it. + * + * Return: true if @inode's i_uid field needs to be updated, false if not. + */ +static inline bool i_uid_needs_update(struct user_namespace *mnt_userns, + const struct iattr *attr, + const struct inode *inode) +{ + return ((attr->ia_valid & ATTR_UID) && + !vfsuid_eq(attr->ia_vfsuid, + i_uid_into_vfsuid(mnt_userns, inode))); +} + +/** + * i_uid_update - update @inode's i_uid field + * @mnt_userns: user namespace of the mount the inode was found from + * @attr: the new attributes of @inode + * @inode: the inode to update + * + * Safely update @inode's i_uid field translating the vfsuid of any idmapped + * mount into the filesystem kuid. + */ +static inline void i_uid_update(struct user_namespace *mnt_userns, + const struct iattr *attr, + struct inode *inode) +{ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = from_vfsuid(mnt_userns, i_user_ns(inode), + attr->ia_vfsuid); } /** @@ -1614,13 +1687,68 @@ static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns, * @mnt_userns: user namespace of the mount the inode was found from * @inode: inode to map * + * Note, this will eventually be removed completely in favor of the type-safe + * i_gid_into_vfsgid(). + * * Return: the inode's i_gid mapped down according to @mnt_userns. * If the inode's i_gid has no mapping INVALID_GID is returned. */ static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns, const struct inode *inode) { - return mapped_kgid_fs(mnt_userns, i_user_ns(inode), inode->i_gid); + return AS_KGIDT(make_vfsgid(mnt_userns, i_user_ns(inode), inode->i_gid)); +} + +/** + * i_gid_into_vfsgid - map an inode's i_gid down into a mnt_userns + * @mnt_userns: user namespace of the mount the inode was found from + * @inode: inode to map + * + * Return: the inode's i_gid mapped down according to @mnt_userns. + * If the inode's i_gid has no mapping INVALID_VFSGID is returned. + */ +static inline vfsgid_t i_gid_into_vfsgid(struct user_namespace *mnt_userns, + const struct inode *inode) +{ + return make_vfsgid(mnt_userns, i_user_ns(inode), inode->i_gid); +} + +/** + * i_gid_needs_update - check whether inode's i_gid needs to be updated + * @mnt_userns: user namespace of the mount the inode was found from + * @attr: the new attributes of @inode + * @inode: the inode to update + * + * Check whether the $inode's i_gid field needs to be updated taking idmapped + * mounts into account if the filesystem supports it. + * + * Return: true if @inode's i_gid field needs to be updated, false if not. + */ +static inline bool i_gid_needs_update(struct user_namespace *mnt_userns, + const struct iattr *attr, + const struct inode *inode) +{ + return ((attr->ia_valid & ATTR_GID) && + !vfsgid_eq(attr->ia_vfsgid, + i_gid_into_vfsgid(mnt_userns, inode))); +} + +/** + * i_gid_update - update @inode's i_gid field + * @mnt_userns: user namespace of the mount the inode was found from + * @attr: the new attributes of @inode + * @inode: the inode to update + * + * Safely update @inode's i_gid field translating the vfsgid of any idmapped + * mount into the filesystem kgid. + */ +static inline void i_gid_update(struct user_namespace *mnt_userns, + const struct iattr *attr, + struct inode *inode) +{ + if (attr->ia_valid & ATTR_GID) + inode->i_gid = from_vfsgid(mnt_userns, i_user_ns(inode), + attr->ia_vfsgid); } /** @@ -2195,8 +2323,8 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags static inline bool HAS_UNMAPPED_ID(struct user_namespace *mnt_userns, struct inode *inode) { - return !uid_valid(i_uid_into_mnt(mnt_userns, inode)) || - !gid_valid(i_gid_into_mnt(mnt_userns, inode)); + return !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode)); } static inline int iocb_flags(struct file *file); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 9560734759fa..d7d96c806bff 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -518,8 +518,8 @@ struct fsnotify_mark { struct hlist_node obj_list; /* Head of list of marks for an object [mark ref] */ struct fsnotify_mark_connector *connector; - /* Events types to ignore [mark->lock, group->mark_mutex] */ - __u32 ignored_mask; + /* Events types and flags to ignore [mark->lock, group->mark_mutex] */ + __u32 ignore_mask; /* General fsnotify mark flags */ #define FSNOTIFY_MARK_FLAG_ALIVE 0x0001 #define FSNOTIFY_MARK_FLAG_ATTACHED 0x0002 @@ -529,6 +529,7 @@ struct fsnotify_mark { /* fanotify mark flags */ #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100 #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 +#define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400 unsigned int flags; /* flags [mark->lock] */ }; @@ -655,15 +656,91 @@ extern void fsnotify_remove_queued_event(struct fsnotify_group *group, /* functions used to manipulate the marks attached to inodes */ -/* Get mask for calculating object interest taking ignored mask into account */ +/* + * Canonical "ignore mask" including event flags. + * + * Note the subtle semantic difference from the legacy ->ignored_mask. + * ->ignored_mask traditionally only meant which events should be ignored, + * while ->ignore_mask also includes flags regarding the type of objects on + * which events should be ignored. + */ +static inline __u32 fsnotify_ignore_mask(struct fsnotify_mark *mark) +{ + __u32 ignore_mask = mark->ignore_mask; + + /* The event flags in ignore mask take effect */ + if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) + return ignore_mask; + + /* + * Legacy behavior: + * - Always ignore events on dir + * - Ignore events on child if parent is watching children + */ + ignore_mask |= FS_ISDIR; + ignore_mask &= ~FS_EVENT_ON_CHILD; + ignore_mask |= mark->mask & FS_EVENT_ON_CHILD; + + return ignore_mask; +} + +/* Legacy ignored_mask - only event types to ignore */ +static inline __u32 fsnotify_ignored_events(struct fsnotify_mark *mark) +{ + return mark->ignore_mask & ALL_FSNOTIFY_EVENTS; +} + +/* + * Check if mask (or ignore mask) should be applied depending if victim is a + * directory and whether it is reported to a watching parent. + */ +static inline bool fsnotify_mask_applicable(__u32 mask, bool is_dir, + int iter_type) +{ + /* Should mask be applied to a directory? */ + if (is_dir && !(mask & FS_ISDIR)) + return false; + + /* Should mask be applied to a child? */ + if (iter_type == FSNOTIFY_ITER_TYPE_PARENT && + !(mask & FS_EVENT_ON_CHILD)) + return false; + + return true; +} + +/* + * Effective ignore mask taking into account if event victim is a + * directory and whether it is reported to a watching parent. + */ +static inline __u32 fsnotify_effective_ignore_mask(struct fsnotify_mark *mark, + bool is_dir, int iter_type) +{ + __u32 ignore_mask = fsnotify_ignored_events(mark); + + if (!ignore_mask) + return 0; + + /* For non-dir and non-child, no need to consult the event flags */ + if (!is_dir && iter_type != FSNOTIFY_ITER_TYPE_PARENT) + return ignore_mask; + + ignore_mask = fsnotify_ignore_mask(mark); + if (!fsnotify_mask_applicable(ignore_mask, is_dir, iter_type)) + return 0; + + return ignore_mask & ALL_FSNOTIFY_EVENTS; +} + +/* Get mask for calculating object interest taking ignore mask into account */ static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark) { __u32 mask = mark->mask; - if (!mark->ignored_mask) + if (!fsnotify_ignored_events(mark)) return mask; - /* Interest in FS_MODIFY may be needed for clearing ignored mask */ + /* Interest in FS_MODIFY may be needed for clearing ignore mask */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mask |= FS_MODIFY; @@ -671,7 +748,7 @@ static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark) * If mark is interested in ignoring events on children, the object must * show interest in those events for fsnotify_parent() to notice it. */ - return mask | (mark->ignored_mask & ALL_FSNOTIFY_EVENTS); + return mask | mark->ignore_mask; } /* Get mask of events for a list of marks */ diff --git a/include/linux/mm.h b/include/linux/mm.h index cf3d0d673f6b..7898e29bcfb5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1130,23 +1130,27 @@ static inline bool is_zone_movable_page(const struct page *page) #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) DECLARE_STATIC_KEY_FALSE(devmap_managed_key); -bool __put_devmap_managed_page(struct page *page); -static inline bool put_devmap_managed_page(struct page *page) +bool __put_devmap_managed_page_refs(struct page *page, int refs); +static inline bool put_devmap_managed_page_refs(struct page *page, int refs) { if (!static_branch_unlikely(&devmap_managed_key)) return false; if (!is_zone_device_page(page)) return false; - return __put_devmap_managed_page(page); + return __put_devmap_managed_page_refs(page, refs); } - #else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ -static inline bool put_devmap_managed_page(struct page *page) +static inline bool put_devmap_managed_page_refs(struct page *page, int refs) { return false; } #endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ +static inline bool put_devmap_managed_page(struct page *page) +{ + return put_devmap_managed_page_refs(page, 1); +} + /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ ((unsigned int) folio_ref_count(folio) + 127u <= 127u) diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index ee5a217de2a8..f6e5369d2928 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -13,6 +13,129 @@ struct user_namespace; */ extern struct user_namespace init_user_ns; +typedef struct { + uid_t val; +} vfsuid_t; + +typedef struct { + gid_t val; +} vfsgid_t; + +static_assert(sizeof(vfsuid_t) == sizeof(kuid_t)); +static_assert(sizeof(vfsgid_t) == sizeof(kgid_t)); +static_assert(offsetof(vfsuid_t, val) == offsetof(kuid_t, val)); +static_assert(offsetof(vfsgid_t, val) == offsetof(kgid_t, val)); + +#ifdef CONFIG_MULTIUSER +static inline uid_t __vfsuid_val(vfsuid_t uid) +{ + return uid.val; +} + +static inline gid_t __vfsgid_val(vfsgid_t gid) +{ + return gid.val; +} +#else +static inline uid_t __vfsuid_val(vfsuid_t uid) +{ + return 0; +} + +static inline gid_t __vfsgid_val(vfsgid_t gid) +{ + return 0; +} +#endif + +static inline bool vfsuid_valid(vfsuid_t uid) +{ + return __vfsuid_val(uid) != (uid_t)-1; +} + +static inline bool vfsgid_valid(vfsgid_t gid) +{ + return __vfsgid_val(gid) != (gid_t)-1; +} + +static inline bool vfsuid_eq(vfsuid_t left, vfsuid_t right) +{ + return vfsuid_valid(left) && __vfsuid_val(left) == __vfsuid_val(right); +} + +static inline bool vfsgid_eq(vfsgid_t left, vfsgid_t right) +{ + return vfsgid_valid(left) && __vfsgid_val(left) == __vfsgid_val(right); +} + +/** + * vfsuid_eq_kuid - check whether kuid and vfsuid have the same value + * @vfsuid: the vfsuid to compare + * @kuid: the kuid to compare + * + * Check whether @vfsuid and @kuid have the same values. + * + * Return: true if @vfsuid and @kuid have the same value, false if not. + * Comparison between two invalid uids returns false. + */ +static inline bool vfsuid_eq_kuid(vfsuid_t vfsuid, kuid_t kuid) +{ + return vfsuid_valid(vfsuid) && __vfsuid_val(vfsuid) == __kuid_val(kuid); +} + +/** + * vfsgid_eq_kgid - check whether kgid and vfsgid have the same value + * @vfsgid: the vfsgid to compare + * @kgid: the kgid to compare + * + * Check whether @vfsgid and @kgid have the same values. + * + * Return: true if @vfsgid and @kgid have the same value, false if not. + * Comparison between two invalid gids returns false. + */ +static inline bool vfsgid_eq_kgid(vfsgid_t vfsgid, kgid_t kgid) +{ + return vfsgid_valid(vfsgid) && __vfsgid_val(vfsgid) == __kgid_val(kgid); +} + +/* + * vfs{g,u}ids are created from k{g,u}ids. + * We don't allow them to be created from regular {u,g}id. + */ +#define VFSUIDT_INIT(val) (vfsuid_t){ __kuid_val(val) } +#define VFSGIDT_INIT(val) (vfsgid_t){ __kgid_val(val) } + +#define INVALID_VFSUID VFSUIDT_INIT(INVALID_UID) +#define INVALID_VFSGID VFSGIDT_INIT(INVALID_GID) + +/* + * Allow a vfs{g,u}id to be used as a k{g,u}id where we want to compare + * whether the mapped value is identical to value of a k{g,u}id. + */ +#define AS_KUIDT(val) (kuid_t){ __vfsuid_val(val) } +#define AS_KGIDT(val) (kgid_t){ __vfsgid_val(val) } + +#ifdef CONFIG_MULTIUSER +/** + * vfsgid_in_group_p() - check whether a vfsuid matches the caller's groups + * @vfsgid: the mnt gid to match + * + * This function can be used to determine whether @vfsuid matches any of the + * caller's groups. + * + * Return: 1 if vfsuid matches caller's groups, 0 if not. + */ +static inline int vfsgid_in_group_p(vfsgid_t vfsgid) +{ + return in_group_p(AS_KGIDT(vfsgid)); +} +#else +static inline int vfsgid_in_group_p(vfsgid_t vfsgid) +{ + return 1; +} +#endif + /** * initial_idmapping - check whether this is the initial mapping * @ns: idmapping to check @@ -48,7 +171,7 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns, } /** - * mapped_kuid_fs - map a filesystem kuid into a mnt_userns + * make_vfsuid - map a filesystem kuid into a mnt_userns * @mnt_userns: the mount's idmapping * @fs_userns: the filesystem's idmapping * @kuid : kuid to be mapped @@ -67,25 +190,33 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns, * If @kuid has no mapping in either @mnt_userns or @fs_userns INVALID_UID is * returned. */ -static inline kuid_t mapped_kuid_fs(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - kuid_t kuid) + +static inline vfsuid_t make_vfsuid(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + kuid_t kuid) { uid_t uid; if (no_idmapping(mnt_userns, fs_userns)) - return kuid; + return VFSUIDT_INIT(kuid); if (initial_idmapping(fs_userns)) uid = __kuid_val(kuid); else uid = from_kuid(fs_userns, kuid); if (uid == (uid_t)-1) - return INVALID_UID; - return make_kuid(mnt_userns, uid); + return INVALID_VFSUID; + return VFSUIDT_INIT(make_kuid(mnt_userns, uid)); +} + +static inline kuid_t mapped_kuid_fs(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + kuid_t kuid) +{ + return AS_KUIDT(make_vfsuid(mnt_userns, fs_userns, kuid)); } /** - * mapped_kgid_fs - map a filesystem kgid into a mnt_userns + * make_vfsgid - map a filesystem kgid into a mnt_userns * @mnt_userns: the mount's idmapping * @fs_userns: the filesystem's idmapping * @kgid : kgid to be mapped @@ -104,21 +235,56 @@ static inline kuid_t mapped_kuid_fs(struct user_namespace *mnt_userns, * If @kgid has no mapping in either @mnt_userns or @fs_userns INVALID_GID is * returned. */ -static inline kgid_t mapped_kgid_fs(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - kgid_t kgid) + +static inline vfsgid_t make_vfsgid(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + kgid_t kgid) { gid_t gid; if (no_idmapping(mnt_userns, fs_userns)) - return kgid; + return VFSGIDT_INIT(kgid); if (initial_idmapping(fs_userns)) gid = __kgid_val(kgid); else gid = from_kgid(fs_userns, kgid); if (gid == (gid_t)-1) - return INVALID_GID; - return make_kgid(mnt_userns, gid); + return INVALID_VFSGID; + return VFSGIDT_INIT(make_kgid(mnt_userns, gid)); +} + +static inline kgid_t mapped_kgid_fs(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + kgid_t kgid) +{ + return AS_KGIDT(make_vfsgid(mnt_userns, fs_userns, kgid)); +} + +/** + * from_vfsuid - map a vfsuid into the filesystem idmapping + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @vfsuid : vfsuid to be mapped + * + * Map @vfsuid into the filesystem idmapping. This function has to be used in + * order to e.g. write @vfsuid to inode->i_uid. + * + * Return: @vfsuid mapped into the filesystem idmapping + */ +static inline kuid_t from_vfsuid(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + vfsuid_t vfsuid) +{ + uid_t uid; + + if (no_idmapping(mnt_userns, fs_userns)) + return AS_KUIDT(vfsuid); + uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid)); + if (uid == (uid_t)-1) + return INVALID_UID; + if (initial_idmapping(fs_userns)) + return KUIDT_INIT(uid); + return make_kuid(fs_userns, uid); } /** @@ -145,16 +311,66 @@ static inline kuid_t mapped_kuid_user(struct user_namespace *mnt_userns, struct user_namespace *fs_userns, kuid_t kuid) { - uid_t uid; + return from_vfsuid(mnt_userns, fs_userns, VFSUIDT_INIT(kuid)); +} + +/** + * vfsuid_has_fsmapping - check whether a vfsuid maps into the filesystem + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @vfsuid: vfsuid to be mapped + * + * Check whether @vfsuid has a mapping in the filesystem idmapping. Use this + * function to check whether the filesystem idmapping has a mapping for + * @vfsuid. + * + * Return: true if @vfsuid has a mapping in the filesystem, false if not. + */ +static inline bool vfsuid_has_fsmapping(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + vfsuid_t vfsuid) +{ + return uid_valid(from_vfsuid(mnt_userns, fs_userns, vfsuid)); +} + +/** + * vfsuid_into_kuid - convert vfsuid into kuid + * @vfsuid: the vfsuid to convert + * + * This can be used when a vfsuid is committed as a kuid. + * + * Return: a kuid with the value of @vfsuid + */ +static inline kuid_t vfsuid_into_kuid(vfsuid_t vfsuid) +{ + return AS_KUIDT(vfsuid); +} + +/** + * from_vfsgid - map a vfsgid into the filesystem idmapping + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @vfsgid : vfsgid to be mapped + * + * Map @vfsgid into the filesystem idmapping. This function has to be used in + * order to e.g. write @vfsgid to inode->i_gid. + * + * Return: @vfsgid mapped into the filesystem idmapping + */ +static inline kgid_t from_vfsgid(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + vfsgid_t vfsgid) +{ + gid_t gid; if (no_idmapping(mnt_userns, fs_userns)) - return kuid; - uid = from_kuid(mnt_userns, kuid); - if (uid == (uid_t)-1) - return INVALID_UID; + return AS_KGIDT(vfsgid); + gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid)); + if (gid == (gid_t)-1) + return INVALID_GID; if (initial_idmapping(fs_userns)) - return KUIDT_INIT(uid); - return make_kuid(fs_userns, uid); + return KGIDT_INIT(gid); + return make_kgid(fs_userns, gid); } /** @@ -181,16 +397,39 @@ static inline kgid_t mapped_kgid_user(struct user_namespace *mnt_userns, struct user_namespace *fs_userns, kgid_t kgid) { - gid_t gid; + return from_vfsgid(mnt_userns, fs_userns, VFSGIDT_INIT(kgid)); +} - if (no_idmapping(mnt_userns, fs_userns)) - return kgid; - gid = from_kgid(mnt_userns, kgid); - if (gid == (gid_t)-1) - return INVALID_GID; - if (initial_idmapping(fs_userns)) - return KGIDT_INIT(gid); - return make_kgid(fs_userns, gid); +/** + * vfsgid_has_fsmapping - check whether a vfsgid maps into the filesystem + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @vfsgid: vfsgid to be mapped + * + * Check whether @vfsgid has a mapping in the filesystem idmapping. Use this + * function to check whether the filesystem idmapping has a mapping for + * @vfsgid. + * + * Return: true if @vfsgid has a mapping in the filesystem, false if not. + */ +static inline bool vfsgid_has_fsmapping(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + vfsgid_t vfsgid) +{ + return gid_valid(from_vfsgid(mnt_userns, fs_userns, vfsgid)); +} + +/** + * vfsgid_into_kgid - convert vfsgid into kgid + * @vfsgid: the vfsgid to convert + * + * This can be used when a vfsgid is committed as a kgid. + * + * Return: a kgid with the value of @vfsgid + */ +static inline kgid_t vfsgid_into_kgid(vfsgid_t vfsgid) +{ + return AS_KGIDT(vfsgid); } /** @@ -209,7 +448,8 @@ static inline kgid_t mapped_kgid_user(struct user_namespace *mnt_userns, static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns, struct user_namespace *fs_userns) { - return mapped_kuid_user(mnt_userns, fs_userns, current_fsuid()); + return from_vfsuid(mnt_userns, fs_userns, + VFSUIDT_INIT(current_fsuid())); } /** @@ -228,7 +468,8 @@ static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns, static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns, struct user_namespace *fs_userns) { - return mapped_kgid_user(mnt_userns, fs_userns, current_fsgid()); + return from_vfsgid(mnt_userns, fs_userns, + VFSGIDT_INIT(current_fsgid())); } #endif /* _LINUX_MNT_IDMAPPING_H */ diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 0178823ce8c2..7fa460ccf7fa 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -556,10 +556,13 @@ #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F3 0x1493 #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F3 0x144b #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F3 0x1443 +#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3 0x1727 #define PCI_DEVICE_ID_AMD_19H_DF_F3 0x1653 #define PCI_DEVICE_ID_AMD_19H_M10H_DF_F3 0x14b0 #define PCI_DEVICE_ID_AMD_19H_M40H_DF_F3 0x167c #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F3 0x166d +#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F3 0x14e3 +#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F3 0x14f3 #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 #define PCI_DEVICE_ID_AMD_LANCE 0x2000 #define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index b65c877d92b8..7d1e604c1325 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -73,6 +73,7 @@ extern int set_posix_acl(struct user_namespace *, struct inode *, int, struct posix_acl *); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); +struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags); #ifdef CONFIG_FS_POSIX_ACL int posix_acl_chmod(struct user_namespace *, struct inode *, umode_t); diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index 1766e1de6956..b6bd3eac2bcc 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -33,21 +33,31 @@ posix_acl_xattr_count(size_t size) } #ifdef CONFIG_FS_POSIX_ACL -void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size); -void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size); +void posix_acl_fix_xattr_from_user(void *value, size_t size); +void posix_acl_fix_xattr_to_user(void *value, size_t size); +void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, + void *value, size_t size); +void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, + void *value, size_t size); #else -static inline void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size) +static inline void posix_acl_fix_xattr_from_user(void *value, size_t size) { } -static inline void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size) +static inline void posix_acl_fix_xattr_to_user(void *value, size_t size) +{ +} +static inline void +posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, void *value, + size_t size) +{ +} +static inline void +posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, void *value, + size_t size) { } #endif diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index a0f6668924d3..0d8625d71733 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -20,11 +20,12 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb) } /* i_mutex must being held */ -static inline bool is_quota_modification(struct inode *inode, struct iattr *ia) +static inline bool is_quota_modification(struct user_namespace *mnt_userns, + struct inode *inode, struct iattr *ia) { - return (ia->ia_valid & ATTR_SIZE) || - (ia->ia_valid & ATTR_UID && !uid_eq(ia->ia_uid, inode->i_uid)) || - (ia->ia_valid & ATTR_GID && !gid_eq(ia->ia_gid, inode->i_gid)); + return ((ia->ia_valid & ATTR_SIZE) || + i_uid_needs_update(mnt_userns, ia, inode) || + i_gid_needs_update(mnt_userns, ia, inode)); } #if defined(CONFIG_QUOTA) @@ -115,7 +116,8 @@ int dquot_set_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int __dquot_transfer(struct inode *inode, struct dquot **transfer_to); -int dquot_transfer(struct inode *inode, struct iattr *iattr); +int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, + struct iattr *iattr); static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) { @@ -234,7 +236,8 @@ static inline void dquot_free_inode(struct inode *inode) { } -static inline int dquot_transfer(struct inode *inode, struct iattr *iattr) +static inline int dquot_transfer(struct user_namespace *mnt_userns, + struct inode *inode, struct iattr *iattr) { return 0; } diff --git a/include/linux/security.h b/include/linux/security.h index 7fc4e9f49f54..4d0baf30266e 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -353,7 +353,8 @@ int security_inode_readlink(struct dentry *dentry); int security_inode_follow_link(struct dentry *dentry, struct inode *inode, bool rcu); int security_inode_permission(struct inode *inode, int mask); -int security_inode_setattr(struct dentry *dentry, struct iattr *attr); +int security_inode_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr); int security_inode_getattr(const struct path *path); int security_inode_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, @@ -848,8 +849,9 @@ static inline int security_inode_permission(struct inode *inode, int mask) return 0; } -static inline int security_inode_setattr(struct dentry *dentry, - struct iattr *attr) +static inline int security_inode_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, + struct iattr *attr) { return 0; } diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 29917850f079..8df475db88c0 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -260,6 +260,7 @@ struct plat_stmmacenet_data { bool has_crossts; int int_snapshot_num; int ext_snapshot_num; + bool int_snapshot_en; bool ext_snapshot_en; bool multi_msi_en; int msi_mac_vec; diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 4c379d23ec6e..979a9d3e5bfb 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -61,7 +61,7 @@ int __vfs_setxattr_locked(struct user_namespace *, struct dentry *, const char *, const void *, size_t, int, struct inode **); int vfs_setxattr(struct user_namespace *, struct dentry *, const char *, - const void *, size_t, int); + void *, size_t, int); int __vfs_removexattr(struct user_namespace *, struct dentry *, const char *); int __vfs_removexattr_locked(struct user_namespace *, struct dentry *, const char *, struct inode **); diff --git a/include/net/addrconf.h b/include/net/addrconf.h index f7506f08e505..c04f359655b8 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -405,6 +405,9 @@ static inline bool ip6_ignore_linkdown(const struct net_device *dev) { const struct inet6_dev *idev = __in6_dev_get(dev); + if (unlikely(!idev)) + return true; + return !!idev->cnf.ignore_routes_with_linkdown; } diff --git a/include/net/amt.h b/include/net/amt.h index 0e40c3d64fcf..08fc30cf2f34 100644 --- a/include/net/amt.h +++ b/include/net/amt.h @@ -78,6 +78,15 @@ enum amt_status { #define AMT_STATUS_MAX (__AMT_STATUS_MAX - 1) +/* Gateway events only */ +enum amt_event { + AMT_EVENT_NONE, + AMT_EVENT_RECEIVE, + AMT_EVENT_SEND_DISCOVERY, + AMT_EVENT_SEND_REQUEST, + __AMT_EVENT_MAX, +}; + struct amt_header { #if defined(__LITTLE_ENDIAN_BITFIELD) u8 type:4, @@ -292,6 +301,12 @@ struct amt_group_node { struct hlist_head sources[]; }; +#define AMT_MAX_EVENTS 16 +struct amt_events { + enum amt_event event; + struct sk_buff *skb; +}; + struct amt_dev { struct net_device *dev; struct net_device *stream_dev; @@ -308,6 +323,7 @@ struct amt_dev { struct delayed_work req_wq; /* Protected by RTNL */ struct delayed_work secret_wq; + struct work_struct event_wq; /* AMT status */ enum amt_status status; /* Generated key */ @@ -345,6 +361,10 @@ struct amt_dev { /* Used only in gateway mode */ u64 mac:48, reserved:16; + /* AMT gateway side message handler queue */ + struct amt_events events[AMT_MAX_EVENTS]; + u8 event_idx; + u8 nr_events; }; #define AMT_TOS 0xc0 diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 3c4f550e5a8b..2f766e3437ce 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -847,6 +847,7 @@ enum { }; void l2cap_chan_hold(struct l2cap_chan *c); +struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c); void l2cap_chan_put(struct l2cap_chan *c); static inline void l2cap_chan_lock(struct l2cap_chan *chan) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 85cd695e7fd1..ee88f0f1350f 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -321,7 +321,7 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu); -#define TCP_PINGPONG_THRESH 3 +#define TCP_PINGPONG_THRESH 1 static inline void inet_csk_enter_pingpong_mode(struct sock *sk) { @@ -338,14 +338,6 @@ static inline bool inet_csk_in_pingpong_mode(struct sock *sk) return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH; } -static inline void inet_csk_inc_pingpong_cnt(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - if (icsk->icsk_ack.pingpong < U8_MAX) - icsk->icsk_ack.pingpong++; -} - static inline bool inet_csk_has_ulp(struct sock *sk) { return inet_sk(sk)->is_icsk && !!inet_csk(sk)->icsk_ulp_ops; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index ebfa3df6f8dc..fd6b510d114b 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -179,7 +179,7 @@ static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - return inet_bound_dev_eq(!!net->ipv4.sysctl_tcp_l3mdev_accept, + return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept), bound_dev_if, dif, sdif); #else return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index daead5fb389a..6395f6b9a5d2 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -107,7 +107,8 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) { - if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept) + if (!sk->sk_mark && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)) return skb->mark; return sk->sk_mark; @@ -120,7 +121,7 @@ static inline int inet_request_bound_dev_if(const struct sock *sk, #ifdef CONFIG_NET_L3_MASTER_DEV struct net *net = sock_net(sk); - if (!bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept) + if (!bound_dev_if && READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept)) return l3mdev_master_ifindex_by_index(net, skb->skb_iif); #endif @@ -132,7 +133,7 @@ static inline int inet_sk_bound_l3mdev(const struct sock *sk) #ifdef CONFIG_NET_L3_MASTER_DEV struct net *net = sock_net(sk); - if (!net->ipv4.sysctl_tcp_l3mdev_accept) + if (!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept)) return l3mdev_master_ifindex_by_index(net, sk->sk_bound_dev_if); #endif @@ -374,7 +375,7 @@ static inline bool inet_get_convert_csum(struct sock *sk) static inline bool inet_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { - return net->ipv4.sysctl_ip_nonlocal_bind || + return READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind) || inet->freebind || inet->transparent; } diff --git a/include/net/ip.h b/include/net/ip.h index 26fffda78cca..1c979fd1904c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -357,7 +357,7 @@ static inline bool sysctl_dev_name_is_allowed(const char *name) static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port) { - return port < net->ipv4.sysctl_ip_prot_sock; + return port < READ_ONCE(net->ipv4.sysctl_ip_prot_sock); } #else @@ -384,7 +384,7 @@ void ipfrag_init(void); void ip_static_sysctl_init(void); #define IP4_REPLY_MARK(net, mark) \ - ((net)->ipv4.sysctl_fwmark_reflect ? (mark) : 0) + (READ_ONCE((net)->ipv4.sysctl_fwmark_reflect) ? (mark) : 0) static inline bool ip_is_fragment(const struct iphdr *iph) { @@ -446,7 +446,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, struct net *net = dev_net(dst->dev); unsigned int mtu; - if (net->ipv4.sysctl_ip_fwd_use_pmtu || + if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || !forwarding) { mtu = rt->rt_pmtu; diff --git a/include/net/protocol.h b/include/net/protocol.h index f51c06ae365f..6aef8cb11cc8 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -35,8 +35,6 @@ /* This is used to register protocols. */ struct net_protocol { - int (*early_demux)(struct sk_buff *skb); - int (*early_demux_handler)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); /* This returns an error if we weren't able to handle the error. */ @@ -52,8 +50,6 @@ struct net_protocol { #if IS_ENABLED(CONFIG_IPV6) struct inet6_protocol { - void (*early_demux)(struct sk_buff *skb); - void (*early_demux_handler)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); /* This returns an error if we weren't able to handle the error. */ diff --git a/include/net/route.h b/include/net/route.h index 991a3985712d..bbcf2aba149f 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -373,7 +373,7 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) struct net *net = dev_net(dst->dev); if (hoplimit == 0) - hoplimit = net->ipv4.sysctl_ip_default_ttl; + hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); return hoplimit; } diff --git a/include/net/sock.h b/include/net/sock.h index 9fa54762e077..7a48991cdb19 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2843,18 +2843,18 @@ static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_wmem ? */ if (proto->sysctl_wmem_offset) - return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset); + return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset)); - return *proto->sysctl_wmem; + return READ_ONCE(*proto->sysctl_wmem); } static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_rmem ? */ if (proto->sysctl_rmem_offset) - return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset); + return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset)); - return *proto->sysctl_rmem; + return READ_ONCE(*proto->sysctl_rmem); } /* Default TCP Small queue budget is ~1 ms of data (1sec >> 10) diff --git a/include/net/tcp.h b/include/net/tcp.h index 1e99f5c61f84..78a64e1b33a7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -932,7 +932,7 @@ extern const struct inet_connection_sock_af_ops ipv6_specific; INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb)); -INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *skb)); +void tcp_v6_early_demux(struct sk_buff *skb); #endif @@ -1403,8 +1403,8 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); s32 delta; - if (!sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle || tp->packets_out || - ca_ops->cong_control) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) || + tp->packets_out || ca_ops->cong_control) return; delta = tcp_jiffies32 - tp->lsndtime; if (delta > inet_csk(sk)->icsk_rto) @@ -1419,7 +1419,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, static inline int tcp_win_from_space(const struct sock *sk, int space) { - int tcp_adv_win_scale = sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale; + int tcp_adv_win_scale = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale); return tcp_adv_win_scale <= 0 ? (space>>(-tcp_adv_win_scale)) : @@ -1493,21 +1493,24 @@ static inline int keepalive_intvl_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); - return tp->keepalive_intvl ? : net->ipv4.sysctl_tcp_keepalive_intvl; + return tp->keepalive_intvl ? : + READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl); } static inline int keepalive_time_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); - return tp->keepalive_time ? : net->ipv4.sysctl_tcp_keepalive_time; + return tp->keepalive_time ? : + READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time); } static inline int keepalive_probes(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); - return tp->keepalive_probes ? : net->ipv4.sysctl_tcp_keepalive_probes; + return tp->keepalive_probes ? : + READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes); } static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) @@ -1520,7 +1523,8 @@ static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) static inline int tcp_fin_time(const struct sock *sk) { - int fin_timeout = tcp_sk(sk)->linger2 ? : sock_net(sk)->ipv4.sysctl_tcp_fin_timeout; + int fin_timeout = tcp_sk(sk)->linger2 ? : + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fin_timeout); const int rto = inet_csk(sk)->icsk_rto; if (fin_timeout < (rto << 2) - (rto >> 1)) @@ -2023,7 +2027,7 @@ void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); - return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat; + return tp->notsent_lowat ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); } bool tcp_stream_memory_free(const struct sock *sk, int wake); diff --git a/include/net/udp.h b/include/net/udp.h index b83a00330566..8dd4aa1485a6 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -167,7 +167,7 @@ static inline void udp_csum_pull_header(struct sk_buff *skb) typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb, __be16 sport, __be16 dport); -INDIRECT_CALLABLE_DECLARE(void udp_v6_early_demux(struct sk_buff *)); +void udp_v6_early_demux(struct sk_buff *skb); INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, @@ -238,7 +238,7 @@ static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - return inet_bound_dev_eq(!!net->ipv4.sysctl_udp_l3mdev_accept, + return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_udp_l3mdev_accept), bound_dev_if, dif, sdif); #else return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); diff --git a/include/trace/events/dlm.h b/include/trace/events/dlm.h index 32088c603244..bad21222130e 100644 --- a/include/trace/events/dlm.h +++ b/include/trace/events/dlm.h @@ -49,38 +49,52 @@ /* note: we begin tracing dlm_lock_start() only if ls and lkb are found */ TRACE_EVENT(dlm_lock_start, - TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, int mode, - __u32 flags), + TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, void *name, + unsigned int namelen, int mode, __u32 flags), - TP_ARGS(ls, lkb, mode, flags), + TP_ARGS(ls, lkb, name, namelen, mode, flags), TP_STRUCT__entry( __field(__u32, ls_id) __field(__u32, lkb_id) __field(int, mode) __field(__u32, flags) + __dynamic_array(unsigned char, res_name, + lkb->lkb_resource ? lkb->lkb_resource->res_length : namelen) ), TP_fast_assign( + struct dlm_rsb *r; + __entry->ls_id = ls->ls_global_id; __entry->lkb_id = lkb->lkb_id; __entry->mode = mode; __entry->flags = flags; + + r = lkb->lkb_resource; + if (r) + memcpy(__get_dynamic_array(res_name), r->res_name, + __get_dynamic_array_len(res_name)); + else if (name) + memcpy(__get_dynamic_array(res_name), name, + __get_dynamic_array_len(res_name)); ), - TP_printk("ls_id=%u lkb_id=%x mode=%s flags=%s", + TP_printk("ls_id=%u lkb_id=%x mode=%s flags=%s res_name=%s", __entry->ls_id, __entry->lkb_id, show_lock_mode(__entry->mode), - show_lock_flags(__entry->flags)) + show_lock_flags(__entry->flags), + __print_hex_str(__get_dynamic_array(res_name), + __get_dynamic_array_len(res_name))) ); TRACE_EVENT(dlm_lock_end, - TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, int mode, __u32 flags, - int error), + TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, void *name, + unsigned int namelen, int mode, __u32 flags, int error), - TP_ARGS(ls, lkb, mode, flags, error), + TP_ARGS(ls, lkb, name, namelen, mode, flags, error), TP_STRUCT__entry( __field(__u32, ls_id) @@ -88,14 +102,26 @@ TRACE_EVENT(dlm_lock_end, __field(int, mode) __field(__u32, flags) __field(int, error) + __dynamic_array(unsigned char, res_name, + lkb->lkb_resource ? lkb->lkb_resource->res_length : namelen) ), TP_fast_assign( + struct dlm_rsb *r; + __entry->ls_id = ls->ls_global_id; __entry->lkb_id = lkb->lkb_id; __entry->mode = mode; __entry->flags = flags; + r = lkb->lkb_resource; + if (r) + memcpy(__get_dynamic_array(res_name), r->res_name, + __get_dynamic_array_len(res_name)); + else if (name) + memcpy(__get_dynamic_array(res_name), name, + __get_dynamic_array_len(res_name)); + /* return value will be zeroed in those cases by dlm_lock() * we do it here again to not introduce more overhead if * trace isn't running and error reflects the return value. @@ -104,12 +130,15 @@ TRACE_EVENT(dlm_lock_end, __entry->error = 0; else __entry->error = error; + ), - TP_printk("ls_id=%u lkb_id=%x mode=%s flags=%s error=%d", + TP_printk("ls_id=%u lkb_id=%x mode=%s flags=%s error=%d res_name=%s", __entry->ls_id, __entry->lkb_id, show_lock_mode(__entry->mode), - show_lock_flags(__entry->flags), __entry->error) + show_lock_flags(__entry->flags), __entry->error, + __print_hex_str(__get_dynamic_array(res_name), + __get_dynamic_array_len(res_name))) ); @@ -123,42 +152,65 @@ TRACE_EVENT(dlm_bast, __field(__u32, ls_id) __field(__u32, lkb_id) __field(int, mode) + __dynamic_array(unsigned char, res_name, + lkb->lkb_resource ? lkb->lkb_resource->res_length : 0) ), TP_fast_assign( + struct dlm_rsb *r; + __entry->ls_id = ls->ls_global_id; __entry->lkb_id = lkb->lkb_id; __entry->mode = mode; + + r = lkb->lkb_resource; + if (r) + memcpy(__get_dynamic_array(res_name), r->res_name, + __get_dynamic_array_len(res_name)); ), - TP_printk("ls_id=%u lkb_id=%x mode=%s", __entry->ls_id, - __entry->lkb_id, show_lock_mode(__entry->mode)) + TP_printk("ls_id=%u lkb_id=%x mode=%s res_name=%s", + __entry->ls_id, __entry->lkb_id, + show_lock_mode(__entry->mode), + __print_hex_str(__get_dynamic_array(res_name), + __get_dynamic_array_len(res_name))) ); TRACE_EVENT(dlm_ast, - TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_lksb *lksb), + TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb), - TP_ARGS(ls, lkb, lksb), + TP_ARGS(ls, lkb), TP_STRUCT__entry( __field(__u32, ls_id) __field(__u32, lkb_id) __field(u8, sb_flags) __field(int, sb_status) + __dynamic_array(unsigned char, res_name, + lkb->lkb_resource ? lkb->lkb_resource->res_length : 0) ), TP_fast_assign( + struct dlm_rsb *r; + __entry->ls_id = ls->ls_global_id; __entry->lkb_id = lkb->lkb_id; - __entry->sb_flags = lksb->sb_flags; - __entry->sb_status = lksb->sb_status; + __entry->sb_flags = lkb->lkb_lksb->sb_flags; + __entry->sb_status = lkb->lkb_lksb->sb_status; + + r = lkb->lkb_resource; + if (r) + memcpy(__get_dynamic_array(res_name), r->res_name, + __get_dynamic_array_len(res_name)); ), - TP_printk("ls_id=%u lkb_id=%x sb_flags=%s sb_status=%d", + TP_printk("ls_id=%u lkb_id=%x sb_flags=%s sb_status=%d res_name=%s", __entry->ls_id, __entry->lkb_id, - show_dlm_sb_flags(__entry->sb_flags), __entry->sb_status) + show_dlm_sb_flags(__entry->sb_flags), __entry->sb_status, + __print_hex_str(__get_dynamic_array(res_name), + __get_dynamic_array_len(res_name))) ); @@ -173,17 +225,28 @@ TRACE_EVENT(dlm_unlock_start, __field(__u32, ls_id) __field(__u32, lkb_id) __field(__u32, flags) + __dynamic_array(unsigned char, res_name, + lkb->lkb_resource ? lkb->lkb_resource->res_length : 0) ), TP_fast_assign( + struct dlm_rsb *r; + __entry->ls_id = ls->ls_global_id; __entry->lkb_id = lkb->lkb_id; __entry->flags = flags; + + r = lkb->lkb_resource; + if (r) + memcpy(__get_dynamic_array(res_name), r->res_name, + __get_dynamic_array_len(res_name)); ), - TP_printk("ls_id=%u lkb_id=%x flags=%s", + TP_printk("ls_id=%u lkb_id=%x flags=%s res_name=%s", __entry->ls_id, __entry->lkb_id, - show_lock_flags(__entry->flags)) + show_lock_flags(__entry->flags), + __print_hex_str(__get_dynamic_array(res_name), + __get_dynamic_array_len(res_name))) ); @@ -199,18 +262,29 @@ TRACE_EVENT(dlm_unlock_end, __field(__u32, lkb_id) __field(__u32, flags) __field(int, error) + __dynamic_array(unsigned char, res_name, + lkb->lkb_resource ? lkb->lkb_resource->res_length : 0) ), TP_fast_assign( + struct dlm_rsb *r; + __entry->ls_id = ls->ls_global_id; __entry->lkb_id = lkb->lkb_id; __entry->flags = flags; __entry->error = error; + + r = lkb->lkb_resource; + if (r) + memcpy(__get_dynamic_array(res_name), r->res_name, + __get_dynamic_array_len(res_name)); ), - TP_printk("ls_id=%u lkb_id=%x flags=%s error=%d", + TP_printk("ls_id=%u lkb_id=%x flags=%s error=%d res_name=%s", __entry->ls_id, __entry->lkb_id, - show_lock_flags(__entry->flags), __entry->error) + show_lock_flags(__entry->flags), __entry->error, + __print_hex_str(__get_dynamic_array(res_name), + __get_dynamic_array_len(res_name))) ); diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h index f13d37b60775..1ecdb911add8 100644 --- a/include/uapi/asm-generic/fcntl.h +++ b/include/uapi/asm-generic/fcntl.h @@ -192,6 +192,7 @@ struct f_owner_ex { #define F_LINUX_SPECIFIC_BASE 1024 +#ifndef HAVE_ARCH_STRUCT_FLOCK struct flock { short l_type; short l_whence; @@ -216,5 +217,6 @@ struct flock64 { __ARCH_FLOCK64_PAD #endif }; +#endif /* HAVE_ARCH_STRUCT_FLOCK */ #endif /* _ASM_GENERIC_FCNTL_H */ diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index f1f89132d60e..d8536d77fea1 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -83,12 +83,20 @@ #define FAN_MARK_FLUSH 0x00000080 /* FAN_MARK_FILESYSTEM is 0x00000100 */ #define FAN_MARK_EVICTABLE 0x00000200 +/* This bit is mutually exclusive with FAN_MARK_IGNORED_MASK bit */ +#define FAN_MARK_IGNORE 0x00000400 /* These are NOT bitwise flags. Both bits can be used togther. */ #define FAN_MARK_INODE 0x00000000 #define FAN_MARK_MOUNT 0x00000010 #define FAN_MARK_FILESYSTEM 0x00000100 +/* + * Convenience macro - FAN_MARK_IGNORE requires FAN_MARK_IGNORED_SURV_MODIFY + * for non-inode mark types. + */ +#define FAN_MARK_IGNORE_SURV (FAN_MARK_IGNORE | FAN_MARK_IGNORED_SURV_MODIFY) + /* Deprecated - do not use this in programs and do not add new flags here! */ #define FAN_ALL_MARK_FLAGS (FAN_MARK_ADD |\ FAN_MARK_REMOVE |\ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 811897dadcae..860f867c50c0 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -2084,7 +2084,7 @@ struct kvm_stats_header { #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_BOOLEAN (0x4 << KVM_STATS_UNIT_SHIFT) -#define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES +#define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_BOOLEAN #define KVM_STATS_BASE_SHIFT 8 #define KVM_STATS_BASE_MASK (0xF << KVM_STATS_BASE_SHIFT) diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config index dcd86f32f4ed..6fac5b405334 100644 --- a/kernel/configs/x86_debug.config +++ b/kernel/configs/x86_debug.config @@ -7,12 +7,11 @@ CONFIG_DEBUG_SLAB=y CONFIG_DEBUG_KMEMLEAK=y CONFIG_DEBUG_PAGEALLOC=y CONFIG_SLUB_DEBUG_ON=y -CONFIG_KMEMCHECK=y CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1 CONFIG_GCOV_KERNEL=y CONFIG_LOCKDEP=y CONFIG_PROVE_LOCKING=y CONFIG_SCHEDSTATS=y -CONFIG_VMLINUX_VALIDATION=y +CONFIG_NOINSTR_VALIDATION=y CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 9d1db4a54d34..65f0262f635e 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -335,8 +335,6 @@ struct rwsem_waiter { struct task_struct *task; enum rwsem_waiter_type type; unsigned long timeout; - - /* Writer only, not initialized in reader */ bool handoff_set; }; #define rwsem_first_waiter(sem) \ @@ -459,10 +457,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * to give up the lock), request a HANDOFF to * force the issue. */ - if (!(oldcount & RWSEM_FLAG_HANDOFF) && - time_after(jiffies, waiter->timeout)) { - adjustment -= RWSEM_FLAG_HANDOFF; - lockevent_inc(rwsem_rlock_handoff); + if (time_after(jiffies, waiter->timeout)) { + if (!(oldcount & RWSEM_FLAG_HANDOFF)) { + adjustment -= RWSEM_FLAG_HANDOFF; + lockevent_inc(rwsem_rlock_handoff); + } + waiter->handoff_set = true; } atomic_long_add(-adjustment, &sem->count); @@ -599,7 +599,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, struct rwsem_waiter *waiter) { - bool first = rwsem_first_waiter(sem) == waiter; + struct rwsem_waiter *first = rwsem_first_waiter(sem); long count, new; lockdep_assert_held(&sem->wait_lock); @@ -609,11 +609,20 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); if (has_handoff) { - if (!first) + /* + * Honor handoff bit and yield only when the first + * waiter is the one that set it. Otherwisee, we + * still try to acquire the rwsem. + */ + if (first->handoff_set && (waiter != first)) return false; - /* First waiter inherits a previously set handoff bit */ - waiter->handoff_set = true; + /* + * First waiter can inherit a previously set handoff + * bit and spin on rwsem if lock acquisition fails. + */ + if (waiter == first) + waiter->handoff_set = true; } new = count; @@ -1027,6 +1036,7 @@ queue: waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; + waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) { diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 50ba70f019de..1c304fec89c0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -511,10 +511,52 @@ static bool srcu_readers_active(struct srcu_struct *ssp) return sum; } -#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending. -#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers. -#define SRCU_MAX_NODELAY_PHASE 1 // Maximum per-GP-phase consecutive no-delay instances. -#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances. +/* + * We use an adaptive strategy for synchronize_srcu() and especially for + * synchronize_srcu_expedited(). We spin for a fixed time period + * (defined below, boot time configurable) to allow SRCU readers to exit + * their read-side critical sections. If there are still some readers + * after one jiffy, we repeatedly block for one jiffy time periods. + * The blocking time is increased as the grace-period age increases, + * with max blocking time capped at 10 jiffies. + */ +#define SRCU_DEFAULT_RETRY_CHECK_DELAY 5 + +static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY; +module_param(srcu_retry_check_delay, ulong, 0444); + +#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending. +#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers. + +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase + // no-delay instances. +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase + // no-delay instances. + +#define SRCU_UL_CLAMP_LO(val, low) ((val) > (low) ? (val) : (low)) +#define SRCU_UL_CLAMP_HI(val, high) ((val) < (high) ? (val) : (high)) +#define SRCU_UL_CLAMP(val, low, high) SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high)) +// per-GP-phase no-delay instances adjusted to allow non-sleeping poll upto +// one jiffies time duration. Mult by 2 is done to factor in the srcu_get_delay() +// called from process_srcu(). +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \ + (2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY) + +// Maximum per-GP-phase consecutive no-delay instances. +#define SRCU_DEFAULT_MAX_NODELAY_PHASE \ + SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \ + SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \ + SRCU_DEFAULT_MAX_NODELAY_PHASE_HI) + +static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE; +module_param(srcu_max_nodelay_phase, ulong, 0444); + +// Maximum consecutive no-delay instances. +#define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \ + SRCU_DEFAULT_MAX_NODELAY_PHASE : 100) + +static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY; +module_param(srcu_max_nodelay, ulong, 0444); /* * Return grace-period delay, zero if there are expedited grace @@ -522,16 +564,22 @@ static bool srcu_readers_active(struct srcu_struct *ssp) */ static unsigned long srcu_get_delay(struct srcu_struct *ssp) { + unsigned long gpstart; + unsigned long j; unsigned long jbase = SRCU_INTERVAL; if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) jbase = 0; - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) - jbase += jiffies - READ_ONCE(ssp->srcu_gp_start); - if (!jbase) { - WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); - if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE) - jbase = 1; + if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) { + j = jiffies - 1; + gpstart = READ_ONCE(ssp->srcu_gp_start); + if (time_after(j, gpstart)) + jbase += j - gpstart; + if (!jbase) { + WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); + if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) + jbase = 1; + } } return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase; } @@ -607,15 +655,6 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* - * We use an adaptive strategy for synchronize_srcu() and especially for - * synchronize_srcu_expedited(). We spin for a fixed time period - * (defined below) to allow SRCU readers to exit their read-side critical - * sections. If there are still some readers after a few microseconds, - * we repeatedly block for 1-millisecond time periods. - */ -#define SRCU_RETRY_CHECK_DELAY 5 - -/* * Start an SRCU grace period. */ static void srcu_gp_start(struct srcu_struct *ssp) @@ -700,7 +739,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp */ static void srcu_gp_end(struct srcu_struct *ssp) { - unsigned long cbdelay; + unsigned long cbdelay = 1; bool cbs; bool last_lvl; int cpu; @@ -720,7 +759,9 @@ static void srcu_gp_end(struct srcu_struct *ssp) spin_lock_irq_rcu_node(ssp); idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - cbdelay = !!srcu_get_delay(ssp); + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + cbdelay = 0; + WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); @@ -921,12 +962,16 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, */ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { + unsigned long curdelay; + + curdelay = !srcu_get_delay(ssp); + for (;;) { if (srcu_readers_active_idx_check(ssp, idx)) return true; - if (--trycount + !srcu_get_delay(ssp) <= 0) + if ((--trycount + curdelay) <= 0) return false; - udelay(SRCU_RETRY_CHECK_DELAY); + udelay(srcu_retry_check_delay); } } @@ -1582,7 +1627,7 @@ static void process_srcu(struct work_struct *work) j = jiffies; if (READ_ONCE(ssp->reschedule_jiffies) == j) { WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1); - if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY) + if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay) curdelay = 1; } else { WRITE_ONCE(ssp->reschedule_count, 1); @@ -1674,6 +1719,11 @@ static int __init srcu_bootup_announce(void) pr_info("Hierarchical SRCU implementation.\n"); if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF) pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff); + if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY) + pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay); + if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY) + pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay); + pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase); return 0; } early_initcall(srcu_bootup_announce); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b5152961b743..7bf561262cb8 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1701,7 +1701,10 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * the throttle. */ p->dl.dl_throttled = 0; - BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH); + if (!(flags & ENQUEUE_REPLENISH)) + printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n", + task_pid_nr(p)); + return; } diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 230038d4f908..59ddb00d6944 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -34,6 +34,27 @@ MODULE_LICENSE("GPL"); #define WATCH_QUEUE_NOTE_SIZE 128 #define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE) +/* + * This must be called under the RCU read-lock, which makes + * sure that the wqueue still exists. It can then take the lock, + * and check that the wqueue hasn't been destroyed, which in + * turn makes sure that the notification pipe still exists. + */ +static inline bool lock_wqueue(struct watch_queue *wqueue) +{ + spin_lock_bh(&wqueue->lock); + if (unlikely(wqueue->defunct)) { + spin_unlock_bh(&wqueue->lock); + return false; + } + return true; +} + +static inline void unlock_wqueue(struct watch_queue *wqueue) +{ + spin_unlock_bh(&wqueue->lock); +} + static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -69,6 +90,10 @@ static const struct pipe_buf_operations watch_queue_pipe_buf_ops = { /* * Post a notification to a watch queue. + * + * Must be called with the RCU lock for reading, and the + * watch_queue lock held, which guarantees that the pipe + * hasn't been released. */ static bool post_one_notification(struct watch_queue *wqueue, struct watch_notification *n) @@ -85,9 +110,6 @@ static bool post_one_notification(struct watch_queue *wqueue, spin_lock_irq(&pipe->rd_wait.lock); - if (wqueue->defunct) - goto out; - mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; @@ -203,7 +225,10 @@ void __post_watch_notification(struct watch_list *wlist, if (security_post_notification(watch->cred, cred, n) < 0) continue; - post_one_notification(wqueue, n); + if (lock_wqueue(wqueue)) { + post_one_notification(wqueue, n); + unlock_wqueue(wqueue); + } } rcu_read_unlock(); @@ -429,6 +454,33 @@ void init_watch(struct watch *watch, struct watch_queue *wqueue) rcu_assign_pointer(watch->queue, wqueue); } +static int add_one_watch(struct watch *watch, struct watch_list *wlist, struct watch_queue *wqueue) +{ + const struct cred *cred; + struct watch *w; + + hlist_for_each_entry(w, &wlist->watchers, list_node) { + struct watch_queue *wq = rcu_access_pointer(w->queue); + if (wqueue == wq && watch->id == w->id) + return -EBUSY; + } + + cred = current_cred(); + if (atomic_inc_return(&cred->user->nr_watches) > task_rlimit(current, RLIMIT_NOFILE)) { + atomic_dec(&cred->user->nr_watches); + return -EAGAIN; + } + + watch->cred = get_cred(cred); + rcu_assign_pointer(watch->watch_list, wlist); + + kref_get(&wqueue->usage); + kref_get(&watch->usage); + hlist_add_head(&watch->queue_node, &wqueue->watches); + hlist_add_head_rcu(&watch->list_node, &wlist->watchers); + return 0; +} + /** * add_watch_to_object - Add a watch on an object to a watch list * @watch: The watch to add @@ -443,33 +495,21 @@ void init_watch(struct watch *watch, struct watch_queue *wqueue) */ int add_watch_to_object(struct watch *watch, struct watch_list *wlist) { - struct watch_queue *wqueue = rcu_access_pointer(watch->queue); - struct watch *w; - - hlist_for_each_entry(w, &wlist->watchers, list_node) { - struct watch_queue *wq = rcu_access_pointer(w->queue); - if (wqueue == wq && watch->id == w->id) - return -EBUSY; - } + struct watch_queue *wqueue; + int ret = -ENOENT; - watch->cred = get_current_cred(); - rcu_assign_pointer(watch->watch_list, wlist); + rcu_read_lock(); - if (atomic_inc_return(&watch->cred->user->nr_watches) > - task_rlimit(current, RLIMIT_NOFILE)) { - atomic_dec(&watch->cred->user->nr_watches); - put_cred(watch->cred); - return -EAGAIN; + wqueue = rcu_access_pointer(watch->queue); + if (lock_wqueue(wqueue)) { + spin_lock(&wlist->lock); + ret = add_one_watch(watch, wlist, wqueue); + spin_unlock(&wlist->lock); + unlock_wqueue(wqueue); } - spin_lock_bh(&wqueue->lock); - kref_get(&wqueue->usage); - kref_get(&watch->usage); - hlist_add_head(&watch->queue_node, &wqueue->watches); - spin_unlock_bh(&wqueue->lock); - - hlist_add_head(&watch->list_node, &wlist->watchers); - return 0; + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(add_watch_to_object); @@ -520,20 +560,15 @@ found: wqueue = rcu_dereference(watch->queue); - /* We don't need the watch list lock for the next bit as RCU is - * protecting *wqueue from deallocation. - */ - if (wqueue) { + if (lock_wqueue(wqueue)) { post_one_notification(wqueue, &n.watch); - spin_lock_bh(&wqueue->lock); - if (!hlist_unhashed(&watch->queue_node)) { hlist_del_init_rcu(&watch->queue_node); put_watch(watch); } - spin_unlock_bh(&wqueue->lock); + unlock_wqueue(wqueue); } if (wlist->release_watch) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1ea50f6be843..aa8a82bc6738 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5001,7 +5001,10 @@ static void unbind_workers(int cpu) for_each_pool_worker(worker, pool) { kthread_set_per_cpu(worker->task, -1); - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); + if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); + else + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); } mutex_unlock(&wq_pool_attach_mutex); @@ -87,7 +87,8 @@ retry: * belongs to this folio. */ if (unlikely(page_folio(page) != folio)) { - folio_put_refs(folio, refs); + if (!put_devmap_managed_page_refs(&folio->page, refs)) + folio_put_refs(folio, refs); goto retry; } @@ -176,7 +177,8 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) refs *= GUP_PIN_COUNTING_BIAS; } - folio_put_refs(folio, refs); + if (!put_devmap_managed_page_refs(&folio->page, refs)) + folio_put_refs(folio, refs); } /** @@ -212,14 +212,6 @@ int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, unsigned long end, unsigned long hmm_pfns[], pmd_t pmd); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline bool hmm_is_device_private_entry(struct hmm_range *range, - swp_entry_t entry) -{ - return is_device_private_entry(entry) && - pfn_swap_entry_to_page(entry)->pgmap->owner == - range->dev_private_owner; -} - static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) { @@ -252,10 +244,12 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, swp_entry_t entry = pte_to_swp_entry(pte); /* - * Never fault in device private pages, but just report - * the PFN even if not present. + * Don't fault in device private pages owned by the caller, + * just report the PFN. */ - if (hmm_is_device_private_entry(range, entry)) { + if (is_device_private_entry(entry) && + pfn_swap_entry_to_page(entry)->pgmap->owner == + range->dev_private_owner) { cpu_flags = HMM_PFN_VALID; if (is_writable_device_private_entry(entry)) cpu_flags |= HMM_PFN_WRITE; @@ -273,6 +267,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (!non_swap_entry(entry)) goto fault; + if (is_device_private_entry(entry)) + goto fault; + if (is_device_exclusive_entry(entry)) goto fault; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a57e1be41401..a18c071c294e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4788,8 +4788,13 @@ again: * sharing with another vma. */ ; - } else if (unlikely(is_hugetlb_entry_migration(entry) || - is_hugetlb_entry_hwpoisoned(entry))) { + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { + bool uffd_wp = huge_pte_uffd_wp(entry); + + if (!userfaultfd_wp(dst_vma) && uffd_wp) + entry = huge_pte_clear_uffd_wp(entry); + set_huge_pte_at(dst, addr, dst_pte, entry); + } else if (unlikely(is_hugetlb_entry_migration(entry))) { swp_entry_t swp_entry = pte_to_swp_entry(entry); bool uffd_wp = huge_pte_uffd_wp(entry); @@ -5947,6 +5952,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, page = alloc_huge_page(dst_vma, dst_addr, 0); if (IS_ERR(page)) { + put_page(*pagep); ret = -ENOMEM; *pagep = NULL; goto out; diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 4b5e5a3d3a63..6aff49f6b79e 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -603,14 +603,6 @@ static unsigned long kfence_init_pool(void) addr += 2 * PAGE_SIZE; } - /* - * The pool is live and will never be deallocated from this point on. - * Remove the pool object from the kmemleak object tree, as it would - * otherwise overlap with allocations returned by kfence_alloc(), which - * are registered with kmemleak through the slab post-alloc hook. - */ - kmemleak_free(__kfence_pool); - return 0; } @@ -623,8 +615,16 @@ static bool __init kfence_init_pool_early(void) addr = kfence_init_pool(); - if (!addr) + if (!addr) { + /* + * The pool is live and will never be deallocated from this point on. + * Ignore the pool object from the kmemleak phys object tree, as it would + * otherwise overlap with allocations returned by kfence_alloc(), which + * are registered with kmemleak through the slab post-alloc hook. + */ + kmemleak_ignore_phys(__pa(__kfence_pool)); return true; + } /* * Only release unprotected pages, and do not try to go back and change diff --git a/mm/memory.c b/mm/memory.c index 4cf7d4b6c950..1c6027adc542 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3043,7 +3043,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) pte_t entry; VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); - VM_BUG_ON(PageAnon(page) && !PageAnonExclusive(page)); + VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page)); /* * Clear the pages cpupid information as the existing @@ -4369,9 +4369,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return VM_FAULT_OOM; } - /* See comment in handle_pte_fault() */ + /* + * See comment in handle_pte_fault() for how this scenario happens, we + * need to return NOPAGE so that we drop this page. + */ if (pmd_devmap_trans_unstable(vmf->pmd)) - return 0; + return VM_FAULT_NOPAGE; vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); diff --git a/mm/memremap.c b/mm/memremap.c index b870a659eee6..745eea0f99c3 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -499,7 +499,7 @@ void free_zone_device_page(struct page *page) } #ifdef CONFIG_FS_DAX -bool __put_devmap_managed_page(struct page *page) +bool __put_devmap_managed_page_refs(struct page *page, int refs) { if (page->pgmap->type != MEMORY_DEVICE_FS_DAX) return false; @@ -509,9 +509,9 @@ bool __put_devmap_managed_page(struct page *page) * refcount is 1, then the page is free and the refcount is * stable because nobody holds a reference on the page. */ - if (page_ref_dec_return(page) == 1) + if (page_ref_sub_return(page, refs) == 1) wake_up_var(&page->_refcount); return true; } -EXPORT_SYMBOL(__put_devmap_managed_page); +EXPORT_SYMBOL(__put_devmap_managed_page_refs); #endif /* CONFIG_FS_DAX */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e008a3df0485..b5b14b78c4fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3968,11 +3968,15 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * need to be calculated. */ if (!order) { - long fast_free; + long usable_free; + long reserved; - fast_free = free_pages; - fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags); - if (fast_free > mark + z->lowmem_reserve[highest_zoneidx]) + usable_free = free_pages; + reserved = __zone_watermark_unusable_free(z, 0, alloc_flags); + + /* reserved may over estimate high-atomic reserves. */ + usable_free -= min(usable_free, reserved); + if (usable_free > mark + z->lowmem_reserve[highest_zoneidx]) return true; } diff --git a/mm/secretmem.c b/mm/secretmem.c index 206ed6b40c1d..f06279d6190a 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -55,22 +55,28 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf) gfp_t gfp = vmf->gfp_mask; unsigned long addr; struct page *page; + vm_fault_t ret; int err; if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return vmf_error(-EINVAL); + filemap_invalidate_lock_shared(mapping); + retry: page = find_lock_page(mapping, offset); if (!page) { page = alloc_page(gfp | __GFP_ZERO); - if (!page) - return VM_FAULT_OOM; + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } err = set_direct_map_invalid_noflush(page); if (err) { put_page(page); - return vmf_error(err); + ret = vmf_error(err); + goto out; } __SetPageUptodate(page); @@ -86,7 +92,8 @@ retry: if (err == -EEXIST) goto retry; - return vmf_error(err); + ret = vmf_error(err); + goto out; } addr = (unsigned long)page_address(page); @@ -94,7 +101,11 @@ retry: } vmf->page = page; - return VM_FAULT_LOCKED; + ret = VM_FAULT_LOCKED; + +out: + filemap_invalidate_unlock_shared(mapping); + return ret; } static const struct vm_operations_struct secretmem_vm_ops = { @@ -162,12 +173,20 @@ static int secretmem_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); + struct address_space *mapping = inode->i_mapping; unsigned int ia_valid = iattr->ia_valid; + int ret; + + filemap_invalidate_lock(mapping); if ((ia_valid & ATTR_SIZE) && inode->i_size) - return -EINVAL; + ret = -EINVAL; + else + ret = simple_setattr(mnt_userns, dentry, iattr); - return simple_setattr(mnt_userns, dentry, iattr); + filemap_invalidate_unlock(mapping); + + return ret; } static const struct inode_operations secretmem_iops = { diff --git a/mm/shmem.c b/mm/shmem.c index a6f565308133..b7f2d4a56867 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3392,7 +3392,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) break; case Opt_nr_blocks: ctx->blocks = memparse(param->string, &rest); - if (*rest) + if (*rest || ctx->blocks > S64_MAX) goto bad_value; ctx->seen |= SHMEM_SEEN_BLOCKS; break; @@ -3514,10 +3514,7 @@ static int shmem_reconfigure(struct fs_context *fc) raw_spin_lock(&sbinfo->stat_lock); inodes = sbinfo->max_inodes - sbinfo->free_inodes; - if (ctx->blocks > S64_MAX) { - err = "Number of blocks too large"; - goto out; - } + if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { err = "Cannot retroactively limit size"; diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 1739e8cb3291..c17021642234 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4973,6 +4973,9 @@ int hci_suspend_sync(struct hci_dev *hdev) return err; } + /* Update event mask so only the allowed event can wakeup the host */ + hci_set_event_mask_sync(hdev); + /* Only configure accept list if disconnect succeeded and wake * isn't being prevented. */ @@ -4984,9 +4987,6 @@ int hci_suspend_sync(struct hci_dev *hdev) /* Unpause to take care of updating scanning params */ hdev->scanning_paused = false; - /* Update event mask so only the allowed event can wakeup the host */ - hci_set_event_mask_sync(hdev); - /* Enable event filter for paired devices */ hci_update_event_filter_sync(hdev); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index ae78490ecd3d..52668662ae8d 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -111,7 +111,8 @@ static struct l2cap_chan *__l2cap_get_chan_by_scid(struct l2cap_conn *conn, } /* Find channel with given SCID. - * Returns locked channel. */ + * Returns a reference locked channel. + */ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn, u16 cid) { @@ -119,15 +120,19 @@ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn, mutex_lock(&conn->chan_lock); c = __l2cap_get_chan_by_scid(conn, cid); - if (c) - l2cap_chan_lock(c); + if (c) { + /* Only lock if chan reference is not 0 */ + c = l2cap_chan_hold_unless_zero(c); + if (c) + l2cap_chan_lock(c); + } mutex_unlock(&conn->chan_lock); return c; } /* Find channel with given DCID. - * Returns locked channel. + * Returns a reference locked channel. */ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn, u16 cid) @@ -136,8 +141,12 @@ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn, mutex_lock(&conn->chan_lock); c = __l2cap_get_chan_by_dcid(conn, cid); - if (c) - l2cap_chan_lock(c); + if (c) { + /* Only lock if chan reference is not 0 */ + c = l2cap_chan_hold_unless_zero(c); + if (c) + l2cap_chan_lock(c); + } mutex_unlock(&conn->chan_lock); return c; @@ -162,8 +171,12 @@ static struct l2cap_chan *l2cap_get_chan_by_ident(struct l2cap_conn *conn, mutex_lock(&conn->chan_lock); c = __l2cap_get_chan_by_ident(conn, ident); - if (c) - l2cap_chan_lock(c); + if (c) { + /* Only lock if chan reference is not 0 */ + c = l2cap_chan_hold_unless_zero(c); + if (c) + l2cap_chan_lock(c); + } mutex_unlock(&conn->chan_lock); return c; @@ -497,6 +510,16 @@ void l2cap_chan_hold(struct l2cap_chan *c) kref_get(&c->kref); } +struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c) +{ + BT_DBG("chan %p orig refcnt %u", c, kref_read(&c->kref)); + + if (!kref_get_unless_zero(&c->kref)) + return NULL; + + return c; +} + void l2cap_chan_put(struct l2cap_chan *c) { BT_DBG("chan %p orig refcnt %u", c, kref_read(&c->kref)); @@ -1968,7 +1991,10 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, src_match = !bacmp(&c->src, src); dst_match = !bacmp(&c->dst, dst); if (src_match && dst_match) { - l2cap_chan_hold(c); + c = l2cap_chan_hold_unless_zero(c); + if (!c) + continue; + read_unlock(&chan_list_lock); return c; } @@ -1983,7 +2009,7 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, } if (c1) - l2cap_chan_hold(c1); + c1 = l2cap_chan_hold_unless_zero(c1); read_unlock(&chan_list_lock); @@ -4463,6 +4489,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, unlock: l2cap_chan_unlock(chan); + l2cap_chan_put(chan); return err; } @@ -4577,6 +4604,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, done: l2cap_chan_unlock(chan); + l2cap_chan_put(chan); return err; } @@ -5304,6 +5332,7 @@ send_move_response: l2cap_send_move_chan_rsp(chan, result); l2cap_chan_unlock(chan); + l2cap_chan_put(chan); return 0; } @@ -5396,6 +5425,7 @@ static void l2cap_move_continue(struct l2cap_conn *conn, u16 icid, u16 result) } l2cap_chan_unlock(chan); + l2cap_chan_put(chan); } static void l2cap_move_fail(struct l2cap_conn *conn, u8 ident, u16 icid, @@ -5425,6 +5455,7 @@ static void l2cap_move_fail(struct l2cap_conn *conn, u8 ident, u16 icid, l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED); l2cap_chan_unlock(chan); + l2cap_chan_put(chan); } static int l2cap_move_channel_rsp(struct l2cap_conn *conn, @@ -5488,6 +5519,7 @@ static int l2cap_move_channel_confirm(struct l2cap_conn *conn, l2cap_send_move_chan_cfm_rsp(conn, cmd->ident, icid); l2cap_chan_unlock(chan); + l2cap_chan_put(chan); return 0; } @@ -5523,6 +5555,7 @@ static inline int l2cap_move_channel_confirm_rsp(struct l2cap_conn *conn, } l2cap_chan_unlock(chan); + l2cap_chan_put(chan); return 0; } @@ -5895,12 +5928,11 @@ static inline int l2cap_le_credits(struct l2cap_conn *conn, if (credits > max_credits) { BT_ERR("LE credits overflow"); l2cap_send_disconn_req(chan, ECONNRESET); - l2cap_chan_unlock(chan); /* Return 0 so that we don't trigger an unnecessary * command reject packet. */ - return 0; + goto unlock; } chan->tx_credits += credits; @@ -5911,7 +5943,9 @@ static inline int l2cap_le_credits(struct l2cap_conn *conn, if (chan->tx_credits) chan->ops->resume(chan); +unlock: l2cap_chan_unlock(chan); + l2cap_chan_put(chan); return 0; } @@ -7597,6 +7631,7 @@ drop: done: l2cap_chan_unlock(chan); + l2cap_chan_put(chan); } static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm, @@ -8085,7 +8120,7 @@ static struct l2cap_chan *l2cap_global_fixed_chan(struct l2cap_chan *c, if (src_type != c->src_type) continue; - l2cap_chan_hold(c); + c = l2cap_chan_hold_unless_zero(c); read_unlock(&chan_list_lock); return c; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index ae758ab1b558..2f91a8c2b678 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4723,7 +4723,6 @@ static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, else status = MGMT_STATUS_FAILED; - mgmt_pending_remove(cmd); goto unlock; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index bb01776d2d88..c96509c442a5 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -589,9 +589,13 @@ static int br_fill_ifinfo(struct sk_buff *skb, } done: + if (af) { + if (nlmsg_get_pos(skb) - (void *)af > nla_attr_size(0)) + nla_nest_end(skb, af); + else + nla_nest_cancel(skb, af); + } - if (af) - nla_nest_end(skb, af); nlmsg_end(skb, nlh); return 0; diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 251e666ba9a2..748be7253248 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -47,7 +47,7 @@ enum caif_states { struct caifsock { struct sock sk; /* must be first member */ struct cflayer layer; - u32 flow_state; + unsigned long flow_state; struct caif_connect_request conn_req; struct mutex readlock; struct dentry *debugfs_socket_dir; @@ -56,38 +56,32 @@ struct caifsock { static int rx_flow_is_on(struct caifsock *cf_sk) { - return test_bit(RX_FLOW_ON_BIT, - (void *) &cf_sk->flow_state); + return test_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state); } static int tx_flow_is_on(struct caifsock *cf_sk) { - return test_bit(TX_FLOW_ON_BIT, - (void *) &cf_sk->flow_state); + return test_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state); } static void set_rx_flow_off(struct caifsock *cf_sk) { - clear_bit(RX_FLOW_ON_BIT, - (void *) &cf_sk->flow_state); + clear_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state); } static void set_rx_flow_on(struct caifsock *cf_sk) { - set_bit(RX_FLOW_ON_BIT, - (void *) &cf_sk->flow_state); + set_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state); } static void set_tx_flow_off(struct caifsock *cf_sk) { - clear_bit(TX_FLOW_ON_BIT, - (void *) &cf_sk->flow_state); + clear_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state); } static void set_tx_flow_on(struct caifsock *cf_sk) { - set_bit(TX_FLOW_ON_BIT, - (void *) &cf_sk->flow_state); + set_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state); } static void caif_read_lock(struct sock *sk) diff --git a/net/core/filter.c b/net/core/filter.c index 2a6a0b0ce43e..7950f7520765 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7041,7 +7041,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) return -EINVAL; - if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) return -EINVAL; if (!th->ack || th->rst || th->syn) @@ -7116,7 +7116,7 @@ BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) return -EINVAL; - if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) return -ENOENT; if (!th->syn || th->ack || th->fin || th->rst) diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 5f85e01d4093..b0ff6153be62 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -64,7 +64,7 @@ u32 secure_tcpv6_ts_off(const struct net *net, .daddr = *(struct in6_addr *)daddr, }; - if (net->ipv4.sysctl_tcp_timestamps != 1) + if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) return 0; ts_secret_init(); @@ -120,7 +120,7 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #ifdef CONFIG_INET u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr) { - if (net->ipv4.sysctl_tcp_timestamps != 1) + if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) return 0; ts_secret_init(); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 3f00a28fe762..5daa1fa54249 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -387,7 +387,7 @@ void reuseport_stop_listen_sock(struct sock *sk) prog = rcu_dereference_protected(reuse->prog, lockdep_is_held(&reuseport_lock)); - if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req || + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req) || (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) { /* Migration capable, move sk from the listening section * to the closed section. @@ -545,7 +545,7 @@ struct sock *reuseport_migrate_sock(struct sock *sk, hash = migrating_sk->sk_hash; prog = rcu_dereference(reuse->prog); if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) { - if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req)) goto select_by_hash; goto failure; } diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index dc92a67baea3..7d542eb46172 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -480,8 +480,8 @@ static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gf sk->sk_family = PF_DECnet; sk->sk_protocol = 0; sk->sk_allocation = gfp; - sk->sk_sndbuf = sysctl_decnet_wmem[1]; - sk->sk_rcvbuf = sysctl_decnet_rmem[1]; + sk->sk_sndbuf = READ_ONCE(sysctl_decnet_wmem[1]); + sk->sk_rcvbuf = READ_ONCE(sysctl_decnet_rmem[1]); /* Initialization of DECnet Session Control Port */ scp = DN_SK(sk); diff --git a/net/dsa/port.c b/net/dsa/port.c index 3738f2d40a0b..2dd76eb1621c 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -248,6 +248,7 @@ static void dsa_port_reset_vlan_filtering(struct dsa_port *dp, struct netlink_ext_ack extack = {0}; bool change_vlan_filtering = false; struct dsa_switch *ds = dp->ds; + struct dsa_port *other_dp; bool vlan_filtering; int err; @@ -270,8 +271,8 @@ static void dsa_port_reset_vlan_filtering(struct dsa_port *dp, * VLAN-aware bridge. */ if (change_vlan_filtering && ds->vlan_filtering_is_global) { - dsa_switch_for_each_port(dp, ds) { - struct net_device *br = dsa_port_bridge_dev_get(dp); + dsa_switch_for_each_port(other_dp, ds) { + struct net_device *br = dsa_port_bridge_dev_get(other_dp); if (br && br_vlan_enabled(br)) { change_vlan_filtering = false; @@ -799,7 +800,7 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, ds->vlan_filtering = vlan_filtering; dsa_switch_for_each_user_port(other_dp, ds) { - struct net_device *slave = dp->slave; + struct net_device *slave = other_dp->slave; /* We might be called in the unbind path, so not * all slave devices might still be registered. diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 2b56218fc57c..4dfd68cf61c5 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -344,6 +344,7 @@ static int dsa_switch_do_lag_fdb_add(struct dsa_switch *ds, struct dsa_lag *lag, ether_addr_copy(a->addr, addr); a->vid = vid; + a->db = db; refcount_set(&a->refcount, 1); list_add_tail(&a->list, &lag->fdbs); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ac67f6b4ec70..252c8bceaba4 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -217,7 +217,7 @@ int inet_listen(struct socket *sock, int backlog) * because the socket was in TCP_LISTEN state previously but * was shutdown() rather than close(). */ - tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen); if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { @@ -335,7 +335,7 @@ lookup_protocol: inet->hdrincl = 1; } - if (net->ipv4.sysctl_ip_no_pmtu_disc) + if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; @@ -1710,24 +1710,14 @@ static const struct net_protocol igmp_protocol = { }; #endif -/* thinking of making this const? Don't. - * early_demux can change based on sysctl. - */ -static struct net_protocol tcp_protocol = { - .early_demux = tcp_v4_early_demux, - .early_demux_handler = tcp_v4_early_demux, +static const struct net_protocol tcp_protocol = { .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, .icmp_strict_tag_validation = 1, }; -/* thinking of making this const? Don't. - * early_demux can change based on sysctl. - */ -static struct net_protocol udp_protocol = { - .early_demux = udp_v4_early_demux, - .early_demux_handler = udp_v4_early_demux, +static const struct net_protocol udp_protocol = { .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 6eea1e9e998d..f8ad04470d3a 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -507,7 +507,7 @@ static int ah_init_state(struct xfrm_state *x) if (aalg_desc->uinfo.auth.icv_fullbits/8 != crypto_ahash_digestsize(ahash)) { - pr_info("%s: %s digestsize %u != %hu\n", + pr_info("%s: %s digestsize %u != %u\n", __func__, x->aalg->alg_name, crypto_ahash_digestsize(ahash), aalg_desc->uinfo.auth.icv_fullbits / 8); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b21238df3301..b694f352ce7a 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -1108,7 +1108,7 @@ static int esp_init_authenc(struct xfrm_state *x) err = -EINVAL; if (aalg_desc->uinfo.auth.icv_fullbits / 8 != crypto_aead_authsize(aead)) { - pr_info("ESP: %s digestsize %u != %hu\n", + pr_info("ESP: %s digestsize %u != %u\n", x->aalg->alg_name, crypto_aead_authsize(aead), aalg_desc->uinfo.auth.icv_fullbits / 8); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d9fdcbae16ee..db7b2503f068 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -2216,7 +2216,7 @@ void fib_select_multipath(struct fib_result *res, int hash) } change_nexthops(fi) { - if (net->ipv4.sysctl_fib_multipath_use_neigh) { + if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) { if (!fib_good_nh(nexthop_nh)) continue; if (!first) { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 46e8a5125853..452ff177e4da 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1042,6 +1042,7 @@ fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri) void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) { + u8 fib_notify_on_flag_change; struct fib_alias *fa_match; struct sk_buff *skb; int err; @@ -1063,14 +1064,16 @@ void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) WRITE_ONCE(fa_match->offload, fri->offload); WRITE_ONCE(fa_match->trap, fri->trap); + fib_notify_on_flag_change = READ_ONCE(net->ipv4.sysctl_fib_notify_on_flag_change); + /* 2 means send notifications only if offload_failed was changed. */ - if (net->ipv4.sysctl_fib_notify_on_flag_change == 2 && + if (fib_notify_on_flag_change == 2 && READ_ONCE(fa_match->offload_failed) == fri->offload_failed) goto out; WRITE_ONCE(fa_match->offload_failed, fri->offload_failed); - if (!net->ipv4.sysctl_fib_notify_on_flag_change) + if (!fib_notify_on_flag_change) goto out; skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 57c4f0d87a7a..d5d745c3e345 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -881,7 +881,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb) * values please see * Documentation/networking/ip-sysctl.rst */ - switch (net->ipv4.sysctl_ip_no_pmtu_disc) { + switch (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) { default: net_dbg_ratelimited("%pI4: fragmentation needed and DF set\n", &iph->daddr); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index b65d074d9620..e3ab0cb61624 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -467,7 +467,8 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; - if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(pmc->multiaddr) && + !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return skb; mtu = READ_ONCE(dev->mtu); @@ -593,7 +594,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(pmc->multiaddr) && - !net->ipv4.sysctl_igmp_llm_reports) + !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) @@ -736,7 +737,8 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); - if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(group) && + !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return 0; if (type == IGMP_HOST_LEAVE_MESSAGE) @@ -825,7 +827,7 @@ static void igmp_ifc_event(struct in_device *in_dev) struct net *net = dev_net(in_dev->dev); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; - WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv); + WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv)); igmp_ifc_start_timer(in_dev, 1); } @@ -920,7 +922,8 @@ static bool igmp_heard_report(struct in_device *in_dev, __be32 group) if (group == IGMP_ALL_HOSTS) return false; - if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(group) && + !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return false; rcu_read_lock(); @@ -1006,7 +1009,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, * received value was zero, use the default or statically * configured value. */ - in_dev->mr_qrv = ih3->qrv ?: net->ipv4.sysctl_igmp_qrv; + in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; /* RFC3376, 8.3. Query Response Interval: @@ -1045,7 +1048,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && - !net->ipv4.sysctl_igmp_llm_reports) + !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; spin_lock_bh(&im->lock); if (im->tm_running) @@ -1186,7 +1189,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im, pmc->interface = im->interface; in_dev_hold(in_dev); pmc->multiaddr = im->multiaddr; - pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); pmc->sfmode = im->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { struct ip_sf_list *psf; @@ -1237,9 +1240,11 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) swap(im->tomb, pmc->tomb); swap(im->sources, pmc->sources); for (psf = im->sources; psf; psf = psf->sf_next) - psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + psf->sf_crcount = in_dev->mr_qrv ?: + READ_ONCE(net->ipv4.sysctl_igmp_qrv); } else { - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + im->crcount = in_dev->mr_qrv ?: + READ_ONCE(net->ipv4.sysctl_igmp_qrv); } in_dev_put(pmc->interface); kfree_pmc(pmc); @@ -1296,7 +1301,8 @@ static void __igmp_group_dropped(struct ip_mc_list *im, gfp_t gfp) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; - if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(im->multiaddr) && + !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return; reporter = im->reporter; @@ -1338,13 +1344,14 @@ static void igmp_group_added(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; - if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(im->multiaddr) && + !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return; if (in_dev->dead) return; - im->unsolicit_count = net->ipv4.sysctl_igmp_qrv; + im->unsolicit_count = READ_ONCE(net->ipv4.sysctl_igmp_qrv); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { spin_lock_bh(&im->lock); igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY); @@ -1358,7 +1365,7 @@ static void igmp_group_added(struct ip_mc_list *im) * IN() to IN(A). */ if (im->sfmode == MCAST_EXCLUDE) - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); igmp_ifc_event(in_dev); #endif @@ -1642,7 +1649,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev) if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && - !net->ipv4.sysctl_igmp_llm_reports) + !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; /* a failover is happening and switches @@ -1749,7 +1756,7 @@ static void ip_mc_reset(struct in_device *in_dev) in_dev->mr_qi = IGMP_QUERY_INTERVAL; in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL; - in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; + in_dev->mr_qrv = READ_ONCE(net->ipv4.sysctl_igmp_qrv); } #else static void ip_mc_reset(struct in_device *in_dev) @@ -1883,7 +1890,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, #ifdef CONFIG_IP_MULTICAST if (psf->sf_oldin && !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { - psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); psf->sf_next = pmc->tomb; pmc->tomb = psf; rv = 1; @@ -1947,7 +1954,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, /* filter mode change */ pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST - pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -2126,7 +2133,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, #ifdef CONFIG_IP_MULTICAST /* else no filters; keep old mode for reports */ - pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -2192,7 +2199,7 @@ static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr, count++; } err = -ENOBUFS; - if (count >= net->ipv4.sysctl_igmp_max_memberships) + if (count >= READ_ONCE(net->ipv4.sysctl_igmp_max_memberships)) goto done; iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); if (!iml) @@ -2379,7 +2386,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct } /* else, add a new source to the filter */ - if (psl && psl->sl_count >= net->ipv4.sysctl_igmp_max_msf) { + if (psl && psl->sl_count >= READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) { err = -ENOBUFS; goto done; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 53f5f956d948..eb31c7158b39 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -263,7 +263,7 @@ next_port: goto other_half_scan; } - if (net->ipv4.sysctl_ip_autobind_reuse && !relax) { + if (READ_ONCE(net->ipv4.sysctl_ip_autobind_reuse) && !relax) { /* We still have a chance to connect to different destinations */ relax = true; goto ports_exhausted; @@ -833,7 +833,8 @@ static void reqsk_timer_handler(struct timer_list *t) icsk = inet_csk(sk_listener); net = sock_net(sk_listener); - max_syn_ack_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; + max_syn_ack_retries = icsk->icsk_syn_retries ? : + READ_ONCE(net->ipv4.sysctl_tcp_synack_retries); /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. * If synack was not acknowledged for 1 second, it means diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index e3aa436a1bdf..e18931a6d153 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -157,7 +157,7 @@ int ip_forward(struct sk_buff *skb) !skb_sec_path(skb)) ip_rt_send_redirect(skb); - if (net->ipv4.sysctl_ip_fwd_update_priority) + if (READ_ONCE(net->ipv4.sysctl_ip_fwd_update_priority)) skb->priority = rt_tos2priority(iph->tos); return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index b1165f717cd1..1b512390b3cf 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -312,14 +312,13 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, ip_hdr(hint)->tos == iph->tos; } -INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *)); -INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *)); +int tcp_v4_early_demux(struct sk_buff *skb); +int udp_v4_early_demux(struct sk_buff *skb); static int ip_rcv_finish_core(struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) { const struct iphdr *iph = ip_hdr(skb); - int (*edemux)(struct sk_buff *skb); int err, drop_reason; struct rtable *rt; @@ -332,21 +331,29 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, goto drop_error; } - if (net->ipv4.sysctl_ip_early_demux && + if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && !ip_is_fragment(iph)) { - const struct net_protocol *ipprot; - int protocol = iph->protocol; - - ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { - err = INDIRECT_CALL_2(edemux, tcp_v4_early_demux, - udp_v4_early_demux, skb); - if (unlikely(err)) - goto drop_error; - /* must reload iph, skb->head might have changed */ - iph = ip_hdr(skb); + switch (iph->protocol) { + case IPPROTO_TCP: + if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) { + tcp_v4_early_demux(skb); + + /* must reload iph, skb->head might have changed */ + iph = ip_hdr(skb); + } + break; + case IPPROTO_UDP: + if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) { + err = udp_v4_early_demux(skb); + if (unlikely(err)) + goto drop_error; + + /* must reload iph, skb->head might have changed */ + iph = ip_hdr(skb); + } + break; } } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 445a9ecaefa1..a8a323ecbb54 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -782,7 +782,7 @@ static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) /* numsrc >= (4G-140)/128 overflow in 32 bits */ err = -ENOBUFS; if (gsf->gf_numsrc >= 0x1ffffff || - gsf->gf_numsrc > sock_net(sk)->ipv4.sysctl_igmp_max_msf) + gsf->gf_numsrc > READ_ONCE(sock_net(sk)->ipv4.sysctl_igmp_max_msf)) goto out_free_gsf; err = -EINVAL; @@ -832,7 +832,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, /* numsrc >= (4G-140)/128 overflow in 32 bits */ err = -ENOBUFS; - if (n > sock_net(sk)->ipv4.sysctl_igmp_max_msf) + if (n > READ_ONCE(sock_net(sk)->ipv4.sysctl_igmp_max_msf)) goto out_free_gsf; err = set_mcast_msfilter(sk, gf32->gf_interface, n, gf32->gf_fmode, &gf32->gf_group, gf32->gf_slist_flex); @@ -1244,7 +1244,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, } /* numsrc >= (1G-4) overflow in 32 bits */ if (msf->imsf_numsrc >= 0x3ffffffcU || - msf->imsf_numsrc > net->ipv4.sysctl_igmp_max_msf) { + msf->imsf_numsrc > READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) { kfree(msf); err = -ENOBUFS; break; @@ -1606,7 +1606,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, { struct net *net = sock_net(sk); val = (inet->uc_ttl == -1 ? - net->ipv4.sysctl_ip_default_ttl : + READ_ONCE(net->ipv4.sysctl_ip_default_ttl) : inet->uc_ttl); break; } diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 918c61fda0f3..d640adcaf1b1 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -62,7 +62,7 @@ struct sk_buff *nf_reject_skb_v4_tcp_reset(struct net *net, skb_reserve(nskb, LL_MAX_HEADER); niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, - net->ipv4.sysctl_ip_default_ttl); + READ_ONCE(net->ipv4.sysctl_ip_default_ttl)); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); niph->tot_len = htons(nskb->len); ip_send_check(niph); @@ -117,7 +117,7 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, skb_reserve(nskb, LL_MAX_HEADER); niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP, - net->ipv4.sysctl_ip_default_ttl); + READ_ONCE(net->ipv4.sysctl_ip_default_ttl)); skb_reset_transport_header(nskb); icmph = skb_put_zero(nskb, sizeof(struct icmphdr)); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 28836071f0a6..0088a4c64d77 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -387,7 +387,7 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) seq_printf(seq, "\nIp: %d %d", IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2, - net->ipv4.sysctl_ip_default_ttl); + READ_ONCE(net->ipv4.sysctl_ip_default_ttl)); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 356f535f3443..4702c61207a8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1398,7 +1398,7 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) struct fib_info *fi = res->fi; u32 mtu = 0; - if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu || + if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) || fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU)) mtu = fi->fib_mtu; @@ -1929,7 +1929,7 @@ static u32 fib_multipath_custom_hash_outer(const struct net *net, const struct sk_buff *skb, bool *p_has_inner) { - u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys keys, hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) @@ -1958,7 +1958,7 @@ static u32 fib_multipath_custom_hash_inner(const struct net *net, const struct sk_buff *skb, bool has_inner) { - u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys keys, hash_keys; /* We assume the packet carries an encapsulation, but if none was @@ -2018,7 +2018,7 @@ static u32 fib_multipath_custom_hash_skb(const struct net *net, static u32 fib_multipath_custom_hash_fl4(const struct net *net, const struct flowi4 *fl4) { - u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) @@ -2048,7 +2048,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, struct flow_keys hash_keys; u32 mhash = 0; - switch (net->ipv4.sysctl_fib_multipath_hash_policy) { + switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) { case 0: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b387c4835155..942d2dfa1115 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -247,12 +247,12 @@ bool cookie_timestamp_decode(const struct net *net, return true; } - if (!net->ipv4.sysctl_tcp_timestamps) + if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) return false; tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0; - if (tcp_opt->sack_ok && !net->ipv4.sysctl_tcp_sack) + if (tcp_opt->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) return false; if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK) @@ -261,7 +261,7 @@ bool cookie_timestamp_decode(const struct net *net, tcp_opt->wscale_ok = 1; tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK; - return net->ipv4.sysctl_tcp_window_scaling != 0; + return READ_ONCE(net->ipv4.sysctl_tcp_window_scaling) != 0; } EXPORT_SYMBOL(cookie_timestamp_decode); @@ -340,7 +340,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) struct flowi4 fl4; u32 tsoff = 0; - if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) || + !th->ack || th->rst) goto out; if (tcp_synq_no_recent_overflow(sk)) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 108fd86f2718..5490c285668b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -84,7 +84,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, * port limit. */ if ((range[1] < range[0]) || - (range[0] < net->ipv4.sysctl_ip_prot_sock)) + (range[0] < READ_ONCE(net->ipv4.sysctl_ip_prot_sock))) ret = -EINVAL; else set_local_port_range(net, range); @@ -110,7 +110,7 @@ static int ipv4_privileged_ports(struct ctl_table *table, int write, .extra2 = &ip_privileged_port_max, }; - pports = net->ipv4.sysctl_ip_prot_sock; + pports = READ_ONCE(net->ipv4.sysctl_ip_prot_sock); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); @@ -122,7 +122,7 @@ static int ipv4_privileged_ports(struct ctl_table *table, int write, if (range[0] < pports) ret = -EINVAL; else - net->ipv4.sysctl_ip_prot_sock = pports; + WRITE_ONCE(net->ipv4.sysctl_ip_prot_sock, pports); } return ret; @@ -350,61 +350,6 @@ bad_key: return ret; } -static void proc_configure_early_demux(int enabled, int protocol) -{ - struct net_protocol *ipprot; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_protocol *ip6prot; -#endif - - rcu_read_lock(); - - ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot) - ipprot->early_demux = enabled ? ipprot->early_demux_handler : - NULL; - -#if IS_ENABLED(CONFIG_IPV6) - ip6prot = rcu_dereference(inet6_protos[protocol]); - if (ip6prot) - ip6prot->early_demux = enabled ? ip6prot->early_demux_handler : - NULL; -#endif - rcu_read_unlock(); -} - -static int proc_tcp_early_demux(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int ret = 0; - - ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); - - if (write && !ret) { - int enabled = init_net.ipv4.sysctl_tcp_early_demux; - - proc_configure_early_demux(enabled, IPPROTO_TCP); - } - - return ret; -} - -static int proc_udp_early_demux(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int ret = 0; - - ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); - - if (write && !ret) { - int enabled = init_net.ipv4.sysctl_udp_early_demux; - - proc_configure_early_demux(enabled, IPPROTO_UDP); - } - - return ret; -} - static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -707,14 +652,14 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_udp_early_demux, .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_udp_early_demux + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_early_demux", .data = &init_net.ipv4.sysctl_tcp_early_demux, .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_tcp_early_demux + .proc_handler = proc_dou8vec_minmax, }, { .procname = "nexthop_compat_mode", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2222dfdde316..766881775abb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -441,7 +441,7 @@ void tcp_init_sock(struct sock *sk) tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; - tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; + tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering); tcp_assign_congestion_control(sk); tp->tsoffset = 0; @@ -452,8 +452,8 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); - WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); + WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); + WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); sk_sockets_allocated_inc(sk); } @@ -686,7 +686,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, int size_goal) { return skb->len < size_goal && - sock_net(sk)->ipv4.sysctl_tcp_autocorking && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) && !tcp_rtx_queue_empty(sk) && refcount_read(&sk->sk_wmem_alloc) > skb->truesize && tcp_skb_can_collapse_to(skb); @@ -1150,7 +1150,8 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & + TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; @@ -1723,7 +1724,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; else - cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1; + cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; val = min(val, cap); WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); @@ -3617,7 +3618,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; - } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { + } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) & + TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else @@ -3967,12 +3969,13 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = keepalive_probes(tp); break; case TCP_SYNCNT: - val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; + val = icsk->icsk_syn_retries ? : + READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); break; case TCP_LINGER2: val = tp->linger2; if (val >= 0) - val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ; + val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; break; case TCP_DEFER_ACCEPT: val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, @@ -4456,9 +4459,18 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, return SKB_DROP_REASON_TCP_MD5UNEXPECTED; } - /* check the signature */ - genhash = tp->af_specific->calc_md5_hash(newhash, hash_expected, - NULL, skb); + /* Check the signature. + * To support dual stack listeners, we need to handle + * IPv4-mapped case. + */ + if (family == AF_INET) + genhash = tcp_v4_md5_hash_skb(newhash, + hash_expected, + NULL, skb); + else + genhash = tp->af_specific->calc_md5_hash(newhash, + hash_expected, + NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index fdbcf2a6d08e..825b216d11f5 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -332,7 +332,7 @@ static bool tcp_fastopen_no_cookie(const struct sock *sk, const struct dst_entry *dst, int flag) { - return (sock_net(sk)->ipv4.sysctl_tcp_fastopen & flag) || + return (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & flag) || tcp_sk(sk)->fastopen_no_cookie || (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE)); } @@ -347,7 +347,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, const struct dst_entry *dst) { bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; - int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + int tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen); struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; int ret = 0; @@ -489,7 +489,7 @@ void tcp_fastopen_active_disable(struct sock *sk) { struct net *net = sock_net(sk); - if (!sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout)) return; /* Paired with READ_ONCE() in tcp_fastopen_active_should_disable() */ @@ -510,7 +510,8 @@ void tcp_fastopen_active_disable(struct sock *sk) */ bool tcp_fastopen_active_should_disable(struct sock *sk) { - unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout; + unsigned int tfo_bh_timeout = + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout); unsigned long timeout; int tfo_da_times; int multiplier; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3ec4edc37313..b1637990d570 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -426,7 +426,7 @@ static void tcp_sndbuf_expand(struct sock *sk) if (sk->sk_sndbuf < sndmem) WRITE_ONCE(sk->sk_sndbuf, - min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2])); + min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2]))); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -461,7 +461,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ int truesize = tcp_win_from_space(sk, skbtruesize) >> 1; - int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; + int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -534,7 +534,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb, */ static void tcp_init_buffer_space(struct sock *sk) { - int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; + int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win); struct tcp_sock *tp = tcp_sk(sk); int maxwin; @@ -574,16 +574,17 @@ static void tcp_clamp_window(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); + int rmem2; icsk->icsk_ack.quick = 0; + rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); - if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && + if (sk->sk_rcvbuf < rmem2 && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { WRITE_ONCE(sk->sk_rcvbuf, - min(atomic_read(&sk->sk_rmem_alloc), - net->ipv4.sysctl_tcp_rmem[2])); + min(atomic_read(&sk->sk_rmem_alloc), rmem2)); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); @@ -724,7 +725,7 @@ void tcp_rcv_space_adjust(struct sock *sk) * <prev RTT . ><current RTT .. ><next RTT .... > */ - if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int rcvmem, rcvbuf; u64 rcvwin, grow; @@ -745,7 +746,7 @@ void tcp_rcv_space_adjust(struct sock *sk) do_div(rcvwin, tp->advmss); rcvbuf = min_t(u64, rcvwin * rcvmem, - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); @@ -910,9 +911,9 @@ static void tcp_update_pacing_rate(struct sock *sk) * end of slow start and should slow down. */ if (tcp_snd_cwnd(tp) < tp->snd_ssthresh / 2) - rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio; + rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio); else - rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio; + rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio); rate *= max(tcp_snd_cwnd(tp), tp->packets_out); @@ -1051,7 +1052,7 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, tp->undo_marker ? tp->undo_retrans : 0); #endif tp->reordering = min_t(u32, (metric + mss - 1) / mss, - sock_net(sk)->ipv4.sysctl_tcp_max_reordering); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); } /* This exciting event is worth to be remembered. 8) */ @@ -2030,7 +2031,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) return; tp->reordering = min_t(u32, tp->packets_out + addend, - sock_net(sk)->ipv4.sysctl_tcp_max_reordering); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); tp->reord_seen++; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); } @@ -2095,7 +2096,8 @@ static inline void tcp_init_undo(struct tcp_sock *tp) static bool tcp_is_rack(const struct sock *sk) { - return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION; + return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & + TCP_RACK_LOSS_DETECTION; } /* If we detect SACK reneging, forget all SACK information @@ -2139,6 +2141,7 @@ void tcp_enter_loss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; + u8 reordering; tcp_timeout_mark_lost(sk); @@ -2159,10 +2162,12 @@ void tcp_enter_loss(struct sock *sk) /* Timeout in disordered state after receiving substantial DUPACKs * suggests that the degree of reordering is over-estimated. */ + reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering); if (icsk->icsk_ca_state <= TCP_CA_Disorder && - tp->sacked_out >= net->ipv4.sysctl_tcp_reordering) + tp->sacked_out >= reordering) tp->reordering = min_t(unsigned int, tp->reordering, - net->ipv4.sysctl_tcp_reordering); + reordering); + tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; tcp_ecn_queue_cwr(tp); @@ -2171,7 +2176,7 @@ void tcp_enter_loss(struct sock *sk) * loss recovery is underway except recurring timeout(s) on * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing */ - tp->frto = net->ipv4.sysctl_tcp_frto && + tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) && (new_recovery || icsk->icsk_retransmits) && !inet_csk(sk)->icsk_mtup.probe_size; } @@ -3054,7 +3059,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag) { - u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; + u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ; struct tcp_sock *tp = tcp_sk(sk); if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) { @@ -3464,7 +3469,8 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) * new SACK or ECE mark may first advance cwnd here and later reduce * cwnd in tcp_fastretrans_alert() based on more states. */ - if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering) + if (tcp_sk(sk)->reordering > + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering)) return flag & FLAG_FORWARD_PROGRESS; return flag & FLAG_DATA_ACKED; @@ -3576,7 +3582,8 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, if (*last_oow_ack_time) { s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); - if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) { + if (0 <= elapsed && + elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) { NET_INC_STATS(net, mib_idx); return true; /* rate-limited: don't send yet! */ } @@ -3624,7 +3631,7 @@ static void tcp_send_challenge_ack(struct sock *sk) /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; if (now != challenge_timestamp) { - u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit; + u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit); u32 half = (ack_limit + 1) >> 1; challenge_timestamp = now; @@ -4056,7 +4063,7 @@ void tcp_parse_options(const struct net *net, break; case TCPOPT_WINDOW: if (opsize == TCPOLEN_WINDOW && th->syn && - !estab && net->ipv4.sysctl_tcp_window_scaling) { + !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) { __u8 snd_wscale = *(__u8 *)ptr; opt_rx->wscale_ok = 1; if (snd_wscale > TCP_MAX_WSCALE) { @@ -4072,7 +4079,7 @@ void tcp_parse_options(const struct net *net, case TCPOPT_TIMESTAMP: if ((opsize == TCPOLEN_TIMESTAMP) && ((estab && opt_rx->tstamp_ok) || - (!estab && net->ipv4.sysctl_tcp_timestamps))) { + (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) { opt_rx->saw_tstamp = 1; opt_rx->rcv_tsval = get_unaligned_be32(ptr); opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); @@ -4080,7 +4087,7 @@ void tcp_parse_options(const struct net *net, break; case TCPOPT_SACK_PERM: if (opsize == TCPOLEN_SACK_PERM && th->syn && - !estab && net->ipv4.sysctl_tcp_sack) { + !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) { opt_rx->sack_ok = TCP_SACK_SEEN; tcp_sack_reset(opt_rx); } @@ -4421,7 +4428,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { int mib_idx; if (before(seq, tp->rcv_nxt)) @@ -4468,7 +4475,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; tcp_rcv_spurious_retrans(sk, skb); @@ -5514,7 +5521,7 @@ send_now: } if (!tcp_is_sack(tp) || - tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) + tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)) goto send_now; if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { @@ -5535,11 +5542,12 @@ send_now: if (tp->srtt_us && tp->srtt_us < rtt) rtt = tp->srtt_us; - delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, + delay = min_t(unsigned long, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns), rtt * (NSEC_PER_USEC >> 3)/20); sock_hold(sk); hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), - sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns), HRTIMER_MODE_REL_PINNED_SOFT); } @@ -5567,7 +5575,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) struct tcp_sock *tp = tcp_sk(sk); u32 ptr = ntohs(th->urg_ptr); - if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg) + if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg)) ptr--; ptr += ntohl(th->seq); @@ -6797,11 +6805,14 @@ static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; - bool want_cookie = false; struct net *net = sock_net(sk); + bool want_cookie = false; + u8 syncookies; + + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); #ifdef CONFIG_SYN_COOKIES - if (net->ipv4.sysctl_tcp_syncookies) { + if (syncookies) { msg = "Sending cookies"; want_cookie = true; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); @@ -6809,8 +6820,7 @@ static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) #endif __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - if (!queue->synflood_warned && - net->ipv4.sysctl_tcp_syncookies != 2 && + if (!queue->synflood_warned && syncookies != 2 && xchg(&queue->synflood_warned, 1) == 0) net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", proto, sk->sk_num, msg); @@ -6859,7 +6869,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, struct tcp_sock *tp = tcp_sk(sk); u16 mss; - if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 && + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 && !inet_csk_reqsk_queue_is_full(sk)) return 0; @@ -6893,13 +6903,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, bool want_cookie = false; struct dst_entry *dst; struct flowi fl; + u8 syncookies; + + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is * evidently real one. */ - if ((net->ipv4.sysctl_tcp_syncookies == 2 || - inet_csk_reqsk_queue_is_full(sk)) && !isn) { + if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); if (!want_cookie) goto drop; @@ -6948,10 +6960,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); if (!want_cookie && !isn) { + int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog); + /* Kill the following clause, if you dislike this way. */ - if (!net->ipv4.sysctl_tcp_syncookies && - (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (net->ipv4.sysctl_max_syn_backlog >> 2)) && + if (!syncookies && + (max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (max_syn_backlog >> 2)) && !tcp_peer_is_proven(req, dst)) { /* Without syncookies last quarter of * backlog is filled with destinations, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index da5a3c44c4fb..586c102ce152 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -108,10 +108,10 @@ static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) { + int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); const struct inet_timewait_sock *tw = inet_twsk(sktw); const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); struct tcp_sock *tp = tcp_sk(sk); - int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; if (reuse == 2) { /* Still does not detect *everything* that goes through @@ -1006,7 +1006,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); - tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? + tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ? (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | (inet_sk(sk)->tos & INET_ECN_MASK) : inet_sk(sk)->tos; @@ -1526,7 +1526,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, /* Set ToS of the new socket based upon the value of incoming SYN. * ECT bits are set later in tcp_init_transfer(). */ - if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; if (!dst) { diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 7029b0e98edb..d58e672be31c 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -329,7 +329,7 @@ void tcp_update_metrics(struct sock *sk) int m; sk_dst_confirm(sk); - if (net->ipv4.sysctl_tcp_nometrics_save || !dst) + if (READ_ONCE(net->ipv4.sysctl_tcp_nometrics_save) || !dst) return; rcu_read_lock(); @@ -385,7 +385,7 @@ void tcp_update_metrics(struct sock *sk) if (tcp_in_initial_slowstart(tp)) { /* Slow start still did not finish. */ - if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save && + if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && (tcp_snd_cwnd(tp) >> 1) > val) @@ -401,7 +401,7 @@ void tcp_update_metrics(struct sock *sk) } else if (!tcp_in_slow_start(tp) && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ - if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save && + if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, max(tcp_snd_cwnd(tp) >> 1, tp->snd_ssthresh)); @@ -418,7 +418,7 @@ void tcp_update_metrics(struct sock *sk) tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_ssthresh) >> 1); } - if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save && + if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && tp->snd_ssthresh > val) @@ -428,7 +428,8 @@ void tcp_update_metrics(struct sock *sk) if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val < tp->reordering && - tp->reordering != net->ipv4.sysctl_tcp_reordering) + tp->reordering != + READ_ONCE(net->ipv4.sysctl_tcp_reordering)) tcp_metric_set(tm, TCP_METRIC_REORDERING, tp->reordering); } @@ -462,7 +463,7 @@ void tcp_init_metrics(struct sock *sk) if (tcp_metric_locked(tm, TCP_METRIC_CWND)) tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND); - val = net->ipv4.sysctl_tcp_no_ssthresh_metrics_save ? + val = READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) ? 0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val) { tp->snd_ssthresh = val; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6854bb1fb32b..cb95d88497ae 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -173,7 +173,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ - if (twsk_net(tw)->ipv4.sysctl_tcp_rfc1337 == 0) { + if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) { kill: inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; @@ -781,7 +781,7 @@ listen_overflow: if (sk != req->rsk_listener) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); - if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) { + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) { inet_rsk(req)->acked = 1; return NULL; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 11aa0ab10bba..4c376b6d8764 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -167,16 +167,13 @@ static void tcp_event_data_sent(struct tcp_sock *tp, if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); - /* If this is the first data packet sent in response to the - * previous received data, - * and it is a reply for ato after last received packet, - * increase pingpong count. - */ - if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) && - (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) - inet_csk_inc_pingpong_cnt(sk); - tp->lsndtime = now; + + /* If it is a reply for ato after last received + * packet, enter pingpong mode. + */ + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) + inet_csk_enter_pingpong_mode(sk); } /* Account for an ACK we sent. */ @@ -230,7 +227,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, * which we interpret as a sign the remote TCP is not * misinterpreting the window field as a signed quantity. */ - if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else (*rcv_wnd) = min_t(u32, space, U16_MAX); @@ -241,7 +238,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, *rcv_wscale = 0; if (wscale_ok) { /* Set window scaling on max possible window */ - space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); *rcv_wscale = clamp_t(int, ilog2(space) - 15, @@ -285,7 +282,7 @@ static u16 tcp_select_window(struct sock *sk) * scaled window. */ if (!tp->rx_opt.rcv_wscale && - sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); @@ -791,18 +788,18 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, opts->mss = tcp_advertise_mss(sk); remaining -= TCPOLEN_MSS_ALIGNED; - if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) { + if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) { opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } - if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) { + if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) { opts->ws = tp->rx_opt.rcv_wscale; opts->options |= OPTION_WSCALE; remaining -= TCPOLEN_WSCALE_ALIGNED; } - if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) { + if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!(OPTION_TS & opts->options))) remaining -= TCPOLEN_SACKPERM_ALIGNED; @@ -1719,7 +1716,8 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ - mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss); + mss_now = max(mss_now, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss)); return mss_now; } @@ -1762,10 +1760,10 @@ void tcp_mtup_init(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); - icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1; + icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; - icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss)); icsk->icsk_mtup.probe_size = 0; if (icsk->icsk_mtup.enabled) icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; @@ -1897,7 +1895,7 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; - if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle && + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) && (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && !ca_ops->cong_control) tcp_cwnd_application_limited(sk); @@ -1975,7 +1973,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift); - r = tcp_min_rtt(tcp_sk(sk)) >> sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log; + r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log); if (r < BITS_PER_TYPE(sk->sk_gso_max_size)) bytes += sk->sk_gso_max_size >> r; @@ -1994,7 +1992,7 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) min_tso = ca_ops->min_tso_segs ? ca_ops->min_tso_segs(sk) : - sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs; + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); return min_t(u32, tso_segs, sk->sk_gso_max_segs); @@ -2282,7 +2280,7 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk) u32 interval; s32 delta; - interval = net->ipv4.sysctl_tcp_probe_interval; + interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval); delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp; if (unlikely(delta >= interval * HZ)) { int mss = tcp_current_mss(sk); @@ -2366,7 +2364,7 @@ static int tcp_mtu_probe(struct sock *sk) * probing process by not resetting search range to its orignal. */ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) || - interval < net->ipv4.sysctl_tcp_probe_threshold) { + interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) { /* Check whether enough time has elaplased for * another round of probing. */ @@ -2506,7 +2504,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift)); if (sk->sk_pacing_status == SK_PACING_NONE) limit = min_t(unsigned long, limit, - sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes)); limit <<= factor; if (static_branch_unlikely(&tcp_tx_delay_enabled) && @@ -2740,7 +2738,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) if (rcu_access_pointer(tp->fastopen_rsk)) return false; - early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; + early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans); /* Schedule a loss probe in 2*RTT for SACK capable connections * not in loss recovery, that are either limited by cwnd or application. */ @@ -3104,7 +3102,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, struct sk_buff *skb = to, *tmp; bool first = true; - if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)) return; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; @@ -3646,7 +3644,7 @@ static void tcp_connect_init(struct sock *sk) * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. */ tp->tcp_header_len = sizeof(struct tcphdr); - if (sock_net(sk)->ipv4.sysctl_tcp_timestamps) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps)) tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; #ifdef CONFIG_TCP_MD5SIG @@ -3682,7 +3680,7 @@ static void tcp_connect_init(struct sock *sk) tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, - sock_net(sk)->ipv4.sysctl_tcp_window_scaling, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling), &rcv_wscale, rcv_wnd); @@ -4089,7 +4087,7 @@ void tcp_send_probe0(struct sock *sk) icsk->icsk_probes_out++; if (err <= 0) { - if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2) + if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) icsk->icsk_backoff++; timeout = tcp_probe0_when(sk, TCP_RTO_MAX); } else { diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 48f30e7209f2..50abaa941387 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -14,7 +14,8 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk) return 0; if (tp->sacked_out >= tp->reordering && - !(sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_NO_DUPTHRESH)) + !(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & + TCP_RACK_NO_DUPTHRESH)) return 0; } @@ -187,7 +188,8 @@ void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); - if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_STATIC_REO_WND || + if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & + TCP_RACK_STATIC_REO_WND) || !rs->prior_delivered) return; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 20cf4a98c69d..50bba370486e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -143,7 +143,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) */ static int tcp_orphan_retries(struct sock *sk, bool alive) { - int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */ + int retries = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_orphan_retries); /* May be zero. */ /* We know from an ICMP that something is wrong. */ if (sk->sk_err_soft && !alive) @@ -163,7 +163,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) int mss; /* Black hole detection */ - if (!net->ipv4.sysctl_tcp_mtu_probing) + if (!READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing)) return; if (!icsk->icsk_mtup.enabled) { @@ -171,9 +171,9 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } else { mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; - mss = min(net->ipv4.sysctl_tcp_base_mss, mss); - mss = max(mss, net->ipv4.sysctl_tcp_mtu_probe_floor); - mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss); + mss = min(READ_ONCE(net->ipv4.sysctl_tcp_base_mss), mss); + mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_mtu_probe_floor)); + mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_min_snd_mss)); icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); } tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -239,17 +239,18 @@ static int tcp_write_timeout(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) __dst_negative_advice(sk); - retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; + retry_until = icsk->icsk_syn_retries ? : + READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); expired = icsk->icsk_retransmits >= retry_until; } else { - if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) { + if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) { /* Black hole detection */ tcp_mtu_probing(icsk, sk); __dst_negative_advice(sk); } - retry_until = net->ipv4.sysctl_tcp_retries2; + retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { const bool alive = icsk->icsk_rto < TCP_RTO_MAX; @@ -380,7 +381,7 @@ static void tcp_probe_timer(struct sock *sk) msecs_to_jiffies(icsk->icsk_user_timeout)) goto abort; - max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; + max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; @@ -406,12 +407,15 @@ abort: tcp_write_err(sk); static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) { struct inet_connection_sock *icsk = inet_csk(sk); - int max_retries = icsk->icsk_syn_retries ? : - sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ struct tcp_sock *tp = tcp_sk(sk); + int max_retries; req->rsk_ops->syn_ack_timeout(req); + /* add one more retry for fastopen */ + max_retries = icsk->icsk_syn_retries ? : + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1; + if (req->num_timeout >= max_retries) { tcp_write_err(sk); return; @@ -574,7 +578,7 @@ out_reset_timer: * linear-timeout retransmissions into a black hole */ if (sk->sk_state == TCP_ESTABLISHED && - (tp->thin_lto || net->ipv4.sysctl_tcp_thin_linear_timeouts) && + (tp->thin_lto || READ_ONCE(net->ipv4.sysctl_tcp_thin_linear_timeouts)) && tcp_stream_is_thin(tp) && icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { icsk->icsk_backoff = 0; @@ -585,7 +589,7 @@ out_reset_timer: } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX); - if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0)) + if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0)) __sk_dst_reset(sk); out:; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 70564ddccc46..6f354f8be2c5 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -226,7 +226,7 @@ lookup_protocol: RCU_INIT_POINTER(inet->mc_list, NULL); inet->rcv_tos = 0; - if (net->ipv4.sysctl_ip_no_pmtu_disc) + if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 0322cc86b84e..e1ebf5e42ebe 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -45,20 +45,23 @@ #include <net/inet_ecn.h> #include <net/dst_metadata.h> -INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *)); static void ip6_rcv_finish_core(struct net *net, struct sock *sk, struct sk_buff *skb) { - void (*edemux)(struct sk_buff *skb); - - if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { - const struct inet6_protocol *ipprot; - - ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); - if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) - INDIRECT_CALL_2(edemux, tcp_v6_early_demux, - udp_v6_early_demux, skb); + if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && + !skb_dst(skb) && !skb->sk) { + switch (ipv6_hdr(skb)->nexthdr) { + case IPPROTO_TCP: + if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) + tcp_v6_early_demux(skb); + break; + case IPPROTO_UDP: + if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) + udp_v6_early_demux(skb); + break; + } } + if (!skb_valid_dst(skb)) ip6_route_input(skb); } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 7f695c39d9a8..87c699d57b36 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1522,7 +1522,6 @@ static void mld_query_work(struct work_struct *work) if (++cnt >= MLD_MAX_QUEUE) { rework = true; - schedule_delayed_work(&idev->mc_query_work, 0); break; } } @@ -1533,8 +1532,10 @@ static void mld_query_work(struct work_struct *work) __mld_query_work(skb); mutex_unlock(&idev->mc_lock); - if (!rework) - in6_dev_put(idev); + if (rework && queue_delayed_work(mld_wq, &idev->mc_query_work, 0)) + return; + + in6_dev_put(idev); } /* called with rcu_read_lock() */ @@ -1624,7 +1625,6 @@ static void mld_report_work(struct work_struct *work) if (++cnt >= MLD_MAX_QUEUE) { rework = true; - schedule_delayed_work(&idev->mc_report_work, 0); break; } } @@ -1635,8 +1635,10 @@ static void mld_report_work(struct work_struct *work) __mld_report_work(skb); mutex_unlock(&idev->mc_lock); - if (!rework) - in6_dev_put(idev); + if (rework && queue_delayed_work(mld_wq, &idev->mc_report_work, 0)) + return; + + in6_dev_put(idev); } static bool is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index ecf3a553a0dc..8c6c2d82c1cd 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -22,6 +22,11 @@ #include <linux/proc_fs.h> #include <net/ping.h> +static void ping_v6_destroy(struct sock *sk) +{ + inet6_destroy_sock(sk); +} + /* Compatibility glue so we can support IPv6 when it's compiled as a module */ static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) @@ -181,6 +186,7 @@ struct proto pingv6_prot = { .owner = THIS_MODULE, .init = ping_init_sock, .close = ping_close, + .destroy = ping_v6_destroy, .connect = ip6_datagram_connect_v6_only, .disconnect = __udp_disconnect, .setsockopt = ipv6_setsockopt, diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 9cc123f000fb..5014aa663452 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -141,7 +141,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) __u8 rcv_wscale; u32 tsoff = 0; - if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) || + !th->ack || th->rst) goto out; if (tcp_synq_no_recent_overflow(sk)) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f37dd4aa91c6..be09941fe6d9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -546,7 +546,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, if (np->repflow && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); - tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? + tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ? (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | (np->tclass & INET_ECN_MASK) : np->tclass; @@ -1314,7 +1314,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * /* Set ToS of the new socket based upon the value of incoming SYN. * ECT bits are set later in tcp_init_transfer(). */ - if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; /* Clone native IPv6 options from listening socket (if any) @@ -1822,7 +1822,7 @@ do_time_wait: goto discard_it; } -INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb) +void tcp_v6_early_demux(struct sk_buff *skb) { const struct ipv6hdr *hdr; const struct tcphdr *th; @@ -2176,12 +2176,7 @@ struct proto tcpv6_prot = { }; EXPORT_SYMBOL_GPL(tcpv6_prot); -/* thinking of making this const? Don't. - * early_demux can change based on sysctl. - */ -static struct inet6_protocol tcpv6_protocol = { - .early_demux = tcp_v6_early_demux, - .early_demux_handler = tcp_v6_early_demux, +static const struct inet6_protocol tcpv6_protocol = { .handler = tcp_v6_rcv, .err_handler = tcp_v6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 55afd7f39c04..e2f2e087a753 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1052,7 +1052,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, return NULL; } -INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb) +void udp_v6_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct udphdr *uh; @@ -1660,12 +1660,7 @@ int udpv6_getsockopt(struct sock *sk, int level, int optname, return ipv6_getsockopt(sk, level, optname, optval, optlen); } -/* thinking of making this const? Don't. - * early_demux can change based on sysctl. - */ -static struct inet6_protocol udpv6_protocol = { - .early_demux = udp_v6_early_demux, - .early_demux_handler = udp_v6_early_demux, +static const struct inet6_protocol udpv6_protocol = { .handler = udpv6_rcv, .err_handler = udpv6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 15a73b7fdd75..1a9ada411879 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -377,9 +377,8 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do bool cancel_scan; struct cfg80211_nan_func *func; - spin_lock_bh(&local->fq.lock); clear_bit(SDATA_STATE_RUNNING, &sdata->state); - spin_unlock_bh(&local->fq.lock); + synchronize_rcu(); /* flush _ieee80211_wake_txqs() */ cancel_scan = rcu_access_pointer(local->scan_sdata) == sdata; if (cancel_scan) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index bd8f0f425be4..30d289044e71 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1271,7 +1271,7 @@ raise_win: if (unlikely(th->syn)) new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale; if (!tp->rx_opt.rcv_wscale && - sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows) + READ_ONCE(sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 21a3ed64226e..7e1518bb6115 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1908,7 +1908,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (msk->rcvq_space.copied <= msk->rcvq_space.space) goto new_measure; - if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int rcvmem, rcvbuf; u64 rcvwin, grow; @@ -1926,7 +1926,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) do_div(rcvwin, advmss); rcvbuf = min_t(u64, rcvwin * rcvmem, - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; @@ -2669,8 +2669,8 @@ static int mptcp_init_sock(struct sock *sk) mptcp_ca_reset(sk); sk_sockets_allocated_inc(sk); - sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; - sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); + sk->sk_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); return 0; } diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 63e8892ec807..af28f3b60389 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1533,7 +1533,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, mptcp_sock_graft(ssk, sk->sk_socket); iput(SOCK_INODE(sf)); WRITE_ONCE(msk->allow_infinite_fallback, false); - return err; + return 0; failed_unlink: list_del(&subflow->node); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index e479dd0561c5..16915f8eef2b 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -405,7 +405,7 @@ synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, iph->tos = 0; iph->id = 0; iph->frag_off = htons(IP_DF); - iph->ttl = net->ipv4.sysctl_ip_default_ttl; + iph->ttl = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); iph->protocol = IPPROTO_TCP; iph->check = 0; iph->saddr = saddr; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 646d5fd53604..9f976b11d896 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3340,6 +3340,8 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) if (err < 0) return err; } + + cond_resched(); } return 0; @@ -9367,9 +9369,13 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, break; } } + + cond_resched(); } list_for_each_entry(set, &ctx->table->sets, list) { + cond_resched(); + if (!nft_is_active_next(ctx->net, set)) continue; if (!(set->flags & NFT_SET_MAP) || diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index a364f8e5e698..87a9009d5234 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -843,11 +843,16 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) } static int -nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) +nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff) { struct sk_buff *nskb; if (diff < 0) { + unsigned int min_len = skb_transport_offset(e->skb); + + if (data_len < min_len) + return -EINVAL; + if (pskb_trim(e->skb, data_len)) return -ENOMEM; } else if (diff > 0) { diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index 15e4b7640dc0..da29e92c03e2 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -68,6 +68,31 @@ static void nft_queue_sreg_eval(const struct nft_expr *expr, regs->verdict.code = ret; } +static int nft_queue_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + static const unsigned int supported_hooks = ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING)); + + switch (ctx->family) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + case NFPROTO_INET: + case NFPROTO_BRIDGE: + break; + case NFPROTO_NETDEV: /* lacks okfn */ + fallthrough; + default: + return -EOPNOTSUPP; + } + + return nft_chain_validate_hooks(ctx->chain, supported_hooks); +} + static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = { [NFTA_QUEUE_NUM] = { .type = NLA_U16 }, [NFTA_QUEUE_TOTAL] = { .type = NLA_U16 }, @@ -164,6 +189,7 @@ static const struct nft_expr_ops nft_queue_ops = { .eval = nft_queue_eval, .init = nft_queue_init, .dump = nft_queue_dump, + .validate = nft_queue_validate, .reduce = NFT_REDUCE_READONLY, }; @@ -173,6 +199,7 @@ static const struct nft_expr_ops nft_queue_sreg_ops = { .eval = nft_queue_sreg_eval, .init = nft_queue_sreg_init, .dump = nft_queue_sreg_dump, + .validate = nft_queue_validate, .reduce = NFT_REDUCE_READONLY, }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 9bb4d3dcc994..ac366c99086f 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3533,7 +3533,7 @@ int tc_setup_action(struct flow_action *flow_action, struct tc_action *actions[], struct netlink_ext_ack *extack) { - int i, j, index, err = 0; + int i, j, k, index, err = 0; struct tc_action *act; BUILD_BUG_ON(TCA_ACT_HW_STATS_ANY != FLOW_ACTION_HW_STATS_ANY); @@ -3553,14 +3553,18 @@ int tc_setup_action(struct flow_action *flow_action, if (err) goto err_out_locked; - entry->hw_stats = tc_act_hw_stats(act->hw_stats); - entry->hw_index = act->tcfa_index; index = 0; err = tc_setup_offload_act(act, entry, &index, extack); - if (!err) - j += index; - else + if (err) goto err_out_locked; + + for (k = 0; k < index ; k++) { + entry[k].hw_stats = tc_act_hw_stats(act->hw_stats); + entry[k].hw_index = act->tcfa_index; + } + + j += index; + spin_unlock_bh(&act->tcfa_lock); } diff --git a/net/sctp/associola.c b/net/sctp/associola.c index be29da09cc7a..3460abceba44 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -229,9 +229,8 @@ static struct sctp_association *sctp_association_init( if (!sctp_ulpq_init(&asoc->ulpq, asoc)) goto fail_init; - if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, - 0, gfp)) - goto fail_init; + if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, 0, gfp)) + goto stream_free; /* Initialize default path MTU. */ asoc->pathmtu = sp->pathmtu; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 35928fefae33..1a094b087d88 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -358,7 +358,7 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) && ret != RTN_LOCAL && !sp->inet.freebind && - !net->ipv4.sysctl_ip_nonlocal_bind) + !READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind)) return 0; if (ipv6_only_sock(sctp_opt2sk(sp))) diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 6dc95dcc0ff4..ef9fceadef8d 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -137,7 +137,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, ret = sctp_stream_alloc_out(stream, outcnt, gfp); if (ret) - goto out_err; + return ret; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; @@ -145,22 +145,9 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, handle_in: sctp_stream_interleave_init(stream); if (!incnt) - goto out; - - ret = sctp_stream_alloc_in(stream, incnt, gfp); - if (ret) - goto in_err; - - goto out; + return 0; -in_err: - sched->free(stream); - genradix_free(&stream->in); -out_err: - genradix_free(&stream->out); - stream->outcnt = 0; -out: - return ret; + return sctp_stream_alloc_in(stream, incnt, gfp); } int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 518b1b9bf89d..1ad565ed5627 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -160,7 +160,7 @@ int sctp_sched_set_sched(struct sctp_association *asoc, if (!SCTP_SO(&asoc->stream, i)->ext) continue; - ret = n->init_sid(&asoc->stream, i, GFP_KERNEL); + ret = n->init_sid(&asoc->stream, i, GFP_ATOMIC); if (ret) goto err; } diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index c4d057b2941d..0bde36b56472 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -2122,7 +2122,7 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) init_waitqueue_head(&lgr->llc_flow_waiter); init_waitqueue_head(&lgr->llc_msg_waiter); mutex_init(&lgr->llc_conf_mutex); - lgr->llc_testlink_time = net->ipv4.sysctl_tcp_keepalive_time; + lgr->llc_testlink_time = READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time); } /* called after lgr was removed from lgr_list */ diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 43509c7e90fc..f1c3b8eb4b3d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -517,7 +517,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, timer_setup(&sk->sk_timer, tipc_sk_timeout, 0); sk->sk_shutdown = 0; sk->sk_backlog_rcv = tipc_sk_backlog_rcv; - sk->sk_rcvbuf = sysctl_tipc_rmem[1]; + sk->sk_rcvbuf = READ_ONCE(sysctl_tipc_rmem[1]); sk->sk_data_ready = tipc_data_ready; sk->sk_write_space = tipc_write_space; sk->sk_destruct = tipc_sock_destruct; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index ce827e79c66a..9975df34d9c2 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -97,13 +97,16 @@ static void tls_device_queue_ctx_destruction(struct tls_context *ctx) unsigned long flags; spin_lock_irqsave(&tls_device_lock, flags); + if (unlikely(!refcount_dec_and_test(&ctx->refcount))) + goto unlock; + list_move_tail(&ctx->list, &tls_device_gc_list); /* schedule_work inside the spinlock * to make sure tls_device_down waits for that work. */ schedule_work(&tls_device_gc_work); - +unlock: spin_unlock_irqrestore(&tls_device_lock, flags); } @@ -194,8 +197,7 @@ void tls_device_sk_destruct(struct sock *sk) clean_acked_data_disable(inet_csk(sk)); } - if (refcount_dec_and_test(&tls_ctx->refcount)) - tls_device_queue_ctx_destruction(tls_ctx); + tls_device_queue_ctx_destruction(tls_ctx); } EXPORT_SYMBOL_GPL(tls_device_sk_destruct); @@ -1374,8 +1376,13 @@ static int tls_device_down(struct net_device *netdev) * by tls_device_free_ctx. rx_conf and tx_conf stay in TLS_HW. * Now release the ref taken above. */ - if (refcount_dec_and_test(&ctx->refcount)) + if (refcount_dec_and_test(&ctx->refcount)) { + /* sk_destruct ran after tls_device_down took a ref, and + * it returned early. Complete the destruction here. + */ + list_del(&ctx->list); tls_device_free_ctx(ctx); + } } up_write(&device_offload_lock); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f1876ea61fdc..f1a0bab920a5 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2678,8 +2678,10 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, *num_xfrms = 0; return 0; } - if (IS_ERR(pols[0])) + if (IS_ERR(pols[0])) { + *num_pols = 0; return PTR_ERR(pols[0]); + } *num_xfrms = pols[0]->xfrm_nr; @@ -2694,6 +2696,7 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, if (pols[1]) { if (IS_ERR(pols[1])) { xfrm_pols_put(pols, *num_pols); + *num_pols = 0; return PTR_ERR(pols[1]); } (*num_pols)++; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 08564e0eef20..ccfb172eb5b8 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2620,7 +2620,7 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) int err; if (family == AF_INET && - xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc) + READ_ONCE(xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)) x->props.flags |= XFRM_STATE_NOPMTUDISC; err = -EPROTONOSUPPORT; diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py index 46f7542db08c..dc07b6d12e30 100644 --- a/scripts/gdb/linux/symbols.py +++ b/scripts/gdb/linux/symbols.py @@ -180,7 +180,7 @@ lx-symbols command.""" self.breakpoint.delete() self.breakpoint = None self.breakpoint = LoadModuleBreakpoint( - "kernel/module.c:do_init_module", self) + "kernel/module/main.c:do_init_module", self) else: gdb.write("Note: symbol update on module loading not supported " "with this gdb version\n") diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index cc88f02c7562..93e8bc047a73 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -755,13 +755,14 @@ void evm_inode_post_removexattr(struct dentry *dentry, const char *xattr_name) evm_update_evmxattr(dentry, xattr_name, NULL, 0); } -static int evm_attr_change(struct dentry *dentry, struct iattr *attr) +static int evm_attr_change(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_backing_inode(dentry); unsigned int ia_valid = attr->ia_valid; - if ((!(ia_valid & ATTR_UID) || uid_eq(attr->ia_uid, inode->i_uid)) && - (!(ia_valid & ATTR_GID) || gid_eq(attr->ia_gid, inode->i_gid)) && + if (!i_uid_needs_update(mnt_userns, attr, inode) && + !i_gid_needs_update(mnt_userns, attr, inode) && (!(ia_valid & ATTR_MODE) || attr->ia_mode == inode->i_mode)) return 0; @@ -775,7 +776,8 @@ static int evm_attr_change(struct dentry *dentry, struct iattr *attr) * Permit update of file attributes when files have a valid EVM signature, * except in the case of them having an immutable portable signature. */ -int evm_inode_setattr(struct dentry *dentry, struct iattr *attr) +int evm_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; enum integrity_status evm_status; @@ -801,7 +803,7 @@ int evm_inode_setattr(struct dentry *dentry, struct iattr *attr) return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && - !evm_attr_change(dentry, attr)) + !evm_attr_change(mnt_userns, dentry, attr)) return 0; integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry), diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 73917413365b..a8802b8da946 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -2247,6 +2247,10 @@ bool ima_appraise_signature(enum kernel_read_file_id id) if (id >= READING_MAX_ID) return false; + if (id == READING_KEXEC_IMAGE && !(ima_appraise & IMA_APPRAISE_ENFORCE) + && security_locked_down(LOCKDOWN_KEXEC)) + return false; + func = read_idmap[id] ?: FILE_CHECK; rcu_read_lock(); diff --git a/security/security.c b/security/security.c index 188b8f782220..f85afb02ea1c 100644 --- a/security/security.c +++ b/security/security.c @@ -1324,7 +1324,8 @@ int security_inode_permission(struct inode *inode, int mask) return call_int_hook(inode_permission, 0, inode, mask); } -int security_inode_setattr(struct dentry *dentry, struct iattr *attr) +int security_inode_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { int ret; @@ -1333,7 +1334,7 @@ int security_inode_setattr(struct dentry *dentry, struct iattr *attr) ret = call_int_hook(inode_setattr, 0, dentry, attr); if (ret) return ret; - return evm_inode_setattr(dentry, attr); + return evm_inode_setattr(mnt_userns, dentry, attr); } EXPORT_SYMBOL_GPL(security_inode_setattr); diff --git a/sound/soc/rockchip/rockchip_i2s.c b/sound/soc/rockchip/rockchip_i2s.c index 99a128a666fb..4ce5d2579387 100644 --- a/sound/soc/rockchip/rockchip_i2s.c +++ b/sound/soc/rockchip/rockchip_i2s.c @@ -13,7 +13,6 @@ #include <linux/of_gpio.h> #include <linux/of_device.h> #include <linux/clk.h> -#include <linux/pinctrl/consumer.h> #include <linux/pm_runtime.h> #include <linux/regmap.h> #include <linux/spinlock.h> @@ -55,40 +54,8 @@ struct rk_i2s_dev { const struct rk_i2s_pins *pins; unsigned int bclk_ratio; spinlock_t lock; /* tx/rx lock */ - struct pinctrl *pinctrl; - struct pinctrl_state *bclk_on; - struct pinctrl_state *bclk_off; }; -static int i2s_pinctrl_select_bclk_on(struct rk_i2s_dev *i2s) -{ - int ret = 0; - - if (!IS_ERR(i2s->pinctrl) && !IS_ERR_OR_NULL(i2s->bclk_on)) - ret = pinctrl_select_state(i2s->pinctrl, - i2s->bclk_on); - - if (ret) - dev_err(i2s->dev, "bclk enable failed %d\n", ret); - - return ret; -} - -static int i2s_pinctrl_select_bclk_off(struct rk_i2s_dev *i2s) -{ - - int ret = 0; - - if (!IS_ERR(i2s->pinctrl) && !IS_ERR_OR_NULL(i2s->bclk_off)) - ret = pinctrl_select_state(i2s->pinctrl, - i2s->bclk_off); - - if (ret) - dev_err(i2s->dev, "bclk disable failed %d\n", ret); - - return ret; -} - static int i2s_runtime_suspend(struct device *dev) { struct rk_i2s_dev *i2s = dev_get_drvdata(dev); @@ -125,49 +92,38 @@ static inline struct rk_i2s_dev *to_info(struct snd_soc_dai *dai) return snd_soc_dai_get_drvdata(dai); } -static int rockchip_snd_txctrl(struct rk_i2s_dev *i2s, int on) +static void rockchip_snd_txctrl(struct rk_i2s_dev *i2s, int on) { unsigned int val = 0; int retry = 10; - int ret = 0; spin_lock(&i2s->lock); if (on) { - ret = regmap_update_bits(i2s->regmap, I2S_DMACR, - I2S_DMACR_TDE_ENABLE, I2S_DMACR_TDE_ENABLE); - if (ret < 0) - goto end; + regmap_update_bits(i2s->regmap, I2S_DMACR, + I2S_DMACR_TDE_ENABLE, I2S_DMACR_TDE_ENABLE); - ret = regmap_update_bits(i2s->regmap, I2S_XFER, - I2S_XFER_TXS_START | I2S_XFER_RXS_START, - I2S_XFER_TXS_START | I2S_XFER_RXS_START); - if (ret < 0) - goto end; + regmap_update_bits(i2s->regmap, I2S_XFER, + I2S_XFER_TXS_START | I2S_XFER_RXS_START, + I2S_XFER_TXS_START | I2S_XFER_RXS_START); i2s->tx_start = true; } else { i2s->tx_start = false; - ret = regmap_update_bits(i2s->regmap, I2S_DMACR, - I2S_DMACR_TDE_ENABLE, I2S_DMACR_TDE_DISABLE); - if (ret < 0) - goto end; + regmap_update_bits(i2s->regmap, I2S_DMACR, + I2S_DMACR_TDE_ENABLE, I2S_DMACR_TDE_DISABLE); if (!i2s->rx_start) { - ret = regmap_update_bits(i2s->regmap, I2S_XFER, - I2S_XFER_TXS_START | - I2S_XFER_RXS_START, - I2S_XFER_TXS_STOP | - I2S_XFER_RXS_STOP); - if (ret < 0) - goto end; + regmap_update_bits(i2s->regmap, I2S_XFER, + I2S_XFER_TXS_START | + I2S_XFER_RXS_START, + I2S_XFER_TXS_STOP | + I2S_XFER_RXS_STOP); udelay(150); - ret = regmap_update_bits(i2s->regmap, I2S_CLR, - I2S_CLR_TXC | I2S_CLR_RXC, - I2S_CLR_TXC | I2S_CLR_RXC); - if (ret < 0) - goto end; + regmap_update_bits(i2s->regmap, I2S_CLR, + I2S_CLR_TXC | I2S_CLR_RXC, + I2S_CLR_TXC | I2S_CLR_RXC); regmap_read(i2s->regmap, I2S_CLR, &val); @@ -182,57 +138,44 @@ static int rockchip_snd_txctrl(struct rk_i2s_dev *i2s, int on) } } } -end: spin_unlock(&i2s->lock); - if (ret < 0) - dev_err(i2s->dev, "lrclk update failed\n"); - - return ret; } -static int rockchip_snd_rxctrl(struct rk_i2s_dev *i2s, int on) +static void rockchip_snd_rxctrl(struct rk_i2s_dev *i2s, int on) { unsigned int val = 0; int retry = 10; - int ret = 0; spin_lock(&i2s->lock); if (on) { - ret = regmap_update_bits(i2s->regmap, I2S_DMACR, + regmap_update_bits(i2s->regmap, I2S_DMACR, I2S_DMACR_RDE_ENABLE, I2S_DMACR_RDE_ENABLE); - if (ret < 0) - goto end; - ret = regmap_update_bits(i2s->regmap, I2S_XFER, + regmap_update_bits(i2s->regmap, I2S_XFER, I2S_XFER_TXS_START | I2S_XFER_RXS_START, I2S_XFER_TXS_START | I2S_XFER_RXS_START); - if (ret < 0) - goto end; i2s->rx_start = true; } else { i2s->rx_start = false; - ret = regmap_update_bits(i2s->regmap, I2S_DMACR, + regmap_update_bits(i2s->regmap, I2S_DMACR, I2S_DMACR_RDE_ENABLE, I2S_DMACR_RDE_DISABLE); - if (ret < 0) - goto end; if (!i2s->tx_start) { - ret = regmap_update_bits(i2s->regmap, I2S_XFER, + regmap_update_bits(i2s->regmap, I2S_XFER, I2S_XFER_TXS_START | I2S_XFER_RXS_START, I2S_XFER_TXS_STOP | I2S_XFER_RXS_STOP); - if (ret < 0) - goto end; + udelay(150); - ret = regmap_update_bits(i2s->regmap, I2S_CLR, + regmap_update_bits(i2s->regmap, I2S_CLR, I2S_CLR_TXC | I2S_CLR_RXC, I2S_CLR_TXC | I2S_CLR_RXC); - if (ret < 0) - goto end; + regmap_read(i2s->regmap, I2S_CLR, &val); + /* Should wait for clear operation to finish */ while (val) { regmap_read(i2s->regmap, I2S_CLR, &val); @@ -244,12 +187,7 @@ static int rockchip_snd_rxctrl(struct rk_i2s_dev *i2s, int on) } } } -end: spin_unlock(&i2s->lock); - if (ret < 0) - dev_err(i2s->dev, "lrclk update failed\n"); - - return ret; } static int rockchip_i2s_set_fmt(struct snd_soc_dai *cpu_dai, @@ -487,26 +425,17 @@ static int rockchip_i2s_trigger(struct snd_pcm_substream *substream, case SNDRV_PCM_TRIGGER_RESUME: case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: if (substream->stream == SNDRV_PCM_STREAM_CAPTURE) - ret = rockchip_snd_rxctrl(i2s, 1); + rockchip_snd_rxctrl(i2s, 1); else - ret = rockchip_snd_txctrl(i2s, 1); - /* Do not turn on bclk if lrclk open fails. */ - if (ret < 0) - return ret; - i2s_pinctrl_select_bclk_on(i2s); + rockchip_snd_txctrl(i2s, 1); break; case SNDRV_PCM_TRIGGER_SUSPEND: case SNDRV_PCM_TRIGGER_STOP: case SNDRV_PCM_TRIGGER_PAUSE_PUSH: - if (substream->stream == SNDRV_PCM_STREAM_CAPTURE) { - if (!i2s->tx_start) - i2s_pinctrl_select_bclk_off(i2s); - ret = rockchip_snd_rxctrl(i2s, 0); - } else { - if (!i2s->rx_start) - i2s_pinctrl_select_bclk_off(i2s); - ret = rockchip_snd_txctrl(i2s, 0); - } + if (substream->stream == SNDRV_PCM_STREAM_CAPTURE) + rockchip_snd_rxctrl(i2s, 0); + else + rockchip_snd_txctrl(i2s, 0); break; default: ret = -EINVAL; @@ -807,33 +736,6 @@ static int rockchip_i2s_probe(struct platform_device *pdev) } i2s->bclk_ratio = 64; - i2s->pinctrl = devm_pinctrl_get(&pdev->dev); - if (IS_ERR(i2s->pinctrl)) - dev_err(&pdev->dev, "failed to find i2s pinctrl\n"); - - i2s->bclk_on = pinctrl_lookup_state(i2s->pinctrl, - "bclk_on"); - if (IS_ERR_OR_NULL(i2s->bclk_on)) - dev_err(&pdev->dev, "failed to find i2s default state\n"); - else - dev_dbg(&pdev->dev, "find i2s bclk state\n"); - - i2s->bclk_off = pinctrl_lookup_state(i2s->pinctrl, - "bclk_off"); - if (IS_ERR_OR_NULL(i2s->bclk_off)) - dev_err(&pdev->dev, "failed to find i2s gpio state\n"); - else - dev_dbg(&pdev->dev, "find i2s bclk_off state\n"); - - i2s_pinctrl_select_bclk_off(i2s); - - i2s->playback_dma_data.addr = res->start + I2S_TXDR; - i2s->playback_dma_data.addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; - i2s->playback_dma_data.maxburst = 4; - - i2s->capture_dma_data.addr = res->start + I2S_RXDR; - i2s->capture_dma_data.addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; - i2s->capture_dma_data.maxburst = 4; dev_set_drvdata(&pdev->dev, i2s); diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 00f5227c8459..a77b915d36a8 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -302,6 +302,7 @@ #define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ #define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ +#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/tools/include/uapi/asm-generic/fcntl.h b/tools/include/uapi/asm-generic/fcntl.h index 0197042b7dfb..1ecdb911add8 100644 --- a/tools/include/uapi/asm-generic/fcntl.h +++ b/tools/include/uapi/asm-generic/fcntl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_GENERIC_FCNTL_H #define _ASM_GENERIC_FCNTL_H @@ -90,7 +91,7 @@ /* a horrid kludge trying to make sure that this will fail on old kernels */ #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) -#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT) +#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT) #ifndef O_NDELAY #define O_NDELAY O_NONBLOCK @@ -115,11 +116,13 @@ #define F_GETSIG 11 /* for sockets. */ #endif +#if __BITS_PER_LONG == 32 || defined(__KERNEL__) #ifndef F_GETLK64 #define F_GETLK64 12 /* using 'struct flock64' */ #define F_SETLK64 13 #define F_SETLKW64 14 #endif +#endif /* __BITS_PER_LONG == 32 || defined(__KERNEL__) */ #ifndef F_SETOWN_EX #define F_SETOWN_EX 15 @@ -178,6 +181,10 @@ struct f_owner_ex { blocking */ #define LOCK_UN 8 /* remove lock */ +/* + * LOCK_MAND support has been removed from the kernel. We leave the symbols + * here to not break legacy builds, but these should not be used in new code. + */ #define LOCK_MAND 32 /* This is a mandatory flock ... */ #define LOCK_READ 64 /* which allows concurrent read operations */ #define LOCK_WRITE 128 /* which allows concurrent write operations */ @@ -185,6 +192,7 @@ struct f_owner_ex { #define F_LINUX_SPECIFIC_BASE 1024 +#ifndef HAVE_ARCH_STRUCT_FLOCK struct flock { short l_type; short l_whence; @@ -209,5 +217,6 @@ struct flock64 { __ARCH_FLOCK64_PAD #endif }; +#endif /* HAVE_ARCH_STRUCT_FLOCK */ #endif /* _ASM_GENERIC_FCNTL_H */ diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 811897dadcae..860f867c50c0 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -2084,7 +2084,7 @@ struct kvm_stats_header { #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_BOOLEAN (0x4 << KVM_STATS_UNIT_SHIFT) -#define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES +#define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_BOOLEAN #define KVM_STATS_BASE_SHIFT 8 #define KVM_STATS_BASE_MASK (0xF << KVM_STATS_BASE_SHIFT) diff --git a/tools/perf/scripts/python/arm-cs-trace-disasm.py b/tools/perf/scripts/python/arm-cs-trace-disasm.py index 5f57d9829956..4339692a8d0b 100755 --- a/tools/perf/scripts/python/arm-cs-trace-disasm.py +++ b/tools/perf/scripts/python/arm-cs-trace-disasm.py @@ -61,7 +61,7 @@ def get_optional(perf_dict, field): def get_offset(perf_dict, field): if field in perf_dict: - return f"+0x{perf_dict[field]:x}" + return "+%#x" % perf_dict[field] return "" def get_dso_file_path(dso_name, dso_build_id): @@ -76,7 +76,7 @@ def get_dso_file_path(dso_name, dso_build_id): else: append = "/elf" - dso_path = f"{os.environ['PERF_BUILDID_DIR']}/{dso_name}/{dso_build_id}{append}" + dso_path = os.environ['PERF_BUILDID_DIR'] + "/" + dso_name + "/" + dso_build_id + append; # Replace duplicate slash chars to single slash char dso_path = dso_path.replace('//', '/', 1) return dso_path @@ -94,8 +94,8 @@ def read_disam(dso_fname, dso_start, start_addr, stop_addr): start_addr = start_addr - dso_start; stop_addr = stop_addr - dso_start; disasm = [ options.objdump_name, "-d", "-z", - f"--start-address=0x{start_addr:x}", - f"--stop-address=0x{stop_addr:x}" ] + "--start-address="+format(start_addr,"#x"), + "--stop-address="+format(stop_addr,"#x") ] disasm += [ dso_fname ] disasm_output = check_output(disasm).decode('utf-8').split('\n') disasm_cache[addr_range] = disasm_output @@ -109,12 +109,14 @@ def print_disam(dso_fname, dso_start, start_addr, stop_addr): m = disasm_re.search(line) if m is None: continue - print(f"\t{line}") + print("\t" + line) def print_sample(sample): - print(f"Sample = {{ cpu: {sample['cpu']:04} addr: 0x{sample['addr']:016x} " \ - f"phys_addr: 0x{sample['phys_addr']:016x} ip: 0x{sample['ip']:016x} " \ - f"pid: {sample['pid']} tid: {sample['tid']} period: {sample['period']} time: {sample['time']} }}") + print("Sample = { cpu: %04d addr: 0x%016x phys_addr: 0x%016x ip: 0x%016x " \ + "pid: %d tid: %d period: %d time: %d }" % \ + (sample['cpu'], sample['addr'], sample['phys_addr'], \ + sample['ip'], sample['pid'], sample['tid'], \ + sample['period'], sample['time'])) def trace_begin(): print('ARM CoreSight Trace Data Assembler Dump') @@ -131,7 +133,7 @@ def common_start_str(comm, sample): cpu = sample["cpu"] pid = sample["pid"] tid = sample["tid"] - return f"{comm:>16} {pid:>5}/{tid:<5} [{cpu:04}] {sec:9}.{ns:09} " + return "%16s %5u/%-5u [%04u] %9u.%09u " % (comm, pid, tid, cpu, sec, ns) # This code is copied from intel-pt-events.py for printing source code # line and symbols. @@ -171,7 +173,7 @@ def print_srccode(comm, param_dict, sample, symbol, dso): glb_line_number = line_number glb_source_file_name = source_file_name - print(f"{start_str}{src_str}") + print(start_str, src_str) def process_event(param_dict): global cache_size @@ -188,7 +190,7 @@ def process_event(param_dict): symbol = get_optional(param_dict, "symbol") if (options.verbose == True): - print(f"Event type: {name}") + print("Event type: %s" % name) print_sample(sample) # If cannot find dso so cannot dump assembler, bail out @@ -197,7 +199,7 @@ def process_event(param_dict): # Validate dso start and end addresses if ((dso_start == '[unknown]') or (dso_end == '[unknown]')): - print(f"Failed to find valid dso map for dso {dso}") + print("Failed to find valid dso map for dso %s" % dso) return if (name[0:12] == "instructions"): @@ -244,15 +246,15 @@ def process_event(param_dict): # Handle CS_ETM_TRACE_ON packet if start_addr=0 and stop_addr=4 if (start_addr == 0 and stop_addr == 4): - print(f"CPU{cpu}: CS_ETM_TRACE_ON packet is inserted") + print("CPU%d: CS_ETM_TRACE_ON packet is inserted" % cpu) return if (start_addr < int(dso_start) or start_addr > int(dso_end)): - print(f"Start address 0x{start_addr:x} is out of range [ 0x{dso_start:x} .. 0x{dso_end:x} ] for dso {dso}") + print("Start address 0x%x is out of range [ 0x%x .. 0x%x ] for dso %s" % (start_addr, int(dso_start), int(dso_end), dso)) return if (stop_addr < int(dso_start) or stop_addr > int(dso_end)): - print(f"Stop address 0x{stop_addr:x} is out of range [ 0x{dso_start:x} .. 0x{dso_end:x} ] for dso {dso}") + print("Stop address 0x%x is out of range [ 0x%x .. 0x%x ] for dso %s" % (stop_addr, int(dso_start), int(dso_end), dso)) return if (options.objdump_name != None): @@ -267,6 +269,6 @@ def process_event(param_dict): if path.exists(dso_fname): print_disam(dso_fname, dso_vm_start, start_addr, stop_addr) else: - print(f"Failed to find dso {dso} for address range [ 0x{start_addr:x} .. 0x{stop_addr:x} ]") + print("Failed to find dso %s for address range [ 0x%x .. 0x%x ]" % (dso, start_addr, stop_addr)) print_srccode(comm, param_dict, sample, symbol, dso) diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index f8ad581ea247..cdd6463a5b68 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -63,20 +63,16 @@ static struct hashmap *bpf_map_hash; static struct bpf_perf_object * bpf_perf_object__next(struct bpf_perf_object *prev) { - struct bpf_perf_object *next; - - if (!prev) - next = list_first_entry(&bpf_objects_list, - struct bpf_perf_object, - list); - else - next = list_next_entry(prev, list); + if (!prev) { + if (list_empty(&bpf_objects_list)) + return NULL; - /* Empty list is noticed here so don't need checking on entry. */ - if (&next->list == &bpf_objects_list) + return list_first_entry(&bpf_objects_list, struct bpf_perf_object, list); + } + if (list_is_last(&prev->list, &bpf_objects_list)) return NULL; - return next; + return list_next_entry(prev, list); } #define bpf_perf_object__for_each(perf_obj, tmp) \ diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index ecd377938eea..b3be5b1d9dbb 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -233,6 +233,33 @@ Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep, return NULL; } +static int elf_read_program_header(Elf *elf, u64 vaddr, GElf_Phdr *phdr) +{ + size_t i, phdrnum; + u64 sz; + + if (elf_getphdrnum(elf, &phdrnum)) + return -1; + + for (i = 0; i < phdrnum; i++) { + if (gelf_getphdr(elf, i, phdr) == NULL) + return -1; + + if (phdr->p_type != PT_LOAD) + continue; + + sz = max(phdr->p_memsz, phdr->p_filesz); + if (!sz) + continue; + + if (vaddr >= phdr->p_vaddr && (vaddr < phdr->p_vaddr + sz)) + return 0; + } + + /* Not found any valid program header */ + return -1; +} + static bool want_demangle(bool is_kernel_sym) { return is_kernel_sym ? symbol_conf.demangle_kernel : symbol_conf.demangle; @@ -1209,6 +1236,7 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, sym.st_value); used_opd = true; } + /* * When loading symbols in a data mapping, ABS symbols (which * has a value of SHN_ABS in its st_shndx) failed at @@ -1227,6 +1255,17 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, gelf_getshdr(sec, &shdr); + /* + * If the attribute bit SHF_ALLOC is not set, the section + * doesn't occupy memory during process execution. + * E.g. ".gnu.warning.*" section is used by linker to generate + * warnings when calling deprecated functions, the symbols in + * the section aren't loaded to memory during process execution, + * so skip them. + */ + if (!(shdr.sh_flags & SHF_ALLOC)) + continue; + secstrs = secstrs_sym; /* @@ -1262,11 +1301,20 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, goto out_elf_end; } else if ((used_opd && runtime_ss->adjust_symbols) || (!used_opd && syms_ss->adjust_symbols)) { + GElf_Phdr phdr; + + if (elf_read_program_header(syms_ss->elf, + (u64)sym.st_value, &phdr)) { + pr_warning("%s: failed to find program header for " + "symbol: %s st_value: %#" PRIx64 "\n", + __func__, elf_name, (u64)sym.st_value); + continue; + } pr_debug4("%s: adjusting symbol: st_value: %#" PRIx64 " " - "sh_addr: %#" PRIx64 " sh_offset: %#" PRIx64 "\n", __func__, - (u64)sym.st_value, (u64)shdr.sh_addr, - (u64)shdr.sh_offset); - sym.st_value -= shdr.sh_addr - shdr.sh_offset; + "p_vaddr: %#" PRIx64 " p_offset: %#" PRIx64 "\n", + __func__, (u64)sym.st_value, (u64)phdr.p_vaddr, + (u64)phdr.p_offset); + sym.st_value -= phdr.p_vaddr - phdr.p_offset; } demangled = demangle_sym(dso, kmodule, elf_name); diff --git a/tools/testing/selftests/gpio/Makefile b/tools/testing/selftests/gpio/Makefile index 71b306602368..616ed4019655 100644 --- a/tools/testing/selftests/gpio/Makefile +++ b/tools/testing/selftests/gpio/Makefile @@ -3,6 +3,6 @@ TEST_PROGS := gpio-mockup.sh gpio-sim.sh TEST_FILES := gpio-mockup-sysfs.sh TEST_GEN_PROGS_EXTENDED := gpio-mockup-cdev gpio-chip-info gpio-line-name -CFLAGS += -O2 -g -Wall -I../../../../usr/include/ +CFLAGS += -O2 -g -Wall -I../../../../usr/include/ $(KHDR_INCLUDES) include ../lib.mk diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index 4158da0da2bb..2237d1aac801 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -82,8 +82,9 @@ static int next_cpu(int cpu) return cpu; } -static void *migration_worker(void *ign) +static void *migration_worker(void *__rseq_tid) { + pid_t rseq_tid = (pid_t)(unsigned long)__rseq_tid; cpu_set_t allowed_mask; int r, i, cpu; @@ -106,7 +107,7 @@ static void *migration_worker(void *ign) * stable, i.e. while changing affinity is in-progress. */ smp_wmb(); - r = sched_setaffinity(0, sizeof(allowed_mask), &allowed_mask); + r = sched_setaffinity(rseq_tid, sizeof(allowed_mask), &allowed_mask); TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)", errno, strerror(errno)); smp_wmb(); @@ -231,7 +232,8 @@ int main(int argc, char *argv[]) vm = vm_create_default(VCPU_ID, 0, guest_code); ucall_init(vm, NULL); - pthread_create(&migration_thread, NULL, migration_worker, 0); + pthread_create(&migration_thread, NULL, migration_worker, + (void *)(unsigned long)gettid()); for (i = 0; !done; i++) { vcpu_run(vm, VCPU_ID); |