summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/arm64/booting.rst8
-rw-r--r--Documentation/arm64/cpu-feature-registers.rst38
-rw-r--r--Documentation/devicetree/bindings/power/fsl,imx-gpcv2.yaml3
-rw-r--r--Documentation/driver-api/driver-model/devres.rst1
-rw-r--r--Documentation/kbuild/reproducible-builds.rst10
-rw-r--r--Documentation/kernel-hacking/hacking.rst2
-rw-r--r--Documentation/process/2.Process.rst15
-rw-r--r--Documentation/process/howto.rst2
-rw-r--r--Documentation/trace/histogram.rst2
-rw-r--r--Documentation/translations/it_IT/process/howto.rst2
-rw-r--r--Documentation/translations/ja_JP/howto.rst2
-rw-r--r--Documentation/translations/ko_KR/howto.rst2
-rw-r--r--Documentation/translations/zh_CN/process/howto.rst2
-rw-r--r--Documentation/translations/zh_TW/process/howto.rst2
-rw-r--r--Documentation/virt/kvm/api.rst41
-rw-r--r--Documentation/virt/kvm/devices/arm-vgic-its.rst5
-rw-r--r--MAINTAINERS27
-rw-r--r--Makefile4
-rw-r--r--arch/arm/boot/dts/imx6q-yapp4-crux.dts4
-rw-r--r--arch/arm/boot/dts/imx6qdl-gw5910.dtsi2
-rw-r--r--arch/arm/boot/dts/imx6qdl-gw5913.dtsi2
-rw-r--r--arch/arm/boot/dts/imx6qp-yapp4-crux-plus.dts4
-rw-r--r--arch/arm/boot/dts/ste-href.dtsi8
-rw-r--r--arch/arm/boot/dts/ste-snowball.dts8
-rw-r--r--arch/arm/boot/dts/ste-ux500-samsung-codina-tmo.dts8
-rw-r--r--arch/arm/boot/dts/ste-ux500-samsung-codina.dts8
-rw-r--r--arch/arm/boot/dts/ste-ux500-samsung-gavini.dts8
-rw-r--r--arch/arm/boot/dts/ste-ux500-samsung-golden.dts8
-rw-r--r--arch/arm/boot/dts/ste-ux500-samsung-janice.dts8
-rw-r--r--arch/arm/boot/dts/ste-ux500-samsung-kyle.dts8
-rw-r--r--arch/arm/boot/dts/ste-ux500-samsung-skomer.dts8
-rw-r--r--arch/arm64/Kconfig1
-rw-r--r--arch/arm64/boot/dts/arm/juno-base.dtsi14
-rw-r--r--arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi6
-rw-r--r--arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi6
-rw-r--r--arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi6
-rw-r--r--arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi18
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mm-mx8menlo.dts16
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mm.dtsi8
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mn.dtsi4
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi20
-rw-r--r--arch/arm64/boot/dts/freescale/imx93.dtsi11
-rw-r--r--arch/arm64/include/asm/efi.h8
-rw-r--r--arch/arm64/include/asm/kvm_arm.h8
-rw-r--r--arch/arm64/include/asm/kvm_asm.h7
-rw-r--r--arch/arm64/include/asm/kvm_host.h76
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h3
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h2
-rw-r--r--arch/arm64/include/asm/kvm_pgtable.h175
-rw-r--r--arch/arm64/include/asm/kvm_pkvm.h38
-rw-r--r--arch/arm64/include/asm/mte.h65
-rw-r--r--arch/arm64/include/asm/pgtable.h4
-rw-r--r--arch/arm64/include/asm/sysreg.h2
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h1
-rw-r--r--arch/arm64/kernel/cpufeature.c46
-rw-r--r--arch/arm64/kernel/efi-rt-wrapper.S33
-rw-r--r--arch/arm64/kernel/efi.c26
-rw-r--r--arch/arm64/kernel/elfcore.c2
-rw-r--r--arch/arm64/kernel/entry-common.c3
-rw-r--r--arch/arm64/kernel/hibernate.c2
-rw-r--r--arch/arm64/kernel/image-vars.h15
-rw-r--r--arch/arm64/kernel/mte.c21
-rw-r--r--arch/arm64/kvm/Kconfig2
-rw-r--r--arch/arm64/kvm/arm.c90
-rw-r--r--arch/arm64/kvm/guest.c18
-rw-r--r--arch/arm64/kvm/hyp/exception.c3
-rw-r--r--arch/arm64/kvm/hyp/hyp-constants.c3
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h20
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/mem_protect.h25
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/memory.h27
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/mm.h18
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/pkvm.h68
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/spinlock.h10
-rw-r--r--arch/arm64/kvm/hyp/nvhe/cache.S11
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-main.c110
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-smp.c2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c523
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mm.c167
-rw-r--r--arch/arm64/kvm/hyp/nvhe/page_alloc.c29
-rw-r--r--arch/arm64/kvm/hyp/nvhe/pkvm.c436
-rw-r--r--arch/arm64/kvm/hyp/nvhe/setup.c98
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c26
-rw-r--r--arch/arm64/kvm/hyp/pgtable.c652
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c8
-rw-r--r--arch/arm64/kvm/mmu.c193
-rw-r--r--arch/arm64/kvm/pkvm.c138
-rw-r--r--arch/arm64/kvm/pmu-emul.c482
-rw-r--r--arch/arm64/kvm/reset.c29
-rw-r--r--arch/arm64/kvm/sys_regs.c157
-rw-r--r--arch/arm64/kvm/vgic/vgic-its.c20
-rw-r--r--arch/arm64/mm/copypage.c7
-rw-r--r--arch/arm64/mm/fault.c8
-rw-r--r--arch/arm64/mm/mteswap.c16
-rw-r--r--arch/parisc/include/asm/hardware.h12
-rw-r--r--arch/parisc/include/uapi/asm/pdc.h36
-rw-r--r--arch/parisc/kernel/drivers.c14
-rw-r--r--arch/powerpc/Kconfig3
-rw-r--r--arch/powerpc/include/asm/syscalls.h7
-rw-r--r--arch/powerpc/kernel/sys_ppc32.c13
-rw-r--r--arch/powerpc/kernel/syscalls/syscall.tbl7
-rw-r--r--arch/x86/coco/tdx/tdx.c25
-rw-r--r--arch/x86/events/intel/core.c1
-rw-r--r--arch/x86/events/intel/ds.c18
-rw-r--r--arch/x86/events/rapl.c6
-rw-r--r--arch/x86/include/asm/intel-family.h11
-rw-r--r--arch/x86/include/asm/kvm_host.h2
-rw-r--r--arch/x86/include/asm/syscall_wrapper.h2
-rw-r--r--arch/x86/kvm/cpuid.c13
-rw-r--r--arch/x86/kvm/debugfs.c7
-rw-r--r--arch/x86/kvm/emulate.c108
-rw-r--r--arch/x86/kvm/vmx/capabilities.h19
-rw-r--r--arch/x86/kvm/vmx/vmx.c23
-rw-r--r--arch/x86/kvm/x86.c45
-rw-r--r--arch/x86/kvm/xen.c64
-rw-r--r--arch/x86/xen/pmu.c2
-rw-r--r--arch/x86/xen/setup.c23
-rw-r--r--block/blk-mq.c5
-rw-r--r--block/genhd.c1
-rw-r--r--drivers/acpi/numa/srat.c1
-rw-r--r--drivers/acpi/x86/utils.c6
-rw-r--r--drivers/ata/pata_legacy.c5
-rw-r--r--drivers/ata/pata_palmld.c4
-rw-r--r--drivers/block/Kconfig6
-rw-r--r--drivers/block/ublk_drv.c115
-rw-r--r--drivers/bluetooth/virtio_bt.c2
-rw-r--r--drivers/clk/clk-renesas-pcie.c65
-rw-r--r--drivers/clk/clk.c6
-rw-r--r--drivers/clk/mediatek/clk-mt8195-topckgen.c4
-rw-r--r--drivers/clk/qcom/gcc-sc7280.c1
-rw-r--r--drivers/clk/qcom/gpucc-sc7280.c1
-rw-r--r--drivers/clk/renesas/r8a779g0-cpg-mssr.c13
-rw-r--r--drivers/clk/sifive/Kconfig4
-rw-r--r--drivers/cxl/core/mbox.c2
-rw-r--r--drivers/cxl/core/pmem.c2
-rw-r--r--drivers/cxl/core/port.c11
-rw-r--r--drivers/cxl/core/region.c113
-rw-r--r--drivers/cxl/cxl.h4
-rw-r--r--drivers/cxl/pmem.c105
-rw-r--r--drivers/firmware/arm_scmi/bus.c11
-rw-r--r--drivers/firmware/arm_scmi/common.h5
-rw-r--r--drivers/firmware/arm_scmi/driver.c41
-rw-r--r--drivers/firmware/arm_scmi/mailbox.c2
-rw-r--r--drivers/firmware/arm_scmi/optee.c2
-rw-r--r--drivers/firmware/arm_scmi/shmem.c31
-rw-r--r--drivers/firmware/arm_scmi/smc.c2
-rw-r--r--drivers/firmware/arm_scmi/virtio.c26
-rw-r--r--drivers/firmware/efi/efi.c2
-rw-r--r--drivers/firmware/efi/libstub/random.c7
-rw-r--r--drivers/firmware/efi/tpm.c2
-rw-r--r--drivers/firmware/efi/vars.c68
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c7
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c15
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c10
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h764
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm6
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_migrate.c4
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c3
-rw-r--r--drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c11
-rw-r--r--drivers/gpu/drm/amd/display/dc/dc.h1
-rw-r--r--drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c4
-rw-r--r--drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c2
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml/dcn20/dcn20_fpu.c1
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c4
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c10
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_util_32.c7
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_util_32.h3
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml/dcn32/display_rq_dlg_calc_32.c4
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml/dcn321/dcn321_fpu.c15
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml/display_mode_structs.h3
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml/display_mode_vba.c2
-rw-r--r--drivers/gpu/drm/drm_format_helper.c66
-rw-r--r--drivers/gpu/drm/i915/Makefile1
-rw-r--r--drivers/gpu/drm/i915/display/intel_ddi.c68
-rw-r--r--drivers/gpu/drm/i915/display/intel_display_core.h8
-rw-r--r--drivers/gpu/drm/i915/display/intel_display_power_well.c7
-rw-r--r--drivers/gpu/drm/i915/display/intel_dkl_phy.c109
-rw-r--r--drivers/gpu/drm/i915/display/intel_dkl_phy.h24
-rw-r--r--drivers/gpu/drm/i915/display/intel_dp.c2
-rw-r--r--drivers/gpu/drm/i915/display/intel_dpll_mgr.c59
-rw-r--r--drivers/gpu/drm/i915/display/intel_lvds.c3
-rw-r--r--drivers/gpu/drm/i915/display/intel_panel.c4
-rw-r--r--drivers/gpu/drm/i915/display/intel_panel.h2
-rw-r--r--drivers/gpu/drm/i915/display/intel_sdvo.c64
-rw-r--r--drivers/gpu/drm/i915/gem/i915_gem_internal.c19
-rw-r--r--drivers/gpu/drm/i915/gem/i915_gem_shmem.c2
-rw-r--r--drivers/gpu/drm/i915/gem/i915_gem_ttm.c4
-rw-r--r--drivers/gpu/drm/i915/gem/i915_gem_userptr.c2
-rw-r--r--drivers/gpu/drm/i915/i915_driver.c1
-rw-r--r--drivers/gpu/drm/i915/i915_reg.h3
-rw-r--r--drivers/gpu/drm/i915/i915_scatterlist.h34
-rw-r--r--drivers/gpu/drm/imx/Kconfig1
-rw-r--r--drivers/gpu/drm/imx/imx-tve.c5
-rw-r--r--drivers/gpu/drm/rockchip/dw-mipi-dsi-rockchip.c26
-rw-r--r--drivers/gpu/drm/rockchip/dw_hdmi-rockchip.c3
-rw-r--r--drivers/gpu/drm/rockchip/rockchip_drm_gem.c5
-rw-r--r--drivers/gpu/drm/rockchip/rockchip_drm_vop2.c10
-rw-r--r--drivers/hwmon/pmbus/pmbus.h1
-rw-r--r--drivers/hwmon/scmi-hwmon.c116
-rw-r--r--drivers/i2c/busses/i2c-i801.c1
-rw-r--r--drivers/i2c/busses/i2c-piix4.c1
-rw-r--r--drivers/i2c/busses/i2c-tegra.c16
-rw-r--r--drivers/infiniband/core/cma.c2
-rw-r--r--drivers/infiniband/core/device.c10
-rw-r--r--drivers/infiniband/core/nldev.c2
-rw-r--r--drivers/infiniband/hw/efa/efa_main.c4
-rw-r--r--drivers/infiniband/hw/hfi1/pio.c3
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_hw_v2.c15
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_hw_v2.h2
-rw-r--r--drivers/infiniband/hw/qedr/main.c9
-rw-r--r--drivers/infiniband/sw/rxe/rxe_resp.c4
-rw-r--r--drivers/isdn/hardware/mISDN/netjet.c2
-rw-r--r--drivers/isdn/mISDN/core.c5
-rw-r--r--drivers/net/dsa/dsa_loop.c25
-rw-r--r--drivers/net/ethernet/adi/adin1110.c38
-rw-r--r--drivers/net/ethernet/freescale/fec_main.c4
-rw-r--r--drivers/net/ethernet/ibm/ibmvnic.c16
-rw-r--r--drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c26
-rw-r--r--drivers/net/ethernet/microchip/lan966x/lan966x_main.c4
-rw-r--r--drivers/net/ethernet/microchip/lan966x/lan966x_main.h2
-rw-r--r--drivers/net/ethernet/microchip/lan966x/lan966x_regs.h15
-rw-r--r--drivers/net/ethernet/microchip/lan966x/lan966x_vlan.c6
-rw-r--r--drivers/net/ethernet/sfc/efx.c8
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c7
-rw-r--r--drivers/net/ethernet/xilinx/xilinx_emaclite.c2
-rw-r--r--drivers/net/phy/mdio_bus.c2
-rw-r--r--drivers/net/tun.c3
-rw-r--r--drivers/nfc/fdp/fdp.c10
-rw-r--r--drivers/nfc/nfcmrvl/i2c.c7
-rw-r--r--drivers/nfc/nxp-nci/core.c7
-rw-r--r--drivers/nfc/s3fwrn5/core.c8
-rw-r--r--drivers/parisc/iosapic.c1
-rw-r--r--drivers/parisc/pdc_stable.c34
-rw-r--r--drivers/soc/imx/imx93-pd.c17
-rw-r--r--drivers/tty/serial/8250/8250_parisc.c (renamed from drivers/tty/serial/8250/8250_gsc.c)0
-rw-r--r--drivers/tty/serial/8250/Kconfig4
-rw-r--r--drivers/tty/serial/8250/Makefile2
-rw-r--r--drivers/watchdog/exar_wdt.c4
-rw-r--r--drivers/watchdog/sp805_wdt.c2
-rw-r--r--fs/btrfs/backref.c54
-rw-r--r--fs/btrfs/ctree.h5
-rw-r--r--fs/btrfs/disk-io.c10
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/export.c2
-rw-r--r--fs/btrfs/export.h2
-rw-r--r--fs/btrfs/extent-tree.c25
-rw-r--r--fs/btrfs/file.c29
-rw-r--r--fs/btrfs/inode.c16
-rw-r--r--fs/btrfs/raid56.c18
-rw-r--r--fs/btrfs/send.c24
-rw-r--r--fs/btrfs/super.c16
-rw-r--r--fs/btrfs/tests/qgroup-tests.c36
-rw-r--r--fs/btrfs/volumes.c12
-rw-r--r--fs/btrfs/volumes.h2
-rw-r--r--fs/cifs/cifsfs.c26
-rw-r--r--fs/cifs/inode.c5
-rw-r--r--fs/cifs/misc.c6
-rw-r--r--fs/cifs/smb2misc.c81
-rw-r--r--fs/cifs/smb2ops.c30
-rw-r--r--fs/cifs/smb2transport.c19
-rw-r--r--fs/ext4/fast_commit.c5
-rw-r--r--fs/ext4/ioctl.c3
-rw-r--r--fs/ext4/migrate.c3
-rw-r--r--fs/ext4/namei.c10
-rw-r--r--fs/ext4/resize.c5
-rw-r--r--fs/ext4/super.c2
-rw-r--r--fs/fuse/file.c4
-rw-r--r--fs/fuse/readdir.c10
-rw-r--r--fs/nfs/client.c4
-rw-r--r--fs/nfs/delegation.c36
-rw-r--r--fs/nfs/dir.c5
-rw-r--r--fs/nfs/dns_resolve.c7
-rw-r--r--fs/nfs/dns_resolve.h2
-rw-r--r--fs/nfs/fs_context.c14
-rw-r--r--fs/nfs/internal.h14
-rw-r--r--fs/nfs/mount_clnt.c4
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs3client.c4
-rw-r--r--fs/nfs/nfs42proc.c3
-rw-r--r--fs/nfs/nfs4_fs.h2
-rw-r--r--fs/nfs/nfs4client.c19
-rw-r--r--fs/nfs/nfs4namespace.c16
-rw-r--r--fs/nfs/nfs4proc.c10
-rw-r--r--fs/nfs/nfs4state.c2
-rw-r--r--fs/nfs/pnfs_nfs.c6
-rw-r--r--fs/nfs/super.c5
-rw-r--r--fs/nfsd/filecache.c5
-rw-r--r--fs/proc/page.c3
-rw-r--r--fs/xfs/libxfs/xfs_ag.h15
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c6
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c9
-rw-r--r--fs/xfs/libxfs/xfs_format.h22
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h60
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c286
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h40
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c15
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c9
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c4
-rw-r--r--fs/xfs/libxfs/xfs_types.h30
-rw-r--r--fs/xfs/scrub/alloc.c4
-rw-r--r--fs/xfs/scrub/ialloc.c5
-rw-r--r--fs/xfs/scrub/refcount.c72
-rw-r--r--fs/xfs/xfs_attr_item.c67
-rw-r--r--fs/xfs/xfs_bmap_item.c54
-rw-r--r--fs/xfs/xfs_error.c9
-rw-r--r--fs/xfs/xfs_extfree_item.c94
-rw-r--r--fs/xfs/xfs_extfree_item.h16
-rw-r--r--fs/xfs/xfs_file.c7
-rw-r--r--fs/xfs/xfs_inode.c2
-rw-r--r--fs/xfs/xfs_log_recover.c10
-rw-r--r--fs/xfs/xfs_ondisk.h23
-rw-r--r--fs/xfs/xfs_refcount_item.c57
-rw-r--r--fs/xfs/xfs_rmap_item.c70
-rw-r--r--fs/xfs/xfs_super.c12
-rw-r--r--fs/xfs/xfs_sysfs.h7
-rw-r--r--fs/xfs/xfs_trace.h48
-rw-r--r--fs/xfs/xfs_trans_ail.c3
-rw-r--r--include/asm-generic/compat.h2
-rw-r--r--include/kvm/arm_pmu.h15
-rw-r--r--include/kvm/arm_vgic.h1
-rw-r--r--include/linux/efi.h2
-rw-r--r--include/linux/fortify-string.h13
-rw-r--r--include/linux/kernel-page-flags.h1
-rw-r--r--include/linux/kvm_dirty_ring.h20
-rw-r--r--include/linux/kvm_host.h34
-rw-r--r--include/linux/page-flags.h3
-rw-r--r--include/net/netlink.h48
-rw-r--r--include/net/sock.h7
-rw-r--r--include/trace/events/mmflags.h9
-rw-r--r--include/uapi/linux/kvm.h1
-rw-r--r--kernel/events/hw_breakpoint_test.c4
-rw-r--r--kernel/kprobes.c5
-rw-r--r--kernel/trace/fprobe.c5
-rw-r--r--kernel/trace/ftrace.c16
-rw-r--r--kernel/trace/kprobe_event_gen_test.c18
-rw-r--r--kernel/trace/ring_buffer.c11
-rw-r--r--lib/nlattr.c41
-rw-r--r--mm/Kconfig8
-rw-r--r--mm/huge_memory.c3
-rw-r--r--net/bluetooth/hci_conn.c18
-rw-r--r--net/bluetooth/iso.c14
-rw-r--r--net/bluetooth/l2cap_core.c86
-rw-r--r--net/bridge/br_netlink.c2
-rw-r--r--net/bridge/br_sysfs_br.c2
-rw-r--r--net/core/neighbour.c2
-rw-r--r--net/dsa/dsa2.c13
-rw-r--r--net/ipv4/af_inet.c2
-rw-r--r--net/ipv4/tcp_bpf.c4
-rw-r--r--net/ipv4/tcp_ulp.c3
-rw-r--r--net/ipv4/udp_bpf.c4
-rw-r--r--net/ipv6/route.c14
-rw-r--r--net/ipv6/udp.c1
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h30
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c10
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c30
-rw-r--r--net/netfilter/nf_nat_core.c11
-rw-r--r--net/netfilter/nf_tables_api.c8
-rw-r--r--net/netfilter/nft_payload.c6
-rw-r--r--net/openvswitch/datapath.c1
-rw-r--r--net/rose/rose_link.c3
-rw-r--r--net/sched/sch_red.c4
-rw-r--r--net/smc/af_smc.c6
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c2
-rw-r--r--net/sunrpc/sysfs.c12
-rw-r--r--net/unix/unix_bpf.c8
-rw-r--r--net/vmw_vsock/af_vsock.c7
-rw-r--r--scripts/Makefile.modpost2
-rw-r--r--scripts/kconfig/menu.c23
-rw-r--r--security/commoncap.c6
-rw-r--r--tools/include/linux/bitfield.h176
-rw-r--r--tools/include/nolibc/string.h17
-rw-r--r--tools/testing/cxl/test/cxl.c301
-rw-r--r--tools/testing/selftests/kvm/.gitignore1
-rw-r--r--tools/testing/selftests/kvm/Makefile3
-rw-r--r--tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c3
-rw-r--r--tools/testing/selftests/kvm/aarch64/debug-exceptions.c311
-rw-r--r--tools/testing/selftests/kvm/aarch64/page_fault_test.c1112
-rw-r--r--tools/testing/selftests/kvm/access_tracking_perf_test.c8
-rw-r--r--tools/testing/selftests/kvm/demand_paging_test.c228
-rw-r--r--tools/testing/selftests/kvm/dirty_log_test.c53
-rw-r--r--tools/testing/selftests/kvm/include/aarch64/processor.h35
-rw-r--r--tools/testing/selftests/kvm/include/kvm_util_base.h31
-rw-r--r--tools/testing/selftests/kvm/include/perf_test_util.h3
-rw-r--r--tools/testing/selftests/kvm/include/userfaultfd_util.h45
-rw-r--r--tools/testing/selftests/kvm/lib/aarch64/processor.c55
-rw-r--r--tools/testing/selftests/kvm/lib/elf.c3
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util.c84
-rw-r--r--tools/testing/selftests/kvm/lib/perf_test_util.c3
-rw-r--r--tools/testing/selftests/kvm/lib/riscv/processor.c29
-rw-r--r--tools/testing/selftests/kvm/lib/s390x/processor.c8
-rw-r--r--tools/testing/selftests/kvm/lib/userfaultfd_util.c186
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/processor.c13
-rw-r--r--tools/testing/selftests/kvm/memslot_modification_stress_test.c6
-rw-r--r--tools/testing/selftests/kvm/memslot_perf_test.c317
-rw-r--r--tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c142
-rw-r--r--tools/testing/selftests/landlock/Makefile7
-rw-r--r--tools/testing/selftests/pidfd/Makefile2
-rw-r--r--tools/testing/selftests/pidfd/pidfd_test.c4
-rw-r--r--tools/testing/selftests/pidfd/pidfd_wait.c12
-rw-r--r--virt/kvm/Kconfig6
-rw-r--r--virt/kvm/dirty_ring.c46
-rw-r--r--virt/kvm/kvm_main.c81
-rw-r--r--virt/kvm/pfncache.c62
404 files changed, 9344 insertions, 3622 deletions
diff --git a/Documentation/arm64/booting.rst b/Documentation/arm64/booting.rst
index 8aefa1001ae5..8c324ad638de 100644
--- a/Documentation/arm64/booting.rst
+++ b/Documentation/arm64/booting.rst
@@ -340,6 +340,14 @@ Before jumping into the kernel, the following conditions must be met:
- SMCR_EL2.LEN must be initialised to the same value for all CPUs the
kernel will execute on.
+ - HWFGRTR_EL2.nTPIDR2_EL0 (bit 55) must be initialised to 0b01.
+
+ - HWFGWTR_EL2.nTPIDR2_EL0 (bit 55) must be initialised to 0b01.
+
+ - HWFGRTR_EL2.nSMPRI_EL1 (bit 54) must be initialised to 0b01.
+
+ - HWFGWTR_EL2.nSMPRI_EL1 (bit 54) must be initialised to 0b01.
+
For CPUs with the Scalable Matrix Extension FA64 feature (FEAT_SME_FA64)
- If EL3 is present:
diff --git a/Documentation/arm64/cpu-feature-registers.rst b/Documentation/arm64/cpu-feature-registers.rst
index 04ba83e1965f..c7adc7897df6 100644
--- a/Documentation/arm64/cpu-feature-registers.rst
+++ b/Documentation/arm64/cpu-feature-registers.rst
@@ -92,7 +92,7 @@ operation if the source belongs to the supported system register space.
The infrastructure emulates only the following system register space::
- Op0=3, Op1=0, CRn=0, CRm=0,4,5,6,7
+ Op0=3, Op1=0, CRn=0, CRm=0,2,3,4,5,6,7
(See Table C5-6 'System instruction encodings for non-Debug System
register accesses' in ARMv8 ARM DDI 0487A.h, for the list of
@@ -293,6 +293,42 @@ infrastructure:
| WFXT | [3-0] | y |
+------------------------------+---------+---------+
+ 10) MVFR0_EL1 - AArch32 Media and VFP Feature Register 0
+
+ +------------------------------+---------+---------+
+ | Name | bits | visible |
+ +------------------------------+---------+---------+
+ | FPDP | [11-8] | y |
+ +------------------------------+---------+---------+
+
+ 11) MVFR1_EL1 - AArch32 Media and VFP Feature Register 1
+
+ +------------------------------+---------+---------+
+ | Name | bits | visible |
+ +------------------------------+---------+---------+
+ | SIMDFMAC | [31-28] | y |
+ +------------------------------+---------+---------+
+ | SIMDSP | [19-16] | y |
+ +------------------------------+---------+---------+
+ | SIMDInt | [15-12] | y |
+ +------------------------------+---------+---------+
+ | SIMDLS | [11-8] | y |
+ +------------------------------+---------+---------+
+
+ 12) ID_ISAR5_EL1 - AArch32 Instruction Set Attribute Register 5
+
+ +------------------------------+---------+---------+
+ | Name | bits | visible |
+ +------------------------------+---------+---------+
+ | CRC32 | [19-16] | y |
+ +------------------------------+---------+---------+
+ | SHA2 | [15-12] | y |
+ +------------------------------+---------+---------+
+ | SHA1 | [11-8] | y |
+ +------------------------------+---------+---------+
+ | AES | [7-4] | y |
+ +------------------------------+---------+---------+
+
Appendix I: Example
-------------------
diff --git a/Documentation/devicetree/bindings/power/fsl,imx-gpcv2.yaml b/Documentation/devicetree/bindings/power/fsl,imx-gpcv2.yaml
index 58022ae7d5dd..dfdb8dfb6b65 100644
--- a/Documentation/devicetree/bindings/power/fsl,imx-gpcv2.yaml
+++ b/Documentation/devicetree/bindings/power/fsl,imx-gpcv2.yaml
@@ -81,6 +81,9 @@ properties:
power-supply: true
+ power-domains:
+ maxItems: 1
+
resets:
description: |
A number of phandles to resets that need to be asserted during
diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index 687adb58048e..56082265e8e5 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -279,6 +279,7 @@ GPIO
devm_gpio_request_one()
I2C
+ devm_i2c_add_adapter()
devm_i2c_new_dummy_device()
IIO
diff --git a/Documentation/kbuild/reproducible-builds.rst b/Documentation/kbuild/reproducible-builds.rst
index 071f0151a7a4..f2dcc39044e6 100644
--- a/Documentation/kbuild/reproducible-builds.rst
+++ b/Documentation/kbuild/reproducible-builds.rst
@@ -119,6 +119,16 @@ To avoid this, you can make the vDSO different for different
kernel versions by including an arbitrary string of "salt" in it.
This is specified by the Kconfig symbol ``CONFIG_BUILD_SALT``.
+Git
+---
+
+Uncommitted changes or different commit ids in git can also lead
+to different compilation results. For example, after executing
+``git reset HEAD^``, even if the code is the same, the
+``include/config/kernel.release`` generated during compilation
+will be different, which will eventually lead to binary differences.
+See ``scripts/setlocalversion`` for details.
+
.. _KBUILD_BUILD_TIMESTAMP: kbuild.html#kbuild-build-timestamp
.. _KBUILD_BUILD_USER and KBUILD_BUILD_HOST: kbuild.html#kbuild-build-user-kbuild-build-host
.. _KCFLAGS: kbuild.html#kcflags
diff --git a/Documentation/kernel-hacking/hacking.rst b/Documentation/kernel-hacking/hacking.rst
index 9a1f020c8449..1717348a4404 100644
--- a/Documentation/kernel-hacking/hacking.rst
+++ b/Documentation/kernel-hacking/hacking.rst
@@ -120,7 +120,7 @@ You can tell you are in a softirq (or tasklet) using the
.. warning::
Beware that this will return a false positive if a
- :ref:`botton half lock <local_bh_disable>` is held.
+ :ref:`bottom half lock <local_bh_disable>` is held.
Some Basic Rules
================
diff --git a/Documentation/process/2.Process.rst b/Documentation/process/2.Process.rst
index e05fb1b8f8b6..6a919cffcbfd 100644
--- a/Documentation/process/2.Process.rst
+++ b/Documentation/process/2.Process.rst
@@ -126,17 +126,10 @@ than one development cycle past their initial release. So, for example, the
5.2.21 was the final stable update of the 5.2 release.
Some kernels are designated "long term" kernels; they will receive support
-for a longer period. As of this writing, the current long term kernels
-and their maintainers are:
-
- ====== ================================ =======================
- 3.16 Ben Hutchings (very long-term kernel)
- 4.4 Greg Kroah-Hartman & Sasha Levin (very long-term kernel)
- 4.9 Greg Kroah-Hartman & Sasha Levin
- 4.14 Greg Kroah-Hartman & Sasha Levin
- 4.19 Greg Kroah-Hartman & Sasha Levin
- 5.4 Greg Kroah-Hartman & Sasha Levin
- ====== ================================ =======================
+for a longer period. Please refer to the following link for the list of active
+long term kernel versions and their maintainers:
+
+ https://www.kernel.org/category/releases.html
The selection of a kernel for long-term support is purely a matter of a
maintainer having the need and the time to maintain that release. There
diff --git a/Documentation/process/howto.rst b/Documentation/process/howto.rst
index bd15c393ba3c..cb6abcb2b6d0 100644
--- a/Documentation/process/howto.rst
+++ b/Documentation/process/howto.rst
@@ -36,7 +36,7 @@ experience, the following books are good for, if anything, reference:
- "C: A Reference Manual" by Harbison and Steele [Prentice Hall]
The kernel is written using GNU C and the GNU toolchain. While it
-adheres to the ISO C89 standard, it uses a number of extensions that are
+adheres to the ISO C11 standard, it uses a number of extensions that are
not featured in the standard. The kernel is a freestanding C
environment, with no reliance on the standard C library, so some
portions of the C standard are not supported. Arbitrary long long
diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst
index c1b685a38f6b..87bd772836c0 100644
--- a/Documentation/trace/histogram.rst
+++ b/Documentation/trace/histogram.rst
@@ -39,7 +39,7 @@ Documentation written by Tom Zanussi
will use the event's kernel stacktrace as the key. The keywords
'keys' or 'key' can be used to specify keys, and the keywords
'values', 'vals', or 'val' can be used to specify values. Compound
- keys consisting of up to two fields can be specified by the 'keys'
+ keys consisting of up to three fields can be specified by the 'keys'
keyword. Hashing a compound key produces a unique entry in the
table for each unique combination of component keys, and can be
useful for providing more fine-grained summaries of event data.
diff --git a/Documentation/translations/it_IT/process/howto.rst b/Documentation/translations/it_IT/process/howto.rst
index 15c08aea1dfe..052f1b3610cb 100644
--- a/Documentation/translations/it_IT/process/howto.rst
+++ b/Documentation/translations/it_IT/process/howto.rst
@@ -44,7 +44,7 @@ altro, utili riferimenti:
- "C: A Reference Manual" di Harbison and Steele [Prentice Hall]
Il kernel è stato scritto usando GNU C e la toolchain GNU.
-Sebbene si attenga allo standard ISO C89, esso utilizza una serie di
+Sebbene si attenga allo standard ISO C11, esso utilizza una serie di
estensioni che non sono previste in questo standard. Il kernel è un
ambiente C indipendente, che non ha alcuna dipendenza dalle librerie
C standard, così alcune parti del C standard non sono supportate.
diff --git a/Documentation/translations/ja_JP/howto.rst b/Documentation/translations/ja_JP/howto.rst
index b47a682d8ded..b8eeb45a02d4 100644
--- a/Documentation/translations/ja_JP/howto.rst
+++ b/Documentation/translations/ja_JP/howto.rst
@@ -65,7 +65,7 @@ Linux カーネル開発のやり方
- 『新・詳説 C 言語 H&S リファレンス』 (サミュエル P ハービソン/ガイ L スティール共著 斉藤 信男監訳)[ソフトバンク]
カーネルは GNU C と GNU ツールチェインを使って書かれています。カーネル
-は ISO C89 仕様に準拠して書く一方で、標準には無い言語拡張を多く使って
+は ISO C11 仕様に準拠して書く一方で、標準には無い言語拡張を多く使って
います。カーネルは標準 C ライブラリに依存しない、C 言語非依存環境です。
そのため、C の標準の中で使えないものもあります。特に任意の long long
の除算や浮動小数点は使えません。カーネルがツールチェインや C 言語拡張
diff --git a/Documentation/translations/ko_KR/howto.rst b/Documentation/translations/ko_KR/howto.rst
index df53fafd1b10..969e91a95bb0 100644
--- a/Documentation/translations/ko_KR/howto.rst
+++ b/Documentation/translations/ko_KR/howto.rst
@@ -62,7 +62,7 @@ Documentation/process/howto.rst
- "Practical C Programming" by Steve Oualline [O'Reilly]
- "C: A Reference Manual" by Harbison and Steele [Prentice Hall]
-커널은 GNU C와 GNU 툴체인을 사용하여 작성되었다. 이 툴들은 ISO C89 표준을
+커널은 GNU C와 GNU 툴체인을 사용하여 작성되었다. 이 툴들은 ISO C11 표준을
따르는 반면 표준에 있지 않은 많은 확장기능도 가지고 있다. 커널은 표준 C
라이브러리와는 관계없이 freestanding C 환경이어서 C 표준의 일부는
지원되지 않는다. 임의의 long long 나누기나 floating point는 지원되지 않는다.
diff --git a/Documentation/translations/zh_CN/process/howto.rst b/Documentation/translations/zh_CN/process/howto.rst
index 5bf953146929..888978a62db3 100644
--- a/Documentation/translations/zh_CN/process/howto.rst
+++ b/Documentation/translations/zh_CN/process/howto.rst
@@ -45,7 +45,7 @@ Linux内核大部分是由C语言写成的,一些体系结构相关的代码
- "C: A Reference Manual" by Harbison and Steele [Prentice Hall]
《C语言参考手册(原书第5版)》(邱仲潘 等译)[机械工业出版社]
-Linux内核使用GNU C和GNU工具链开发。虽然它遵循ISO C89标准,但也用到了一些
+Linux内核使用GNU C和GNU工具链开发。虽然它遵循ISO C11标准,但也用到了一些
标准中没有定义的扩展。内核是自给自足的C环境,不依赖于标准C库的支持,所以
并不支持C标准中的部分定义。比如long long类型的大数除法和浮点运算就不允许
使用。有时候确实很难弄清楚内核对工具链的要求和它所使用的扩展,不幸的是目
diff --git a/Documentation/translations/zh_TW/process/howto.rst b/Documentation/translations/zh_TW/process/howto.rst
index 86b0d4c6d6f9..8fb8edcaee66 100644
--- a/Documentation/translations/zh_TW/process/howto.rst
+++ b/Documentation/translations/zh_TW/process/howto.rst
@@ -48,7 +48,7 @@ Linux內核大部分是由C語言寫成的,一些體系結構相關的代碼
- "C: A Reference Manual" by Harbison and Steele [Prentice Hall]
《C語言參考手冊(原書第5版)》(邱仲潘 等譯)[機械工業出版社]
-Linux內核使用GNU C和GNU工具鏈開發。雖然它遵循ISO C89標準,但也用到了一些
+Linux內核使用GNU C和GNU工具鏈開發。雖然它遵循ISO C11標準,但也用到了一些
標準中沒有定義的擴展。內核是自給自足的C環境,不依賴於標準C庫的支持,所以
並不支持C標準中的部分定義。比如long long類型的大數除法和浮點運算就不允許
使用。有時候確實很難弄清楚內核對工具鏈的要求和它所使用的擴展,不幸的是目
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index eee9f857a986..226b40baffb8 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -7385,8 +7385,9 @@ hibernation of the host; however the VMM needs to manually save/restore the
tags as appropriate if the VM is migrated.
When this capability is enabled all memory in memslots must be mapped as
-not-shareable (no MAP_SHARED), attempts to create a memslot with a
-MAP_SHARED mmap will result in an -EINVAL return.
+``MAP_ANONYMOUS`` or with a RAM-based file mapping (``tmpfs``, ``memfd``),
+attempts to create a memslot with an invalid mmap will result in an
+-EINVAL return.
When enabled the VMM may make use of the ``KVM_ARM_MTE_COPY_TAGS`` ioctl to
perform a bulk copy of tags to/from the guest.
@@ -7921,7 +7922,7 @@ regardless of what has actually been exposed through the CPUID leaf.
8.29 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ACQ_REL
----------------------------------------------------------
-:Architectures: x86
+:Architectures: x86, arm64
:Parameters: args[0] - size of the dirty log ring
KVM is capable of tracking dirty memory using ring buffers that are
@@ -8003,13 +8004,6 @@ flushing is done by the KVM_GET_DIRTY_LOG ioctl). To achieve that, one
needs to kick the vcpu out of KVM_RUN using a signal. The resulting
vmexit ensures that all dirty GFNs are flushed to the dirty rings.
-NOTE: the capability KVM_CAP_DIRTY_LOG_RING and the corresponding
-ioctl KVM_RESET_DIRTY_RINGS are mutual exclusive to the existing ioctls
-KVM_GET_DIRTY_LOG and KVM_CLEAR_DIRTY_LOG. After enabling
-KVM_CAP_DIRTY_LOG_RING with an acceptable dirty ring size, the virtual
-machine will switch to ring-buffer dirty page tracking and further
-KVM_GET_DIRTY_LOG or KVM_CLEAR_DIRTY_LOG ioctls will fail.
-
NOTE: KVM_CAP_DIRTY_LOG_RING_ACQ_REL is the only capability that
should be exposed by weakly ordered architecture, in order to indicate
the additional memory ordering requirements imposed on userspace when
@@ -8018,6 +8012,33 @@ Architecture with TSO-like ordering (such as x86) are allowed to
expose both KVM_CAP_DIRTY_LOG_RING and KVM_CAP_DIRTY_LOG_RING_ACQ_REL
to userspace.
+After enabling the dirty rings, the userspace needs to detect the
+capability of KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP to see whether the
+ring structures can be backed by per-slot bitmaps. With this capability
+advertised, it means the architecture can dirty guest pages without
+vcpu/ring context, so that some of the dirty information will still be
+maintained in the bitmap structure. KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP
+can't be enabled if the capability of KVM_CAP_DIRTY_LOG_RING_ACQ_REL
+hasn't been enabled, or any memslot has been existing.
+
+Note that the bitmap here is only a backup of the ring structure. The
+use of the ring and bitmap combination is only beneficial if there is
+only a very small amount of memory that is dirtied out of vcpu/ring
+context. Otherwise, the stand-alone per-slot bitmap mechanism needs to
+be considered.
+
+To collect dirty bits in the backup bitmap, userspace can use the same
+KVM_GET_DIRTY_LOG ioctl. KVM_CLEAR_DIRTY_LOG isn't needed as long as all
+the generation of the dirty bits is done in a single pass. Collecting
+the dirty bitmap should be the very last thing that the VMM does before
+considering the state as complete. VMM needs to ensure that the dirty
+state is final and avoid missing dirty pages from another ioctl ordered
+after the bitmap collection.
+
+NOTE: One example of using the backup bitmap is saving arm64 vgic/its
+tables through KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_SAVE_TABLES} command on
+KVM device "kvm-arm-vgic-its" when dirty ring is enabled.
+
8.30 KVM_CAP_XEN_HVM
--------------------
diff --git a/Documentation/virt/kvm/devices/arm-vgic-its.rst b/Documentation/virt/kvm/devices/arm-vgic-its.rst
index d257eddbae29..e053124f77c4 100644
--- a/Documentation/virt/kvm/devices/arm-vgic-its.rst
+++ b/Documentation/virt/kvm/devices/arm-vgic-its.rst
@@ -52,7 +52,10 @@ KVM_DEV_ARM_VGIC_GRP_CTRL
KVM_DEV_ARM_ITS_SAVE_TABLES
save the ITS table data into guest RAM, at the location provisioned
- by the guest in corresponding registers/table entries.
+ by the guest in corresponding registers/table entries. Should userspace
+ require a form of dirty tracking to identify which pages are modified
+ by the saving process, it should use a bitmap even if using another
+ mechanism to track the memory dirtied by the vCPUs.
The layout of the tables in guest memory defines an ABI. The entries
are laid out in little endian format as described in the last paragraph.
diff --git a/MAINTAINERS b/MAINTAINERS
index 379945f82a64..046ff06ff97f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3984,7 +3984,7 @@ M: Rafał Miłecki <rafal@milecki.pl>
R: Broadcom internal kernel review list <bcm-kernel-feedback-list@broadcom.com>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Maintained
-T: git git://github.com/broadcom/stblinux.git
+T: git https://github.com/broadcom/stblinux.git
F: Documentation/devicetree/bindings/arm/bcm/brcm,bcmbca.yaml
F: arch/arm64/boot/dts/broadcom/bcmbca/*
N: bcmbca
@@ -4009,7 +4009,7 @@ R: Broadcom internal kernel review list <bcm-kernel-feedback-list@broadcom.com>
L: linux-rpi-kernel@lists.infradead.org (moderated for non-subscribers)
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Maintained
-T: git git://github.com/broadcom/stblinux.git
+T: git https://github.com/broadcom/stblinux.git
F: Documentation/devicetree/bindings/pci/brcm,stb-pcie.yaml
F: drivers/pci/controller/pcie-brcmstb.c
F: drivers/staging/vc04_services
@@ -4023,7 +4023,7 @@ M: Ray Jui <rjui@broadcom.com>
M: Scott Branden <sbranden@broadcom.com>
R: Broadcom internal kernel review list <bcm-kernel-feedback-list@broadcom.com>
S: Maintained
-T: git git://github.com/broadcom/mach-bcm
+T: git https://github.com/broadcom/mach-bcm
F: arch/arm/mach-bcm/
N: bcm281*
N: bcm113*
@@ -4088,7 +4088,7 @@ M: Florian Fainelli <f.fainelli@gmail.com>
R: Broadcom internal kernel review list <bcm-kernel-feedback-list@broadcom.com>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Maintained
-T: git git://github.com/broadcom/stblinux.git
+T: git https://github.com/broadcom/stblinux.git
F: Documentation/devicetree/bindings/pci/brcm,stb-pcie.yaml
F: arch/arm/boot/dts/bcm7*.dts*
F: arch/arm/include/asm/hardware/cache-b15-rac.h
@@ -4121,7 +4121,7 @@ M: Florian Fainelli <f.fainelli@gmail.com>
R: Broadcom internal kernel review list <bcm-kernel-feedback-list@broadcom.com>
L: linux-mips@vger.kernel.org
S: Maintained
-T: git git://github.com/broadcom/stblinux.git
+T: git https://github.com/broadcom/stblinux.git
F: arch/mips/bmips/*
F: arch/mips/boot/dts/brcm/bcm*.dts*
F: arch/mips/include/asm/mach-bmips/*
@@ -4262,7 +4262,7 @@ M: Scott Branden <sbranden@broadcom.com>
R: Broadcom internal kernel review list <bcm-kernel-feedback-list@broadcom.com>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Maintained
-T: git git://github.com/broadcom/stblinux.git
+T: git https://github.com/broadcom/stblinux.git
F: arch/arm64/boot/dts/broadcom/northstar2/*
F: arch/arm64/boot/dts/broadcom/stingray/*
F: drivers/clk/bcm/clk-ns*
@@ -4332,7 +4332,7 @@ M: Florian Fainelli <f.fainelli@gmail.com>
R: Broadcom internal kernel review list <bcm-kernel-feedback-list@broadcom.com>
L: linux-pm@vger.kernel.org
S: Maintained
-T: git git://github.com/broadcom/stblinux.git
+T: git https://github.com/broadcom/stblinux.git
F: drivers/soc/bcm/bcm63xx/bcm-pmb.c
F: include/dt-bindings/soc/bcm-pmb.h
@@ -5041,7 +5041,7 @@ F: drivers/scsi/snic/
CISCO VIC ETHERNET NIC DRIVER
M: Christian Benvenuti <benve@cisco.com>
-M: Govindarajulu Varadarajan <_govind@gmx.com>
+M: Satish Kharat <satishkh@cisco.com>
S: Supported
F: drivers/net/ethernet/cisco/enic/
@@ -9217,7 +9217,7 @@ W: https://www.hisilicon.com
F: drivers/i2c/busses/i2c-hisi.c
HISILICON LPC BUS DRIVER
-M: john.garry@huawei.com
+M: Jay Fang <f.fangjian@huawei.com>
S: Maintained
W: http://www.hisilicon.com
F: Documentation/devicetree/bindings/arm/hisilicon/low-pin-count.yaml
@@ -9778,7 +9778,10 @@ S: Supported
F: drivers/pci/hotplug/rpaphp*
IBM Power SRIOV Virtual NIC Device Driver
-M: Dany Madden <drt@linux.ibm.com>
+M: Haren Myneni <haren@linux.ibm.com>
+M: Rick Lindsley <ricklind@linux.ibm.com>
+R: Nick Child <nnac123@linux.ibm.com>
+R: Dany Madden <danymadden@us.ibm.com>
R: Thomas Falcon <tlfalcon@linux.ibm.com>
L: netdev@vger.kernel.org
S: Supported
@@ -11248,7 +11251,7 @@ L: kvm@vger.kernel.org
L: kvm-riscv@lists.infradead.org
L: linux-riscv@lists.infradead.org
S: Maintained
-T: git git://github.com/kvm-riscv/linux.git
+T: git https://github.com/kvm-riscv/linux.git
F: arch/riscv/include/asm/kvm*
F: arch/riscv/include/uapi/asm/kvm*
F: arch/riscv/kvm/
@@ -15630,7 +15633,7 @@ F: drivers/input/serio/gscps2.c
F: drivers/input/serio/hp_sdc*
F: drivers/parisc/
F: drivers/parport/parport_gsc.*
-F: drivers/tty/serial/8250/8250_gsc.c
+F: drivers/tty/serial/8250/8250_parisc.c
F: drivers/video/console/sti*
F: drivers/video/fbdev/sti*
F: drivers/video/logo/logo_parisc*
diff --git a/Makefile b/Makefile
index 28026d1ebb9d..ac2ec990422d 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
VERSION = 6
PATCHLEVEL = 1
SUBLEVEL = 0
-EXTRAVERSION = -rc3
+EXTRAVERSION = -rc4
NAME = Hurr durr I'ma ninja sloth
# *DOCUMENTATION*
@@ -1218,7 +1218,7 @@ quiet_cmd_ar_vmlinux.a = AR $@
cmd_ar_vmlinux.a = \
rm -f $@; \
$(AR) cDPrST $@ $(KBUILD_VMLINUX_OBJS); \
- $(AR) mPiT $$($(AR) t $@ | head -n1) $@ $$($(AR) t $@ | grep -F --file=$(srctree)/scripts/head-object-list.txt)
+ $(AR) mPiT $$($(AR) t $@ | sed -n 1p) $@ $$($(AR) t $@ | grep -F -f $(srctree)/scripts/head-object-list.txt)
targets += vmlinux.a
vmlinux.a: $(KBUILD_VMLINUX_OBJS) scripts/head-object-list.txt autoksyms_recursive FORCE
diff --git a/arch/arm/boot/dts/imx6q-yapp4-crux.dts b/arch/arm/boot/dts/imx6q-yapp4-crux.dts
index 15f4824a5142..bddf3822ebf7 100644
--- a/arch/arm/boot/dts/imx6q-yapp4-crux.dts
+++ b/arch/arm/boot/dts/imx6q-yapp4-crux.dts
@@ -33,6 +33,10 @@
status = "okay";
};
+&reg_pu {
+ regulator-always-on;
+};
+
&reg_usb_h1_vbus {
status = "okay";
};
diff --git a/arch/arm/boot/dts/imx6qdl-gw5910.dtsi b/arch/arm/boot/dts/imx6qdl-gw5910.dtsi
index 68e5ab2e27e2..6bb4855d13ce 100644
--- a/arch/arm/boot/dts/imx6qdl-gw5910.dtsi
+++ b/arch/arm/boot/dts/imx6qdl-gw5910.dtsi
@@ -29,7 +29,7 @@
user-pb {
label = "user_pb";
- gpios = <&gsc_gpio 0 GPIO_ACTIVE_LOW>;
+ gpios = <&gsc_gpio 2 GPIO_ACTIVE_LOW>;
linux,code = <BTN_0>;
};
diff --git a/arch/arm/boot/dts/imx6qdl-gw5913.dtsi b/arch/arm/boot/dts/imx6qdl-gw5913.dtsi
index 8e23cec7149e..696427b487f0 100644
--- a/arch/arm/boot/dts/imx6qdl-gw5913.dtsi
+++ b/arch/arm/boot/dts/imx6qdl-gw5913.dtsi
@@ -26,7 +26,7 @@
user-pb {
label = "user_pb";
- gpios = <&gsc_gpio 0 GPIO_ACTIVE_LOW>;
+ gpios = <&gsc_gpio 2 GPIO_ACTIVE_LOW>;
linux,code = <BTN_0>;
};
diff --git a/arch/arm/boot/dts/imx6qp-yapp4-crux-plus.dts b/arch/arm/boot/dts/imx6qp-yapp4-crux-plus.dts
index cea165f2161a..afaf4a6759d4 100644
--- a/arch/arm/boot/dts/imx6qp-yapp4-crux-plus.dts
+++ b/arch/arm/boot/dts/imx6qp-yapp4-crux-plus.dts
@@ -33,6 +33,10 @@
status = "okay";
};
+&reg_pu {
+ regulator-always-on;
+};
+
&reg_usb_h1_vbus {
status = "okay";
};
diff --git a/arch/arm/boot/dts/ste-href.dtsi b/arch/arm/boot/dts/ste-href.dtsi
index fbaa0ce46427..8f1bb78fc1e4 100644
--- a/arch/arm/boot/dts/ste-href.dtsi
+++ b/arch/arm/boot/dts/ste-href.dtsi
@@ -24,6 +24,14 @@
polling-delay = <0>;
polling-delay-passive = <0>;
thermal-sensors = <&bat_therm>;
+
+ trips {
+ battery-crit-hi {
+ temperature = <70000>;
+ hysteresis = <2000>;
+ type = "critical";
+ };
+ };
};
};
diff --git a/arch/arm/boot/dts/ste-snowball.dts b/arch/arm/boot/dts/ste-snowball.dts
index 1c9094f24893..e2f0cdacba7d 100644
--- a/arch/arm/boot/dts/ste-snowball.dts
+++ b/arch/arm/boot/dts/ste-snowball.dts
@@ -28,6 +28,14 @@
polling-delay = <0>;
polling-delay-passive = <0>;
thermal-sensors = <&bat_therm>;
+
+ trips {
+ battery-crit-hi {
+ temperature = <70000>;
+ hysteresis = <2000>;
+ type = "critical";
+ };
+ };
};
};
diff --git a/arch/arm/boot/dts/ste-ux500-samsung-codina-tmo.dts b/arch/arm/boot/dts/ste-ux500-samsung-codina-tmo.dts
index d6940e0afa86..27a3ab7e25e1 100644
--- a/arch/arm/boot/dts/ste-ux500-samsung-codina-tmo.dts
+++ b/arch/arm/boot/dts/ste-ux500-samsung-codina-tmo.dts
@@ -44,6 +44,14 @@
polling-delay = <0>;
polling-delay-passive = <0>;
thermal-sensors = <&bat_therm>;
+
+ trips {
+ battery-crit-hi {
+ temperature = <70000>;
+ hysteresis = <2000>;
+ type = "critical";
+ };
+ };
};
};
diff --git a/arch/arm/boot/dts/ste-ux500-samsung-codina.dts b/arch/arm/boot/dts/ste-ux500-samsung-codina.dts
index 5f41256d7f4b..b88f0c07873d 100644
--- a/arch/arm/boot/dts/ste-ux500-samsung-codina.dts
+++ b/arch/arm/boot/dts/ste-ux500-samsung-codina.dts
@@ -57,6 +57,14 @@
polling-delay = <0>;
polling-delay-passive = <0>;
thermal-sensors = <&bat_therm>;
+
+ trips {
+ battery-crit-hi {
+ temperature = <70000>;
+ hysteresis = <2000>;
+ type = "critical";
+ };
+ };
};
};
diff --git a/arch/arm/boot/dts/ste-ux500-samsung-gavini.dts b/arch/arm/boot/dts/ste-ux500-samsung-gavini.dts
index 806da3fc33cd..7231bc745200 100644
--- a/arch/arm/boot/dts/ste-ux500-samsung-gavini.dts
+++ b/arch/arm/boot/dts/ste-ux500-samsung-gavini.dts
@@ -30,6 +30,14 @@
polling-delay = <0>;
polling-delay-passive = <0>;
thermal-sensors = <&bat_therm>;
+
+ trips {
+ battery-crit-hi {
+ temperature = <70000>;
+ hysteresis = <2000>;
+ type = "critical";
+ };
+ };
};
};
diff --git a/arch/arm/boot/dts/ste-ux500-samsung-golden.dts b/arch/arm/boot/dts/ste-ux500-samsung-golden.dts
index b0dce91aff4b..9604695edf53 100644
--- a/arch/arm/boot/dts/ste-ux500-samsung-golden.dts
+++ b/arch/arm/boot/dts/ste-ux500-samsung-golden.dts
@@ -35,6 +35,14 @@
polling-delay = <0>;
polling-delay-passive = <0>;
thermal-sensors = <&bat_therm>;
+
+ trips {
+ battery-crit-hi {
+ temperature = <70000>;
+ hysteresis = <2000>;
+ type = "critical";
+ };
+ };
};
};
diff --git a/arch/arm/boot/dts/ste-ux500-samsung-janice.dts b/arch/arm/boot/dts/ste-ux500-samsung-janice.dts
index ed5c79c3d04b..69387e8754a9 100644
--- a/arch/arm/boot/dts/ste-ux500-samsung-janice.dts
+++ b/arch/arm/boot/dts/ste-ux500-samsung-janice.dts
@@ -30,6 +30,14 @@
polling-delay = <0>;
polling-delay-passive = <0>;
thermal-sensors = <&bat_therm>;
+
+ trips {
+ battery-crit-hi {
+ temperature = <70000>;
+ hysteresis = <2000>;
+ type = "critical";
+ };
+ };
};
};
diff --git a/arch/arm/boot/dts/ste-ux500-samsung-kyle.dts b/arch/arm/boot/dts/ste-ux500-samsung-kyle.dts
index c57676faf181..167846df3104 100644
--- a/arch/arm/boot/dts/ste-ux500-samsung-kyle.dts
+++ b/arch/arm/boot/dts/ste-ux500-samsung-kyle.dts
@@ -34,6 +34,14 @@
polling-delay = <0>;
polling-delay-passive = <0>;
thermal-sensors = <&bat_therm>;
+
+ trips {
+ battery-crit-hi {
+ temperature = <70000>;
+ hysteresis = <2000>;
+ type = "critical";
+ };
+ };
};
};
diff --git a/arch/arm/boot/dts/ste-ux500-samsung-skomer.dts b/arch/arm/boot/dts/ste-ux500-samsung-skomer.dts
index 81b341a5ae45..93e5f5ed888d 100644
--- a/arch/arm/boot/dts/ste-ux500-samsung-skomer.dts
+++ b/arch/arm/boot/dts/ste-ux500-samsung-skomer.dts
@@ -30,6 +30,14 @@
polling-delay = <0>;
polling-delay-passive = <0>;
thermal-sensors = <&bat_therm>;
+
+ trips {
+ battery-crit-hi {
+ temperature = <70000>;
+ hysteresis = <2000>;
+ type = "critical";
+ };
+ };
};
};
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 505c8a1ccbe0..cd93d0738425 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1965,6 +1965,7 @@ config ARM64_MTE
depends on ARM64_PAN
select ARCH_HAS_SUBPAGE_FAULTS
select ARCH_USES_HIGH_VMA_FLAGS
+ select ARCH_USES_PG_ARCH_X
help
Memory Tagging (part of the ARMv8.5 Extensions) provides
architectural support for run-time, always-on detection of
diff --git a/arch/arm64/boot/dts/arm/juno-base.dtsi b/arch/arm64/boot/dts/arm/juno-base.dtsi
index 2f27619d8abd..8b4d280b1e7e 100644
--- a/arch/arm64/boot/dts/arm/juno-base.dtsi
+++ b/arch/arm64/boot/dts/arm/juno-base.dtsi
@@ -751,12 +751,26 @@
polling-delay = <1000>;
polling-delay-passive = <100>;
thermal-sensors = <&scpi_sensors0 0>;
+ trips {
+ pmic_crit0: trip0 {
+ temperature = <90000>;
+ hysteresis = <2000>;
+ type = "critical";
+ };
+ };
};
soc {
polling-delay = <1000>;
polling-delay-passive = <100>;
thermal-sensors = <&scpi_sensors0 3>;
+ trips {
+ soc_crit0: trip0 {
+ temperature = <80000>;
+ hysteresis = <2000>;
+ type = "critical";
+ };
+ };
};
big_cluster_thermal_zone: big-cluster {
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
index 421d879013d7..260d045dbd9a 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
@@ -779,6 +779,9 @@
little-endian;
#address-cells = <1>;
#size-cells = <0>;
+ clock-frequency = <2500000>;
+ clocks = <&clockgen QORIQ_CLK_PLATFORM_PLL
+ QORIQ_CLK_PLL_DIV(1)>;
status = "disabled";
};
@@ -788,6 +791,9 @@
little-endian;
#address-cells = <1>;
#size-cells = <0>;
+ clock-frequency = <2500000>;
+ clocks = <&clockgen QORIQ_CLK_PLATFORM_PLL
+ QORIQ_CLK_PLL_DIV(1)>;
status = "disabled";
};
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
index f1b9cc8714dc..348d9e3a9125 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
@@ -532,6 +532,9 @@
little-endian;
#address-cells = <1>;
#size-cells = <0>;
+ clock-frequency = <2500000>;
+ clocks = <&clockgen QORIQ_CLK_PLATFORM_PLL
+ QORIQ_CLK_PLL_DIV(2)>;
status = "disabled";
};
@@ -541,6 +544,9 @@
little-endian;
#address-cells = <1>;
#size-cells = <0>;
+ clock-frequency = <2500000>;
+ clocks = <&clockgen QORIQ_CLK_PLATFORM_PLL
+ QORIQ_CLK_PLL_DIV(2)>;
status = "disabled";
};
diff --git a/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi b/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi
index 6680fb2a6dc9..8c76d86cb756 100644
--- a/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi
@@ -1385,6 +1385,9 @@
#address-cells = <1>;
#size-cells = <0>;
little-endian;
+ clock-frequency = <2500000>;
+ clocks = <&clockgen QORIQ_CLK_PLATFORM_PLL
+ QORIQ_CLK_PLL_DIV(2)>;
status = "disabled";
};
@@ -1395,6 +1398,9 @@
little-endian;
#address-cells = <1>;
#size-cells = <0>;
+ clock-frequency = <2500000>;
+ clocks = <&clockgen QORIQ_CLK_PLATFORM_PLL
+ QORIQ_CLK_PLL_DIV(2)>;
status = "disabled";
};
diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi
index 82a1c4488378..10370d1a6c6d 100644
--- a/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi
@@ -38,9 +38,9 @@ conn_subsys: bus@5b000000 {
interrupts = <GIC_SPI 232 IRQ_TYPE_LEVEL_HIGH>;
reg = <0x5b010000 0x10000>;
clocks = <&sdhc0_lpcg IMX_LPCG_CLK_4>,
- <&sdhc0_lpcg IMX_LPCG_CLK_5>,
- <&sdhc0_lpcg IMX_LPCG_CLK_0>;
- clock-names = "ipg", "per", "ahb";
+ <&sdhc0_lpcg IMX_LPCG_CLK_0>,
+ <&sdhc0_lpcg IMX_LPCG_CLK_5>;
+ clock-names = "ipg", "ahb", "per";
power-domains = <&pd IMX_SC_R_SDHC_0>;
status = "disabled";
};
@@ -49,9 +49,9 @@ conn_subsys: bus@5b000000 {
interrupts = <GIC_SPI 233 IRQ_TYPE_LEVEL_HIGH>;
reg = <0x5b020000 0x10000>;
clocks = <&sdhc1_lpcg IMX_LPCG_CLK_4>,
- <&sdhc1_lpcg IMX_LPCG_CLK_5>,
- <&sdhc1_lpcg IMX_LPCG_CLK_0>;
- clock-names = "ipg", "per", "ahb";
+ <&sdhc1_lpcg IMX_LPCG_CLK_0>,
+ <&sdhc1_lpcg IMX_LPCG_CLK_5>;
+ clock-names = "ipg", "ahb", "per";
power-domains = <&pd IMX_SC_R_SDHC_1>;
fsl,tuning-start-tap = <20>;
fsl,tuning-step = <2>;
@@ -62,9 +62,9 @@ conn_subsys: bus@5b000000 {
interrupts = <GIC_SPI 234 IRQ_TYPE_LEVEL_HIGH>;
reg = <0x5b030000 0x10000>;
clocks = <&sdhc2_lpcg IMX_LPCG_CLK_4>,
- <&sdhc2_lpcg IMX_LPCG_CLK_5>,
- <&sdhc2_lpcg IMX_LPCG_CLK_0>;
- clock-names = "ipg", "per", "ahb";
+ <&sdhc2_lpcg IMX_LPCG_CLK_0>,
+ <&sdhc2_lpcg IMX_LPCG_CLK_5>;
+ clock-names = "ipg", "ahb", "per";
power-domains = <&pd IMX_SC_R_SDHC_2>;
status = "disabled";
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mm-mx8menlo.dts b/arch/arm64/boot/dts/freescale/imx8mm-mx8menlo.dts
index 32f6f2f50c10..43e89859c044 100644
--- a/arch/arm64/boot/dts/freescale/imx8mm-mx8menlo.dts
+++ b/arch/arm64/boot/dts/freescale/imx8mm-mx8menlo.dts
@@ -250,21 +250,21 @@
/* SODIMM 96 */
MX8MM_IOMUXC_SAI1_RXD2_GPIO4_IO4 0x1c4
/* CPLD_D[7] */
- MX8MM_IOMUXC_SAI1_RXD3_GPIO4_IO5 0x1c4
+ MX8MM_IOMUXC_SAI1_RXD3_GPIO4_IO5 0x184
/* CPLD_D[6] */
- MX8MM_IOMUXC_SAI1_RXFS_GPIO4_IO0 0x1c4
+ MX8MM_IOMUXC_SAI1_RXFS_GPIO4_IO0 0x184
/* CPLD_D[5] */
- MX8MM_IOMUXC_SAI1_TXC_GPIO4_IO11 0x1c4
+ MX8MM_IOMUXC_SAI1_TXC_GPIO4_IO11 0x184
/* CPLD_D[4] */
- MX8MM_IOMUXC_SAI1_TXD0_GPIO4_IO12 0x1c4
+ MX8MM_IOMUXC_SAI1_TXD0_GPIO4_IO12 0x184
/* CPLD_D[3] */
- MX8MM_IOMUXC_SAI1_TXD1_GPIO4_IO13 0x1c4
+ MX8MM_IOMUXC_SAI1_TXD1_GPIO4_IO13 0x184
/* CPLD_D[2] */
- MX8MM_IOMUXC_SAI1_TXD2_GPIO4_IO14 0x1c4
+ MX8MM_IOMUXC_SAI1_TXD2_GPIO4_IO14 0x184
/* CPLD_D[1] */
- MX8MM_IOMUXC_SAI1_TXD3_GPIO4_IO15 0x1c4
+ MX8MM_IOMUXC_SAI1_TXD3_GPIO4_IO15 0x184
/* CPLD_D[0] */
- MX8MM_IOMUXC_SAI1_TXD4_GPIO4_IO16 0x1c4
+ MX8MM_IOMUXC_SAI1_TXD4_GPIO4_IO16 0x184
/* KBD_intK */
MX8MM_IOMUXC_SAI2_MCLK_GPIO4_IO27 0x1c4
/* DISP_reset */
diff --git a/arch/arm64/boot/dts/freescale/imx8mm.dtsi b/arch/arm64/boot/dts/freescale/imx8mm.dtsi
index afb90f59c83c..dabd94dc30c4 100644
--- a/arch/arm64/boot/dts/freescale/imx8mm.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mm.dtsi
@@ -276,6 +276,7 @@
assigned-clocks = <&clk IMX8MM_CLK_USB_PHY_REF>;
assigned-clock-parents = <&clk IMX8MM_SYS_PLL1_100M>;
clock-names = "main_clk";
+ power-domains = <&pgc_otg1>;
};
usbphynop2: usbphynop2 {
@@ -285,6 +286,7 @@
assigned-clocks = <&clk IMX8MM_CLK_USB_PHY_REF>;
assigned-clock-parents = <&clk IMX8MM_SYS_PLL1_100M>;
clock-names = "main_clk";
+ power-domains = <&pgc_otg2>;
};
soc: soc@0 {
@@ -674,13 +676,11 @@
pgc_otg1: power-domain@2 {
#power-domain-cells = <0>;
reg = <IMX8MM_POWER_DOMAIN_OTG1>;
- power-domains = <&pgc_hsiomix>;
};
pgc_otg2: power-domain@3 {
#power-domain-cells = <0>;
reg = <IMX8MM_POWER_DOMAIN_OTG2>;
- power-domains = <&pgc_hsiomix>;
};
pgc_gpumix: power-domain@4 {
@@ -1186,7 +1186,7 @@
assigned-clock-parents = <&clk IMX8MM_SYS_PLL2_500M>;
phys = <&usbphynop1>;
fsl,usbmisc = <&usbmisc1 0>;
- power-domains = <&pgc_otg1>;
+ power-domains = <&pgc_hsiomix>;
status = "disabled";
};
@@ -1206,7 +1206,7 @@
assigned-clock-parents = <&clk IMX8MM_SYS_PLL2_500M>;
phys = <&usbphynop2>;
fsl,usbmisc = <&usbmisc2 0>;
- power-domains = <&pgc_otg2>;
+ power-domains = <&pgc_hsiomix>;
status = "disabled";
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mn.dtsi b/arch/arm64/boot/dts/freescale/imx8mn.dtsi
index cb2836bfbd95..ad0b99adf691 100644
--- a/arch/arm64/boot/dts/freescale/imx8mn.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mn.dtsi
@@ -662,7 +662,6 @@
pgc_otg1: power-domain@1 {
#power-domain-cells = <0>;
reg = <IMX8MN_POWER_DOMAIN_OTG1>;
- power-domains = <&pgc_hsiomix>;
};
pgc_gpumix: power-domain@2 {
@@ -1076,7 +1075,7 @@
assigned-clock-parents = <&clk IMX8MN_SYS_PLL2_500M>;
phys = <&usbphynop1>;
fsl,usbmisc = <&usbmisc1 0>;
- power-domains = <&pgc_otg1>;
+ power-domains = <&pgc_hsiomix>;
status = "disabled";
};
@@ -1175,5 +1174,6 @@
assigned-clocks = <&clk IMX8MN_CLK_USB_PHY_REF>;
assigned-clock-parents = <&clk IMX8MN_SYS_PLL1_100M>;
clock-names = "main_clk";
+ power-domains = <&pgc_otg1>;
};
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi
index 7b712d1888ea..5dcd1de586b5 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi
@@ -354,16 +354,6 @@
"SODIMM_82",
"SODIMM_70",
"SODIMM_72";
-
- ctrl-sleep-moci-hog {
- gpio-hog;
- /* Verdin CTRL_SLEEP_MOCI# (SODIMM 256) */
- gpios = <29 GPIO_ACTIVE_HIGH>;
- line-name = "CTRL_SLEEP_MOCI#";
- output-high;
- pinctrl-names = "default";
- pinctrl-0 = <&pinctrl_ctrl_sleep_moci>;
- };
};
&gpio3 {
@@ -432,6 +422,16 @@
"SODIMM_256",
"SODIMM_48",
"SODIMM_44";
+
+ ctrl-sleep-moci-hog {
+ gpio-hog;
+ /* Verdin CTRL_SLEEP_MOCI# (SODIMM 256) */
+ gpios = <29 GPIO_ACTIVE_HIGH>;
+ line-name = "CTRL_SLEEP_MOCI#";
+ output-high;
+ pinctrl-names = "default";
+ pinctrl-0 = <&pinctrl_ctrl_sleep_moci>;
+ };
};
/* On-module I2C */
diff --git a/arch/arm64/boot/dts/freescale/imx93.dtsi b/arch/arm64/boot/dts/freescale/imx93.dtsi
index 3a5713bb4880..0247866fc86b 100644
--- a/arch/arm64/boot/dts/freescale/imx93.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx93.dtsi
@@ -451,7 +451,7 @@
clocks = <&clk IMX93_CLK_GPIO2_GATE>,
<&clk IMX93_CLK_GPIO2_GATE>;
clock-names = "gpio", "port";
- gpio-ranges = <&iomuxc 0 32 32>;
+ gpio-ranges = <&iomuxc 0 4 30>;
};
gpio3: gpio@43820080 {
@@ -465,7 +465,8 @@
clocks = <&clk IMX93_CLK_GPIO3_GATE>,
<&clk IMX93_CLK_GPIO3_GATE>;
clock-names = "gpio", "port";
- gpio-ranges = <&iomuxc 0 64 32>;
+ gpio-ranges = <&iomuxc 0 84 8>, <&iomuxc 8 66 18>,
+ <&iomuxc 26 34 2>, <&iomuxc 28 0 4>;
};
gpio4: gpio@43830080 {
@@ -479,7 +480,7 @@
clocks = <&clk IMX93_CLK_GPIO4_GATE>,
<&clk IMX93_CLK_GPIO4_GATE>;
clock-names = "gpio", "port";
- gpio-ranges = <&iomuxc 0 96 32>;
+ gpio-ranges = <&iomuxc 0 38 28>, <&iomuxc 28 36 2>;
};
gpio1: gpio@47400080 {
@@ -493,7 +494,7 @@
clocks = <&clk IMX93_CLK_GPIO1_GATE>,
<&clk IMX93_CLK_GPIO1_GATE>;
clock-names = "gpio", "port";
- gpio-ranges = <&iomuxc 0 0 32>;
+ gpio-ranges = <&iomuxc 0 92 16>;
};
s4muap: mailbox@47520000 {
@@ -501,7 +502,7 @@
reg = <0x47520000 0x10000>;
interrupts = <GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 30 IRQ_TYPE_LEVEL_HIGH>;
- interrupt-names = "txirq", "rxirq";
+ interrupt-names = "tx", "rx";
#mbox-cells = <2>;
};
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index 439e2bc5d5d8..d6cf535d8352 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -14,8 +14,16 @@
#ifdef CONFIG_EFI
extern void efi_init(void);
+
+bool efi_runtime_fixup_exception(struct pt_regs *regs, const char *msg);
#else
#define efi_init()
+
+static inline
+bool efi_runtime_fixup_exception(struct pt_regs *regs, const char *msg)
+{
+ return false;
+}
#endif
int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md);
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 8aa8492dafc0..0df3fc3a0173 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -135,7 +135,7 @@
* 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are
* not known to exist and will break with this configuration.
*
- * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2().
+ * The VTCR_EL2 is configured per VM and is initialised in kvm_init_stage2_mmu.
*
* Note that when using 4K pages, we concatenate two first level page tables
* together. With 16K pages, we concatenate 16 first level page tables.
@@ -340,9 +340,13 @@
* We have
* PAR [PA_Shift - 1 : 12] = PA [PA_Shift - 1 : 12]
* HPFAR [PA_Shift - 9 : 4] = FIPA [PA_Shift - 1 : 12]
+ *
+ * Always assume 52 bit PA since at this point, we don't know how many PA bits
+ * the page table has been set up for. This should be safe since unused address
+ * bits in PAR are res0.
*/
#define PAR_TO_HPFAR(par) \
- (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8)
+ (((par) & GENMASK_ULL(52 - 1, 12)) >> 8)
#define ECN(x) { ESR_ELx_EC_##x, #x }
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 53035763e48e..43c3bc0f9544 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -76,6 +76,9 @@ enum __kvm_host_smccc_func {
__KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs,
__KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs,
__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps,
+ __KVM_HOST_SMCCC_FUNC___pkvm_init_vm,
+ __KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu,
+ __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm,
};
#define DECLARE_KVM_VHE_SYM(sym) extern char sym[]
@@ -106,7 +109,7 @@ enum __kvm_host_smccc_func {
#define per_cpu_ptr_nvhe_sym(sym, cpu) \
({ \
unsigned long base, off; \
- base = kvm_arm_hyp_percpu_base[cpu]; \
+ base = kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]; \
off = (unsigned long)&CHOOSE_NVHE_SYM(sym) - \
(unsigned long)&CHOOSE_NVHE_SYM(__per_cpu_start); \
base ? (typeof(CHOOSE_NVHE_SYM(sym))*)(base + off) : NULL; \
@@ -211,7 +214,7 @@ DECLARE_KVM_HYP_SYM(__kvm_hyp_vector);
#define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init)
#define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector)
-extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
+extern unsigned long kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[];
DECLARE_KVM_NVHE_SYM(__per_cpu_start);
DECLARE_KVM_NVHE_SYM(__per_cpu_end);
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 45e2136322ba..001c8abe87fc 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -73,6 +73,63 @@ u32 __attribute_const__ kvm_target_cpu(void);
int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu);
+struct kvm_hyp_memcache {
+ phys_addr_t head;
+ unsigned long nr_pages;
+};
+
+static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc,
+ phys_addr_t *p,
+ phys_addr_t (*to_pa)(void *virt))
+{
+ *p = mc->head;
+ mc->head = to_pa(p);
+ mc->nr_pages++;
+}
+
+static inline void *pop_hyp_memcache(struct kvm_hyp_memcache *mc,
+ void *(*to_va)(phys_addr_t phys))
+{
+ phys_addr_t *p = to_va(mc->head);
+
+ if (!mc->nr_pages)
+ return NULL;
+
+ mc->head = *p;
+ mc->nr_pages--;
+
+ return p;
+}
+
+static inline int __topup_hyp_memcache(struct kvm_hyp_memcache *mc,
+ unsigned long min_pages,
+ void *(*alloc_fn)(void *arg),
+ phys_addr_t (*to_pa)(void *virt),
+ void *arg)
+{
+ while (mc->nr_pages < min_pages) {
+ phys_addr_t *p = alloc_fn(arg);
+
+ if (!p)
+ return -ENOMEM;
+ push_hyp_memcache(mc, p, to_pa);
+ }
+
+ return 0;
+}
+
+static inline void __free_hyp_memcache(struct kvm_hyp_memcache *mc,
+ void (*free_fn)(void *virt, void *arg),
+ void *(*to_va)(phys_addr_t phys),
+ void *arg)
+{
+ while (mc->nr_pages)
+ free_fn(pop_hyp_memcache(mc, to_va), arg);
+}
+
+void free_hyp_memcache(struct kvm_hyp_memcache *mc);
+int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages);
+
struct kvm_vmid {
atomic64_t id;
};
@@ -115,6 +172,13 @@ struct kvm_smccc_features {
unsigned long vendor_hyp_bmap;
};
+typedef unsigned int pkvm_handle_t;
+
+struct kvm_protected_vm {
+ pkvm_handle_t handle;
+ struct kvm_hyp_memcache teardown_mc;
+};
+
struct kvm_arch {
struct kvm_s2_mmu mmu;
@@ -163,9 +227,19 @@ struct kvm_arch {
u8 pfr0_csv2;
u8 pfr0_csv3;
+ struct {
+ u8 imp:4;
+ u8 unimp:4;
+ } dfr0_pmuver;
/* Hypercall features firmware registers' descriptor */
struct kvm_smccc_features smccc_feat;
+
+ /*
+ * For an untrusted host VM, 'pkvm.handle' is used to lookup
+ * the associated pKVM instance in the hypervisor.
+ */
+ struct kvm_protected_vm pkvm;
};
struct kvm_vcpu_fault_info {
@@ -915,8 +989,6 @@ int kvm_set_ipa_limit(void);
#define __KVM_HAVE_ARCH_VM_ALLOC
struct kvm *kvm_arch_alloc_vm(void);
-int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
-
static inline bool kvm_vm_is_protected(struct kvm *kvm)
{
return false;
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index aa7fa2a08f06..6797eafe7890 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -123,4 +123,7 @@ extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val);
extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val);
extern u64 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val);
+extern unsigned long kvm_nvhe_sym(__icache_flags);
+extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits);
+
#endif /* __ARM64_KVM_HYP_H__ */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 7784081088e7..e4a7e6369499 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -166,7 +166,7 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
void free_hyp_pgds(void);
void stage2_unmap_vm(struct kvm *kvm);
-int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu);
+int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
phys_addr_t pa, unsigned long size, bool writable);
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 3252eb50ecfe..63f81b27a4e3 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -42,6 +42,8 @@ typedef u64 kvm_pte_t;
#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT)
#define KVM_PTE_ADDR_51_48 GENMASK(15, 12)
+#define KVM_PHYS_INVALID (-1ULL)
+
static inline bool kvm_pte_valid(kvm_pte_t pte)
{
return pte & KVM_PTE_VALID;
@@ -57,6 +59,18 @@ static inline u64 kvm_pte_to_phys(kvm_pte_t pte)
return pa;
}
+static inline kvm_pte_t kvm_phys_to_pte(u64 pa)
+{
+ kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
+
+ if (PAGE_SHIFT == 16) {
+ pa &= GENMASK(51, 48);
+ pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
+ }
+
+ return pte;
+}
+
static inline u64 kvm_granule_shift(u32 level)
{
/* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
@@ -85,6 +99,8 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
* allocation is physically contiguous.
* @free_pages_exact: Free an exact number of memory pages previously
* allocated by zalloc_pages_exact.
+ * @free_removed_table: Free a removed paging structure by unlinking and
+ * dropping references.
* @get_page: Increment the refcount on a page.
* @put_page: Decrement the refcount on a page. When the
* refcount reaches 0 the page is automatically
@@ -103,6 +119,7 @@ struct kvm_pgtable_mm_ops {
void* (*zalloc_page)(void *arg);
void* (*zalloc_pages_exact)(size_t size);
void (*free_pages_exact)(void *addr, size_t size);
+ void (*free_removed_table)(void *addr, u32 level);
void (*get_page)(void *addr);
void (*put_page)(void *addr);
int (*page_count)(void *addr);
@@ -162,29 +179,6 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
enum kvm_pgtable_prot prot);
/**
- * struct kvm_pgtable - KVM page-table.
- * @ia_bits: Maximum input address size, in bits.
- * @start_level: Level at which the page-table walk starts.
- * @pgd: Pointer to the first top-level entry of the page-table.
- * @mm_ops: Memory management callbacks.
- * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
- * @flags: Stage-2 page-table flags.
- * @force_pte_cb: Function that returns true if page level mappings must
- * be used instead of block mappings.
- */
-struct kvm_pgtable {
- u32 ia_bits;
- u32 start_level;
- kvm_pte_t *pgd;
- struct kvm_pgtable_mm_ops *mm_ops;
-
- /* Stage-2 only */
- struct kvm_s2_mmu *mmu;
- enum kvm_pgtable_stage2_flags flags;
- kvm_pgtable_force_pte_cb_t force_pte_cb;
-};
-
-/**
* enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
* @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid
* entries.
@@ -192,17 +186,34 @@ struct kvm_pgtable {
* children.
* @KVM_PGTABLE_WALK_TABLE_POST: Visit table entries after their
* children.
+ * @KVM_PGTABLE_WALK_SHARED: Indicates the page-tables may be shared
+ * with other software walkers.
*/
enum kvm_pgtable_walk_flags {
KVM_PGTABLE_WALK_LEAF = BIT(0),
KVM_PGTABLE_WALK_TABLE_PRE = BIT(1),
KVM_PGTABLE_WALK_TABLE_POST = BIT(2),
+ KVM_PGTABLE_WALK_SHARED = BIT(3),
+};
+
+struct kvm_pgtable_visit_ctx {
+ kvm_pte_t *ptep;
+ kvm_pte_t old;
+ void *arg;
+ struct kvm_pgtable_mm_ops *mm_ops;
+ u64 addr;
+ u64 end;
+ u32 level;
+ enum kvm_pgtable_walk_flags flags;
};
-typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level,
- kvm_pte_t *ptep,
- enum kvm_pgtable_walk_flags flag,
- void * const arg);
+typedef int (*kvm_pgtable_visitor_fn_t)(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit);
+
+static inline bool kvm_pgtable_walk_shared(const struct kvm_pgtable_visit_ctx *ctx)
+{
+ return ctx->flags & KVM_PGTABLE_WALK_SHARED;
+}
/**
* struct kvm_pgtable_walker - Hook into a page-table walk.
@@ -217,6 +228,94 @@ struct kvm_pgtable_walker {
const enum kvm_pgtable_walk_flags flags;
};
+/*
+ * RCU cannot be used in a non-kernel context such as the hyp. As such, page
+ * table walkers used in hyp do not call into RCU and instead use other
+ * synchronization mechanisms (such as a spinlock).
+ */
+#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__)
+
+typedef kvm_pte_t *kvm_pteref_t;
+
+static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walker,
+ kvm_pteref_t pteref)
+{
+ return pteref;
+}
+
+static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
+{
+ /*
+ * Due to the lack of RCU (or a similar protection scheme), only
+ * non-shared table walkers are allowed in the hypervisor.
+ */
+ if (walker->flags & KVM_PGTABLE_WALK_SHARED)
+ return -EPERM;
+
+ return 0;
+}
+
+static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker) {}
+
+static inline bool kvm_pgtable_walk_lock_held(void)
+{
+ return true;
+}
+
+#else
+
+typedef kvm_pte_t __rcu *kvm_pteref_t;
+
+static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walker,
+ kvm_pteref_t pteref)
+{
+ return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED));
+}
+
+static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
+{
+ if (walker->flags & KVM_PGTABLE_WALK_SHARED)
+ rcu_read_lock();
+
+ return 0;
+}
+
+static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker)
+{
+ if (walker->flags & KVM_PGTABLE_WALK_SHARED)
+ rcu_read_unlock();
+}
+
+static inline bool kvm_pgtable_walk_lock_held(void)
+{
+ return rcu_read_lock_held();
+}
+
+#endif
+
+/**
+ * struct kvm_pgtable - KVM page-table.
+ * @ia_bits: Maximum input address size, in bits.
+ * @start_level: Level at which the page-table walk starts.
+ * @pgd: Pointer to the first top-level entry of the page-table.
+ * @mm_ops: Memory management callbacks.
+ * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
+ * @flags: Stage-2 page-table flags.
+ * @force_pte_cb: Function that returns true if page level mappings must
+ * be used instead of block mappings.
+ */
+struct kvm_pgtable {
+ u32 ia_bits;
+ u32 start_level;
+ kvm_pteref_t pgd;
+ struct kvm_pgtable_mm_ops *mm_ops;
+
+ /* Stage-2 only */
+ struct kvm_s2_mmu *mmu;
+ enum kvm_pgtable_stage2_flags flags;
+ kvm_pgtable_force_pte_cb_t force_pte_cb;
+};
+
/**
* kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
* @pgt: Uninitialised page-table structure to initialise.
@@ -297,6 +396,14 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift);
/**
+ * kvm_pgtable_stage2_pgd_size() - Helper to compute size of a stage-2 PGD
+ * @vtcr: Content of the VTCR register.
+ *
+ * Return: the size (in bytes) of the stage-2 PGD
+ */
+size_t kvm_pgtable_stage2_pgd_size(u64 vtcr);
+
+/**
* __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
* @pgt: Uninitialised page-table structure to initialise.
* @mmu: S2 MMU context for this S2 translation
@@ -325,6 +432,17 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
/**
+ * kvm_pgtable_stage2_free_removed() - Free a removed stage-2 paging structure.
+ * @mm_ops: Memory management callbacks.
+ * @pgtable: Unlinked stage-2 paging structure to be freed.
+ * @level: Level of the stage-2 paging structure to be freed.
+ *
+ * The page-table is assumed to be unreachable by any hardware walkers prior to
+ * freeing and therefore no TLB invalidation is performed.
+ */
+void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level);
+
+/**
* kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Intermediate physical address at which to place the mapping.
@@ -333,6 +451,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
* @prot: Permissions and attributes for the mapping.
* @mc: Cache of pre-allocated and zeroed memory from which to allocate
* page-table pages.
+ * @flags: Flags to control the page-table walk (ex. a shared walk)
*
* The offset of @addr within a page is ignored, @size is rounded-up to
* the next page boundary and @phys is rounded-down to the previous page
@@ -354,7 +473,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
*/
int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
u64 phys, enum kvm_pgtable_prot prot,
- void *mc);
+ void *mc, enum kvm_pgtable_walk_flags flags);
/**
* kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h
index 9f4ad2a8df59..01129b0d4c68 100644
--- a/arch/arm64/include/asm/kvm_pkvm.h
+++ b/arch/arm64/include/asm/kvm_pkvm.h
@@ -9,11 +9,49 @@
#include <linux/memblock.h>
#include <asm/kvm_pgtable.h>
+/* Maximum number of VMs that can co-exist under pKVM. */
+#define KVM_MAX_PVMS 255
+
#define HYP_MEMBLOCK_REGIONS 128
+int pkvm_init_host_vm(struct kvm *kvm);
+int pkvm_create_hyp_vm(struct kvm *kvm);
+void pkvm_destroy_hyp_vm(struct kvm *kvm);
+
extern struct memblock_region kvm_nvhe_sym(hyp_memory)[];
extern unsigned int kvm_nvhe_sym(hyp_memblock_nr);
+static inline unsigned long
+hyp_vmemmap_memblock_size(struct memblock_region *reg, size_t vmemmap_entry_size)
+{
+ unsigned long nr_pages = reg->size >> PAGE_SHIFT;
+ unsigned long start, end;
+
+ start = (reg->base >> PAGE_SHIFT) * vmemmap_entry_size;
+ end = start + nr_pages * vmemmap_entry_size;
+ start = ALIGN_DOWN(start, PAGE_SIZE);
+ end = ALIGN(end, PAGE_SIZE);
+
+ return end - start;
+}
+
+static inline unsigned long hyp_vmemmap_pages(size_t vmemmap_entry_size)
+{
+ unsigned long res = 0, i;
+
+ for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) {
+ res += hyp_vmemmap_memblock_size(&kvm_nvhe_sym(hyp_memory)[i],
+ vmemmap_entry_size);
+ }
+
+ return res >> PAGE_SHIFT;
+}
+
+static inline unsigned long hyp_vm_table_pages(void)
+{
+ return PAGE_ALIGN(KVM_MAX_PVMS * sizeof(void *)) >> PAGE_SHIFT;
+}
+
static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages)
{
unsigned long total = 0, i;
diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index 760c62f8e22f..20dd06d70af5 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -25,7 +25,7 @@ unsigned long mte_copy_tags_to_user(void __user *to, void *from,
unsigned long n);
int mte_save_tags(struct page *page);
void mte_save_page_tags(const void *page_addr, void *tag_storage);
-bool mte_restore_tags(swp_entry_t entry, struct page *page);
+void mte_restore_tags(swp_entry_t entry, struct page *page);
void mte_restore_page_tags(void *page_addr, const void *tag_storage);
void mte_invalidate_tags(int type, pgoff_t offset);
void mte_invalidate_tags_area(int type);
@@ -36,6 +36,58 @@ void mte_free_tag_storage(char *storage);
/* track which pages have valid allocation tags */
#define PG_mte_tagged PG_arch_2
+/* simple lock to avoid multiple threads tagging the same page */
+#define PG_mte_lock PG_arch_3
+
+static inline void set_page_mte_tagged(struct page *page)
+{
+ /*
+ * Ensure that the tags written prior to this function are visible
+ * before the page flags update.
+ */
+ smp_wmb();
+ set_bit(PG_mte_tagged, &page->flags);
+}
+
+static inline bool page_mte_tagged(struct page *page)
+{
+ bool ret = test_bit(PG_mte_tagged, &page->flags);
+
+ /*
+ * If the page is tagged, ensure ordering with a likely subsequent
+ * read of the tags.
+ */
+ if (ret)
+ smp_rmb();
+ return ret;
+}
+
+/*
+ * Lock the page for tagging and return 'true' if the page can be tagged,
+ * 'false' if already tagged. PG_mte_tagged is never cleared and therefore the
+ * locking only happens once for page initialisation.
+ *
+ * The page MTE lock state:
+ *
+ * Locked: PG_mte_lock && !PG_mte_tagged
+ * Unlocked: !PG_mte_lock || PG_mte_tagged
+ *
+ * Acquire semantics only if the page is tagged (returning 'false').
+ */
+static inline bool try_page_mte_tagging(struct page *page)
+{
+ if (!test_and_set_bit(PG_mte_lock, &page->flags))
+ return true;
+
+ /*
+ * The tags are either being initialised or may have been initialised
+ * already. Check if the PG_mte_tagged flag has been set or wait
+ * otherwise.
+ */
+ smp_cond_load_acquire(&page->flags, VAL & (1UL << PG_mte_tagged));
+
+ return false;
+}
void mte_zero_clear_page_tags(void *addr);
void mte_sync_tags(pte_t old_pte, pte_t pte);
@@ -56,6 +108,17 @@ size_t mte_probe_user_range(const char __user *uaddr, size_t size);
/* unused if !CONFIG_ARM64_MTE, silence the compiler */
#define PG_mte_tagged 0
+static inline void set_page_mte_tagged(struct page *page)
+{
+}
+static inline bool page_mte_tagged(struct page *page)
+{
+ return false;
+}
+static inline bool try_page_mte_tagging(struct page *page)
+{
+ return false;
+}
static inline void mte_zero_clear_page_tags(void *addr)
{
}
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 71a1af42f0e8..8735ac1a1e32 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1049,8 +1049,8 @@ static inline void arch_swap_invalidate_area(int type)
#define __HAVE_ARCH_SWAP_RESTORE
static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
{
- if (system_supports_mte() && mte_restore_tags(entry, &folio->page))
- set_bit(PG_mte_tagged, &folio->flags);
+ if (system_supports_mte())
+ mte_restore_tags(entry, &folio->page);
}
#endif /* CONFIG_ARM64_MTE */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 7d301700d1a9..84f59ce1dc6d 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -698,6 +698,8 @@
#define ID_DFR0_PERFMON_8_1 0x4
#define ID_DFR0_PERFMON_8_4 0x5
#define ID_DFR0_PERFMON_8_5 0x6
+#define ID_DFR0_PERFMON_8_7 0x7
+#define ID_DFR0_PERFMON_IMP_DEF 0xf
#define ID_ISAR4_SWP_FRAC_SHIFT 28
#define ID_ISAR4_PSR_M_SHIFT 24
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 316917b98707..a7a857f1784d 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -43,6 +43,7 @@
#define __KVM_HAVE_VCPU_EVENTS
#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+#define KVM_DIRTY_LOG_PAGE_OFFSET 64
#define KVM_REG_SIZE(id) \
(1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 6062454a9067..79d153d34206 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -428,6 +428,30 @@ static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = {
ARM64_FTR_END,
};
+static const struct arm64_ftr_bits ftr_mvfr0[] = {
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPROUND_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPSHVEC_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPSQRT_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPDIVIDE_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPTRAP_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPDP_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPSP_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_SIMD_SHIFT, 4, 0),
+ ARM64_FTR_END,
+};
+
+static const struct arm64_ftr_bits ftr_mvfr1[] = {
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_SIMDFMAC_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_FPHP_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_SIMDHP_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_SIMDSP_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_SIMDINT_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_SIMDLS_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_FPDNAN_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_FPFTZ_SHIFT, 4, 0),
+ ARM64_FTR_END,
+};
+
static const struct arm64_ftr_bits ftr_mvfr2[] = {
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_FPMISC_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_SIMDMISC_SHIFT, 4, 0),
@@ -458,10 +482,10 @@ static const struct arm64_ftr_bits ftr_id_isar0[] = {
static const struct arm64_ftr_bits ftr_id_isar5[] = {
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_RDM_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_CRC32_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SHA2_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SHA1_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_AES_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_CRC32_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SHA2_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SHA1_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_AES_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SEVL_SHIFT, 4, 0),
ARM64_FTR_END,
};
@@ -574,7 +598,7 @@ static const struct arm64_ftr_bits ftr_smcr[] = {
* Common ftr bits for a 32bit register with all hidden, strict
* attributes, with 4bit feature fields and a default safe value of
* 0. Covers the following 32bit registers:
- * id_isar[1-4], id_mmfr[1-3], id_pfr1, mvfr[0-1]
+ * id_isar[1-3], id_mmfr[1-3]
*/
static const struct arm64_ftr_bits ftr_generic_32bits[] = {
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0),
@@ -645,8 +669,8 @@ static const struct __ftr_reg_entry {
ARM64_FTR_REG(SYS_ID_ISAR6_EL1, ftr_id_isar6),
/* Op1 = 0, CRn = 0, CRm = 3 */
- ARM64_FTR_REG(SYS_MVFR0_EL1, ftr_generic_32bits),
- ARM64_FTR_REG(SYS_MVFR1_EL1, ftr_generic_32bits),
+ ARM64_FTR_REG(SYS_MVFR0_EL1, ftr_mvfr0),
+ ARM64_FTR_REG(SYS_MVFR1_EL1, ftr_mvfr1),
ARM64_FTR_REG(SYS_MVFR2_EL1, ftr_mvfr2),
ARM64_FTR_REG(SYS_ID_PFR2_EL1, ftr_id_pfr2),
ARM64_FTR_REG(SYS_ID_DFR1_EL1, ftr_id_dfr1),
@@ -2050,8 +2074,10 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
* Clear the tags in the zero page. This needs to be done via the
* linear map which has the Tagged attribute.
*/
- if (!test_and_set_bit(PG_mte_tagged, &ZERO_PAGE(0)->flags))
+ if (try_page_mte_tagging(ZERO_PAGE(0))) {
mte_clear_page_tags(lm_alias(empty_zero_page));
+ set_page_mte_tagged(ZERO_PAGE(0));
+ }
kasan_init_hw_tags_cpu();
}
@@ -3339,7 +3365,7 @@ static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *c
/*
* We emulate only the following system register space.
- * Op0 = 0x3, CRn = 0x0, Op1 = 0x0, CRm = [0, 4 - 7]
+ * Op0 = 0x3, CRn = 0x0, Op1 = 0x0, CRm = [0, 2 - 7]
* See Table C5-6 System instruction encodings for System register accesses,
* ARMv8 ARM(ARM DDI 0487A.f) for more details.
*/
@@ -3349,7 +3375,7 @@ static inline bool __attribute_const__ is_emulated(u32 id)
sys_reg_CRn(id) == 0x0 &&
sys_reg_Op1(id) == 0x0 &&
(sys_reg_CRm(id) == 0 ||
- ((sys_reg_CRm(id) >= 4) && (sys_reg_CRm(id) <= 7))));
+ ((sys_reg_CRm(id) >= 2) && (sys_reg_CRm(id) <= 7))));
}
/*
diff --git a/arch/arm64/kernel/efi-rt-wrapper.S b/arch/arm64/kernel/efi-rt-wrapper.S
index 75691a2641c1..67babd5f04c2 100644
--- a/arch/arm64/kernel/efi-rt-wrapper.S
+++ b/arch/arm64/kernel/efi-rt-wrapper.S
@@ -6,7 +6,7 @@
#include <linux/linkage.h>
SYM_FUNC_START(__efi_rt_asm_wrapper)
- stp x29, x30, [sp, #-32]!
+ stp x29, x30, [sp, #-112]!
mov x29, sp
/*
@@ -17,6 +17,20 @@ SYM_FUNC_START(__efi_rt_asm_wrapper)
stp x1, x18, [sp, #16]
/*
+ * Preserve all callee saved registers and record the stack pointer
+ * value in a per-CPU variable so we can recover from synchronous
+ * exceptions occurring while running the firmware routines.
+ */
+ stp x19, x20, [sp, #32]
+ stp x21, x22, [sp, #48]
+ stp x23, x24, [sp, #64]
+ stp x25, x26, [sp, #80]
+ stp x27, x28, [sp, #96]
+
+ adr_this_cpu x8, __efi_rt_asm_recover_sp, x9
+ str x29, [x8]
+
+ /*
* We are lucky enough that no EFI runtime services take more than
* 5 arguments, so all are passed in registers rather than via the
* stack.
@@ -31,7 +45,7 @@ SYM_FUNC_START(__efi_rt_asm_wrapper)
ldp x1, x2, [sp, #16]
cmp x2, x18
- ldp x29, x30, [sp], #32
+ ldp x29, x30, [sp], #112
b.ne 0f
ret
0:
@@ -45,3 +59,18 @@ SYM_FUNC_START(__efi_rt_asm_wrapper)
mov x18, x2
b efi_handle_corrupted_x18 // tail call
SYM_FUNC_END(__efi_rt_asm_wrapper)
+
+SYM_FUNC_START(__efi_rt_asm_recover)
+ ldr_this_cpu x8, __efi_rt_asm_recover_sp, x9
+ mov sp, x8
+
+ ldp x0, x18, [sp, #16]
+ ldp x19, x20, [sp, #32]
+ ldp x21, x22, [sp, #48]
+ ldp x23, x24, [sp, #64]
+ ldp x25, x26, [sp, #80]
+ ldp x27, x28, [sp, #96]
+ ldp x29, x30, [sp], #112
+
+ b efi_handle_runtime_exception
+SYM_FUNC_END(__efi_rt_asm_recover)
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index e1be6c429810..8d36e66a6e64 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -9,6 +9,7 @@
#include <linux/efi.h>
#include <linux/init.h>
+#include <linux/percpu.h>
#include <asm/efi.h>
@@ -128,3 +129,28 @@ asmlinkage efi_status_t efi_handle_corrupted_x18(efi_status_t s, const char *f)
pr_err_ratelimited(FW_BUG "register x18 corrupted by EFI %s\n", f);
return s;
}
+
+asmlinkage DEFINE_PER_CPU(u64, __efi_rt_asm_recover_sp);
+
+asmlinkage efi_status_t __efi_rt_asm_recover(void);
+
+asmlinkage efi_status_t efi_handle_runtime_exception(const char *f)
+{
+ pr_err(FW_BUG "Synchronous exception occurred in EFI runtime service %s()\n", f);
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ return EFI_ABORTED;
+}
+
+bool efi_runtime_fixup_exception(struct pt_regs *regs, const char *msg)
+{
+ /* Check whether the exception occurred while running the firmware */
+ if (current_work() != &efi_rts_work.work || regs->pc >= TASK_SIZE_64)
+ return false;
+
+ pr_err(FW_BUG "Unable to handle %s in EFI runtime service\n", msg);
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ dump_stack();
+
+ regs->pc = (u64)__efi_rt_asm_recover;
+ return true;
+}
diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c
index 27ef7ad3ffd2..353009d7f307 100644
--- a/arch/arm64/kernel/elfcore.c
+++ b/arch/arm64/kernel/elfcore.c
@@ -47,7 +47,7 @@ static int mte_dump_tag_range(struct coredump_params *cprm,
* Pages mapped in user space as !pte_access_permitted() (e.g.
* PROT_EXEC only) may not have the PG_mte_tagged flag set.
*/
- if (!test_bit(PG_mte_tagged, &page->flags)) {
+ if (!page_mte_tagged(page)) {
put_page(page);
dump_skip(cprm, MTE_PAGE_TAG_STORAGE);
continue;
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 9173fad279af..27369fa1c032 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -329,7 +329,8 @@ static void cortex_a76_erratum_1463225_svc_handler(void)
__this_cpu_write(__in_cortex_a76_erratum_1463225_wa, 0);
}
-static bool cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
+static __always_inline bool
+cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
{
if (!__this_cpu_read(__in_cortex_a76_erratum_1463225_wa))
return false;
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index af5df48ba915..788597a6b6a2 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -271,7 +271,7 @@ static int swsusp_mte_save_tags(void)
if (!page)
continue;
- if (!test_bit(PG_mte_tagged, &page->flags))
+ if (!page_mte_tagged(page))
continue;
ret = save_tags(page, pfn);
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 8151412653de..e3f88b5836a2 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -71,12 +71,6 @@ KVM_NVHE_ALIAS(nvhe_hyp_panic_handler);
/* Vectors installed by hyp-init on reset HVC. */
KVM_NVHE_ALIAS(__hyp_stub_vectors);
-/* Kernel symbol used by icache_is_vpipt(). */
-KVM_NVHE_ALIAS(__icache_flags);
-
-/* VMID bits set by the KVM VMID allocator */
-KVM_NVHE_ALIAS(kvm_arm_vmid_bits);
-
/* Static keys which are set if a vGIC trap should be handled in hyp. */
KVM_NVHE_ALIAS(vgic_v2_cpuif_trap);
KVM_NVHE_ALIAS(vgic_v3_cpuif_trap);
@@ -92,9 +86,6 @@ KVM_NVHE_ALIAS(gic_nonsecure_priorities);
KVM_NVHE_ALIAS(__start___kvm_ex_table);
KVM_NVHE_ALIAS(__stop___kvm_ex_table);
-/* Array containing bases of nVHE per-CPU memory regions. */
-KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base);
-
/* PMU available static key */
#ifdef CONFIG_HW_PERF_EVENTS
KVM_NVHE_ALIAS(kvm_arm_pmu_available);
@@ -111,12 +102,6 @@ KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcpy);
KVM_NVHE_ALIAS_HYP(__memset, __pi_memset);
#endif
-/* Kernel memory sections */
-KVM_NVHE_ALIAS(__start_rodata);
-KVM_NVHE_ALIAS(__end_rodata);
-KVM_NVHE_ALIAS(__bss_start);
-KVM_NVHE_ALIAS(__bss_stop);
-
/* Hyp memory sections */
KVM_NVHE_ALIAS(__hyp_idmap_text_start);
KVM_NVHE_ALIAS(__hyp_idmap_text_end);
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 7467217c1eaf..f5bcb0dc6267 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -41,19 +41,17 @@ static void mte_sync_page_tags(struct page *page, pte_t old_pte,
if (check_swap && is_swap_pte(old_pte)) {
swp_entry_t entry = pte_to_swp_entry(old_pte);
- if (!non_swap_entry(entry) && mte_restore_tags(entry, page))
- return;
+ if (!non_swap_entry(entry))
+ mte_restore_tags(entry, page);
}
if (!pte_is_tagged)
return;
- /*
- * Test PG_mte_tagged again in case it was racing with another
- * set_pte_at().
- */
- if (!test_and_set_bit(PG_mte_tagged, &page->flags))
+ if (try_page_mte_tagging(page)) {
mte_clear_page_tags(page_address(page));
+ set_page_mte_tagged(page);
+ }
}
void mte_sync_tags(pte_t old_pte, pte_t pte)
@@ -69,9 +67,11 @@ void mte_sync_tags(pte_t old_pte, pte_t pte)
/* if PG_mte_tagged is set, tags have already been initialised */
for (i = 0; i < nr_pages; i++, page++) {
- if (!test_bit(PG_mte_tagged, &page->flags))
+ if (!page_mte_tagged(page)) {
mte_sync_page_tags(page, old_pte, check_swap,
pte_is_tagged);
+ set_page_mte_tagged(page);
+ }
}
/* ensure the tags are visible before the PTE is set */
@@ -96,8 +96,7 @@ int memcmp_pages(struct page *page1, struct page *page2)
* pages is tagged, set_pte_at() may zero or change the tags of the
* other page via mte_sync_tags().
*/
- if (test_bit(PG_mte_tagged, &page1->flags) ||
- test_bit(PG_mte_tagged, &page2->flags))
+ if (page_mte_tagged(page1) || page_mte_tagged(page2))
return addr1 != addr2;
return ret;
@@ -454,7 +453,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr,
put_page(page);
break;
}
- WARN_ON_ONCE(!test_bit(PG_mte_tagged, &page->flags));
+ WARN_ON_ONCE(!page_mte_tagged(page));
/* limit access to the end of the page */
offset = offset_in_page(addr);
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 815cc118c675..05da3c8f7e88 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -32,6 +32,8 @@ menuconfig KVM
select KVM_VFIO
select HAVE_KVM_EVENTFD
select HAVE_KVM_IRQFD
+ select HAVE_KVM_DIRTY_RING_ACQ_REL
+ select NEED_KVM_DIRTY_RING_WITH_BITMAP
select HAVE_KVM_MSI
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQ_ROUTING
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 94d33e296e10..00da570ed72b 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -37,6 +37,7 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_pkvm.h>
#include <asm/kvm_emulate.h>
#include <asm/sections.h>
@@ -50,7 +51,6 @@ DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
-unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
static bool vgic_present;
@@ -138,24 +138,24 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
int ret;
- ret = kvm_arm_setup_stage2(kvm, type);
- if (ret)
- return ret;
-
- ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu);
+ ret = kvm_share_hyp(kvm, kvm + 1);
if (ret)
return ret;
- ret = kvm_share_hyp(kvm, kvm + 1);
+ ret = pkvm_init_host_vm(kvm);
if (ret)
- goto out_free_stage2_pgd;
+ goto err_unshare_kvm;
if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) {
ret = -ENOMEM;
- goto out_free_stage2_pgd;
+ goto err_unshare_kvm;
}
cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask);
+ ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type);
+ if (ret)
+ goto err_free_cpumask;
+
kvm_vgic_early_init(kvm);
/* The maximum number of VCPUs is limited by the host's GIC model */
@@ -164,9 +164,18 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
set_default_spectre(kvm);
kvm_arm_init_hypercalls(kvm);
- return ret;
-out_free_stage2_pgd:
- kvm_free_stage2_pgd(&kvm->arch.mmu);
+ /*
+ * Initialise the default PMUver before there is a chance to
+ * create an actual PMU.
+ */
+ kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit();
+
+ return 0;
+
+err_free_cpumask:
+ free_cpumask_var(kvm->arch.supported_cpus);
+err_unshare_kvm:
+ kvm_unshare_hyp(kvm, kvm + 1);
return ret;
}
@@ -187,6 +196,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_vgic_destroy(kvm);
+ if (is_protected_kvm_enabled())
+ pkvm_destroy_hyp_vm(kvm);
+
kvm_destroy_vcpus(kvm);
kvm_unshare_hyp(kvm, kvm + 1);
@@ -569,6 +581,12 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
if (ret)
return ret;
+ if (is_protected_kvm_enabled()) {
+ ret = pkvm_create_hyp_vm(kvm);
+ if (ret)
+ return ret;
+ }
+
if (!irqchip_in_kernel(kvm)) {
/*
* Tell the rest of the code that there are userspace irqchip
@@ -746,6 +764,9 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
return kvm_vcpu_suspend(vcpu);
+
+ if (kvm_dirty_ring_check_request(vcpu))
+ return 0;
}
return 1;
@@ -1518,7 +1539,7 @@ static int kvm_init_vector_slots(void)
return 0;
}
-static void cpu_prepare_hyp_mode(int cpu)
+static void cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
{
struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
unsigned long tcr;
@@ -1534,23 +1555,9 @@ static void cpu_prepare_hyp_mode(int cpu)
params->mair_el2 = read_sysreg(mair_el1);
- /*
- * The ID map may be configured to use an extended virtual address
- * range. This is only the case if system RAM is out of range for the
- * currently configured page size and VA_BITS, in which case we will
- * also need the extended virtual range for the HYP ID map, or we won't
- * be able to enable the EL2 MMU.
- *
- * However, at EL2, there is only one TTBR register, and we can't switch
- * between translation tables *and* update TCR_EL2.T0SZ at the same
- * time. Bottom line: we need to use the extended range with *both* our
- * translation tables.
- *
- * So use the same T0SZ value we use for the ID map.
- */
tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1;
tcr &= ~TCR_T0SZ_MASK;
- tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET;
+ tcr |= TCR_T0SZ(hyp_va_bits);
params->tcr_el2 = tcr;
params->pgd_pa = kvm_mmu_get_httbr();
@@ -1844,13 +1851,13 @@ static void teardown_hyp_mode(void)
free_hyp_pgds();
for_each_possible_cpu(cpu) {
free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
- free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order());
+ free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order());
}
}
static int do_pkvm_init(u32 hyp_va_bits)
{
- void *per_cpu_base = kvm_ksym_ref(kvm_arm_hyp_percpu_base);
+ void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base));
int ret;
preempt_disable();
@@ -1870,11 +1877,8 @@ static int do_pkvm_init(u32 hyp_va_bits)
return ret;
}
-static int kvm_hyp_init_protection(u32 hyp_va_bits)
+static void kvm_hyp_init_symbols(void)
{
- void *addr = phys_to_virt(hyp_mem_base);
- int ret;
-
kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1);
@@ -1883,6 +1887,14 @@ static int kvm_hyp_init_protection(u32 hyp_va_bits)
kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
+ kvm_nvhe_sym(__icache_flags) = __icache_flags;
+ kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
+}
+
+static int kvm_hyp_init_protection(u32 hyp_va_bits)
+{
+ void *addr = phys_to_virt(hyp_mem_base);
+ int ret;
ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP);
if (ret)
@@ -1950,7 +1962,7 @@ static int init_hyp_mode(void)
page_addr = page_address(page);
memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
- kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr;
+ kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr;
}
/*
@@ -2043,7 +2055,7 @@ static int init_hyp_mode(void)
}
for_each_possible_cpu(cpu) {
- char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
+ char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu];
char *percpu_end = percpu_begin + nvhe_percpu_size();
/* Map Hyp percpu pages */
@@ -2054,9 +2066,11 @@ static int init_hyp_mode(void)
}
/* Prepare the CPU initialization parameters */
- cpu_prepare_hyp_mode(cpu);
+ cpu_prepare_hyp_mode(cpu, hyp_va_bits);
}
+ kvm_hyp_init_symbols();
+
if (is_protected_kvm_enabled()) {
init_cpu_logical_map();
@@ -2064,9 +2078,7 @@ static int init_hyp_mode(void)
err = -ENODEV;
goto out_err;
}
- }
- if (is_protected_kvm_enabled()) {
err = kvm_hyp_init_protection(hyp_va_bits);
if (err) {
kvm_err("Failed to init hyp memory protection\n");
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 2ff13a3f8479..5626ddb540ce 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -1059,7 +1059,7 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
maddr = page_address(page);
if (!write) {
- if (test_bit(PG_mte_tagged, &page->flags))
+ if (page_mte_tagged(page))
num_tags = mte_copy_tags_to_user(tags, maddr,
MTE_GRANULES_PER_PAGE);
else
@@ -1068,15 +1068,19 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
clear_user(tags, MTE_GRANULES_PER_PAGE);
kvm_release_pfn_clean(pfn);
} else {
+ /*
+ * Only locking to serialise with a concurrent
+ * set_pte_at() in the VMM but still overriding the
+ * tags, hence ignoring the return value.
+ */
+ try_page_mte_tagging(page);
num_tags = mte_copy_tags_from_user(maddr, tags,
MTE_GRANULES_PER_PAGE);
- /*
- * Set the flag after checking the write
- * completed fully
- */
- if (num_tags == MTE_GRANULES_PER_PAGE)
- set_bit(PG_mte_tagged, &page->flags);
+ /* uaccess failed, don't leave stale tags */
+ if (num_tags != MTE_GRANULES_PER_PAGE)
+ mte_clear_page_tags(page);
+ set_page_mte_tagged(page);
kvm_release_pfn_dirty(pfn);
}
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index b7557b25ed56..791d3de76771 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -13,6 +13,7 @@
#include <hyp/adjust_pc.h>
#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
+#include <asm/kvm_mmu.h>
#if !defined (__KVM_NVHE_HYPERVISOR__) && !defined (__KVM_VHE_HYPERVISOR__)
#error Hypervisor code only!
@@ -115,7 +116,7 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
new |= (old & PSR_C_BIT);
new |= (old & PSR_V_BIT);
- if (kvm_has_mte(vcpu->kvm))
+ if (kvm_has_mte(kern_hyp_va(vcpu->kvm)))
new |= PSR_TCO_BIT;
new |= (old & PSR_DIT_BIT);
diff --git a/arch/arm64/kvm/hyp/hyp-constants.c b/arch/arm64/kvm/hyp/hyp-constants.c
index b3742a6691e8..b257a3b4bfc5 100644
--- a/arch/arm64/kvm/hyp/hyp-constants.c
+++ b/arch/arm64/kvm/hyp/hyp-constants.c
@@ -2,9 +2,12 @@
#include <linux/kbuild.h>
#include <nvhe/memory.h>
+#include <nvhe/pkvm.h>
int main(void)
{
DEFINE(STRUCT_HYP_PAGE_SIZE, sizeof(struct hyp_page));
+ DEFINE(PKVM_HYP_VM_SIZE, sizeof(struct pkvm_hyp_vm));
+ DEFINE(PKVM_HYP_VCPU_SIZE, sizeof(struct pkvm_hyp_vcpu));
return 0;
}
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 6cbbb6c02f66..3330d1b76bdd 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -87,6 +87,17 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2);
write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
+
+ if (cpus_have_final_cap(ARM64_SME)) {
+ sysreg_clear_set_s(SYS_HFGRTR_EL2,
+ HFGxTR_EL2_nSMPRI_EL1_MASK |
+ HFGxTR_EL2_nTPIDR2_EL0_MASK,
+ 0);
+ sysreg_clear_set_s(SYS_HFGWTR_EL2,
+ HFGxTR_EL2_nSMPRI_EL1_MASK |
+ HFGxTR_EL2_nTPIDR2_EL0_MASK,
+ 0);
+ }
}
static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
@@ -96,6 +107,15 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
write_sysreg(0, hstr_el2);
if (kvm_arm_support_pmu_v3())
write_sysreg(0, pmuserenr_el0);
+
+ if (cpus_have_final_cap(ARM64_SME)) {
+ sysreg_clear_set_s(SYS_HFGRTR_EL2, 0,
+ HFGxTR_EL2_nSMPRI_EL1_MASK |
+ HFGxTR_EL2_nTPIDR2_EL0_MASK);
+ sysreg_clear_set_s(SYS_HFGWTR_EL2, 0,
+ HFGxTR_EL2_nSMPRI_EL1_MASK |
+ HFGxTR_EL2_nTPIDR2_EL0_MASK);
+ }
}
static inline void ___activate_traps(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index 80e99836eac7..b7bdbe63deed 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -8,8 +8,10 @@
#define __KVM_NVHE_MEM_PROTECT__
#include <linux/kvm_host.h>
#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/virt.h>
+#include <nvhe/pkvm.h>
#include <nvhe/spinlock.h>
/*
@@ -43,30 +45,45 @@ static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot)
return prot & PKVM_PAGE_STATE_PROT_MASK;
}
-struct host_kvm {
+struct host_mmu {
struct kvm_arch arch;
struct kvm_pgtable pgt;
struct kvm_pgtable_mm_ops mm_ops;
hyp_spinlock_t lock;
};
-extern struct host_kvm host_kvm;
+extern struct host_mmu host_mmu;
-extern const u8 pkvm_hyp_id;
+/* This corresponds to page-table locking order */
+enum pkvm_component_id {
+ PKVM_ID_HOST,
+ PKVM_ID_HYP,
+};
+
+extern unsigned long hyp_nr_cpus;
int __pkvm_prot_finalize(void);
int __pkvm_host_share_hyp(u64 pfn);
int __pkvm_host_unshare_hyp(u64 pfn);
+int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages);
+int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages);
bool addr_is_memory(phys_addr_t phys);
int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id);
int kvm_host_prepare_stage2(void *pgt_pool_base);
+int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd);
void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
+int hyp_pin_shared_mem(void *from, void *to);
+void hyp_unpin_shared_mem(void *from, void *to);
+void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc);
+int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages,
+ struct kvm_hyp_memcache *host_mc);
+
static __always_inline void __load_host_stage2(void)
{
if (static_branch_likely(&kvm_protected_mode_initialized))
- __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch);
+ __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch);
else
write_sysreg(0, vttbr_el2);
}
diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h
index 592b7edb3edb..ab205c4d6774 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/memory.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h
@@ -38,6 +38,10 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr)
#define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page))
#define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool)
+/*
+ * Refcounting for 'struct hyp_page'.
+ * hyp_pool::lock must be held if atomic access to the refcount is required.
+ */
static inline int hyp_page_count(void *addr)
{
struct hyp_page *p = hyp_virt_to_page(addr);
@@ -45,4 +49,27 @@ static inline int hyp_page_count(void *addr)
return p->refcount;
}
+static inline void hyp_page_ref_inc(struct hyp_page *p)
+{
+ BUG_ON(p->refcount == USHRT_MAX);
+ p->refcount++;
+}
+
+static inline void hyp_page_ref_dec(struct hyp_page *p)
+{
+ BUG_ON(!p->refcount);
+ p->refcount--;
+}
+
+static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
+{
+ hyp_page_ref_dec(p);
+ return (p->refcount == 0);
+}
+
+static inline void hyp_set_page_refcounted(struct hyp_page *p)
+{
+ BUG_ON(p->refcount);
+ p->refcount = 1;
+}
#endif /* __KVM_HYP_MEMORY_H */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h
index 42d8eb9bfe72..d5ec972b5c1e 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h
@@ -13,9 +13,13 @@
extern struct kvm_pgtable pkvm_pgtable;
extern hyp_spinlock_t pkvm_pgd_lock;
+int hyp_create_pcpu_fixmap(void);
+void *hyp_fixmap_map(phys_addr_t phys);
+void hyp_fixmap_unmap(void);
+
int hyp_create_idmap(u32 hyp_va_bits);
int hyp_map_vectors(void);
-int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back);
+int hyp_back_vmemmap(phys_addr_t back);
int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot);
int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot);
@@ -24,16 +28,4 @@ int __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
unsigned long *haddr);
int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr);
-static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size,
- unsigned long *start, unsigned long *end)
-{
- unsigned long nr_pages = size >> PAGE_SHIFT;
- struct hyp_page *p = hyp_phys_to_page(phys);
-
- *start = (unsigned long)p;
- *end = *start + nr_pages * sizeof(struct hyp_page);
- *start = ALIGN_DOWN(*start, PAGE_SIZE);
- *end = ALIGN(*end, PAGE_SIZE);
-}
-
#endif /* __KVM_HYP_MM_H */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
new file mode 100644
index 000000000000..82b3d62538a6
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Google LLC
+ * Author: Fuad Tabba <tabba@google.com>
+ */
+
+#ifndef __ARM64_KVM_NVHE_PKVM_H__
+#define __ARM64_KVM_NVHE_PKVM_H__
+
+#include <asm/kvm_pkvm.h>
+
+#include <nvhe/gfp.h>
+#include <nvhe/spinlock.h>
+
+/*
+ * Holds the relevant data for maintaining the vcpu state completely at hyp.
+ */
+struct pkvm_hyp_vcpu {
+ struct kvm_vcpu vcpu;
+
+ /* Backpointer to the host's (untrusted) vCPU instance. */
+ struct kvm_vcpu *host_vcpu;
+};
+
+/*
+ * Holds the relevant data for running a protected vm.
+ */
+struct pkvm_hyp_vm {
+ struct kvm kvm;
+
+ /* Backpointer to the host's (untrusted) KVM instance. */
+ struct kvm *host_kvm;
+
+ /* The guest's stage-2 page-table managed by the hypervisor. */
+ struct kvm_pgtable pgt;
+ struct kvm_pgtable_mm_ops mm_ops;
+ struct hyp_pool pool;
+ hyp_spinlock_t lock;
+
+ /*
+ * The number of vcpus initialized and ready to run.
+ * Modifying this is protected by 'vm_table_lock'.
+ */
+ unsigned int nr_vcpus;
+
+ /* Array of the hyp vCPU structures for this VM. */
+ struct pkvm_hyp_vcpu *vcpus[];
+};
+
+static inline struct pkvm_hyp_vm *
+pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ return container_of(hyp_vcpu->vcpu.kvm, struct pkvm_hyp_vm, kvm);
+}
+
+void pkvm_hyp_vm_table_init(void *tbl);
+
+int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
+ unsigned long pgd_hva);
+int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
+ unsigned long vcpu_hva);
+int __pkvm_teardown_vm(pkvm_handle_t handle);
+
+struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
+ unsigned int vcpu_idx);
+void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu);
+
+#endif /* __ARM64_KVM_NVHE_PKVM_H__ */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h
index 4652fd04bdbe..7c7ea8c55405 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h
@@ -28,9 +28,17 @@ typedef union hyp_spinlock {
};
} hyp_spinlock_t;
+#define __HYP_SPIN_LOCK_INITIALIZER \
+ { .__val = 0 }
+
+#define __HYP_SPIN_LOCK_UNLOCKED \
+ ((hyp_spinlock_t) __HYP_SPIN_LOCK_INITIALIZER)
+
+#define DEFINE_HYP_SPINLOCK(x) hyp_spinlock_t x = __HYP_SPIN_LOCK_UNLOCKED
+
#define hyp_spin_lock_init(l) \
do { \
- *(l) = (hyp_spinlock_t){ .__val = 0 }; \
+ *(l) = __HYP_SPIN_LOCK_UNLOCKED; \
} while (0)
static inline void hyp_spin_lock(hyp_spinlock_t *lock)
diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S
index 0c367eb5f4e2..85936c17ae40 100644
--- a/arch/arm64/kvm/hyp/nvhe/cache.S
+++ b/arch/arm64/kvm/hyp/nvhe/cache.S
@@ -12,3 +12,14 @@ SYM_FUNC_START(__pi_dcache_clean_inval_poc)
ret
SYM_FUNC_END(__pi_dcache_clean_inval_poc)
SYM_FUNC_ALIAS(dcache_clean_inval_poc, __pi_dcache_clean_inval_poc)
+
+SYM_FUNC_START(__pi_icache_inval_pou)
+alternative_if ARM64_HAS_CACHE_DIC
+ isb
+ ret
+alternative_else_nop_endif
+
+ invalidate_icache_by_line x0, x1, x2, x3
+ ret
+SYM_FUNC_END(__pi_icache_inval_pou)
+SYM_FUNC_ALIAS(icache_inval_pou, __pi_icache_inval_pou)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 3cea4b6ac23e..728e01d4536b 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -15,17 +15,93 @@
#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
+#include <nvhe/pkvm.h>
#include <nvhe/trap_handler.h>
DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt);
+static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
+
+ hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt;
+
+ hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state);
+ hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl;
+
+ hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu;
+
+ hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2;
+ hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2;
+ hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2;
+
+ hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags;
+ hyp_vcpu->vcpu.arch.fp_state = host_vcpu->arch.fp_state;
+
+ hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr);
+ hyp_vcpu->vcpu.arch.host_fpsimd_state = host_vcpu->arch.host_fpsimd_state;
+
+ hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2;
+
+ hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3;
+}
+
+static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
+ struct vgic_v3_cpu_if *hyp_cpu_if = &hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3;
+ struct vgic_v3_cpu_if *host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3;
+ unsigned int i;
+
+ host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt;
+
+ host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2;
+ host_vcpu->arch.cptr_el2 = hyp_vcpu->vcpu.arch.cptr_el2;
+
+ host_vcpu->arch.fault = hyp_vcpu->vcpu.arch.fault;
+
+ host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags;
+ host_vcpu->arch.fp_state = hyp_vcpu->vcpu.arch.fp_state;
+
+ host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr;
+ for (i = 0; i < hyp_cpu_if->used_lrs; ++i)
+ host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i];
+}
+
static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt)
{
- DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1);
+ DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 1);
+ int ret;
- cpu_reg(host_ctxt, 1) = __kvm_vcpu_run(kern_hyp_va(vcpu));
+ host_vcpu = kern_hyp_va(host_vcpu);
+
+ if (unlikely(is_protected_kvm_enabled())) {
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ struct kvm *host_kvm;
+
+ host_kvm = kern_hyp_va(host_vcpu->kvm);
+ hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle,
+ host_vcpu->vcpu_idx);
+ if (!hyp_vcpu) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ flush_hyp_vcpu(hyp_vcpu);
+
+ ret = __kvm_vcpu_run(&hyp_vcpu->vcpu);
+
+ sync_hyp_vcpu(hyp_vcpu);
+ pkvm_put_hyp_vcpu(hyp_vcpu);
+ } else {
+ /* The host is fully trusted, run its vCPU directly. */
+ ret = __kvm_vcpu_run(host_vcpu);
+ }
+
+out:
+ cpu_reg(host_ctxt, 1) = ret;
}
static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt)
@@ -191,6 +267,33 @@ static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt)
__pkvm_vcpu_init_traps(kern_hyp_va(vcpu));
}
+static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1);
+ DECLARE_REG(unsigned long, vm_hva, host_ctxt, 2);
+ DECLARE_REG(unsigned long, pgd_hva, host_ctxt, 3);
+
+ host_kvm = kern_hyp_va(host_kvm);
+ cpu_reg(host_ctxt, 1) = __pkvm_init_vm(host_kvm, vm_hva, pgd_hva);
+}
+
+static void handle___pkvm_init_vcpu(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+ DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 2);
+ DECLARE_REG(unsigned long, vcpu_hva, host_ctxt, 3);
+
+ host_vcpu = kern_hyp_va(host_vcpu);
+ cpu_reg(host_ctxt, 1) = __pkvm_init_vcpu(handle, host_vcpu, vcpu_hva);
+}
+
+static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle);
+}
+
typedef void (*hcall_t)(struct kvm_cpu_context *);
#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@ -220,6 +323,9 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__vgic_v3_save_aprs),
HANDLE_FUNC(__vgic_v3_restore_aprs),
HANDLE_FUNC(__pkvm_vcpu_init_traps),
+ HANDLE_FUNC(__pkvm_init_vm),
+ HANDLE_FUNC(__pkvm_init_vcpu),
+ HANDLE_FUNC(__pkvm_teardown_vm),
};
static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
index 9f54833af400..04d194583f1e 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
@@ -23,6 +23,8 @@ u64 cpu_logical_map(unsigned int cpu)
return hyp_cpu_logical_map[cpu];
}
+unsigned long __ro_after_init kvm_arm_hyp_percpu_base[NR_CPUS];
+
unsigned long __hyp_per_cpu_offset(unsigned int cpu)
{
unsigned long *cpu_base_array;
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 1e78acf9662e..552653fa18be 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -21,21 +21,33 @@
#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP)
-extern unsigned long hyp_nr_cpus;
-struct host_kvm host_kvm;
+struct host_mmu host_mmu;
static struct hyp_pool host_s2_pool;
-const u8 pkvm_hyp_id = 1;
+static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm);
+#define current_vm (*this_cpu_ptr(&__current_vm))
+
+static void guest_lock_component(struct pkvm_hyp_vm *vm)
+{
+ hyp_spin_lock(&vm->lock);
+ current_vm = vm;
+}
+
+static void guest_unlock_component(struct pkvm_hyp_vm *vm)
+{
+ current_vm = NULL;
+ hyp_spin_unlock(&vm->lock);
+}
static void host_lock_component(void)
{
- hyp_spin_lock(&host_kvm.lock);
+ hyp_spin_lock(&host_mmu.lock);
}
static void host_unlock_component(void)
{
- hyp_spin_unlock(&host_kvm.lock);
+ hyp_spin_unlock(&host_mmu.lock);
}
static void hyp_lock_component(void)
@@ -79,6 +91,11 @@ static void host_s2_put_page(void *addr)
hyp_put_page(&host_s2_pool, addr);
}
+static void host_s2_free_removed_table(void *addr, u32 level)
+{
+ kvm_pgtable_stage2_free_removed(&host_mmu.mm_ops, addr, level);
+}
+
static int prepare_s2_pool(void *pgt_pool_base)
{
unsigned long nr_pages, pfn;
@@ -90,9 +107,10 @@ static int prepare_s2_pool(void *pgt_pool_base)
if (ret)
return ret;
- host_kvm.mm_ops = (struct kvm_pgtable_mm_ops) {
+ host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) {
.zalloc_pages_exact = host_s2_zalloc_pages_exact,
.zalloc_page = host_s2_zalloc_page,
+ .free_removed_table = host_s2_free_removed_table,
.phys_to_virt = hyp_phys_to_virt,
.virt_to_phys = hyp_virt_to_phys,
.page_count = hyp_page_count,
@@ -111,7 +129,7 @@ static void prepare_host_vtcr(void)
parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val);
phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange);
- host_kvm.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val,
+ host_mmu.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val,
id_aa64mmfr1_el1_sys_val, phys_shift);
}
@@ -119,45 +137,170 @@ static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot pr
int kvm_host_prepare_stage2(void *pgt_pool_base)
{
- struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
+ struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu;
int ret;
prepare_host_vtcr();
- hyp_spin_lock_init(&host_kvm.lock);
- mmu->arch = &host_kvm.arch;
+ hyp_spin_lock_init(&host_mmu.lock);
+ mmu->arch = &host_mmu.arch;
ret = prepare_s2_pool(pgt_pool_base);
if (ret)
return ret;
- ret = __kvm_pgtable_stage2_init(&host_kvm.pgt, mmu,
- &host_kvm.mm_ops, KVM_HOST_S2_FLAGS,
+ ret = __kvm_pgtable_stage2_init(&host_mmu.pgt, mmu,
+ &host_mmu.mm_ops, KVM_HOST_S2_FLAGS,
host_stage2_force_pte_cb);
if (ret)
return ret;
- mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd);
- mmu->pgt = &host_kvm.pgt;
+ mmu->pgd_phys = __hyp_pa(host_mmu.pgt.pgd);
+ mmu->pgt = &host_mmu.pgt;
atomic64_set(&mmu->vmid.id, 0);
return 0;
}
+static bool guest_stage2_force_pte_cb(u64 addr, u64 end,
+ enum kvm_pgtable_prot prot)
+{
+ return true;
+}
+
+static void *guest_s2_zalloc_pages_exact(size_t size)
+{
+ void *addr = hyp_alloc_pages(&current_vm->pool, get_order(size));
+
+ WARN_ON(size != (PAGE_SIZE << get_order(size)));
+ hyp_split_page(hyp_virt_to_page(addr));
+
+ return addr;
+}
+
+static void guest_s2_free_pages_exact(void *addr, unsigned long size)
+{
+ u8 order = get_order(size);
+ unsigned int i;
+
+ for (i = 0; i < (1 << order); i++)
+ hyp_put_page(&current_vm->pool, addr + (i * PAGE_SIZE));
+}
+
+static void *guest_s2_zalloc_page(void *mc)
+{
+ struct hyp_page *p;
+ void *addr;
+
+ addr = hyp_alloc_pages(&current_vm->pool, 0);
+ if (addr)
+ return addr;
+
+ addr = pop_hyp_memcache(mc, hyp_phys_to_virt);
+ if (!addr)
+ return addr;
+
+ memset(addr, 0, PAGE_SIZE);
+ p = hyp_virt_to_page(addr);
+ memset(p, 0, sizeof(*p));
+ p->refcount = 1;
+
+ return addr;
+}
+
+static void guest_s2_get_page(void *addr)
+{
+ hyp_get_page(&current_vm->pool, addr);
+}
+
+static void guest_s2_put_page(void *addr)
+{
+ hyp_put_page(&current_vm->pool, addr);
+}
+
+static void clean_dcache_guest_page(void *va, size_t size)
+{
+ __clean_dcache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size);
+ hyp_fixmap_unmap();
+}
+
+static void invalidate_icache_guest_page(void *va, size_t size)
+{
+ __invalidate_icache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size);
+ hyp_fixmap_unmap();
+}
+
+int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
+{
+ struct kvm_s2_mmu *mmu = &vm->kvm.arch.mmu;
+ unsigned long nr_pages;
+ int ret;
+
+ nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT;
+ ret = hyp_pool_init(&vm->pool, hyp_virt_to_pfn(pgd), nr_pages, 0);
+ if (ret)
+ return ret;
+
+ hyp_spin_lock_init(&vm->lock);
+ vm->mm_ops = (struct kvm_pgtable_mm_ops) {
+ .zalloc_pages_exact = guest_s2_zalloc_pages_exact,
+ .free_pages_exact = guest_s2_free_pages_exact,
+ .zalloc_page = guest_s2_zalloc_page,
+ .phys_to_virt = hyp_phys_to_virt,
+ .virt_to_phys = hyp_virt_to_phys,
+ .page_count = hyp_page_count,
+ .get_page = guest_s2_get_page,
+ .put_page = guest_s2_put_page,
+ .dcache_clean_inval_poc = clean_dcache_guest_page,
+ .icache_inval_pou = invalidate_icache_guest_page,
+ };
+
+ guest_lock_component(vm);
+ ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0,
+ guest_stage2_force_pte_cb);
+ guest_unlock_component(vm);
+ if (ret)
+ return ret;
+
+ vm->kvm.arch.mmu.pgd_phys = __hyp_pa(vm->pgt.pgd);
+
+ return 0;
+}
+
+void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
+{
+ void *addr;
+
+ /* Dump all pgtable pages in the hyp_pool */
+ guest_lock_component(vm);
+ kvm_pgtable_stage2_destroy(&vm->pgt);
+ vm->kvm.arch.mmu.pgd_phys = 0ULL;
+ guest_unlock_component(vm);
+
+ /* Drain the hyp_pool into the memcache */
+ addr = hyp_alloc_pages(&vm->pool, 0);
+ while (addr) {
+ memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page));
+ push_hyp_memcache(mc, addr, hyp_virt_to_phys);
+ WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1));
+ addr = hyp_alloc_pages(&vm->pool, 0);
+ }
+}
+
int __pkvm_prot_finalize(void)
{
- struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
+ struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu;
struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
if (params->hcr_el2 & HCR_VM)
return -EPERM;
params->vttbr = kvm_get_vttbr(mmu);
- params->vtcr = host_kvm.arch.vtcr;
+ params->vtcr = host_mmu.arch.vtcr;
params->hcr_el2 |= HCR_VM;
kvm_flush_dcache_to_poc(params, sizeof(*params));
write_sysreg(params->hcr_el2, hcr_el2);
- __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch);
+ __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch);
/*
* Make sure to have an ISB before the TLB maintenance below but only
@@ -175,7 +318,7 @@ int __pkvm_prot_finalize(void)
static int host_stage2_unmap_dev_all(void)
{
- struct kvm_pgtable *pgt = &host_kvm.pgt;
+ struct kvm_pgtable *pgt = &host_mmu.pgt;
struct memblock_region *reg;
u64 addr = 0;
int i, ret;
@@ -195,7 +338,7 @@ struct kvm_mem_range {
u64 end;
};
-static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
+static struct memblock_region *find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
{
int cur, left = 0, right = hyp_memblock_nr;
struct memblock_region *reg;
@@ -218,18 +361,28 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
} else {
range->start = reg->base;
range->end = end;
- return true;
+ return reg;
}
}
- return false;
+ return NULL;
}
bool addr_is_memory(phys_addr_t phys)
{
struct kvm_mem_range range;
- return find_mem_range(phys, &range);
+ return !!find_mem_range(phys, &range);
+}
+
+static bool addr_is_allowed_memory(phys_addr_t phys)
+{
+ struct memblock_region *reg;
+ struct kvm_mem_range range;
+
+ reg = find_mem_range(phys, &range);
+
+ return reg && !(reg->flags & MEMBLOCK_NOMAP);
}
static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range)
@@ -250,8 +403,8 @@ static bool range_is_memory(u64 start, u64 end)
static inline int __host_stage2_idmap(u64 start, u64 end,
enum kvm_pgtable_prot prot)
{
- return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start,
- prot, &host_s2_pool);
+ return kvm_pgtable_stage2_map(&host_mmu.pgt, start, end - start, start,
+ prot, &host_s2_pool, 0);
}
/*
@@ -263,7 +416,7 @@ static inline int __host_stage2_idmap(u64 start, u64 end,
#define host_stage2_try(fn, ...) \
({ \
int __ret; \
- hyp_assert_lock_held(&host_kvm.lock); \
+ hyp_assert_lock_held(&host_mmu.lock); \
__ret = fn(__VA_ARGS__); \
if (__ret == -ENOMEM) { \
__ret = host_stage2_unmap_dev_all(); \
@@ -286,8 +439,8 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
u32 level;
int ret;
- hyp_assert_lock_held(&host_kvm.lock);
- ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level);
+ hyp_assert_lock_held(&host_mmu.lock);
+ ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, &level);
if (ret)
return ret;
@@ -319,7 +472,7 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
{
- return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt,
+ return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt,
addr, size, &host_s2_pool, owner_id);
}
@@ -348,7 +501,7 @@ static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot pr
static int host_stage2_idmap(u64 addr)
{
struct kvm_mem_range range;
- bool is_memory = find_mem_range(addr, &range);
+ bool is_memory = !!find_mem_range(addr, &range);
enum kvm_pgtable_prot prot;
int ret;
@@ -380,12 +533,6 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
BUG_ON(ret && ret != -EAGAIN);
}
-/* This corresponds to locking order */
-enum pkvm_component_id {
- PKVM_ID_HOST,
- PKVM_ID_HYP,
-};
-
struct pkvm_mem_transition {
u64 nr_pages;
@@ -399,6 +546,9 @@ struct pkvm_mem_transition {
/* Address in the completer's address space */
u64 completer_addr;
} host;
+ struct {
+ u64 completer_addr;
+ } hyp;
};
} initiator;
@@ -412,23 +562,24 @@ struct pkvm_mem_share {
const enum kvm_pgtable_prot completer_prot;
};
+struct pkvm_mem_donation {
+ const struct pkvm_mem_transition tx;
+};
+
struct check_walk_data {
enum pkvm_page_state desired;
enum pkvm_page_state (*get_page_state)(kvm_pte_t pte);
};
-static int __check_page_state_visitor(u64 addr, u64 end, u32 level,
- kvm_pte_t *ptep,
- enum kvm_pgtable_walk_flags flag,
- void * const arg)
+static int __check_page_state_visitor(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- struct check_walk_data *d = arg;
- kvm_pte_t pte = *ptep;
+ struct check_walk_data *d = ctx->arg;
- if (kvm_pte_valid(pte) && !addr_is_memory(kvm_pte_to_phys(pte)))
+ if (kvm_pte_valid(ctx->old) && !addr_is_allowed_memory(kvm_pte_to_phys(ctx->old)))
return -EINVAL;
- return d->get_page_state(pte) == d->desired ? 0 : -EPERM;
+ return d->get_page_state(ctx->old) == d->desired ? 0 : -EPERM;
}
static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size,
@@ -459,8 +610,8 @@ static int __host_check_page_state_range(u64 addr, u64 size,
.get_page_state = host_get_page_state,
};
- hyp_assert_lock_held(&host_kvm.lock);
- return check_page_state_range(&host_kvm.pgt, addr, size, &d);
+ hyp_assert_lock_held(&host_mmu.lock);
+ return check_page_state_range(&host_mmu.pgt, addr, size, &d);
}
static int __host_set_page_state_range(u64 addr, u64 size,
@@ -511,12 +662,52 @@ static int host_initiate_unshare(u64 *completer_addr,
return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED);
}
+static int host_initiate_donation(u64 *completer_addr,
+ const struct pkvm_mem_transition *tx)
+{
+ u8 owner_id = tx->completer.id;
+ u64 size = tx->nr_pages * PAGE_SIZE;
+
+ *completer_addr = tx->initiator.host.completer_addr;
+ return host_stage2_set_owner_locked(tx->initiator.addr, size, owner_id);
+}
+
+static bool __host_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx)
+{
+ return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) ||
+ tx->initiator.id != PKVM_ID_HYP);
+}
+
+static int __host_ack_transition(u64 addr, const struct pkvm_mem_transition *tx,
+ enum pkvm_page_state state)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+
+ if (__host_ack_skip_pgtable_check(tx))
+ return 0;
+
+ return __host_check_page_state_range(addr, size, state);
+}
+
+static int host_ack_donation(u64 addr, const struct pkvm_mem_transition *tx)
+{
+ return __host_ack_transition(addr, tx, PKVM_NOPAGE);
+}
+
+static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+ u8 host_id = tx->completer.id;
+
+ return host_stage2_set_owner_locked(addr, size, host_id);
+}
+
static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte)
{
if (!kvm_pte_valid(pte))
return PKVM_NOPAGE;
- return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte));
+ return pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte));
}
static int __hyp_check_page_state_range(u64 addr, u64 size,
@@ -531,6 +722,27 @@ static int __hyp_check_page_state_range(u64 addr, u64 size,
return check_page_state_range(&pkvm_pgtable, addr, size, &d);
}
+static int hyp_request_donation(u64 *completer_addr,
+ const struct pkvm_mem_transition *tx)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+ u64 addr = tx->initiator.addr;
+
+ *completer_addr = tx->initiator.hyp.completer_addr;
+ return __hyp_check_page_state_range(addr, size, PKVM_PAGE_OWNED);
+}
+
+static int hyp_initiate_donation(u64 *completer_addr,
+ const struct pkvm_mem_transition *tx)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+ int ret;
+
+ *completer_addr = tx->initiator.hyp.completer_addr;
+ ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, tx->initiator.addr, size);
+ return (ret != size) ? -EFAULT : 0;
+}
+
static bool __hyp_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx)
{
return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) ||
@@ -555,6 +767,9 @@ static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx)
{
u64 size = tx->nr_pages * PAGE_SIZE;
+ if (tx->initiator.id == PKVM_ID_HOST && hyp_page_count((void *)addr))
+ return -EBUSY;
+
if (__hyp_ack_skip_pgtable_check(tx))
return 0;
@@ -562,6 +777,16 @@ static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx)
PKVM_PAGE_SHARED_BORROWED);
}
+static int hyp_ack_donation(u64 addr, const struct pkvm_mem_transition *tx)
+{
+ u64 size = tx->nr_pages * PAGE_SIZE;
+
+ if (__hyp_ack_skip_pgtable_check(tx))
+ return 0;
+
+ return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE);
+}
+
static int hyp_complete_share(u64 addr, const struct pkvm_mem_transition *tx,
enum kvm_pgtable_prot perms)
{
@@ -580,6 +805,15 @@ static int hyp_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx)
return (ret != size) ? -EFAULT : 0;
}
+static int hyp_complete_donation(u64 addr,
+ const struct pkvm_mem_transition *tx)
+{
+ void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE);
+ enum kvm_pgtable_prot prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED);
+
+ return pkvm_create_mappings_locked(start, end, prot);
+}
+
static int check_share(struct pkvm_mem_share *share)
{
const struct pkvm_mem_transition *tx = &share->tx;
@@ -732,6 +966,94 @@ static int do_unshare(struct pkvm_mem_share *share)
return WARN_ON(__do_unshare(share));
}
+static int check_donation(struct pkvm_mem_donation *donation)
+{
+ const struct pkvm_mem_transition *tx = &donation->tx;
+ u64 completer_addr;
+ int ret;
+
+ switch (tx->initiator.id) {
+ case PKVM_ID_HOST:
+ ret = host_request_owned_transition(&completer_addr, tx);
+ break;
+ case PKVM_ID_HYP:
+ ret = hyp_request_donation(&completer_addr, tx);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ return ret;
+
+ switch (tx->completer.id) {
+ case PKVM_ID_HOST:
+ ret = host_ack_donation(completer_addr, tx);
+ break;
+ case PKVM_ID_HYP:
+ ret = hyp_ack_donation(completer_addr, tx);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int __do_donate(struct pkvm_mem_donation *donation)
+{
+ const struct pkvm_mem_transition *tx = &donation->tx;
+ u64 completer_addr;
+ int ret;
+
+ switch (tx->initiator.id) {
+ case PKVM_ID_HOST:
+ ret = host_initiate_donation(&completer_addr, tx);
+ break;
+ case PKVM_ID_HYP:
+ ret = hyp_initiate_donation(&completer_addr, tx);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ return ret;
+
+ switch (tx->completer.id) {
+ case PKVM_ID_HOST:
+ ret = host_complete_donation(completer_addr, tx);
+ break;
+ case PKVM_ID_HYP:
+ ret = hyp_complete_donation(completer_addr, tx);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+/*
+ * do_donate():
+ *
+ * The page owner transfers ownership to another component, losing access
+ * as a consequence.
+ *
+ * Initiator: OWNED => NOPAGE
+ * Completer: NOPAGE => OWNED
+ */
+static int do_donate(struct pkvm_mem_donation *donation)
+{
+ int ret;
+
+ ret = check_donation(donation);
+ if (ret)
+ return ret;
+
+ return WARN_ON(__do_donate(donation));
+}
+
int __pkvm_host_share_hyp(u64 pfn)
{
int ret;
@@ -797,3 +1119,112 @@ int __pkvm_host_unshare_hyp(u64 pfn)
return ret;
}
+
+int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages)
+{
+ int ret;
+ u64 host_addr = hyp_pfn_to_phys(pfn);
+ u64 hyp_addr = (u64)__hyp_va(host_addr);
+ struct pkvm_mem_donation donation = {
+ .tx = {
+ .nr_pages = nr_pages,
+ .initiator = {
+ .id = PKVM_ID_HOST,
+ .addr = host_addr,
+ .host = {
+ .completer_addr = hyp_addr,
+ },
+ },
+ .completer = {
+ .id = PKVM_ID_HYP,
+ },
+ },
+ };
+
+ host_lock_component();
+ hyp_lock_component();
+
+ ret = do_donate(&donation);
+
+ hyp_unlock_component();
+ host_unlock_component();
+
+ return ret;
+}
+
+int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages)
+{
+ int ret;
+ u64 host_addr = hyp_pfn_to_phys(pfn);
+ u64 hyp_addr = (u64)__hyp_va(host_addr);
+ struct pkvm_mem_donation donation = {
+ .tx = {
+ .nr_pages = nr_pages,
+ .initiator = {
+ .id = PKVM_ID_HYP,
+ .addr = hyp_addr,
+ .hyp = {
+ .completer_addr = host_addr,
+ },
+ },
+ .completer = {
+ .id = PKVM_ID_HOST,
+ },
+ },
+ };
+
+ host_lock_component();
+ hyp_lock_component();
+
+ ret = do_donate(&donation);
+
+ hyp_unlock_component();
+ host_unlock_component();
+
+ return ret;
+}
+
+int hyp_pin_shared_mem(void *from, void *to)
+{
+ u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
+ u64 end = PAGE_ALIGN((u64)to);
+ u64 size = end - start;
+ int ret;
+
+ host_lock_component();
+ hyp_lock_component();
+
+ ret = __host_check_page_state_range(__hyp_pa(start), size,
+ PKVM_PAGE_SHARED_OWNED);
+ if (ret)
+ goto unlock;
+
+ ret = __hyp_check_page_state_range(start, size,
+ PKVM_PAGE_SHARED_BORROWED);
+ if (ret)
+ goto unlock;
+
+ for (cur = start; cur < end; cur += PAGE_SIZE)
+ hyp_page_ref_inc(hyp_virt_to_page(cur));
+
+unlock:
+ hyp_unlock_component();
+ host_unlock_component();
+
+ return ret;
+}
+
+void hyp_unpin_shared_mem(void *from, void *to)
+{
+ u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
+ u64 end = PAGE_ALIGN((u64)to);
+
+ host_lock_component();
+ hyp_lock_component();
+
+ for (cur = start; cur < end; cur += PAGE_SIZE)
+ hyp_page_ref_dec(hyp_virt_to_page(cur));
+
+ hyp_unlock_component();
+ host_unlock_component();
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
index 96193cb31a39..318298eb3d6b 100644
--- a/arch/arm64/kvm/hyp/nvhe/mm.c
+++ b/arch/arm64/kvm/hyp/nvhe/mm.c
@@ -14,6 +14,7 @@
#include <nvhe/early_alloc.h>
#include <nvhe/gfp.h>
#include <nvhe/memory.h>
+#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
#include <nvhe/spinlock.h>
@@ -25,6 +26,12 @@ unsigned int hyp_memblock_nr;
static u64 __io_map_base;
+struct hyp_fixmap_slot {
+ u64 addr;
+ kvm_pte_t *ptep;
+};
+static DEFINE_PER_CPU(struct hyp_fixmap_slot, fixmap_slots);
+
static int __pkvm_create_mappings(unsigned long start, unsigned long size,
unsigned long phys, enum kvm_pgtable_prot prot)
{
@@ -129,13 +136,36 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
return ret;
}
-int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back)
+int hyp_back_vmemmap(phys_addr_t back)
{
- unsigned long start, end;
+ unsigned long i, start, size, end = 0;
+ int ret;
- hyp_vmemmap_range(phys, size, &start, &end);
+ for (i = 0; i < hyp_memblock_nr; i++) {
+ start = hyp_memory[i].base;
+ start = ALIGN_DOWN((u64)hyp_phys_to_page(start), PAGE_SIZE);
+ /*
+ * The begining of the hyp_vmemmap region for the current
+ * memblock may already be backed by the page backing the end
+ * the previous region, so avoid mapping it twice.
+ */
+ start = max(start, end);
+
+ end = hyp_memory[i].base + hyp_memory[i].size;
+ end = PAGE_ALIGN((u64)hyp_phys_to_page(end));
+ if (start >= end)
+ continue;
+
+ size = end - start;
+ ret = __pkvm_create_mappings(start, size, back, PAGE_HYP);
+ if (ret)
+ return ret;
+
+ memset(hyp_phys_to_virt(back), 0, size);
+ back += size;
+ }
- return __pkvm_create_mappings(start, end - start, back, PAGE_HYP);
+ return 0;
}
static void *__hyp_bp_vect_base;
@@ -189,6 +219,102 @@ int hyp_map_vectors(void)
return 0;
}
+void *hyp_fixmap_map(phys_addr_t phys)
+{
+ struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots);
+ kvm_pte_t pte, *ptep = slot->ptep;
+
+ pte = *ptep;
+ pte &= ~kvm_phys_to_pte(KVM_PHYS_INVALID);
+ pte |= kvm_phys_to_pte(phys) | KVM_PTE_VALID;
+ WRITE_ONCE(*ptep, pte);
+ dsb(ishst);
+
+ return (void *)slot->addr;
+}
+
+static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
+{
+ kvm_pte_t *ptep = slot->ptep;
+ u64 addr = slot->addr;
+
+ WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID);
+
+ /*
+ * Irritatingly, the architecture requires that we use inner-shareable
+ * broadcast TLB invalidation here in case another CPU speculates
+ * through our fixmap and decides to create an "amalagamation of the
+ * values held in the TLB" due to the apparent lack of a
+ * break-before-make sequence.
+ *
+ * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03
+ */
+ dsb(ishst);
+ __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), (KVM_PGTABLE_MAX_LEVELS - 1));
+ dsb(ish);
+ isb();
+}
+
+void hyp_fixmap_unmap(void)
+{
+ fixmap_clear_slot(this_cpu_ptr(&fixmap_slots));
+}
+
+static int __create_fixmap_slot_cb(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)ctx->arg);
+
+ if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_MAX_LEVELS - 1)
+ return -EINVAL;
+
+ slot->addr = ctx->addr;
+ slot->ptep = ctx->ptep;
+
+ /*
+ * Clear the PTE, but keep the page-table page refcount elevated to
+ * prevent it from ever being freed. This lets us manipulate the PTEs
+ * by hand safely without ever needing to allocate memory.
+ */
+ fixmap_clear_slot(slot);
+
+ return 0;
+}
+
+static int create_fixmap_slot(u64 addr, u64 cpu)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = __create_fixmap_slot_cb,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ .arg = (void *)cpu,
+ };
+
+ return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker);
+}
+
+int hyp_create_pcpu_fixmap(void)
+{
+ unsigned long addr, i;
+ int ret;
+
+ for (i = 0; i < hyp_nr_cpus; i++) {
+ ret = pkvm_alloc_private_va_range(PAGE_SIZE, &addr);
+ if (ret)
+ return ret;
+
+ ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PAGE_SIZE,
+ __hyp_pa(__hyp_bss_start), PAGE_HYP);
+ if (ret)
+ return ret;
+
+ ret = create_fixmap_slot(addr, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
int hyp_create_idmap(u32 hyp_va_bits)
{
unsigned long start, end;
@@ -213,3 +339,36 @@ int hyp_create_idmap(u32 hyp_va_bits)
return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC);
}
+
+static void *admit_host_page(void *arg)
+{
+ struct kvm_hyp_memcache *host_mc = arg;
+
+ if (!host_mc->nr_pages)
+ return NULL;
+
+ /*
+ * The host still owns the pages in its memcache, so we need to go
+ * through a full host-to-hyp donation cycle to change it. Fortunately,
+ * __pkvm_host_donate_hyp() takes care of races for us, so if it
+ * succeeds we're good to go.
+ */
+ if (__pkvm_host_donate_hyp(hyp_phys_to_pfn(host_mc->head), 1))
+ return NULL;
+
+ return pop_hyp_memcache(host_mc, hyp_phys_to_virt);
+}
+
+/* Refill our local memcache by poping pages from the one provided by the host. */
+int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages,
+ struct kvm_hyp_memcache *host_mc)
+{
+ struct kvm_hyp_memcache tmp = *host_mc;
+ int ret;
+
+ ret = __topup_hyp_memcache(mc, min_pages, admit_host_page,
+ hyp_virt_to_phys, &tmp);
+ *host_mc = tmp;
+
+ return ret;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index d40f0b30b534..803ba3222e75 100644
--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -93,11 +93,16 @@ static inline struct hyp_page *node_to_page(struct list_head *node)
static void __hyp_attach_page(struct hyp_pool *pool,
struct hyp_page *p)
{
+ phys_addr_t phys = hyp_page_to_phys(p);
unsigned short order = p->order;
struct hyp_page *buddy;
memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order);
+ /* Skip coalescing for 'external' pages being freed into the pool. */
+ if (phys < pool->range_start || phys >= pool->range_end)
+ goto insert;
+
/*
* Only the first struct hyp_page of a high-order page (otherwise known
* as the 'head') should have p->order set. The non-head pages should
@@ -116,6 +121,7 @@ static void __hyp_attach_page(struct hyp_pool *pool,
p = min(p, buddy);
}
+insert:
/* Mark the new head, and insert it */
p->order = order;
page_add_to_list(p, &pool->free_area[order]);
@@ -144,25 +150,6 @@ static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
return p;
}
-static inline void hyp_page_ref_inc(struct hyp_page *p)
-{
- BUG_ON(p->refcount == USHRT_MAX);
- p->refcount++;
-}
-
-static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
-{
- BUG_ON(!p->refcount);
- p->refcount--;
- return (p->refcount == 0);
-}
-
-static inline void hyp_set_page_refcounted(struct hyp_page *p)
-{
- BUG_ON(p->refcount);
- p->refcount = 1;
-}
-
static void __hyp_put_page(struct hyp_pool *pool, struct hyp_page *p)
{
if (hyp_page_ref_dec_and_test(p))
@@ -249,10 +236,8 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
/* Init the vmemmap portion */
p = hyp_phys_to_page(phys);
- for (i = 0; i < nr_pages; i++) {
- p[i].order = 0;
+ for (i = 0; i < nr_pages; i++)
hyp_set_page_refcounted(&p[i]);
- }
/* Attach the unused pages to the buddy tree */
for (i = reserved_pages; i < nr_pages; i++)
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
index 85d3b7ae720f..a06ece14a6d8 100644
--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -7,8 +7,17 @@
#include <linux/kvm_host.h>
#include <linux/mm.h>
#include <nvhe/fixed_config.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/memory.h>
+#include <nvhe/pkvm.h>
#include <nvhe/trap_handler.h>
+/* Used by icache_is_vpipt(). */
+unsigned long __icache_flags;
+
+/* Used by kvm_get_vttbr(). */
+unsigned int kvm_arm_vmid_bits;
+
/*
* Set trap register values based on features in ID_AA64PFR0.
*/
@@ -183,3 +192,430 @@ void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu)
pvm_init_traps_aa64mmfr0(vcpu);
pvm_init_traps_aa64mmfr1(vcpu);
}
+
+/*
+ * Start the VM table handle at the offset defined instead of at 0.
+ * Mainly for sanity checking and debugging.
+ */
+#define HANDLE_OFFSET 0x1000
+
+static unsigned int vm_handle_to_idx(pkvm_handle_t handle)
+{
+ return handle - HANDLE_OFFSET;
+}
+
+static pkvm_handle_t idx_to_vm_handle(unsigned int idx)
+{
+ return idx + HANDLE_OFFSET;
+}
+
+/*
+ * Spinlock for protecting state related to the VM table. Protects writes
+ * to 'vm_table' and 'nr_table_entries' as well as reads and writes to
+ * 'last_hyp_vcpu_lookup'.
+ */
+static DEFINE_HYP_SPINLOCK(vm_table_lock);
+
+/*
+ * The table of VM entries for protected VMs in hyp.
+ * Allocated at hyp initialization and setup.
+ */
+static struct pkvm_hyp_vm **vm_table;
+
+void pkvm_hyp_vm_table_init(void *tbl)
+{
+ WARN_ON(vm_table);
+ vm_table = tbl;
+}
+
+/*
+ * Return the hyp vm structure corresponding to the handle.
+ */
+static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle)
+{
+ unsigned int idx = vm_handle_to_idx(handle);
+
+ if (unlikely(idx >= KVM_MAX_PVMS))
+ return NULL;
+
+ return vm_table[idx];
+}
+
+struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
+ unsigned int vcpu_idx)
+{
+ struct pkvm_hyp_vcpu *hyp_vcpu = NULL;
+ struct pkvm_hyp_vm *hyp_vm;
+
+ hyp_spin_lock(&vm_table_lock);
+ hyp_vm = get_vm_by_handle(handle);
+ if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx)
+ goto unlock;
+
+ hyp_vcpu = hyp_vm->vcpus[vcpu_idx];
+ hyp_page_ref_inc(hyp_virt_to_page(hyp_vm));
+unlock:
+ hyp_spin_unlock(&vm_table_lock);
+ return hyp_vcpu;
+}
+
+void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu);
+
+ hyp_spin_lock(&vm_table_lock);
+ hyp_page_ref_dec(hyp_virt_to_page(hyp_vm));
+ hyp_spin_unlock(&vm_table_lock);
+}
+
+static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu)
+{
+ if (host_vcpu)
+ hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1);
+}
+
+static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[],
+ unsigned int nr_vcpus)
+{
+ int i;
+
+ for (i = 0; i < nr_vcpus; i++)
+ unpin_host_vcpu(hyp_vcpus[i]->host_vcpu);
+}
+
+static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm,
+ unsigned int nr_vcpus)
+{
+ hyp_vm->host_kvm = host_kvm;
+ hyp_vm->kvm.created_vcpus = nr_vcpus;
+ hyp_vm->kvm.arch.vtcr = host_mmu.arch.vtcr;
+}
+
+static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu,
+ struct pkvm_hyp_vm *hyp_vm,
+ struct kvm_vcpu *host_vcpu,
+ unsigned int vcpu_idx)
+{
+ int ret = 0;
+
+ if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1))
+ return -EBUSY;
+
+ if (host_vcpu->vcpu_idx != vcpu_idx) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ hyp_vcpu->host_vcpu = host_vcpu;
+
+ hyp_vcpu->vcpu.kvm = &hyp_vm->kvm;
+ hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id);
+ hyp_vcpu->vcpu.vcpu_idx = vcpu_idx;
+
+ hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu;
+ hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags);
+done:
+ if (ret)
+ unpin_host_vcpu(host_vcpu);
+ return ret;
+}
+
+static int find_free_vm_table_entry(struct kvm *host_kvm)
+{
+ int i;
+
+ for (i = 0; i < KVM_MAX_PVMS; ++i) {
+ if (!vm_table[i])
+ return i;
+ }
+
+ return -ENOMEM;
+}
+
+/*
+ * Allocate a VM table entry and insert a pointer to the new vm.
+ *
+ * Return a unique handle to the protected VM on success,
+ * negative error code on failure.
+ */
+static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm,
+ struct pkvm_hyp_vm *hyp_vm)
+{
+ struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu;
+ int idx;
+
+ hyp_assert_lock_held(&vm_table_lock);
+
+ /*
+ * Initializing protected state might have failed, yet a malicious
+ * host could trigger this function. Thus, ensure that 'vm_table'
+ * exists.
+ */
+ if (unlikely(!vm_table))
+ return -EINVAL;
+
+ idx = find_free_vm_table_entry(host_kvm);
+ if (idx < 0)
+ return idx;
+
+ hyp_vm->kvm.arch.pkvm.handle = idx_to_vm_handle(idx);
+
+ /* VMID 0 is reserved for the host */
+ atomic64_set(&mmu->vmid.id, idx + 1);
+
+ mmu->arch = &hyp_vm->kvm.arch;
+ mmu->pgt = &hyp_vm->pgt;
+
+ vm_table[idx] = hyp_vm;
+ return hyp_vm->kvm.arch.pkvm.handle;
+}
+
+/*
+ * Deallocate and remove the VM table entry corresponding to the handle.
+ */
+static void remove_vm_table_entry(pkvm_handle_t handle)
+{
+ hyp_assert_lock_held(&vm_table_lock);
+ vm_table[vm_handle_to_idx(handle)] = NULL;
+}
+
+static size_t pkvm_get_hyp_vm_size(unsigned int nr_vcpus)
+{
+ return size_add(sizeof(struct pkvm_hyp_vm),
+ size_mul(sizeof(struct pkvm_hyp_vcpu *), nr_vcpus));
+}
+
+static void *map_donated_memory_noclear(unsigned long host_va, size_t size)
+{
+ void *va = (void *)kern_hyp_va(host_va);
+
+ if (!PAGE_ALIGNED(va))
+ return NULL;
+
+ if (__pkvm_host_donate_hyp(hyp_virt_to_pfn(va),
+ PAGE_ALIGN(size) >> PAGE_SHIFT))
+ return NULL;
+
+ return va;
+}
+
+static void *map_donated_memory(unsigned long host_va, size_t size)
+{
+ void *va = map_donated_memory_noclear(host_va, size);
+
+ if (va)
+ memset(va, 0, size);
+
+ return va;
+}
+
+static void __unmap_donated_memory(void *va, size_t size)
+{
+ WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va),
+ PAGE_ALIGN(size) >> PAGE_SHIFT));
+}
+
+static void unmap_donated_memory(void *va, size_t size)
+{
+ if (!va)
+ return;
+
+ memset(va, 0, size);
+ __unmap_donated_memory(va, size);
+}
+
+static void unmap_donated_memory_noclear(void *va, size_t size)
+{
+ if (!va)
+ return;
+
+ __unmap_donated_memory(va, size);
+}
+
+/*
+ * Initialize the hypervisor copy of the protected VM state using the
+ * memory donated by the host.
+ *
+ * Unmaps the donated memory from the host at stage 2.
+ *
+ * host_kvm: A pointer to the host's struct kvm.
+ * vm_hva: The host va of the area being donated for the VM state.
+ * Must be page aligned.
+ * pgd_hva: The host va of the area being donated for the stage-2 PGD for
+ * the VM. Must be page aligned. Its size is implied by the VM's
+ * VTCR.
+ *
+ * Return a unique handle to the protected VM on success,
+ * negative error code on failure.
+ */
+int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
+ unsigned long pgd_hva)
+{
+ struct pkvm_hyp_vm *hyp_vm = NULL;
+ size_t vm_size, pgd_size;
+ unsigned int nr_vcpus;
+ void *pgd = NULL;
+ int ret;
+
+ ret = hyp_pin_shared_mem(host_kvm, host_kvm + 1);
+ if (ret)
+ return ret;
+
+ nr_vcpus = READ_ONCE(host_kvm->created_vcpus);
+ if (nr_vcpus < 1) {
+ ret = -EINVAL;
+ goto err_unpin_kvm;
+ }
+
+ vm_size = pkvm_get_hyp_vm_size(nr_vcpus);
+ pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.vtcr);
+
+ ret = -ENOMEM;
+
+ hyp_vm = map_donated_memory(vm_hva, vm_size);
+ if (!hyp_vm)
+ goto err_remove_mappings;
+
+ pgd = map_donated_memory_noclear(pgd_hva, pgd_size);
+ if (!pgd)
+ goto err_remove_mappings;
+
+ init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus);
+
+ hyp_spin_lock(&vm_table_lock);
+ ret = insert_vm_table_entry(host_kvm, hyp_vm);
+ if (ret < 0)
+ goto err_unlock;
+
+ ret = kvm_guest_prepare_stage2(hyp_vm, pgd);
+ if (ret)
+ goto err_remove_vm_table_entry;
+ hyp_spin_unlock(&vm_table_lock);
+
+ return hyp_vm->kvm.arch.pkvm.handle;
+
+err_remove_vm_table_entry:
+ remove_vm_table_entry(hyp_vm->kvm.arch.pkvm.handle);
+err_unlock:
+ hyp_spin_unlock(&vm_table_lock);
+err_remove_mappings:
+ unmap_donated_memory(hyp_vm, vm_size);
+ unmap_donated_memory(pgd, pgd_size);
+err_unpin_kvm:
+ hyp_unpin_shared_mem(host_kvm, host_kvm + 1);
+ return ret;
+}
+
+/*
+ * Initialize the hypervisor copy of the protected vCPU state using the
+ * memory donated by the host.
+ *
+ * handle: The handle for the protected vm.
+ * host_vcpu: A pointer to the corresponding host vcpu.
+ * vcpu_hva: The host va of the area being donated for the vcpu state.
+ * Must be page aligned. The size of the area must be equal to
+ * the page-aligned size of 'struct pkvm_hyp_vcpu'.
+ * Return 0 on success, negative error code on failure.
+ */
+int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
+ unsigned long vcpu_hva)
+{
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ struct pkvm_hyp_vm *hyp_vm;
+ unsigned int idx;
+ int ret;
+
+ hyp_vcpu = map_donated_memory(vcpu_hva, sizeof(*hyp_vcpu));
+ if (!hyp_vcpu)
+ return -ENOMEM;
+
+ hyp_spin_lock(&vm_table_lock);
+
+ hyp_vm = get_vm_by_handle(handle);
+ if (!hyp_vm) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ idx = hyp_vm->nr_vcpus;
+ if (idx >= hyp_vm->kvm.created_vcpus) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx);
+ if (ret)
+ goto unlock;
+
+ hyp_vm->vcpus[idx] = hyp_vcpu;
+ hyp_vm->nr_vcpus++;
+unlock:
+ hyp_spin_unlock(&vm_table_lock);
+
+ if (ret)
+ unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu));
+
+ return ret;
+}
+
+static void
+teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size)
+{
+ size = PAGE_ALIGN(size);
+ memset(addr, 0, size);
+
+ for (void *start = addr; start < addr + size; start += PAGE_SIZE)
+ push_hyp_memcache(mc, start, hyp_virt_to_phys);
+
+ unmap_donated_memory_noclear(addr, size);
+}
+
+int __pkvm_teardown_vm(pkvm_handle_t handle)
+{
+ struct kvm_hyp_memcache *mc;
+ struct pkvm_hyp_vm *hyp_vm;
+ struct kvm *host_kvm;
+ unsigned int idx;
+ size_t vm_size;
+ int err;
+
+ hyp_spin_lock(&vm_table_lock);
+ hyp_vm = get_vm_by_handle(handle);
+ if (!hyp_vm) {
+ err = -ENOENT;
+ goto err_unlock;
+ }
+
+ if (WARN_ON(hyp_page_count(hyp_vm))) {
+ err = -EBUSY;
+ goto err_unlock;
+ }
+
+ host_kvm = hyp_vm->host_kvm;
+
+ /* Ensure the VMID is clean before it can be reallocated */
+ __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu);
+ remove_vm_table_entry(handle);
+ hyp_spin_unlock(&vm_table_lock);
+
+ /* Reclaim guest pages (including page-table pages) */
+ mc = &host_kvm->arch.pkvm.teardown_mc;
+ reclaim_guest_pages(hyp_vm, mc);
+ unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus);
+
+ /* Push the metadata pages to the teardown memcache */
+ for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) {
+ struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx];
+
+ teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu));
+ }
+
+ vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus);
+ teardown_donated_memory(mc, hyp_vm, vm_size);
+ hyp_unpin_shared_mem(host_kvm, host_kvm + 1);
+ return 0;
+
+err_unlock:
+ hyp_spin_unlock(&vm_table_lock);
+ return err;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index e8d4ea2fcfa0..110f04627785 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -16,6 +16,7 @@
#include <nvhe/memory.h>
#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
+#include <nvhe/pkvm.h>
#include <nvhe/trap_handler.h>
unsigned long hyp_nr_cpus;
@@ -24,6 +25,7 @@ unsigned long hyp_nr_cpus;
(unsigned long)__per_cpu_start)
static void *vmemmap_base;
+static void *vm_table_base;
static void *hyp_pgt_base;
static void *host_s2_pgt_base;
static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
@@ -31,16 +33,20 @@ static struct hyp_pool hpool;
static int divide_memory_pool(void *virt, unsigned long size)
{
- unsigned long vstart, vend, nr_pages;
+ unsigned long nr_pages;
hyp_early_alloc_init(virt, size);
- hyp_vmemmap_range(__hyp_pa(virt), size, &vstart, &vend);
- nr_pages = (vend - vstart) >> PAGE_SHIFT;
+ nr_pages = hyp_vmemmap_pages(sizeof(struct hyp_page));
vmemmap_base = hyp_early_alloc_contig(nr_pages);
if (!vmemmap_base)
return -ENOMEM;
+ nr_pages = hyp_vm_table_pages();
+ vm_table_base = hyp_early_alloc_contig(nr_pages);
+ if (!vm_table_base)
+ return -ENOMEM;
+
nr_pages = hyp_s1_pgtable_pages();
hyp_pgt_base = hyp_early_alloc_contig(nr_pages);
if (!hyp_pgt_base)
@@ -78,7 +84,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
if (ret)
return ret;
- ret = hyp_back_vmemmap(phys, size, hyp_virt_to_phys(vmemmap_base));
+ ret = hyp_back_vmemmap(hyp_virt_to_phys(vmemmap_base));
if (ret)
return ret;
@@ -138,20 +144,17 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
}
/*
- * Map the host's .bss and .rodata sections RO in the hypervisor, but
- * transfer the ownership from the host to the hypervisor itself to
- * make sure it can't be donated or shared with another entity.
+ * Map the host sections RO in the hypervisor, but transfer the
+ * ownership from the host to the hypervisor itself to make sure they
+ * can't be donated or shared with another entity.
*
* The ownership transition requires matching changes in the host
* stage-2. This will be done later (see finalize_host_mappings()) once
* the hyp_vmemmap is addressable.
*/
prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED);
- ret = pkvm_create_mappings(__start_rodata, __end_rodata, prot);
- if (ret)
- return ret;
-
- ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, prot);
+ ret = pkvm_create_mappings(&kvm_vgic_global_state,
+ &kvm_vgic_global_state + 1, prot);
if (ret)
return ret;
@@ -186,33 +189,20 @@ static void hpool_put_page(void *addr)
hyp_put_page(&hpool, addr);
}
-static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level,
- kvm_pte_t *ptep,
- enum kvm_pgtable_walk_flags flag,
- void * const arg)
+static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- struct kvm_pgtable_mm_ops *mm_ops = arg;
enum kvm_pgtable_prot prot;
enum pkvm_page_state state;
- kvm_pte_t pte = *ptep;
phys_addr_t phys;
- if (!kvm_pte_valid(pte))
+ if (!kvm_pte_valid(ctx->old))
return 0;
- /*
- * Fix-up the refcount for the page-table pages as the early allocator
- * was unable to access the hyp_vmemmap and so the buddy allocator has
- * initialised the refcount to '1'.
- */
- mm_ops->get_page(ptep);
- if (flag != KVM_PGTABLE_WALK_LEAF)
- return 0;
-
- if (level != (KVM_PGTABLE_MAX_LEVELS - 1))
+ if (ctx->level != (KVM_PGTABLE_MAX_LEVELS - 1))
return -EINVAL;
- phys = kvm_pte_to_phys(pte);
+ phys = kvm_pte_to_phys(ctx->old);
if (!addr_is_memory(phys))
return -EINVAL;
@@ -220,10 +210,10 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level,
* Adjust the host stage-2 mappings to match the ownership attributes
* configured in the hypervisor stage-1.
*/
- state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte));
+ state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(ctx->old));
switch (state) {
case PKVM_PAGE_OWNED:
- return host_stage2_set_owner_locked(phys, PAGE_SIZE, pkvm_hyp_id);
+ return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP);
case PKVM_PAGE_SHARED_OWNED:
prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED);
break;
@@ -237,12 +227,25 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level,
return host_stage2_idmap_locked(phys, PAGE_SIZE, prot);
}
-static int finalize_host_mappings(void)
+static int fix_hyp_pgtable_refcnt_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ /*
+ * Fix-up the refcount for the page-table pages as the early allocator
+ * was unable to access the hyp_vmemmap and so the buddy allocator has
+ * initialised the refcount to '1'.
+ */
+ if (kvm_pte_valid(ctx->old))
+ ctx->mm_ops->get_page(ctx->ptep);
+
+ return 0;
+}
+
+static int fix_host_ownership(void)
{
struct kvm_pgtable_walker walker = {
- .cb = finalize_host_mappings_walker,
- .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
- .arg = pkvm_pgtable.mm_ops,
+ .cb = fix_host_ownership_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
};
int i, ret;
@@ -258,6 +261,18 @@ static int finalize_host_mappings(void)
return 0;
}
+static int fix_hyp_pgtable_refcnt(void)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = fix_hyp_pgtable_refcnt_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
+ .arg = pkvm_pgtable.mm_ops,
+ };
+
+ return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits),
+ &walker);
+}
+
void __noreturn __pkvm_init_finalise(void)
{
struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
@@ -287,10 +302,19 @@ void __noreturn __pkvm_init_finalise(void)
};
pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops;
- ret = finalize_host_mappings();
+ ret = fix_host_ownership();
+ if (ret)
+ goto out;
+
+ ret = fix_hyp_pgtable_refcnt();
+ if (ret)
+ goto out;
+
+ ret = hyp_create_pcpu_fixmap();
if (ret)
goto out;
+ pkvm_hyp_vm_table_init(vm_table_base);
out:
/*
* We tail-called to here from handle___pkvm_init() and will not return,
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 8e9d49a964be..c2cb46ca4fb6 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -55,18 +55,6 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
write_sysreg(val, cptr_el2);
write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
- if (cpus_have_final_cap(ARM64_SME)) {
- val = read_sysreg_s(SYS_HFGRTR_EL2);
- val &= ~(HFGxTR_EL2_nTPIDR2_EL0_MASK |
- HFGxTR_EL2_nSMPRI_EL1_MASK);
- write_sysreg_s(val, SYS_HFGRTR_EL2);
-
- val = read_sysreg_s(SYS_HFGWTR_EL2);
- val &= ~(HFGxTR_EL2_nTPIDR2_EL0_MASK |
- HFGxTR_EL2_nSMPRI_EL1_MASK);
- write_sysreg_s(val, SYS_HFGWTR_EL2);
- }
-
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
@@ -110,20 +98,6 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
- if (cpus_have_final_cap(ARM64_SME)) {
- u64 val;
-
- val = read_sysreg_s(SYS_HFGRTR_EL2);
- val |= HFGxTR_EL2_nTPIDR2_EL0_MASK |
- HFGxTR_EL2_nSMPRI_EL1_MASK;
- write_sysreg_s(val, SYS_HFGRTR_EL2);
-
- val = read_sysreg_s(SYS_HFGWTR_EL2);
- val |= HFGxTR_EL2_nTPIDR2_EL0_MASK |
- HFGxTR_EL2_nSMPRI_EL1_MASK;
- write_sysreg_s(val, SYS_HFGWTR_EL2);
- }
-
cptr = CPTR_EL2_DEFAULT;
if (vcpu_has_sve(vcpu) && (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED))
cptr |= CPTR_EL2_TZ;
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index cdf8e76b0be1..b11cf2c618a6 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -49,35 +49,38 @@
#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2)
#define KVM_MAX_OWNER_ID 1
+/*
+ * Used to indicate a pte for which a 'break-before-make' sequence is in
+ * progress.
+ */
+#define KVM_INVALID_PTE_LOCKED BIT(10)
+
struct kvm_pgtable_walk_data {
- struct kvm_pgtable *pgt;
struct kvm_pgtable_walker *walker;
u64 addr;
u64 end;
};
-#define KVM_PHYS_INVALID (-1ULL)
-
static bool kvm_phys_is_valid(u64 phys)
{
return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX));
}
-static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys)
{
- u64 granule = kvm_granule_size(level);
+ u64 granule = kvm_granule_size(ctx->level);
- if (!kvm_level_supports_block_mapping(level))
+ if (!kvm_level_supports_block_mapping(ctx->level))
return false;
- if (granule > (end - addr))
+ if (granule > (ctx->end - ctx->addr))
return false;
if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule))
return false;
- return IS_ALIGNED(addr, granule);
+ return IS_ALIGNED(ctx->addr, granule);
}
static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
@@ -88,7 +91,7 @@ static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
return (data->addr >> shift) & mask;
}
-static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
+static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
{
u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
u64 mask = BIT(pgt->ia_bits) - 1;
@@ -96,11 +99,6 @@ static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
return (addr & mask) >> shift;
}
-static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data)
-{
- return __kvm_pgd_page_idx(data->pgt, data->addr);
-}
-
static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
{
struct kvm_pgtable pgt = {
@@ -108,7 +106,7 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
.start_level = start_level,
};
- return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
+ return kvm_pgd_page_idx(&pgt, -1ULL) + 1;
}
static bool kvm_pte_table(kvm_pte_t pte, u32 level)
@@ -122,16 +120,6 @@ static bool kvm_pte_table(kvm_pte_t pte, u32 level)
return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
}
-static kvm_pte_t kvm_phys_to_pte(u64 pa)
-{
- kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
-
- if (PAGE_SHIFT == 16)
- pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
-
- return pte;
-}
-
static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
{
return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
@@ -142,16 +130,13 @@ static void kvm_clear_pte(kvm_pte_t *ptep)
WRITE_ONCE(*ptep, 0);
}
-static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp,
- struct kvm_pgtable_mm_ops *mm_ops)
+static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops)
{
- kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));
+ kvm_pte_t pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));
pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
pte |= KVM_PTE_VALID;
-
- WARN_ON(kvm_pte_valid(old));
- smp_store_release(ptep, pte);
+ return pte;
}
static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
@@ -172,36 +157,47 @@ static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
}
-static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
- u32 level, kvm_pte_t *ptep,
- enum kvm_pgtable_walk_flags flag)
+static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
+ const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
struct kvm_pgtable_walker *walker = data->walker;
- return walker->cb(addr, data->end, level, ptep, flag, walker->arg);
+
+ /* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */
+ WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held());
+ return walker->cb(ctx, visit);
}
static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
- kvm_pte_t *pgtable, u32 level);
+ struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level);
static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
- kvm_pte_t *ptep, u32 level)
+ struct kvm_pgtable_mm_ops *mm_ops,
+ kvm_pteref_t pteref, u32 level)
{
- int ret = 0;
- u64 addr = data->addr;
- kvm_pte_t *childp, pte = *ptep;
- bool table = kvm_pte_table(pte, level);
enum kvm_pgtable_walk_flags flags = data->walker->flags;
+ kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref);
+ struct kvm_pgtable_visit_ctx ctx = {
+ .ptep = ptep,
+ .old = READ_ONCE(*ptep),
+ .arg = data->walker->arg,
+ .mm_ops = mm_ops,
+ .addr = data->addr,
+ .end = data->end,
+ .level = level,
+ .flags = flags,
+ };
+ int ret = 0;
+ kvm_pteref_t childp;
+ bool table = kvm_pte_table(ctx.old, level);
- if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
- ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
- KVM_PGTABLE_WALK_TABLE_PRE);
- }
+ if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE))
+ ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);
- if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) {
- ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
- KVM_PGTABLE_WALK_LEAF);
- pte = *ptep;
- table = kvm_pte_table(pte, level);
+ if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
+ ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
+ ctx.old = READ_ONCE(*ptep);
+ table = kvm_pte_table(ctx.old, level);
}
if (ret)
@@ -213,22 +209,20 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
goto out;
}
- childp = kvm_pte_follow(pte, data->pgt->mm_ops);
- ret = __kvm_pgtable_walk(data, childp, level + 1);
+ childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops);
+ ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1);
if (ret)
goto out;
- if (flags & KVM_PGTABLE_WALK_TABLE_POST) {
- ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
- KVM_PGTABLE_WALK_TABLE_POST);
- }
+ if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST)
+ ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST);
out:
return ret;
}
static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
- kvm_pte_t *pgtable, u32 level)
+ struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level)
{
u32 idx;
int ret = 0;
@@ -237,12 +231,12 @@ static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
return -EINVAL;
for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
- kvm_pte_t *ptep = &pgtable[idx];
+ kvm_pteref_t pteref = &pgtable[idx];
if (data->addr >= data->end)
break;
- ret = __kvm_pgtable_visit(data, ptep, level);
+ ret = __kvm_pgtable_visit(data, mm_ops, pteref, level);
if (ret)
break;
}
@@ -250,11 +244,10 @@ static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
return ret;
}
-static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data)
+static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data)
{
u32 idx;
int ret = 0;
- struct kvm_pgtable *pgt = data->pgt;
u64 limit = BIT(pgt->ia_bits);
if (data->addr > limit || data->end > limit)
@@ -263,10 +256,10 @@ static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data)
if (!pgt->pgd)
return -EINVAL;
- for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) {
- kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE];
+ for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) {
+ kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE];
- ret = __kvm_pgtable_walk(data, ptep, pgt->start_level);
+ ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level);
if (ret)
break;
}
@@ -278,13 +271,20 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
struct kvm_pgtable_walker *walker)
{
struct kvm_pgtable_walk_data walk_data = {
- .pgt = pgt,
.addr = ALIGN_DOWN(addr, PAGE_SIZE),
.end = PAGE_ALIGN(walk_data.addr + size),
.walker = walker,
};
+ int r;
- return _kvm_pgtable_walk(&walk_data);
+ r = kvm_pgtable_walk_begin(walker);
+ if (r)
+ return r;
+
+ r = _kvm_pgtable_walk(pgt, &walk_data);
+ kvm_pgtable_walk_end(walker);
+
+ return r;
}
struct leaf_walk_data {
@@ -292,13 +292,13 @@ struct leaf_walk_data {
u32 level;
};
-static int leaf_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
- enum kvm_pgtable_walk_flags flag, void * const arg)
+static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- struct leaf_walk_data *data = arg;
+ struct leaf_walk_data *data = ctx->arg;
- data->pte = *ptep;
- data->level = level;
+ data->pte = ctx->old;
+ data->level = ctx->level;
return 0;
}
@@ -329,7 +329,6 @@ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
struct hyp_map_data {
u64 phys;
kvm_pte_t attr;
- struct kvm_pgtable_mm_ops *mm_ops;
};
static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
@@ -383,47 +382,49 @@ enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
return prot;
}
-static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
- kvm_pte_t *ptep, struct hyp_map_data *data)
+static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
+ struct hyp_map_data *data)
{
- kvm_pte_t new, old = *ptep;
- u64 granule = kvm_granule_size(level), phys = data->phys;
+ kvm_pte_t new;
+ u64 granule = kvm_granule_size(ctx->level), phys = data->phys;
- if (!kvm_block_mapping_supported(addr, end, phys, level))
+ if (!kvm_block_mapping_supported(ctx, phys))
return false;
data->phys += granule;
- new = kvm_init_valid_leaf_pte(phys, data->attr, level);
- if (old == new)
+ new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
+ if (ctx->old == new)
return true;
- if (!kvm_pte_valid(old))
- data->mm_ops->get_page(ptep);
- else if (WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
+ if (!kvm_pte_valid(ctx->old))
+ ctx->mm_ops->get_page(ctx->ptep);
+ else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
return false;
- smp_store_release(ptep, new);
+ smp_store_release(ctx->ptep, new);
return true;
}
-static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
- enum kvm_pgtable_walk_flags flag, void * const arg)
+static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- kvm_pte_t *childp;
- struct hyp_map_data *data = arg;
- struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
+ kvm_pte_t *childp, new;
+ struct hyp_map_data *data = ctx->arg;
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
- if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg))
+ if (hyp_map_walker_try_leaf(ctx, data))
return 0;
- if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
+ if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1))
return -EINVAL;
childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
if (!childp)
return -ENOMEM;
- kvm_set_table_pte(ptep, childp, mm_ops);
- mm_ops->get_page(ptep);
+ new = kvm_init_table_pte(childp, mm_ops);
+ mm_ops->get_page(ctx->ptep);
+ smp_store_release(ctx->ptep, new);
+
return 0;
}
@@ -433,7 +434,6 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
int ret;
struct hyp_map_data map_data = {
.phys = ALIGN_DOWN(phys, PAGE_SIZE),
- .mm_ops = pgt->mm_ops,
};
struct kvm_pgtable_walker walker = {
.cb = hyp_map_walker,
@@ -451,44 +451,39 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
return ret;
}
-struct hyp_unmap_data {
- u64 unmapped;
- struct kvm_pgtable_mm_ops *mm_ops;
-};
-
-static int hyp_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
- enum kvm_pgtable_walk_flags flag, void * const arg)
+static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- kvm_pte_t pte = *ptep, *childp = NULL;
- u64 granule = kvm_granule_size(level);
- struct hyp_unmap_data *data = arg;
- struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
+ kvm_pte_t *childp = NULL;
+ u64 granule = kvm_granule_size(ctx->level);
+ u64 *unmapped = ctx->arg;
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
- if (!kvm_pte_valid(pte))
+ if (!kvm_pte_valid(ctx->old))
return -EINVAL;
- if (kvm_pte_table(pte, level)) {
- childp = kvm_pte_follow(pte, mm_ops);
+ if (kvm_pte_table(ctx->old, ctx->level)) {
+ childp = kvm_pte_follow(ctx->old, mm_ops);
if (mm_ops->page_count(childp) != 1)
return 0;
- kvm_clear_pte(ptep);
+ kvm_clear_pte(ctx->ptep);
dsb(ishst);
- __tlbi_level(vae2is, __TLBI_VADDR(addr, 0), level);
+ __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
} else {
- if (end - addr < granule)
+ if (ctx->end - ctx->addr < granule)
return -EINVAL;
- kvm_clear_pte(ptep);
+ kvm_clear_pte(ctx->ptep);
dsb(ishst);
- __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level);
- data->unmapped += granule;
+ __tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
+ *unmapped += granule;
}
dsb(ish);
isb();
- mm_ops->put_page(ptep);
+ mm_ops->put_page(ctx->ptep);
if (childp)
mm_ops->put_page(childp);
@@ -498,12 +493,10 @@ static int hyp_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
- struct hyp_unmap_data unmap_data = {
- .mm_ops = pgt->mm_ops,
- };
+ u64 unmapped = 0;
struct kvm_pgtable_walker walker = {
.cb = hyp_unmap_walker,
- .arg = &unmap_data,
+ .arg = &unmapped,
.flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
};
@@ -511,7 +504,7 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
return 0;
kvm_pgtable_walk(pgt, addr, size, &walker);
- return unmap_data.unmapped;
+ return unmapped;
}
int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
@@ -519,7 +512,7 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
{
u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
- pgt->pgd = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
+ pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL);
if (!pgt->pgd)
return -ENOMEM;
@@ -532,19 +525,18 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
return 0;
}
-static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
- enum kvm_pgtable_walk_flags flag, void * const arg)
+static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- struct kvm_pgtable_mm_ops *mm_ops = arg;
- kvm_pte_t pte = *ptep;
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
- if (!kvm_pte_valid(pte))
+ if (!kvm_pte_valid(ctx->old))
return 0;
- mm_ops->put_page(ptep);
+ mm_ops->put_page(ctx->ptep);
- if (kvm_pte_table(pte, level))
- mm_ops->put_page(kvm_pte_follow(pte, mm_ops));
+ if (kvm_pte_table(ctx->old, ctx->level))
+ mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
return 0;
}
@@ -554,11 +546,10 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
struct kvm_pgtable_walker walker = {
.cb = hyp_free_walker,
.flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
- .arg = pgt->mm_ops,
};
WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
- pgt->mm_ops->put_page(pgt->pgd);
+ pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd));
pgt->pgd = NULL;
}
@@ -573,8 +564,6 @@ struct stage2_map_data {
struct kvm_s2_mmu *mmu;
void *memcache;
- struct kvm_pgtable_mm_ops *mm_ops;
-
/* Force mappings to page granularity */
bool force_pte;
};
@@ -682,19 +671,92 @@ static bool stage2_pte_is_counted(kvm_pte_t pte)
return !!pte;
}
-static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr,
- u32 level, struct kvm_pgtable_mm_ops *mm_ops)
+static bool stage2_pte_is_locked(kvm_pte_t pte)
+{
+ return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED);
+}
+
+static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
+{
+ if (!kvm_pgtable_walk_shared(ctx)) {
+ WRITE_ONCE(*ctx->ptep, new);
+ return true;
+ }
+
+ return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;
+}
+
+/**
+ * stage2_try_break_pte() - Invalidates a pte according to the
+ * 'break-before-make' requirements of the
+ * architecture.
+ *
+ * @ctx: context of the visited pte.
+ * @mmu: stage-2 mmu
+ *
+ * Returns: true if the pte was successfully broken.
+ *
+ * If the removed pte was valid, performs the necessary serialization and TLB
+ * invalidation for the old value. For counted ptes, drops the reference count
+ * on the containing table page.
+ */
+static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
+ struct kvm_s2_mmu *mmu)
+{
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+
+ if (stage2_pte_is_locked(ctx->old)) {
+ /*
+ * Should never occur if this walker has exclusive access to the
+ * page tables.
+ */
+ WARN_ON(!kvm_pgtable_walk_shared(ctx));
+ return false;
+ }
+
+ if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
+ return false;
+
+ /*
+ * Perform the appropriate TLB invalidation based on the evicted pte
+ * value (if any).
+ */
+ if (kvm_pte_table(ctx->old, ctx->level))
+ kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
+ else if (kvm_pte_valid(ctx->old))
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
+
+ if (stage2_pte_is_counted(ctx->old))
+ mm_ops->put_page(ctx->ptep);
+
+ return true;
+}
+
+static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
+{
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+
+ WARN_ON(!stage2_pte_is_locked(*ctx->ptep));
+
+ if (stage2_pte_is_counted(new))
+ mm_ops->get_page(ctx->ptep);
+
+ smp_store_release(ctx->ptep, new);
+}
+
+static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
+ struct kvm_pgtable_mm_ops *mm_ops)
{
/*
* Clear the existing PTE, and perform break-before-make with
* TLB maintenance if it was valid.
*/
- if (kvm_pte_valid(*ptep)) {
- kvm_clear_pte(ptep);
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
+ if (kvm_pte_valid(ctx->old)) {
+ kvm_clear_pte(ctx->ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
}
- mm_ops->put_page(ptep);
+ mm_ops->put_page(ctx->ptep);
}
static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
@@ -708,44 +770,42 @@ static bool stage2_pte_executable(kvm_pte_t pte)
return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
}
-static bool stage2_leaf_mapping_allowed(u64 addr, u64 end, u32 level,
+static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
struct stage2_map_data *data)
{
- if (data->force_pte && (level < (KVM_PGTABLE_MAX_LEVELS - 1)))
+ if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1)))
return false;
- return kvm_block_mapping_supported(addr, end, data->phys, level);
+ return kvm_block_mapping_supported(ctx, data->phys);
}
-static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
- kvm_pte_t *ptep,
+static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
struct stage2_map_data *data)
{
- kvm_pte_t new, old = *ptep;
- u64 granule = kvm_granule_size(level), phys = data->phys;
+ kvm_pte_t new;
+ u64 granule = kvm_granule_size(ctx->level), phys = data->phys;
struct kvm_pgtable *pgt = data->mmu->pgt;
- struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
- if (!stage2_leaf_mapping_allowed(addr, end, level, data))
+ if (!stage2_leaf_mapping_allowed(ctx, data))
return -E2BIG;
if (kvm_phys_is_valid(phys))
- new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+ new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
else
new = kvm_init_invalid_leaf_owner(data->owner_id);
- if (stage2_pte_is_counted(old)) {
- /*
- * Skip updating the PTE if we are trying to recreate the exact
- * same mapping or only change the access permissions. Instead,
- * the vCPU will exit one more time from guest if still needed
- * and then go through the path of relaxing permissions.
- */
- if (!stage2_pte_needs_update(old, new))
- return -EAGAIN;
+ /*
+ * Skip updating the PTE if we are trying to recreate the exact
+ * same mapping or only change the access permissions. Instead,
+ * the vCPU will exit one more time from guest if still needed
+ * and then go through the path of relaxing permissions.
+ */
+ if (!stage2_pte_needs_update(ctx->old, new))
+ return -EAGAIN;
- stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
- }
+ if (!stage2_try_break_pte(ctx, data->mmu))
+ return -EAGAIN;
/* Perform CMOs before installation of the guest stage-2 PTE */
if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new))
@@ -755,56 +815,43 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
- smp_store_release(ptep, new);
- if (stage2_pte_is_counted(new))
- mm_ops->get_page(ptep);
+ stage2_make_pte(ctx, new);
+
if (kvm_phys_is_valid(phys))
data->phys += granule;
return 0;
}
-static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
- kvm_pte_t *ptep,
+static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
struct stage2_map_data *data)
{
- if (data->anchor)
- return 0;
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+ kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
+ int ret;
- if (!stage2_leaf_mapping_allowed(addr, end, level, data))
+ if (!stage2_leaf_mapping_allowed(ctx, data))
return 0;
- data->childp = kvm_pte_follow(*ptep, data->mm_ops);
- kvm_clear_pte(ptep);
+ ret = stage2_map_walker_try_leaf(ctx, data);
+ if (ret)
+ return ret;
- /*
- * Invalidate the whole stage-2, as we may have numerous leaf
- * entries below us which would otherwise need invalidating
- * individually.
- */
- kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu);
- data->anchor = ptep;
+ mm_ops->free_removed_table(childp, ctx->level);
return 0;
}
-static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
struct stage2_map_data *data)
{
- struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
- kvm_pte_t *childp, pte = *ptep;
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+ kvm_pte_t *childp, new;
int ret;
- if (data->anchor) {
- if (stage2_pte_is_counted(pte))
- mm_ops->put_page(ptep);
-
- return 0;
- }
-
- ret = stage2_map_walker_try_leaf(addr, end, level, ptep, data);
+ ret = stage2_map_walker_try_leaf(ctx, data);
if (ret != -E2BIG)
return ret;
- if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
+ if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1))
return -EINVAL;
if (!data->memcache)
@@ -814,99 +861,62 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
if (!childp)
return -ENOMEM;
+ if (!stage2_try_break_pte(ctx, data->mmu)) {
+ mm_ops->put_page(childp);
+ return -EAGAIN;
+ }
+
/*
* If we've run into an existing block mapping then replace it with
* a table. Accesses beyond 'end' that fall within the new table
* will be mapped lazily.
*/
- if (stage2_pte_is_counted(pte))
- stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
-
- kvm_set_table_pte(ptep, childp, mm_ops);
- mm_ops->get_page(ptep);
+ new = kvm_init_table_pte(childp, mm_ops);
+ stage2_make_pte(ctx, new);
return 0;
}
-static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
- kvm_pte_t *ptep,
- struct stage2_map_data *data)
-{
- struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
- kvm_pte_t *childp;
- int ret = 0;
-
- if (!data->anchor)
- return 0;
-
- if (data->anchor == ptep) {
- childp = data->childp;
- data->anchor = NULL;
- data->childp = NULL;
- ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
- } else {
- childp = kvm_pte_follow(*ptep, mm_ops);
- }
-
- mm_ops->put_page(childp);
- mm_ops->put_page(ptep);
-
- return ret;
-}
-
/*
- * This is a little fiddly, as we use all three of the walk flags. The idea
- * is that the TABLE_PRE callback runs for table entries on the way down,
- * looking for table entries which we could conceivably replace with a
- * block entry for this mapping. If it finds one, then it sets the 'anchor'
- * field in 'struct stage2_map_data' to point at the table entry, before
- * clearing the entry to zero and descending into the now detached table.
+ * The TABLE_PRE callback runs for table entries on the way down, looking
+ * for table entries which we could conceivably replace with a block entry
+ * for this mapping. If it finds one it replaces the entry and calls
+ * kvm_pgtable_mm_ops::free_removed_table() to tear down the detached table.
*
- * The behaviour of the LEAF callback then depends on whether or not the
- * anchor has been set. If not, then we're not using a block mapping higher
- * up the table and we perform the mapping at the existing leaves instead.
- * If, on the other hand, the anchor _is_ set, then we drop references to
- * all valid leaves so that the pages beneath the anchor can be freed.
- *
- * Finally, the TABLE_POST callback does nothing if the anchor has not
- * been set, but otherwise frees the page-table pages while walking back up
- * the page-table, installing the block entry when it revisits the anchor
- * pointer and clearing the anchor to NULL.
+ * Otherwise, the LEAF callback performs the mapping at the existing leaves
+ * instead.
*/
-static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
- enum kvm_pgtable_walk_flags flag, void * const arg)
+static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- struct stage2_map_data *data = arg;
+ struct stage2_map_data *data = ctx->arg;
- switch (flag) {
+ switch (visit) {
case KVM_PGTABLE_WALK_TABLE_PRE:
- return stage2_map_walk_table_pre(addr, end, level, ptep, data);
+ return stage2_map_walk_table_pre(ctx, data);
case KVM_PGTABLE_WALK_LEAF:
- return stage2_map_walk_leaf(addr, end, level, ptep, data);
- case KVM_PGTABLE_WALK_TABLE_POST:
- return stage2_map_walk_table_post(addr, end, level, ptep, data);
+ return stage2_map_walk_leaf(ctx, data);
+ default:
+ return -EINVAL;
}
-
- return -EINVAL;
}
int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
u64 phys, enum kvm_pgtable_prot prot,
- void *mc)
+ void *mc, enum kvm_pgtable_walk_flags flags)
{
int ret;
struct stage2_map_data map_data = {
.phys = ALIGN_DOWN(phys, PAGE_SIZE),
.mmu = pgt->mmu,
.memcache = mc,
- .mm_ops = pgt->mm_ops,
.force_pte = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot),
};
struct kvm_pgtable_walker walker = {
.cb = stage2_map_walker,
- .flags = KVM_PGTABLE_WALK_TABLE_PRE |
- KVM_PGTABLE_WALK_LEAF |
- KVM_PGTABLE_WALK_TABLE_POST,
+ .flags = flags |
+ KVM_PGTABLE_WALK_TABLE_PRE |
+ KVM_PGTABLE_WALK_LEAF,
.arg = &map_data,
};
@@ -930,15 +940,13 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
.phys = KVM_PHYS_INVALID,
.mmu = pgt->mmu,
.memcache = mc,
- .mm_ops = pgt->mm_ops,
.owner_id = owner_id,
.force_pte = true,
};
struct kvm_pgtable_walker walker = {
.cb = stage2_map_walker,
.flags = KVM_PGTABLE_WALK_TABLE_PRE |
- KVM_PGTABLE_WALK_LEAF |
- KVM_PGTABLE_WALK_TABLE_POST,
+ KVM_PGTABLE_WALK_LEAF,
.arg = &map_data,
};
@@ -949,30 +957,29 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
return ret;
}
-static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
- enum kvm_pgtable_walk_flags flag,
- void * const arg)
+static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- struct kvm_pgtable *pgt = arg;
+ struct kvm_pgtable *pgt = ctx->arg;
struct kvm_s2_mmu *mmu = pgt->mmu;
- struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
- kvm_pte_t pte = *ptep, *childp = NULL;
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+ kvm_pte_t *childp = NULL;
bool need_flush = false;
- if (!kvm_pte_valid(pte)) {
- if (stage2_pte_is_counted(pte)) {
- kvm_clear_pte(ptep);
- mm_ops->put_page(ptep);
+ if (!kvm_pte_valid(ctx->old)) {
+ if (stage2_pte_is_counted(ctx->old)) {
+ kvm_clear_pte(ctx->ptep);
+ mm_ops->put_page(ctx->ptep);
}
return 0;
}
- if (kvm_pte_table(pte, level)) {
- childp = kvm_pte_follow(pte, mm_ops);
+ if (kvm_pte_table(ctx->old, ctx->level)) {
+ childp = kvm_pte_follow(ctx->old, mm_ops);
if (mm_ops->page_count(childp) != 1)
return 0;
- } else if (stage2_pte_cacheable(pgt, pte)) {
+ } else if (stage2_pte_cacheable(pgt, ctx->old)) {
need_flush = !stage2_has_fwb(pgt);
}
@@ -981,11 +988,11 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
* block entry and rely on the remaining portions being faulted
* back lazily.
*/
- stage2_put_pte(ptep, mmu, addr, level, mm_ops);
+ stage2_put_pte(ctx, mmu, mm_ops);
if (need_flush && mm_ops->dcache_clean_inval_poc)
- mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops),
- kvm_granule_size(level));
+ mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
+ kvm_granule_size(ctx->level));
if (childp)
mm_ops->put_page(childp);
@@ -1009,21 +1016,19 @@ struct stage2_attr_data {
kvm_pte_t attr_clr;
kvm_pte_t pte;
u32 level;
- struct kvm_pgtable_mm_ops *mm_ops;
};
-static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
- enum kvm_pgtable_walk_flags flag,
- void * const arg)
+static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- kvm_pte_t pte = *ptep;
- struct stage2_attr_data *data = arg;
- struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
+ kvm_pte_t pte = ctx->old;
+ struct stage2_attr_data *data = ctx->arg;
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
- if (!kvm_pte_valid(pte))
+ if (!kvm_pte_valid(ctx->old))
return 0;
- data->level = level;
+ data->level = ctx->level;
data->pte = pte;
pte &= ~data->attr_clr;
pte |= data->attr_set;
@@ -1039,10 +1044,12 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
* stage-2 PTE if we are going to add executable permission.
*/
if (mm_ops->icache_inval_pou &&
- stage2_pte_executable(pte) && !stage2_pte_executable(*ptep))
+ stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old))
mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops),
- kvm_granule_size(level));
- WRITE_ONCE(*ptep, pte);
+ kvm_granule_size(ctx->level));
+
+ if (!stage2_try_set_pte(ctx, pte))
+ return -EAGAIN;
}
return 0;
@@ -1051,19 +1058,18 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
u64 size, kvm_pte_t attr_set,
kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
- u32 *level)
+ u32 *level, enum kvm_pgtable_walk_flags flags)
{
int ret;
kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
struct stage2_attr_data data = {
.attr_set = attr_set & attr_mask,
.attr_clr = attr_clr & attr_mask,
- .mm_ops = pgt->mm_ops,
};
struct kvm_pgtable_walker walker = {
.cb = stage2_attr_walker,
.arg = &data,
- .flags = KVM_PGTABLE_WALK_LEAF,
+ .flags = flags | KVM_PGTABLE_WALK_LEAF,
};
ret = kvm_pgtable_walk(pgt, addr, size, &walker);
@@ -1082,14 +1088,14 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
return stage2_update_leaf_attrs(pgt, addr, size, 0,
KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
- NULL, NULL);
+ NULL, NULL, 0);
}
kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
{
kvm_pte_t pte = 0;
stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
- &pte, NULL);
+ &pte, NULL, 0);
dsb(ishst);
return pte;
}
@@ -1098,7 +1104,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
{
kvm_pte_t pte = 0;
stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
- &pte, NULL);
+ &pte, NULL, 0);
/*
* "But where's the TLBI?!", you scream.
* "Over in the core code", I sigh.
@@ -1111,7 +1117,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
{
kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL);
+ stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0);
return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
}
@@ -1134,26 +1140,25 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
if (prot & KVM_PGTABLE_PROT_X)
clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
- ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level);
+ ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level,
+ KVM_PGTABLE_WALK_SHARED);
if (!ret)
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
return ret;
}
-static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
- enum kvm_pgtable_walk_flags flag,
- void * const arg)
+static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- struct kvm_pgtable *pgt = arg;
+ struct kvm_pgtable *pgt = ctx->arg;
struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
- kvm_pte_t pte = *ptep;
- if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte))
+ if (!kvm_pte_valid(ctx->old) || !stage2_pte_cacheable(pgt, ctx->old))
return 0;
if (mm_ops->dcache_clean_inval_poc)
- mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops),
- kvm_granule_size(level));
+ mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
+ kvm_granule_size(ctx->level));
return 0;
}
@@ -1184,7 +1189,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
- pgt->pgd = mm_ops->zalloc_pages_exact(pgd_sz);
+ pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz);
if (!pgt->pgd)
return -ENOMEM;
@@ -1200,20 +1205,27 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
return 0;
}
-static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
- enum kvm_pgtable_walk_flags flag,
- void * const arg)
+size_t kvm_pgtable_stage2_pgd_size(u64 vtcr)
+{
+ u32 ia_bits = VTCR_EL2_IPA(vtcr);
+ u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
+ u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
+
+ return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
+}
+
+static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- struct kvm_pgtable_mm_ops *mm_ops = arg;
- kvm_pte_t pte = *ptep;
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
- if (!stage2_pte_is_counted(pte))
+ if (!stage2_pte_is_counted(ctx->old))
return 0;
- mm_ops->put_page(ptep);
+ mm_ops->put_page(ctx->ptep);
- if (kvm_pte_table(pte, level))
- mm_ops->put_page(kvm_pte_follow(pte, mm_ops));
+ if (kvm_pte_table(ctx->old, ctx->level))
+ mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
return 0;
}
@@ -1225,11 +1237,33 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
.cb = stage2_free_walker,
.flags = KVM_PGTABLE_WALK_LEAF |
KVM_PGTABLE_WALK_TABLE_POST,
- .arg = pgt->mm_ops,
};
WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
- pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz);
+ pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz);
pgt->pgd = NULL;
}
+
+void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level)
+{
+ kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_free_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF |
+ KVM_PGTABLE_WALK_TABLE_POST,
+ };
+ struct kvm_pgtable_walk_data data = {
+ .walker = &walker,
+
+ /*
+ * At this point the IPA really doesn't matter, as the page
+ * table being traversed has already been removed from the stage
+ * 2. Set an appropriate range to cover the entire page table.
+ */
+ .addr = 0,
+ .end = kvm_granule_size(level),
+ };
+
+ WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1));
+}
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 7acb87eaa092..1a97391fedd2 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -63,10 +63,6 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
__activate_traps_fpsimd32(vcpu);
}
- if (cpus_have_final_cap(ARM64_SME))
- write_sysreg(read_sysreg(sctlr_el2) & ~SCTLR_ELx_ENTP2,
- sctlr_el2);
-
write_sysreg(val, cpacr_el1);
write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1);
@@ -88,10 +84,6 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
*/
asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
- if (cpus_have_final_cap(ARM64_SME))
- write_sysreg(read_sysreg(sctlr_el2) | SCTLR_ELx_ENTP2,
- sctlr_el2);
-
write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1);
if (!arm64_kernel_unmapped_at_el0())
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 60ee3d9f01f8..39d9a334efb5 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -128,6 +128,25 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size)
free_pages_exact(virt, size);
}
+static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;
+
+static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
+{
+ struct page *page = container_of(head, struct page, rcu_head);
+ void *pgtable = page_to_virt(page);
+ u32 level = page_private(page);
+
+ kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level);
+}
+
+static void stage2_free_removed_table(void *addr, u32 level)
+{
+ struct page *page = virt_to_page(addr);
+
+ set_page_private(page, (unsigned long)level);
+ call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb);
+}
+
static void kvm_host_get_page(void *addr)
{
get_page(virt_to_page(addr));
@@ -640,8 +659,8 @@ static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
static int get_user_mapping_size(struct kvm *kvm, u64 addr)
{
struct kvm_pgtable pgt = {
- .pgd = (kvm_pte_t *)kvm->mm->pgd,
- .ia_bits = VA_BITS,
+ .pgd = (kvm_pteref_t)kvm->mm->pgd,
+ .ia_bits = vabits_actual,
.start_level = (KVM_PGTABLE_MAX_LEVELS -
CONFIG_PGTABLE_LEVELS),
.mm_ops = &kvm_user_mm_ops,
@@ -662,6 +681,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.zalloc_page = stage2_memcache_zalloc_page,
.zalloc_pages_exact = kvm_s2_zalloc_pages_exact,
.free_pages_exact = kvm_s2_free_pages_exact,
+ .free_removed_table = stage2_free_removed_table,
.get_page = kvm_host_get_page,
.put_page = kvm_s2_put_page,
.page_count = kvm_host_page_count,
@@ -675,15 +695,42 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
* kvm_init_stage2_mmu - Initialise a S2 MMU structure
* @kvm: The pointer to the KVM structure
* @mmu: The pointer to the s2 MMU structure
+ * @type: The machine type of the virtual machine
*
* Allocates only the stage-2 HW PGD level table(s).
* Note we don't need locking here as this is only called when the VM is
* created, which can only be done once.
*/
-int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
+int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
{
+ u32 kvm_ipa_limit = get_kvm_ipa_limit();
int cpu, err;
struct kvm_pgtable *pgt;
+ u64 mmfr0, mmfr1;
+ u32 phys_shift;
+
+ if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
+ return -EINVAL;
+
+ phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
+ if (is_protected_kvm_enabled()) {
+ phys_shift = kvm_ipa_limit;
+ } else if (phys_shift) {
+ if (phys_shift > kvm_ipa_limit ||
+ phys_shift < ARM64_MIN_PARANGE_BITS)
+ return -EINVAL;
+ } else {
+ phys_shift = KVM_PHYS_SHIFT;
+ if (phys_shift > kvm_ipa_limit) {
+ pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
+ current->comm);
+ return -EINVAL;
+ }
+ }
+
+ mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
+ mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
+ kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
if (mmu->pgt != NULL) {
kvm_err("kvm_arch already initialized?\n");
@@ -807,6 +854,32 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
}
}
+static void hyp_mc_free_fn(void *addr, void *unused)
+{
+ free_page((unsigned long)addr);
+}
+
+static void *hyp_mc_alloc_fn(void *unused)
+{
+ return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+}
+
+void free_hyp_memcache(struct kvm_hyp_memcache *mc)
+{
+ if (is_protected_kvm_enabled())
+ __free_hyp_memcache(mc, hyp_mc_free_fn,
+ kvm_host_va, NULL);
+}
+
+int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
+{
+ if (!is_protected_kvm_enabled())
+ return 0;
+
+ return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
+ kvm_host_pa, NULL);
+}
+
/**
* kvm_phys_addr_ioremap - map a device range to guest IPA
*
@@ -841,7 +914,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
write_lock(&kvm->mmu_lock);
ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
- &cache);
+ &cache, 0);
write_unlock(&kvm->mmu_lock);
if (ret)
break;
@@ -1091,32 +1164,26 @@ static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
* - mmap_lock protects between a VM faulting a page in and the VMM performing
* an mprotect() to add VM_MTE
*/
-static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
- unsigned long size)
+static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
+ unsigned long size)
{
unsigned long i, nr_pages = size >> PAGE_SHIFT;
- struct page *page;
+ struct page *page = pfn_to_page(pfn);
if (!kvm_has_mte(kvm))
- return 0;
-
- /*
- * pfn_to_online_page() is used to reject ZONE_DEVICE pages
- * that may not support tags.
- */
- page = pfn_to_online_page(pfn);
-
- if (!page)
- return -EFAULT;
+ return;
for (i = 0; i < nr_pages; i++, page++) {
- if (!test_bit(PG_mte_tagged, &page->flags)) {
+ if (try_page_mte_tagging(page)) {
mte_clear_page_tags(page_address(page));
- set_bit(PG_mte_tagged, &page->flags);
+ set_page_mte_tagged(page);
}
}
+}
- return 0;
+static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & VM_MTE_ALLOWED;
}
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
@@ -1127,7 +1194,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
bool write_fault, writable, force_pte = false;
bool exec_fault;
bool device = false;
- bool shared;
unsigned long mmu_seq;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
@@ -1136,7 +1202,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
gfn_t gfn;
kvm_pfn_t pfn;
bool logging_active = memslot_is_logging(memslot);
- bool use_read_lock = false;
unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
unsigned long vma_pagesize, fault_granule;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
@@ -1171,14 +1236,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (logging_active) {
force_pte = true;
vma_shift = PAGE_SHIFT;
- use_read_lock = (fault_status == FSC_PERM && write_fault &&
- fault_granule == PAGE_SIZE);
} else {
vma_shift = get_vma_page_shift(vma, hva);
}
- shared = (vma->vm_flags & VM_SHARED);
-
switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
case PUD_SHIFT:
@@ -1271,15 +1332,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (exec_fault && device)
return -ENOEXEC;
- /*
- * To reduce MMU contentions and enhance concurrency during dirty
- * logging dirty logging, only acquire read lock for permission
- * relaxation.
- */
- if (use_read_lock)
- read_lock(&kvm->mmu_lock);
- else
- write_lock(&kvm->mmu_lock);
+ read_lock(&kvm->mmu_lock);
pgt = vcpu->arch.hw_mmu->pgt;
if (mmu_invalidate_retry(kvm, mmu_seq))
goto out_unlock;
@@ -1298,13 +1351,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
- /* Check the VMM hasn't introduced a new VM_SHARED VMA */
- if (!shared)
- ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
- else
+ /* Check the VMM hasn't introduced a new disallowed VMA */
+ if (kvm_vma_mte_allowed(vma)) {
+ sanitise_mte_tags(kvm, pfn, vma_pagesize);
+ } else {
ret = -EFAULT;
- if (ret)
goto out_unlock;
+ }
}
if (writable)
@@ -1323,15 +1376,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* permissions only if vma_pagesize equals fault_granule. Otherwise,
* kvm_pgtable_stage2_map() should be called to change block size.
*/
- if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
+ if (fault_status == FSC_PERM && vma_pagesize == fault_granule)
ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
- } else {
- WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n");
-
+ else
ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
__pfn_to_phys(pfn), prot,
- memcache);
- }
+ memcache, KVM_PGTABLE_WALK_SHARED);
/* Mark the page dirty only if the fault is handled successfully */
if (writable && !ret) {
@@ -1340,10 +1390,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
out_unlock:
- if (use_read_lock)
- read_unlock(&kvm->mmu_lock);
- else
- write_unlock(&kvm->mmu_lock);
+ read_unlock(&kvm->mmu_lock);
kvm_set_pfn_accessed(pfn);
kvm_release_pfn_clean(pfn);
return ret != -EAGAIN ? ret : 0;
@@ -1526,15 +1573,18 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
kvm_pfn_t pfn = pte_pfn(range->pte);
- int ret;
if (!kvm->arch.mmu.pgt)
return false;
WARN_ON(range->end - range->start != 1);
- ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
- if (ret)
+ /*
+ * If the page isn't tagged, defer to user_mem_abort() for sanitising
+ * the MTE tags. The S2 pte should have been unmapped by
+ * mmu_notifier_invalidate_range_end().
+ */
+ if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn)))
return false;
/*
@@ -1549,7 +1599,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
*/
kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
PAGE_SIZE, __pfn_to_phys(pfn),
- KVM_PGTABLE_PROT_R, NULL);
+ KVM_PGTABLE_PROT_R, NULL, 0);
return false;
}
@@ -1618,6 +1668,8 @@ static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
int kvm_mmu_init(u32 *hyp_va_bits)
{
int err;
+ u32 idmap_bits;
+ u32 kernel_bits;
hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
@@ -1631,7 +1683,31 @@ int kvm_mmu_init(u32 *hyp_va_bits)
*/
BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
- *hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+ /*
+ * The ID map may be configured to use an extended virtual address
+ * range. This is only the case if system RAM is out of range for the
+ * currently configured page size and VA_BITS_MIN, in which case we will
+ * also need the extended virtual range for the HYP ID map, or we won't
+ * be able to enable the EL2 MMU.
+ *
+ * However, in some cases the ID map may be configured for fewer than
+ * the number of VA bits used by the regular kernel stage 1. This
+ * happens when VA_BITS=52 and the kernel image is placed in PA space
+ * below 48 bits.
+ *
+ * At EL2, there is only one TTBR register, and we can't switch between
+ * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
+ * line: we need to use the extended range with *both* our translation
+ * tables.
+ *
+ * So use the maximum of the idmap VA bits and the regular kernel stage
+ * 1 VA bits to assure that the hypervisor can both ID map its code page
+ * and map any kernel memory.
+ */
+ idmap_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+ kernel_bits = vabits_actual;
+ *hyp_va_bits = max(idmap_bits, kernel_bits);
+
kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
kvm_debug("HYP VA range: %lx:%lx\n",
@@ -1740,12 +1816,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if (!vma)
break;
- /*
- * VM_SHARED mappings are not allowed with MTE to avoid races
- * when updating the PG_mte_tagged page flag, see
- * sanitise_mte_tags for more details.
- */
- if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) {
+ if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
ret = -EINVAL;
break;
}
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index ebecb7c045f4..cf56958b1492 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -6,6 +6,7 @@
#include <linux/kvm_host.h>
#include <linux/memblock.h>
+#include <linux/mutex.h>
#include <linux/sort.h>
#include <asm/kvm_pkvm.h>
@@ -53,7 +54,7 @@ static int __init register_memblock_regions(void)
void __init kvm_hyp_reserve(void)
{
- u64 nr_pages, prev, hyp_mem_pages = 0;
+ u64 hyp_mem_pages = 0;
int ret;
if (!is_hyp_mode_available() || is_kernel_in_hyp_mode())
@@ -71,21 +72,8 @@ void __init kvm_hyp_reserve(void)
hyp_mem_pages += hyp_s1_pgtable_pages();
hyp_mem_pages += host_s2_pgtable_pages();
-
- /*
- * The hyp_vmemmap needs to be backed by pages, but these pages
- * themselves need to be present in the vmemmap, so compute the number
- * of pages needed by looking for a fixed point.
- */
- nr_pages = 0;
- do {
- prev = nr_pages;
- nr_pages = hyp_mem_pages + prev;
- nr_pages = DIV_ROUND_UP(nr_pages * STRUCT_HYP_PAGE_SIZE,
- PAGE_SIZE);
- nr_pages += __hyp_pgtable_max_pages(nr_pages);
- } while (nr_pages != prev);
- hyp_mem_pages += nr_pages;
+ hyp_mem_pages += hyp_vm_table_pages();
+ hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE);
/*
* Try to allocate a PMD-aligned region to reduce TLB pressure once
@@ -107,3 +95,121 @@ void __init kvm_hyp_reserve(void)
kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20,
hyp_mem_base);
}
+
+/*
+ * Allocates and donates memory for hypervisor VM structs at EL2.
+ *
+ * Allocates space for the VM state, which includes the hyp vm as well as
+ * the hyp vcpus.
+ *
+ * Stores an opaque handler in the kvm struct for future reference.
+ *
+ * Return 0 on success, negative error code on failure.
+ */
+static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
+{
+ size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz;
+ struct kvm_vcpu *host_vcpu;
+ pkvm_handle_t handle;
+ void *pgd, *hyp_vm;
+ unsigned long idx;
+ int ret;
+
+ if (host_kvm->created_vcpus < 1)
+ return -EINVAL;
+
+ pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr);
+
+ /*
+ * The PGD pages will be reclaimed using a hyp_memcache which implies
+ * page granularity. So, use alloc_pages_exact() to get individual
+ * refcounts.
+ */
+ pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT);
+ if (!pgd)
+ return -ENOMEM;
+
+ /* Allocate memory to donate to hyp for vm and vcpu pointers. */
+ hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE,
+ size_mul(sizeof(void *),
+ host_kvm->created_vcpus)));
+ hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT);
+ if (!hyp_vm) {
+ ret = -ENOMEM;
+ goto free_pgd;
+ }
+
+ /* Donate the VM memory to hyp and let hyp initialize it. */
+ ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd);
+ if (ret < 0)
+ goto free_vm;
+
+ handle = ret;
+
+ host_kvm->arch.pkvm.handle = handle;
+
+ /* Donate memory for the vcpus at hyp and initialize it. */
+ hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
+ kvm_for_each_vcpu(idx, host_vcpu, host_kvm) {
+ void *hyp_vcpu;
+
+ /* Indexing of the vcpus to be sequential starting at 0. */
+ if (WARN_ON(host_vcpu->vcpu_idx != idx)) {
+ ret = -EINVAL;
+ goto destroy_vm;
+ }
+
+ hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
+ if (!hyp_vcpu) {
+ ret = -ENOMEM;
+ goto destroy_vm;
+ }
+
+ ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu,
+ hyp_vcpu);
+ if (ret) {
+ free_pages_exact(hyp_vcpu, hyp_vcpu_sz);
+ goto destroy_vm;
+ }
+ }
+
+ return 0;
+
+destroy_vm:
+ pkvm_destroy_hyp_vm(host_kvm);
+ return ret;
+free_vm:
+ free_pages_exact(hyp_vm, hyp_vm_sz);
+free_pgd:
+ free_pages_exact(pgd, pgd_sz);
+ return ret;
+}
+
+int pkvm_create_hyp_vm(struct kvm *host_kvm)
+{
+ int ret = 0;
+
+ mutex_lock(&host_kvm->lock);
+ if (!host_kvm->arch.pkvm.handle)
+ ret = __pkvm_create_hyp_vm(host_kvm);
+ mutex_unlock(&host_kvm->lock);
+
+ return ret;
+}
+
+void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
+{
+ if (host_kvm->arch.pkvm.handle) {
+ WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm,
+ host_kvm->arch.pkvm.handle));
+ }
+
+ host_kvm->arch.pkvm.handle = 0;
+ free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc);
+}
+
+int pkvm_init_host_vm(struct kvm *host_kvm)
+{
+ mutex_init(&host_kvm->lock);
+ return 0;
+}
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 0003c7d37533..24908400e190 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -15,16 +15,25 @@
#include <kvm/arm_pmu.h>
#include <kvm/arm_vgic.h>
+#define PERF_ATTR_CFG1_COUNTER_64BIT BIT(0)
+
DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available);
static LIST_HEAD(arm_pmus);
static DEFINE_MUTEX(arm_pmus_lock);
-static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx);
-static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx);
-static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc);
+static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc);
+static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc);
+
+static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc)
+{
+ return container_of(pmc, struct kvm_vcpu, arch.pmu.pmc[pmc->idx]);
+}
-#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1
+static struct kvm_pmc *kvm_vcpu_idx_to_pmc(struct kvm_vcpu *vcpu, int cnt_idx)
+{
+ return &vcpu->arch.pmu.pmc[cnt_idx];
+}
static u32 kvm_pmu_event_mask(struct kvm *kvm)
{
@@ -47,113 +56,46 @@ static u32 kvm_pmu_event_mask(struct kvm *kvm)
}
/**
- * kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter
- * @vcpu: The vcpu pointer
- * @select_idx: The counter index
+ * kvm_pmc_is_64bit - determine if counter is 64bit
+ * @pmc: counter context
*/
-static bool kvm_pmu_idx_is_64bit(struct kvm_vcpu *vcpu, u64 select_idx)
+static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc)
{
- return (select_idx == ARMV8_PMU_CYCLE_IDX &&
- __vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_LC);
+ return (pmc->idx == ARMV8_PMU_CYCLE_IDX ||
+ kvm_pmu_is_3p5(kvm_pmc_to_vcpu(pmc)));
}
-static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc)
+static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc)
{
- struct kvm_pmu *pmu;
- struct kvm_vcpu_arch *vcpu_arch;
+ u64 val = __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), PMCR_EL0);
- pmc -= pmc->idx;
- pmu = container_of(pmc, struct kvm_pmu, pmc[0]);
- vcpu_arch = container_of(pmu, struct kvm_vcpu_arch, pmu);
- return container_of(vcpu_arch, struct kvm_vcpu, arch);
+ return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) ||
+ (pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC));
}
-/**
- * kvm_pmu_pmc_is_chained - determine if the pmc is chained
- * @pmc: The PMU counter pointer
- */
-static bool kvm_pmu_pmc_is_chained(struct kvm_pmc *pmc)
+static bool kvm_pmu_counter_can_chain(struct kvm_pmc *pmc)
{
- struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
-
- return test_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
+ return (!(pmc->idx & 1) && (pmc->idx + 1) < ARMV8_PMU_CYCLE_IDX &&
+ !kvm_pmc_has_64bit_overflow(pmc));
}
-/**
- * kvm_pmu_idx_is_high_counter - determine if select_idx is a high/low counter
- * @select_idx: The counter index
- */
-static bool kvm_pmu_idx_is_high_counter(u64 select_idx)
-{
- return select_idx & 0x1;
-}
-
-/**
- * kvm_pmu_get_canonical_pmc - obtain the canonical pmc
- * @pmc: The PMU counter pointer
- *
- * When a pair of PMCs are chained together we use the low counter (canonical)
- * to hold the underlying perf event.
- */
-static struct kvm_pmc *kvm_pmu_get_canonical_pmc(struct kvm_pmc *pmc)
-{
- if (kvm_pmu_pmc_is_chained(pmc) &&
- kvm_pmu_idx_is_high_counter(pmc->idx))
- return pmc - 1;
-
- return pmc;
-}
-static struct kvm_pmc *kvm_pmu_get_alternate_pmc(struct kvm_pmc *pmc)
+static u32 counter_index_to_reg(u64 idx)
{
- if (kvm_pmu_idx_is_high_counter(pmc->idx))
- return pmc - 1;
- else
- return pmc + 1;
+ return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + idx;
}
-/**
- * kvm_pmu_idx_has_chain_evtype - determine if the event type is chain
- * @vcpu: The vcpu pointer
- * @select_idx: The counter index
- */
-static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx)
+static u32 counter_index_to_evtreg(u64 idx)
{
- u64 eventsel, reg;
-
- select_idx |= 0x1;
-
- if (select_idx == ARMV8_PMU_CYCLE_IDX)
- return false;
-
- reg = PMEVTYPER0_EL0 + select_idx;
- eventsel = __vcpu_sys_reg(vcpu, reg) & kvm_pmu_event_mask(vcpu->kvm);
-
- return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN;
+ return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + idx;
}
-/**
- * kvm_pmu_get_pair_counter_value - get PMU counter value
- * @vcpu: The vcpu pointer
- * @pmc: The PMU counter pointer
- */
-static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu,
- struct kvm_pmc *pmc)
+static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc)
{
- u64 counter, counter_high, reg, enabled, running;
-
- if (kvm_pmu_pmc_is_chained(pmc)) {
- pmc = kvm_pmu_get_canonical_pmc(pmc);
- reg = PMEVCNTR0_EL0 + pmc->idx;
+ struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
+ u64 counter, reg, enabled, running;
- counter = __vcpu_sys_reg(vcpu, reg);
- counter_high = __vcpu_sys_reg(vcpu, reg + 1);
-
- counter = lower_32_bits(counter) | (counter_high << 32);
- } else {
- reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
- ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx;
- counter = __vcpu_sys_reg(vcpu, reg);
- }
+ reg = counter_index_to_reg(pmc->idx);
+ counter = __vcpu_sys_reg(vcpu, reg);
/*
* The real counter value is equal to the value of counter register plus
@@ -163,6 +105,9 @@ static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu,
counter += perf_event_read_value(pmc->perf_event, &enabled,
&running);
+ if (!kvm_pmc_is_64bit(pmc))
+ counter = lower_32_bits(counter);
+
return counter;
}
@@ -173,22 +118,37 @@ static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu,
*/
u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
{
- u64 counter;
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- struct kvm_pmc *pmc = &pmu->pmc[select_idx];
-
if (!kvm_vcpu_has_pmu(vcpu))
return 0;
- counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
+ return kvm_pmu_get_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx));
+}
- if (kvm_pmu_pmc_is_chained(pmc) &&
- kvm_pmu_idx_is_high_counter(select_idx))
- counter = upper_32_bits(counter);
- else if (select_idx != ARMV8_PMU_CYCLE_IDX)
- counter = lower_32_bits(counter);
+static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force)
+{
+ struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
+ u64 reg;
- return counter;
+ kvm_pmu_release_perf_event(pmc);
+
+ reg = counter_index_to_reg(pmc->idx);
+
+ if (vcpu_mode_is_32bit(vcpu) && pmc->idx != ARMV8_PMU_CYCLE_IDX &&
+ !force) {
+ /*
+ * Even with PMUv3p5, AArch32 cannot write to the top
+ * 32bit of the counters. The only possible course of
+ * action is to use PMCR.P, which will reset them to
+ * 0 (the only use of the 'force' parameter).
+ */
+ val = __vcpu_sys_reg(vcpu, reg) & GENMASK(63, 32);
+ val |= lower_32_bits(val);
+ }
+
+ __vcpu_sys_reg(vcpu, reg) = val;
+
+ /* Recreate the perf event to reflect the updated sample_period */
+ kvm_pmu_create_perf_event(pmc);
}
/**
@@ -199,17 +159,10 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
*/
void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
{
- u64 reg;
-
if (!kvm_vcpu_has_pmu(vcpu))
return;
- reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
- ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx;
- __vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx);
-
- /* Recreate the perf event to reflect the updated sample_period */
- kvm_pmu_create_perf_event(vcpu, select_idx);
+ kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx), val, false);
}
/**
@@ -218,7 +171,6 @@ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
*/
static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
{
- pmc = kvm_pmu_get_canonical_pmc(pmc);
if (pmc->perf_event) {
perf_event_disable(pmc->perf_event);
perf_event_release_kernel(pmc->perf_event);
@@ -232,29 +184,20 @@ static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
*
* If this counter has been configured to monitor some event, release it here.
*/
-static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc)
+static void kvm_pmu_stop_counter(struct kvm_pmc *pmc)
{
- u64 counter, reg, val;
+ struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
+ u64 reg, val;
- pmc = kvm_pmu_get_canonical_pmc(pmc);
if (!pmc->perf_event)
return;
- counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
+ val = kvm_pmu_get_pmc_value(pmc);
- if (pmc->idx == ARMV8_PMU_CYCLE_IDX) {
- reg = PMCCNTR_EL0;
- val = counter;
- } else {
- reg = PMEVCNTR0_EL0 + pmc->idx;
- val = lower_32_bits(counter);
- }
+ reg = counter_index_to_reg(pmc->idx);
__vcpu_sys_reg(vcpu, reg) = val;
- if (kvm_pmu_pmc_is_chained(pmc))
- __vcpu_sys_reg(vcpu, reg + 1) = upper_32_bits(counter);
-
kvm_pmu_release_perf_event(pmc);
}
@@ -280,13 +223,10 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu)
void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
{
unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
int i;
for_each_set_bit(i, &mask, 32)
- kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]);
-
- bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS);
+ kvm_pmu_stop_counter(kvm_vcpu_idx_to_pmc(vcpu, i));
}
/**
@@ -297,10 +237,9 @@ void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu)
{
int i;
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
- kvm_pmu_release_perf_event(&pmu->pmc[i]);
+ kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, i));
irq_work_sync(&vcpu->arch.pmu.overflow_work);
}
@@ -325,9 +264,6 @@ u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
{
int i;
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- struct kvm_pmc *pmc;
-
if (!kvm_vcpu_has_pmu(vcpu))
return;
@@ -335,17 +271,16 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
return;
for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
+ struct kvm_pmc *pmc;
+
if (!(val & BIT(i)))
continue;
- pmc = &pmu->pmc[i];
-
- /* A change in the enable state may affect the chain state */
- kvm_pmu_update_pmc_chained(vcpu, i);
- kvm_pmu_create_perf_event(vcpu, i);
+ pmc = kvm_vcpu_idx_to_pmc(vcpu, i);
- /* At this point, pmc must be the canonical */
- if (pmc->perf_event) {
+ if (!pmc->perf_event) {
+ kvm_pmu_create_perf_event(pmc);
+ } else {
perf_event_enable(pmc->perf_event);
if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE)
kvm_debug("fail to enable perf event\n");
@@ -363,23 +298,18 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
{
int i;
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- struct kvm_pmc *pmc;
if (!kvm_vcpu_has_pmu(vcpu) || !val)
return;
for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
+ struct kvm_pmc *pmc;
+
if (!(val & BIT(i)))
continue;
- pmc = &pmu->pmc[i];
-
- /* A change in the enable state may affect the chain state */
- kvm_pmu_update_pmc_chained(vcpu, i);
- kvm_pmu_create_perf_event(vcpu, i);
+ pmc = kvm_vcpu_idx_to_pmc(vcpu, i);
- /* At this point, pmc must be the canonical */
if (pmc->perf_event)
perf_event_disable(pmc->perf_event);
}
@@ -476,14 +406,69 @@ void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu)
static void kvm_pmu_perf_overflow_notify_vcpu(struct irq_work *work)
{
struct kvm_vcpu *vcpu;
- struct kvm_pmu *pmu;
-
- pmu = container_of(work, struct kvm_pmu, overflow_work);
- vcpu = kvm_pmc_to_vcpu(pmu->pmc);
+ vcpu = container_of(work, struct kvm_vcpu, arch.pmu.overflow_work);
kvm_vcpu_kick(vcpu);
}
+/*
+ * Perform an increment on any of the counters described in @mask,
+ * generating the overflow if required, and propagate it as a chained
+ * event if possible.
+ */
+static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu,
+ unsigned long mask, u32 event)
+{
+ int i;
+
+ if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E))
+ return;
+
+ /* Weed out disabled counters */
+ mask &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
+
+ for_each_set_bit(i, &mask, ARMV8_PMU_CYCLE_IDX) {
+ struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i);
+ u64 type, reg;
+
+ /* Filter on event type */
+ type = __vcpu_sys_reg(vcpu, counter_index_to_evtreg(i));
+ type &= kvm_pmu_event_mask(vcpu->kvm);
+ if (type != event)
+ continue;
+
+ /* Increment this counter */
+ reg = __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) + 1;
+ if (!kvm_pmc_is_64bit(pmc))
+ reg = lower_32_bits(reg);
+ __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) = reg;
+
+ /* No overflow? move on */
+ if (kvm_pmc_has_64bit_overflow(pmc) ? reg : lower_32_bits(reg))
+ continue;
+
+ /* Mark overflow */
+ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
+
+ if (kvm_pmu_counter_can_chain(pmc))
+ kvm_pmu_counter_increment(vcpu, BIT(i + 1),
+ ARMV8_PMUV3_PERFCTR_CHAIN);
+ }
+}
+
+/* Compute the sample period for a given counter value */
+static u64 compute_period(struct kvm_pmc *pmc, u64 counter)
+{
+ u64 val;
+
+ if (kvm_pmc_is_64bit(pmc) && kvm_pmc_has_64bit_overflow(pmc))
+ val = (-counter) & GENMASK(63, 0);
+ else
+ val = (-counter) & GENMASK(31, 0);
+
+ return val;
+}
+
/**
* When the perf event overflows, set the overflow status and inform the vcpu.
*/
@@ -503,10 +488,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
* Reset the sample period to the architectural limit,
* i.e. the point where the counter overflows.
*/
- period = -(local64_read(&perf_event->count));
-
- if (!kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
- period &= GENMASK(31, 0);
+ period = compute_period(pmc, local64_read(&perf_event->count));
local64_set(&perf_event->hw.period_left, 0);
perf_event->attr.sample_period = period;
@@ -514,6 +496,10 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx);
+ if (kvm_pmu_counter_can_chain(pmc))
+ kvm_pmu_counter_increment(vcpu, BIT(idx + 1),
+ ARMV8_PMUV3_PERFCTR_CHAIN);
+
if (kvm_pmu_overflow_status(vcpu)) {
kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
@@ -533,50 +519,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
*/
void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- int i;
-
- if (!kvm_vcpu_has_pmu(vcpu))
- return;
-
- if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E))
- return;
-
- /* Weed out disabled counters */
- val &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
-
- for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) {
- u64 type, reg;
-
- if (!(val & BIT(i)))
- continue;
-
- /* PMSWINC only applies to ... SW_INC! */
- type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i);
- type &= kvm_pmu_event_mask(vcpu->kvm);
- if (type != ARMV8_PMUV3_PERFCTR_SW_INCR)
- continue;
-
- /* increment this even SW_INC counter */
- reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1;
- reg = lower_32_bits(reg);
- __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg;
-
- if (reg) /* no overflow on the low part */
- continue;
-
- if (kvm_pmu_pmc_is_chained(&pmu->pmc[i])) {
- /* increment the high counter */
- reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) + 1;
- reg = lower_32_bits(reg);
- __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) = reg;
- if (!reg) /* mark overflow on the high counter */
- __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i + 1);
- } else {
- /* mark overflow on low counter */
- __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
- }
- }
+ kvm_pmu_counter_increment(vcpu, val, ARMV8_PMUV3_PERFCTR_SW_INCR);
}
/**
@@ -591,6 +534,12 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
if (!kvm_vcpu_has_pmu(vcpu))
return;
+ /* Fixup PMCR_EL0 to reconcile the PMU version and the LP bit */
+ if (!kvm_pmu_is_3p5(vcpu))
+ val &= ~ARMV8_PMU_PMCR_LP;
+
+ __vcpu_sys_reg(vcpu, PMCR_EL0) = val;
+
if (val & ARMV8_PMU_PMCR_E) {
kvm_pmu_enable_counter_mask(vcpu,
__vcpu_sys_reg(vcpu, PMCNTENSET_EL0));
@@ -606,49 +555,44 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
mask &= ~BIT(ARMV8_PMU_CYCLE_IDX);
for_each_set_bit(i, &mask, 32)
- kvm_pmu_set_counter_value(vcpu, i, 0);
+ kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true);
}
}
-static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx)
+static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc)
{
+ struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) &&
- (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx));
+ (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx));
}
/**
* kvm_pmu_create_perf_event - create a perf event for a counter
- * @vcpu: The vcpu pointer
- * @select_idx: The number of selected counter
+ * @pmc: Counter context
*/
-static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
+static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc)
{
+ struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu;
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- struct kvm_pmc *pmc;
struct perf_event *event;
struct perf_event_attr attr;
- u64 eventsel, counter, reg, data;
+ u64 eventsel, reg, data;
- /*
- * For chained counters the event type and filtering attributes are
- * obtained from the low/even counter. We also use this counter to
- * determine if the event is enabled/disabled.
- */
- pmc = kvm_pmu_get_canonical_pmc(&pmu->pmc[select_idx]);
-
- reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
- ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + pmc->idx;
+ reg = counter_index_to_evtreg(pmc->idx);
data = __vcpu_sys_reg(vcpu, reg);
- kvm_pmu_stop_counter(vcpu, pmc);
+ kvm_pmu_stop_counter(pmc);
if (pmc->idx == ARMV8_PMU_CYCLE_IDX)
eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
else
eventsel = data & kvm_pmu_event_mask(vcpu->kvm);
- /* Software increment event doesn't need to be backed by a perf event */
- if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR)
+ /*
+ * Neither SW increment nor chained events need to be backed
+ * by a perf event.
+ */
+ if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR ||
+ eventsel == ARMV8_PMUV3_PERFCTR_CHAIN)
return;
/*
@@ -663,37 +607,25 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
attr.type = arm_pmu->pmu.type;
attr.size = sizeof(attr);
attr.pinned = 1;
- attr.disabled = !kvm_pmu_counter_is_enabled(vcpu, pmc->idx);
+ attr.disabled = !kvm_pmu_counter_is_enabled(pmc);
attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0;
attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
attr.exclude_hv = 1; /* Don't count EL2 events */
attr.exclude_host = 1; /* Don't count host events */
attr.config = eventsel;
- counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
-
- if (kvm_pmu_pmc_is_chained(pmc)) {
- /**
- * The initial sample period (overflow count) of an event. For
- * chained counters we only support overflow interrupts on the
- * high counter.
- */
- attr.sample_period = (-counter) & GENMASK(63, 0);
- attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED;
+ /*
+ * If counting with a 64bit counter, advertise it to the perf
+ * code, carefully dealing with the initial sample period
+ * which also depends on the overflow.
+ */
+ if (kvm_pmc_is_64bit(pmc))
+ attr.config1 |= PERF_ATTR_CFG1_COUNTER_64BIT;
- event = perf_event_create_kernel_counter(&attr, -1, current,
- kvm_pmu_perf_overflow,
- pmc + 1);
- } else {
- /* The initial sample period (overflow count) of an event. */
- if (kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
- attr.sample_period = (-counter) & GENMASK(63, 0);
- else
- attr.sample_period = (-counter) & GENMASK(31, 0);
+ attr.sample_period = compute_period(pmc, kvm_pmu_get_pmc_value(pmc));
- event = perf_event_create_kernel_counter(&attr, -1, current,
+ event = perf_event_create_kernel_counter(&attr, -1, current,
kvm_pmu_perf_overflow, pmc);
- }
if (IS_ERR(event)) {
pr_err_once("kvm: pmu event creation failed %ld\n",
@@ -705,41 +637,6 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
}
/**
- * kvm_pmu_update_pmc_chained - update chained bitmap
- * @vcpu: The vcpu pointer
- * @select_idx: The number of selected counter
- *
- * Update the chained bitmap based on the event type written in the
- * typer register and the enable state of the odd register.
- */
-static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx)
-{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- struct kvm_pmc *pmc = &pmu->pmc[select_idx], *canonical_pmc;
- bool new_state, old_state;
-
- old_state = kvm_pmu_pmc_is_chained(pmc);
- new_state = kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx) &&
- kvm_pmu_counter_is_enabled(vcpu, pmc->idx | 0x1);
-
- if (old_state == new_state)
- return;
-
- canonical_pmc = kvm_pmu_get_canonical_pmc(pmc);
- kvm_pmu_stop_counter(vcpu, canonical_pmc);
- if (new_state) {
- /*
- * During promotion from !chained to chained we must ensure
- * the adjacent counter is stopped and its event destroyed
- */
- kvm_pmu_stop_counter(vcpu, kvm_pmu_get_alternate_pmc(pmc));
- set_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
- return;
- }
- clear_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
-}
-
-/**
* kvm_pmu_set_counter_event_type - set selected counter to monitor some event
* @vcpu: The vcpu pointer
* @data: The data guest writes to PMXEVTYPER_EL0
@@ -752,6 +649,7 @@ static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx)
void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
u64 select_idx)
{
+ struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx);
u64 reg, mask;
if (!kvm_vcpu_has_pmu(vcpu))
@@ -761,20 +659,19 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
mask &= ~ARMV8_PMU_EVTYPE_EVENT;
mask |= kvm_pmu_event_mask(vcpu->kvm);
- reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
- ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx;
+ reg = counter_index_to_evtreg(pmc->idx);
__vcpu_sys_reg(vcpu, reg) = data & mask;
- kvm_pmu_update_pmc_chained(vcpu, select_idx);
- kvm_pmu_create_perf_event(vcpu, select_idx);
+ kvm_pmu_create_perf_event(pmc);
}
void kvm_host_pmu_init(struct arm_pmu *pmu)
{
struct arm_pmu_entry *entry;
- if (pmu->pmuver == 0 || pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
+ if (pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_NI ||
+ pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
return;
mutex_lock(&arm_pmus_lock);
@@ -827,7 +724,7 @@ static struct arm_pmu *kvm_pmu_probe_armpmu(void)
if (event->pmu) {
pmu = to_arm_pmu(event->pmu);
- if (pmu->pmuver == 0 ||
+ if (pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_NI ||
pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
pmu = NULL;
}
@@ -849,6 +746,8 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
if (!pmceid1) {
val = read_sysreg(pmceid0_el0);
+ /* always support CHAIN */
+ val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN);
base = 0;
} else {
val = read_sysreg(pmceid1_el0);
@@ -1150,3 +1049,14 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
return -ENXIO;
}
+
+u8 kvm_arm_pmu_get_pmuver_limit(void)
+{
+ u64 tmp;
+
+ tmp = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1);
+ tmp = cpuid_feature_cap_perfmon_field(tmp,
+ ID_AA64DFR0_EL1_PMUVer_SHIFT,
+ ID_AA64DFR0_EL1_PMUVer_V3P5);
+ return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), tmp);
+}
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 5ae18472205a..e0267f672b8a 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -395,32 +395,3 @@ int kvm_set_ipa_limit(void)
return 0;
}
-
-int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
-{
- u64 mmfr0, mmfr1;
- u32 phys_shift;
-
- if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
- return -EINVAL;
-
- phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
- if (phys_shift) {
- if (phys_shift > kvm_ipa_limit ||
- phys_shift < ARM64_MIN_PARANGE_BITS)
- return -EINVAL;
- } else {
- phys_shift = KVM_PHYS_SHIFT;
- if (phys_shift > kvm_ipa_limit) {
- pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
- current->comm);
- return -EINVAL;
- }
- }
-
- mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
- mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
- kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
-
- return 0;
-}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index f4a7c5abcbca..528d253c571a 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -639,22 +639,18 @@ static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
- u64 pmcr, val;
+ u64 pmcr;
/* No PMU available, PMCR_EL0 may UNDEF... */
if (!kvm_arm_support_pmu_v3())
return;
- pmcr = read_sysreg(pmcr_el0);
- /*
- * Writable bits of PMCR_EL0 (ARMV8_PMU_PMCR_MASK) are reset to UNKNOWN
- * except PMCR.E resetting to zero.
- */
- val = ((pmcr & ~ARMV8_PMU_PMCR_MASK)
- | (ARMV8_PMU_PMCR_MASK & 0xdecafbad)) & (~ARMV8_PMU_PMCR_E);
+ /* Only preserve PMCR_EL0.N, and reset the rest to 0 */
+ pmcr = read_sysreg(pmcr_el0) & ARMV8_PMU_PMCR_N_MASK;
if (!kvm_supports_32bit_el0())
- val |= ARMV8_PMU_PMCR_LC;
- __vcpu_sys_reg(vcpu, r->reg) = val;
+ pmcr |= ARMV8_PMU_PMCR_LC;
+
+ __vcpu_sys_reg(vcpu, r->reg) = pmcr;
}
static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags)
@@ -697,13 +693,15 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return false;
if (p->is_write) {
- /* Only update writeable bits of PMCR */
+ /*
+ * Only update writeable bits of PMCR (continuing into
+ * kvm_pmu_handle_pmcr() as well)
+ */
val = __vcpu_sys_reg(vcpu, PMCR_EL0);
val &= ~ARMV8_PMU_PMCR_MASK;
val |= p->regval & ARMV8_PMU_PMCR_MASK;
if (!kvm_supports_32bit_el0())
val |= ARMV8_PMU_PMCR_LC;
- __vcpu_sys_reg(vcpu, PMCR_EL0) = val;
kvm_pmu_handle_pmcr(vcpu, val);
kvm_vcpu_pmu_restore_guest(vcpu);
} else {
@@ -1062,6 +1060,40 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
return true;
}
+static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu)
+{
+ if (kvm_vcpu_has_pmu(vcpu))
+ return vcpu->kvm->arch.dfr0_pmuver.imp;
+
+ return vcpu->kvm->arch.dfr0_pmuver.unimp;
+}
+
+static u8 perfmon_to_pmuver(u8 perfmon)
+{
+ switch (perfmon) {
+ case ID_DFR0_PERFMON_8_0:
+ return ID_AA64DFR0_EL1_PMUVer_IMP;
+ case ID_DFR0_PERFMON_IMP_DEF:
+ return ID_AA64DFR0_EL1_PMUVer_IMP_DEF;
+ default:
+ /* Anything ARMv8.1+ and NI have the same value. For now. */
+ return perfmon;
+ }
+}
+
+static u8 pmuver_to_perfmon(u8 pmuver)
+{
+ switch (pmuver) {
+ case ID_AA64DFR0_EL1_PMUVer_IMP:
+ return ID_DFR0_PERFMON_8_0;
+ case ID_AA64DFR0_EL1_PMUVer_IMP_DEF:
+ return ID_DFR0_PERFMON_IMP_DEF;
+ default:
+ /* Anything ARMv8.1+ and NI have the same value. For now. */
+ return pmuver;
+ }
+}
+
/* Read a sanitised cpufeature ID register by sys_reg_desc */
static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r)
{
@@ -1111,18 +1143,17 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r
/* Limit debug to ARMv8.0 */
val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer);
val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), 6);
- /* Limit guests to PMUv3 for ARMv8.4 */
- val = cpuid_feature_cap_perfmon_field(val,
- ID_AA64DFR0_EL1_PMUVer_SHIFT,
- kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_EL1_PMUVer_V3P4 : 0);
+ /* Set PMUver to the required version */
+ val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer);
+ val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer),
+ vcpu_pmuver(vcpu));
/* Hide SPE from guests */
val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer);
break;
case SYS_ID_DFR0_EL1:
- /* Limit guests to PMUv3 for ARMv8.4 */
- val = cpuid_feature_cap_perfmon_field(val,
- ID_DFR0_PERFMON_SHIFT,
- kvm_vcpu_has_pmu(vcpu) ? ID_DFR0_PERFMON_8_4 : 0);
+ val &= ~ARM64_FEATURE_MASK(ID_DFR0_PERFMON);
+ val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_PERFMON),
+ pmuver_to_perfmon(vcpu_pmuver(vcpu)));
break;
}
@@ -1222,6 +1253,85 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
return 0;
}
+static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd,
+ u64 val)
+{
+ u8 pmuver, host_pmuver;
+ bool valid_pmu;
+
+ host_pmuver = kvm_arm_pmu_get_pmuver_limit();
+
+ /*
+ * Allow AA64DFR0_EL1.PMUver to be set from userspace as long
+ * as it doesn't promise more than what the HW gives us. We
+ * allow an IMPDEF PMU though, only if no PMU is supported
+ * (KVM backward compatibility handling).
+ */
+ pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), val);
+ if ((pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && pmuver > host_pmuver))
+ return -EINVAL;
+
+ valid_pmu = (pmuver != 0 && pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF);
+
+ /* Make sure view register and PMU support do match */
+ if (kvm_vcpu_has_pmu(vcpu) != valid_pmu)
+ return -EINVAL;
+
+ /* We can only differ with PMUver, and anything else is an error */
+ val ^= read_id_reg(vcpu, rd);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer);
+ if (val)
+ return -EINVAL;
+
+ if (valid_pmu)
+ vcpu->kvm->arch.dfr0_pmuver.imp = pmuver;
+ else
+ vcpu->kvm->arch.dfr0_pmuver.unimp = pmuver;
+
+ return 0;
+}
+
+static int set_id_dfr0_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd,
+ u64 val)
+{
+ u8 perfmon, host_perfmon;
+ bool valid_pmu;
+
+ host_perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit());
+
+ /*
+ * Allow DFR0_EL1.PerfMon to be set from userspace as long as
+ * it doesn't promise more than what the HW gives us on the
+ * AArch64 side (as everything is emulated with that), and
+ * that this is a PMUv3.
+ */
+ perfmon = FIELD_GET(ARM64_FEATURE_MASK(ID_DFR0_PERFMON), val);
+ if ((perfmon != ID_DFR0_PERFMON_IMP_DEF && perfmon > host_perfmon) ||
+ (perfmon != 0 && perfmon < ID_DFR0_PERFMON_8_0))
+ return -EINVAL;
+
+ valid_pmu = (perfmon != 0 && perfmon != ID_DFR0_PERFMON_IMP_DEF);
+
+ /* Make sure view register and PMU support do match */
+ if (kvm_vcpu_has_pmu(vcpu) != valid_pmu)
+ return -EINVAL;
+
+ /* We can only differ with PerfMon, and anything else is an error */
+ val ^= read_id_reg(vcpu, rd);
+ val &= ~ARM64_FEATURE_MASK(ID_DFR0_PERFMON);
+ if (val)
+ return -EINVAL;
+
+ if (valid_pmu)
+ vcpu->kvm->arch.dfr0_pmuver.imp = perfmon_to_pmuver(perfmon);
+ else
+ vcpu->kvm->arch.dfr0_pmuver.unimp = perfmon_to_pmuver(perfmon);
+
+ return 0;
+}
+
/*
* cpufeature ID register user accessors
*
@@ -1443,7 +1553,9 @@ static const struct sys_reg_desc sys_reg_descs[] = {
/* CRm=1 */
AA32_ID_SANITISED(ID_PFR0_EL1),
AA32_ID_SANITISED(ID_PFR1_EL1),
- AA32_ID_SANITISED(ID_DFR0_EL1),
+ { SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg,
+ .get_user = get_id_reg, .set_user = set_id_dfr0_el1,
+ .visibility = aa32_id_visibility, },
ID_HIDDEN(ID_AFR0_EL1),
AA32_ID_SANITISED(ID_MMFR0_EL1),
AA32_ID_SANITISED(ID_MMFR1_EL1),
@@ -1483,7 +1595,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_UNALLOCATED(4,7),
/* CRm=5 */
- ID_SANITISED(ID_AA64DFR0_EL1),
+ { SYS_DESC(SYS_ID_AA64DFR0_EL1), .access = access_id_reg,
+ .get_user = get_id_reg, .set_user = set_id_aa64dfr0_el1, },
ID_SANITISED(ID_AA64DFR1_EL1),
ID_UNALLOCATED(5,2),
ID_UNALLOCATED(5,3),
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 733b53055f97..94a666dd1443 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -2743,6 +2743,7 @@ static int vgic_its_has_attr(struct kvm_device *dev,
static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
{
const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+ struct vgic_dist *dist = &kvm->arch.vgic;
int ret = 0;
if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */
@@ -2762,7 +2763,9 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
vgic_its_reset(kvm, its);
break;
case KVM_DEV_ARM_ITS_SAVE_TABLES:
+ dist->save_its_tables_in_progress = true;
ret = abi->save_tables(its);
+ dist->save_its_tables_in_progress = false;
break;
case KVM_DEV_ARM_ITS_RESTORE_TABLES:
ret = abi->restore_tables(its);
@@ -2775,6 +2778,23 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
return ret;
}
+/*
+ * kvm_arch_allow_write_without_running_vcpu - allow writing guest memory
+ * without the running VCPU when dirty ring is enabled.
+ *
+ * The running VCPU is required to track dirty guest pages when dirty ring
+ * is enabled. Otherwise, the backup bitmap should be used to track the
+ * dirty guest pages. When vgic/its tables are being saved, the backup
+ * bitmap is used to track the dirty guest pages due to the missed running
+ * VCPU in the period.
+ */
+bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+
+ return dist->save_its_tables_in_progress;
+}
+
static int vgic_its_set_attr(struct kvm_device *dev,
struct kvm_device_attr *attr)
{
diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c
index 24913271e898..8dd5a8fe64b4 100644
--- a/arch/arm64/mm/copypage.c
+++ b/arch/arm64/mm/copypage.c
@@ -21,9 +21,12 @@ void copy_highpage(struct page *to, struct page *from)
copy_page(kto, kfrom);
- if (system_supports_mte() && test_bit(PG_mte_tagged, &from->flags)) {
- set_bit(PG_mte_tagged, &to->flags);
+ if (system_supports_mte() && page_mte_tagged(from)) {
+ page_kasan_tag_reset(to);
+ /* It's a new page, shouldn't have been tagged yet */
+ WARN_ON_ONCE(!try_page_mte_tagging(to));
mte_copy_page_tags(kto, kfrom);
+ set_page_mte_tagged(to);
}
}
EXPORT_SYMBOL(copy_highpage);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 5b391490e045..0b1c102b89c9 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -30,6 +30,7 @@
#include <asm/bug.h>
#include <asm/cmpxchg.h>
#include <asm/cpufeature.h>
+#include <asm/efi.h>
#include <asm/exception.h>
#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
@@ -391,6 +392,9 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
msg = "paging request";
}
+ if (efi_runtime_fixup_exception(regs, msg))
+ return;
+
die_kernel_fault(msg, addr, esr, regs);
}
@@ -933,6 +937,8 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
void tag_clear_highpage(struct page *page)
{
+ /* Newly allocated page, shouldn't have been tagged yet */
+ WARN_ON_ONCE(!try_page_mte_tagging(page));
mte_zero_clear_page_tags(page_address(page));
- set_bit(PG_mte_tagged, &page->flags);
+ set_page_mte_tagged(page);
}
diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c
index bed803d8e158..cd508ba80ab1 100644
--- a/arch/arm64/mm/mteswap.c
+++ b/arch/arm64/mm/mteswap.c
@@ -24,7 +24,7 @@ int mte_save_tags(struct page *page)
{
void *tag_storage, *ret;
- if (!test_bit(PG_mte_tagged, &page->flags))
+ if (!page_mte_tagged(page))
return 0;
tag_storage = mte_allocate_tag_storage();
@@ -46,21 +46,17 @@ int mte_save_tags(struct page *page)
return 0;
}
-bool mte_restore_tags(swp_entry_t entry, struct page *page)
+void mte_restore_tags(swp_entry_t entry, struct page *page)
{
void *tags = xa_load(&mte_pages, entry.val);
if (!tags)
- return false;
+ return;
- /*
- * Test PG_mte_tagged again in case it was racing with another
- * set_pte_at().
- */
- if (!test_and_set_bit(PG_mte_tagged, &page->flags))
+ if (try_page_mte_tagging(page)) {
mte_restore_page_tags(page_address(page), tags);
-
- return true;
+ set_page_mte_tagged(page);
+ }
}
void mte_invalidate_tags(int type, pgoff_t offset)
diff --git a/arch/parisc/include/asm/hardware.h b/arch/parisc/include/asm/hardware.h
index 9d3d7737c58b..a005ebc54779 100644
--- a/arch/parisc/include/asm/hardware.h
+++ b/arch/parisc/include/asm/hardware.h
@@ -10,12 +10,12 @@
#define SVERSION_ANY_ID PA_SVERSION_ANY_ID
struct hp_hardware {
- unsigned short hw_type:5; /* HPHW_xxx */
- unsigned short hversion;
- unsigned long sversion:28;
- unsigned short opt;
- const char name[80]; /* The hardware description */
-};
+ unsigned int hw_type:8; /* HPHW_xxx */
+ unsigned int hversion:12;
+ unsigned int sversion:12;
+ unsigned char opt;
+ unsigned char name[59]; /* The hardware description */
+} __packed;
struct parisc_device;
diff --git a/arch/parisc/include/uapi/asm/pdc.h b/arch/parisc/include/uapi/asm/pdc.h
index e794e143ec5f..7a90070136e8 100644
--- a/arch/parisc/include/uapi/asm/pdc.h
+++ b/arch/parisc/include/uapi/asm/pdc.h
@@ -363,20 +363,25 @@
#if !defined(__ASSEMBLY__)
-/* flags of the device_path */
+/* flags for hardware_path */
#define PF_AUTOBOOT 0x80
#define PF_AUTOSEARCH 0x40
#define PF_TIMER 0x0F
-struct device_path { /* page 1-69 */
- unsigned char flags; /* flags see above! */
- unsigned char bc[6]; /* bus converter routing info */
- unsigned char mod;
- unsigned int layers[6];/* device-specific layer-info */
-} __attribute__((aligned(8))) ;
+struct hardware_path {
+ unsigned char flags; /* see bit definitions below */
+ signed char bc[6]; /* Bus Converter routing info to a specific */
+ /* I/O adaptor (< 0 means none, > 63 resvd) */
+ signed char mod; /* fixed field of specified module */
+};
+
+struct pdc_module_path { /* page 1-69 */
+ struct hardware_path path;
+ unsigned int layers[6]; /* device-specific info (ctlr #, unit # ...) */
+} __attribute__((aligned(8)));
struct pz_device {
- struct device_path dp; /* see above */
+ struct pdc_module_path dp; /* see above */
/* struct iomod *hpa; */
unsigned int hpa; /* HPA base address */
/* char *spa; */
@@ -611,21 +616,6 @@ struct pdc_initiator { /* PDC_INITIATOR */
int mode;
};
-struct hardware_path {
- char flags; /* see bit definitions below */
- char bc[6]; /* Bus Converter routing info to a specific */
- /* I/O adaptor (< 0 means none, > 63 resvd) */
- char mod; /* fixed field of specified module */
-};
-
-/*
- * Device path specifications used by PDC.
- */
-struct pdc_module_path {
- struct hardware_path path;
- unsigned int layers[6]; /* device-specific info (ctlr #, unit # ...) */
-};
-
/* Only used on some pre-PA2.0 boxes */
struct pdc_memory_map { /* PDC_MEMORY_MAP */
unsigned long hpa; /* mod's register set address */
diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c
index d126e78e101a..e7ee0c0c91d3 100644
--- a/arch/parisc/kernel/drivers.c
+++ b/arch/parisc/kernel/drivers.c
@@ -882,15 +882,13 @@ void __init walk_central_bus(void)
&root);
}
-static void print_parisc_device(struct parisc_device *dev)
+static __init void print_parisc_device(struct parisc_device *dev)
{
- char hw_path[64];
- static int count;
+ static int count __initdata;
- print_pa_hwpath(dev, hw_path);
- pr_info("%d. %s at %pap [%s] { %d, 0x%x, 0x%.3x, 0x%.5x }",
- ++count, dev->name, &(dev->hpa.start), hw_path, dev->id.hw_type,
- dev->id.hversion_rev, dev->id.hversion, dev->id.sversion);
+ pr_info("%d. %s at %pap { type:%d, hv:%#x, sv:%#x, rev:%#x }",
+ ++count, dev->name, &(dev->hpa.start), dev->id.hw_type,
+ dev->id.hversion, dev->id.sversion, dev->id.hversion_rev);
if (dev->num_addrs) {
int k;
@@ -1079,7 +1077,7 @@ static __init int qemu_print_iodc_data(struct device *lin_dev, void *data)
-static int print_one_device(struct device * dev, void * data)
+static __init int print_one_device(struct device * dev, void * data)
{
struct parisc_device * pdev = to_parisc_device(dev);
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 699df27b0e2f..2ca5418457ed 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -147,6 +147,7 @@ config PPC
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT
+ select ARCH_SPLIT_ARG64 if PPC32
select ARCH_STACKWALK
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x
@@ -285,7 +286,7 @@ config PPC
#
config PPC_LONG_DOUBLE_128
- depends on PPC64
+ depends on PPC64 && ALTIVEC
def_bool $(success,test "$(shell,echo __LONG_DOUBLE_128__ | $(CC) -E -P -)" = 1)
config PPC_BARRIER_NOSPEC
diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h
index a1142496cd58..6d51b007b59e 100644
--- a/arch/powerpc/include/asm/syscalls.h
+++ b/arch/powerpc/include/asm/syscalls.h
@@ -104,6 +104,13 @@ long sys_ppc_ftruncate64(unsigned int fd, u32 reg4,
unsigned long len1, unsigned long len2);
long sys_ppc32_fadvise64(int fd, u32 unused, u32 offset1, u32 offset2,
size_t len, int advice);
+long sys_ppc_sync_file_range2(int fd, unsigned int flags,
+ unsigned int offset1,
+ unsigned int offset2,
+ unsigned int nbytes1,
+ unsigned int nbytes2);
+long sys_ppc_fallocate(int fd, int mode, u32 offset1, u32 offset2,
+ u32 len1, u32 len2);
#endif
#ifdef CONFIG_COMPAT
long compat_sys_mmap2(unsigned long addr, size_t len,
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index 1ab4a4d95aba..d451a8229223 100644
--- a/arch/powerpc/kernel/sys_ppc32.c
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -112,7 +112,7 @@ PPC32_SYSCALL_DEFINE6(ppc32_fadvise64,
advice);
}
-COMPAT_SYSCALL_DEFINE6(ppc_sync_file_range2,
+PPC32_SYSCALL_DEFINE6(ppc_sync_file_range2,
int, fd, unsigned int, flags,
unsigned int, offset1, unsigned int, offset2,
unsigned int, nbytes1, unsigned int, nbytes2)
@@ -122,3 +122,14 @@ COMPAT_SYSCALL_DEFINE6(ppc_sync_file_range2,
return ksys_sync_file_range(fd, offset, nbytes, flags);
}
+
+#ifdef CONFIG_PPC32
+SYSCALL_DEFINE6(ppc_fallocate,
+ int, fd, int, mode,
+ u32, offset1, u32, offset2, u32, len1, u32, len2)
+{
+ return ksys_fallocate(fd, mode,
+ merge_64(offset1, offset2),
+ merge_64(len1, len2));
+}
+#endif
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index e9e0df4f9a61..a0be127475b1 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -394,8 +394,11 @@
305 common signalfd sys_signalfd compat_sys_signalfd
306 common timerfd_create sys_timerfd_create
307 common eventfd sys_eventfd
-308 common sync_file_range2 sys_sync_file_range2 compat_sys_ppc_sync_file_range2
-309 nospu fallocate sys_fallocate compat_sys_fallocate
+308 32 sync_file_range2 sys_ppc_sync_file_range2 compat_sys_ppc_sync_file_range2
+308 64 sync_file_range2 sys_sync_file_range2
+308 spu sync_file_range2 sys_sync_file_range2
+309 32 fallocate sys_ppc_fallocate compat_sys_fallocate
+309 64 fallocate sys_fallocate
310 nospu subpage_prot sys_subpage_prot
311 32 timerfd_settime sys_timerfd_settime32
311 64 timerfd_settime sys_timerfd_settime
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 928dcf7a20d9..b8998cf0508a 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -34,6 +34,8 @@
#define VE_GET_PORT_NUM(e) ((e) >> 16)
#define VE_IS_IO_STRING(e) ((e) & BIT(4))
+#define ATTR_SEPT_VE_DISABLE BIT(28)
+
/*
* Wrapper for standard use of __tdx_hypercall with no output aside from
* return code.
@@ -98,10 +100,11 @@ static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
}
-static u64 get_cc_mask(void)
+static void tdx_parse_tdinfo(u64 *cc_mask)
{
struct tdx_module_output out;
unsigned int gpa_width;
+ u64 td_attr;
/*
* TDINFO TDX module call is used to get the TD execution environment
@@ -109,19 +112,27 @@ static u64 get_cc_mask(void)
* information, etc. More details about the ABI can be found in TDX
* Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
* [TDG.VP.INFO].
+ */
+ tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out);
+
+ /*
+ * The highest bit of a guest physical address is the "sharing" bit.
+ * Set it for shared pages and clear it for private pages.
*
* The GPA width that comes out of this call is critical. TDX guests
* can not meaningfully run without it.
*/
- tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out);
-
gpa_width = out.rcx & GENMASK(5, 0);
+ *cc_mask = BIT_ULL(gpa_width - 1);
/*
- * The highest bit of a guest physical address is the "sharing" bit.
- * Set it for shared pages and clear it for private pages.
+ * The kernel can not handle #VE's when accessing normal kernel
+ * memory. Ensure that no #VE will be delivered for accesses to
+ * TD-private memory. Only VMM-shared memory (MMIO) will #VE.
*/
- return BIT_ULL(gpa_width - 1);
+ td_attr = out.rdx;
+ if (!(td_attr & ATTR_SEPT_VE_DISABLE))
+ panic("TD misconfiguration: SEPT_VE_DISABLE attibute must be set.\n");
}
/*
@@ -758,7 +769,7 @@ void __init tdx_early_init(void)
setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
cc_set_vendor(CC_VENDOR_INTEL);
- cc_mask = get_cc_mask();
+ tdx_parse_tdinfo(&cc_mask);
cc_set_mask(cc_mask);
/*
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index a646a5f9a235..1b92bf05fd65 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4911,6 +4911,7 @@ static const struct x86_cpu_desc isolation_ucodes[] = {
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 6, 0x00000000),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 7, 0x00000000),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 11, 0x00000000),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_L, 3, 0x0000007c),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE, 3, 0x0000007c),
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 9, 0x0000004e),
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 7839507b3844..446d2833efa7 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -982,8 +982,13 @@ struct event_constraint intel_icl_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), /* SLOTS */
INTEL_PLD_CONSTRAINT(0x1cd, 0xff), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf), /* MEM_INST_RETIRED.LOAD */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf), /* MEM_INST_RETIRED.STORE */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), /* MEM_LOAD_*_RETIRED.* */
@@ -1004,8 +1009,13 @@ struct event_constraint intel_spr_pebs_event_constraints[] = {
INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe),
INTEL_PLD_CONSTRAINT(0x1cd, 0xfe),
INTEL_PSD_CONSTRAINT(0x2cd, 0x1),
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf),
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf),
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index fea544e5842a..a829492bca4c 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -619,12 +619,8 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
case RAPL_UNIT_QUIRK_INTEL_HSW:
rapl_hw_unit[PERF_RAPL_RAM] = 16;
break;
- /*
- * SPR shares the same DRAM domain energy unit as HSW, plus it
- * also has a fixed energy unit for Psys domain.
- */
+ /* SPR uses a fixed energy unit for Psys domain. */
case RAPL_UNIT_QUIRK_INTEL_SPR:
- rapl_hw_unit[PERF_RAPL_RAM] = 16;
rapl_hw_unit[PERF_RAPL_PSYS] = 0;
break;
default:
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 5d75fe229342..347707d459c6 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -107,6 +107,11 @@
#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */
+#define INTEL_FAM6_EMERALDRAPIDS_X 0xCF
+
+#define INTEL_FAM6_GRANITERAPIDS_X 0xAD
+#define INTEL_FAM6_GRANITERAPIDS_D 0xAE
+
#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
#define INTEL_FAM6_ALDERLAKE_N 0xBE
@@ -118,7 +123,7 @@
#define INTEL_FAM6_METEORLAKE 0xAC
#define INTEL_FAM6_METEORLAKE_L 0xAA
-/* "Small Core" Processors (Atom) */
+/* "Small Core" Processors (Atom/E-Core) */
#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */
@@ -145,6 +150,10 @@
#define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */
#define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */
+#define INTEL_FAM6_SIERRAFOREST_X 0xAF
+
+#define INTEL_FAM6_GRANDRIDGE 0xB6
+
/* Xeon Phi */
#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7551b6f9c31c..b4dbde7d9eb1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2090,8 +2090,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
#define GET_SMSTATE(type, buf, offset) \
(*(type *)((buf) + (offset) - 0x7e00))
-int kvm_cpu_dirty_log_size(void);
-
int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
#define KVM_CLOCK_VALID_FLAGS \
diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h
index 59358d1bf880..fd2669b1cb2d 100644
--- a/arch/x86/include/asm/syscall_wrapper.h
+++ b/arch/x86/include/asm/syscall_wrapper.h
@@ -6,7 +6,7 @@
#ifndef _ASM_X86_SYSCALL_WRAPPER_H
#define _ASM_X86_SYSCALL_WRAPPER_H
-struct pt_regs;
+#include <asm/ptrace.h>
extern long __x64_sys_ni_syscall(const struct pt_regs *regs);
extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 7065462378e2..62bc7a01cecc 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -1133,11 +1133,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->eax = max(entry->eax, 0x80000021);
break;
case 0x80000001:
+ entry->ebx &= ~GENMASK(27, 16);
cpuid_entry_override(entry, CPUID_8000_0001_EDX);
cpuid_entry_override(entry, CPUID_8000_0001_ECX);
break;
case 0x80000006:
- /* L2 cache and TLB: pass through host info. */
+ /* Drop reserved bits, pass host L2 cache and TLB info. */
+ entry->edx &= ~GENMASK(17, 16);
break;
case 0x80000007: /* Advanced power management */
/* invariant TSC is CPUID.80000007H:EDX[8] */
@@ -1167,6 +1169,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
g_phys_as = phys_as;
entry->eax = g_phys_as | (virt_as << 8);
+ entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8));
entry->edx = 0;
cpuid_entry_override(entry, CPUID_8000_0008_EBX);
break;
@@ -1186,6 +1189,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->ecx = entry->edx = 0;
break;
case 0x8000001a:
+ entry->eax &= GENMASK(2, 0);
+ entry->ebx = entry->ecx = entry->edx = 0;
+ break;
case 0x8000001e:
break;
case 0x8000001F:
@@ -1193,7 +1199,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
} else {
cpuid_entry_override(entry, CPUID_8000_001F_EAX);
-
+ /* Clear NumVMPL since KVM does not support VMPL. */
+ entry->ebx &= ~GENMASK(31, 12);
/*
* Enumerate '0' for "PA bits reduction", the adjusted
* MAXPHYADDR is enumerated directly (see 0x80000008).
@@ -1331,7 +1338,7 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
if (sanity_check_entries(entries, cpuid->nent, type))
return -EINVAL;
- array.entries = kvcalloc(sizeof(struct kvm_cpuid_entry2), cpuid->nent, GFP_KERNEL);
+ array.entries = kvcalloc(cpuid->nent, sizeof(struct kvm_cpuid_entry2), GFP_KERNEL);
if (!array.entries)
return -ENOMEM;
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index cfed36aba2f7..c1390357126a 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -158,11 +158,16 @@ out:
static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file)
{
struct kvm *kvm = inode->i_private;
+ int r;
if (!kvm_get_kvm_safe(kvm))
return -ENOENT;
- return single_open(file, kvm_mmu_rmaps_stat_show, kvm);
+ r = single_open(file, kvm_mmu_rmaps_stat_show, kvm);
+ if (r < 0)
+ kvm_put_kvm(kvm);
+
+ return r;
}
static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 3b27622d4642..4a43261d25a2 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -791,8 +791,7 @@ static int linearize(struct x86_emulate_ctxt *ctxt,
ctxt->mode, linear);
}
-static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst,
- enum x86emul_mode mode)
+static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst)
{
ulong linear;
int rc;
@@ -802,41 +801,71 @@ static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst,
if (ctxt->op_bytes != sizeof(unsigned long))
addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1);
- rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear);
+ rc = __linearize(ctxt, addr, &max_size, 1, false, true, ctxt->mode, &linear);
if (rc == X86EMUL_CONTINUE)
ctxt->_eip = addr.ea;
return rc;
}
+static inline int emulator_recalc_and_set_mode(struct x86_emulate_ctxt *ctxt)
+{
+ u64 efer;
+ struct desc_struct cs;
+ u16 selector;
+ u32 base3;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if (!(ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PE)) {
+ /* Real mode. cpu must not have long mode active */
+ if (efer & EFER_LMA)
+ return X86EMUL_UNHANDLEABLE;
+ ctxt->mode = X86EMUL_MODE_REAL;
+ return X86EMUL_CONTINUE;
+ }
+
+ if (ctxt->eflags & X86_EFLAGS_VM) {
+ /* Protected/VM86 mode. cpu must not have long mode active */
+ if (efer & EFER_LMA)
+ return X86EMUL_UNHANDLEABLE;
+ ctxt->mode = X86EMUL_MODE_VM86;
+ return X86EMUL_CONTINUE;
+ }
+
+ if (!ctxt->ops->get_segment(ctxt, &selector, &cs, &base3, VCPU_SREG_CS))
+ return X86EMUL_UNHANDLEABLE;
+
+ if (efer & EFER_LMA) {
+ if (cs.l) {
+ /* Proper long mode */
+ ctxt->mode = X86EMUL_MODE_PROT64;
+ } else if (cs.d) {
+ /* 32 bit compatibility mode*/
+ ctxt->mode = X86EMUL_MODE_PROT32;
+ } else {
+ ctxt->mode = X86EMUL_MODE_PROT16;
+ }
+ } else {
+ /* Legacy 32 bit / 16 bit mode */
+ ctxt->mode = cs.d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
{
- return assign_eip(ctxt, dst, ctxt->mode);
+ return assign_eip(ctxt, dst);
}
-static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
- const struct desc_struct *cs_desc)
+static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst)
{
- enum x86emul_mode mode = ctxt->mode;
- int rc;
+ int rc = emulator_recalc_and_set_mode(ctxt);
-#ifdef CONFIG_X86_64
- if (ctxt->mode >= X86EMUL_MODE_PROT16) {
- if (cs_desc->l) {
- u64 efer = 0;
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
- if (efer & EFER_LMA)
- mode = X86EMUL_MODE_PROT64;
- } else
- mode = X86EMUL_MODE_PROT32; /* temporary value */
- }
-#endif
- if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32)
- mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
- rc = assign_eip(ctxt, dst, mode);
- if (rc == X86EMUL_CONTINUE)
- ctxt->mode = mode;
- return rc;
+ return assign_eip(ctxt, dst);
}
static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -2172,7 +2201,7 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
+ rc = assign_eip_far(ctxt, ctxt->src.val);
/* Error handling is not implemented. */
if (rc != X86EMUL_CONTINUE)
return X86EMUL_UNHANDLEABLE;
@@ -2250,7 +2279,7 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
&new_desc);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = assign_eip_far(ctxt, eip, &new_desc);
+ rc = assign_eip_far(ctxt, eip);
/* Error handling is not implemented. */
if (rc != X86EMUL_CONTINUE)
return X86EMUL_UNHANDLEABLE;
@@ -2432,7 +2461,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED;
ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0);
- for (i = 0; i < NR_EMULATOR_GPRS; i++)
+ for (i = 0; i < 8; i++)
*reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4);
val = GET_SMSTATE(u32, smstate, 0x7fcc);
@@ -2489,7 +2518,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
u16 selector;
int i, r;
- for (i = 0; i < NR_EMULATOR_GPRS; i++)
+ for (i = 0; i < 16; i++)
*reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8);
ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78);
@@ -2633,7 +2662,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
* those side effects need to be explicitly handled for both success
* and shutdown.
*/
- return X86EMUL_CONTINUE;
+ return emulator_recalc_and_set_mode(ctxt);
emulate_shutdown:
ctxt->ops->triple_fault(ctxt);
@@ -2876,6 +2905,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
ctxt->_eip = rdx;
+ ctxt->mode = usermode;
*reg_write(ctxt, VCPU_REGS_RSP) = rcx;
return X86EMUL_CONTINUE;
@@ -3469,7 +3499,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
+ rc = assign_eip_far(ctxt, ctxt->src.val);
if (rc != X86EMUL_CONTINUE)
goto fail;
@@ -3611,11 +3641,25 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt)
static int em_cr_write(struct x86_emulate_ctxt *ctxt)
{
- if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
+ int cr_num = ctxt->modrm_reg;
+ int r;
+
+ if (ctxt->ops->set_cr(ctxt, cr_num, ctxt->src.val))
return emulate_gp(ctxt, 0);
/* Disable writeback. */
ctxt->dst.type = OP_NONE;
+
+ if (cr_num == 0) {
+ /*
+ * CR0 write might have updated CR0.PE and/or CR0.PG
+ * which can affect the cpu's execution mode.
+ */
+ r = emulator_recalc_and_set_mode(ctxt);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+ }
+
return X86EMUL_CONTINUE;
}
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 87c4e46daf37..07254314f3dd 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -24,8 +24,6 @@ extern int __read_mostly pt_mode;
#define PMU_CAP_FW_WRITES (1ULL << 13)
#define PMU_CAP_LBR_FMT 0x3f
-#define DEBUGCTLMSR_LBR_MASK (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI)
-
struct nested_vmx_msrs {
/*
* We only store the "true" versions of the VMX capability MSRs. We
@@ -400,6 +398,7 @@ static inline bool vmx_pebs_supported(void)
static inline u64 vmx_get_perf_capabilities(void)
{
u64 perf_cap = PMU_CAP_FW_WRITES;
+ struct x86_pmu_lbr lbr;
u64 host_perf_cap = 0;
if (!enable_pmu)
@@ -408,7 +407,8 @@ static inline u64 vmx_get_perf_capabilities(void)
if (boot_cpu_has(X86_FEATURE_PDCM))
rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
- perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
+ if (x86_perf_get_lbr(&lbr) >= 0 && lbr.nr)
+ perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
if (vmx_pebs_supported()) {
perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
@@ -419,19 +419,6 @@ static inline u64 vmx_get_perf_capabilities(void)
return perf_cap;
}
-static inline u64 vmx_supported_debugctl(void)
-{
- u64 debugctl = 0;
-
- if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
- debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
-
- if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)
- debugctl |= DEBUGCTLMSR_LBR_MASK;
-
- return debugctl;
-}
-
static inline bool cpu_has_notify_vmexit(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 9dba04b6b019..63247c57c72c 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2021,15 +2021,17 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
return (unsigned long)data;
}
-static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
+static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
{
- u64 debugctl = vmx_supported_debugctl();
+ u64 debugctl = 0;
- if (!intel_pmu_lbr_is_enabled(vcpu))
- debugctl &= ~DEBUGCTLMSR_LBR_MASK;
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
+ (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
+ debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
- debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
+ if ((vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) &&
+ (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
+ debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
return debugctl;
}
@@ -2103,7 +2105,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
case MSR_IA32_DEBUGCTLMSR: {
- u64 invalid = data & ~vcpu_supported_debugctl(vcpu);
+ u64 invalid;
+
+ invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
if (report_ignored_msrs)
vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
@@ -8263,6 +8267,11 @@ static __init int hardware_setup(void)
if (!cpu_has_virtual_nmis())
enable_vnmi = 0;
+#ifdef CONFIG_X86_SGX_KVM
+ if (!cpu_has_vmx_encls_vmexit())
+ enable_sgx = false;
+#endif
+
/*
* set_apic_access_page_addr() is used to reload apic access
* page upon invalidation. No need to do anything if not
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9cf1ba865562..a228a1b872bb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2315,11 +2315,11 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
/* we verify if the enable bit is set... */
if (system_time & 1) {
- kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
- KVM_HOST_USES_PFN, system_time & ~1ULL,
- sizeof(struct pvclock_vcpu_time_info));
+ kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
+ KVM_HOST_USES_PFN, system_time & ~1ULL,
+ sizeof(struct pvclock_vcpu_time_info));
} else {
- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
+ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
}
return;
@@ -3388,7 +3388,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
+ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
vcpu->arch.time = 0;
}
@@ -10044,7 +10044,20 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
kvm_x86_ops.nested_ops->has_events(vcpu))
*req_immediate_exit = true;
- WARN_ON(kvm_is_exception_pending(vcpu));
+ /*
+ * KVM must never queue a new exception while injecting an event; KVM
+ * is done emulating and should only propagate the to-be-injected event
+ * to the VMCS/VMCB. Queueing a new exception can put the vCPU into an
+ * infinite loop as KVM will bail from VM-Enter to inject the pending
+ * exception and start the cycle all over.
+ *
+ * Exempt triple faults as they have special handling and won't put the
+ * vCPU into an infinite loop. Triple fault can be queued when running
+ * VMX without unrestricted guest, as that requires KVM to emulate Real
+ * Mode events (see kvm_inject_realmode_interrupt()).
+ */
+ WARN_ON_ONCE(vcpu->arch.exception.pending ||
+ vcpu->arch.exception_vmexit.pending);
return 0;
out:
@@ -10391,7 +10404,10 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
kvm->arch.apicv_inhibit_reasons = new;
if (new) {
unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
+ int idx = srcu_read_lock(&kvm->srcu);
+
kvm_zap_gfn_range(kvm, gfn, gfn+1);
+ srcu_read_unlock(&kvm->srcu, idx);
}
} else {
kvm->arch.apicv_inhibit_reasons = new;
@@ -10499,20 +10515,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
bool req_immediate_exit = false;
- /* Forbid vmenter if vcpu dirty ring is soft-full */
- if (unlikely(vcpu->kvm->dirty_ring_size &&
- kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) {
- vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
- trace_kvm_dirty_ring_exit(vcpu);
- r = 0;
- goto out;
- }
-
if (kvm_request_pending(vcpu)) {
if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
r = -EIO;
goto out;
}
+
+ if (kvm_dirty_ring_check_request(vcpu)) {
+ r = 0;
+ goto out;
+ }
+
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
r = 0;
@@ -11816,6 +11829,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
+ kvm_gpc_init(&vcpu->arch.pv_time);
+
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 93c628d3e3a9..2dae413bd62a 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -42,13 +42,13 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
int idx = srcu_read_lock(&kvm->srcu);
if (gfn == GPA_INVALID) {
- kvm_gfn_to_pfn_cache_destroy(kvm, gpc);
+ kvm_gpc_deactivate(kvm, gpc);
goto out;
}
do {
- ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN,
- gpa, PAGE_SIZE);
+ ret = kvm_gpc_activate(kvm, gpc, NULL, KVM_HOST_USES_PFN, gpa,
+ PAGE_SIZE);
if (ret)
goto out;
@@ -554,15 +554,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
offsetof(struct compat_vcpu_info, time));
if (data->u.gpa == GPA_INVALID) {
- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
+ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
r = 0;
break;
}
- r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
- &vcpu->arch.xen.vcpu_info_cache,
- NULL, KVM_HOST_USES_PFN, data->u.gpa,
- sizeof(struct vcpu_info));
+ r = kvm_gpc_activate(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_info_cache, NULL,
+ KVM_HOST_USES_PFN, data->u.gpa,
+ sizeof(struct vcpu_info));
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -570,16 +570,16 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
if (data->u.gpa == GPA_INVALID) {
- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
- &vcpu->arch.xen.vcpu_time_info_cache);
+ kvm_gpc_deactivate(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_time_info_cache);
r = 0;
break;
}
- r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
- &vcpu->arch.xen.vcpu_time_info_cache,
- NULL, KVM_HOST_USES_PFN, data->u.gpa,
- sizeof(struct pvclock_vcpu_time_info));
+ r = kvm_gpc_activate(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_time_info_cache,
+ NULL, KVM_HOST_USES_PFN, data->u.gpa,
+ sizeof(struct pvclock_vcpu_time_info));
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
break;
@@ -590,16 +590,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
if (data->u.gpa == GPA_INVALID) {
- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
- &vcpu->arch.xen.runstate_cache);
+ kvm_gpc_deactivate(vcpu->kvm,
+ &vcpu->arch.xen.runstate_cache);
r = 0;
break;
}
- r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
- &vcpu->arch.xen.runstate_cache,
- NULL, KVM_HOST_USES_PFN, data->u.gpa,
- sizeof(struct vcpu_runstate_info));
+ r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
+ NULL, KVM_HOST_USES_PFN, data->u.gpa,
+ sizeof(struct vcpu_runstate_info));
break;
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
@@ -1667,18 +1666,18 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm,
case EVTCHNSTAT_ipi:
/* IPI must map back to the same port# */
if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port)
- goto out; /* -EINVAL */
+ goto out_noeventfd; /* -EINVAL */
break;
case EVTCHNSTAT_interdomain:
if (data->u.evtchn.deliver.port.port) {
if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm))
- goto out; /* -EINVAL */
+ goto out_noeventfd; /* -EINVAL */
} else {
eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd);
if (IS_ERR(eventfd)) {
ret = PTR_ERR(eventfd);
- goto out;
+ goto out_noeventfd;
}
}
break;
@@ -1718,6 +1717,7 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm,
out:
if (eventfd)
eventfd_ctx_put(eventfd);
+out_noeventfd:
kfree(evtchnfd);
return ret;
}
@@ -1816,7 +1816,12 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
{
vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
vcpu->arch.xen.poll_evtchn = 0;
+
timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
+
+ kvm_gpc_init(&vcpu->arch.xen.runstate_cache);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache);
}
void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
@@ -1824,18 +1829,17 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
if (kvm_xen_timer_enabled(vcpu))
kvm_xen_stop_timer(vcpu);
- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
- &vcpu->arch.xen.runstate_cache);
- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
- &vcpu->arch.xen.vcpu_info_cache);
- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
- &vcpu->arch.xen.vcpu_time_info_cache);
+ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache);
+ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
+ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache);
+
del_timer_sync(&vcpu->arch.xen.poll_timer);
}
void kvm_xen_init_vm(struct kvm *kvm)
{
idr_init(&kvm->arch.xen.evtchn_ports);
+ kvm_gpc_init(&kvm->arch.xen.shinfo_cache);
}
void kvm_xen_destroy_vm(struct kvm *kvm)
@@ -1843,7 +1847,7 @@ void kvm_xen_destroy_vm(struct kvm *kvm)
struct evtchnfd *evtchnfd;
int i;
- kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
+ kvm_gpc_deactivate(kvm, &kvm->arch.xen.shinfo_cache);
idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
if (!evtchnfd->deliver.port.port)
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index 68aff1382872..246d67dab510 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -302,7 +302,7 @@ static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
static bool pmu_msr_chk_emulated(unsigned int msr, uint64_t *val, bool is_read,
bool *emul)
{
- int type, index;
+ int type, index = 0;
if (is_amd_pmu_msr(msr))
*emul = xen_amd_pmu_emulate(msr, val, is_read);
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index cfa99e8f054b..4f4309500559 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -910,17 +910,9 @@ static int register_callback(unsigned type, const void *func)
void xen_enable_sysenter(void)
{
- int ret;
- unsigned sysenter_feature;
-
- sysenter_feature = X86_FEATURE_SYSENTER32;
-
- if (!boot_cpu_has(sysenter_feature))
- return;
-
- ret = register_callback(CALLBACKTYPE_sysenter, xen_entry_SYSENTER_compat);
- if(ret != 0)
- setup_clear_cpu_cap(sysenter_feature);
+ if (cpu_feature_enabled(X86_FEATURE_SYSENTER32) &&
+ register_callback(CALLBACKTYPE_sysenter, xen_entry_SYSENTER_compat))
+ setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
}
void xen_enable_syscall(void)
@@ -934,12 +926,9 @@ void xen_enable_syscall(void)
mechanism for syscalls. */
}
- if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
- ret = register_callback(CALLBACKTYPE_syscall32,
- xen_entry_SYSCALL_compat);
- if (ret != 0)
- setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
- }
+ if (cpu_feature_enabled(X86_FEATURE_SYSCALL32) &&
+ register_callback(CALLBACKTYPE_syscall32, xen_entry_SYSCALL_compat))
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
}
static void __init xen_pvmmu_arch_setup(void)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 75c8296b6feb..6a789cda68a5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1262,6 +1262,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
(!blk_queue_nomerges(rq->q) &&
blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
blk_mq_flush_plug_list(plug, false);
+ last = NULL;
trace_block_plug(rq->q);
}
@@ -4193,9 +4194,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
return 0;
err_hctxs:
- xa_destroy(&q->hctx_table);
- q->nr_hw_queues = 0;
- blk_mq_sysfs_deinit(q);
+ blk_mq_release(q);
err_poll:
blk_stat_free_callback(q->poll_cb);
q->poll_cb = NULL;
diff --git a/block/genhd.c b/block/genhd.c
index fee90eb98b4a..0f9769db2de8 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -527,6 +527,7 @@ out_unregister_bdi:
bdi_unregister(disk->bdi);
out_unregister_queue:
blk_unregister_queue(disk);
+ rq_qos_exit(disk->queue);
out_put_slave_dir:
kobject_put(disk->slave_dir);
out_put_holder_dir:
diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index 3b818ab186be..1f4fc5f8a819 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -327,6 +327,7 @@ static int __init acpi_parse_cfmws(union acpi_subtable_headers *header,
pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n",
node, start, end);
}
+ node_set(node, numa_nodes_parsed);
/* Set the next available fake_pxm value */
(*fake_pxm)++;
diff --git a/drivers/acpi/x86/utils.c b/drivers/acpi/x86/utils.c
index f8a2cbdc0ce2..d7d3f1669d4c 100644
--- a/drivers/acpi/x86/utils.c
+++ b/drivers/acpi/x86/utils.c
@@ -219,6 +219,12 @@ static const struct dmi_system_id force_storage_d3_dmi[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 14 7425 2-in-1"),
}
},
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 16 5625"),
+ }
+ },
{}
};
diff --git a/drivers/ata/pata_legacy.c b/drivers/ata/pata_legacy.c
index 0a8bf09a5c19..03c580625c2c 100644
--- a/drivers/ata/pata_legacy.c
+++ b/drivers/ata/pata_legacy.c
@@ -315,9 +315,10 @@ static void pdc20230_set_piomode(struct ata_port *ap, struct ata_device *adev)
outb(inb(0x1F4) & 0x07, 0x1F4);
rt = inb(0x1F3);
- rt &= 0x07 << (3 * adev->devno);
+ rt &= ~(0x07 << (3 * !adev->devno));
if (pio)
- rt |= (1 + 3 * pio) << (3 * adev->devno);
+ rt |= (1 + 3 * pio) << (3 * !adev->devno);
+ outb(rt, 0x1F3);
udelay(100);
outb(inb(0x1F2) | 0x01, 0x1F2);
diff --git a/drivers/ata/pata_palmld.c b/drivers/ata/pata_palmld.c
index 400e65190904..51caa2a427dd 100644
--- a/drivers/ata/pata_palmld.c
+++ b/drivers/ata/pata_palmld.c
@@ -63,8 +63,8 @@ static int palmld_pata_probe(struct platform_device *pdev)
/* remap drive's physical memory address */
mem = devm_platform_ioremap_resource(pdev, 0);
- if (!mem)
- return -ENOMEM;
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
/* request and activate power and reset GPIOs */
lda->power = devm_gpiod_get(dev, "power", GPIOD_OUT_HIGH);
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index db1b4b202646..a41145d52de9 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -408,6 +408,12 @@ config BLK_DEV_UBLK
definition isn't finalized yet, and might change according to future
requirement, so mark is as experimental now.
+ Say Y if you want to get better performance because task_work_add()
+ can be used in IO path for replacing io_uring cmd, which will become
+ shared between IO tasks and ubq daemon, meantime task_work_add() can
+ can handle batch more effectively, but task_work_add() isn't exported
+ for module, so ublk has to be built to kernel.
+
source "drivers/block/rnbd/Kconfig"
endif # BLK_DEV
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 5afce6ffaadf..f96cb01e9604 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -57,11 +57,14 @@
#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
struct ublk_rq_data {
- struct callback_head work;
+ union {
+ struct callback_head work;
+ struct llist_node node;
+ };
};
struct ublk_uring_cmd_pdu {
- struct request *req;
+ struct ublk_queue *ubq;
};
/*
@@ -119,6 +122,8 @@ struct ublk_queue {
struct task_struct *ubq_daemon;
char *io_cmd_buf;
+ struct llist_head io_cmds;
+
unsigned long io_addr; /* mapped vm address */
unsigned int max_io_sz;
bool force_abort;
@@ -764,8 +769,12 @@ static inline void __ublk_rq_task_work(struct request *req)
static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
{
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+ struct ublk_queue *ubq = pdu->ubq;
+ struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
+ struct ublk_rq_data *data;
- __ublk_rq_task_work(pdu->req);
+ llist_for_each_entry(data, io_cmds, node)
+ __ublk_rq_task_work(blk_mq_rq_from_pdu(data));
}
static void ublk_rq_task_work_fn(struct callback_head *work)
@@ -777,6 +786,54 @@ static void ublk_rq_task_work_fn(struct callback_head *work)
__ublk_rq_task_work(req);
}
+static void ublk_submit_cmd(struct ublk_queue *ubq, const struct request *rq)
+{
+ struct ublk_io *io = &ubq->ios[rq->tag];
+
+ /*
+ * If the check pass, we know that this is a re-issued request aborted
+ * previously in monitor_work because the ubq_daemon(cmd's task) is
+ * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
+ * because this ioucmd's io_uring context may be freed now if no inflight
+ * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work.
+ *
+ * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing
+ * the tag). Then the request is re-started(allocating the tag) and we are here.
+ * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
+ * guarantees that here is a re-issued request aborted previously.
+ */
+ if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) {
+ struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
+ struct ublk_rq_data *data;
+
+ llist_for_each_entry(data, io_cmds, node)
+ __ublk_abort_rq(ubq, blk_mq_rq_from_pdu(data));
+ } else {
+ struct io_uring_cmd *cmd = io->cmd;
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+ pdu->ubq = ubq;
+ io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
+ }
+}
+
+static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq,
+ bool last)
+{
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
+
+ if (ublk_can_use_task_work(ubq)) {
+ enum task_work_notify_mode notify_mode = last ?
+ TWA_SIGNAL_NO_IPI : TWA_NONE;
+
+ if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode))
+ __ublk_abort_rq(ubq, rq);
+ } else {
+ if (llist_add(&data->node, &ubq->io_cmds))
+ ublk_submit_cmd(ubq, rq);
+ }
+}
+
static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
@@ -788,6 +845,7 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
res = ublk_setup_iod(ubq, rq);
if (unlikely(res != BLK_STS_OK))
return BLK_STS_IOERR;
+
/* With recovery feature enabled, force_abort is set in
* ublk_stop_dev() before calling del_gendisk(). We have to
* abort all requeued and new rqs here to let del_gendisk()
@@ -803,41 +861,11 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_mq_start_request(bd->rq);
if (unlikely(ubq_daemon_is_dying(ubq))) {
- fail:
__ublk_abort_rq(ubq, rq);
return BLK_STS_OK;
}
- if (ublk_can_use_task_work(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
- enum task_work_notify_mode notify_mode = bd->last ?
- TWA_SIGNAL_NO_IPI : TWA_NONE;
-
- if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode))
- goto fail;
- } else {
- struct ublk_io *io = &ubq->ios[rq->tag];
- struct io_uring_cmd *cmd = io->cmd;
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
-
- /*
- * If the check pass, we know that this is a re-issued request aborted
- * previously in monitor_work because the ubq_daemon(cmd's task) is
- * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
- * because this ioucmd's io_uring context may be freed now if no inflight
- * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work.
- *
- * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing
- * the tag). Then the request is re-started(allocating the tag) and we are here.
- * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
- * guarantees that here is a re-issued request aborted previously.
- */
- if ((io->flags & UBLK_IO_FLAG_ABORTED))
- goto fail;
-
- pdu->req = rq;
- io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
- }
+ ublk_queue_cmd(ubq, rq, bd->last);
return BLK_STS_OK;
}
@@ -1164,22 +1192,12 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
}
static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
- int tag, struct io_uring_cmd *cmd)
+ int tag)
{
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
- if (ublk_can_use_task_work(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
- /* should not fail since we call it just in ubq->ubq_daemon */
- task_work_add(ubq->ubq_daemon, &data->work, TWA_SIGNAL_NO_IPI);
- } else {
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
-
- pdu->req = req;
- io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
- }
+ ublk_queue_cmd(ubq, req, true);
}
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
@@ -1267,7 +1285,7 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
io->addr = ub_cmd->addr;
io->cmd = cmd;
io->flags |= UBLK_IO_FLAG_ACTIVE;
- ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag, cmd);
+ ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag);
break;
default:
goto out;
@@ -1658,6 +1676,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
*/
ub->dev_info.flags &= UBLK_F_ALL;
+ if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK))
+ ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK;
+
/* We are not ready to support zero copy */
ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
diff --git a/drivers/bluetooth/virtio_bt.c b/drivers/bluetooth/virtio_bt.c
index 67c21263f9e0..fd281d439505 100644
--- a/drivers/bluetooth/virtio_bt.c
+++ b/drivers/bluetooth/virtio_bt.c
@@ -219,7 +219,7 @@ static void virtbt_rx_work(struct work_struct *work)
if (!skb)
return;
- skb->len = len;
+ skb_put(skb, len);
virtbt_rx_handle(vbt, skb);
if (virtbt_add_inbuf(vbt) < 0)
diff --git a/drivers/clk/clk-renesas-pcie.c b/drivers/clk/clk-renesas-pcie.c
index 4f5df1fc74b4..e6247141d0c0 100644
--- a/drivers/clk/clk-renesas-pcie.c
+++ b/drivers/clk/clk-renesas-pcie.c
@@ -90,13 +90,66 @@ static const struct regmap_access_table rs9_writeable_table = {
.n_yes_ranges = ARRAY_SIZE(rs9_writeable_ranges),
};
+static int rs9_regmap_i2c_write(void *context,
+ unsigned int reg, unsigned int val)
+{
+ struct i2c_client *i2c = context;
+ const u8 data[3] = { reg, 1, val };
+ const int count = ARRAY_SIZE(data);
+ int ret;
+
+ ret = i2c_master_send(i2c, data, count);
+ if (ret == count)
+ return 0;
+ else if (ret < 0)
+ return ret;
+ else
+ return -EIO;
+}
+
+static int rs9_regmap_i2c_read(void *context,
+ unsigned int reg, unsigned int *val)
+{
+ struct i2c_client *i2c = context;
+ struct i2c_msg xfer[2];
+ u8 txdata = reg;
+ u8 rxdata[2];
+ int ret;
+
+ xfer[0].addr = i2c->addr;
+ xfer[0].flags = 0;
+ xfer[0].len = 1;
+ xfer[0].buf = (void *)&txdata;
+
+ xfer[1].addr = i2c->addr;
+ xfer[1].flags = I2C_M_RD;
+ xfer[1].len = 2;
+ xfer[1].buf = (void *)rxdata;
+
+ ret = i2c_transfer(i2c->adapter, xfer, 2);
+ if (ret < 0)
+ return ret;
+ if (ret != 2)
+ return -EIO;
+
+ /*
+ * Byte 0 is transfer length, which is always 1 due
+ * to BCP register programming to 1 in rs9_probe(),
+ * ignore it and use data from Byte 1.
+ */
+ *val = rxdata[1];
+ return 0;
+}
+
static const struct regmap_config rs9_regmap_config = {
.reg_bits = 8,
.val_bits = 8,
- .cache_type = REGCACHE_FLAT,
- .max_register = 0x8,
+ .cache_type = REGCACHE_NONE,
+ .max_register = RS9_REG_BCP,
.rd_table = &rs9_readable_table,
.wr_table = &rs9_writeable_table,
+ .reg_write = rs9_regmap_i2c_write,
+ .reg_read = rs9_regmap_i2c_read,
};
static int rs9_get_output_config(struct rs9_driver_data *rs9, int idx)
@@ -242,11 +295,17 @@ static int rs9_probe(struct i2c_client *client)
return ret;
}
- rs9->regmap = devm_regmap_init_i2c(client, &rs9_regmap_config);
+ rs9->regmap = devm_regmap_init(&client->dev, NULL,
+ client, &rs9_regmap_config);
if (IS_ERR(rs9->regmap))
return dev_err_probe(&client->dev, PTR_ERR(rs9->regmap),
"Failed to allocate register map\n");
+ /* Always read back 1 Byte via I2C */
+ ret = regmap_write(rs9->regmap, RS9_REG_BCP, 1);
+ if (ret < 0)
+ return ret;
+
/* Register clock */
for (i = 0; i < rs9->chip_info->num_clks; i++) {
snprintf(name, 5, "DIF%d", i);
diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index c3c3f8c07258..57b83665e5c3 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -1459,10 +1459,14 @@ static void clk_core_init_rate_req(struct clk_core * const core,
{
struct clk_core *parent;
- if (WARN_ON(!core || !req))
+ if (WARN_ON(!req))
return;
memset(req, 0, sizeof(*req));
+ req->max_rate = ULONG_MAX;
+
+ if (!core)
+ return;
req->rate = rate;
clk_core_get_boundaries(core, &req->min_rate, &req->max_rate);
diff --git a/drivers/clk/mediatek/clk-mt8195-topckgen.c b/drivers/clk/mediatek/clk-mt8195-topckgen.c
index 8cbab5ca2e58..1e016329c1d2 100644
--- a/drivers/clk/mediatek/clk-mt8195-topckgen.c
+++ b/drivers/clk/mediatek/clk-mt8195-topckgen.c
@@ -1270,8 +1270,10 @@ static int clk_mt8195_topck_probe(struct platform_device *pdev)
hw = devm_clk_hw_register_mux(&pdev->dev, "mfg_ck_fast_ref", mfg_fast_parents,
ARRAY_SIZE(mfg_fast_parents), CLK_SET_RATE_PARENT,
(base + 0x250), 8, 1, 0, &mt8195_clk_lock);
- if (IS_ERR(hw))
+ if (IS_ERR(hw)) {
+ r = PTR_ERR(hw);
goto unregister_muxes;
+ }
top_clk_data->hws[CLK_TOP_MFG_CK_FAST_REF] = hw;
r = clk_mt8195_reg_mfg_mux_notifier(&pdev->dev,
diff --git a/drivers/clk/qcom/gcc-sc7280.c b/drivers/clk/qcom/gcc-sc7280.c
index 8afb7575e712..46d41ebce2b0 100644
--- a/drivers/clk/qcom/gcc-sc7280.c
+++ b/drivers/clk/qcom/gcc-sc7280.c
@@ -3467,6 +3467,7 @@ static int gcc_sc7280_probe(struct platform_device *pdev)
regmap_update_bits(regmap, 0x28004, BIT(0), BIT(0));
regmap_update_bits(regmap, 0x28014, BIT(0), BIT(0));
regmap_update_bits(regmap, 0x71004, BIT(0), BIT(0));
+ regmap_update_bits(regmap, 0x7100C, BIT(13), BIT(13));
ret = qcom_cc_register_rcg_dfs(regmap, gcc_dfs_clocks,
ARRAY_SIZE(gcc_dfs_clocks));
diff --git a/drivers/clk/qcom/gpucc-sc7280.c b/drivers/clk/qcom/gpucc-sc7280.c
index 9a832f2bcf49..1490cd45a654 100644
--- a/drivers/clk/qcom/gpucc-sc7280.c
+++ b/drivers/clk/qcom/gpucc-sc7280.c
@@ -463,6 +463,7 @@ static int gpu_cc_sc7280_probe(struct platform_device *pdev)
*/
regmap_update_bits(regmap, 0x1170, BIT(0), BIT(0));
regmap_update_bits(regmap, 0x1098, BIT(0), BIT(0));
+ regmap_update_bits(regmap, 0x1098, BIT(13), BIT(13));
return qcom_cc_really_probe(pdev, &gpu_cc_sc7280_desc, regmap);
}
diff --git a/drivers/clk/renesas/r8a779g0-cpg-mssr.c b/drivers/clk/renesas/r8a779g0-cpg-mssr.c
index 9641122133b5..d5b325e3c539 100644
--- a/drivers/clk/renesas/r8a779g0-cpg-mssr.c
+++ b/drivers/clk/renesas/r8a779g0-cpg-mssr.c
@@ -47,6 +47,7 @@ enum clk_ids {
CLK_S0_VIO,
CLK_S0_VC,
CLK_S0_HSC,
+ CLK_SASYNCPER,
CLK_SV_VIP,
CLK_SV_IR,
CLK_SDSRC,
@@ -84,6 +85,7 @@ static const struct cpg_core_clk r8a779g0_core_clks[] __initconst = {
DEF_FIXED(".s0_vio", CLK_S0_VIO, CLK_PLL1_DIV2, 2, 1),
DEF_FIXED(".s0_vc", CLK_S0_VC, CLK_PLL1_DIV2, 2, 1),
DEF_FIXED(".s0_hsc", CLK_S0_HSC, CLK_PLL1_DIV2, 2, 1),
+ DEF_FIXED(".sasyncper", CLK_SASYNCPER, CLK_PLL5_DIV4, 3, 1),
DEF_FIXED(".sv_vip", CLK_SV_VIP, CLK_PLL1, 5, 1),
DEF_FIXED(".sv_ir", CLK_SV_IR, CLK_PLL1, 5, 1),
DEF_BASE(".sdsrc", CLK_SDSRC, CLK_TYPE_GEN4_SDSRC, CLK_PLL5),
@@ -128,6 +130,9 @@ static const struct cpg_core_clk r8a779g0_core_clks[] __initconst = {
DEF_FIXED("s0d4_hsc", R8A779G0_CLK_S0D4_HSC, CLK_S0_HSC, 4, 1),
DEF_FIXED("cl16m_hsc", R8A779G0_CLK_CL16M_HSC, CLK_S0_HSC, 48, 1),
DEF_FIXED("s0d2_cc", R8A779G0_CLK_S0D2_CC, CLK_S0, 2, 1),
+ DEF_FIXED("sasyncperd1",R8A779G0_CLK_SASYNCPERD1, CLK_SASYNCPER,1, 1),
+ DEF_FIXED("sasyncperd2",R8A779G0_CLK_SASYNCPERD2, CLK_SASYNCPER,2, 1),
+ DEF_FIXED("sasyncperd4",R8A779G0_CLK_SASYNCPERD4, CLK_SASYNCPER,4, 1),
DEF_FIXED("svd1_ir", R8A779G0_CLK_SVD1_IR, CLK_SV_IR, 1, 1),
DEF_FIXED("svd2_ir", R8A779G0_CLK_SVD2_IR, CLK_SV_IR, 2, 1),
DEF_FIXED("svd1_vip", R8A779G0_CLK_SVD1_VIP, CLK_SV_VIP, 1, 1),
@@ -153,10 +158,10 @@ static const struct mssr_mod_clk r8a779g0_mod_clks[] __initconst = {
DEF_MOD("avb0", 211, R8A779G0_CLK_S0D4_HSC),
DEF_MOD("avb1", 212, R8A779G0_CLK_S0D4_HSC),
DEF_MOD("avb2", 213, R8A779G0_CLK_S0D4_HSC),
- DEF_MOD("hscif0", 514, R8A779G0_CLK_S0D3_PER),
- DEF_MOD("hscif1", 515, R8A779G0_CLK_S0D3_PER),
- DEF_MOD("hscif2", 516, R8A779G0_CLK_S0D3_PER),
- DEF_MOD("hscif3", 517, R8A779G0_CLK_S0D3_PER),
+ DEF_MOD("hscif0", 514, R8A779G0_CLK_SASYNCPERD1),
+ DEF_MOD("hscif1", 515, R8A779G0_CLK_SASYNCPERD1),
+ DEF_MOD("hscif2", 516, R8A779G0_CLK_SASYNCPERD1),
+ DEF_MOD("hscif3", 517, R8A779G0_CLK_SASYNCPERD1),
DEF_MOD("i2c0", 518, R8A779G0_CLK_S0D6_PER),
DEF_MOD("i2c1", 519, R8A779G0_CLK_S0D6_PER),
DEF_MOD("i2c2", 520, R8A779G0_CLK_S0D6_PER),
diff --git a/drivers/clk/sifive/Kconfig b/drivers/clk/sifive/Kconfig
index 9132c3c4aa86..b7fde0aadfcb 100644
--- a/drivers/clk/sifive/Kconfig
+++ b/drivers/clk/sifive/Kconfig
@@ -2,7 +2,8 @@
menuconfig CLK_SIFIVE
bool "SiFive SoC driver support"
- depends on RISCV || COMPILE_TEST
+ depends on SOC_SIFIVE || COMPILE_TEST
+ default SOC_SIFIVE
help
SoC drivers for SiFive Linux-capable SoCs.
@@ -10,6 +11,7 @@ if CLK_SIFIVE
config CLK_SIFIVE_PRCI
bool "PRCI driver for SiFive SoCs"
+ default SOC_SIFIVE
select RESET_CONTROLLER
select RESET_SIMPLE
select CLK_ANALOGBITS_WRPLL_CLN28HPC
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 16176b9278b4..0c90f13870a4 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -174,7 +174,7 @@ int cxl_mbox_send_cmd(struct cxl_dev_state *cxlds, u16 opcode, void *in,
};
int rc;
- if (out_size > cxlds->payload_size)
+ if (in_size > cxlds->payload_size || out_size > cxlds->payload_size)
return -E2BIG;
rc = cxlds->mbox_send(cxlds, &mbox_cmd);
diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c
index 1d12a8206444..36aa5070d902 100644
--- a/drivers/cxl/core/pmem.c
+++ b/drivers/cxl/core/pmem.c
@@ -188,6 +188,7 @@ static void cxl_nvdimm_release(struct device *dev)
{
struct cxl_nvdimm *cxl_nvd = to_cxl_nvdimm(dev);
+ xa_destroy(&cxl_nvd->pmem_regions);
kfree(cxl_nvd);
}
@@ -230,6 +231,7 @@ static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_memdev *cxlmd)
dev = &cxl_nvd->dev;
cxl_nvd->cxlmd = cxlmd;
+ xa_init(&cxl_nvd->pmem_regions);
device_initialize(dev);
lockdep_set_class(&dev->mutex, &cxl_nvdimm_key);
device_set_pm_not_required(dev);
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index bffde862de0b..e7556864ea80 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -811,6 +811,7 @@ static struct cxl_dport *find_dport(struct cxl_port *port, int id)
static int add_dport(struct cxl_port *port, struct cxl_dport *new)
{
struct cxl_dport *dup;
+ int rc;
device_lock_assert(&port->dev);
dup = find_dport(port, new->port_id);
@@ -821,8 +822,14 @@ static int add_dport(struct cxl_port *port, struct cxl_dport *new)
dev_name(dup->dport));
return -EBUSY;
}
- return xa_insert(&port->dports, (unsigned long)new->dport, new,
- GFP_KERNEL);
+
+ rc = xa_insert(&port->dports, (unsigned long)new->dport, new,
+ GFP_KERNEL);
+ if (rc)
+ return rc;
+
+ port->nr_dports++;
+ return 0;
}
/*
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 401148016978..f9ae5ad284ff 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -174,7 +174,8 @@ static int cxl_region_decode_commit(struct cxl_region *cxlr)
iter = to_cxl_port(iter->dev.parent)) {
cxl_rr = cxl_rr_load(iter, cxlr);
cxld = cxl_rr->decoder;
- rc = cxld->commit(cxld);
+ if (cxld->commit)
+ rc = cxld->commit(cxld);
if (rc)
break;
}
@@ -657,6 +658,9 @@ static struct cxl_region_ref *alloc_region_ref(struct cxl_port *port,
xa_for_each(&port->regions, index, iter) {
struct cxl_region_params *ip = &iter->region->params;
+ if (!ip->res)
+ continue;
+
if (ip->res->start > p->res->start) {
dev_dbg(&cxlr->dev,
"%s: HPA order violation %s:%pr vs %pr\n",
@@ -686,18 +690,27 @@ static struct cxl_region_ref *alloc_region_ref(struct cxl_port *port,
return cxl_rr;
}
-static void free_region_ref(struct cxl_region_ref *cxl_rr)
+static void cxl_rr_free_decoder(struct cxl_region_ref *cxl_rr)
{
- struct cxl_port *port = cxl_rr->port;
struct cxl_region *cxlr = cxl_rr->region;
struct cxl_decoder *cxld = cxl_rr->decoder;
+ if (!cxld)
+ return;
+
dev_WARN_ONCE(&cxlr->dev, cxld->region != cxlr, "region mismatch\n");
if (cxld->region == cxlr) {
cxld->region = NULL;
put_device(&cxlr->dev);
}
+}
+static void free_region_ref(struct cxl_region_ref *cxl_rr)
+{
+ struct cxl_port *port = cxl_rr->port;
+ struct cxl_region *cxlr = cxl_rr->region;
+
+ cxl_rr_free_decoder(cxl_rr);
xa_erase(&port->regions, (unsigned long)cxlr);
xa_destroy(&cxl_rr->endpoints);
kfree(cxl_rr);
@@ -728,6 +741,33 @@ static int cxl_rr_ep_add(struct cxl_region_ref *cxl_rr,
return 0;
}
+static int cxl_rr_alloc_decoder(struct cxl_port *port, struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled,
+ struct cxl_region_ref *cxl_rr)
+{
+ struct cxl_decoder *cxld;
+
+ if (port == cxled_to_port(cxled))
+ cxld = &cxled->cxld;
+ else
+ cxld = cxl_region_find_decoder(port, cxlr);
+ if (!cxld) {
+ dev_dbg(&cxlr->dev, "%s: no decoder available\n",
+ dev_name(&port->dev));
+ return -EBUSY;
+ }
+
+ if (cxld->region) {
+ dev_dbg(&cxlr->dev, "%s: %s already attached to %s\n",
+ dev_name(&port->dev), dev_name(&cxld->dev),
+ dev_name(&cxld->region->dev));
+ return -EBUSY;
+ }
+
+ cxl_rr->decoder = cxld;
+ return 0;
+}
+
/**
* cxl_port_attach_region() - track a region's interest in a port by endpoint
* @port: port to add a new region reference 'struct cxl_region_ref'
@@ -794,12 +834,6 @@ static int cxl_port_attach_region(struct cxl_port *port,
cxl_rr->nr_targets++;
nr_targets_inc = true;
}
-
- /*
- * The decoder for @cxlr was allocated when the region was first
- * attached to @port.
- */
- cxld = cxl_rr->decoder;
} else {
cxl_rr = alloc_region_ref(port, cxlr);
if (IS_ERR(cxl_rr)) {
@@ -810,26 +844,11 @@ static int cxl_port_attach_region(struct cxl_port *port,
}
nr_targets_inc = true;
- if (port == cxled_to_port(cxled))
- cxld = &cxled->cxld;
- else
- cxld = cxl_region_find_decoder(port, cxlr);
- if (!cxld) {
- dev_dbg(&cxlr->dev, "%s: no decoder available\n",
- dev_name(&port->dev));
- goto out_erase;
- }
-
- if (cxld->region) {
- dev_dbg(&cxlr->dev, "%s: %s already attached to %s\n",
- dev_name(&port->dev), dev_name(&cxld->dev),
- dev_name(&cxld->region->dev));
- rc = -EBUSY;
+ rc = cxl_rr_alloc_decoder(port, cxlr, cxled, cxl_rr);
+ if (rc)
goto out_erase;
- }
-
- cxl_rr->decoder = cxld;
}
+ cxld = cxl_rr->decoder;
rc = cxl_rr_ep_add(cxl_rr, cxled);
if (rc) {
@@ -971,7 +990,14 @@ static int cxl_port_setup_targets(struct cxl_port *port,
if (cxl_rr->nr_targets_set) {
int i, distance;
- distance = p->nr_targets / cxl_rr->nr_targets;
+ /*
+ * Passthrough ports impose no distance requirements between
+ * peers
+ */
+ if (port->nr_dports == 1)
+ distance = 0;
+ else
+ distance = p->nr_targets / cxl_rr->nr_targets;
for (i = 0; i < cxl_rr->nr_targets_set; i++)
if (ep->dport == cxlsd->target[i]) {
rc = check_last_peer(cxled, ep, cxl_rr,
@@ -1508,9 +1534,24 @@ static const struct attribute_group *region_groups[] = {
static void cxl_region_release(struct device *dev)
{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent);
struct cxl_region *cxlr = to_cxl_region(dev);
+ int id = atomic_read(&cxlrd->region_id);
+
+ /*
+ * Try to reuse the recently idled id rather than the cached
+ * next id to prevent the region id space from increasing
+ * unnecessarily.
+ */
+ if (cxlr->id < id)
+ if (atomic_try_cmpxchg(&cxlrd->region_id, &id, cxlr->id)) {
+ memregion_free(id);
+ goto out;
+ }
memregion_free(cxlr->id);
+out:
+ put_device(dev->parent);
kfree(cxlr);
}
@@ -1538,8 +1579,19 @@ static struct cxl_region *to_cxl_region(struct device *dev)
static void unregister_region(void *dev)
{
struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ int i;
device_del(dev);
+
+ /*
+ * Now that region sysfs is shutdown, the parameter block is now
+ * read-only, so no need to hold the region rwsem to access the
+ * region parameters.
+ */
+ for (i = 0; i < p->interleave_ways; i++)
+ detach_target(cxlr, i);
+
cxl_region_iomem_release(cxlr);
put_device(dev);
}
@@ -1561,6 +1613,11 @@ static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int i
device_initialize(dev);
lockdep_set_class(&dev->mutex, &cxl_region_key);
dev->parent = &cxlrd->cxlsd.cxld.dev;
+ /*
+ * Keep root decoder pinned through cxl_region_release to fixup
+ * region id allocations
+ */
+ get_device(dev->parent);
device_set_pm_not_required(dev);
dev->bus = &cxl_bus_type;
dev->type = &cxl_region_type;
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index f680450f0b16..ac75554b5d76 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -423,7 +423,7 @@ struct cxl_nvdimm {
struct device dev;
struct cxl_memdev *cxlmd;
struct cxl_nvdimm_bridge *bridge;
- struct cxl_pmem_region *region;
+ struct xarray pmem_regions;
};
struct cxl_pmem_region_mapping {
@@ -457,6 +457,7 @@ struct cxl_pmem_region {
* @regions: cxl_region_ref instances, regions mapped by this port
* @parent_dport: dport that points to this port in the parent
* @decoder_ida: allocator for decoder ids
+ * @nr_dports: number of entries in @dports
* @hdm_end: track last allocated HDM decoder instance for allocation ordering
* @commit_end: cursor to track highest committed decoder for commit ordering
* @component_reg_phys: component register capability base address (optional)
@@ -475,6 +476,7 @@ struct cxl_port {
struct xarray regions;
struct cxl_dport *parent_dport;
struct ida decoder_ida;
+ int nr_dports;
int hdm_end;
int commit_end;
resource_size_t component_reg_phys;
diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c
index 7dc0a2fa1a6b..4c627d67281a 100644
--- a/drivers/cxl/pmem.c
+++ b/drivers/cxl/pmem.c
@@ -30,17 +30,20 @@ static void unregister_nvdimm(void *nvdimm)
struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
struct cxl_nvdimm_bridge *cxl_nvb = cxl_nvd->bridge;
struct cxl_pmem_region *cxlr_pmem;
+ unsigned long index;
device_lock(&cxl_nvb->dev);
- cxlr_pmem = cxl_nvd->region;
dev_set_drvdata(&cxl_nvd->dev, NULL);
- cxl_nvd->region = NULL;
- device_unlock(&cxl_nvb->dev);
+ xa_for_each(&cxl_nvd->pmem_regions, index, cxlr_pmem) {
+ get_device(&cxlr_pmem->dev);
+ device_unlock(&cxl_nvb->dev);
- if (cxlr_pmem) {
device_release_driver(&cxlr_pmem->dev);
put_device(&cxlr_pmem->dev);
+
+ device_lock(&cxl_nvb->dev);
}
+ device_unlock(&cxl_nvb->dev);
nvdimm_delete(nvdimm);
cxl_nvd->bridge = NULL;
@@ -107,7 +110,7 @@ static int cxl_pmem_get_config_size(struct cxl_dev_state *cxlds,
*cmd = (struct nd_cmd_get_config_size) {
.config_size = cxlds->lsa_size,
- .max_xfer = cxlds->payload_size,
+ .max_xfer = cxlds->payload_size - sizeof(struct cxl_mbox_set_lsa),
};
return 0;
@@ -148,7 +151,7 @@ static int cxl_pmem_set_config_data(struct cxl_dev_state *cxlds,
return -EINVAL;
/* 4-byte status follows the input data in the payload */
- if (struct_size(cmd, in_buf, cmd->in_length) + 4 > buf_len)
+ if (size_add(struct_size(cmd, in_buf, cmd->in_length), 4) > buf_len)
return -EINVAL;
set_lsa =
@@ -366,25 +369,49 @@ static int match_cxl_nvdimm(struct device *dev, void *data)
static void unregister_nvdimm_region(void *nd_region)
{
- struct cxl_nvdimm_bridge *cxl_nvb;
- struct cxl_pmem_region *cxlr_pmem;
+ nvdimm_region_delete(nd_region);
+}
+
+static int cxl_nvdimm_add_region(struct cxl_nvdimm *cxl_nvd,
+ struct cxl_pmem_region *cxlr_pmem)
+{
+ int rc;
+
+ rc = xa_insert(&cxl_nvd->pmem_regions, (unsigned long)cxlr_pmem,
+ cxlr_pmem, GFP_KERNEL);
+ if (rc)
+ return rc;
+
+ get_device(&cxlr_pmem->dev);
+ return 0;
+}
+
+static void cxl_nvdimm_del_region(struct cxl_nvdimm *cxl_nvd,
+ struct cxl_pmem_region *cxlr_pmem)
+{
+ /*
+ * It is possible this is called without a corresponding
+ * cxl_nvdimm_add_region for @cxlr_pmem
+ */
+ cxlr_pmem = xa_erase(&cxl_nvd->pmem_regions, (unsigned long)cxlr_pmem);
+ if (cxlr_pmem)
+ put_device(&cxlr_pmem->dev);
+}
+
+static void release_mappings(void *data)
+{
int i;
+ struct cxl_pmem_region *cxlr_pmem = data;
+ struct cxl_nvdimm_bridge *cxl_nvb = cxlr_pmem->bridge;
- cxlr_pmem = nd_region_provider_data(nd_region);
- cxl_nvb = cxlr_pmem->bridge;
device_lock(&cxl_nvb->dev);
for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
struct cxl_nvdimm *cxl_nvd = m->cxl_nvd;
- if (cxl_nvd->region) {
- put_device(&cxlr_pmem->dev);
- cxl_nvd->region = NULL;
- }
+ cxl_nvdimm_del_region(cxl_nvd, cxlr_pmem);
}
device_unlock(&cxl_nvb->dev);
-
- nvdimm_region_delete(nd_region);
}
static void cxlr_pmem_remove_resource(void *res)
@@ -422,7 +449,7 @@ static int cxl_pmem_region_probe(struct device *dev)
if (!cxl_nvb->nvdimm_bus) {
dev_dbg(dev, "nvdimm bus not found\n");
rc = -ENXIO;
- goto err;
+ goto out_nvb;
}
memset(&mappings, 0, sizeof(mappings));
@@ -431,7 +458,7 @@ static int cxl_pmem_region_probe(struct device *dev)
res = devm_kzalloc(dev, sizeof(*res), GFP_KERNEL);
if (!res) {
rc = -ENOMEM;
- goto err;
+ goto out_nvb;
}
res->name = "Persistent Memory";
@@ -442,11 +469,11 @@ static int cxl_pmem_region_probe(struct device *dev)
rc = insert_resource(&iomem_resource, res);
if (rc)
- goto err;
+ goto out_nvb;
rc = devm_add_action_or_reset(dev, cxlr_pmem_remove_resource, res);
if (rc)
- goto err;
+ goto out_nvb;
ndr_desc.res = res;
ndr_desc.provider_data = cxlr_pmem;
@@ -462,7 +489,7 @@ static int cxl_pmem_region_probe(struct device *dev)
nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL);
if (!nd_set) {
rc = -ENOMEM;
- goto err;
+ goto out_nvb;
}
ndr_desc.memregion = cxlr->id;
@@ -472,9 +499,13 @@ static int cxl_pmem_region_probe(struct device *dev)
info = kmalloc_array(cxlr_pmem->nr_mappings, sizeof(*info), GFP_KERNEL);
if (!info) {
rc = -ENOMEM;
- goto err;
+ goto out_nvb;
}
+ rc = devm_add_action_or_reset(dev, release_mappings, cxlr_pmem);
+ if (rc)
+ goto out_nvd;
+
for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
struct cxl_memdev *cxlmd = m->cxlmd;
@@ -486,7 +517,7 @@ static int cxl_pmem_region_probe(struct device *dev)
dev_dbg(dev, "[%d]: %s: no cxl_nvdimm found\n", i,
dev_name(&cxlmd->dev));
rc = -ENODEV;
- goto err;
+ goto out_nvd;
}
/* safe to drop ref now with bridge lock held */
@@ -498,10 +529,17 @@ static int cxl_pmem_region_probe(struct device *dev)
dev_dbg(dev, "[%d]: %s: no nvdimm found\n", i,
dev_name(&cxlmd->dev));
rc = -ENODEV;
- goto err;
+ goto out_nvd;
}
- cxl_nvd->region = cxlr_pmem;
- get_device(&cxlr_pmem->dev);
+
+ /*
+ * Pin the region per nvdimm device as those may be released
+ * out-of-order with respect to the region, and a single nvdimm
+ * maybe associated with multiple regions
+ */
+ rc = cxl_nvdimm_add_region(cxl_nvd, cxlr_pmem);
+ if (rc)
+ goto out_nvd;
m->cxl_nvd = cxl_nvd;
mappings[i] = (struct nd_mapping_desc) {
.nvdimm = nvdimm,
@@ -527,27 +565,18 @@ static int cxl_pmem_region_probe(struct device *dev)
nvdimm_pmem_region_create(cxl_nvb->nvdimm_bus, &ndr_desc);
if (!cxlr_pmem->nd_region) {
rc = -ENOMEM;
- goto err;
+ goto out_nvd;
}
rc = devm_add_action_or_reset(dev, unregister_nvdimm_region,
cxlr_pmem->nd_region);
-out:
+out_nvd:
kfree(info);
+out_nvb:
device_unlock(&cxl_nvb->dev);
put_device(&cxl_nvb->dev);
return rc;
-
-err:
- dev_dbg(dev, "failed to create nvdimm region\n");
- for (i--; i >= 0; i--) {
- nvdimm = mappings[i].nvdimm;
- cxl_nvd = nvdimm_provider_data(nvdimm);
- put_device(&cxl_nvd->region->dev);
- cxl_nvd->region = NULL;
- }
- goto out;
}
static struct cxl_driver cxl_pmem_region_driver = {
diff --git a/drivers/firmware/arm_scmi/bus.c b/drivers/firmware/arm_scmi/bus.c
index d4e23101448a..35bb70724d44 100644
--- a/drivers/firmware/arm_scmi/bus.c
+++ b/drivers/firmware/arm_scmi/bus.c
@@ -216,9 +216,20 @@ void scmi_device_destroy(struct scmi_device *scmi_dev)
device_unregister(&scmi_dev->dev);
}
+void scmi_device_link_add(struct device *consumer, struct device *supplier)
+{
+ struct device_link *link;
+
+ link = device_link_add(consumer, supplier, DL_FLAG_AUTOREMOVE_CONSUMER);
+
+ WARN_ON(!link);
+}
+
void scmi_set_handle(struct scmi_device *scmi_dev)
{
scmi_dev->handle = scmi_handle_get(&scmi_dev->dev);
+ if (scmi_dev->handle)
+ scmi_device_link_add(&scmi_dev->dev, scmi_dev->handle->dev);
}
int scmi_protocol_register(const struct scmi_protocol *proto)
diff --git a/drivers/firmware/arm_scmi/common.h b/drivers/firmware/arm_scmi/common.h
index 61aba7447c32..a1c0154c31c6 100644
--- a/drivers/firmware/arm_scmi/common.h
+++ b/drivers/firmware/arm_scmi/common.h
@@ -97,6 +97,7 @@ static inline void unpack_scmi_header(u32 msg_hdr, struct scmi_msg_hdr *hdr)
struct scmi_revision_info *
scmi_revision_area_get(const struct scmi_protocol_handle *ph);
int scmi_handle_put(const struct scmi_handle *handle);
+void scmi_device_link_add(struct device *consumer, struct device *supplier);
struct scmi_handle *scmi_handle_get(struct device *dev);
void scmi_set_handle(struct scmi_device *scmi_dev);
void scmi_setup_protocol_implemented(const struct scmi_protocol_handle *ph,
@@ -117,6 +118,7 @@ void scmi_protocol_release(const struct scmi_handle *handle, u8 protocol_id);
*
* @dev: Reference to device in the SCMI hierarchy corresponding to this
* channel
+ * @rx_timeout_ms: The configured RX timeout in milliseconds.
* @handle: Pointer to SCMI entity handle
* @no_completion_irq: Flag to indicate that this channel has no completion
* interrupt mechanism for synchronous commands.
@@ -126,6 +128,7 @@ void scmi_protocol_release(const struct scmi_handle *handle, u8 protocol_id);
*/
struct scmi_chan_info {
struct device *dev;
+ unsigned int rx_timeout_ms;
struct scmi_handle *handle;
bool no_completion_irq;
void *transport_info;
@@ -232,7 +235,7 @@ void scmi_free_channel(struct scmi_chan_info *cinfo, struct idr *idr, int id);
struct scmi_shared_mem;
void shmem_tx_prepare(struct scmi_shared_mem __iomem *shmem,
- struct scmi_xfer *xfer);
+ struct scmi_xfer *xfer, struct scmi_chan_info *cinfo);
u32 shmem_read_header(struct scmi_shared_mem __iomem *shmem);
void shmem_fetch_response(struct scmi_shared_mem __iomem *shmem,
struct scmi_xfer *xfer);
diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
index 609ebedee9cb..f818d00bb2c6 100644
--- a/drivers/firmware/arm_scmi/driver.c
+++ b/drivers/firmware/arm_scmi/driver.c
@@ -2013,6 +2013,7 @@ static int scmi_chan_setup(struct scmi_info *info, struct device *dev,
return -ENOMEM;
cinfo->dev = dev;
+ cinfo->rx_timeout_ms = info->desc->max_rx_timeout_ms;
ret = info->desc->ops->chan_setup(cinfo, info->dev, tx);
if (ret)
@@ -2044,8 +2045,12 @@ scmi_txrx_setup(struct scmi_info *info, struct device *dev, int prot_id)
{
int ret = scmi_chan_setup(info, dev, prot_id, true);
- if (!ret) /* Rx is optional, hence no error check */
- scmi_chan_setup(info, dev, prot_id, false);
+ if (!ret) {
+ /* Rx is optional, report only memory errors */
+ ret = scmi_chan_setup(info, dev, prot_id, false);
+ if (ret && ret != -ENOMEM)
+ ret = 0;
+ }
return ret;
}
@@ -2273,10 +2278,16 @@ int scmi_protocol_device_request(const struct scmi_device_id *id_table)
sdev = scmi_get_protocol_device(child, info,
id_table->protocol_id,
id_table->name);
- /* Set handle if not already set: device existed */
- if (sdev && !sdev->handle)
- sdev->handle =
- scmi_handle_get_from_info_unlocked(info);
+ if (sdev) {
+ /* Set handle if not already set: device existed */
+ if (!sdev->handle)
+ sdev->handle =
+ scmi_handle_get_from_info_unlocked(info);
+ /* Relink consumer and suppliers */
+ if (sdev->handle)
+ scmi_device_link_add(&sdev->dev,
+ sdev->handle->dev);
+ }
} else {
dev_err(info->dev,
"Failed. SCMI protocol %d not active.\n",
@@ -2475,20 +2486,17 @@ void scmi_free_channel(struct scmi_chan_info *cinfo, struct idr *idr, int id)
static int scmi_remove(struct platform_device *pdev)
{
- int ret = 0, id;
+ int ret, id;
struct scmi_info *info = platform_get_drvdata(pdev);
struct device_node *child;
mutex_lock(&scmi_list_mutex);
if (info->users)
- ret = -EBUSY;
- else
- list_del(&info->node);
+ dev_warn(&pdev->dev,
+ "Still active SCMI users will be forcibly unbound.\n");
+ list_del(&info->node);
mutex_unlock(&scmi_list_mutex);
- if (ret)
- return ret;
-
scmi_notification_exit(&info->handle);
mutex_lock(&info->protocols_mtx);
@@ -2500,7 +2508,11 @@ static int scmi_remove(struct platform_device *pdev)
idr_destroy(&info->active_protocols);
/* Safe to free channels since no more users */
- return scmi_cleanup_txrx_channels(info);
+ ret = scmi_cleanup_txrx_channels(info);
+ if (ret)
+ dev_warn(&pdev->dev, "Failed to cleanup SCMI channels.\n");
+
+ return 0;
}
static ssize_t protocol_version_show(struct device *dev,
@@ -2571,6 +2583,7 @@ MODULE_DEVICE_TABLE(of, scmi_of_match);
static struct platform_driver scmi_driver = {
.driver = {
.name = "arm-scmi",
+ .suppress_bind_attrs = true,
.of_match_table = scmi_of_match,
.dev_groups = versions_groups,
},
diff --git a/drivers/firmware/arm_scmi/mailbox.c b/drivers/firmware/arm_scmi/mailbox.c
index 08ff4d110beb..1e40cb035044 100644
--- a/drivers/firmware/arm_scmi/mailbox.c
+++ b/drivers/firmware/arm_scmi/mailbox.c
@@ -36,7 +36,7 @@ static void tx_prepare(struct mbox_client *cl, void *m)
{
struct scmi_mailbox *smbox = client_to_scmi_mailbox(cl);
- shmem_tx_prepare(smbox->shmem, m);
+ shmem_tx_prepare(smbox->shmem, m, smbox->cinfo);
}
static void rx_callback(struct mbox_client *cl, void *m)
diff --git a/drivers/firmware/arm_scmi/optee.c b/drivers/firmware/arm_scmi/optee.c
index f42dad997ac9..2a7aeab40e54 100644
--- a/drivers/firmware/arm_scmi/optee.c
+++ b/drivers/firmware/arm_scmi/optee.c
@@ -498,7 +498,7 @@ static int scmi_optee_send_message(struct scmi_chan_info *cinfo,
msg_tx_prepare(channel->req.msg, xfer);
ret = invoke_process_msg_channel(channel, msg_command_size(xfer));
} else {
- shmem_tx_prepare(channel->req.shmem, xfer);
+ shmem_tx_prepare(channel->req.shmem, xfer, cinfo);
ret = invoke_process_smt_channel(channel);
}
diff --git a/drivers/firmware/arm_scmi/shmem.c b/drivers/firmware/arm_scmi/shmem.c
index 0e3eaea5d852..1dfe534b8518 100644
--- a/drivers/firmware/arm_scmi/shmem.c
+++ b/drivers/firmware/arm_scmi/shmem.c
@@ -5,10 +5,13 @@
* Copyright (C) 2019 ARM Ltd.
*/
+#include <linux/ktime.h>
#include <linux/io.h>
#include <linux/processor.h>
#include <linux/types.h>
+#include <asm-generic/bug.h>
+
#include "common.h"
/*
@@ -30,16 +33,36 @@ struct scmi_shared_mem {
};
void shmem_tx_prepare(struct scmi_shared_mem __iomem *shmem,
- struct scmi_xfer *xfer)
+ struct scmi_xfer *xfer, struct scmi_chan_info *cinfo)
{
+ ktime_t stop;
+
/*
* Ideally channel must be free by now unless OS timeout last
* request and platform continued to process the same, wait
* until it releases the shared memory, otherwise we may endup
- * overwriting its response with new message payload or vice-versa
+ * overwriting its response with new message payload or vice-versa.
+ * Giving up anyway after twice the expected channel timeout so as
+ * not to bail-out on intermittent issues where the platform is
+ * occasionally a bit slower to answer.
+ *
+ * Note that after a timeout is detected we bail-out and carry on but
+ * the transport functionality is probably permanently compromised:
+ * this is just to ease debugging and avoid complete hangs on boot
+ * due to a misbehaving SCMI firmware.
*/
- spin_until_cond(ioread32(&shmem->channel_status) &
- SCMI_SHMEM_CHAN_STAT_CHANNEL_FREE);
+ stop = ktime_add_ms(ktime_get(), 2 * cinfo->rx_timeout_ms);
+ spin_until_cond((ioread32(&shmem->channel_status) &
+ SCMI_SHMEM_CHAN_STAT_CHANNEL_FREE) ||
+ ktime_after(ktime_get(), stop));
+ if (!(ioread32(&shmem->channel_status) &
+ SCMI_SHMEM_CHAN_STAT_CHANNEL_FREE)) {
+ WARN_ON_ONCE(1);
+ dev_err(cinfo->dev,
+ "Timeout waiting for a free TX channel !\n");
+ return;
+ }
+
/* Mark channel busy + clear error */
iowrite32(0x0, &shmem->channel_status);
iowrite32(xfer->hdr.poll_completion ? 0 : SCMI_SHMEM_FLAG_INTR_ENABLED,
diff --git a/drivers/firmware/arm_scmi/smc.c b/drivers/firmware/arm_scmi/smc.c
index 745acfdd0b3d..87a7b13cf868 100644
--- a/drivers/firmware/arm_scmi/smc.c
+++ b/drivers/firmware/arm_scmi/smc.c
@@ -188,7 +188,7 @@ static int smc_send_message(struct scmi_chan_info *cinfo,
*/
smc_channel_lock_acquire(scmi_info, xfer);
- shmem_tx_prepare(scmi_info->shmem, xfer);
+ shmem_tx_prepare(scmi_info->shmem, xfer, cinfo);
arm_smccc_1_1_invoke(scmi_info->func_id, 0, 0, 0, 0, 0, 0, 0, &res);
diff --git a/drivers/firmware/arm_scmi/virtio.c b/drivers/firmware/arm_scmi/virtio.c
index 14709dbc96a1..33c9b81a55cd 100644
--- a/drivers/firmware/arm_scmi/virtio.c
+++ b/drivers/firmware/arm_scmi/virtio.c
@@ -148,7 +148,6 @@ static void scmi_vio_channel_cleanup_sync(struct scmi_vio_channel *vioch)
{
unsigned long flags;
DECLARE_COMPLETION_ONSTACK(vioch_shutdown_done);
- void *deferred_wq = NULL;
/*
* Prepare to wait for the last release if not already released
@@ -162,16 +161,11 @@ static void scmi_vio_channel_cleanup_sync(struct scmi_vio_channel *vioch)
vioch->shutdown_done = &vioch_shutdown_done;
virtio_break_device(vioch->vqueue->vdev);
- if (!vioch->is_rx && vioch->deferred_tx_wq) {
- deferred_wq = vioch->deferred_tx_wq;
+ if (!vioch->is_rx && vioch->deferred_tx_wq)
/* Cannot be kicked anymore after this...*/
vioch->deferred_tx_wq = NULL;
- }
spin_unlock_irqrestore(&vioch->lock, flags);
- if (deferred_wq)
- destroy_workqueue(deferred_wq);
-
scmi_vio_channel_release(vioch);
/* Let any possibly concurrent RX path release the channel */
@@ -416,6 +410,11 @@ static bool virtio_chan_available(struct device *dev, int idx)
return vioch && !vioch->cinfo;
}
+static void scmi_destroy_tx_workqueue(void *deferred_tx_wq)
+{
+ destroy_workqueue(deferred_tx_wq);
+}
+
static int virtio_chan_setup(struct scmi_chan_info *cinfo, struct device *dev,
bool tx)
{
@@ -430,6 +429,8 @@ static int virtio_chan_setup(struct scmi_chan_info *cinfo, struct device *dev,
/* Setup a deferred worker for polling. */
if (tx && !vioch->deferred_tx_wq) {
+ int ret;
+
vioch->deferred_tx_wq =
alloc_workqueue(dev_name(&scmi_vdev->dev),
WQ_UNBOUND | WQ_FREEZABLE | WQ_SYSFS,
@@ -437,6 +438,11 @@ static int virtio_chan_setup(struct scmi_chan_info *cinfo, struct device *dev,
if (!vioch->deferred_tx_wq)
return -ENOMEM;
+ ret = devm_add_action_or_reset(dev, scmi_destroy_tx_workqueue,
+ vioch->deferred_tx_wq);
+ if (ret)
+ return ret;
+
INIT_WORK(&vioch->deferred_tx_work,
scmi_vio_deferred_tx_worker);
}
@@ -444,12 +450,12 @@ static int virtio_chan_setup(struct scmi_chan_info *cinfo, struct device *dev,
for (i = 0; i < vioch->max_msg; i++) {
struct scmi_vio_msg *msg;
- msg = devm_kzalloc(cinfo->dev, sizeof(*msg), GFP_KERNEL);
+ msg = devm_kzalloc(dev, sizeof(*msg), GFP_KERNEL);
if (!msg)
return -ENOMEM;
if (tx) {
- msg->request = devm_kzalloc(cinfo->dev,
+ msg->request = devm_kzalloc(dev,
VIRTIO_SCMI_MAX_PDU_SIZE,
GFP_KERNEL);
if (!msg->request)
@@ -458,7 +464,7 @@ static int virtio_chan_setup(struct scmi_chan_info *cinfo, struct device *dev,
refcount_set(&msg->users, 1);
}
- msg->input = devm_kzalloc(cinfo->dev, VIRTIO_SCMI_MAX_PDU_SIZE,
+ msg->input = devm_kzalloc(dev, VIRTIO_SCMI_MAX_PDU_SIZE,
GFP_KERNEL);
if (!msg->input)
return -ENOMEM;
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 3ecdc43a3f2b..a46df5d1d094 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -611,7 +611,7 @@ int __init efi_config_parse_tables(const efi_config_table_t *config_tables,
seed = early_memremap(efi_rng_seed, sizeof(*seed));
if (seed != NULL) {
- size = READ_ONCE(seed->size);
+ size = min(seed->size, EFI_RANDOM_SEED_SIZE);
early_memunmap(seed, sizeof(*seed));
} else {
pr_err("Could not map UEFI random seed!\n");
diff --git a/drivers/firmware/efi/libstub/random.c b/drivers/firmware/efi/libstub/random.c
index 24aa37535372..33ab56769595 100644
--- a/drivers/firmware/efi/libstub/random.c
+++ b/drivers/firmware/efi/libstub/random.c
@@ -75,7 +75,12 @@ efi_status_t efi_random_get_seed(void)
if (status != EFI_SUCCESS)
return status;
- status = efi_bs_call(allocate_pool, EFI_RUNTIME_SERVICES_DATA,
+ /*
+ * Use EFI_ACPI_RECLAIM_MEMORY here so that it is guaranteed that the
+ * allocation will survive a kexec reboot (although we refresh the seed
+ * beforehand)
+ */
+ status = efi_bs_call(allocate_pool, EFI_ACPI_RECLAIM_MEMORY,
sizeof(*seed) + EFI_RANDOM_SEED_SIZE,
(void **)&seed);
if (status != EFI_SUCCESS)
diff --git a/drivers/firmware/efi/tpm.c b/drivers/firmware/efi/tpm.c
index 8f665678e9e3..e8d69bd548f3 100644
--- a/drivers/firmware/efi/tpm.c
+++ b/drivers/firmware/efi/tpm.c
@@ -97,7 +97,7 @@ int __init efi_tpm_eventlog_init(void)
goto out_calc;
}
- memblock_reserve((unsigned long)final_tbl,
+ memblock_reserve(efi.tpm_final_log,
tbl_size + sizeof(*final_tbl));
efi_tpm_final_log_size = tbl_size;
diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c
index 433b61587139..0ba9f18312f5 100644
--- a/drivers/firmware/efi/vars.c
+++ b/drivers/firmware/efi/vars.c
@@ -21,29 +21,22 @@ static struct efivars *__efivars;
static DEFINE_SEMAPHORE(efivars_lock);
-static efi_status_t check_var_size(u32 attributes, unsigned long size)
-{
- const struct efivar_operations *fops;
-
- fops = __efivars->ops;
-
- if (!fops->query_variable_store)
- return (size <= SZ_64K) ? EFI_SUCCESS : EFI_OUT_OF_RESOURCES;
-
- return fops->query_variable_store(attributes, size, false);
-}
-
-static
-efi_status_t check_var_size_nonblocking(u32 attributes, unsigned long size)
+static efi_status_t check_var_size(bool nonblocking, u32 attributes,
+ unsigned long size)
{
const struct efivar_operations *fops;
+ efi_status_t status;
fops = __efivars->ops;
if (!fops->query_variable_store)
+ status = EFI_UNSUPPORTED;
+ else
+ status = fops->query_variable_store(attributes, size,
+ nonblocking);
+ if (status == EFI_UNSUPPORTED)
return (size <= SZ_64K) ? EFI_SUCCESS : EFI_OUT_OF_RESOURCES;
-
- return fops->query_variable_store(attributes, size, true);
+ return status;
}
/**
@@ -196,26 +189,6 @@ efi_status_t efivar_get_next_variable(unsigned long *name_size,
EXPORT_SYMBOL_NS_GPL(efivar_get_next_variable, EFIVAR);
/*
- * efivar_set_variable_blocking() - local helper function for set_variable
- *
- * Must be called with efivars_lock held.
- */
-static efi_status_t
-efivar_set_variable_blocking(efi_char16_t *name, efi_guid_t *vendor,
- u32 attr, unsigned long data_size, void *data)
-{
- efi_status_t status;
-
- if (data_size > 0) {
- status = check_var_size(attr, data_size +
- ucs2_strsize(name, 1024));
- if (status != EFI_SUCCESS)
- return status;
- }
- return __efivars->ops->set_variable(name, vendor, attr, data_size, data);
-}
-
-/*
* efivar_set_variable_locked() - set a variable identified by name/vendor
*
* Must be called with efivars_lock held. If @nonblocking is set, it will use
@@ -228,23 +201,21 @@ efi_status_t efivar_set_variable_locked(efi_char16_t *name, efi_guid_t *vendor,
efi_set_variable_t *setvar;
efi_status_t status;
- if (!nonblocking)
- return efivar_set_variable_blocking(name, vendor, attr,
- data_size, data);
+ if (data_size > 0) {
+ status = check_var_size(nonblocking, attr,
+ data_size + ucs2_strsize(name, 1024));
+ if (status != EFI_SUCCESS)
+ return status;
+ }
/*
* If no _nonblocking variant exists, the ordinary one
* is assumed to be non-blocking.
*/
- setvar = __efivars->ops->set_variable_nonblocking ?:
- __efivars->ops->set_variable;
+ setvar = __efivars->ops->set_variable_nonblocking;
+ if (!setvar || !nonblocking)
+ setvar = __efivars->ops->set_variable;
- if (data_size > 0) {
- status = check_var_size_nonblocking(attr, data_size +
- ucs2_strsize(name, 1024));
- if (status != EFI_SUCCESS)
- return status;
- }
return setvar(name, vendor, attr, data_size, data);
}
EXPORT_SYMBOL_NS_GPL(efivar_set_variable_locked, EFIVAR);
@@ -264,7 +235,8 @@ efi_status_t efivar_set_variable(efi_char16_t *name, efi_guid_t *vendor,
if (efivar_lock())
return EFI_ABORTED;
- status = efivar_set_variable_blocking(name, vendor, attr, data_size, data);
+ status = efivar_set_variable_locked(name, vendor, attr, data_size,
+ data, false);
efivar_unlock();
return status;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 0561812aa0a4..5d9a34601a1a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -706,6 +706,13 @@ err:
void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, bool idle)
{
+ /* Temporary workaround to fix issues observed in some
+ * compute applications when GFXOFF is enabled on GFX11.
+ */
+ if (IP_VERSION_MAJ(adev->ip_versions[GC_HWIP][0]) == 11) {
+ pr_debug("GFXOFF is %s\n", idle ? "enabled" : "disabled");
+ amdgpu_gfx_off_ctrl(adev, idle);
+ }
amdgpu_dpm_switch_power_profile(adev,
PP_SMC_POWER_PROFILE_COMPUTE,
!idle);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ddaecb2610c9..64510898eedd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4060,15 +4060,18 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
* at suspend time.
*
*/
-static void amdgpu_device_evict_resources(struct amdgpu_device *adev)
+static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
{
+ int ret;
+
/* No need to evict vram on APUs for suspend to ram or s2idle */
if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
- return;
+ return 0;
- if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM))
+ ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
+ if (ret)
DRM_WARN("evicting device resources failed\n");
-
+ return ret;
}
/*
@@ -4118,7 +4121,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
if (!adev->in_s0ix)
amdgpu_amdkfd_suspend(adev, adev->in_runpm);
- amdgpu_device_evict_resources(adev);
+ r = amdgpu_device_evict_resources(adev);
+ if (r)
+ return r;
amdgpu_fence_driver_hw_fini(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 3c9fecdd6b2f..bf2d50c8c92a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2201,7 +2201,8 @@ amdgpu_pci_remove(struct pci_dev *pdev)
pm_runtime_forbid(dev->dev);
}
- if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)) {
+ if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
+ !amdgpu_sriov_vf(adev)) {
bool need_to_reset_gpu = false;
if (adev->gmc.xgmi.num_physical_nodes > 1) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index bf1ff8f0e712..4e42dcb1950f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -337,12 +337,14 @@ static int amdgpu_firmware_info(struct drm_amdgpu_info_firmware *fw_info,
fw_info->feature = adev->psp.cap_feature_version;
break;
case AMDGPU_INFO_FW_MES_KIQ:
- fw_info->ver = adev->mes.ucode_fw_version[0];
- fw_info->feature = 0;
+ fw_info->ver = adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK;
+ fw_info->feature = (adev->mes.kiq_version & AMDGPU_MES_FEAT_VERSION_MASK)
+ >> AMDGPU_MES_FEAT_VERSION_SHIFT;
break;
case AMDGPU_INFO_FW_MES:
- fw_info->ver = adev->mes.ucode_fw_version[1];
- fw_info->feature = 0;
+ fw_info->ver = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
+ fw_info->feature = (adev->mes.sched_version & AMDGPU_MES_FEAT_VERSION_MASK)
+ >> AMDGPU_MES_FEAT_VERSION_SHIFT;
break;
case AMDGPU_INFO_FW_IMU:
fw_info->ver = adev->gfx.imu_fw_version;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
index f4b5301ea2a0..500a1dc4fe02 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
@@ -500,6 +500,8 @@ static int amdgpu_vkms_sw_init(void *handle)
adev_to_drm(adev)->mode_config.fb_base = adev->gmc.aper_base;
+ adev_to_drm(adev)->mode_config.fb_modifiers_not_supported = true;
+
r = amdgpu_display_modeset_create_props(adev);
if (r)
return r;
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index c7118843db05..0c4c5499bb5c 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -2495,442 +2495,444 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
0xbf9f0000, 0x00000000,
};
static const uint32_t cwsr_trap_gfx11_hex[] = {
- 0xbfa00001, 0xbfa0021e,
+ 0xbfa00001, 0xbfa00221,
0xb0804006, 0xb8f8f802,
0x9178ff78, 0x00020006,
- 0xb8fbf803, 0xbf0d9f6d,
- 0xbfa20006, 0x8b6eff78,
- 0x00002000, 0xbfa10009,
- 0x8b6eff6d, 0x00ff0000,
- 0xbfa2001e, 0x8b6eff7b,
- 0x00000400, 0xbfa20041,
- 0xbf830010, 0xb8fbf803,
- 0xbfa0fffa, 0x8b6eff7b,
- 0x00000900, 0xbfa20015,
- 0x8b6eff7b, 0x000071ff,
- 0xbfa10008, 0x8b6fff7b,
- 0x00007080, 0xbfa10001,
- 0xbeee1287, 0xb8eff801,
- 0x846e8c6e, 0x8b6e6f6e,
- 0xbfa2000a, 0x8b6eff6d,
- 0x00ff0000, 0xbfa20007,
- 0xb8eef801, 0x8b6eff6e,
- 0x00000800, 0xbfa20003,
+ 0xb8fbf803, 0xbf0d9e6d,
+ 0xbfa10001, 0xbfbd0000,
+ 0xbf0d9f6d, 0xbfa20006,
+ 0x8b6eff78, 0x00002000,
+ 0xbfa10009, 0x8b6eff6d,
+ 0x00ff0000, 0xbfa2001e,
0x8b6eff7b, 0x00000400,
- 0xbfa20026, 0xbefa4d82,
- 0xbf89fc07, 0x84fa887a,
- 0xf4005bbd, 0xf8000010,
- 0xbf89fc07, 0x846e976e,
- 0x9177ff77, 0x00800000,
- 0x8c776e77, 0xf4045bbd,
- 0xf8000000, 0xbf89fc07,
- 0xf4045ebd, 0xf8000008,
- 0xbf89fc07, 0x8bee6e6e,
- 0xbfa10001, 0xbe80486e,
- 0x8b6eff6d, 0x01ff0000,
- 0xbfa20005, 0x8c78ff78,
- 0x00002000, 0x80ec886c,
- 0x82ed806d, 0xbfa00005,
- 0x8b6eff6d, 0x01000000,
- 0xbfa20002, 0x806c846c,
- 0x826d806d, 0x8b6dff6d,
- 0x0000ffff, 0x8bfe7e7e,
- 0x8bea6a6a, 0xb978f802,
- 0xbe804a6c, 0x8b6dff6d,
- 0x0000ffff, 0xbefa0080,
- 0xb97a0283, 0xbeee007e,
- 0xbeef007f, 0xbefe0180,
- 0xbefe4d84, 0xbf89fc07,
- 0x8b7aff7f, 0x04000000,
- 0x847a857a, 0x8c6d7a6d,
- 0xbefa007e, 0x8b7bff7f,
- 0x0000ffff, 0xbefe00c1,
- 0xbeff00c1, 0xdca6c000,
- 0x007a0000, 0x7e000280,
- 0xbefe007a, 0xbeff007b,
- 0xb8fb02dc, 0x847b997b,
- 0xb8fa3b05, 0x807a817a,
- 0xbf0d997b, 0xbfa20002,
- 0x847a897a, 0xbfa00001,
- 0x847a8a7a, 0xb8fb1e06,
- 0x847b8a7b, 0x807a7b7a,
+ 0xbfa20041, 0xbf830010,
+ 0xb8fbf803, 0xbfa0fffa,
+ 0x8b6eff7b, 0x00000900,
+ 0xbfa20015, 0x8b6eff7b,
+ 0x000071ff, 0xbfa10008,
+ 0x8b6fff7b, 0x00007080,
+ 0xbfa10001, 0xbeee1287,
+ 0xb8eff801, 0x846e8c6e,
+ 0x8b6e6f6e, 0xbfa2000a,
+ 0x8b6eff6d, 0x00ff0000,
+ 0xbfa20007, 0xb8eef801,
+ 0x8b6eff6e, 0x00000800,
+ 0xbfa20003, 0x8b6eff7b,
+ 0x00000400, 0xbfa20026,
+ 0xbefa4d82, 0xbf89fc07,
+ 0x84fa887a, 0xf4005bbd,
+ 0xf8000010, 0xbf89fc07,
+ 0x846e976e, 0x9177ff77,
+ 0x00800000, 0x8c776e77,
+ 0xf4045bbd, 0xf8000000,
+ 0xbf89fc07, 0xf4045ebd,
+ 0xf8000008, 0xbf89fc07,
+ 0x8bee6e6e, 0xbfa10001,
+ 0xbe80486e, 0x8b6eff6d,
+ 0x01ff0000, 0xbfa20005,
+ 0x8c78ff78, 0x00002000,
+ 0x80ec886c, 0x82ed806d,
+ 0xbfa00005, 0x8b6eff6d,
+ 0x01000000, 0xbfa20002,
+ 0x806c846c, 0x826d806d,
+ 0x8b6dff6d, 0x0000ffff,
+ 0x8bfe7e7e, 0x8bea6a6a,
+ 0xb978f802, 0xbe804a6c,
+ 0x8b6dff6d, 0x0000ffff,
+ 0xbefa0080, 0xb97a0283,
+ 0xbeee007e, 0xbeef007f,
+ 0xbefe0180, 0xbefe4d84,
+ 0xbf89fc07, 0x8b7aff7f,
+ 0x04000000, 0x847a857a,
+ 0x8c6d7a6d, 0xbefa007e,
0x8b7bff7f, 0x0000ffff,
- 0x807aff7a, 0x00000200,
- 0x807a7e7a, 0x827b807b,
- 0xd7610000, 0x00010870,
- 0xd7610000, 0x00010a71,
- 0xd7610000, 0x00010c72,
- 0xd7610000, 0x00010e73,
- 0xd7610000, 0x00011074,
- 0xd7610000, 0x00011275,
- 0xd7610000, 0x00011476,
- 0xd7610000, 0x00011677,
- 0xd7610000, 0x00011a79,
- 0xd7610000, 0x00011c7e,
- 0xd7610000, 0x00011e7f,
- 0xbefe00ff, 0x00003fff,
- 0xbeff0080, 0xdca6c040,
- 0x007a0000, 0xd760007a,
- 0x00011d00, 0xd760007b,
- 0x00011f00, 0xbefe007a,
- 0xbeff007b, 0xbef4007e,
- 0x8b75ff7f, 0x0000ffff,
- 0x8c75ff75, 0x00040000,
- 0xbef60080, 0xbef700ff,
- 0x10807fac, 0xbef1007d,
- 0xbef00080, 0xb8f302dc,
- 0x84739973, 0xbefe00c1,
- 0x857d9973, 0x8b7d817d,
- 0xbf06817d, 0xbfa20002,
- 0xbeff0080, 0xbfa00002,
- 0xbeff00c1, 0xbfa00009,
+ 0xbefe00c1, 0xbeff00c1,
+ 0xdca6c000, 0x007a0000,
+ 0x7e000280, 0xbefe007a,
+ 0xbeff007b, 0xb8fb02dc,
+ 0x847b997b, 0xb8fa3b05,
+ 0x807a817a, 0xbf0d997b,
+ 0xbfa20002, 0x847a897a,
+ 0xbfa00001, 0x847a8a7a,
+ 0xb8fb1e06, 0x847b8a7b,
+ 0x807a7b7a, 0x8b7bff7f,
+ 0x0000ffff, 0x807aff7a,
+ 0x00000200, 0x807a7e7a,
+ 0x827b807b, 0xd7610000,
+ 0x00010870, 0xd7610000,
+ 0x00010a71, 0xd7610000,
+ 0x00010c72, 0xd7610000,
+ 0x00010e73, 0xd7610000,
+ 0x00011074, 0xd7610000,
+ 0x00011275, 0xd7610000,
+ 0x00011476, 0xd7610000,
+ 0x00011677, 0xd7610000,
+ 0x00011a79, 0xd7610000,
+ 0x00011c7e, 0xd7610000,
+ 0x00011e7f, 0xbefe00ff,
+ 0x00003fff, 0xbeff0080,
+ 0xdca6c040, 0x007a0000,
+ 0xd760007a, 0x00011d00,
+ 0xd760007b, 0x00011f00,
+ 0xbefe007a, 0xbeff007b,
+ 0xbef4007e, 0x8b75ff7f,
+ 0x0000ffff, 0x8c75ff75,
+ 0x00040000, 0xbef60080,
+ 0xbef700ff, 0x10807fac,
+ 0xbef1007d, 0xbef00080,
+ 0xb8f302dc, 0x84739973,
+ 0xbefe00c1, 0x857d9973,
+ 0x8b7d817d, 0xbf06817d,
+ 0xbfa20002, 0xbeff0080,
+ 0xbfa00002, 0xbeff00c1,
+ 0xbfa00009, 0xbef600ff,
+ 0x01000000, 0xe0685080,
+ 0x701d0100, 0xe0685100,
+ 0x701d0200, 0xe0685180,
+ 0x701d0300, 0xbfa00008,
0xbef600ff, 0x01000000,
- 0xe0685080, 0x701d0100,
- 0xe0685100, 0x701d0200,
- 0xe0685180, 0x701d0300,
- 0xbfa00008, 0xbef600ff,
- 0x01000000, 0xe0685100,
- 0x701d0100, 0xe0685200,
- 0x701d0200, 0xe0685300,
- 0x701d0300, 0xb8f03b05,
- 0x80708170, 0xbf0d9973,
- 0xbfa20002, 0x84708970,
- 0xbfa00001, 0x84708a70,
- 0xb8fa1e06, 0x847a8a7a,
- 0x80707a70, 0x8070ff70,
- 0x00000200, 0xbef600ff,
- 0x01000000, 0x7e000280,
- 0x7e020280, 0x7e040280,
- 0xbefd0080, 0xd7610002,
- 0x0000fa71, 0x807d817d,
- 0xd7610002, 0x0000fa6c,
- 0x807d817d, 0x917aff6d,
- 0x80000000, 0xd7610002,
- 0x0000fa7a, 0x807d817d,
- 0xd7610002, 0x0000fa6e,
- 0x807d817d, 0xd7610002,
- 0x0000fa6f, 0x807d817d,
- 0xd7610002, 0x0000fa78,
- 0x807d817d, 0xb8faf803,
- 0xd7610002, 0x0000fa7a,
- 0x807d817d, 0xd7610002,
- 0x0000fa7b, 0x807d817d,
- 0xb8f1f801, 0xd7610002,
- 0x0000fa71, 0x807d817d,
- 0xb8f1f814, 0xd7610002,
- 0x0000fa71, 0x807d817d,
- 0xb8f1f815, 0xd7610002,
- 0x0000fa71, 0x807d817d,
- 0xbefe00ff, 0x0000ffff,
- 0xbeff0080, 0xe0685000,
- 0x701d0200, 0xbefe00c1,
+ 0xe0685100, 0x701d0100,
+ 0xe0685200, 0x701d0200,
+ 0xe0685300, 0x701d0300,
0xb8f03b05, 0x80708170,
0xbf0d9973, 0xbfa20002,
0x84708970, 0xbfa00001,
0x84708a70, 0xb8fa1e06,
0x847a8a7a, 0x80707a70,
+ 0x8070ff70, 0x00000200,
0xbef600ff, 0x01000000,
- 0xbef90080, 0xbefd0080,
- 0xbf800000, 0xbe804100,
- 0xbe824102, 0xbe844104,
- 0xbe864106, 0xbe884108,
- 0xbe8a410a, 0xbe8c410c,
- 0xbe8e410e, 0xd7610002,
- 0x0000f200, 0x80798179,
- 0xd7610002, 0x0000f201,
+ 0x7e000280, 0x7e020280,
+ 0x7e040280, 0xbefd0080,
+ 0xd7610002, 0x0000fa71,
+ 0x807d817d, 0xd7610002,
+ 0x0000fa6c, 0x807d817d,
+ 0x917aff6d, 0x80000000,
+ 0xd7610002, 0x0000fa7a,
+ 0x807d817d, 0xd7610002,
+ 0x0000fa6e, 0x807d817d,
+ 0xd7610002, 0x0000fa6f,
+ 0x807d817d, 0xd7610002,
+ 0x0000fa78, 0x807d817d,
+ 0xb8faf803, 0xd7610002,
+ 0x0000fa7a, 0x807d817d,
+ 0xd7610002, 0x0000fa7b,
+ 0x807d817d, 0xb8f1f801,
+ 0xd7610002, 0x0000fa71,
+ 0x807d817d, 0xb8f1f814,
+ 0xd7610002, 0x0000fa71,
+ 0x807d817d, 0xb8f1f815,
+ 0xd7610002, 0x0000fa71,
+ 0x807d817d, 0xbefe00ff,
+ 0x0000ffff, 0xbeff0080,
+ 0xe0685000, 0x701d0200,
+ 0xbefe00c1, 0xb8f03b05,
+ 0x80708170, 0xbf0d9973,
+ 0xbfa20002, 0x84708970,
+ 0xbfa00001, 0x84708a70,
+ 0xb8fa1e06, 0x847a8a7a,
+ 0x80707a70, 0xbef600ff,
+ 0x01000000, 0xbef90080,
+ 0xbefd0080, 0xbf800000,
+ 0xbe804100, 0xbe824102,
+ 0xbe844104, 0xbe864106,
+ 0xbe884108, 0xbe8a410a,
+ 0xbe8c410c, 0xbe8e410e,
+ 0xd7610002, 0x0000f200,
0x80798179, 0xd7610002,
- 0x0000f202, 0x80798179,
- 0xd7610002, 0x0000f203,
+ 0x0000f201, 0x80798179,
+ 0xd7610002, 0x0000f202,
0x80798179, 0xd7610002,
- 0x0000f204, 0x80798179,
- 0xd7610002, 0x0000f205,
+ 0x0000f203, 0x80798179,
+ 0xd7610002, 0x0000f204,
0x80798179, 0xd7610002,
- 0x0000f206, 0x80798179,
- 0xd7610002, 0x0000f207,
+ 0x0000f205, 0x80798179,
+ 0xd7610002, 0x0000f206,
0x80798179, 0xd7610002,
- 0x0000f208, 0x80798179,
- 0xd7610002, 0x0000f209,
+ 0x0000f207, 0x80798179,
+ 0xd7610002, 0x0000f208,
0x80798179, 0xd7610002,
- 0x0000f20a, 0x80798179,
- 0xd7610002, 0x0000f20b,
+ 0x0000f209, 0x80798179,
+ 0xd7610002, 0x0000f20a,
0x80798179, 0xd7610002,
- 0x0000f20c, 0x80798179,
- 0xd7610002, 0x0000f20d,
+ 0x0000f20b, 0x80798179,
+ 0xd7610002, 0x0000f20c,
0x80798179, 0xd7610002,
- 0x0000f20e, 0x80798179,
- 0xd7610002, 0x0000f20f,
- 0x80798179, 0xbf06a079,
- 0xbfa10006, 0xe0685000,
- 0x701d0200, 0x8070ff70,
- 0x00000080, 0xbef90080,
- 0x7e040280, 0x807d907d,
- 0xbf0aff7d, 0x00000060,
- 0xbfa2ffbc, 0xbe804100,
- 0xbe824102, 0xbe844104,
- 0xbe864106, 0xbe884108,
- 0xbe8a410a, 0xd7610002,
- 0x0000f200, 0x80798179,
- 0xd7610002, 0x0000f201,
+ 0x0000f20d, 0x80798179,
+ 0xd7610002, 0x0000f20e,
0x80798179, 0xd7610002,
- 0x0000f202, 0x80798179,
- 0xd7610002, 0x0000f203,
+ 0x0000f20f, 0x80798179,
+ 0xbf06a079, 0xbfa10006,
+ 0xe0685000, 0x701d0200,
+ 0x8070ff70, 0x00000080,
+ 0xbef90080, 0x7e040280,
+ 0x807d907d, 0xbf0aff7d,
+ 0x00000060, 0xbfa2ffbc,
+ 0xbe804100, 0xbe824102,
+ 0xbe844104, 0xbe864106,
+ 0xbe884108, 0xbe8a410a,
+ 0xd7610002, 0x0000f200,
0x80798179, 0xd7610002,
- 0x0000f204, 0x80798179,
- 0xd7610002, 0x0000f205,
+ 0x0000f201, 0x80798179,
+ 0xd7610002, 0x0000f202,
0x80798179, 0xd7610002,
- 0x0000f206, 0x80798179,
- 0xd7610002, 0x0000f207,
+ 0x0000f203, 0x80798179,
+ 0xd7610002, 0x0000f204,
0x80798179, 0xd7610002,
- 0x0000f208, 0x80798179,
- 0xd7610002, 0x0000f209,
+ 0x0000f205, 0x80798179,
+ 0xd7610002, 0x0000f206,
0x80798179, 0xd7610002,
- 0x0000f20a, 0x80798179,
- 0xd7610002, 0x0000f20b,
- 0x80798179, 0xe0685000,
- 0x701d0200, 0xbefe00c1,
- 0x857d9973, 0x8b7d817d,
- 0xbf06817d, 0xbfa20002,
- 0xbeff0080, 0xbfa00001,
- 0xbeff00c1, 0xb8fb4306,
- 0x8b7bc17b, 0xbfa10044,
- 0xbfbd0000, 0x8b7aff6d,
- 0x80000000, 0xbfa10040,
- 0x847b867b, 0x847b827b,
- 0xbef6007b, 0xb8f03b05,
- 0x80708170, 0xbf0d9973,
- 0xbfa20002, 0x84708970,
- 0xbfa00001, 0x84708a70,
- 0xb8fa1e06, 0x847a8a7a,
- 0x80707a70, 0x8070ff70,
- 0x00000200, 0x8070ff70,
- 0x00000080, 0xbef600ff,
- 0x01000000, 0xd71f0000,
- 0x000100c1, 0xd7200000,
- 0x000200c1, 0x16000084,
- 0x857d9973, 0x8b7d817d,
- 0xbf06817d, 0xbefd0080,
- 0xbfa20012, 0xbe8300ff,
- 0x00000080, 0xbf800000,
- 0xbf800000, 0xbf800000,
- 0xd8d80000, 0x01000000,
- 0xbf890000, 0xe0685000,
- 0x701d0100, 0x807d037d,
- 0x80700370, 0xd5250000,
- 0x0001ff00, 0x00000080,
- 0xbf0a7b7d, 0xbfa2fff4,
- 0xbfa00011, 0xbe8300ff,
- 0x00000100, 0xbf800000,
- 0xbf800000, 0xbf800000,
- 0xd8d80000, 0x01000000,
- 0xbf890000, 0xe0685000,
- 0x701d0100, 0x807d037d,
- 0x80700370, 0xd5250000,
- 0x0001ff00, 0x00000100,
- 0xbf0a7b7d, 0xbfa2fff4,
+ 0x0000f207, 0x80798179,
+ 0xd7610002, 0x0000f208,
+ 0x80798179, 0xd7610002,
+ 0x0000f209, 0x80798179,
+ 0xd7610002, 0x0000f20a,
+ 0x80798179, 0xd7610002,
+ 0x0000f20b, 0x80798179,
+ 0xe0685000, 0x701d0200,
0xbefe00c1, 0x857d9973,
0x8b7d817d, 0xbf06817d,
- 0xbfa20004, 0xbef000ff,
- 0x00000200, 0xbeff0080,
- 0xbfa00003, 0xbef000ff,
- 0x00000400, 0xbeff00c1,
- 0xb8fb3b05, 0x807b817b,
- 0x847b827b, 0x857d9973,
+ 0xbfa20002, 0xbeff0080,
+ 0xbfa00001, 0xbeff00c1,
+ 0xb8fb4306, 0x8b7bc17b,
+ 0xbfa10044, 0xbfbd0000,
+ 0x8b7aff6d, 0x80000000,
+ 0xbfa10040, 0x847b867b,
+ 0x847b827b, 0xbef6007b,
+ 0xb8f03b05, 0x80708170,
+ 0xbf0d9973, 0xbfa20002,
+ 0x84708970, 0xbfa00001,
+ 0x84708a70, 0xb8fa1e06,
+ 0x847a8a7a, 0x80707a70,
+ 0x8070ff70, 0x00000200,
+ 0x8070ff70, 0x00000080,
+ 0xbef600ff, 0x01000000,
+ 0xd71f0000, 0x000100c1,
+ 0xd7200000, 0x000200c1,
+ 0x16000084, 0x857d9973,
0x8b7d817d, 0xbf06817d,
- 0xbfa20017, 0xbef600ff,
- 0x01000000, 0xbefd0084,
- 0xbf0a7b7d, 0xbfa10037,
- 0x7e008700, 0x7e028701,
- 0x7e048702, 0x7e068703,
- 0xe0685000, 0x701d0000,
- 0xe0685080, 0x701d0100,
- 0xe0685100, 0x701d0200,
- 0xe0685180, 0x701d0300,
- 0x807d847d, 0x8070ff70,
- 0x00000200, 0xbf0a7b7d,
- 0xbfa2ffef, 0xbfa00025,
+ 0xbefd0080, 0xbfa20012,
+ 0xbe8300ff, 0x00000080,
+ 0xbf800000, 0xbf800000,
+ 0xbf800000, 0xd8d80000,
+ 0x01000000, 0xbf890000,
+ 0xe0685000, 0x701d0100,
+ 0x807d037d, 0x80700370,
+ 0xd5250000, 0x0001ff00,
+ 0x00000080, 0xbf0a7b7d,
+ 0xbfa2fff4, 0xbfa00011,
+ 0xbe8300ff, 0x00000100,
+ 0xbf800000, 0xbf800000,
+ 0xbf800000, 0xd8d80000,
+ 0x01000000, 0xbf890000,
+ 0xe0685000, 0x701d0100,
+ 0x807d037d, 0x80700370,
+ 0xd5250000, 0x0001ff00,
+ 0x00000100, 0xbf0a7b7d,
+ 0xbfa2fff4, 0xbefe00c1,
+ 0x857d9973, 0x8b7d817d,
+ 0xbf06817d, 0xbfa20004,
+ 0xbef000ff, 0x00000200,
+ 0xbeff0080, 0xbfa00003,
+ 0xbef000ff, 0x00000400,
+ 0xbeff00c1, 0xb8fb3b05,
+ 0x807b817b, 0x847b827b,
+ 0x857d9973, 0x8b7d817d,
+ 0xbf06817d, 0xbfa20017,
0xbef600ff, 0x01000000,
0xbefd0084, 0xbf0a7b7d,
- 0xbfa10011, 0x7e008700,
+ 0xbfa10037, 0x7e008700,
0x7e028701, 0x7e048702,
0x7e068703, 0xe0685000,
- 0x701d0000, 0xe0685100,
- 0x701d0100, 0xe0685200,
- 0x701d0200, 0xe0685300,
+ 0x701d0000, 0xe0685080,
+ 0x701d0100, 0xe0685100,
+ 0x701d0200, 0xe0685180,
0x701d0300, 0x807d847d,
- 0x8070ff70, 0x00000400,
+ 0x8070ff70, 0x00000200,
0xbf0a7b7d, 0xbfa2ffef,
- 0xb8fb1e06, 0x8b7bc17b,
- 0xbfa1000c, 0x847b837b,
- 0x807b7d7b, 0xbefe00c1,
- 0xbeff0080, 0x7e008700,
+ 0xbfa00025, 0xbef600ff,
+ 0x01000000, 0xbefd0084,
+ 0xbf0a7b7d, 0xbfa10011,
+ 0x7e008700, 0x7e028701,
+ 0x7e048702, 0x7e068703,
0xe0685000, 0x701d0000,
- 0x807d817d, 0x8070ff70,
- 0x00000080, 0xbf0a7b7d,
- 0xbfa2fff8, 0xbfa00146,
- 0xbef4007e, 0x8b75ff7f,
- 0x0000ffff, 0x8c75ff75,
- 0x00040000, 0xbef60080,
- 0xbef700ff, 0x10807fac,
- 0xb8f202dc, 0x84729972,
- 0x8b6eff7f, 0x04000000,
- 0xbfa1003a, 0xbefe00c1,
- 0x857d9972, 0x8b7d817d,
- 0xbf06817d, 0xbfa20002,
- 0xbeff0080, 0xbfa00001,
- 0xbeff00c1, 0xb8ef4306,
- 0x8b6fc16f, 0xbfa1002f,
- 0x846f866f, 0x846f826f,
- 0xbef6006f, 0xb8f83b05,
- 0x80788178, 0xbf0d9972,
- 0xbfa20002, 0x84788978,
- 0xbfa00001, 0x84788a78,
- 0xb8ee1e06, 0x846e8a6e,
- 0x80786e78, 0x8078ff78,
- 0x00000200, 0x8078ff78,
- 0x00000080, 0xbef600ff,
- 0x01000000, 0x857d9972,
- 0x8b7d817d, 0xbf06817d,
- 0xbefd0080, 0xbfa2000c,
- 0xe0500000, 0x781d0000,
- 0xbf8903f7, 0xdac00000,
- 0x00000000, 0x807dff7d,
- 0x00000080, 0x8078ff78,
- 0x00000080, 0xbf0a6f7d,
- 0xbfa2fff5, 0xbfa0000b,
- 0xe0500000, 0x781d0000,
- 0xbf8903f7, 0xdac00000,
- 0x00000000, 0x807dff7d,
- 0x00000100, 0x8078ff78,
- 0x00000100, 0xbf0a6f7d,
- 0xbfa2fff5, 0xbef80080,
+ 0xe0685100, 0x701d0100,
+ 0xe0685200, 0x701d0200,
+ 0xe0685300, 0x701d0300,
+ 0x807d847d, 0x8070ff70,
+ 0x00000400, 0xbf0a7b7d,
+ 0xbfa2ffef, 0xb8fb1e06,
+ 0x8b7bc17b, 0xbfa1000c,
+ 0x847b837b, 0x807b7d7b,
+ 0xbefe00c1, 0xbeff0080,
+ 0x7e008700, 0xe0685000,
+ 0x701d0000, 0x807d817d,
+ 0x8070ff70, 0x00000080,
+ 0xbf0a7b7d, 0xbfa2fff8,
+ 0xbfa00146, 0xbef4007e,
+ 0x8b75ff7f, 0x0000ffff,
+ 0x8c75ff75, 0x00040000,
+ 0xbef60080, 0xbef700ff,
+ 0x10807fac, 0xb8f202dc,
+ 0x84729972, 0x8b6eff7f,
+ 0x04000000, 0xbfa1003a,
0xbefe00c1, 0x857d9972,
0x8b7d817d, 0xbf06817d,
0xbfa20002, 0xbeff0080,
0xbfa00001, 0xbeff00c1,
- 0xb8ef3b05, 0x806f816f,
- 0x846f826f, 0x857d9972,
- 0x8b7d817d, 0xbf06817d,
- 0xbfa20024, 0xbef600ff,
- 0x01000000, 0xbeee0078,
+ 0xb8ef4306, 0x8b6fc16f,
+ 0xbfa1002f, 0x846f866f,
+ 0x846f826f, 0xbef6006f,
+ 0xb8f83b05, 0x80788178,
+ 0xbf0d9972, 0xbfa20002,
+ 0x84788978, 0xbfa00001,
+ 0x84788a78, 0xb8ee1e06,
+ 0x846e8a6e, 0x80786e78,
0x8078ff78, 0x00000200,
- 0xbefd0084, 0xbf0a6f7d,
- 0xbfa10050, 0xe0505000,
- 0x781d0000, 0xe0505080,
- 0x781d0100, 0xe0505100,
- 0x781d0200, 0xe0505180,
- 0x781d0300, 0xbf8903f7,
- 0x7e008500, 0x7e028501,
- 0x7e048502, 0x7e068503,
- 0x807d847d, 0x8078ff78,
- 0x00000200, 0xbf0a6f7d,
- 0xbfa2ffee, 0xe0505000,
- 0x6e1d0000, 0xe0505080,
- 0x6e1d0100, 0xe0505100,
- 0x6e1d0200, 0xe0505180,
- 0x6e1d0300, 0xbf8903f7,
- 0xbfa00034, 0xbef600ff,
- 0x01000000, 0xbeee0078,
- 0x8078ff78, 0x00000400,
- 0xbefd0084, 0xbf0a6f7d,
- 0xbfa10012, 0xe0505000,
- 0x781d0000, 0xe0505100,
- 0x781d0100, 0xe0505200,
- 0x781d0200, 0xe0505300,
- 0x781d0300, 0xbf8903f7,
- 0x7e008500, 0x7e028501,
- 0x7e048502, 0x7e068503,
- 0x807d847d, 0x8078ff78,
- 0x00000400, 0xbf0a6f7d,
- 0xbfa2ffee, 0xb8ef1e06,
- 0x8b6fc16f, 0xbfa1000e,
- 0x846f836f, 0x806f7d6f,
- 0xbefe00c1, 0xbeff0080,
+ 0x8078ff78, 0x00000080,
+ 0xbef600ff, 0x01000000,
+ 0x857d9972, 0x8b7d817d,
+ 0xbf06817d, 0xbefd0080,
+ 0xbfa2000c, 0xe0500000,
+ 0x781d0000, 0xbf8903f7,
+ 0xdac00000, 0x00000000,
+ 0x807dff7d, 0x00000080,
+ 0x8078ff78, 0x00000080,
+ 0xbf0a6f7d, 0xbfa2fff5,
+ 0xbfa0000b, 0xe0500000,
+ 0x781d0000, 0xbf8903f7,
+ 0xdac00000, 0x00000000,
+ 0x807dff7d, 0x00000100,
+ 0x8078ff78, 0x00000100,
+ 0xbf0a6f7d, 0xbfa2fff5,
+ 0xbef80080, 0xbefe00c1,
+ 0x857d9972, 0x8b7d817d,
+ 0xbf06817d, 0xbfa20002,
+ 0xbeff0080, 0xbfa00001,
+ 0xbeff00c1, 0xb8ef3b05,
+ 0x806f816f, 0x846f826f,
+ 0x857d9972, 0x8b7d817d,
+ 0xbf06817d, 0xbfa20024,
+ 0xbef600ff, 0x01000000,
+ 0xbeee0078, 0x8078ff78,
+ 0x00000200, 0xbefd0084,
+ 0xbf0a6f7d, 0xbfa10050,
0xe0505000, 0x781d0000,
+ 0xe0505080, 0x781d0100,
+ 0xe0505100, 0x781d0200,
+ 0xe0505180, 0x781d0300,
0xbf8903f7, 0x7e008500,
- 0x807d817d, 0x8078ff78,
- 0x00000080, 0xbf0a6f7d,
- 0xbfa2fff7, 0xbeff00c1,
+ 0x7e028501, 0x7e048502,
+ 0x7e068503, 0x807d847d,
+ 0x8078ff78, 0x00000200,
+ 0xbf0a6f7d, 0xbfa2ffee,
0xe0505000, 0x6e1d0000,
- 0xe0505100, 0x6e1d0100,
- 0xe0505200, 0x6e1d0200,
- 0xe0505300, 0x6e1d0300,
- 0xbf8903f7, 0xb8f83b05,
- 0x80788178, 0xbf0d9972,
- 0xbfa20002, 0x84788978,
- 0xbfa00001, 0x84788a78,
- 0xb8ee1e06, 0x846e8a6e,
- 0x80786e78, 0x8078ff78,
- 0x00000200, 0x80f8ff78,
- 0x00000050, 0xbef600ff,
- 0x01000000, 0xbefd00ff,
- 0x0000006c, 0x80f89078,
- 0xf428403a, 0xf0000000,
- 0xbf89fc07, 0x80fd847d,
- 0xbf800000, 0xbe804300,
- 0xbe824302, 0x80f8a078,
- 0xf42c403a, 0xf0000000,
- 0xbf89fc07, 0x80fd887d,
- 0xbf800000, 0xbe804300,
- 0xbe824302, 0xbe844304,
- 0xbe864306, 0x80f8c078,
- 0xf430403a, 0xf0000000,
- 0xbf89fc07, 0x80fd907d,
- 0xbf800000, 0xbe804300,
- 0xbe824302, 0xbe844304,
- 0xbe864306, 0xbe884308,
- 0xbe8a430a, 0xbe8c430c,
- 0xbe8e430e, 0xbf06807d,
- 0xbfa1fff0, 0xb980f801,
- 0x00000000, 0xbfbd0000,
+ 0xe0505080, 0x6e1d0100,
+ 0xe0505100, 0x6e1d0200,
+ 0xe0505180, 0x6e1d0300,
+ 0xbf8903f7, 0xbfa00034,
+ 0xbef600ff, 0x01000000,
+ 0xbeee0078, 0x8078ff78,
+ 0x00000400, 0xbefd0084,
+ 0xbf0a6f7d, 0xbfa10012,
+ 0xe0505000, 0x781d0000,
+ 0xe0505100, 0x781d0100,
+ 0xe0505200, 0x781d0200,
+ 0xe0505300, 0x781d0300,
+ 0xbf8903f7, 0x7e008500,
+ 0x7e028501, 0x7e048502,
+ 0x7e068503, 0x807d847d,
+ 0x8078ff78, 0x00000400,
+ 0xbf0a6f7d, 0xbfa2ffee,
+ 0xb8ef1e06, 0x8b6fc16f,
+ 0xbfa1000e, 0x846f836f,
+ 0x806f7d6f, 0xbefe00c1,
+ 0xbeff0080, 0xe0505000,
+ 0x781d0000, 0xbf8903f7,
+ 0x7e008500, 0x807d817d,
+ 0x8078ff78, 0x00000080,
+ 0xbf0a6f7d, 0xbfa2fff7,
+ 0xbeff00c1, 0xe0505000,
+ 0x6e1d0000, 0xe0505100,
+ 0x6e1d0100, 0xe0505200,
+ 0x6e1d0200, 0xe0505300,
+ 0x6e1d0300, 0xbf8903f7,
0xb8f83b05, 0x80788178,
0xbf0d9972, 0xbfa20002,
0x84788978, 0xbfa00001,
0x84788a78, 0xb8ee1e06,
0x846e8a6e, 0x80786e78,
0x8078ff78, 0x00000200,
+ 0x80f8ff78, 0x00000050,
0xbef600ff, 0x01000000,
- 0xf4205bfa, 0xf0000000,
- 0x80788478, 0xf4205b3a,
+ 0xbefd00ff, 0x0000006c,
+ 0x80f89078, 0xf428403a,
+ 0xf0000000, 0xbf89fc07,
+ 0x80fd847d, 0xbf800000,
+ 0xbe804300, 0xbe824302,
+ 0x80f8a078, 0xf42c403a,
+ 0xf0000000, 0xbf89fc07,
+ 0x80fd887d, 0xbf800000,
+ 0xbe804300, 0xbe824302,
+ 0xbe844304, 0xbe864306,
+ 0x80f8c078, 0xf430403a,
+ 0xf0000000, 0xbf89fc07,
+ 0x80fd907d, 0xbf800000,
+ 0xbe804300, 0xbe824302,
+ 0xbe844304, 0xbe864306,
+ 0xbe884308, 0xbe8a430a,
+ 0xbe8c430c, 0xbe8e430e,
+ 0xbf06807d, 0xbfa1fff0,
+ 0xb980f801, 0x00000000,
+ 0xbfbd0000, 0xb8f83b05,
+ 0x80788178, 0xbf0d9972,
+ 0xbfa20002, 0x84788978,
+ 0xbfa00001, 0x84788a78,
+ 0xb8ee1e06, 0x846e8a6e,
+ 0x80786e78, 0x8078ff78,
+ 0x00000200, 0xbef600ff,
+ 0x01000000, 0xf4205bfa,
0xf0000000, 0x80788478,
- 0xf4205b7a, 0xf0000000,
- 0x80788478, 0xf4205c3a,
+ 0xf4205b3a, 0xf0000000,
+ 0x80788478, 0xf4205b7a,
0xf0000000, 0x80788478,
- 0xf4205c7a, 0xf0000000,
- 0x80788478, 0xf4205eba,
+ 0xf4205c3a, 0xf0000000,
+ 0x80788478, 0xf4205c7a,
0xf0000000, 0x80788478,
- 0xf4205efa, 0xf0000000,
- 0x80788478, 0xf4205e7a,
+ 0xf4205eba, 0xf0000000,
+ 0x80788478, 0xf4205efa,
0xf0000000, 0x80788478,
- 0xf4205cfa, 0xf0000000,
- 0x80788478, 0xf4205bba,
+ 0xf4205e7a, 0xf0000000,
+ 0x80788478, 0xf4205cfa,
0xf0000000, 0x80788478,
- 0xbf89fc07, 0xb96ef814,
0xf4205bba, 0xf0000000,
0x80788478, 0xbf89fc07,
- 0xb96ef815, 0xbefd006f,
- 0xbefe0070, 0xbeff0071,
- 0x8b6f7bff, 0x000003ff,
- 0xb96f4803, 0x8b6f7bff,
- 0xfffff800, 0x856f8b6f,
- 0xb96fa2c3, 0xb973f801,
- 0xb8ee3b05, 0x806e816e,
- 0xbf0d9972, 0xbfa20002,
- 0x846e896e, 0xbfa00001,
- 0x846e8a6e, 0xb8ef1e06,
- 0x846f8a6f, 0x806e6f6e,
- 0x806eff6e, 0x00000200,
- 0x806e746e, 0x826f8075,
- 0x8b6fff6f, 0x0000ffff,
- 0xf4085c37, 0xf8000050,
- 0xf4085d37, 0xf8000060,
- 0xf4005e77, 0xf8000074,
- 0xbf89fc07, 0x8b6dff6d,
- 0x0000ffff, 0x8bfe7e7e,
- 0x8bea6a6a, 0xb8eef802,
- 0xbf0d866e, 0xbfa20002,
- 0xb97af802, 0xbe80486c,
- 0xb97af802, 0xbe804a6c,
- 0xbfb00000, 0xbf9f0000,
+ 0xb96ef814, 0xf4205bba,
+ 0xf0000000, 0x80788478,
+ 0xbf89fc07, 0xb96ef815,
+ 0xbefd006f, 0xbefe0070,
+ 0xbeff0071, 0x8b6f7bff,
+ 0x000003ff, 0xb96f4803,
+ 0x8b6f7bff, 0xfffff800,
+ 0x856f8b6f, 0xb96fa2c3,
+ 0xb973f801, 0xb8ee3b05,
+ 0x806e816e, 0xbf0d9972,
+ 0xbfa20002, 0x846e896e,
+ 0xbfa00001, 0x846e8a6e,
+ 0xb8ef1e06, 0x846f8a6f,
+ 0x806e6f6e, 0x806eff6e,
+ 0x00000200, 0x806e746e,
+ 0x826f8075, 0x8b6fff6f,
+ 0x0000ffff, 0xf4085c37,
+ 0xf8000050, 0xf4085d37,
+ 0xf8000060, 0xf4005e77,
+ 0xf8000074, 0xbf89fc07,
+ 0x8b6dff6d, 0x0000ffff,
+ 0x8bfe7e7e, 0x8bea6a6a,
+ 0xb8eef802, 0xbf0d866e,
+ 0xbfa20002, 0xb97af802,
+ 0xbe80486c, 0xb97af802,
+ 0xbe804a6c, 0xbfb00000,
0xbf9f0000, 0xbf9f0000,
0xbf9f0000, 0xbf9f0000,
+ 0xbf9f0000, 0x00000000,
};
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
index 0f81670f6f9c..8b92c33c2a7c 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
@@ -186,6 +186,12 @@ L_SKIP_RESTORE:
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
#if SW_SA_TRAP
+ // If ttmp1[30] is set then issue s_barrier to unblock dependent waves.
+ s_bitcmp1_b32 s_save_pc_hi, 30
+ s_cbranch_scc0 L_TRAP_NO_BARRIER
+ s_barrier
+
+L_TRAP_NO_BARRIER:
// If ttmp1[31] is set then trap may occur early.
// Spin wait until SAVECTX exception is raised.
s_bitcmp1_b32 s_save_pc_hi, 31
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 2797029bd500..22b077ac9a19 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -973,12 +973,10 @@ out_unlock_prange:
out_unlock_svms:
mutex_unlock(&p->svms.lock);
out_unref_process:
+ pr_debug("CPU fault svms 0x%p address 0x%lx done\n", &p->svms, addr);
kfd_unref_process(p);
out_mmput:
mmput(mm);
-
- pr_debug("CPU fault svms 0x%p address 0x%lx done\n", &p->svms, addr);
-
return r ? VM_FAULT_SIGBUS : 0;
}
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index c053cb79cd06..589bee9acf16 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -1549,6 +1549,9 @@ static int amdgpu_dm_init(struct amdgpu_device *adev)
adev->dm.dc->debug.visual_confirm = amdgpu_dc_visual_confirm;
+ /* TODO: Remove after DP2 receiver gets proper support of Cable ID feature */
+ adev->dm.dc->debug.ignore_cable_id = true;
+
r = dm_dmub_hw_init(adev);
if (r) {
DRM_ERROR("DMUB interface failed to initialize: status=%d\n", r);
diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c
index 1c612ccf1944..6f77d8e538ab 100644
--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c
+++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c
@@ -157,6 +157,7 @@ void dcn32_init_clocks(struct clk_mgr *clk_mgr_base)
struct clk_mgr_internal *clk_mgr = TO_CLK_MGR_INTERNAL(clk_mgr_base);
unsigned int num_levels;
struct clk_limit_num_entries *num_entries_per_clk = &clk_mgr_base->bw_params->clk_table.num_entries_per_clk;
+ unsigned int i;
memset(&(clk_mgr_base->clks), 0, sizeof(struct dc_clocks));
clk_mgr_base->clks.p_state_change_support = true;
@@ -205,18 +206,17 @@ void dcn32_init_clocks(struct clk_mgr *clk_mgr_base)
clk_mgr->dpm_present = true;
if (clk_mgr_base->ctx->dc->debug.min_disp_clk_khz) {
- unsigned int i;
-
for (i = 0; i < num_levels; i++)
if (clk_mgr_base->bw_params->clk_table.entries[i].dispclk_mhz
< khz_to_mhz_ceil(clk_mgr_base->ctx->dc->debug.min_disp_clk_khz))
clk_mgr_base->bw_params->clk_table.entries[i].dispclk_mhz
= khz_to_mhz_ceil(clk_mgr_base->ctx->dc->debug.min_disp_clk_khz);
}
+ for (i = 0; i < num_levels; i++)
+ if (clk_mgr_base->bw_params->clk_table.entries[i].dispclk_mhz > 1950)
+ clk_mgr_base->bw_params->clk_table.entries[i].dispclk_mhz = 1950;
if (clk_mgr_base->ctx->dc->debug.min_dpp_clk_khz) {
- unsigned int i;
-
for (i = 0; i < num_levels; i++)
if (clk_mgr_base->bw_params->clk_table.entries[i].dppclk_mhz
< khz_to_mhz_ceil(clk_mgr_base->ctx->dc->debug.min_dpp_clk_khz))
@@ -669,6 +669,9 @@ static void dcn32_get_memclk_states_from_smu(struct clk_mgr *clk_mgr_base)
&clk_mgr_base->bw_params->clk_table.entries[0].memclk_mhz,
&num_entries_per_clk->num_memclk_levels);
+ /* memclk must have at least one level */
+ num_entries_per_clk->num_memclk_levels = num_entries_per_clk->num_memclk_levels ? num_entries_per_clk->num_memclk_levels : 1;
+
dcn32_init_single_clock(clk_mgr, PPCLK_FCLK,
&clk_mgr_base->bw_params->clk_table.entries[0].fclk_mhz,
&num_entries_per_clk->num_fclk_levels);
diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h
index bfc5474c0f4c..737b221ca689 100644
--- a/drivers/gpu/drm/amd/display/dc/dc.h
+++ b/drivers/gpu/drm/amd/display/dc/dc.h
@@ -852,6 +852,7 @@ struct dc_debug_options {
bool enable_double_buffered_dsc_pg_support;
bool enable_dp_dig_pixel_rate_div_policy;
enum lttpr_mode lttpr_mode_override;
+ unsigned int dsc_delay_factor_wa_x1000;
};
struct gpu_info_soc_bounding_box_v1_0;
diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c
index 4996d2810edb..938dba5249d4 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c
@@ -623,6 +623,10 @@ void hubp2_cursor_set_attributes(
hubp->att.size.bits.width = attr->width;
hubp->att.size.bits.height = attr->height;
hubp->att.cur_ctl.bits.mode = attr->color_format;
+
+ hubp->cur_rect.w = attr->width;
+ hubp->cur_rect.h = attr->height;
+
hubp->att.cur_ctl.bits.pitch = hw_pitch;
hubp->att.cur_ctl.bits.line_per_chunk = lpc;
hubp->att.cur_ctl.bits.cur_2x_magnify = attr->attribute_flags.bits.ENABLE_MAGNIFICATION;
diff --git a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c
index d0ad72caead2..9066c511a052 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c
@@ -847,7 +847,7 @@ static const struct resource_caps res_cap_dcn314 = {
.num_ddc = 5,
.num_vmid = 16,
.num_mpc_3dlut = 2,
- .num_dsc = 3,
+ .num_dsc = 4,
};
static const struct dc_plane_cap plane_cap = {
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn20/dcn20_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn20/dcn20_fpu.c
index d680f1c5b69f..45db40c41882 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn20/dcn20_fpu.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn20/dcn20_fpu.c
@@ -1228,6 +1228,7 @@ int dcn20_populate_dml_pipes_from_context(
pipes[pipe_cnt].pipe.src.dcc = false;
pipes[pipe_cnt].pipe.src.dcc_rate = 1;
pipes[pipe_cnt].pipe.dest.synchronized_vblank_all_planes = synchronized_vblank;
+ pipes[pipe_cnt].pipe.dest.synchronize_timings = synchronized_vblank;
pipes[pipe_cnt].pipe.dest.hblank_start = timing->h_total - timing->h_front_porch;
pipes[pipe_cnt].pipe.dest.hblank_end = pipes[pipe_cnt].pipe.dest.hblank_start
- timing->h_addressable
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c
index 819de0f11012..f37c9a6b3b7e 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c
@@ -2359,9 +2359,11 @@ void dcn32_update_bw_bounding_box_fpu(struct dc *dc, struct clk_bw_params *bw_pa
if (dc->ctx->dc_bios->vram_info.dram_channel_width_bytes)
dcn3_2_soc.dram_channel_width_bytes = dc->ctx->dc_bios->vram_info.dram_channel_width_bytes;
-
}
+ /* DML DSC delay factor workaround */
+ dcn3_2_ip.dsc_delay_factor_wa = dc->debug.dsc_delay_factor_wa_x1000 / 1000.0;
+
/* Override dispclk_dppclk_vco_speed_mhz from Clk Mgr */
dcn3_2_soc.dispclk_dppclk_vco_speed_mhz = dc->clk_mgr->dentist_vco_freq_khz / 1000.0;
dc->dml.soc.dispclk_dppclk_vco_speed_mhz = dc->clk_mgr->dentist_vco_freq_khz / 1000.0;
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c
index 5b91660a6496..3d184679f129 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c
@@ -364,10 +364,11 @@ static void DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerforman
for (k = 0; k < mode_lib->vba.NumberOfActiveSurfaces; ++k) {
v->DSCDelay[k] = dml32_DSCDelayRequirement(mode_lib->vba.DSCEnabled[k],
mode_lib->vba.ODMCombineEnabled[k], mode_lib->vba.DSCInputBitPerComponent[k],
- mode_lib->vba.OutputBpp[k], mode_lib->vba.HActive[k], mode_lib->vba.HTotal[k],
+ mode_lib->vba.OutputBppPerState[mode_lib->vba.VoltageLevel][k],
+ mode_lib->vba.HActive[k], mode_lib->vba.HTotal[k],
mode_lib->vba.NumberOfDSCSlices[k], mode_lib->vba.OutputFormat[k],
mode_lib->vba.Output[k], mode_lib->vba.PixelClock[k],
- mode_lib->vba.PixelClockBackEnd[k]);
+ mode_lib->vba.PixelClockBackEnd[k], mode_lib->vba.ip.dsc_delay_factor_wa);
}
for (k = 0; k < mode_lib->vba.NumberOfActiveSurfaces; ++k)
@@ -1627,7 +1628,7 @@ static void mode_support_configuration(struct vba_vars_st *v,
&& !mode_lib->vba.MSOOrODMSplitWithNonDPLink
&& !mode_lib->vba.NotEnoughLanesForMSO
&& mode_lib->vba.LinkCapacitySupport[i] == true && !mode_lib->vba.P2IWith420
- && !mode_lib->vba.DSCOnlyIfNecessaryWithBPP
+ //&& !mode_lib->vba.DSCOnlyIfNecessaryWithBPP
&& !mode_lib->vba.DSC422NativeNotSupported
&& !mode_lib->vba.MPCCombineMethodIncompatible
&& mode_lib->vba.ODMCombine2To1SupportCheckOK[i] == true
@@ -2475,7 +2476,8 @@ void dml32_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_l
mode_lib->vba.OutputBppPerState[i][k], mode_lib->vba.HActive[k],
mode_lib->vba.HTotal[k], mode_lib->vba.NumberOfDSCSlices[k],
mode_lib->vba.OutputFormat[k], mode_lib->vba.Output[k],
- mode_lib->vba.PixelClock[k], mode_lib->vba.PixelClockBackEnd[k]);
+ mode_lib->vba.PixelClock[k], mode_lib->vba.PixelClockBackEnd[k],
+ mode_lib->vba.ip.dsc_delay_factor_wa);
}
for (k = 0; k <= mode_lib->vba.NumberOfActiveSurfaces - 1; k++) {
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_util_32.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_util_32.c
index ad66e241f9ae..968924c491c1 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_util_32.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_util_32.c
@@ -1726,7 +1726,8 @@ unsigned int dml32_DSCDelayRequirement(bool DSCEnabled,
enum output_format_class OutputFormat,
enum output_encoder_class Output,
double PixelClock,
- double PixelClockBackEnd)
+ double PixelClockBackEnd,
+ double dsc_delay_factor_wa)
{
unsigned int DSCDelayRequirement_val;
@@ -1746,7 +1747,7 @@ unsigned int dml32_DSCDelayRequirement(bool DSCEnabled,
}
DSCDelayRequirement_val = DSCDelayRequirement_val + (HTotal - HActive) *
- dml_ceil(DSCDelayRequirement_val / HActive, 1);
+ dml_ceil((double)DSCDelayRequirement_val / HActive, 1);
DSCDelayRequirement_val = DSCDelayRequirement_val * PixelClock / PixelClockBackEnd;
@@ -1764,7 +1765,7 @@ unsigned int dml32_DSCDelayRequirement(bool DSCEnabled,
dml_print("DML::%s: DSCDelayRequirement_val = %d\n", __func__, DSCDelayRequirement_val);
#endif
- return DSCDelayRequirement_val;
+ return dml_ceil(DSCDelayRequirement_val * dsc_delay_factor_wa, 1);
}
void dml32_CalculateSurfaceSizeInMall(
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_util_32.h b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_util_32.h
index 55cead0d4237..2c3827546ac7 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_util_32.h
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_util_32.h
@@ -327,7 +327,8 @@ unsigned int dml32_DSCDelayRequirement(bool DSCEnabled,
enum output_format_class OutputFormat,
enum output_encoder_class Output,
double PixelClock,
- double PixelClockBackEnd);
+ double PixelClockBackEnd,
+ double dsc_delay_factor_wa);
void dml32_CalculateSurfaceSizeInMall(
unsigned int NumberOfActiveSurfaces,
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_rq_dlg_calc_32.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_rq_dlg_calc_32.c
index a1276f6b9581..395ae8761980 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_rq_dlg_calc_32.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_rq_dlg_calc_32.c
@@ -291,8 +291,8 @@ void dml32_rq_dlg_get_dlg_reg(struct display_mode_lib *mode_lib,
dml_print("DML_DLG: %s: vready_after_vcount0 = %d\n", __func__, dlg_regs->vready_after_vcount0);
- dst_x_after_scaler = get_dst_x_after_scaler(mode_lib, e2e_pipe_param, num_pipes, pipe_idx);
- dst_y_after_scaler = get_dst_y_after_scaler(mode_lib, e2e_pipe_param, num_pipes, pipe_idx);
+ dst_x_after_scaler = dml_ceil(get_dst_x_after_scaler(mode_lib, e2e_pipe_param, num_pipes, pipe_idx), 1);
+ dst_y_after_scaler = dml_ceil(get_dst_y_after_scaler(mode_lib, e2e_pipe_param, num_pipes, pipe_idx), 1);
// do some adjustment on the dst_after scaler to account for odm combine mode
dml_print("DML_DLG: %s: input dst_x_after_scaler = %d\n", __func__, dst_x_after_scaler);
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn321/dcn321_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn321/dcn321_fpu.c
index dd90f241e906..ec0486efab14 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn321/dcn321_fpu.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn321/dcn321_fpu.c
@@ -29,6 +29,7 @@
#include "dcn321_fpu.h"
#include "dcn32/dcn32_resource.h"
#include "dcn321/dcn321_resource.h"
+#include "dml/dcn32/display_mode_vba_util_32.h"
#define DCN3_2_DEFAULT_DET_SIZE 256
@@ -119,15 +120,15 @@ struct _vcs_dpi_soc_bounding_box_st dcn3_21_soc = {
},
},
.num_states = 1,
- .sr_exit_time_us = 12.36,
- .sr_enter_plus_exit_time_us = 16.72,
+ .sr_exit_time_us = 19.95,
+ .sr_enter_plus_exit_time_us = 24.36,
.sr_exit_z8_time_us = 285.0,
.sr_enter_plus_exit_z8_time_us = 320,
.writeback_latency_us = 12.0,
.round_trip_ping_latency_dcfclk_cycles = 263,
- .urgent_latency_pixel_data_only_us = 4.0,
- .urgent_latency_pixel_mixed_with_vm_data_us = 4.0,
- .urgent_latency_vm_data_only_us = 4.0,
+ .urgent_latency_pixel_data_only_us = 9.35,
+ .urgent_latency_pixel_mixed_with_vm_data_us = 9.35,
+ .urgent_latency_vm_data_only_us = 9.35,
.fclk_change_latency_us = 20,
.usr_retraining_latency_us = 2,
.smn_latency_us = 2,
@@ -538,9 +539,11 @@ void dcn321_update_bw_bounding_box_fpu(struct dc *dc, struct clk_bw_params *bw_p
if (dc->ctx->dc_bios->vram_info.dram_channel_width_bytes)
dcn3_21_soc.dram_channel_width_bytes = dc->ctx->dc_bios->vram_info.dram_channel_width_bytes;
-
}
+ /* DML DSC delay factor workaround */
+ dcn3_21_ip.dsc_delay_factor_wa = dc->debug.dsc_delay_factor_wa_x1000 / 1000.0;
+
/* Override dispclk_dppclk_vco_speed_mhz from Clk Mgr */
dcn3_21_soc.dispclk_dppclk_vco_speed_mhz = dc->clk_mgr->dentist_vco_freq_khz / 1000.0;
dc->dml.soc.dispclk_dppclk_vco_speed_mhz = dc->clk_mgr->dentist_vco_freq_khz / 1000.0;
diff --git a/drivers/gpu/drm/amd/display/dc/dml/display_mode_structs.h b/drivers/gpu/drm/amd/display/dc/dml/display_mode_structs.h
index f33a8879b05a..d7be01ac0751 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/display_mode_structs.h
+++ b/drivers/gpu/drm/amd/display/dc/dml/display_mode_structs.h
@@ -364,6 +364,9 @@ struct _vcs_dpi_ip_params_st {
unsigned int max_num_dp2p0_outputs;
unsigned int max_num_dp2p0_streams;
unsigned int VBlankNomDefaultUS;
+
+ /* DM workarounds */
+ double dsc_delay_factor_wa; // TODO: Remove after implementing root cause fix
};
struct _vcs_dpi_display_xfc_params_st {
diff --git a/drivers/gpu/drm/amd/display/dc/dml/display_mode_vba.c b/drivers/gpu/drm/amd/display/dc/dml/display_mode_vba.c
index 03924aed8d5c..8e6585dab20e 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/display_mode_vba.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/display_mode_vba.c
@@ -625,7 +625,7 @@ static void fetch_pipe_params(struct display_mode_lib *mode_lib)
mode_lib->vba.skip_dio_check[mode_lib->vba.NumberOfActivePlanes] =
dout->is_virtual;
- if (!dout->dsc_enable)
+ if (dout->dsc_enable)
mode_lib->vba.ForcedOutputLinkBPP[mode_lib->vba.NumberOfActivePlanes] = dout->output_bpp;
else
mode_lib->vba.ForcedOutputLinkBPP[mode_lib->vba.NumberOfActivePlanes] = 0.0;
diff --git a/drivers/gpu/drm/drm_format_helper.c b/drivers/gpu/drm/drm_format_helper.c
index e2f76621453c..3ee59bae9d2f 100644
--- a/drivers/gpu/drm/drm_format_helper.c
+++ b/drivers/gpu/drm/drm_format_helper.c
@@ -807,6 +807,38 @@ static bool is_listed_fourcc(const uint32_t *fourccs, size_t nfourccs, uint32_t
return false;
}
+static const uint32_t conv_from_xrgb8888[] = {
+ DRM_FORMAT_XRGB8888,
+ DRM_FORMAT_ARGB8888,
+ DRM_FORMAT_XRGB2101010,
+ DRM_FORMAT_ARGB2101010,
+ DRM_FORMAT_RGB565,
+ DRM_FORMAT_RGB888,
+};
+
+static const uint32_t conv_from_rgb565_888[] = {
+ DRM_FORMAT_XRGB8888,
+ DRM_FORMAT_ARGB8888,
+};
+
+static bool is_conversion_supported(uint32_t from, uint32_t to)
+{
+ switch (from) {
+ case DRM_FORMAT_XRGB8888:
+ case DRM_FORMAT_ARGB8888:
+ return is_listed_fourcc(conv_from_xrgb8888, ARRAY_SIZE(conv_from_xrgb8888), to);
+ case DRM_FORMAT_RGB565:
+ case DRM_FORMAT_RGB888:
+ return is_listed_fourcc(conv_from_rgb565_888, ARRAY_SIZE(conv_from_rgb565_888), to);
+ case DRM_FORMAT_XRGB2101010:
+ return to == DRM_FORMAT_ARGB2101010;
+ case DRM_FORMAT_ARGB2101010:
+ return to == DRM_FORMAT_XRGB2101010;
+ default:
+ return false;
+ }
+}
+
/**
* drm_fb_build_fourcc_list - Filters a list of supported color formats against
* the device's native formats
@@ -827,7 +859,9 @@ static bool is_listed_fourcc(const uint32_t *fourccs, size_t nfourccs, uint32_t
* be handed over to drm_universal_plane_init() et al. Native formats
* will go before emulated formats. Other heuristics might be applied
* to optimize the order. Formats near the beginning of the list are
- * usually preferred over formats near the end of the list.
+ * usually preferred over formats near the end of the list. Formats
+ * without conversion helpers will be skipped. New drivers should only
+ * pass in XRGB8888 and avoid exposing additional emulated formats.
*
* Returns:
* The number of color-formats 4CC codes returned in @fourccs_out.
@@ -839,7 +873,7 @@ size_t drm_fb_build_fourcc_list(struct drm_device *dev,
{
u32 *fourccs = fourccs_out;
const u32 *fourccs_end = fourccs_out + nfourccs_out;
- bool found_native = false;
+ uint32_t native_format = 0;
size_t i;
/*
@@ -858,27 +892,19 @@ size_t drm_fb_build_fourcc_list(struct drm_device *dev,
drm_dbg_kms(dev, "adding native format %p4cc\n", &fourcc);
- if (!found_native)
- found_native = is_listed_fourcc(driver_fourccs, driver_nfourccs, fourcc);
+ /*
+ * There should only be one native format with the current API.
+ * This API needs to be refactored to correctly support arbitrary
+ * sets of native formats, since it needs to report which native
+ * format to use for each emulated format.
+ */
+ if (!native_format)
+ native_format = fourcc;
*fourccs = fourcc;
++fourccs;
}
/*
- * The plane's atomic_update helper converts the framebuffer's color format
- * to a native format when copying to device memory.
- *
- * If there is not a single format supported by both, device and
- * driver, the native formats are likely not supported by the conversion
- * helpers. Therefore *only* support the native formats and add a
- * conversion helper ASAP.
- */
- if (!found_native) {
- drm_warn(dev, "Format conversion helpers required to add extra formats.\n");
- goto out;
- }
-
- /*
* The extra formats, emulated by the driver, go second.
*/
@@ -890,6 +916,9 @@ size_t drm_fb_build_fourcc_list(struct drm_device *dev,
} else if (fourccs == fourccs_end) {
drm_warn(dev, "Ignoring emulated format %p4cc\n", &fourcc);
continue; /* end of available output buffer */
+ } else if (!is_conversion_supported(fourcc, native_format)) {
+ drm_dbg_kms(dev, "Unsupported emulated format %p4cc\n", &fourcc);
+ continue; /* format is not supported for conversion */
}
drm_dbg_kms(dev, "adding emulated format %p4cc\n", &fourcc);
@@ -898,7 +927,6 @@ size_t drm_fb_build_fourcc_list(struct drm_device *dev,
++fourccs;
}
-out:
return fourccs - fourccs_out;
}
EXPORT_SYMBOL(drm_fb_build_fourcc_list);
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index a26edcdadc21..cea00aaca04b 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -282,6 +282,7 @@ i915-y += \
display/intel_ddi.o \
display/intel_ddi_buf_trans.o \
display/intel_display_trace.o \
+ display/intel_dkl_phy.o \
display/intel_dp.o \
display/intel_dp_aux.o \
display/intel_dp_aux_backlight.o \
diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c
index da8472cdc135..69ecf2a3d6c6 100644
--- a/drivers/gpu/drm/i915/display/intel_ddi.c
+++ b/drivers/gpu/drm/i915/display/intel_ddi.c
@@ -43,6 +43,7 @@
#include "intel_de.h"
#include "intel_display_power.h"
#include "intel_display_types.h"
+#include "intel_dkl_phy.h"
#include "intel_dp.h"
#include "intel_dp_link_training.h"
#include "intel_dp_mst.h"
@@ -1262,33 +1263,30 @@ static void tgl_dkl_phy_set_signal_levels(struct intel_encoder *encoder,
for (ln = 0; ln < 2; ln++) {
int level;
- intel_de_write(dev_priv, HIP_INDEX_REG(tc_port),
- HIP_INDEX_VAL(tc_port, ln));
-
- intel_de_write(dev_priv, DKL_TX_PMD_LANE_SUS(tc_port), 0);
+ intel_dkl_phy_write(dev_priv, DKL_TX_PMD_LANE_SUS(tc_port), ln, 0);
level = intel_ddi_level(encoder, crtc_state, 2*ln+0);
- intel_de_rmw(dev_priv, DKL_TX_DPCNTL0(tc_port),
- DKL_TX_PRESHOOT_COEFF_MASK |
- DKL_TX_DE_EMPAHSIS_COEFF_MASK |
- DKL_TX_VSWING_CONTROL_MASK,
- DKL_TX_PRESHOOT_COEFF(trans->entries[level].dkl.preshoot) |
- DKL_TX_DE_EMPHASIS_COEFF(trans->entries[level].dkl.de_emphasis) |
- DKL_TX_VSWING_CONTROL(trans->entries[level].dkl.vswing));
+ intel_dkl_phy_rmw(dev_priv, DKL_TX_DPCNTL0(tc_port), ln,
+ DKL_TX_PRESHOOT_COEFF_MASK |
+ DKL_TX_DE_EMPAHSIS_COEFF_MASK |
+ DKL_TX_VSWING_CONTROL_MASK,
+ DKL_TX_PRESHOOT_COEFF(trans->entries[level].dkl.preshoot) |
+ DKL_TX_DE_EMPHASIS_COEFF(trans->entries[level].dkl.de_emphasis) |
+ DKL_TX_VSWING_CONTROL(trans->entries[level].dkl.vswing));
level = intel_ddi_level(encoder, crtc_state, 2*ln+1);
- intel_de_rmw(dev_priv, DKL_TX_DPCNTL1(tc_port),
- DKL_TX_PRESHOOT_COEFF_MASK |
- DKL_TX_DE_EMPAHSIS_COEFF_MASK |
- DKL_TX_VSWING_CONTROL_MASK,
- DKL_TX_PRESHOOT_COEFF(trans->entries[level].dkl.preshoot) |
- DKL_TX_DE_EMPHASIS_COEFF(trans->entries[level].dkl.de_emphasis) |
- DKL_TX_VSWING_CONTROL(trans->entries[level].dkl.vswing));
+ intel_dkl_phy_rmw(dev_priv, DKL_TX_DPCNTL1(tc_port), ln,
+ DKL_TX_PRESHOOT_COEFF_MASK |
+ DKL_TX_DE_EMPAHSIS_COEFF_MASK |
+ DKL_TX_VSWING_CONTROL_MASK,
+ DKL_TX_PRESHOOT_COEFF(trans->entries[level].dkl.preshoot) |
+ DKL_TX_DE_EMPHASIS_COEFF(trans->entries[level].dkl.de_emphasis) |
+ DKL_TX_VSWING_CONTROL(trans->entries[level].dkl.vswing));
- intel_de_rmw(dev_priv, DKL_TX_DPCNTL2(tc_port),
- DKL_TX_DP20BITMODE, 0);
+ intel_dkl_phy_rmw(dev_priv, DKL_TX_DPCNTL2(tc_port), ln,
+ DKL_TX_DP20BITMODE, 0);
if (IS_ALDERLAKE_P(dev_priv)) {
u32 val;
@@ -1306,10 +1304,10 @@ static void tgl_dkl_phy_set_signal_levels(struct intel_encoder *encoder,
val |= DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX2(0);
}
- intel_de_rmw(dev_priv, DKL_TX_DPCNTL2(tc_port),
- DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX1_MASK |
- DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX2_MASK,
- val);
+ intel_dkl_phy_rmw(dev_priv, DKL_TX_DPCNTL2(tc_port), ln,
+ DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX1_MASK |
+ DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX2_MASK,
+ val);
}
}
}
@@ -2019,12 +2017,8 @@ icl_program_mg_dp_mode(struct intel_digital_port *dig_port,
return;
if (DISPLAY_VER(dev_priv) >= 12) {
- intel_de_write(dev_priv, HIP_INDEX_REG(tc_port),
- HIP_INDEX_VAL(tc_port, 0x0));
- ln0 = intel_de_read(dev_priv, DKL_DP_MODE(tc_port));
- intel_de_write(dev_priv, HIP_INDEX_REG(tc_port),
- HIP_INDEX_VAL(tc_port, 0x1));
- ln1 = intel_de_read(dev_priv, DKL_DP_MODE(tc_port));
+ ln0 = intel_dkl_phy_read(dev_priv, DKL_DP_MODE(tc_port), 0);
+ ln1 = intel_dkl_phy_read(dev_priv, DKL_DP_MODE(tc_port), 1);
} else {
ln0 = intel_de_read(dev_priv, MG_DP_MODE(0, tc_port));
ln1 = intel_de_read(dev_priv, MG_DP_MODE(1, tc_port));
@@ -2085,12 +2079,8 @@ icl_program_mg_dp_mode(struct intel_digital_port *dig_port,
}
if (DISPLAY_VER(dev_priv) >= 12) {
- intel_de_write(dev_priv, HIP_INDEX_REG(tc_port),
- HIP_INDEX_VAL(tc_port, 0x0));
- intel_de_write(dev_priv, DKL_DP_MODE(tc_port), ln0);
- intel_de_write(dev_priv, HIP_INDEX_REG(tc_port),
- HIP_INDEX_VAL(tc_port, 0x1));
- intel_de_write(dev_priv, DKL_DP_MODE(tc_port), ln1);
+ intel_dkl_phy_write(dev_priv, DKL_DP_MODE(tc_port), 0, ln0);
+ intel_dkl_phy_write(dev_priv, DKL_DP_MODE(tc_port), 1, ln1);
} else {
intel_de_write(dev_priv, MG_DP_MODE(0, tc_port), ln0);
intel_de_write(dev_priv, MG_DP_MODE(1, tc_port), ln1);
@@ -3094,10 +3084,8 @@ static void adlp_tbt_to_dp_alt_switch_wa(struct intel_encoder *encoder)
enum tc_port tc_port = intel_port_to_tc(i915, encoder->port);
int ln;
- for (ln = 0; ln < 2; ln++) {
- intel_de_write(i915, HIP_INDEX_REG(tc_port), HIP_INDEX_VAL(tc_port, ln));
- intel_de_rmw(i915, DKL_PCS_DW5(tc_port), DKL_PCS_DW5_CORE_SOFTRESET, 0);
- }
+ for (ln = 0; ln < 2; ln++)
+ intel_dkl_phy_rmw(i915, DKL_PCS_DW5(tc_port), ln, DKL_PCS_DW5_CORE_SOFTRESET, 0);
}
static void intel_ddi_prepare_link_retrain(struct intel_dp *intel_dp,
diff --git a/drivers/gpu/drm/i915/display/intel_display_core.h b/drivers/gpu/drm/i915/display/intel_display_core.h
index 96cf994b0ad1..9b51148e8ba5 100644
--- a/drivers/gpu/drm/i915/display/intel_display_core.h
+++ b/drivers/gpu/drm/i915/display/intel_display_core.h
@@ -316,6 +316,14 @@ struct intel_display {
} dbuf;
struct {
+ /*
+ * dkl.phy_lock protects against concurrent access of the
+ * Dekel TypeC PHYs.
+ */
+ spinlock_t phy_lock;
+ } dkl;
+
+ struct {
/* VLV/CHV/BXT/GLK DSI MMIO register base address */
u32 mmio_base;
} dsi;
diff --git a/drivers/gpu/drm/i915/display/intel_display_power_well.c b/drivers/gpu/drm/i915/display/intel_display_power_well.c
index df7ee4969ef1..1d18eee56253 100644
--- a/drivers/gpu/drm/i915/display/intel_display_power_well.c
+++ b/drivers/gpu/drm/i915/display/intel_display_power_well.c
@@ -12,6 +12,7 @@
#include "intel_de.h"
#include "intel_display_power_well.h"
#include "intel_display_types.h"
+#include "intel_dkl_phy.h"
#include "intel_dmc.h"
#include "intel_dpio_phy.h"
#include "intel_dpll.h"
@@ -529,11 +530,9 @@ icl_tc_phy_aux_power_well_enable(struct drm_i915_private *dev_priv,
enum tc_port tc_port;
tc_port = TGL_AUX_PW_TO_TC_PORT(i915_power_well_instance(power_well)->hsw.idx);
- intel_de_write(dev_priv, HIP_INDEX_REG(tc_port),
- HIP_INDEX_VAL(tc_port, 0x2));
- if (intel_de_wait_for_set(dev_priv, DKL_CMN_UC_DW_27(tc_port),
- DKL_CMN_UC_DW27_UC_HEALTH, 1))
+ if (wait_for(intel_dkl_phy_read(dev_priv, DKL_CMN_UC_DW_27(tc_port), 2) &
+ DKL_CMN_UC_DW27_UC_HEALTH, 1))
drm_warn(&dev_priv->drm,
"Timeout waiting TC uC health\n");
}
diff --git a/drivers/gpu/drm/i915/display/intel_dkl_phy.c b/drivers/gpu/drm/i915/display/intel_dkl_phy.c
new file mode 100644
index 000000000000..710b030c7ed5
--- /dev/null
+++ b/drivers/gpu/drm/i915/display/intel_dkl_phy.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2022 Intel Corporation
+ */
+
+#include "i915_drv.h"
+#include "i915_reg.h"
+
+#include "intel_de.h"
+#include "intel_display.h"
+#include "intel_dkl_phy.h"
+
+static void
+dkl_phy_set_hip_idx(struct drm_i915_private *i915, i915_reg_t reg, int idx)
+{
+ enum tc_port tc_port = DKL_REG_TC_PORT(reg);
+
+ drm_WARN_ON(&i915->drm, tc_port < TC_PORT_1 || tc_port >= I915_MAX_TC_PORTS);
+
+ intel_de_write(i915,
+ HIP_INDEX_REG(tc_port),
+ HIP_INDEX_VAL(tc_port, idx));
+}
+
+/**
+ * intel_dkl_phy_read - read a Dekel PHY register
+ * @i915: i915 device instance
+ * @reg: Dekel PHY register
+ * @ln: lane instance of @reg
+ *
+ * Read the @reg Dekel PHY register.
+ *
+ * Returns the read value.
+ */
+u32
+intel_dkl_phy_read(struct drm_i915_private *i915, i915_reg_t reg, int ln)
+{
+ u32 val;
+
+ spin_lock(&i915->display.dkl.phy_lock);
+
+ dkl_phy_set_hip_idx(i915, reg, ln);
+ val = intel_de_read(i915, reg);
+
+ spin_unlock(&i915->display.dkl.phy_lock);
+
+ return val;
+}
+
+/**
+ * intel_dkl_phy_write - write a Dekel PHY register
+ * @i915: i915 device instance
+ * @reg: Dekel PHY register
+ * @ln: lane instance of @reg
+ * @val: value to write
+ *
+ * Write @val to the @reg Dekel PHY register.
+ */
+void
+intel_dkl_phy_write(struct drm_i915_private *i915, i915_reg_t reg, int ln, u32 val)
+{
+ spin_lock(&i915->display.dkl.phy_lock);
+
+ dkl_phy_set_hip_idx(i915, reg, ln);
+ intel_de_write(i915, reg, val);
+
+ spin_unlock(&i915->display.dkl.phy_lock);
+}
+
+/**
+ * intel_dkl_phy_rmw - read-modify-write a Dekel PHY register
+ * @i915: i915 device instance
+ * @reg: Dekel PHY register
+ * @ln: lane instance of @reg
+ * @clear: mask to clear
+ * @set: mask to set
+ *
+ * Read the @reg Dekel PHY register, clearing then setting the @clear/@set bits in it, and writing
+ * this value back to the register if the value differs from the read one.
+ */
+void
+intel_dkl_phy_rmw(struct drm_i915_private *i915, i915_reg_t reg, int ln, u32 clear, u32 set)
+{
+ spin_lock(&i915->display.dkl.phy_lock);
+
+ dkl_phy_set_hip_idx(i915, reg, ln);
+ intel_de_rmw(i915, reg, clear, set);
+
+ spin_unlock(&i915->display.dkl.phy_lock);
+}
+
+/**
+ * intel_dkl_phy_posting_read - do a posting read from a Dekel PHY register
+ * @i915: i915 device instance
+ * @reg: Dekel PHY register
+ * @ln: lane instance of @reg
+ *
+ * Read the @reg Dekel PHY register without returning the read value.
+ */
+void
+intel_dkl_phy_posting_read(struct drm_i915_private *i915, i915_reg_t reg, int ln)
+{
+ spin_lock(&i915->display.dkl.phy_lock);
+
+ dkl_phy_set_hip_idx(i915, reg, ln);
+ intel_de_posting_read(i915, reg);
+
+ spin_unlock(&i915->display.dkl.phy_lock);
+}
diff --git a/drivers/gpu/drm/i915/display/intel_dkl_phy.h b/drivers/gpu/drm/i915/display/intel_dkl_phy.h
new file mode 100644
index 000000000000..260ad121a0b1
--- /dev/null
+++ b/drivers/gpu/drm/i915/display/intel_dkl_phy.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2022 Intel Corporation
+ */
+
+#ifndef __INTEL_DKL_PHY_H__
+#define __INTEL_DKL_PHY_H__
+
+#include <linux/types.h>
+
+#include "i915_reg_defs.h"
+
+struct drm_i915_private;
+
+u32
+intel_dkl_phy_read(struct drm_i915_private *i915, i915_reg_t reg, int ln);
+void
+intel_dkl_phy_write(struct drm_i915_private *i915, i915_reg_t reg, int ln, u32 val);
+void
+intel_dkl_phy_rmw(struct drm_i915_private *i915, i915_reg_t reg, int ln, u32 clear, u32 set);
+void
+intel_dkl_phy_posting_read(struct drm_i915_private *i915, i915_reg_t reg, int ln);
+
+#endif /* __INTEL_DKL_PHY_H__ */
diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c
index 47419d162f30..2b5bc95a8b0d 100644
--- a/drivers/gpu/drm/i915/display/intel_dp.c
+++ b/drivers/gpu/drm/i915/display/intel_dp.c
@@ -5276,7 +5276,7 @@ static bool intel_edp_init_connector(struct intel_dp *intel_dp,
encoder->devdata, IS_ERR(edid) ? NULL : edid);
intel_panel_add_edid_fixed_modes(intel_connector,
- intel_connector->panel.vbt.drrs_type != DRRS_TYPE_NONE,
+ intel_connector->panel.vbt.drrs_type != DRRS_TYPE_NONE ||
intel_vrr_is_capable(intel_connector));
/* MSO requires information from the EDID */
diff --git a/drivers/gpu/drm/i915/display/intel_dpll_mgr.c b/drivers/gpu/drm/i915/display/intel_dpll_mgr.c
index e5fb66a5dd02..64dd603dc69a 100644
--- a/drivers/gpu/drm/i915/display/intel_dpll_mgr.c
+++ b/drivers/gpu/drm/i915/display/intel_dpll_mgr.c
@@ -25,6 +25,7 @@
#include "intel_de.h"
#include "intel_display_types.h"
+#include "intel_dkl_phy.h"
#include "intel_dpio_phy.h"
#include "intel_dpll.h"
#include "intel_dpll_mgr.h"
@@ -3508,15 +3509,12 @@ static bool dkl_pll_get_hw_state(struct drm_i915_private *dev_priv,
* All registers read here have the same HIP_INDEX_REG even though
* they are on different building blocks
*/
- intel_de_write(dev_priv, HIP_INDEX_REG(tc_port),
- HIP_INDEX_VAL(tc_port, 0x2));
-
- hw_state->mg_refclkin_ctl = intel_de_read(dev_priv,
- DKL_REFCLKIN_CTL(tc_port));
+ hw_state->mg_refclkin_ctl = intel_dkl_phy_read(dev_priv,
+ DKL_REFCLKIN_CTL(tc_port), 2);
hw_state->mg_refclkin_ctl &= MG_REFCLKIN_CTL_OD_2_MUX_MASK;
hw_state->mg_clktop2_hsclkctl =
- intel_de_read(dev_priv, DKL_CLKTOP2_HSCLKCTL(tc_port));
+ intel_dkl_phy_read(dev_priv, DKL_CLKTOP2_HSCLKCTL(tc_port), 2);
hw_state->mg_clktop2_hsclkctl &=
MG_CLKTOP2_HSCLKCTL_TLINEDRV_CLKSEL_MASK |
MG_CLKTOP2_HSCLKCTL_CORE_INPUTSEL_MASK |
@@ -3524,32 +3522,32 @@ static bool dkl_pll_get_hw_state(struct drm_i915_private *dev_priv,
MG_CLKTOP2_HSCLKCTL_DSDIV_RATIO_MASK;
hw_state->mg_clktop2_coreclkctl1 =
- intel_de_read(dev_priv, DKL_CLKTOP2_CORECLKCTL1(tc_port));
+ intel_dkl_phy_read(dev_priv, DKL_CLKTOP2_CORECLKCTL1(tc_port), 2);
hw_state->mg_clktop2_coreclkctl1 &=
MG_CLKTOP2_CORECLKCTL1_A_DIVRATIO_MASK;
- hw_state->mg_pll_div0 = intel_de_read(dev_priv, DKL_PLL_DIV0(tc_port));
+ hw_state->mg_pll_div0 = intel_dkl_phy_read(dev_priv, DKL_PLL_DIV0(tc_port), 2);
val = DKL_PLL_DIV0_MASK;
if (dev_priv->display.vbt.override_afc_startup)
val |= DKL_PLL_DIV0_AFC_STARTUP_MASK;
hw_state->mg_pll_div0 &= val;
- hw_state->mg_pll_div1 = intel_de_read(dev_priv, DKL_PLL_DIV1(tc_port));
+ hw_state->mg_pll_div1 = intel_dkl_phy_read(dev_priv, DKL_PLL_DIV1(tc_port), 2);
hw_state->mg_pll_div1 &= (DKL_PLL_DIV1_IREF_TRIM_MASK |
DKL_PLL_DIV1_TDC_TARGET_CNT_MASK);
- hw_state->mg_pll_ssc = intel_de_read(dev_priv, DKL_PLL_SSC(tc_port));
+ hw_state->mg_pll_ssc = intel_dkl_phy_read(dev_priv, DKL_PLL_SSC(tc_port), 2);
hw_state->mg_pll_ssc &= (DKL_PLL_SSC_IREF_NDIV_RATIO_MASK |
DKL_PLL_SSC_STEP_LEN_MASK |
DKL_PLL_SSC_STEP_NUM_MASK |
DKL_PLL_SSC_EN);
- hw_state->mg_pll_bias = intel_de_read(dev_priv, DKL_PLL_BIAS(tc_port));
+ hw_state->mg_pll_bias = intel_dkl_phy_read(dev_priv, DKL_PLL_BIAS(tc_port), 2);
hw_state->mg_pll_bias &= (DKL_PLL_BIAS_FRAC_EN_H |
DKL_PLL_BIAS_FBDIV_FRAC_MASK);
hw_state->mg_pll_tdc_coldst_bias =
- intel_de_read(dev_priv, DKL_PLL_TDC_COLDST_BIAS(tc_port));
+ intel_dkl_phy_read(dev_priv, DKL_PLL_TDC_COLDST_BIAS(tc_port), 2);
hw_state->mg_pll_tdc_coldst_bias &= (DKL_PLL_TDC_SSC_STEP_SIZE_MASK |
DKL_PLL_TDC_FEED_FWD_GAIN_MASK);
@@ -3737,61 +3735,58 @@ static void dkl_pll_write(struct drm_i915_private *dev_priv,
* All registers programmed here have the same HIP_INDEX_REG even
* though on different building block
*/
- intel_de_write(dev_priv, HIP_INDEX_REG(tc_port),
- HIP_INDEX_VAL(tc_port, 0x2));
-
/* All the registers are RMW */
- val = intel_de_read(dev_priv, DKL_REFCLKIN_CTL(tc_port));
+ val = intel_dkl_phy_read(dev_priv, DKL_REFCLKIN_CTL(tc_port), 2);
val &= ~MG_REFCLKIN_CTL_OD_2_MUX_MASK;
val |= hw_state->mg_refclkin_ctl;
- intel_de_write(dev_priv, DKL_REFCLKIN_CTL(tc_port), val);
+ intel_dkl_phy_write(dev_priv, DKL_REFCLKIN_CTL(tc_port), 2, val);
- val = intel_de_read(dev_priv, DKL_CLKTOP2_CORECLKCTL1(tc_port));
+ val = intel_dkl_phy_read(dev_priv, DKL_CLKTOP2_CORECLKCTL1(tc_port), 2);
val &= ~MG_CLKTOP2_CORECLKCTL1_A_DIVRATIO_MASK;
val |= hw_state->mg_clktop2_coreclkctl1;
- intel_de_write(dev_priv, DKL_CLKTOP2_CORECLKCTL1(tc_port), val);
+ intel_dkl_phy_write(dev_priv, DKL_CLKTOP2_CORECLKCTL1(tc_port), 2, val);
- val = intel_de_read(dev_priv, DKL_CLKTOP2_HSCLKCTL(tc_port));
+ val = intel_dkl_phy_read(dev_priv, DKL_CLKTOP2_HSCLKCTL(tc_port), 2);
val &= ~(MG_CLKTOP2_HSCLKCTL_TLINEDRV_CLKSEL_MASK |
MG_CLKTOP2_HSCLKCTL_CORE_INPUTSEL_MASK |
MG_CLKTOP2_HSCLKCTL_HSDIV_RATIO_MASK |
MG_CLKTOP2_HSCLKCTL_DSDIV_RATIO_MASK);
val |= hw_state->mg_clktop2_hsclkctl;
- intel_de_write(dev_priv, DKL_CLKTOP2_HSCLKCTL(tc_port), val);
+ intel_dkl_phy_write(dev_priv, DKL_CLKTOP2_HSCLKCTL(tc_port), 2, val);
val = DKL_PLL_DIV0_MASK;
if (dev_priv->display.vbt.override_afc_startup)
val |= DKL_PLL_DIV0_AFC_STARTUP_MASK;
- intel_de_rmw(dev_priv, DKL_PLL_DIV0(tc_port), val,
- hw_state->mg_pll_div0);
+ intel_dkl_phy_rmw(dev_priv, DKL_PLL_DIV0(tc_port), 2, val,
+ hw_state->mg_pll_div0);
- val = intel_de_read(dev_priv, DKL_PLL_DIV1(tc_port));
+ val = intel_dkl_phy_read(dev_priv, DKL_PLL_DIV1(tc_port), 2);
val &= ~(DKL_PLL_DIV1_IREF_TRIM_MASK |
DKL_PLL_DIV1_TDC_TARGET_CNT_MASK);
val |= hw_state->mg_pll_div1;
- intel_de_write(dev_priv, DKL_PLL_DIV1(tc_port), val);
+ intel_dkl_phy_write(dev_priv, DKL_PLL_DIV1(tc_port), 2, val);
- val = intel_de_read(dev_priv, DKL_PLL_SSC(tc_port));
+ val = intel_dkl_phy_read(dev_priv, DKL_PLL_SSC(tc_port), 2);
val &= ~(DKL_PLL_SSC_IREF_NDIV_RATIO_MASK |
DKL_PLL_SSC_STEP_LEN_MASK |
DKL_PLL_SSC_STEP_NUM_MASK |
DKL_PLL_SSC_EN);
val |= hw_state->mg_pll_ssc;
- intel_de_write(dev_priv, DKL_PLL_SSC(tc_port), val);
+ intel_dkl_phy_write(dev_priv, DKL_PLL_SSC(tc_port), 2, val);
- val = intel_de_read(dev_priv, DKL_PLL_BIAS(tc_port));
+ val = intel_dkl_phy_read(dev_priv, DKL_PLL_BIAS(tc_port), 2);
val &= ~(DKL_PLL_BIAS_FRAC_EN_H |
DKL_PLL_BIAS_FBDIV_FRAC_MASK);
val |= hw_state->mg_pll_bias;
- intel_de_write(dev_priv, DKL_PLL_BIAS(tc_port), val);
+ intel_dkl_phy_write(dev_priv, DKL_PLL_BIAS(tc_port), 2, val);
- val = intel_de_read(dev_priv, DKL_PLL_TDC_COLDST_BIAS(tc_port));
+ val = intel_dkl_phy_read(dev_priv, DKL_PLL_TDC_COLDST_BIAS(tc_port), 2);
val &= ~(DKL_PLL_TDC_SSC_STEP_SIZE_MASK |
DKL_PLL_TDC_FEED_FWD_GAIN_MASK);
val |= hw_state->mg_pll_tdc_coldst_bias;
- intel_de_write(dev_priv, DKL_PLL_TDC_COLDST_BIAS(tc_port), val);
+ intel_dkl_phy_write(dev_priv, DKL_PLL_TDC_COLDST_BIAS(tc_port), 2, val);
- intel_de_posting_read(dev_priv, DKL_PLL_TDC_COLDST_BIAS(tc_port));
+ intel_dkl_phy_posting_read(dev_priv, DKL_PLL_TDC_COLDST_BIAS(tc_port), 2);
}
static void icl_pll_power_enable(struct drm_i915_private *dev_priv,
diff --git a/drivers/gpu/drm/i915/display/intel_lvds.c b/drivers/gpu/drm/i915/display/intel_lvds.c
index 9aa38e8141b5..e5352239b2a2 100644
--- a/drivers/gpu/drm/i915/display/intel_lvds.c
+++ b/drivers/gpu/drm/i915/display/intel_lvds.c
@@ -972,8 +972,7 @@ void intel_lvds_init(struct drm_i915_private *dev_priv)
/* Try EDID first */
intel_panel_add_edid_fixed_modes(intel_connector,
- intel_connector->panel.vbt.drrs_type != DRRS_TYPE_NONE,
- false);
+ intel_connector->panel.vbt.drrs_type != DRRS_TYPE_NONE);
/* Failed to get EDID, what about VBT? */
if (!intel_panel_preferred_fixed_mode(intel_connector))
diff --git a/drivers/gpu/drm/i915/display/intel_panel.c b/drivers/gpu/drm/i915/display/intel_panel.c
index a3a3f9fe4342..41cec9dc4223 100644
--- a/drivers/gpu/drm/i915/display/intel_panel.c
+++ b/drivers/gpu/drm/i915/display/intel_panel.c
@@ -254,10 +254,10 @@ static void intel_panel_destroy_probed_modes(struct intel_connector *connector)
}
void intel_panel_add_edid_fixed_modes(struct intel_connector *connector,
- bool has_drrs, bool has_vrr)
+ bool use_alt_fixed_modes)
{
intel_panel_add_edid_preferred_mode(connector);
- if (intel_panel_preferred_fixed_mode(connector) && (has_drrs || has_vrr))
+ if (intel_panel_preferred_fixed_mode(connector) && use_alt_fixed_modes)
intel_panel_add_edid_alt_fixed_modes(connector);
intel_panel_destroy_probed_modes(connector);
}
diff --git a/drivers/gpu/drm/i915/display/intel_panel.h b/drivers/gpu/drm/i915/display/intel_panel.h
index eff3ffd3d082..5c5b5b7f95b6 100644
--- a/drivers/gpu/drm/i915/display/intel_panel.h
+++ b/drivers/gpu/drm/i915/display/intel_panel.h
@@ -44,7 +44,7 @@ int intel_panel_fitting(struct intel_crtc_state *crtc_state,
int intel_panel_compute_config(struct intel_connector *connector,
struct drm_display_mode *adjusted_mode);
void intel_panel_add_edid_fixed_modes(struct intel_connector *connector,
- bool has_drrs, bool has_vrr);
+ bool use_alt_fixed_modes);
void intel_panel_add_vbt_lfp_fixed_mode(struct intel_connector *connector);
void intel_panel_add_vbt_sdvo_fixed_mode(struct intel_connector *connector);
void intel_panel_add_encoder_fixed_mode(struct intel_connector *connector,
diff --git a/drivers/gpu/drm/i915/display/intel_sdvo.c b/drivers/gpu/drm/i915/display/intel_sdvo.c
index f5b744bef18f..774c1dc31a52 100644
--- a/drivers/gpu/drm/i915/display/intel_sdvo.c
+++ b/drivers/gpu/drm/i915/display/intel_sdvo.c
@@ -2747,13 +2747,10 @@ intel_sdvo_dvi_init(struct intel_sdvo *intel_sdvo, int device)
if (!intel_sdvo_connector)
return false;
- if (device == 0) {
- intel_sdvo->controlled_output |= SDVO_OUTPUT_TMDS0;
+ if (device == 0)
intel_sdvo_connector->output_flag = SDVO_OUTPUT_TMDS0;
- } else if (device == 1) {
- intel_sdvo->controlled_output |= SDVO_OUTPUT_TMDS1;
+ else if (device == 1)
intel_sdvo_connector->output_flag = SDVO_OUTPUT_TMDS1;
- }
intel_connector = &intel_sdvo_connector->base;
connector = &intel_connector->base;
@@ -2808,7 +2805,6 @@ intel_sdvo_tv_init(struct intel_sdvo *intel_sdvo, int type)
encoder->encoder_type = DRM_MODE_ENCODER_TVDAC;
connector->connector_type = DRM_MODE_CONNECTOR_SVIDEO;
- intel_sdvo->controlled_output |= type;
intel_sdvo_connector->output_flag = type;
if (intel_sdvo_connector_init(intel_sdvo_connector, intel_sdvo) < 0) {
@@ -2849,13 +2845,10 @@ intel_sdvo_analog_init(struct intel_sdvo *intel_sdvo, int device)
encoder->encoder_type = DRM_MODE_ENCODER_DAC;
connector->connector_type = DRM_MODE_CONNECTOR_VGA;
- if (device == 0) {
- intel_sdvo->controlled_output |= SDVO_OUTPUT_RGB0;
+ if (device == 0)
intel_sdvo_connector->output_flag = SDVO_OUTPUT_RGB0;
- } else if (device == 1) {
- intel_sdvo->controlled_output |= SDVO_OUTPUT_RGB1;
+ else if (device == 1)
intel_sdvo_connector->output_flag = SDVO_OUTPUT_RGB1;
- }
if (intel_sdvo_connector_init(intel_sdvo_connector, intel_sdvo) < 0) {
kfree(intel_sdvo_connector);
@@ -2885,13 +2878,10 @@ intel_sdvo_lvds_init(struct intel_sdvo *intel_sdvo, int device)
encoder->encoder_type = DRM_MODE_ENCODER_LVDS;
connector->connector_type = DRM_MODE_CONNECTOR_LVDS;
- if (device == 0) {
- intel_sdvo->controlled_output |= SDVO_OUTPUT_LVDS0;
+ if (device == 0)
intel_sdvo_connector->output_flag = SDVO_OUTPUT_LVDS0;
- } else if (device == 1) {
- intel_sdvo->controlled_output |= SDVO_OUTPUT_LVDS1;
+ else if (device == 1)
intel_sdvo_connector->output_flag = SDVO_OUTPUT_LVDS1;
- }
if (intel_sdvo_connector_init(intel_sdvo_connector, intel_sdvo) < 0) {
kfree(intel_sdvo_connector);
@@ -2910,8 +2900,12 @@ intel_sdvo_lvds_init(struct intel_sdvo *intel_sdvo, int device)
intel_panel_add_vbt_sdvo_fixed_mode(intel_connector);
if (!intel_panel_preferred_fixed_mode(intel_connector)) {
+ mutex_lock(&i915->drm.mode_config.mutex);
+
intel_ddc_get_modes(connector, &intel_sdvo->ddc);
- intel_panel_add_edid_fixed_modes(intel_connector, false, false);
+ intel_panel_add_edid_fixed_modes(intel_connector, false);
+
+ mutex_unlock(&i915->drm.mode_config.mutex);
}
intel_panel_init(intel_connector);
@@ -2926,16 +2920,39 @@ err:
return false;
}
+static u16 intel_sdvo_filter_output_flags(u16 flags)
+{
+ flags &= SDVO_OUTPUT_MASK;
+
+ /* SDVO requires XXX1 function may not exist unless it has XXX0 function.*/
+ if (!(flags & SDVO_OUTPUT_TMDS0))
+ flags &= ~SDVO_OUTPUT_TMDS1;
+
+ if (!(flags & SDVO_OUTPUT_RGB0))
+ flags &= ~SDVO_OUTPUT_RGB1;
+
+ if (!(flags & SDVO_OUTPUT_LVDS0))
+ flags &= ~SDVO_OUTPUT_LVDS1;
+
+ return flags;
+}
+
static bool
intel_sdvo_output_setup(struct intel_sdvo *intel_sdvo, u16 flags)
{
- /* SDVO requires XXX1 function may not exist unless it has XXX0 function.*/
+ struct drm_i915_private *i915 = to_i915(intel_sdvo->base.base.dev);
+
+ flags = intel_sdvo_filter_output_flags(flags);
+
+ intel_sdvo->controlled_output = flags;
+
+ intel_sdvo_select_ddc_bus(i915, intel_sdvo);
if (flags & SDVO_OUTPUT_TMDS0)
if (!intel_sdvo_dvi_init(intel_sdvo, 0))
return false;
- if ((flags & SDVO_TMDS_MASK) == SDVO_TMDS_MASK)
+ if (flags & SDVO_OUTPUT_TMDS1)
if (!intel_sdvo_dvi_init(intel_sdvo, 1))
return false;
@@ -2956,7 +2973,7 @@ intel_sdvo_output_setup(struct intel_sdvo *intel_sdvo, u16 flags)
if (!intel_sdvo_analog_init(intel_sdvo, 0))
return false;
- if ((flags & SDVO_RGB_MASK) == SDVO_RGB_MASK)
+ if (flags & SDVO_OUTPUT_RGB1)
if (!intel_sdvo_analog_init(intel_sdvo, 1))
return false;
@@ -2964,14 +2981,13 @@ intel_sdvo_output_setup(struct intel_sdvo *intel_sdvo, u16 flags)
if (!intel_sdvo_lvds_init(intel_sdvo, 0))
return false;
- if ((flags & SDVO_LVDS_MASK) == SDVO_LVDS_MASK)
+ if (flags & SDVO_OUTPUT_LVDS1)
if (!intel_sdvo_lvds_init(intel_sdvo, 1))
return false;
- if ((flags & SDVO_OUTPUT_MASK) == 0) {
+ if (flags == 0) {
unsigned char bytes[2];
- intel_sdvo->controlled_output = 0;
memcpy(bytes, &intel_sdvo->caps.output_flags, 2);
DRM_DEBUG_KMS("%s: Unknown SDVO output type (0x%02x%02x)\n",
SDVO_NAME(intel_sdvo),
@@ -3383,8 +3399,6 @@ bool intel_sdvo_init(struct drm_i915_private *dev_priv,
*/
intel_sdvo->base.cloneable = 0;
- intel_sdvo_select_ddc_bus(dev_priv, intel_sdvo);
-
/* Set the input timing to the screen. Assume always input 0. */
if (!intel_sdvo_set_target_input(intel_sdvo))
goto err_output;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
index c698f95af15f..629acb403a2c 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
@@ -6,7 +6,6 @@
#include <linux/scatterlist.h>
#include <linux/slab.h>
-#include <linux/swiotlb.h>
#include "i915_drv.h"
#include "i915_gem.h"
@@ -38,22 +37,12 @@ static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj)
struct scatterlist *sg;
unsigned int sg_page_sizes;
unsigned int npages;
- int max_order;
+ int max_order = MAX_ORDER;
+ unsigned int max_segment;
gfp_t gfp;
- max_order = MAX_ORDER;
-#ifdef CONFIG_SWIOTLB
- if (is_swiotlb_active(obj->base.dev->dev)) {
- unsigned int max_segment;
-
- max_segment = swiotlb_max_segment();
- if (max_segment) {
- max_segment = max_t(unsigned int, max_segment,
- PAGE_SIZE) >> PAGE_SHIFT;
- max_order = min(max_order, ilog2(max_segment));
- }
- }
-#endif
+ max_segment = i915_sg_segment_size(i915->drm.dev) >> PAGE_SHIFT;
+ max_order = min(max_order, get_order(max_segment));
gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_RECLAIMABLE;
if (IS_I965GM(i915) || IS_I965G(i915)) {
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
index f42ca1179f37..11125c32dd35 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
@@ -194,7 +194,7 @@ static int shmem_get_pages(struct drm_i915_gem_object *obj)
struct intel_memory_region *mem = obj->mm.region;
struct address_space *mapping = obj->base.filp->f_mapping;
const unsigned long page_count = obj->base.size / PAGE_SIZE;
- unsigned int max_segment = i915_sg_segment_size();
+ unsigned int max_segment = i915_sg_segment_size(i915->drm.dev);
struct sg_table *st;
struct sgt_iter sgt_iter;
struct page *page;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index 4f861782c3e8..a4aa9500fa17 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -189,7 +189,7 @@ static int i915_ttm_tt_shmem_populate(struct ttm_device *bdev,
struct drm_i915_private *i915 = container_of(bdev, typeof(*i915), bdev);
struct intel_memory_region *mr = i915->mm.regions[INTEL_MEMORY_SYSTEM];
struct i915_ttm_tt *i915_tt = container_of(ttm, typeof(*i915_tt), ttm);
- const unsigned int max_segment = i915_sg_segment_size();
+ const unsigned int max_segment = i915_sg_segment_size(i915->drm.dev);
const size_t size = (size_t)ttm->num_pages << PAGE_SHIFT;
struct file *filp = i915_tt->filp;
struct sgt_iter sgt_iter;
@@ -538,7 +538,7 @@ static struct i915_refct_sgt *i915_ttm_tt_get_st(struct ttm_tt *ttm)
ret = sg_alloc_table_from_pages_segment(st,
ttm->pages, ttm->num_pages,
0, (unsigned long)ttm->num_pages << PAGE_SHIFT,
- i915_sg_segment_size(), GFP_KERNEL);
+ i915_sg_segment_size(i915_tt->dev), GFP_KERNEL);
if (ret) {
st->sgl = NULL;
return ERR_PTR(ret);
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
index d4398948f016..f34e01a7fefb 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -129,7 +129,7 @@ static void i915_gem_object_userptr_drop_ref(struct drm_i915_gem_object *obj)
static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
{
const unsigned long num_pages = obj->base.size >> PAGE_SHIFT;
- unsigned int max_segment = i915_sg_segment_size();
+ unsigned int max_segment = i915_sg_segment_size(obj->base.dev->dev);
struct sg_table *st;
unsigned int sg_page_sizes;
struct page **pvec;
diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c
index c459eb362c47..f2a15d8155f4 100644
--- a/drivers/gpu/drm/i915/i915_driver.c
+++ b/drivers/gpu/drm/i915/i915_driver.c
@@ -353,6 +353,7 @@ static int i915_driver_early_probe(struct drm_i915_private *dev_priv)
mutex_init(&dev_priv->display.wm.wm_mutex);
mutex_init(&dev_priv->display.pps.mutex);
mutex_init(&dev_priv->display.hdcp.comp_mutex);
+ spin_lock_init(&dev_priv->display.dkl.phy_lock);
i915_memcpy_init_early(dev_priv);
intel_runtime_pm_init_early(&dev_priv->runtime_pm);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 0b287a59dc2f..da35bb2db26b 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -7420,6 +7420,9 @@ enum skl_power_gate {
#define _DKL_PHY5_BASE 0x16C000
#define _DKL_PHY6_BASE 0x16D000
+#define DKL_REG_TC_PORT(__reg) \
+ (TC_PORT_1 + ((__reg).reg - _DKL_PHY1_BASE) / (_DKL_PHY2_BASE - _DKL_PHY1_BASE))
+
/* DEKEL PHY MMIO Address = Phy base + (internal address & ~index_mask) */
#define _DKL_PCS_DW5 0x14
#define DKL_PCS_DW5(tc_port) _MMIO(_PORT(tc_port, _DKL_PHY1_BASE, \
diff --git a/drivers/gpu/drm/i915/i915_scatterlist.h b/drivers/gpu/drm/i915/i915_scatterlist.h
index 9ddb3e743a3e..b0a1db44f895 100644
--- a/drivers/gpu/drm/i915/i915_scatterlist.h
+++ b/drivers/gpu/drm/i915/i915_scatterlist.h
@@ -9,7 +9,8 @@
#include <linux/pfn.h>
#include <linux/scatterlist.h>
-#include <linux/swiotlb.h>
+#include <linux/dma-mapping.h>
+#include <xen/xen.h>
#include "i915_gem.h"
@@ -127,19 +128,26 @@ static inline unsigned int i915_sg_dma_sizes(struct scatterlist *sg)
return page_sizes;
}
-static inline unsigned int i915_sg_segment_size(void)
+static inline unsigned int i915_sg_segment_size(struct device *dev)
{
- unsigned int size = swiotlb_max_segment();
-
- if (size == 0)
- size = UINT_MAX;
-
- size = rounddown(size, PAGE_SIZE);
- /* swiotlb_max_segment_size can return 1 byte when it means one page. */
- if (size < PAGE_SIZE)
- size = PAGE_SIZE;
-
- return size;
+ size_t max = min_t(size_t, UINT_MAX, dma_max_mapping_size(dev));
+
+ /*
+ * For Xen PV guests pages aren't contiguous in DMA (machine) address
+ * space. The DMA API takes care of that both in dma_alloc_* (by
+ * calling into the hypervisor to make the pages contiguous) and in
+ * dma_map_* (by bounce buffering). But i915 abuses ignores the
+ * coherency aspects of the DMA API and thus can't cope with bounce
+ * buffering actually happening, so add a hack here to force small
+ * allocations and mappings when running in PV mode on Xen.
+ *
+ * Note this will still break if bounce buffering is required for other
+ * reasons, like confidential computing hypervisors or PCIe root ports
+ * with addressing limitations.
+ */
+ if (xen_pv_domain())
+ max = PAGE_SIZE;
+ return round_down(max, PAGE_SIZE);
}
bool i915_sg_trim(struct sg_table *orig_st);
diff --git a/drivers/gpu/drm/imx/Kconfig b/drivers/gpu/drm/imx/Kconfig
index 975de4ff7313..fd5b2471fdf0 100644
--- a/drivers/gpu/drm/imx/Kconfig
+++ b/drivers/gpu/drm/imx/Kconfig
@@ -4,7 +4,6 @@ config DRM_IMX
select DRM_KMS_HELPER
select VIDEOMODE_HELPERS
select DRM_GEM_DMA_HELPER
- select DRM_KMS_HELPER
depends on DRM && (ARCH_MXC || ARCH_MULTIPLATFORM || COMPILE_TEST)
depends on IMX_IPUV3_CORE
help
diff --git a/drivers/gpu/drm/imx/imx-tve.c b/drivers/gpu/drm/imx/imx-tve.c
index 6b34fac3f73a..ab4d1c878fda 100644
--- a/drivers/gpu/drm/imx/imx-tve.c
+++ b/drivers/gpu/drm/imx/imx-tve.c
@@ -218,8 +218,9 @@ static int imx_tve_connector_get_modes(struct drm_connector *connector)
return ret;
}
-static int imx_tve_connector_mode_valid(struct drm_connector *connector,
- struct drm_display_mode *mode)
+static enum drm_mode_status
+imx_tve_connector_mode_valid(struct drm_connector *connector,
+ struct drm_display_mode *mode)
{
struct imx_tve *tve = con_to_tve(connector);
unsigned long rate;
diff --git a/drivers/gpu/drm/rockchip/dw-mipi-dsi-rockchip.c b/drivers/gpu/drm/rockchip/dw-mipi-dsi-rockchip.c
index bf6948125b84..f4df9820b295 100644
--- a/drivers/gpu/drm/rockchip/dw-mipi-dsi-rockchip.c
+++ b/drivers/gpu/drm/rockchip/dw-mipi-dsi-rockchip.c
@@ -752,7 +752,7 @@ static void dw_mipi_dsi_rockchip_config(struct dw_mipi_dsi_rockchip *dsi)
static void dw_mipi_dsi_rockchip_set_lcdsel(struct dw_mipi_dsi_rockchip *dsi,
int mux)
{
- if (dsi->cdata->lcdsel_grf_reg < 0)
+ if (dsi->cdata->lcdsel_grf_reg)
regmap_write(dsi->grf_regmap, dsi->cdata->lcdsel_grf_reg,
mux ? dsi->cdata->lcdsel_lit : dsi->cdata->lcdsel_big);
}
@@ -1051,23 +1051,31 @@ static int dw_mipi_dsi_rockchip_host_attach(void *priv_data,
if (ret) {
DRM_DEV_ERROR(dsi->dev, "Failed to register component: %d\n",
ret);
- return ret;
+ goto out;
}
second = dw_mipi_dsi_rockchip_find_second(dsi);
- if (IS_ERR(second))
- return PTR_ERR(second);
+ if (IS_ERR(second)) {
+ ret = PTR_ERR(second);
+ goto out;
+ }
if (second) {
ret = component_add(second, &dw_mipi_dsi_rockchip_ops);
if (ret) {
DRM_DEV_ERROR(second,
"Failed to register component: %d\n",
ret);
- return ret;
+ goto out;
}
}
return 0;
+
+out:
+ mutex_lock(&dsi->usage_mutex);
+ dsi->usage_mode = DW_DSI_USAGE_IDLE;
+ mutex_unlock(&dsi->usage_mutex);
+ return ret;
}
static int dw_mipi_dsi_rockchip_host_detach(void *priv_data,
@@ -1635,7 +1643,6 @@ static const struct rockchip_dw_dsi_chip_data rk3399_chip_data[] = {
static const struct rockchip_dw_dsi_chip_data rk3568_chip_data[] = {
{
.reg = 0xfe060000,
- .lcdsel_grf_reg = -1,
.lanecfg1_grf_reg = RK3568_GRF_VO_CON2,
.lanecfg1 = HIWORD_UPDATE(0, RK3568_DSI0_SKEWCALHS |
RK3568_DSI0_FORCETXSTOPMODE |
@@ -1645,7 +1652,6 @@ static const struct rockchip_dw_dsi_chip_data rk3568_chip_data[] = {
},
{
.reg = 0xfe070000,
- .lcdsel_grf_reg = -1,
.lanecfg1_grf_reg = RK3568_GRF_VO_CON3,
.lanecfg1 = HIWORD_UPDATE(0, RK3568_DSI1_SKEWCALHS |
RK3568_DSI1_FORCETXSTOPMODE |
@@ -1681,5 +1687,11 @@ struct platform_driver dw_mipi_dsi_rockchip_driver = {
.of_match_table = dw_mipi_dsi_rockchip_dt_ids,
.pm = &dw_mipi_dsi_rockchip_pm_ops,
.name = "dw-mipi-dsi-rockchip",
+ /*
+ * For dual-DSI display, one DSI pokes at the other DSI's
+ * drvdata in dw_mipi_dsi_rockchip_find_second(). This is not
+ * safe for asynchronous probe.
+ */
+ .probe_type = PROBE_FORCE_SYNCHRONOUS,
},
};
diff --git a/drivers/gpu/drm/rockchip/dw_hdmi-rockchip.c b/drivers/gpu/drm/rockchip/dw_hdmi-rockchip.c
index c14f88893868..2f4b8f64cbad 100644
--- a/drivers/gpu/drm/rockchip/dw_hdmi-rockchip.c
+++ b/drivers/gpu/drm/rockchip/dw_hdmi-rockchip.c
@@ -565,7 +565,8 @@ static int dw_hdmi_rockchip_bind(struct device *dev, struct device *master,
ret = rockchip_hdmi_parse_dt(hdmi);
if (ret) {
- DRM_DEV_ERROR(hdmi->dev, "Unable to parse OF data\n");
+ if (ret != -EPROBE_DEFER)
+ DRM_DEV_ERROR(hdmi->dev, "Unable to parse OF data\n");
return ret;
}
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
index 614e97aaac80..da8a69953706 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
@@ -364,9 +364,12 @@ rockchip_gem_create_with_handle(struct drm_file *file_priv,
{
struct rockchip_gem_object *rk_obj;
struct drm_gem_object *obj;
+ bool is_framebuffer;
int ret;
- rk_obj = rockchip_gem_create_object(drm, size, false);
+ is_framebuffer = drm->fb_helper && file_priv == drm->fb_helper->client.file;
+
+ rk_obj = rockchip_gem_create_object(drm, size, is_framebuffer);
if (IS_ERR(rk_obj))
return ERR_CAST(rk_obj);
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c b/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c
index aac20be5ac08..105a548d0abe 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c
@@ -877,10 +877,14 @@ static void vop2_crtc_atomic_disable(struct drm_crtc *crtc,
{
struct vop2_video_port *vp = to_vop2_video_port(crtc);
struct vop2 *vop2 = vp->vop2;
+ struct drm_crtc_state *old_crtc_state;
int ret;
vop2_lock(vop2);
+ old_crtc_state = drm_atomic_get_old_crtc_state(state, crtc);
+ drm_atomic_helper_disable_planes_on_crtc(old_crtc_state, false);
+
drm_crtc_vblank_off(crtc);
/*
@@ -996,13 +1000,15 @@ static int vop2_plane_atomic_check(struct drm_plane *plane,
static void vop2_plane_atomic_disable(struct drm_plane *plane,
struct drm_atomic_state *state)
{
- struct drm_plane_state *old_pstate = drm_atomic_get_old_plane_state(state, plane);
+ struct drm_plane_state *old_pstate = NULL;
struct vop2_win *win = to_vop2_win(plane);
struct vop2 *vop2 = win->vop2;
drm_dbg(vop2->drm, "%s disable\n", win->data->name);
- if (!old_pstate->crtc)
+ if (state)
+ old_pstate = drm_atomic_get_old_plane_state(state, plane);
+ if (old_pstate && !old_pstate->crtc)
return;
vop2_win_disable(win);
diff --git a/drivers/hwmon/pmbus/pmbus.h b/drivers/hwmon/pmbus/pmbus.h
index 7daaf0caf4d3..10fb17879f8e 100644
--- a/drivers/hwmon/pmbus/pmbus.h
+++ b/drivers/hwmon/pmbus/pmbus.h
@@ -467,7 +467,6 @@ extern const struct regulator_ops pmbus_regulator_ops;
#define PMBUS_REGULATOR_STEP(_name, _id, _voltages, _step) \
[_id] = { \
.name = (_name # _id), \
- .supply_name = "vin", \
.id = (_id), \
.of_match = of_match_ptr(_name # _id), \
.regulators_node = of_match_ptr("regulators"), \
diff --git a/drivers/hwmon/scmi-hwmon.c b/drivers/hwmon/scmi-hwmon.c
index b1329a58ce40..e192f0c67146 100644
--- a/drivers/hwmon/scmi-hwmon.c
+++ b/drivers/hwmon/scmi-hwmon.c
@@ -20,6 +20,11 @@ struct scmi_sensors {
const struct scmi_sensor_info **info[hwmon_max];
};
+struct scmi_thermal_sensor {
+ const struct scmi_protocol_handle *ph;
+ const struct scmi_sensor_info *info;
+};
+
static inline u64 __pow10(u8 x)
{
u64 r = 1;
@@ -64,16 +69,14 @@ static int scmi_hwmon_scale(const struct scmi_sensor_info *sensor, u64 *value)
return 0;
}
-static int scmi_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
- u32 attr, int channel, long *val)
+static int scmi_hwmon_read_scaled_value(const struct scmi_protocol_handle *ph,
+ const struct scmi_sensor_info *sensor,
+ long *val)
{
int ret;
u64 value;
- const struct scmi_sensor_info *sensor;
- struct scmi_sensors *scmi_sensors = dev_get_drvdata(dev);
- sensor = *(scmi_sensors->info[type] + channel);
- ret = sensor_ops->reading_get(scmi_sensors->ph, sensor->id, &value);
+ ret = sensor_ops->reading_get(ph, sensor->id, &value);
if (ret)
return ret;
@@ -84,6 +87,17 @@ static int scmi_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
return ret;
}
+static int scmi_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+ u32 attr, int channel, long *val)
+{
+ const struct scmi_sensor_info *sensor;
+ struct scmi_sensors *scmi_sensors = dev_get_drvdata(dev);
+
+ sensor = *(scmi_sensors->info[type] + channel);
+
+ return scmi_hwmon_read_scaled_value(scmi_sensors->ph, sensor, val);
+}
+
static int
scmi_hwmon_read_string(struct device *dev, enum hwmon_sensor_types type,
u32 attr, int channel, const char **str)
@@ -122,6 +136,25 @@ static struct hwmon_chip_info scmi_chip_info = {
.info = NULL,
};
+static int scmi_hwmon_thermal_get_temp(struct thermal_zone_device *tz,
+ int *temp)
+{
+ int ret;
+ long value;
+ struct scmi_thermal_sensor *th_sensor = tz->devdata;
+
+ ret = scmi_hwmon_read_scaled_value(th_sensor->ph, th_sensor->info,
+ &value);
+ if (!ret)
+ *temp = value;
+
+ return ret;
+}
+
+static const struct thermal_zone_device_ops scmi_hwmon_thermal_ops = {
+ .get_temp = scmi_hwmon_thermal_get_temp,
+};
+
static int scmi_hwmon_add_chan_info(struct hwmon_channel_info *scmi_hwmon_chan,
struct device *dev, int num,
enum hwmon_sensor_types type, u32 config)
@@ -149,7 +182,6 @@ static enum hwmon_sensor_types scmi_types[] = {
};
static u32 hwmon_attributes[hwmon_max] = {
- [hwmon_chip] = HWMON_C_REGISTER_TZ,
[hwmon_temp] = HWMON_T_INPUT | HWMON_T_LABEL,
[hwmon_in] = HWMON_I_INPUT | HWMON_I_LABEL,
[hwmon_curr] = HWMON_C_INPUT | HWMON_C_LABEL,
@@ -157,6 +189,43 @@ static u32 hwmon_attributes[hwmon_max] = {
[hwmon_energy] = HWMON_E_INPUT | HWMON_E_LABEL,
};
+static int scmi_thermal_sensor_register(struct device *dev,
+ const struct scmi_protocol_handle *ph,
+ const struct scmi_sensor_info *sensor)
+{
+ struct scmi_thermal_sensor *th_sensor;
+ struct thermal_zone_device *tzd;
+
+ th_sensor = devm_kzalloc(dev, sizeof(*th_sensor), GFP_KERNEL);
+ if (!th_sensor)
+ return -ENOMEM;
+
+ th_sensor->ph = ph;
+ th_sensor->info = sensor;
+
+ /*
+ * Try to register a temperature sensor with the Thermal Framework:
+ * skip sensors not defined as part of any thermal zone (-ENODEV) but
+ * report any other errors related to misconfigured zones/sensors.
+ */
+ tzd = devm_thermal_of_zone_register(dev, th_sensor->info->id, th_sensor,
+ &scmi_hwmon_thermal_ops);
+ if (IS_ERR(tzd)) {
+ devm_kfree(dev, th_sensor);
+
+ if (PTR_ERR(tzd) != -ENODEV)
+ return PTR_ERR(tzd);
+
+ dev_dbg(dev, "Sensor '%s' not attached to any thermal zone.\n",
+ sensor->name);
+ } else {
+ dev_dbg(dev, "Sensor '%s' attached to thermal zone ID:%d\n",
+ sensor->name, tzd->id);
+ }
+
+ return 0;
+}
+
static int scmi_hwmon_probe(struct scmi_device *sdev)
{
int i, idx;
@@ -164,7 +233,7 @@ static int scmi_hwmon_probe(struct scmi_device *sdev)
enum hwmon_sensor_types type;
struct scmi_sensors *scmi_sensors;
const struct scmi_sensor_info *sensor;
- int nr_count[hwmon_max] = {0}, nr_types = 0;
+ int nr_count[hwmon_max] = {0}, nr_types = 0, nr_count_temp = 0;
const struct hwmon_chip_info *chip_info;
struct device *hwdev, *dev = &sdev->dev;
struct hwmon_channel_info *scmi_hwmon_chan;
@@ -208,10 +277,8 @@ static int scmi_hwmon_probe(struct scmi_device *sdev)
}
}
- if (nr_count[hwmon_temp]) {
- nr_count[hwmon_chip]++;
- nr_types++;
- }
+ if (nr_count[hwmon_temp])
+ nr_count_temp = nr_count[hwmon_temp];
scmi_hwmon_chan = devm_kcalloc(dev, nr_types, sizeof(*scmi_hwmon_chan),
GFP_KERNEL);
@@ -262,8 +329,31 @@ static int scmi_hwmon_probe(struct scmi_device *sdev)
hwdev = devm_hwmon_device_register_with_info(dev, "scmi_sensors",
scmi_sensors, chip_info,
NULL);
+ if (IS_ERR(hwdev))
+ return PTR_ERR(hwdev);
- return PTR_ERR_OR_ZERO(hwdev);
+ for (i = 0; i < nr_count_temp; i++) {
+ int ret;
+
+ sensor = *(scmi_sensors->info[hwmon_temp] + i);
+ if (!sensor)
+ continue;
+
+ /*
+ * Warn on any misconfiguration related to thermal zones but
+ * bail out of probing only on memory errors.
+ */
+ ret = scmi_thermal_sensor_register(dev, ph, sensor);
+ if (ret) {
+ if (ret == -ENOMEM)
+ return ret;
+ dev_warn(dev,
+ "Thermal zone misconfigured for %s. err=%d\n",
+ sensor->name, ret);
+ }
+ }
+
+ return 0;
}
static const struct scmi_device_id scmi_id_table[] = {
diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c
index e06509edc5f3..1fda1eaa6d6a 100644
--- a/drivers/i2c/busses/i2c-i801.c
+++ b/drivers/i2c/busses/i2c-i801.c
@@ -1243,6 +1243,7 @@ static const struct {
*/
{ "Latitude 5480", 0x29 },
{ "Vostro V131", 0x1d },
+ { "Vostro 5568", 0x29 },
};
static void register_dell_lis3lv02d_i2c_device(struct i801_priv *priv)
diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c
index 39cb1b7bb865..809fbd014cd6 100644
--- a/drivers/i2c/busses/i2c-piix4.c
+++ b/drivers/i2c/busses/i2c-piix4.c
@@ -1080,6 +1080,7 @@ static int piix4_probe(struct pci_dev *dev, const struct pci_device_id *id)
"", &piix4_main_adapters[0]);
if (retval < 0)
return retval;
+ piix4_adapter_count = 1;
}
/* Check for auxiliary SMBus on some AMD chipsets */
diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c
index 954022c04cc4..3869c258a529 100644
--- a/drivers/i2c/busses/i2c-tegra.c
+++ b/drivers/i2c/busses/i2c-tegra.c
@@ -284,6 +284,7 @@ struct tegra_i2c_dev {
struct dma_chan *tx_dma_chan;
struct dma_chan *rx_dma_chan;
unsigned int dma_buf_size;
+ struct device *dma_dev;
dma_addr_t dma_phys;
void *dma_buf;
@@ -420,7 +421,7 @@ static int tegra_i2c_dma_submit(struct tegra_i2c_dev *i2c_dev, size_t len)
static void tegra_i2c_release_dma(struct tegra_i2c_dev *i2c_dev)
{
if (i2c_dev->dma_buf) {
- dma_free_coherent(i2c_dev->dev, i2c_dev->dma_buf_size,
+ dma_free_coherent(i2c_dev->dma_dev, i2c_dev->dma_buf_size,
i2c_dev->dma_buf, i2c_dev->dma_phys);
i2c_dev->dma_buf = NULL;
}
@@ -472,10 +473,13 @@ static int tegra_i2c_init_dma(struct tegra_i2c_dev *i2c_dev)
i2c_dev->tx_dma_chan = chan;
+ WARN_ON(i2c_dev->tx_dma_chan->device != i2c_dev->rx_dma_chan->device);
+ i2c_dev->dma_dev = chan->device->dev;
+
i2c_dev->dma_buf_size = i2c_dev->hw->quirks->max_write_len +
I2C_PACKET_HEADER_SIZE;
- dma_buf = dma_alloc_coherent(i2c_dev->dev, i2c_dev->dma_buf_size,
+ dma_buf = dma_alloc_coherent(i2c_dev->dma_dev, i2c_dev->dma_buf_size,
&dma_phys, GFP_KERNEL | __GFP_NOWARN);
if (!dma_buf) {
dev_err(i2c_dev->dev, "failed to allocate DMA buffer\n");
@@ -1272,7 +1276,7 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev,
if (i2c_dev->dma_mode) {
if (i2c_dev->msg_read) {
- dma_sync_single_for_device(i2c_dev->dev,
+ dma_sync_single_for_device(i2c_dev->dma_dev,
i2c_dev->dma_phys,
xfer_size, DMA_FROM_DEVICE);
@@ -1280,7 +1284,7 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev,
if (err)
return err;
} else {
- dma_sync_single_for_cpu(i2c_dev->dev,
+ dma_sync_single_for_cpu(i2c_dev->dma_dev,
i2c_dev->dma_phys,
xfer_size, DMA_TO_DEVICE);
}
@@ -1293,7 +1297,7 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev,
memcpy(i2c_dev->dma_buf + I2C_PACKET_HEADER_SIZE,
msg->buf, msg->len);
- dma_sync_single_for_device(i2c_dev->dev,
+ dma_sync_single_for_device(i2c_dev->dma_dev,
i2c_dev->dma_phys,
xfer_size, DMA_TO_DEVICE);
@@ -1344,7 +1348,7 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev,
}
if (i2c_dev->msg_read && i2c_dev->msg_err == I2C_ERR_NONE) {
- dma_sync_single_for_cpu(i2c_dev->dev,
+ dma_sync_single_for_cpu(i2c_dev->dma_dev,
i2c_dev->dma_phys,
xfer_size, DMA_FROM_DEVICE);
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index cc2222b85c88..26d1772179b8 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1556,7 +1556,7 @@ static bool validate_ipv4_net_dev(struct net_device *net_dev,
return false;
memset(&fl4, 0, sizeof(fl4));
- fl4.flowi4_iif = net_dev->ifindex;
+ fl4.flowi4_oif = net_dev->ifindex;
fl4.daddr = daddr;
fl4.saddr = saddr;
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index ae60c73babcc..b69e2c4e4d2a 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2815,10 +2815,18 @@ static int __init ib_core_init(void)
nldev_init();
rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
- roce_gid_mgmt_init();
+ ret = roce_gid_mgmt_init();
+ if (ret) {
+ pr_warn("Couldn't init RoCE GID management\n");
+ goto err_parent;
+ }
return 0;
+err_parent:
+ rdma_nl_unregister(RDMA_NL_LS);
+ nldev_exit();
+ unregister_pernet_device(&rdma_dev_net_ops);
err_compat:
unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
err_sa:
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index b92358f606d0..12dc97067ed2 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -2537,7 +2537,7 @@ void __init nldev_init(void)
rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table);
}
-void __exit nldev_exit(void)
+void nldev_exit(void)
{
rdma_nl_unregister(RDMA_NL_NLDEV);
}
diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c
index 94b94cca4870..15ee92081118 100644
--- a/drivers/infiniband/hw/efa/efa_main.c
+++ b/drivers/infiniband/hw/efa/efa_main.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
/*
- * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved.
*/
#include <linux/module.h>
@@ -14,10 +14,12 @@
#define PCI_DEV_ID_EFA0_VF 0xefa0
#define PCI_DEV_ID_EFA1_VF 0xefa1
+#define PCI_DEV_ID_EFA2_VF 0xefa2
static const struct pci_device_id efa_pci_tbl[] = {
{ PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA0_VF) },
{ PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA1_VF) },
+ { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA2_VF) },
{ }
};
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
index 3d42bd2b36bd..51ae58c02b15 100644
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -913,8 +913,7 @@ void sc_disable(struct send_context *sc)
spin_unlock(&sc->release_lock);
write_seqlock(&sc->waitlock);
- if (!list_empty(&sc->piowait))
- list_move(&sc->piowait, &wake_list);
+ list_splice_init(&sc->piowait, &wake_list);
write_sequnlock(&sc->waitlock);
while (!list_empty(&wake_list)) {
struct iowait *wait;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 1ead35fb031b..1435fe2ea176 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -118,7 +118,6 @@ static const u32 hns_roce_op_code[] = {
HR_OPC_MAP(ATOMIC_CMP_AND_SWP, ATOM_CMP_AND_SWAP),
HR_OPC_MAP(ATOMIC_FETCH_AND_ADD, ATOM_FETCH_AND_ADD),
HR_OPC_MAP(SEND_WITH_INV, SEND_WITH_INV),
- HR_OPC_MAP(LOCAL_INV, LOCAL_INV),
HR_OPC_MAP(MASKED_ATOMIC_CMP_AND_SWP, ATOM_MSK_CMP_AND_SWAP),
HR_OPC_MAP(MASKED_ATOMIC_FETCH_AND_ADD, ATOM_MSK_FETCH_AND_ADD),
HR_OPC_MAP(REG_MR, FAST_REG_PMR),
@@ -559,9 +558,6 @@ static int set_rc_opcode(struct hns_roce_dev *hr_dev,
else
ret = -EOPNOTSUPP;
break;
- case IB_WR_LOCAL_INV:
- hr_reg_enable(rc_sq_wqe, RC_SEND_WQE_SO);
- fallthrough;
case IB_WR_SEND_WITH_INV:
rc_sq_wqe->inv_key = cpu_to_le32(wr->ex.invalidate_rkey);
break;
@@ -2805,8 +2801,12 @@ static int free_mr_modify_qp(struct hns_roce_dev *hr_dev)
static int free_mr_init(struct hns_roce_dev *hr_dev)
{
+ struct hns_roce_v2_priv *priv = hr_dev->priv;
+ struct hns_roce_v2_free_mr *free_mr = &priv->free_mr;
int ret;
+ mutex_init(&free_mr->mutex);
+
ret = free_mr_alloc_res(hr_dev);
if (ret)
return ret;
@@ -3222,7 +3222,6 @@ static int hns_roce_v2_write_mtpt(struct hns_roce_dev *hr_dev,
hr_reg_write(mpt_entry, MPT_ST, V2_MPT_ST_VALID);
hr_reg_write(mpt_entry, MPT_PD, mr->pd);
- hr_reg_enable(mpt_entry, MPT_L_INV_EN);
hr_reg_write_bool(mpt_entry, MPT_BIND_EN,
mr->access & IB_ACCESS_MW_BIND);
@@ -3313,7 +3312,6 @@ static int hns_roce_v2_frmr_write_mtpt(struct hns_roce_dev *hr_dev,
hr_reg_enable(mpt_entry, MPT_RA_EN);
hr_reg_enable(mpt_entry, MPT_R_INV_EN);
- hr_reg_enable(mpt_entry, MPT_L_INV_EN);
hr_reg_enable(mpt_entry, MPT_FRE);
hr_reg_clear(mpt_entry, MPT_MR_MW);
@@ -3345,7 +3343,6 @@ static int hns_roce_v2_mw_write_mtpt(void *mb_buf, struct hns_roce_mw *mw)
hr_reg_write(mpt_entry, MPT_PD, mw->pdn);
hr_reg_enable(mpt_entry, MPT_R_INV_EN);
- hr_reg_enable(mpt_entry, MPT_L_INV_EN);
hr_reg_enable(mpt_entry, MPT_LW_EN);
hr_reg_enable(mpt_entry, MPT_MR_MW);
@@ -3794,7 +3791,6 @@ static const u32 wc_send_op_map[] = {
HR_WC_OP_MAP(RDMA_READ, RDMA_READ),
HR_WC_OP_MAP(RDMA_WRITE, RDMA_WRITE),
HR_WC_OP_MAP(RDMA_WRITE_WITH_IMM, RDMA_WRITE),
- HR_WC_OP_MAP(LOCAL_INV, LOCAL_INV),
HR_WC_OP_MAP(ATOM_CMP_AND_SWAP, COMP_SWAP),
HR_WC_OP_MAP(ATOM_FETCH_AND_ADD, FETCH_ADD),
HR_WC_OP_MAP(ATOM_MSK_CMP_AND_SWAP, MASKED_COMP_SWAP),
@@ -3844,9 +3840,6 @@ static void fill_send_wc(struct ib_wc *wc, struct hns_roce_v2_cqe *cqe)
case HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM:
wc->wc_flags |= IB_WC_WITH_IMM;
break;
- case HNS_ROCE_V2_WQE_OP_LOCAL_INV:
- wc->wc_flags |= IB_WC_WITH_INVALIDATE;
- break;
case HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP:
case HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD:
case HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP:
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index b11579027e82..c7bf2d52c1cd 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -179,7 +179,6 @@ enum {
HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP = 0x8,
HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD = 0x9,
HNS_ROCE_V2_WQE_OP_FAST_REG_PMR = 0xa,
- HNS_ROCE_V2_WQE_OP_LOCAL_INV = 0xb,
HNS_ROCE_V2_WQE_OP_BIND_MW = 0xc,
HNS_ROCE_V2_WQE_OP_MASK = 0x1f,
};
@@ -915,7 +914,6 @@ struct hns_roce_v2_rc_send_wqe {
#define RC_SEND_WQE_OWNER RC_SEND_WQE_FIELD_LOC(7, 7)
#define RC_SEND_WQE_CQE RC_SEND_WQE_FIELD_LOC(8, 8)
#define RC_SEND_WQE_FENCE RC_SEND_WQE_FIELD_LOC(9, 9)
-#define RC_SEND_WQE_SO RC_SEND_WQE_FIELD_LOC(10, 10)
#define RC_SEND_WQE_SE RC_SEND_WQE_FIELD_LOC(11, 11)
#define RC_SEND_WQE_INLINE RC_SEND_WQE_FIELD_LOC(12, 12)
#define RC_SEND_WQE_WQE_INDEX RC_SEND_WQE_FIELD_LOC(30, 15)
diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index 5152f10d2e6d..ba0c3e4c07d8 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -344,6 +344,10 @@ static int qedr_alloc_resources(struct qedr_dev *dev)
if (IS_IWARP(dev)) {
xa_init(&dev->qps);
dev->iwarp_wq = create_singlethread_workqueue("qedr_iwarpq");
+ if (!dev->iwarp_wq) {
+ rc = -ENOMEM;
+ goto err1;
+ }
}
/* Allocate Status blocks for CNQ */
@@ -351,7 +355,7 @@ static int qedr_alloc_resources(struct qedr_dev *dev)
GFP_KERNEL);
if (!dev->sb_array) {
rc = -ENOMEM;
- goto err1;
+ goto err_destroy_wq;
}
dev->cnq_array = kcalloc(dev->num_cnq,
@@ -402,6 +406,9 @@ err3:
kfree(dev->cnq_array);
err2:
kfree(dev->sb_array);
+err_destroy_wq:
+ if (IS_IWARP(dev))
+ destroy_workqueue(dev->iwarp_wq);
err1:
kfree(dev->sgid_tbl);
return rc;
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index ed5a09e86417..693081e813ec 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -806,8 +806,10 @@ static enum resp_states read_reply(struct rxe_qp *qp,
skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload,
res->cur_psn, AETH_ACK_UNLIMITED);
- if (!skb)
+ if (!skb) {
+ rxe_put(mr);
return RESPST_ERR_RNR;
+ }
rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt),
payload, RXE_FROM_MR_OBJ);
diff --git a/drivers/isdn/hardware/mISDN/netjet.c b/drivers/isdn/hardware/mISDN/netjet.c
index a52f275f8263..f8447135a902 100644
--- a/drivers/isdn/hardware/mISDN/netjet.c
+++ b/drivers/isdn/hardware/mISDN/netjet.c
@@ -956,7 +956,7 @@ nj_release(struct tiger_hw *card)
}
if (card->irq > 0)
free_irq(card->irq, card);
- if (card->isac.dch.dev.dev.class)
+ if (device_is_registered(&card->isac.dch.dev.dev))
mISDN_unregister_device(&card->isac.dch.dev);
for (i = 0; i < 2; i++) {
diff --git a/drivers/isdn/mISDN/core.c b/drivers/isdn/mISDN/core.c
index a41b4b264594..7ea0100f218a 100644
--- a/drivers/isdn/mISDN/core.c
+++ b/drivers/isdn/mISDN/core.c
@@ -233,11 +233,12 @@ mISDN_register_device(struct mISDNdevice *dev,
if (debug & DEBUG_CORE)
printk(KERN_DEBUG "mISDN_register %s %d\n",
dev_name(&dev->dev), dev->id);
+ dev->dev.class = &mISDN_class;
+
err = create_stack(dev);
if (err)
goto error1;
- dev->dev.class = &mISDN_class;
dev->dev.platform_data = dev;
dev->dev.parent = parent;
dev_set_drvdata(&dev->dev, dev);
@@ -249,8 +250,8 @@ mISDN_register_device(struct mISDNdevice *dev,
error3:
delete_stack(dev);
- return err;
error1:
+ put_device(&dev->dev);
return err;
}
diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c
index b9107fe40023..5b139f2206b6 100644
--- a/drivers/net/dsa/dsa_loop.c
+++ b/drivers/net/dsa/dsa_loop.c
@@ -376,6 +376,17 @@ static struct mdio_driver dsa_loop_drv = {
#define NUM_FIXED_PHYS (DSA_LOOP_NUM_PORTS - 2)
+static void dsa_loop_phydevs_unregister(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < NUM_FIXED_PHYS; i++)
+ if (!IS_ERR(phydevs[i])) {
+ fixed_phy_unregister(phydevs[i]);
+ phy_device_free(phydevs[i]);
+ }
+}
+
static int __init dsa_loop_init(void)
{
struct fixed_phy_status status = {
@@ -383,23 +394,23 @@ static int __init dsa_loop_init(void)
.speed = SPEED_100,
.duplex = DUPLEX_FULL,
};
- unsigned int i;
+ unsigned int i, ret;
for (i = 0; i < NUM_FIXED_PHYS; i++)
phydevs[i] = fixed_phy_register(PHY_POLL, &status, NULL);
- return mdio_driver_register(&dsa_loop_drv);
+ ret = mdio_driver_register(&dsa_loop_drv);
+ if (ret)
+ dsa_loop_phydevs_unregister();
+
+ return ret;
}
module_init(dsa_loop_init);
static void __exit dsa_loop_exit(void)
{
- unsigned int i;
-
mdio_driver_unregister(&dsa_loop_drv);
- for (i = 0; i < NUM_FIXED_PHYS; i++)
- if (!IS_ERR(phydevs[i]))
- fixed_phy_unregister(phydevs[i]);
+ dsa_loop_phydevs_unregister();
}
module_exit(dsa_loop_exit);
diff --git a/drivers/net/ethernet/adi/adin1110.c b/drivers/net/ethernet/adi/adin1110.c
index 1744d623999d..606c97610808 100644
--- a/drivers/net/ethernet/adi/adin1110.c
+++ b/drivers/net/ethernet/adi/adin1110.c
@@ -1512,16 +1512,15 @@ static struct notifier_block adin1110_switchdev_notifier = {
.notifier_call = adin1110_switchdev_event,
};
-static void adin1110_unregister_notifiers(void *data)
+static void adin1110_unregister_notifiers(void)
{
unregister_switchdev_blocking_notifier(&adin1110_switchdev_blocking_notifier);
unregister_switchdev_notifier(&adin1110_switchdev_notifier);
unregister_netdevice_notifier(&adin1110_netdevice_nb);
}
-static int adin1110_setup_notifiers(struct adin1110_priv *priv)
+static int adin1110_setup_notifiers(void)
{
- struct device *dev = &priv->spidev->dev;
int ret;
ret = register_netdevice_notifier(&adin1110_netdevice_nb);
@@ -1536,13 +1535,14 @@ static int adin1110_setup_notifiers(struct adin1110_priv *priv)
if (ret < 0)
goto err_sdev;
- return devm_add_action_or_reset(dev, adin1110_unregister_notifiers, NULL);
+ return 0;
err_sdev:
unregister_switchdev_notifier(&adin1110_switchdev_notifier);
err_netdev:
unregister_netdevice_notifier(&adin1110_netdevice_nb);
+
return ret;
}
@@ -1613,10 +1613,6 @@ static int adin1110_probe_netdevs(struct adin1110_priv *priv)
if (ret < 0)
return ret;
- ret = adin1110_setup_notifiers(priv);
- if (ret < 0)
- return ret;
-
for (i = 0; i < priv->cfg->ports_nr; i++) {
ret = devm_register_netdev(dev, priv->ports[i]->netdev);
if (ret < 0) {
@@ -1693,7 +1689,31 @@ static struct spi_driver adin1110_driver = {
.probe = adin1110_probe,
.id_table = adin1110_spi_id,
};
-module_spi_driver(adin1110_driver);
+
+static int __init adin1110_driver_init(void)
+{
+ int ret;
+
+ ret = adin1110_setup_notifiers();
+ if (ret < 0)
+ return ret;
+
+ ret = spi_register_driver(&adin1110_driver);
+ if (ret < 0) {
+ adin1110_unregister_notifiers();
+ return ret;
+ }
+
+ return 0;
+}
+
+static void __exit adin1110_exit(void)
+{
+ adin1110_unregister_notifiers();
+ spi_unregister_driver(&adin1110_driver);
+}
+module_init(adin1110_driver_init);
+module_exit(adin1110_exit);
MODULE_DESCRIPTION("ADIN1110 Network driver");
MODULE_AUTHOR("Alexandru Tachici <alexandru.tachici@analog.com>");
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 28ef4d3c1878..f623c12eaf95 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -713,7 +713,7 @@ fec_enet_txq_put_data_tso(struct fec_enet_priv_tx_q *txq, struct sk_buff *skb,
dev_kfree_skb_any(skb);
if (net_ratelimit())
netdev_err(ndev, "Tx DMA memory map failed\n");
- return NETDEV_TX_BUSY;
+ return NETDEV_TX_OK;
}
bdp->cbd_datlen = cpu_to_fec16(size);
@@ -775,7 +775,7 @@ fec_enet_txq_put_hdr_tso(struct fec_enet_priv_tx_q *txq,
dev_kfree_skb_any(skb);
if (net_ratelimit())
netdev_err(ndev, "Tx DMA memory map failed\n");
- return NETDEV_TX_BUSY;
+ return NETDEV_TX_OK;
}
}
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 65dbfbec487a..9282381a438f 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -3007,19 +3007,19 @@ static void __ibmvnic_reset(struct work_struct *work)
rwi = get_next_rwi(adapter);
/*
- * If there is another reset queued, free the previous rwi
- * and process the new reset even if previous reset failed
- * (the previous reset could have failed because of a fail
- * over for instance, so process the fail over).
- *
* If there are no resets queued and the previous reset failed,
* the adapter would be in an undefined state. So retry the
* previous reset as a hard reset.
+ *
+ * Else, free the previous rwi and, if there is another reset
+ * queued, process the new reset even if previous reset failed
+ * (the previous reset could have failed because of a fail
+ * over for instance, so process the fail over).
*/
- if (rwi)
- kfree(tmprwi);
- else if (rc)
+ if (!rwi && rc)
rwi = tmprwi;
+ else
+ kfree(tmprwi);
if (rwi && (rwi->reset_reason == VNIC_RESET_FAILOVER ||
rwi->reset_reason == VNIC_RESET_MOBILITY || rc))
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
index a42035cec611..e6948939ccc2 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
@@ -414,13 +414,15 @@ static struct sk_buff *lan966x_fdma_rx_get_frame(struct lan966x_rx *rx)
/* Get the received frame and unmap it */
db = &rx->dcbs[rx->dcb_index].db[rx->db_index];
page = rx->page[rx->dcb_index][rx->db_index];
+
+ dma_sync_single_for_cpu(lan966x->dev, (dma_addr_t)db->dataptr,
+ FDMA_DCB_STATUS_BLOCKL(db->status),
+ DMA_FROM_DEVICE);
+
skb = build_skb(page_address(page), PAGE_SIZE << rx->page_order);
if (unlikely(!skb))
goto unmap_page;
- dma_unmap_single(lan966x->dev, (dma_addr_t)db->dataptr,
- FDMA_DCB_STATUS_BLOCKL(db->status),
- DMA_FROM_DEVICE);
skb_put(skb, FDMA_DCB_STATUS_BLOCKL(db->status));
lan966x_ifh_get_src_port(skb->data, &src_port);
@@ -429,6 +431,10 @@ static struct sk_buff *lan966x_fdma_rx_get_frame(struct lan966x_rx *rx)
if (WARN_ON(src_port >= lan966x->num_phys_ports))
goto free_skb;
+ dma_unmap_single_attrs(lan966x->dev, (dma_addr_t)db->dataptr,
+ PAGE_SIZE << rx->page_order, DMA_FROM_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC);
+
skb->dev = lan966x->ports[src_port]->dev;
skb_pull(skb, IFH_LEN * sizeof(u32));
@@ -454,9 +460,9 @@ static struct sk_buff *lan966x_fdma_rx_get_frame(struct lan966x_rx *rx)
free_skb:
kfree_skb(skb);
unmap_page:
- dma_unmap_page(lan966x->dev, (dma_addr_t)db->dataptr,
- FDMA_DCB_STATUS_BLOCKL(db->status),
- DMA_FROM_DEVICE);
+ dma_unmap_single_attrs(lan966x->dev, (dma_addr_t)db->dataptr,
+ PAGE_SIZE << rx->page_order, DMA_FROM_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC);
__free_pages(page, rx->page_order);
return NULL;
@@ -668,12 +674,14 @@ static int lan966x_fdma_get_max_mtu(struct lan966x *lan966x)
int i;
for (i = 0; i < lan966x->num_phys_ports; ++i) {
+ struct lan966x_port *port;
int mtu;
- if (!lan966x->ports[i])
+ port = lan966x->ports[i];
+ if (!port)
continue;
- mtu = lan966x->ports[i]->dev->mtu;
+ mtu = lan_rd(lan966x, DEV_MAC_MAXLEN_CFG(port->chip_port));
if (mtu > max_mtu)
max_mtu = mtu;
}
@@ -733,6 +741,8 @@ int lan966x_fdma_change_mtu(struct lan966x *lan966x)
max_mtu = lan966x_fdma_get_max_mtu(lan966x);
max_mtu += IFH_LEN * sizeof(u32);
+ max_mtu += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ max_mtu += VLAN_HLEN * 2;
if (round_up(max_mtu, PAGE_SIZE) / PAGE_SIZE - 1 ==
lan966x->rx.page_order)
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
index be2fd030cccb..20ee5b28f70a 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
@@ -386,7 +386,7 @@ static int lan966x_port_change_mtu(struct net_device *dev, int new_mtu)
int old_mtu = dev->mtu;
int err;
- lan_wr(DEV_MAC_MAXLEN_CFG_MAX_LEN_SET(new_mtu),
+ lan_wr(DEV_MAC_MAXLEN_CFG_MAX_LEN_SET(LAN966X_HW_MTU(new_mtu)),
lan966x, DEV_MAC_MAXLEN_CFG(port->chip_port));
dev->mtu = new_mtu;
@@ -395,7 +395,7 @@ static int lan966x_port_change_mtu(struct net_device *dev, int new_mtu)
err = lan966x_fdma_change_mtu(lan966x);
if (err) {
- lan_wr(DEV_MAC_MAXLEN_CFG_MAX_LEN_SET(old_mtu),
+ lan_wr(DEV_MAC_MAXLEN_CFG_MAX_LEN_SET(LAN966X_HW_MTU(old_mtu)),
lan966x, DEV_MAC_MAXLEN_CFG(port->chip_port));
dev->mtu = old_mtu;
}
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h
index 9656071b8289..4ec33999e4df 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h
@@ -26,6 +26,8 @@
#define LAN966X_BUFFER_MEMORY (160 * 1024)
#define LAN966X_BUFFER_MIN_SZ 60
+#define LAN966X_HW_MTU(mtu) ((mtu) + ETH_HLEN + ETH_FCS_LEN)
+
#define PGID_AGGR 64
#define PGID_SRC 80
#define PGID_ENTRIES 89
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_regs.h b/drivers/net/ethernet/microchip/lan966x/lan966x_regs.h
index 1d90b93dd417..fb5087fef22e 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_regs.h
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_regs.h
@@ -585,6 +585,21 @@ enum lan966x_target {
#define DEV_MAC_MAXLEN_CFG_MAX_LEN_GET(x)\
FIELD_GET(DEV_MAC_MAXLEN_CFG_MAX_LEN, x)
+/* DEV:MAC_CFG_STATUS:MAC_TAGS_CFG */
+#define DEV_MAC_TAGS_CFG(t) __REG(TARGET_DEV, t, 8, 28, 0, 1, 44, 12, 0, 1, 4)
+
+#define DEV_MAC_TAGS_CFG_VLAN_DBL_AWR_ENA BIT(1)
+#define DEV_MAC_TAGS_CFG_VLAN_DBL_AWR_ENA_SET(x)\
+ FIELD_PREP(DEV_MAC_TAGS_CFG_VLAN_DBL_AWR_ENA, x)
+#define DEV_MAC_TAGS_CFG_VLAN_DBL_AWR_ENA_GET(x)\
+ FIELD_GET(DEV_MAC_TAGS_CFG_VLAN_DBL_AWR_ENA, x)
+
+#define DEV_MAC_TAGS_CFG_VLAN_AWR_ENA BIT(0)
+#define DEV_MAC_TAGS_CFG_VLAN_AWR_ENA_SET(x)\
+ FIELD_PREP(DEV_MAC_TAGS_CFG_VLAN_AWR_ENA, x)
+#define DEV_MAC_TAGS_CFG_VLAN_AWR_ENA_GET(x)\
+ FIELD_GET(DEV_MAC_TAGS_CFG_VLAN_AWR_ENA, x)
+
/* DEV:MAC_CFG_STATUS:MAC_IFG_CFG */
#define DEV_MAC_IFG_CFG(t) __REG(TARGET_DEV, t, 8, 28, 0, 1, 44, 20, 0, 1, 4)
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_vlan.c b/drivers/net/ethernet/microchip/lan966x/lan966x_vlan.c
index 8d7260cd7da9..3c44660128da 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_vlan.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_vlan.c
@@ -169,6 +169,12 @@ void lan966x_vlan_port_apply(struct lan966x_port *port)
ANA_VLAN_CFG_VLAN_POP_CNT,
lan966x, ANA_VLAN_CFG(port->chip_port));
+ lan_rmw(DEV_MAC_TAGS_CFG_VLAN_AWR_ENA_SET(port->vlan_aware) |
+ DEV_MAC_TAGS_CFG_VLAN_DBL_AWR_ENA_SET(port->vlan_aware),
+ DEV_MAC_TAGS_CFG_VLAN_AWR_ENA |
+ DEV_MAC_TAGS_CFG_VLAN_DBL_AWR_ENA,
+ lan966x, DEV_MAC_TAGS_CFG(port->chip_port));
+
/* Drop frames with multicast source address */
val = ANA_DROP_CFG_DROP_MC_SMAC_ENA_SET(1);
if (port->vlan_aware && !pvid)
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 054d5ce6029e..0556542d7a6b 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -1059,8 +1059,10 @@ static int efx_pci_probe(struct pci_dev *pci_dev,
/* Allocate and initialise a struct net_device */
net_dev = alloc_etherdev_mq(sizeof(probe_data), EFX_MAX_CORE_TX_QUEUES);
- if (!net_dev)
- return -ENOMEM;
+ if (!net_dev) {
+ rc = -ENOMEM;
+ goto fail0;
+ }
probe_ptr = netdev_priv(net_dev);
*probe_ptr = probe_data;
efx->net_dev = net_dev;
@@ -1132,6 +1134,8 @@ static int efx_pci_probe(struct pci_dev *pci_dev,
WARN_ON(rc > 0);
netif_dbg(efx, drv, efx->net_dev, "initialisation failed. rc=%d\n", rc);
free_netdev(net_dev);
+ fail0:
+ kfree(probe_data);
return rc;
}
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c
index 017dbbda0c1c..79fa7870563b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c
@@ -51,7 +51,6 @@ static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id
struct stmmac_resources res;
struct device_node *np;
int ret, i, phy_mode;
- bool mdio = false;
np = dev_of_node(&pdev->dev);
@@ -69,12 +68,10 @@ static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id
if (!plat)
return -ENOMEM;
+ plat->mdio_node = of_get_child_by_name(np, "mdio");
if (plat->mdio_node) {
- dev_err(&pdev->dev, "Found MDIO subnode\n");
- mdio = true;
- }
+ dev_info(&pdev->dev, "Found MDIO subnode\n");
- if (mdio) {
plat->mdio_bus_data = devm_kzalloc(&pdev->dev,
sizeof(*plat->mdio_bus_data),
GFP_KERNEL);
diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
index 05848ff15fb5..a3967f8de417 100644
--- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c
+++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
@@ -108,7 +108,7 @@
* @next_tx_buf_to_use: next Tx buffer to write to
* @next_rx_buf_to_use: next Rx buffer to read from
* @base_addr: base address of the Emaclite device
- * @reset_lock: lock used for synchronization
+ * @reset_lock: lock to serialize xmit and tx_timeout execution
* @deferred_skb: holds an skb (for transmission at a later time) when the
* Tx buffer is not free
* @phy_dev: pointer to the PHY device
diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index f82090bdf7ab..1cd604cd1fa1 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -583,7 +583,7 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner)
}
for (i = 0; i < PHY_MAX_ADDR; i++) {
- if ((bus->phy_mask & (1 << i)) == 0) {
+ if ((bus->phy_mask & BIT(i)) == 0) {
struct phy_device *phydev;
phydev = mdiobus_scan(bus, i);
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 27c6d235cbda..946628050f28 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1459,7 +1459,8 @@ static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
int err;
int i;
- if (it->nr_segs > MAX_SKB_FRAGS + 1)
+ if (it->nr_segs > MAX_SKB_FRAGS + 1 ||
+ len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN))
return ERR_PTR(-EMSGSIZE);
local_bh_disable();
diff --git a/drivers/nfc/fdp/fdp.c b/drivers/nfc/fdp/fdp.c
index c6b3334f24c9..f12f903a9dd1 100644
--- a/drivers/nfc/fdp/fdp.c
+++ b/drivers/nfc/fdp/fdp.c
@@ -249,11 +249,19 @@ static int fdp_nci_close(struct nci_dev *ndev)
static int fdp_nci_send(struct nci_dev *ndev, struct sk_buff *skb)
{
struct fdp_nci_info *info = nci_get_drvdata(ndev);
+ int ret;
if (atomic_dec_and_test(&info->data_pkt_counter))
info->data_pkt_counter_cb(ndev);
- return info->phy_ops->write(info->phy, skb);
+ ret = info->phy_ops->write(info->phy, skb);
+ if (ret < 0) {
+ kfree_skb(skb);
+ return ret;
+ }
+
+ consume_skb(skb);
+ return 0;
}
static int fdp_nci_request_firmware(struct nci_dev *ndev)
diff --git a/drivers/nfc/nfcmrvl/i2c.c b/drivers/nfc/nfcmrvl/i2c.c
index acef0cfd76af..24436c9e54c9 100644
--- a/drivers/nfc/nfcmrvl/i2c.c
+++ b/drivers/nfc/nfcmrvl/i2c.c
@@ -132,10 +132,15 @@ static int nfcmrvl_i2c_nci_send(struct nfcmrvl_private *priv,
ret = -EREMOTEIO;
} else
ret = 0;
+ }
+
+ if (ret) {
kfree_skb(skb);
+ return ret;
}
- return ret;
+ consume_skb(skb);
+ return 0;
}
static void nfcmrvl_i2c_nci_update_config(struct nfcmrvl_private *priv,
diff --git a/drivers/nfc/nxp-nci/core.c b/drivers/nfc/nxp-nci/core.c
index 7c93d484dc1b..580cb6ecffee 100644
--- a/drivers/nfc/nxp-nci/core.c
+++ b/drivers/nfc/nxp-nci/core.c
@@ -80,10 +80,13 @@ static int nxp_nci_send(struct nci_dev *ndev, struct sk_buff *skb)
return -EINVAL;
r = info->phy_ops->write(info->phy_id, skb);
- if (r < 0)
+ if (r < 0) {
kfree_skb(skb);
+ return r;
+ }
- return r;
+ consume_skb(skb);
+ return 0;
}
static int nxp_nci_rf_pll_unlocked_ntf(struct nci_dev *ndev,
diff --git a/drivers/nfc/s3fwrn5/core.c b/drivers/nfc/s3fwrn5/core.c
index 1c412007fabb..0270e05b68df 100644
--- a/drivers/nfc/s3fwrn5/core.c
+++ b/drivers/nfc/s3fwrn5/core.c
@@ -110,11 +110,15 @@ static int s3fwrn5_nci_send(struct nci_dev *ndev, struct sk_buff *skb)
}
ret = s3fwrn5_write(info, skb);
- if (ret < 0)
+ if (ret < 0) {
kfree_skb(skb);
+ mutex_unlock(&info->mutex);
+ return ret;
+ }
+ consume_skb(skb);
mutex_unlock(&info->mutex);
- return ret;
+ return 0;
}
static int s3fwrn5_nci_post_setup(struct nci_dev *ndev)
diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c
index bdef7a8d6ab8..bcc1dae00780 100644
--- a/drivers/parisc/iosapic.c
+++ b/drivers/parisc/iosapic.c
@@ -866,6 +866,7 @@ int iosapic_serial_irq(struct parisc_device *dev)
return vi->txn_irq;
}
+EXPORT_SYMBOL(iosapic_serial_irq);
#endif
diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c
index d9e51036a4fa..d6af5726ddf3 100644
--- a/drivers/parisc/pdc_stable.c
+++ b/drivers/parisc/pdc_stable.c
@@ -14,7 +14,7 @@
* all) PA-RISC machines should have them. Anyway, for safety reasons, the
* following code can deal with just 96 bytes of Stable Storage, and all
* sizes between 96 and 192 bytes (provided they are multiple of struct
- * device_path size, eg: 128, 160 and 192) to provide full information.
+ * pdc_module_path size, eg: 128, 160 and 192) to provide full information.
* One last word: there's one path we can always count on: the primary path.
* Anything above 224 bytes is used for 'osdep2' OS-dependent storage area.
*
@@ -88,7 +88,7 @@ struct pdcspath_entry {
short ready; /* entry record is valid if != 0 */
unsigned long addr; /* entry address in stable storage */
char *name; /* entry name */
- struct device_path devpath; /* device path in parisc representation */
+ struct pdc_module_path devpath; /* device path in parisc representation */
struct device *dev; /* corresponding device */
struct kobject kobj;
};
@@ -138,7 +138,7 @@ struct pdcspath_attribute paths_attr_##_name = { \
static int
pdcspath_fetch(struct pdcspath_entry *entry)
{
- struct device_path *devpath;
+ struct pdc_module_path *devpath;
if (!entry)
return -EINVAL;
@@ -153,7 +153,7 @@ pdcspath_fetch(struct pdcspath_entry *entry)
return -EIO;
/* Find the matching device.
- NOTE: hardware_path overlays with device_path, so the nice cast can
+ NOTE: hardware_path overlays with pdc_module_path, so the nice cast can
be used */
entry->dev = hwpath_to_device((struct hardware_path *)devpath);
@@ -179,7 +179,7 @@ pdcspath_fetch(struct pdcspath_entry *entry)
static void
pdcspath_store(struct pdcspath_entry *entry)
{
- struct device_path *devpath;
+ struct pdc_module_path *devpath;
BUG_ON(!entry);
@@ -221,7 +221,7 @@ static ssize_t
pdcspath_hwpath_read(struct pdcspath_entry *entry, char *buf)
{
char *out = buf;
- struct device_path *devpath;
+ struct pdc_module_path *devpath;
short i;
if (!entry || !buf)
@@ -236,11 +236,11 @@ pdcspath_hwpath_read(struct pdcspath_entry *entry, char *buf)
return -ENODATA;
for (i = 0; i < 6; i++) {
- if (devpath->bc[i] >= 128)
+ if (devpath->path.bc[i] < 0)
continue;
- out += sprintf(out, "%u/", (unsigned char)devpath->bc[i]);
+ out += sprintf(out, "%d/", devpath->path.bc[i]);
}
- out += sprintf(out, "%u\n", (unsigned char)devpath->mod);
+ out += sprintf(out, "%u\n", (unsigned char)devpath->path.mod);
return out - buf;
}
@@ -296,12 +296,12 @@ pdcspath_hwpath_write(struct pdcspath_entry *entry, const char *buf, size_t coun
for (i=5; ((temp = strrchr(in, '/'))) && (temp-in > 0) && (likely(i)); i--) {
hwpath.bc[i] = simple_strtoul(temp+1, NULL, 10);
in[temp-in] = '\0';
- DPRINTK("%s: bc[%d]: %d\n", __func__, i, hwpath.bc[i]);
+ DPRINTK("%s: bc[%d]: %d\n", __func__, i, hwpath.path.bc[i]);
}
/* Store the final field */
hwpath.bc[i] = simple_strtoul(in, NULL, 10);
- DPRINTK("%s: bc[%d]: %d\n", __func__, i, hwpath.bc[i]);
+ DPRINTK("%s: bc[%d]: %d\n", __func__, i, hwpath.path.bc[i]);
/* Now we check that the user isn't trying to lure us */
if (!(dev = hwpath_to_device((struct hardware_path *)&hwpath))) {
@@ -342,7 +342,7 @@ static ssize_t
pdcspath_layer_read(struct pdcspath_entry *entry, char *buf)
{
char *out = buf;
- struct device_path *devpath;
+ struct pdc_module_path *devpath;
short i;
if (!entry || !buf)
@@ -547,7 +547,7 @@ static ssize_t pdcs_auto_read(struct kobject *kobj,
pathentry = &pdcspath_entry_primary;
read_lock(&pathentry->rw_lock);
- out += sprintf(out, "%s\n", (pathentry->devpath.flags & knob) ?
+ out += sprintf(out, "%s\n", (pathentry->devpath.path.flags & knob) ?
"On" : "Off");
read_unlock(&pathentry->rw_lock);
@@ -594,8 +594,8 @@ static ssize_t pdcs_timer_read(struct kobject *kobj,
/* print the timer value in seconds */
read_lock(&pathentry->rw_lock);
- out += sprintf(out, "%u\n", (pathentry->devpath.flags & PF_TIMER) ?
- (1 << (pathentry->devpath.flags & PF_TIMER)) : 0);
+ out += sprintf(out, "%u\n", (pathentry->devpath.path.flags & PF_TIMER) ?
+ (1 << (pathentry->devpath.path.flags & PF_TIMER)) : 0);
read_unlock(&pathentry->rw_lock);
return out - buf;
@@ -764,7 +764,7 @@ static ssize_t pdcs_auto_write(struct kobject *kobj,
/* Be nice to the existing flag record */
read_lock(&pathentry->rw_lock);
- flags = pathentry->devpath.flags;
+ flags = pathentry->devpath.path.flags;
read_unlock(&pathentry->rw_lock);
DPRINTK("%s: flags before: 0x%X\n", __func__, flags);
@@ -785,7 +785,7 @@ static ssize_t pdcs_auto_write(struct kobject *kobj,
write_lock(&pathentry->rw_lock);
/* Change the path entry flags first */
- pathentry->devpath.flags = flags;
+ pathentry->devpath.path.flags = flags;
/* Now, dive in. Write back to the hardware */
pdcspath_store(pathentry);
diff --git a/drivers/soc/imx/imx93-pd.c b/drivers/soc/imx/imx93-pd.c
index 1f3d7039c1de..4d235c8c4924 100644
--- a/drivers/soc/imx/imx93-pd.c
+++ b/drivers/soc/imx/imx93-pd.c
@@ -135,11 +135,24 @@ static int imx93_pd_probe(struct platform_device *pdev)
ret = pm_genpd_init(&domain->genpd, NULL, domain->init_off);
if (ret)
- return ret;
+ goto err_clk_unprepare;
platform_set_drvdata(pdev, domain);
- return of_genpd_add_provider_simple(np, &domain->genpd);
+ ret = of_genpd_add_provider_simple(np, &domain->genpd);
+ if (ret)
+ goto err_genpd_remove;
+
+ return 0;
+
+err_genpd_remove:
+ pm_genpd_remove(&domain->genpd);
+
+err_clk_unprepare:
+ if (!domain->init_off)
+ clk_bulk_disable_unprepare(domain->num_clks, domain->clks);
+
+ return ret;
}
static const struct of_device_id imx93_pd_ids[] = {
diff --git a/drivers/tty/serial/8250/8250_gsc.c b/drivers/tty/serial/8250/8250_parisc.c
index 948d0a1c6ae8..948d0a1c6ae8 100644
--- a/drivers/tty/serial/8250/8250_gsc.c
+++ b/drivers/tty/serial/8250/8250_parisc.c
diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig
index d0b49e15fbf5..b0f62345bc84 100644
--- a/drivers/tty/serial/8250/Kconfig
+++ b/drivers/tty/serial/8250/Kconfig
@@ -116,9 +116,9 @@ config SERIAL_8250_CONSOLE
If unsure, say N.
-config SERIAL_8250_GSC
+config SERIAL_8250_PARISC
tristate
- depends on SERIAL_8250 && GSC
+ depends on SERIAL_8250 && PARISC
default SERIAL_8250
config SERIAL_8250_DMA
diff --git a/drivers/tty/serial/8250/Makefile b/drivers/tty/serial/8250/Makefile
index bee908f99ea0..1615bfdde2a0 100644
--- a/drivers/tty/serial/8250/Makefile
+++ b/drivers/tty/serial/8250/Makefile
@@ -12,7 +12,7 @@ obj-$(CONFIG_SERIAL_8250) += 8250.o 8250_base.o
8250_base-$(CONFIG_SERIAL_8250_DMA) += 8250_dma.o
8250_base-$(CONFIG_SERIAL_8250_DWLIB) += 8250_dwlib.o
8250_base-$(CONFIG_SERIAL_8250_FINTEK) += 8250_fintek.o
-obj-$(CONFIG_SERIAL_8250_GSC) += 8250_gsc.o
+obj-$(CONFIG_SERIAL_8250_PARISC) += 8250_parisc.o
obj-$(CONFIG_SERIAL_8250_PCI) += 8250_pci.o
obj-$(CONFIG_SERIAL_8250_EXAR) += 8250_exar.o
obj-$(CONFIG_SERIAL_8250_HP300) += 8250_hp300.o
diff --git a/drivers/watchdog/exar_wdt.c b/drivers/watchdog/exar_wdt.c
index 35058d8b21bc..7c61ff343271 100644
--- a/drivers/watchdog/exar_wdt.c
+++ b/drivers/watchdog/exar_wdt.c
@@ -355,8 +355,10 @@ static int __init exar_wdt_register(struct wdt_priv *priv, const int idx)
&priv->wdt_res, 1,
priv, sizeof(*priv));
if (IS_ERR(n->pdev)) {
+ int err = PTR_ERR(n->pdev);
+
kfree(n);
- return PTR_ERR(n->pdev);
+ return err;
}
list_add_tail(&n->list, &pdev_list);
diff --git a/drivers/watchdog/sp805_wdt.c b/drivers/watchdog/sp805_wdt.c
index 78ba36689eec..2756ed54ca3d 100644
--- a/drivers/watchdog/sp805_wdt.c
+++ b/drivers/watchdog/sp805_wdt.c
@@ -88,7 +88,7 @@ static bool wdt_is_running(struct watchdog_device *wdd)
return (wdtcontrol & ENABLE_MASK) == ENABLE_MASK;
}
-/* This routine finds load value that will reset system in required timout */
+/* This routine finds load value that will reset system in required timeout */
static int wdt_setload(struct watchdog_device *wdd, unsigned int timeout)
{
struct sp805_wdt *wdt = watchdog_get_drvdata(wdd);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 4ec18ceb2f21..18374a6d05bd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -289,8 +289,10 @@ static void prelim_release(struct preftree *preftree)
struct prelim_ref *ref, *next_ref;
rbtree_postorder_for_each_entry_safe(ref, next_ref,
- &preftree->root.rb_root, rbnode)
+ &preftree->root.rb_root, rbnode) {
+ free_inode_elem_list(ref->inode_list);
free_pref(ref);
+ }
preftree->root = RB_ROOT_CACHED;
preftree->count = 0;
@@ -648,6 +650,18 @@ unode_aux_to_inode_list(struct ulist_node *node)
return (struct extent_inode_elem *)(uintptr_t)node->aux;
}
+static void free_leaf_list(struct ulist *ulist)
+{
+ struct ulist_node *node;
+ struct ulist_iterator uiter;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(ulist, &uiter)))
+ free_inode_elem_list(unode_aux_to_inode_list(node));
+
+ ulist_free(ulist);
+}
+
/*
* We maintain three separate rbtrees: one for direct refs, one for
* indirect refs which have a key, and one for indirect refs which do not
@@ -762,7 +776,11 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
cond_resched();
}
out:
- ulist_free(parents);
+ /*
+ * We may have inode lists attached to refs in the parents ulist, so we
+ * must free them before freeing the ulist and its refs.
+ */
+ free_leaf_list(parents);
return ret;
}
@@ -1368,6 +1386,12 @@ again:
if (ret < 0)
goto out;
ref->inode_list = eie;
+ /*
+ * We transferred the list ownership to the ref,
+ * so set to NULL to avoid a double free in case
+ * an error happens after this.
+ */
+ eie = NULL;
}
ret = ulist_add_merge_ptr(refs, ref->parent,
ref->inode_list,
@@ -1393,6 +1417,14 @@ again:
eie->next = ref->inode_list;
}
eie = NULL;
+ /*
+ * We have transferred the inode list ownership from
+ * this ref to the ref we added to the 'refs' ulist.
+ * So set this ref's inode list to NULL to avoid
+ * use-after-free when our caller uses it or double
+ * frees in case an error happens before we return.
+ */
+ ref->inode_list = NULL;
}
cond_resched();
}
@@ -1409,24 +1441,6 @@ out:
return ret;
}
-static void free_leaf_list(struct ulist *blocks)
-{
- struct ulist_node *node = NULL;
- struct extent_inode_elem *eie;
- struct ulist_iterator uiter;
-
- ULIST_ITER_INIT(&uiter);
- while ((node = ulist_next(blocks, &uiter))) {
- if (!node->aux)
- continue;
- eie = unode_aux_to_inode_list(node);
- free_inode_elem_list(eie);
- node->aux = 0;
- }
-
- ulist_free(blocks);
-}
-
/*
* Finds all leafs with a reference to the specified combination of bytenr and
* offset. key_list_head will point to a list of corresponding keys (caller must
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 727595eee973..f677b49df8ae 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3462,7 +3462,10 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
-ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before);
+ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before);
+struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before);
extern const struct dentry_operations btrfs_dentry_operations;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a2da9313c694..4b28263c3d32 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -166,11 +166,9 @@ static bool btrfs_supported_super_csum(u16 csum_type)
* Return 0 if the superblock checksum type matches the checksum value of that
* algorithm. Pass the raw disk superblock data.
*/
-static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
- char *raw_disk_sb)
+int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *disk_sb)
{
- struct btrfs_super_block *disk_sb =
- (struct btrfs_super_block *)raw_disk_sb;
char result[BTRFS_CSUM_SIZE];
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
@@ -181,7 +179,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
* BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
* filled with zeros and is included in the checksum.
*/
- crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
+ crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
if (memcmp(disk_sb->csum, result, fs_info->csum_size))
@@ -3479,7 +3477,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
* We want to check superblock checksum, the type is stored inside.
* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
*/
- if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) {
+ if (btrfs_check_super_csum(fs_info, disk_super)) {
btrfs_err(fs_info, "superblock checksum mismatch");
err = -EINVAL;
btrfs_release_disk_super(disk_super);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c67c15d4d20b..9fa923e005a3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -42,6 +42,8 @@ struct extent_buffer *btrfs_find_create_tree_block(
void btrfs_clean_tree_block(struct extent_buffer *buf);
void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info);
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
+int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *disk_sb);
int __cold open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1d4c2397d0d6..fab7eb76e53b 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -58,7 +58,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
}
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
+ u64 root_objectid, u64 generation,
int check_generation)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index f32f4113c976..5afb7ca42828 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -19,7 +19,7 @@ struct btrfs_fid {
} __attribute__ ((packed));
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
+ u64 root_objectid, u64 generation,
int check_generation);
struct dentry *btrfs_get_parent(struct dentry *child);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cd2d36580f1a..2801c991814f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3295,21 +3295,22 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
}
/*
- * If this is a leaf and there are tree mod log users, we may
- * have recorded mod log operations that point to this leaf.
- * So we must make sure no one reuses this leaf's extent before
- * mod log operations are applied to a node, otherwise after
- * rewinding a node using the mod log operations we get an
- * inconsistent btree, as the leaf's extent may now be used as
- * a node or leaf for another different btree.
+ * If there are tree mod log users we may have recorded mod log
+ * operations for this node. If we re-allocate this node we
+ * could replay operations on this node that happened when it
+ * existed in a completely different root. For example if it
+ * was part of root A, then was reallocated to root B, and we
+ * are doing a btrfs_old_search_slot(root b), we could replay
+ * operations that happened when the block was part of root A,
+ * giving us an inconsistent view of the btree.
+ *
* We are safe from races here because at this point no other
* node or root points to this extent buffer, so if after this
- * check a new tree mod log user joins, it will not be able to
- * find a node pointing to this leaf and record operations that
- * point to this leaf.
+ * check a new tree mod log user joins we will not have an
+ * existing log of operations on this node that we have to
+ * contend with.
*/
- if (btrfs_header_level(buf) == 0 &&
- test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+ if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
must_pin = true;
if (must_pin || btrfs_is_zoned(fs_info)) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 176b432035ae..d01631d47806 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1598,14 +1598,19 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
write_bytes);
else
btrfs_check_nocow_unlock(BTRFS_I(inode));
+
+ if (nowait && ret == -ENOSPC)
+ ret = -EAGAIN;
break;
}
release_bytes = reserve_bytes;
again:
ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
- if (ret)
+ if (ret) {
+ btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
break;
+ }
/*
* This is going to setup the pages array with the number of
@@ -1765,6 +1770,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
loff_t endbyte;
ssize_t err;
unsigned int ilock_flags = 0;
+ struct iomap_dio *dio;
if (iocb->ki_flags & IOCB_NOWAIT)
ilock_flags |= BTRFS_ILOCK_TRY;
@@ -1825,11 +1831,22 @@ relock:
* So here we disable page faults in the iov_iter and then retry if we
* got -EFAULT, faulting in the pages before the retry.
*/
-again:
from->nofault = true;
- err = btrfs_dio_rw(iocb, from, written);
+ dio = btrfs_dio_write(iocb, from, written);
from->nofault = false;
+ /*
+ * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
+ * iocb, and that needs to lock the inode. So unlock it before calling
+ * iomap_dio_complete() to avoid a deadlock.
+ */
+ btrfs_inode_unlock(inode, ilock_flags);
+
+ if (IS_ERR_OR_NULL(dio))
+ err = PTR_ERR_OR_ZERO(dio);
+ else
+ err = iomap_dio_complete(dio);
+
/* No increment (+=) because iomap returns a cumulative value. */
if (err > 0)
written = err;
@@ -1855,12 +1872,10 @@ again:
} else {
fault_in_iov_iter_readable(from, left);
prev_left = left;
- goto again;
+ goto relock;
}
}
- btrfs_inode_unlock(inode, ilock_flags);
-
/*
* If 'err' is -ENOTBLK or we have not written all data, then it means
* we must fallback to buffered IO.
@@ -4035,7 +4050,7 @@ again:
*/
pagefault_disable();
to->nofault = true;
- ret = btrfs_dio_rw(iocb, to, read);
+ ret = btrfs_dio_read(iocb, to, read);
to->nofault = false;
pagefault_enable();
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b0807c59e321..0e516aefbf51 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7980,7 +7980,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
*/
status = BLK_STS_RESOURCE;
dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS);
- if (!dip)
+ if (!dip->csums)
goto out_err;
status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
@@ -8078,13 +8078,21 @@ static const struct iomap_dio_ops btrfs_dio_ops = {
.bio_set = &btrfs_dio_bioset,
};
-ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
+ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
{
struct btrfs_dio_data data;
return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL | IOMAP_DIO_NOSYNC,
- &data, done_before);
+ IOMAP_DIO_PARTIAL, &data, done_before);
+}
+
+struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before)
+{
+ struct btrfs_dio_data data;
+
+ return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, &data, done_before);
}
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index f6395e8288d6..82c8e991300e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1632,10 +1632,8 @@ static int full_stripe_write(struct btrfs_raid_bio *rbio)
int ret;
ret = alloc_rbio_parity_pages(rbio);
- if (ret) {
- __free_raid_bio(rbio);
+ if (ret)
return ret;
- }
ret = lock_stripe_add(rbio);
if (ret == 0)
@@ -1823,8 +1821,10 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
*/
if (rbio_is_full(rbio)) {
ret = full_stripe_write(rbio);
- if (ret)
+ if (ret) {
+ __free_raid_bio(rbio);
goto fail;
+ }
return;
}
@@ -1838,8 +1838,10 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
list_add_tail(&rbio->plug_list, &plug->rbio_list);
} else {
ret = __raid56_parity_write(rbio);
- if (ret)
+ if (ret) {
+ __free_raid_bio(rbio);
goto fail;
+ }
}
return;
@@ -2742,8 +2744,10 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
- BUG();
- kfree(rbio);
+ btrfs_warn_rl(fs_info,
+ "can not determine the failed stripe number for full stripe %llu",
+ bioc->raid_map[0]);
+ __free_raid_bio(rbio);
return NULL;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ec6e1752af2c..145c84b44fd0 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6668,17 +6668,19 @@ static int changed_inode(struct send_ctx *sctx,
/*
* First, process the inode as if it was deleted.
*/
- sctx->cur_inode_gen = right_gen;
- sctx->cur_inode_new = false;
- sctx->cur_inode_deleted = true;
- sctx->cur_inode_size = btrfs_inode_size(
- sctx->right_path->nodes[0], right_ii);
- sctx->cur_inode_mode = btrfs_inode_mode(
- sctx->right_path->nodes[0], right_ii);
- ret = process_all_refs(sctx,
- BTRFS_COMPARE_TREE_DELETED);
- if (ret < 0)
- goto out;
+ if (old_nlinks > 0) {
+ sctx->cur_inode_gen = right_gen;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_deleted = true;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->right_path->nodes[0], right_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->right_path->nodes[0], right_ii);
+ ret = process_all_refs(sctx,
+ BTRFS_COMPARE_TREE_DELETED);
+ if (ret < 0)
+ goto out;
+ }
/*
* Now process the inode as if it was new.
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9be4fd2db0f4..5942b9384088 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2555,6 +2555,7 @@ static int check_dev_super(struct btrfs_device *dev)
{
struct btrfs_fs_info *fs_info = dev->fs_info;
struct btrfs_super_block *sb;
+ u16 csum_type;
int ret = 0;
/* This should be called with fs still frozen. */
@@ -2569,6 +2570,21 @@ static int check_dev_super(struct btrfs_device *dev)
if (IS_ERR(sb))
return PTR_ERR(sb);
+ /* Verify the checksum. */
+ csum_type = btrfs_super_csum_type(sb);
+ if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) {
+ btrfs_err(fs_info, "csum type changed, has %u expect %u",
+ csum_type, btrfs_super_csum_type(fs_info->super_copy));
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ if (btrfs_check_super_csum(fs_info, sb)) {
+ btrfs_err(fs_info, "csum for on-disk super block no longer matches");
+ ret = -EUCLEAN;
+ goto out;
+ }
+
/* Btrfs_validate_super() includes fsid check against super->fsid. */
ret = btrfs_validate_super(fs_info, sb, 0);
if (ret < 0)
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index eee1e4459541..63676ea19f29 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -225,20 +225,20 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
*/
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -250,29 +250,31 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return ret;
}
+ /* btrfs_qgroup_account_extent() always frees the ulists passed to it. */
+ old_roots = NULL;
+ new_roots = NULL;
+
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
nodesize, nodesize)) {
test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
- old_roots = NULL;
- new_roots = NULL;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = remove_extent_item(root, nodesize, nodesize);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return -EINVAL;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -322,20 +324,20 @@ static int test_multiple_refs(struct btrfs_root *root,
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -355,20 +357,20 @@ static int test_multiple_refs(struct btrfs_root *root,
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = add_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -394,20 +396,20 @@ static int test_multiple_refs(struct btrfs_root *root,
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = remove_extent_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 94ba46d57920..a8d4bc6a1937 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7142,6 +7142,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
u64 devid;
u64 type;
u8 uuid[BTRFS_UUID_SIZE];
+ int index;
int num_stripes;
int ret;
int i;
@@ -7149,6 +7150,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
type = btrfs_chunk_type(leaf, chunk);
+ index = btrfs_bg_flags_to_raid_index(type);
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
#if BITS_PER_LONG == 32
@@ -7202,7 +7204,15 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
map->io_align = btrfs_chunk_io_align(leaf, chunk);
map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
map->type = type;
- map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ /*
+ * We can't use the sub_stripes value, as for profiles other than
+ * RAID10, they may have 0 as sub_stripes for filesystems created by
+ * older mkfs (<v5.4).
+ * In that case, it can cause divide-by-zero errors later.
+ * Since currently sub_stripes is fixed for each profile, let's
+ * use the trusted value instead.
+ */
+ map->sub_stripes = btrfs_raid_array[index].sub_stripes;
map->verified_stripes = 0;
em->orig_block_len = btrfs_calc_stripe_length(em);
for (i = 0; i < num_stripes; i++) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 599b9d5af349..f8b668dc8bf8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -395,6 +395,7 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
*/
struct btrfs_bio {
unsigned int mirror_num;
+ struct bvec_iter iter;
/* for direct I/O */
u64 file_offset;
@@ -403,7 +404,6 @@ struct btrfs_bio {
struct btrfs_device *device;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
- struct bvec_iter iter;
/* End I/O information supplied to btrfs_bio_alloc */
btrfs_bio_end_io_t end_io;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index d0b9fec111aa..fe220686bba4 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1143,8 +1143,32 @@ const struct inode_operations cifs_file_inode_ops = {
.fiemap = cifs_fiemap,
};
+const char *cifs_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *done)
+{
+ char *target_path;
+
+ target_path = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!target_path)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock(&inode->i_lock);
+ if (likely(CIFS_I(inode)->symlink_target)) {
+ strscpy(target_path, CIFS_I(inode)->symlink_target, PATH_MAX);
+ } else {
+ kfree(target_path);
+ target_path = ERR_PTR(-EOPNOTSUPP);
+ }
+ spin_unlock(&inode->i_lock);
+
+ if (!IS_ERR(target_path))
+ set_delayed_call(done, kfree_link, target_path);
+
+ return target_path;
+}
+
const struct inode_operations cifs_symlink_inode_ops = {
- .get_link = simple_get_link,
+ .get_link = cifs_get_link,
.permission = cifs_permission,
.listxattr = cifs_listxattr,
};
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9bde08d44617..4e2ca3c6e5c0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -215,11 +215,6 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
kfree(cifs_i->symlink_target);
cifs_i->symlink_target = fattr->cf_symlink_target;
fattr->cf_symlink_target = NULL;
-
- if (unlikely(!cifs_i->symlink_target))
- inode->i_link = ERR_PTR(-EOPNOTSUPP);
- else
- inode->i_link = cifs_i->symlink_target;
}
spin_unlock(&inode->i_lock);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index da51ffd02928..3e68d8208cf5 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -400,6 +400,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
{
struct smb_hdr *buf = (struct smb_hdr *)buffer;
struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf;
+ struct TCP_Server_Info *pserver;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct cifsInodeInfo *pCifsInode;
@@ -464,9 +465,12 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
return false;
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(srv) ? srv->primary_server : srv;
+
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &srv->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
if (tcon->tid != buf->Tid)
continue;
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index a38720477966..572293c18e16 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -135,6 +135,7 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len,
int
smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
{
+ struct TCP_Server_Info *pserver;
struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
struct smb2_pdu *pdu = (struct smb2_pdu *)shdr;
int hdr_size = sizeof(struct smb2_hdr);
@@ -143,6 +144,9 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
__u32 calc_len; /* calculated length */
__u64 mid;
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
/*
* Add function to do table lookup of StructureSize by command
* ie Validate the wct via smb2_struct_sizes table above
@@ -155,7 +159,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
/* decrypt frame now that it is completely read in */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(iter, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(iter, &pserver->smb_ses_list, smb_ses_list) {
if (iter->Suid == le64_to_cpu(thdr->SessionId)) {
ses = iter;
break;
@@ -608,51 +612,52 @@ smb2_tcon_find_pending_open_lease(struct cifs_tcon *tcon,
}
static bool
-smb2_is_valid_lease_break(char *buffer)
+smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
{
struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer;
- struct TCP_Server_Info *server;
+ struct TCP_Server_Info *pserver;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct cifs_pending_open *open;
cifs_dbg(FYI, "Checking for lease break\n");
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
- list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
- list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
- spin_lock(&tcon->open_file_lock);
- cifs_stats_inc(
- &tcon->stats.cifs_stats.num_oplock_brks);
- if (smb2_tcon_has_lease(tcon, rsp)) {
- spin_unlock(&tcon->open_file_lock);
- spin_unlock(&cifs_tcp_ses_lock);
- return true;
- }
- open = smb2_tcon_find_pending_open_lease(tcon,
- rsp);
- if (open) {
- __u8 lease_key[SMB2_LEASE_KEY_SIZE];
- struct tcon_link *tlink;
-
- tlink = cifs_get_tlink(open->tlink);
- memcpy(lease_key, open->lease_key,
- SMB2_LEASE_KEY_SIZE);
- spin_unlock(&tcon->open_file_lock);
- spin_unlock(&cifs_tcp_ses_lock);
- smb2_queue_pending_open_break(tlink,
- lease_key,
- rsp->NewLeaseState);
- return true;
- }
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+ spin_lock(&tcon->open_file_lock);
+ cifs_stats_inc(
+ &tcon->stats.cifs_stats.num_oplock_brks);
+ if (smb2_tcon_has_lease(tcon, rsp)) {
spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ return true;
+ }
+ open = smb2_tcon_find_pending_open_lease(tcon,
+ rsp);
+ if (open) {
+ __u8 lease_key[SMB2_LEASE_KEY_SIZE];
+ struct tcon_link *tlink;
+
+ tlink = cifs_get_tlink(open->tlink);
+ memcpy(lease_key, open->lease_key,
+ SMB2_LEASE_KEY_SIZE);
+ spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ smb2_queue_pending_open_break(tlink,
+ lease_key,
+ rsp->NewLeaseState);
+ return true;
+ }
+ spin_unlock(&tcon->open_file_lock);
- if (cached_dir_lease_break(tcon, rsp->LeaseKey)) {
- spin_unlock(&cifs_tcp_ses_lock);
- return true;
- }
+ if (cached_dir_lease_break(tcon, rsp->LeaseKey)) {
+ spin_unlock(&cifs_tcp_ses_lock);
+ return true;
}
}
}
@@ -671,6 +676,7 @@ bool
smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
{
struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer;
+ struct TCP_Server_Info *pserver;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct cifsInodeInfo *cinode;
@@ -684,16 +690,19 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
if (rsp->StructureSize !=
smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) {
if (le16_to_cpu(rsp->StructureSize) == 44)
- return smb2_is_valid_lease_break(buffer);
+ return smb2_is_valid_lease_break(buffer, server);
else
return false;
}
cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel);
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
spin_lock(&tcon->open_file_lock);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 4f53fa012936..880cd494afea 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -2302,14 +2302,18 @@ static void
smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
{
struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
+ struct TCP_Server_Info *pserver;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
if (shdr->Status != STATUS_NETWORK_NAME_DELETED)
return;
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) {
spin_lock(&tcon->tc_lock);
@@ -4264,21 +4268,23 @@ init_sg(int num_rqst, struct smb_rqst *rqst, u8 *sign)
static int
smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
{
+ struct TCP_Server_Info *pserver;
struct cifs_ses *ses;
u8 *ses_enc_key;
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
- list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
- if (ses->Suid == ses_id) {
- spin_lock(&ses->ses_lock);
- ses_enc_key = enc ? ses->smb3encryptionkey :
- ses->smb3decryptionkey;
- memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE);
- spin_unlock(&ses->ses_lock);
- spin_unlock(&cifs_tcp_ses_lock);
- return 0;
- }
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ if (ses->Suid == ses_id) {
+ spin_lock(&ses->ses_lock);
+ ses_enc_key = enc ? ses->smb3encryptionkey :
+ ses->smb3decryptionkey;
+ memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE);
+ spin_unlock(&ses->ses_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ return 0;
}
}
spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 8e3f26e6f6b9..381babc1212c 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -77,18 +77,19 @@ static
int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key)
{
struct cifs_chan *chan;
+ struct TCP_Server_Info *pserver;
struct cifs_ses *ses = NULL;
- struct TCP_Server_Info *it = NULL;
int i;
int rc = 0;
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(it, &cifs_tcp_ses_list, tcp_ses_list) {
- list_for_each_entry(ses, &it->smb_ses_list, smb_ses_list) {
- if (ses->Suid == ses_id)
- goto found;
- }
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ if (ses->Suid == ses_id)
+ goto found;
}
cifs_server_dbg(VFS, "%s: Could not find session 0x%llx\n",
__func__, ses_id);
@@ -136,9 +137,13 @@ out:
static struct cifs_ses *
smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id)
{
+ struct TCP_Server_Info *pserver;
struct cifs_ses *ses;
- list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
if (ses->Suid != ses_id)
continue;
++ses->ses_count;
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index ef05bfa87798..0f6d0a80467d 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1521,6 +1521,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
struct ext4_iloc iloc;
int inode_len, ino, ret, tag = tl->fc_tag;
struct ext4_extent_header *eh;
+ size_t off_gen = offsetof(struct ext4_inode, i_generation);
memcpy(&fc_inode, val, sizeof(fc_inode));
@@ -1548,8 +1549,8 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
raw_inode = ext4_raw_inode(&iloc);
memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
- memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
- inode_len - offsetof(struct ext4_inode, i_generation));
+ memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
+ inode_len - off_gen);
if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
if (eh->eh_magic != EXT4_EXT_MAGIC) {
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index ded535535b27..95dfea28bf4e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -145,9 +145,8 @@ static int ext4_update_backup_sb(struct super_block *sb,
if (ext4_has_metadata_csum(sb) &&
es->s_checksum != ext4_superblock_csum(sb, es)) {
ext4_msg(sb, KERN_ERR, "Invalid checksum for backup "
- "superblock %llu\n", sb_block);
+ "superblock %llu", sb_block);
unlock_buffer(bh);
- err = -EFSBADCRC;
goto out_bh;
}
func(es, arg);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 0a220ec9862d..a19a9661646e 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -424,7 +424,8 @@ int ext4_ext_migrate(struct inode *inode)
* already is extent-based, error out.
*/
if (!ext4_has_feature_extents(inode->i_sb) ||
- (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+ ext4_has_inline_data(inode))
return -EINVAL;
if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d5daaf41e1fc..c08c0aba1883 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2259,8 +2259,16 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
memset(de, 0, len); /* wipe old data */
de = (struct ext4_dir_entry_2 *) data2;
top = data2 + len;
- while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
+ while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
+ (data2 + (blocksize - csum_size) -
+ (char *) de))) {
+ brelse(bh2);
+ brelse(bh);
+ return -EFSCORRUPTED;
+ }
de = de2;
+ }
de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
(char *) de, blocksize);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 6dfe9ccae0c5..46b87ffeb304 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1158,6 +1158,7 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
while (group < sbi->s_groups_count) {
struct buffer_head *bh;
ext4_fsblk_t backup_block;
+ struct ext4_super_block *es;
/* Out of journal space, and can't get more - abort - so sad */
err = ext4_resize_ensure_credits_batch(handle, 1);
@@ -1186,6 +1187,10 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
memcpy(bh->b_data, data, size);
if (rest)
memset(bh->b_data + size, 0, rest);
+ es = (struct ext4_super_block *) bh->b_data;
+ es->s_block_group_nr = cpu_to_le16(group);
+ if (ext4_has_metadata_csum(sb))
+ es->s_checksum = ext4_superblock_csum(sb, es);
set_buffer_uptodate(bh);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7950904fbf04..7cdd2138c897 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4881,7 +4881,7 @@ out:
flush_work(&sbi->s_error_work);
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
- return err;
+ return -EINVAL;
}
static int ext4_journal_data_mode_check(struct super_block *sb)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1a3afd469e3a..71bfb663aac5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -3001,6 +3001,10 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
goto out;
}
+ err = file_modified(file);
+ if (err)
+ goto out;
+
if (!(mode & FALLOC_FL_KEEP_SIZE))
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index b4e565711045..e8deaacf1832 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -77,8 +77,10 @@ static void fuse_add_dirent_to_cache(struct file *file,
goto unlock;
addr = kmap_local_page(page);
- if (!offset)
+ if (!offset) {
clear_page(addr);
+ SetPageUptodate(page);
+ }
memcpy(addr + offset, dirent, reclen);
kunmap_local(addr);
fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen;
@@ -516,6 +518,12 @@ retry_locked:
page = find_get_page_flags(file->f_mapping, index,
FGP_ACCESSED | FGP_LOCK);
+ /* Page gone missing, then re-added to cache, but not initialized? */
+ if (page && !PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+ }
spin_lock(&fi->rdc.lock);
if (!page) {
/*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index da8da5cdbbc1..f50e025ae406 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -280,7 +280,7 @@ EXPORT_SYMBOL_GPL(nfs_put_client);
static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
{
struct nfs_client *clp;
- const struct sockaddr *sap = data->addr;
+ const struct sockaddr *sap = (struct sockaddr *)data->addr;
struct nfs_net *nn = net_generic(data->net, nfs_net_id);
int error;
@@ -666,7 +666,7 @@ static int nfs_init_server(struct nfs_server *server,
struct rpc_timeout timeparms;
struct nfs_client_initdata cl_init = {
.hostname = ctx->nfs_server.hostname,
- .addr = (const struct sockaddr *)&ctx->nfs_server.address,
+ .addr = &ctx->nfs_server._address,
.addrlen = ctx->nfs_server.addrlen,
.nfs_mod = ctx->nfs_mod,
.proto = ctx->nfs_server.protocol,
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5c97cad741a7..ead8a0e06abf 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -228,8 +228,7 @@ again:
*
*/
void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
- fmode_t type,
- const nfs4_stateid *stateid,
+ fmode_t type, const nfs4_stateid *stateid,
unsigned long pagemod_limit)
{
struct nfs_delegation *delegation;
@@ -239,25 +238,24 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
delegation = rcu_dereference(NFS_I(inode)->delegation);
if (delegation != NULL) {
spin_lock(&delegation->lock);
- if (nfs4_is_valid_delegation(delegation, 0)) {
- nfs4_stateid_copy(&delegation->stateid, stateid);
- delegation->type = type;
- delegation->pagemod_limit = pagemod_limit;
- oldcred = delegation->cred;
- delegation->cred = get_cred(cred);
- clear_bit(NFS_DELEGATION_NEED_RECLAIM,
- &delegation->flags);
- spin_unlock(&delegation->lock);
- rcu_read_unlock();
- put_cred(oldcred);
- trace_nfs4_reclaim_delegation(inode, type);
- return;
- }
- /* We appear to have raced with a delegation return. */
+ nfs4_stateid_copy(&delegation->stateid, stateid);
+ delegation->type = type;
+ delegation->pagemod_limit = pagemod_limit;
+ oldcred = delegation->cred;
+ delegation->cred = get_cred(cred);
+ clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+ if (test_and_clear_bit(NFS_DELEGATION_REVOKED,
+ &delegation->flags))
+ atomic_long_inc(&nfs_active_delegations);
spin_unlock(&delegation->lock);
+ rcu_read_unlock();
+ put_cred(oldcred);
+ trace_nfs4_reclaim_delegation(inode, type);
+ } else {
+ rcu_read_unlock();
+ nfs_inode_set_delegation(inode, cred, type, stateid,
+ pagemod_limit);
}
- rcu_read_unlock();
- nfs_inode_set_delegation(inode, cred, type, stateid, pagemod_limit);
}
static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 58036f657126..f594dac436a7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2489,9 +2489,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
spin_unlock(&dentry->d_lock);
goto out;
}
- if (dentry->d_fsdata)
- /* old devname */
- kfree(dentry->d_fsdata);
+ /* old devname */
+ kfree(dentry->d_fsdata);
dentry->d_fsdata = NFS_FSDATA_BLOCKED;
spin_unlock(&dentry->d_lock);
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index e87d500ad95a..6603b5cee029 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -16,8 +16,9 @@
#include "dns_resolve.h"
ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
- struct sockaddr *sa, size_t salen)
+ struct sockaddr_storage *ss, size_t salen)
{
+ struct sockaddr *sa = (struct sockaddr *)ss;
ssize_t ret;
char *ip_addr = NULL;
int ip_len;
@@ -341,7 +342,7 @@ out:
}
ssize_t nfs_dns_resolve_name(struct net *net, char *name,
- size_t namelen, struct sockaddr *sa, size_t salen)
+ size_t namelen, struct sockaddr_storage *ss, size_t salen)
{
struct nfs_dns_ent key = {
.hostname = name,
@@ -354,7 +355,7 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name,
ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item);
if (ret == 0) {
if (salen >= item->addrlen) {
- memcpy(sa, &item->addr, item->addrlen);
+ memcpy(ss, &item->addr, item->addrlen);
ret = item->addrlen;
} else
ret = -EOVERFLOW;
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
index 576ff4b54c82..fe3b172c4de1 100644
--- a/fs/nfs/dns_resolve.h
+++ b/fs/nfs/dns_resolve.h
@@ -32,6 +32,6 @@ extern void nfs_dns_resolver_cache_destroy(struct net *net);
#endif
extern ssize_t nfs_dns_resolve_name(struct net *net, char *name,
- size_t namelen, struct sockaddr *sa, size_t salen);
+ size_t namelen, struct sockaddr_storage *sa, size_t salen);
#endif
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 4da701fd1424..09833ec102fc 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -273,9 +273,9 @@ static const struct constant_table nfs_secflavor_tokens[] = {
* Address family must be initialized, and address must not be
* the ANY address for that family.
*/
-static int nfs_verify_server_address(struct sockaddr *addr)
+static int nfs_verify_server_address(struct sockaddr_storage *addr)
{
- switch (addr->sa_family) {
+ switch (addr->ss_family) {
case AF_INET: {
struct sockaddr_in *sa = (struct sockaddr_in *)addr;
return sa->sin_addr.s_addr != htonl(INADDR_ANY);
@@ -969,7 +969,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
{
struct nfs_fs_context *ctx = nfs_fc2context(fc);
struct nfs_fh *mntfh = ctx->mntfh;
- struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
+ struct sockaddr_storage *sap = &ctx->nfs_server._address;
int extra_flags = NFS_MOUNT_LEGACY_INTERFACE;
int ret;
@@ -1044,7 +1044,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
memcpy(sap, &data->addr, sizeof(data->addr));
ctx->nfs_server.addrlen = sizeof(data->addr);
ctx->nfs_server.port = ntohs(data->addr.sin_port);
- if (sap->sa_family != AF_INET ||
+ if (sap->ss_family != AF_INET ||
!nfs_verify_server_address(sap))
goto out_no_address;
@@ -1200,7 +1200,7 @@ static int nfs4_parse_monolithic(struct fs_context *fc,
struct nfs4_mount_data *data)
{
struct nfs_fs_context *ctx = nfs_fc2context(fc);
- struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
+ struct sockaddr_storage *sap = &ctx->nfs_server._address;
int ret;
char *c;
@@ -1314,7 +1314,7 @@ static int nfs_fs_context_validate(struct fs_context *fc)
{
struct nfs_fs_context *ctx = nfs_fc2context(fc);
struct nfs_subversion *nfs_mod;
- struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
+ struct sockaddr_storage *sap = &ctx->nfs_server._address;
int max_namelen = PAGE_SIZE;
int max_pathlen = NFS_MAXPATHLEN;
int port = 0;
@@ -1540,7 +1540,7 @@ static int nfs_init_fs_context(struct fs_context *fc)
ctx->version = nfss->nfs_client->rpc_ops->version;
ctx->minorversion = nfss->nfs_client->cl_minorversion;
- memcpy(&ctx->nfs_server.address, &nfss->nfs_client->cl_addr,
+ memcpy(&ctx->nfs_server._address, &nfss->nfs_client->cl_addr,
ctx->nfs_server.addrlen);
if (fc->net_ns != net) {
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d914d609b85b..647fc3f547cb 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -69,7 +69,7 @@ static inline fmode_t flags_to_mode(int flags)
struct nfs_client_initdata {
unsigned long init_flags;
const char *hostname; /* Hostname of the server */
- const struct sockaddr *addr; /* Address of the server */
+ const struct sockaddr_storage *addr; /* Address of the server */
const char *nodename; /* Hostname of the client */
const char *ip_addr; /* IP address of the client */
size_t addrlen;
@@ -180,7 +180,7 @@ static inline struct nfs_fs_context *nfs_fc2context(const struct fs_context *fc)
/* mount_clnt.c */
struct nfs_mount_request {
- struct sockaddr *sap;
+ struct sockaddr_storage *sap;
size_t salen;
char *hostname;
char *dirpath;
@@ -223,7 +223,7 @@ extern void nfs4_server_set_init_caps(struct nfs_server *);
extern struct nfs_server *nfs4_create_server(struct fs_context *);
extern struct nfs_server *nfs4_create_referral_server(struct fs_context *);
extern int nfs4_update_server(struct nfs_server *server, const char *hostname,
- struct sockaddr *sap, size_t salen,
+ struct sockaddr_storage *sap, size_t salen,
struct net *net);
extern void nfs_free_server(struct nfs_server *server);
extern struct nfs_server *nfs_clone_server(struct nfs_server *,
@@ -235,7 +235,7 @@ extern int nfs_client_init_status(const struct nfs_client *clp);
extern int nfs_wait_client_init_complete(const struct nfs_client *clp);
extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
- const struct sockaddr *ds_addr,
+ const struct sockaddr_storage *ds_addr,
int ds_addrlen, int ds_proto,
unsigned int ds_timeo,
unsigned int ds_retrans,
@@ -243,7 +243,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
struct inode *);
extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
- const struct sockaddr *ds_addr, int ds_addrlen,
+ const struct sockaddr_storage *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo,
unsigned int ds_retrans);
#ifdef CONFIG_PROC_FS
@@ -894,13 +894,13 @@ static inline bool nfs_error_is_fatal_on_server(int err)
* Select between a default port value and a user-specified port value.
* If a zero value is set, then autobind will be used.
*/
-static inline void nfs_set_port(struct sockaddr *sap, int *port,
+static inline void nfs_set_port(struct sockaddr_storage *sap, int *port,
const unsigned short default_port)
{
if (*port == NFS_UNSPEC_PORT)
*port = default_port;
- rpc_set_port(sap, *port);
+ rpc_set_port((struct sockaddr *)sap, *port);
}
struct nfs_direct_req {
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index c5e3b6b3366a..68e76b626371 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -158,7 +158,7 @@ int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans)
struct rpc_create_args args = {
.net = info->net,
.protocol = info->protocol,
- .address = info->sap,
+ .address = (struct sockaddr *)info->sap,
.addrsize = info->salen,
.timeout = &mnt_timeout,
.servername = info->hostname,
@@ -245,7 +245,7 @@ void nfs_umount(const struct nfs_mount_request *info)
struct rpc_create_args args = {
.net = info->net,
.protocol = IPPROTO_UDP,
- .address = info->sap,
+ .address = (struct sockaddr *)info->sap,
.addrsize = info->salen,
.timeout = &nfs_umnt_timeout,
.servername = info->hostname,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 3295af4110f1..2f336ace7555 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -175,7 +175,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
}
/* for submounts we want the same server; referrals will reassign */
- memcpy(&ctx->nfs_server.address, &client->cl_addr, client->cl_addrlen);
+ memcpy(&ctx->nfs_server._address, &client->cl_addr, client->cl_addrlen);
ctx->nfs_server.addrlen = client->cl_addrlen;
ctx->nfs_server.port = server->port;
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index b49359afac88..669cda757a5c 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -78,7 +78,7 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
* the MDS.
*/
struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
- const struct sockaddr *ds_addr, int ds_addrlen,
+ const struct sockaddr_storage *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans)
{
struct rpc_timeout ds_timeout;
@@ -98,7 +98,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
char buf[INET6_ADDRSTRLEN + 1];
/* fake a hostname because lockd wants it */
- if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
+ if (rpc_ntop((struct sockaddr *)ds_addr, buf, sizeof(buf)) <= 0)
return ERR_PTR(-EINVAL);
cl_init.hostname = buf;
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 13424f0d793b..ecb428512fe1 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -1093,6 +1093,9 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
&args.seq_args, &res.seq_res, 0);
trace_nfs4_clone(src_inode, dst_inode, &args, status);
if (status == 0) {
+ /* a zero-length count means clone to EOF in src */
+ if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE)
+ count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset;
nfs42_copy_dest_done(dst_inode, dst_offset, count);
status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 400a71e75238..cfef738d765e 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -281,7 +281,7 @@ struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *,
int nfs4_submount(struct fs_context *, struct nfs_server *);
int nfs4_replace_transport(struct nfs_server *server,
const struct nfs4_fs_locations *locations);
-size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa,
+size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr_storage *ss,
size_t salen, struct net *net, int port);
/* nfs4proc.c */
extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 7a5162afa5c0..d3051b051a56 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -346,6 +346,7 @@ int nfs40_init_client(struct nfs_client *clp)
ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE,
"NFSv4.0 transport Slot table");
if (ret) {
+ nfs4_shutdown_slot_table(tbl);
kfree(tbl);
return ret;
}
@@ -889,7 +890,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
*/
static int nfs4_set_client(struct nfs_server *server,
const char *hostname,
- const struct sockaddr *addr,
+ const struct sockaddr_storage *addr,
const size_t addrlen,
const char *ip_addr,
int proto, const struct rpc_timeout *timeparms,
@@ -924,7 +925,7 @@ static int nfs4_set_client(struct nfs_server *server,
__set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status))
__set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags);
- server->port = rpc_get_port(addr);
+ server->port = rpc_get_port((struct sockaddr *)addr);
/* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init);
@@ -960,7 +961,7 @@ static int nfs4_set_client(struct nfs_server *server,
* the MDS.
*/
struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
- const struct sockaddr *ds_addr, int ds_addrlen,
+ const struct sockaddr_storage *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
u32 minor_version)
{
@@ -980,7 +981,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
};
char buf[INET6_ADDRSTRLEN + 1];
- if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
+ if (rpc_ntop((struct sockaddr *)ds_addr, buf, sizeof(buf)) <= 0)
return ERR_PTR(-EINVAL);
cl_init.hostname = buf;
@@ -1148,7 +1149,7 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc)
/* Get a client record */
error = nfs4_set_client(server,
ctx->nfs_server.hostname,
- &ctx->nfs_server.address,
+ &ctx->nfs_server._address,
ctx->nfs_server.addrlen,
ctx->client_address,
ctx->nfs_server.protocol,
@@ -1238,7 +1239,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc)
rpc_set_port(&ctx->nfs_server.address, NFS_RDMA_PORT);
error = nfs4_set_client(server,
ctx->nfs_server.hostname,
- &ctx->nfs_server.address,
+ &ctx->nfs_server._address,
ctx->nfs_server.addrlen,
parent_client->cl_ipaddr,
XPRT_TRANSPORT_RDMA,
@@ -1254,7 +1255,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc)
rpc_set_port(&ctx->nfs_server.address, NFS_PORT);
error = nfs4_set_client(server,
ctx->nfs_server.hostname,
- &ctx->nfs_server.address,
+ &ctx->nfs_server._address,
ctx->nfs_server.addrlen,
parent_client->cl_ipaddr,
XPRT_TRANSPORT_TCP,
@@ -1303,14 +1304,14 @@ error:
* Returns zero on success, or a negative errno value.
*/
int nfs4_update_server(struct nfs_server *server, const char *hostname,
- struct sockaddr *sap, size_t salen, struct net *net)
+ struct sockaddr_storage *sap, size_t salen, struct net *net)
{
struct nfs_client *clp = server->nfs_client;
struct rpc_clnt *clnt = server->client;
struct xprt_create xargs = {
.ident = clp->cl_proto,
.net = net,
- .dstaddr = sap,
+ .dstaddr = (struct sockaddr *)sap,
.addrlen = salen,
.servername = hostname,
};
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index f2dbf904c598..9a98595bb160 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -164,16 +164,17 @@ static int nfs4_validate_fspath(struct dentry *dentry,
return 0;
}
-size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa,
+size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr_storage *ss,
size_t salen, struct net *net, int port)
{
+ struct sockaddr *sa = (struct sockaddr *)ss;
ssize_t ret;
ret = rpc_pton(net, string, len, sa, salen);
if (ret == 0) {
ret = rpc_uaddr2sockaddr(net, string, len, sa, salen);
if (ret == 0) {
- ret = nfs_dns_resolve_name(net, string, len, sa, salen);
+ ret = nfs_dns_resolve_name(net, string, len, ss, salen);
if (ret < 0)
ret = 0;
}
@@ -331,7 +332,7 @@ static int try_location(struct fs_context *fc,
ctx->nfs_server.addrlen =
nfs_parse_server_name(buf->data, buf->len,
- &ctx->nfs_server.address,
+ &ctx->nfs_server._address,
sizeof(ctx->nfs_server._address),
fc->net_ns, 0);
if (ctx->nfs_server.addrlen == 0)
@@ -483,14 +484,13 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server,
char *page, char *page2,
const struct nfs4_fs_location *location)
{
- const size_t addr_bufsize = sizeof(struct sockaddr_storage);
struct net *net = rpc_net_ns(server->client);
- struct sockaddr *sap;
+ struct sockaddr_storage *sap;
unsigned int s;
size_t salen;
int error;
- sap = kmalloc(addr_bufsize, GFP_KERNEL);
+ sap = kmalloc(sizeof(*sap), GFP_KERNEL);
if (sap == NULL)
return -ENOMEM;
@@ -506,10 +506,10 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server,
continue;
salen = nfs_parse_server_name(buf->data, buf->len,
- sap, addr_bufsize, net, 0);
+ sap, sizeof(*sap), net, 0);
if (salen == 0)
continue;
- rpc_set_port(sap, NFS_PORT);
+ rpc_set_port((struct sockaddr *)sap, NFS_PORT);
error = -ENOMEM;
hostname = kmemdup_nul(buf->data, buf->len, GFP_KERNEL);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e2efcd26336c..86ed5c0142c3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3951,7 +3951,7 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location,
for (i = 0; i < location->nservers; i++) {
struct nfs4_string *srv_loc = &location->servers[i];
- struct sockaddr addr;
+ struct sockaddr_storage addr;
size_t addrlen;
struct xprt_create xprt_args = {
.ident = 0,
@@ -3974,7 +3974,7 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location,
clp->cl_net, server->port);
if (!addrlen)
return;
- xprt_args.dstaddr = &addr;
+ xprt_args.dstaddr = (struct sockaddr *)&addr;
xprt_args.addrlen = addrlen;
servername = kmalloc(srv_loc->len + 1, GFP_KERNEL);
if (!servername)
@@ -7138,6 +7138,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
{
struct nfs4_lockdata *data = calldata;
struct nfs4_lock_state *lsp = data->lsp;
+ struct nfs_server *server = NFS_SERVER(d_inode(data->ctx->dentry));
if (!nfs4_sequence_done(task, &data->res.seq_res))
return;
@@ -7145,8 +7146,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
data->rpc_status = task->tk_status;
switch (task->tk_status) {
case 0:
- renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)),
- data->timestamp);
+ renew_lease(server, data->timestamp);
if (data->arg.new_lock && !data->cancelled) {
data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0)
@@ -7167,6 +7167,8 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
if (!nfs4_stateid_match(&data->arg.open_stateid,
&lsp->ls_state->open_stateid))
goto out_restart;
+ else if (nfs4_async_handle_error(task, server, lsp->ls_state, NULL) == -EAGAIN)
+ goto out_restart;
} else if (!nfs4_stateid_match(&data->arg.lock_stateid,
&lsp->ls_stateid))
goto out_restart;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index c3503fb26fa2..a2d2d5d1b088 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1786,6 +1786,7 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp,
static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
{
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
/* Mark all delegations for reclaim */
nfs_delegation_mark_reclaim(clp);
nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
@@ -2670,6 +2671,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
if (status < 0)
goto out_error;
nfs4_state_end_reclaim_reboot(clp);
+ continue;
}
/* Detect expired delegations... */
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 987c88ddeaf0..5d035dd2d7bf 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -821,7 +821,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
static struct nfs_client *(*get_v3_ds_connect)(
struct nfs_server *mds_srv,
- const struct sockaddr *ds_addr,
+ const struct sockaddr_storage *ds_addr,
int ds_addrlen,
int ds_proto,
unsigned int ds_timeo,
@@ -882,7 +882,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
continue;
}
clp = get_v3_ds_connect(mds_srv,
- (struct sockaddr *)&da->da_addr,
+ &da->da_addr,
da->da_addrlen, da->da_transport,
timeo, retrans);
if (IS_ERR(clp))
@@ -951,7 +951,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
put_cred(xprtdata.cred);
} else {
clp = nfs4_set_ds_client(mds_srv,
- (struct sockaddr *)&da->da_addr,
+ &da->da_addr,
da->da_addrlen,
da->da_transport, timeo,
retrans, minor_version);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ee66ffdb985e..05ae23657527 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -822,8 +822,7 @@ static int nfs_request_mount(struct fs_context *fc,
{
struct nfs_fs_context *ctx = nfs_fc2context(fc);
struct nfs_mount_request request = {
- .sap = (struct sockaddr *)
- &ctx->mount_server.address,
+ .sap = &ctx->mount_server._address,
.dirpath = ctx->nfs_server.export_path,
.protocol = ctx->mount_server.protocol,
.fh = root_fh,
@@ -854,7 +853,7 @@ static int nfs_request_mount(struct fs_context *fc,
* Construct the mount server's address.
*/
if (ctx->mount_server.address.sa_family == AF_UNSPEC) {
- memcpy(request.sap, &ctx->nfs_server.address,
+ memcpy(request.sap, &ctx->nfs_server._address,
ctx->nfs_server.addrlen);
ctx->mount_server.addrlen = ctx->nfs_server.addrlen;
}
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 29a62db155fb..adc4e87a71d2 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -893,9 +893,8 @@ __nfsd_file_cache_purge(struct net *net)
nf = rhashtable_walk_next(&iter);
while (!IS_ERR_OR_NULL(nf)) {
- if (net && nf->nf_net != net)
- continue;
- nfsd_file_unhash_and_dispose(nf, &dispose);
+ if (!net || nf->nf_net == net)
+ nfsd_file_unhash_and_dispose(nf, &dispose);
nf = rhashtable_walk_next(&iter);
}
diff --git a/fs/proc/page.c b/fs/proc/page.c
index f2273b164535..6249c347809a 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -219,8 +219,9 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
-#ifdef CONFIG_64BIT
+#ifdef CONFIG_ARCH_USES_PG_ARCH_X
u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2);
+ u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3);
#endif
return u;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 517a138faa66..191b22b9a35b 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -133,6 +133,21 @@ xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno)
return true;
}
+static inline bool
+xfs_verify_agbext(
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno,
+ xfs_agblock_t len)
+{
+ if (agbno + len <= agbno)
+ return false;
+
+ if (!xfs_verify_agbno(pag, agbno))
+ return false;
+
+ return xfs_verify_agbno(pag, agbno + len - 1);
+}
+
/*
* Verify that an AG inode number pointer neither points outside the AG
* nor points at static metadata.
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 6261599bb389..de79f5d07f65 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -263,11 +263,7 @@ xfs_alloc_get_rec(
goto out_bad_rec;
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbno(pag, *bno))
- goto out_bad_rec;
- if (*bno > *bno + *len)
- goto out_bad_rec;
- if (!xfs_verify_agbno(pag, *bno + *len - 1))
+ if (!xfs_verify_agbext(pag, *bno, *len))
goto out_bad_rec;
return 0;
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index d9b66306a9a7..cb9e950a911d 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -146,6 +146,8 @@ xfs_dir3_leaf_check_int(
xfs_dir2_leaf_tail_t *ltp;
int stale;
int i;
+ bool isleaf1 = (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+ hdr->magic == XFS_DIR3_LEAF1_MAGIC);
ltp = xfs_dir2_leaf_tail_p(geo, leaf);
@@ -158,8 +160,7 @@ xfs_dir3_leaf_check_int(
return __this_address;
/* Leaves and bests don't overlap in leaf format. */
- if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
- hdr->magic == XFS_DIR3_LEAF1_MAGIC) &&
+ if (isleaf1 &&
(char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
return __this_address;
@@ -175,6 +176,10 @@ xfs_dir3_leaf_check_int(
}
if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
stale++;
+ if (isleaf1 && xfs_dir2_dataptr_to_db(geo,
+ be32_to_cpu(hdr->ents[i].address)) >=
+ be32_to_cpu(ltp->bestcount))
+ return __this_address;
}
if (hdr->stale != stale)
return __this_address;
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index b55bdfa9c8a8..371dc07233e0 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1564,20 +1564,6 @@ struct xfs_rmap_rec {
#define RMAPBT_UNUSED_OFFSET_BITLEN 7
#define RMAPBT_OFFSET_BITLEN 54
-#define XFS_RMAP_ATTR_FORK (1 << 0)
-#define XFS_RMAP_BMBT_BLOCK (1 << 1)
-#define XFS_RMAP_UNWRITTEN (1 << 2)
-#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \
- XFS_RMAP_BMBT_BLOCK)
-#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN)
-struct xfs_rmap_irec {
- xfs_agblock_t rm_startblock; /* extent start block */
- xfs_extlen_t rm_blockcount; /* extent length */
- uint64_t rm_owner; /* extent owner */
- uint64_t rm_offset; /* offset within the owner */
- unsigned int rm_flags; /* state flags */
-};
-
/*
* Key structure
*
@@ -1626,7 +1612,7 @@ unsigned int xfs_refc_block(struct xfs_mount *mp);
* on the startblock. This speeds up mount time deletion of stale
* staging extents because they're all at the right side of the tree.
*/
-#define XFS_REFC_COW_START ((xfs_agblock_t)(1U << 31))
+#define XFS_REFC_COWFLAG (1U << 31)
#define REFCNTBT_COWFLAG_BITLEN 1
#define REFCNTBT_AGBLOCK_BITLEN 31
@@ -1640,12 +1626,6 @@ struct xfs_refcount_key {
__be32 rc_startblock; /* starting block number */
};
-struct xfs_refcount_irec {
- xfs_agblock_t rc_startblock; /* starting block number */
- xfs_extlen_t rc_blockcount; /* count of free blocks */
- xfs_nlink_t rc_refcount; /* number of inodes linked here */
-};
-
#define MAXREFCOUNT ((xfs_nlink_t)~0U)
#define MAXREFCEXTLEN ((xfs_extlen_t)~0U)
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index b351b9dc6561..f13e0809dc63 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -613,25 +613,49 @@ typedef struct xfs_efi_log_format {
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_t efi_extents[1]; /* array of extents to free */
+ xfs_extent_t efi_extents[]; /* array of extents to free */
} xfs_efi_log_format_t;
+static inline size_t
+xfs_efi_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efi_log_format) +
+ nr * sizeof(struct xfs_extent);
+}
+
typedef struct xfs_efi_log_format_32 {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_32_t efi_extents[1]; /* array of extents to free */
+ xfs_extent_32_t efi_extents[]; /* array of extents to free */
} __attribute__((packed)) xfs_efi_log_format_32_t;
+static inline size_t
+xfs_efi_log_format32_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efi_log_format_32) +
+ nr * sizeof(struct xfs_extent_32);
+}
+
typedef struct xfs_efi_log_format_64 {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_64_t efi_extents[1]; /* array of extents to free */
+ xfs_extent_64_t efi_extents[]; /* array of extents to free */
} xfs_efi_log_format_64_t;
+static inline size_t
+xfs_efi_log_format64_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efi_log_format_64) +
+ nr * sizeof(struct xfs_extent_64);
+}
+
/*
* This is the structure used to lay out an efd log item in the
* log. The efd_extents array is a variable size array whose
@@ -642,25 +666,49 @@ typedef struct xfs_efd_log_format {
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_t efd_extents[1]; /* array of extents freed */
+ xfs_extent_t efd_extents[]; /* array of extents freed */
} xfs_efd_log_format_t;
+static inline size_t
+xfs_efd_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efd_log_format) +
+ nr * sizeof(struct xfs_extent);
+}
+
typedef struct xfs_efd_log_format_32 {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_32_t efd_extents[1]; /* array of extents freed */
+ xfs_extent_32_t efd_extents[]; /* array of extents freed */
} __attribute__((packed)) xfs_efd_log_format_32_t;
+static inline size_t
+xfs_efd_log_format32_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efd_log_format_32) +
+ nr * sizeof(struct xfs_extent_32);
+}
+
typedef struct xfs_efd_log_format_64 {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_64_t efd_extents[1]; /* array of extents freed */
+ xfs_extent_64_t efd_extents[]; /* array of extents freed */
} xfs_efd_log_format_64_t;
+static inline size_t
+xfs_efd_log_format64_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efd_log_format_64) +
+ nr * sizeof(struct xfs_extent_64);
+}
+
/*
* RUI/RUD (reverse mapping) log format definitions
*/
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 64b910caafaa..3f34bafe18dd 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -46,13 +46,16 @@ STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur,
int
xfs_refcount_lookup_le(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refcount_encode_startblock(bno, domain),
XFS_LOOKUP_LE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
+ cur->bc_rec.rc.rc_domain = domain;
return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
}
@@ -63,13 +66,16 @@ xfs_refcount_lookup_le(
int
xfs_refcount_lookup_ge(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refcount_encode_startblock(bno, domain),
XFS_LOOKUP_GE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
+ cur->bc_rec.rc.rc_domain = domain;
return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
}
@@ -80,13 +86,16 @@ xfs_refcount_lookup_ge(
int
xfs_refcount_lookup_eq(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refcount_encode_startblock(bno, domain),
XFS_LOOKUP_LE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
+ cur->bc_rec.rc.rc_domain = domain;
return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
}
@@ -96,7 +105,17 @@ xfs_refcount_btrec_to_irec(
const union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec)
{
- irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock);
+ uint32_t start;
+
+ start = be32_to_cpu(rec->refc.rc_startblock);
+ if (start & XFS_REFC_COWFLAG) {
+ start &= ~XFS_REFC_COWFLAG;
+ irec->rc_domain = XFS_REFC_DOMAIN_COW;
+ } else {
+ irec->rc_domain = XFS_REFC_DOMAIN_SHARED;
+ }
+
+ irec->rc_startblock = start;
irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount);
irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount);
}
@@ -114,7 +133,6 @@ xfs_refcount_get_rec(
struct xfs_perag *pag = cur->bc_ag.pag;
union xfs_btree_rec *rec;
int error;
- xfs_agblock_t realstart;
error = xfs_btree_get_rec(cur, &rec, stat);
if (error || !*stat)
@@ -124,22 +142,11 @@ xfs_refcount_get_rec(
if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
goto out_bad_rec;
- /* handle special COW-staging state */
- realstart = irec->rc_startblock;
- if (realstart & XFS_REFC_COW_START) {
- if (irec->rc_refcount != 1)
- goto out_bad_rec;
- realstart &= ~XFS_REFC_COW_START;
- } else if (irec->rc_refcount < 2) {
+ if (!xfs_refcount_check_domain(irec))
goto out_bad_rec;
- }
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbno(pag, realstart))
- goto out_bad_rec;
- if (realstart > realstart + irec->rc_blockcount)
- goto out_bad_rec;
- if (!xfs_verify_agbno(pag, realstart + irec->rc_blockcount - 1))
+ if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount))
goto out_bad_rec;
if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
@@ -169,12 +176,17 @@ xfs_refcount_update(
struct xfs_refcount_irec *irec)
{
union xfs_btree_rec rec;
+ uint32_t start;
int error;
trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
- rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock);
+
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ rec.refc.rc_startblock = cpu_to_be32(start);
rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount);
rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount);
+
error = xfs_btree_update(cur, &rec);
if (error)
trace_xfs_refcount_update_error(cur->bc_mp,
@@ -196,9 +208,12 @@ xfs_refcount_insert(
int error;
trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+
cur->bc_rec.rc.rc_startblock = irec->rc_startblock;
cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
cur->bc_rec.rc.rc_refcount = irec->rc_refcount;
+ cur->bc_rec.rc.rc_domain = irec->rc_domain;
+
error = xfs_btree_insert(cur, i);
if (error)
goto out_error;
@@ -244,7 +259,8 @@ xfs_refcount_delete(
}
if (error)
goto out_error;
- error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec);
+ error = xfs_refcount_lookup_ge(cur, irec.rc_domain, irec.rc_startblock,
+ &found_rec);
out_error:
if (error)
trace_xfs_refcount_delete_error(cur->bc_mp,
@@ -343,6 +359,7 @@ xfs_refc_next(
STATIC int
xfs_refcount_split_extent(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t agbno,
bool *shape_changed)
{
@@ -351,7 +368,7 @@ xfs_refcount_split_extent(
int error;
*shape_changed = false;
- error = xfs_refcount_lookup_le(cur, agbno, &found_rec);
+ error = xfs_refcount_lookup_le(cur, domain, agbno, &found_rec);
if (error)
goto out_error;
if (!found_rec)
@@ -364,6 +381,8 @@ xfs_refcount_split_extent(
error = -EFSCORRUPTED;
goto out_error;
}
+ if (rcext.rc_domain != domain)
+ return 0;
if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno)
return 0;
@@ -415,6 +434,9 @@ xfs_refcount_merge_center_extents(
trace_xfs_refcount_merge_center_extents(cur->bc_mp,
cur->bc_ag.pag->pag_agno, left, center, right);
+ ASSERT(left->rc_domain == center->rc_domain);
+ ASSERT(right->rc_domain == center->rc_domain);
+
/*
* Make sure the center and right extents are not in the btree.
* If the center extent was synthesized, the first delete call
@@ -423,8 +445,8 @@ xfs_refcount_merge_center_extents(
* call removes the center and the second one removes the right
* extent.
*/
- error = xfs_refcount_lookup_ge(cur, center->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_ge(cur, center->rc_domain,
+ center->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -451,8 +473,8 @@ xfs_refcount_merge_center_extents(
}
/* Enlarge the left extent. */
- error = xfs_refcount_lookup_le(cur, left->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_le(cur, left->rc_domain,
+ left->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -491,10 +513,12 @@ xfs_refcount_merge_left_extent(
trace_xfs_refcount_merge_left_extent(cur->bc_mp,
cur->bc_ag.pag->pag_agno, left, cleft);
+ ASSERT(left->rc_domain == cleft->rc_domain);
+
/* If the extent at agbno (cleft) wasn't synthesized, remove it. */
if (cleft->rc_refcount > 1) {
- error = xfs_refcount_lookup_le(cur, cleft->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_le(cur, cleft->rc_domain,
+ cleft->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -512,8 +536,8 @@ xfs_refcount_merge_left_extent(
}
/* Enlarge the left extent. */
- error = xfs_refcount_lookup_le(cur, left->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_le(cur, left->rc_domain,
+ left->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -552,13 +576,15 @@ xfs_refcount_merge_right_extent(
trace_xfs_refcount_merge_right_extent(cur->bc_mp,
cur->bc_ag.pag->pag_agno, cright, right);
+ ASSERT(right->rc_domain == cright->rc_domain);
+
/*
* If the extent ending at agbno+aglen (cright) wasn't synthesized,
* remove it.
*/
if (cright->rc_refcount > 1) {
- error = xfs_refcount_lookup_le(cur, cright->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_le(cur, cright->rc_domain,
+ cright->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -576,8 +602,8 @@ xfs_refcount_merge_right_extent(
}
/* Enlarge the right extent. */
- error = xfs_refcount_lookup_le(cur, right->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_le(cur, right->rc_domain,
+ right->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -600,8 +626,6 @@ out_error:
return error;
}
-#define XFS_FIND_RCEXT_SHARED 1
-#define XFS_FIND_RCEXT_COW 2
/*
* Find the left extent and the one after it (cleft). This function assumes
* that we've already split any extent crossing agbno.
@@ -611,16 +635,16 @@ xfs_refcount_find_left_extents(
struct xfs_btree_cur *cur,
struct xfs_refcount_irec *left,
struct xfs_refcount_irec *cleft,
+ enum xfs_refc_domain domain,
xfs_agblock_t agbno,
- xfs_extlen_t aglen,
- int flags)
+ xfs_extlen_t aglen)
{
struct xfs_refcount_irec tmp;
int error;
int found_rec;
left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK;
- error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec);
+ error = xfs_refcount_lookup_le(cur, domain, agbno - 1, &found_rec);
if (error)
goto out_error;
if (!found_rec)
@@ -634,11 +658,9 @@ xfs_refcount_find_left_extents(
goto out_error;
}
- if (xfs_refc_next(&tmp) != agbno)
- return 0;
- if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2)
+ if (tmp.rc_domain != domain)
return 0;
- if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1)
+ if (xfs_refc_next(&tmp) != agbno)
return 0;
/* We have a left extent; retrieve (or invent) the next right one */
*left = tmp;
@@ -655,6 +677,9 @@ xfs_refcount_find_left_extents(
goto out_error;
}
+ if (tmp.rc_domain != domain)
+ goto not_found;
+
/* if tmp starts at the end of our range, just use that */
if (tmp.rc_startblock == agbno)
*cleft = tmp;
@@ -671,8 +696,10 @@ xfs_refcount_find_left_extents(
cleft->rc_blockcount = min(aglen,
tmp.rc_startblock - agbno);
cleft->rc_refcount = 1;
+ cleft->rc_domain = domain;
}
} else {
+not_found:
/*
* No extents, so pretend that there's one covering the whole
* range.
@@ -680,6 +707,7 @@ xfs_refcount_find_left_extents(
cleft->rc_startblock = agbno;
cleft->rc_blockcount = aglen;
cleft->rc_refcount = 1;
+ cleft->rc_domain = domain;
}
trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
left, cleft, agbno);
@@ -700,16 +728,16 @@ xfs_refcount_find_right_extents(
struct xfs_btree_cur *cur,
struct xfs_refcount_irec *right,
struct xfs_refcount_irec *cright,
+ enum xfs_refc_domain domain,
xfs_agblock_t agbno,
- xfs_extlen_t aglen,
- int flags)
+ xfs_extlen_t aglen)
{
struct xfs_refcount_irec tmp;
int error;
int found_rec;
right->rc_startblock = cright->rc_startblock = NULLAGBLOCK;
- error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec);
+ error = xfs_refcount_lookup_ge(cur, domain, agbno + aglen, &found_rec);
if (error)
goto out_error;
if (!found_rec)
@@ -723,11 +751,9 @@ xfs_refcount_find_right_extents(
goto out_error;
}
- if (tmp.rc_startblock != agbno + aglen)
- return 0;
- if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2)
+ if (tmp.rc_domain != domain)
return 0;
- if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1)
+ if (tmp.rc_startblock != agbno + aglen)
return 0;
/* We have a right extent; retrieve (or invent) the next left one */
*right = tmp;
@@ -744,6 +770,9 @@ xfs_refcount_find_right_extents(
goto out_error;
}
+ if (tmp.rc_domain != domain)
+ goto not_found;
+
/* if tmp ends at the end of our range, just use that */
if (xfs_refc_next(&tmp) == agbno + aglen)
*cright = tmp;
@@ -760,8 +789,10 @@ xfs_refcount_find_right_extents(
cright->rc_blockcount = right->rc_startblock -
cright->rc_startblock;
cright->rc_refcount = 1;
+ cright->rc_domain = domain;
}
} else {
+not_found:
/*
* No extents, so pretend that there's one covering the whole
* range.
@@ -769,6 +800,7 @@ xfs_refcount_find_right_extents(
cright->rc_startblock = agbno;
cright->rc_blockcount = aglen;
cright->rc_refcount = 1;
+ cright->rc_domain = domain;
}
trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
cright, right, agbno + aglen);
@@ -794,10 +826,10 @@ xfs_refc_valid(
STATIC int
xfs_refcount_merge_extents(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t *agbno,
xfs_extlen_t *aglen,
enum xfs_refc_adjust_op adjust,
- int flags,
bool *shape_changed)
{
struct xfs_refcount_irec left = {0}, cleft = {0};
@@ -812,12 +844,12 @@ xfs_refcount_merge_extents(
* just below (agbno + aglen) [cright], and just above (agbno + aglen)
* [right].
*/
- error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno,
- *aglen, flags);
+ error = xfs_refcount_find_left_extents(cur, &left, &cleft, domain,
+ *agbno, *aglen);
if (error)
return error;
- error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno,
- *aglen, flags);
+ error = xfs_refcount_find_right_extents(cur, &right, &cright, domain,
+ *agbno, *aglen);
if (error)
return error;
@@ -870,7 +902,7 @@ xfs_refcount_merge_extents(
aglen);
}
- return error;
+ return 0;
}
/*
@@ -933,7 +965,8 @@ xfs_refcount_adjust_extents(
if (*aglen == 0)
return 0;
- error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec);
+ error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_SHARED, *agbno,
+ &found_rec);
if (error)
goto out_error;
@@ -941,10 +974,11 @@ xfs_refcount_adjust_extents(
error = xfs_refcount_get_rec(cur, &ext, &found_rec);
if (error)
goto out_error;
- if (!found_rec) {
+ if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) {
ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
ext.rc_blockcount = 0;
ext.rc_refcount = 0;
+ ext.rc_domain = XFS_REFC_DOMAIN_SHARED;
}
/*
@@ -957,6 +991,8 @@ xfs_refcount_adjust_extents(
tmp.rc_blockcount = min(*aglen,
ext.rc_startblock - *agbno);
tmp.rc_refcount = 1 + adj;
+ tmp.rc_domain = XFS_REFC_DOMAIN_SHARED;
+
trace_xfs_refcount_modify_extent(cur->bc_mp,
cur->bc_ag.pag->pag_agno, &tmp);
@@ -986,15 +1022,30 @@ xfs_refcount_adjust_extents(
(*agbno) += tmp.rc_blockcount;
(*aglen) -= tmp.rc_blockcount;
- error = xfs_refcount_lookup_ge(cur, *agbno,
+ /* Stop if there's nothing left to modify */
+ if (*aglen == 0 || !xfs_refcount_still_have_space(cur))
+ break;
+
+ /* Move the cursor to the start of ext. */
+ error = xfs_refcount_lookup_ge(cur,
+ XFS_REFC_DOMAIN_SHARED, *agbno,
&found_rec);
if (error)
goto out_error;
}
- /* Stop if there's nothing left to modify */
- if (*aglen == 0 || !xfs_refcount_still_have_space(cur))
- break;
+ /*
+ * A previous step trimmed agbno/aglen such that the end of the
+ * range would not be in the middle of the record. If this is
+ * no longer the case, something is seriously wrong with the
+ * btree. Make sure we never feed the synthesized record into
+ * the processing loop below.
+ */
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) ||
+ XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/*
* Adjust the reference count and either update the tree
@@ -1070,13 +1121,15 @@ xfs_refcount_adjust(
/*
* Ensure that no rcextents cross the boundary of the adjustment range.
*/
- error = xfs_refcount_split_extent(cur, agbno, &shape_changed);
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED,
+ agbno, &shape_changed);
if (error)
goto out_error;
if (shape_changed)
shape_changes++;
- error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed);
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED,
+ agbno + aglen, &shape_changed);
if (error)
goto out_error;
if (shape_changed)
@@ -1085,8 +1138,8 @@ xfs_refcount_adjust(
/*
* Try to merge with the left or right extents of the range.
*/
- error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj,
- XFS_FIND_RCEXT_SHARED, &shape_changed);
+ error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED,
+ new_agbno, new_aglen, adj, &shape_changed);
if (error)
goto out_error;
if (shape_changed)
@@ -1125,6 +1178,32 @@ xfs_refcount_finish_one_cleanup(
}
/*
+ * Set up a continuation a deferred refcount operation by updating the intent.
+ * Checks to make sure we're not going to run off the end of the AG.
+ */
+static inline int
+xfs_refcount_continue_op(
+ struct xfs_btree_cur *cur,
+ xfs_fsblock_t startblock,
+ xfs_agblock_t new_agbno,
+ xfs_extlen_t new_len,
+ xfs_fsblock_t *new_fsbno)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len)))
+ return -EFSCORRUPTED;
+
+ *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
+
+ ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len));
+ ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno));
+
+ return 0;
+}
+
+/*
* Process one of the deferred refcount operations. We pass back the
* btree cursor to maintain our lock on the btree between calls.
* This saves time and eliminates a buffer deadlock between the
@@ -1191,12 +1270,20 @@ xfs_refcount_finish_one(
case XFS_REFCOUNT_INCREASE:
error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
new_len, XFS_REFCOUNT_ADJUST_INCREASE);
- *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
+ if (error)
+ goto out_drop;
+ if (*new_len > 0)
+ error = xfs_refcount_continue_op(rcur, startblock,
+ new_agbno, *new_len, new_fsb);
break;
case XFS_REFCOUNT_DECREASE:
error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
new_len, XFS_REFCOUNT_ADJUST_DECREASE);
- *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
+ if (error)
+ goto out_drop;
+ if (*new_len > 0)
+ error = xfs_refcount_continue_op(rcur, startblock,
+ new_agbno, *new_len, new_fsb);
break;
case XFS_REFCOUNT_ALLOC_COW:
*new_fsb = startblock + blockcount;
@@ -1307,7 +1394,8 @@ xfs_refcount_find_shared(
*flen = 0;
/* Try to find a refcount extent that crosses the start */
- error = xfs_refcount_lookup_le(cur, agbno, &have);
+ error = xfs_refcount_lookup_le(cur, XFS_REFC_DOMAIN_SHARED, agbno,
+ &have);
if (error)
goto out_error;
if (!have) {
@@ -1325,6 +1413,8 @@ xfs_refcount_find_shared(
error = -EFSCORRUPTED;
goto out_error;
}
+ if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED)
+ goto done;
/* If the extent ends before the start, look at the next one */
if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) {
@@ -1340,6 +1430,8 @@ xfs_refcount_find_shared(
error = -EFSCORRUPTED;
goto out_error;
}
+ if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED)
+ goto done;
}
/* If the extent starts after the range we want, bail out */
@@ -1371,7 +1463,8 @@ xfs_refcount_find_shared(
error = -EFSCORRUPTED;
goto out_error;
}
- if (tmp.rc_startblock >= agbno + aglen ||
+ if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED ||
+ tmp.rc_startblock >= agbno + aglen ||
tmp.rc_startblock != *fbno + *flen)
break;
*flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno);
@@ -1455,17 +1548,23 @@ xfs_refcount_adjust_cow_extents(
return 0;
/* Find any overlapping refcount records */
- error = xfs_refcount_lookup_ge(cur, agbno, &found_rec);
+ error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_COW, agbno,
+ &found_rec);
if (error)
goto out_error;
error = xfs_refcount_get_rec(cur, &ext, &found_rec);
if (error)
goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec &&
+ ext.rc_domain != XFS_REFC_DOMAIN_COW)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (!found_rec) {
- ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks +
- XFS_REFC_COW_START;
+ ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
ext.rc_blockcount = 0;
ext.rc_refcount = 0;
+ ext.rc_domain = XFS_REFC_DOMAIN_COW;
}
switch (adj) {
@@ -1480,6 +1579,8 @@ xfs_refcount_adjust_cow_extents(
tmp.rc_startblock = agbno;
tmp.rc_blockcount = aglen;
tmp.rc_refcount = 1;
+ tmp.rc_domain = XFS_REFC_DOMAIN_COW;
+
trace_xfs_refcount_modify_extent(cur->bc_mp,
cur->bc_ag.pag->pag_agno, &tmp);
@@ -1542,24 +1643,24 @@ xfs_refcount_adjust_cow(
bool shape_changed;
int error;
- agbno += XFS_REFC_COW_START;
-
/*
* Ensure that no rcextents cross the boundary of the adjustment range.
*/
- error = xfs_refcount_split_extent(cur, agbno, &shape_changed);
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW,
+ agbno, &shape_changed);
if (error)
goto out_error;
- error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed);
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW,
+ agbno + aglen, &shape_changed);
if (error)
goto out_error;
/*
* Try to merge with the left or right extents of the range.
*/
- error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj,
- XFS_FIND_RCEXT_COW, &shape_changed);
+ error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_COW, &agbno,
+ &aglen, adj, &shape_changed);
if (error)
goto out_error;
@@ -1666,10 +1767,18 @@ xfs_refcount_recover_extent(
be32_to_cpu(rec->refc.rc_refcount) != 1))
return -EFSCORRUPTED;
- rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0);
+ rr = kmalloc(sizeof(struct xfs_refcount_recovery),
+ GFP_KERNEL | __GFP_NOFAIL);
+ INIT_LIST_HEAD(&rr->rr_list);
xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
- list_add_tail(&rr->rr_list, debris);
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
+ kfree(rr);
+ return -EFSCORRUPTED;
+ }
+
+ list_add_tail(&rr->rr_list, debris);
return 0;
}
@@ -1687,10 +1796,11 @@ xfs_refcount_recover_cow_leftovers(
union xfs_btree_irec low;
union xfs_btree_irec high;
xfs_fsblock_t fsb;
- xfs_agblock_t agbno;
int error;
- if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START)
+ /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */
+ BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG);
+ if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS)
return -EOPNOTSUPP;
INIT_LIST_HEAD(&debris);
@@ -1717,7 +1827,7 @@ xfs_refcount_recover_cow_leftovers(
/* Find all the leftover CoW staging extents. */
memset(&low, 0, sizeof(low));
memset(&high, 0, sizeof(high));
- low.rc.rc_startblock = XFS_REFC_COW_START;
+ low.rc.rc_domain = high.rc.rc_domain = XFS_REFC_DOMAIN_COW;
high.rc.rc_startblock = -1U;
error = xfs_btree_query_range(cur, &low, &high,
xfs_refcount_recover_extent, &debris);
@@ -1738,8 +1848,8 @@ xfs_refcount_recover_cow_leftovers(
&rr->rr_rrec);
/* Free the orphan record */
- agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START;
- fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno);
+ fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
+ rr->rr_rrec.rc_startblock);
xfs_refcount_free_cow_extent(tp, fsb,
rr->rr_rrec.rc_blockcount);
@@ -1751,7 +1861,7 @@ xfs_refcount_recover_cow_leftovers(
goto out_free;
list_del(&rr->rr_list);
- kmem_free(rr);
+ kfree(rr);
}
return error;
@@ -1761,7 +1871,7 @@ out_free:
/* Free the leftover list */
list_for_each_entry_safe(rr, n, &debris, rr_list) {
list_del(&rr->rr_list);
- kmem_free(rr);
+ kfree(rr);
}
return error;
}
@@ -1770,6 +1880,7 @@ out_free:
int
xfs_refcount_has_record(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t bno,
xfs_extlen_t len,
bool *exists)
@@ -1781,6 +1892,7 @@ xfs_refcount_has_record(
low.rc.rc_startblock = bno;
memset(&high, 0xFF, sizeof(high));
high.rc.rc_startblock = bno + len - 1;
+ low.rc.rc_domain = high.rc.rc_domain = domain;
return xfs_btree_has_record(cur, &low, &high, exists);
}
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index e8b322de7f3d..452f30556f5a 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -14,14 +14,33 @@ struct xfs_bmbt_irec;
struct xfs_refcount_irec;
extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
- xfs_agblock_t bno, int *stat);
+ enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur,
- xfs_agblock_t bno, int *stat);
+ enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
extern int xfs_refcount_lookup_eq(struct xfs_btree_cur *cur,
- xfs_agblock_t bno, int *stat);
+ enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
+static inline uint32_t
+xfs_refcount_encode_startblock(
+ xfs_agblock_t startblock,
+ enum xfs_refc_domain domain)
+{
+ uint32_t start;
+
+ /*
+ * low level btree operations need to handle the generic btree range
+ * query functions (which set rc_domain == -1U), so we check that the
+ * domain is /not/ shared.
+ */
+ start = startblock & ~XFS_REFC_COWFLAG;
+ if (domain != XFS_REFC_DOMAIN_SHARED)
+ start |= XFS_REFC_COWFLAG;
+
+ return start;
+}
+
enum xfs_refcount_intent_type {
XFS_REFCOUNT_INCREASE = 1,
XFS_REFCOUNT_DECREASE,
@@ -36,6 +55,18 @@ struct xfs_refcount_intent {
xfs_fsblock_t ri_startblock;
};
+/* Check that the refcount is appropriate for the record domain. */
+static inline bool
+xfs_refcount_check_domain(
+ const struct xfs_refcount_irec *irec)
+{
+ if (irec->rc_domain == XFS_REFC_DOMAIN_COW && irec->rc_refcount != 1)
+ return false;
+ if (irec->rc_domain == XFS_REFC_DOMAIN_SHARED && irec->rc_refcount < 2)
+ return false;
+ return true;
+}
+
void xfs_refcount_increase_extent(struct xfs_trans *tp,
struct xfs_bmbt_irec *irec);
void xfs_refcount_decrease_extent(struct xfs_trans *tp,
@@ -79,7 +110,8 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
#define XFS_REFCOUNT_ITEM_OVERHEAD 32
extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
- xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
+ enum xfs_refc_domain domain, xfs_agblock_t bno,
+ xfs_extlen_t len, bool *exists);
union xfs_btree_rec;
extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 316c1ec0c3c2..e1f789866683 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -13,6 +13,7 @@
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_refcount_btree.h"
+#include "xfs_refcount.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_trace.h"
@@ -160,7 +161,12 @@ xfs_refcountbt_init_rec_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_rec *rec)
{
- rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock);
+ const struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ uint32_t start;
+
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ rec->refc.rc_startblock = cpu_to_be32(start);
rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount);
rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount);
}
@@ -182,10 +188,13 @@ xfs_refcountbt_key_diff(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
- struct xfs_refcount_irec *rec = &cur->bc_rec.rc;
const struct xfs_refcount_key *kp = &key->refc;
+ const struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ uint32_t start;
- return (int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock;
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ return (int64_t)be32_to_cpu(kp->rc_startblock) - start;
}
STATIC int64_t
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 094dfc897ebc..b56aca1e7c66 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -235,13 +235,8 @@ xfs_rmap_get_rec(
goto out_bad_rec;
} else {
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbno(pag, irec->rm_startblock))
- goto out_bad_rec;
- if (irec->rm_startblock >
- irec->rm_startblock + irec->rm_blockcount)
- goto out_bad_rec;
- if (!xfs_verify_agbno(pag,
- irec->rm_startblock + irec->rm_blockcount - 1))
+ if (!xfs_verify_agbext(pag, irec->rm_startblock,
+ irec->rm_blockcount))
goto out_bad_rec;
}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 2c4ad6e4bb14..5b2f27cbdb80 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -422,7 +422,7 @@ xfs_calc_itruncate_reservation_minlogsize(
/*
* In renaming a files we can modify:
- * the four inodes involved: 4 * inode size
+ * the five inodes involved: 5 * inode size
* the two directory btrees: 2 * (max depth + v2) * dir block size
* the two directory bmap btrees: 2 * max depth * block size
* And the bmap_finish transaction can free dir and bmap blocks (two sets
@@ -437,7 +437,7 @@ xfs_calc_rename_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- max((xfs_calc_inode_res(mp, 4) +
+ max((xfs_calc_inode_res(mp, 5) +
xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index a6b7d98cf68f..5ebdda7e1078 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -166,6 +166,36 @@ typedef struct xfs_bmbt_irec
xfs_exntst_t br_state; /* extent state */
} xfs_bmbt_irec_t;
+enum xfs_refc_domain {
+ XFS_REFC_DOMAIN_SHARED = 0,
+ XFS_REFC_DOMAIN_COW,
+};
+
+#define XFS_REFC_DOMAIN_STRINGS \
+ { XFS_REFC_DOMAIN_SHARED, "shared" }, \
+ { XFS_REFC_DOMAIN_COW, "cow" }
+
+struct xfs_refcount_irec {
+ xfs_agblock_t rc_startblock; /* starting block number */
+ xfs_extlen_t rc_blockcount; /* count of free blocks */
+ xfs_nlink_t rc_refcount; /* number of inodes linked here */
+ enum xfs_refc_domain rc_domain; /* shared or cow staging extent? */
+};
+
+#define XFS_RMAP_ATTR_FORK (1 << 0)
+#define XFS_RMAP_BMBT_BLOCK (1 << 1)
+#define XFS_RMAP_UNWRITTEN (1 << 2)
+#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \
+ XFS_RMAP_BMBT_BLOCK)
+#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN)
+struct xfs_rmap_irec {
+ xfs_agblock_t rm_startblock; /* extent start block */
+ xfs_extlen_t rm_blockcount; /* extent length */
+ uint64_t rm_owner; /* extent owner */
+ uint64_t rm_offset; /* offset within the owner */
+ unsigned int rm_flags; /* state flags */
+};
+
/* per-AG block reservation types */
enum xfs_ag_resv_type {
XFS_AG_RESV_NONE = 0,
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index ab427b4d7fe0..3b38f4e2a537 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -100,9 +100,7 @@ xchk_allocbt_rec(
bno = be32_to_cpu(rec->alloc.ar_startblock);
len = be32_to_cpu(rec->alloc.ar_blockcount);
- if (bno + len <= bno ||
- !xfs_verify_agbno(pag, bno) ||
- !xfs_verify_agbno(pag, bno + len - 1))
+ if (!xfs_verify_agbext(pag, bno, len))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
xchk_allocbt_xref(bs->sc, bno, len);
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index e1026e07bf94..e312be7cd375 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -108,9 +108,8 @@ xchk_iallocbt_chunk(
xfs_agblock_t bno;
bno = XFS_AGINO_TO_AGBNO(mp, agino);
- if (bno + len <= bno ||
- !xfs_verify_agbno(pag, bno) ||
- !xfs_verify_agbno(pag, bno + len - 1))
+
+ if (!xfs_verify_agbext(pag, bno, len))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len);
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index c68b767dc08f..a26ee0f24ef2 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -269,15 +269,13 @@ done:
STATIC void
xchk_refcountbt_xref_rmap(
struct xfs_scrub *sc,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- xfs_nlink_t refcount)
+ const struct xfs_refcount_irec *irec)
{
struct xchk_refcnt_check refchk = {
- .sc = sc,
- .bno = bno,
- .len = len,
- .refcount = refcount,
+ .sc = sc,
+ .bno = irec->rc_startblock,
+ .len = irec->rc_blockcount,
+ .refcount = irec->rc_refcount,
.seen = 0,
};
struct xfs_rmap_irec low;
@@ -291,9 +289,9 @@ xchk_refcountbt_xref_rmap(
/* Cross-reference with the rmapbt to confirm the refcount. */
memset(&low, 0, sizeof(low));
- low.rm_startblock = bno;
+ low.rm_startblock = irec->rc_startblock;
memset(&high, 0xFF, sizeof(high));
- high.rm_startblock = bno + len - 1;
+ high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1;
INIT_LIST_HEAD(&refchk.fragments);
error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high,
@@ -302,7 +300,7 @@ xchk_refcountbt_xref_rmap(
goto out_free;
xchk_refcountbt_process_rmap_fragments(&refchk);
- if (refcount != refchk.seen)
+ if (irec->rc_refcount != refchk.seen)
xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
out_free:
@@ -315,17 +313,16 @@ out_free:
/* Cross-reference with the other btrees. */
STATIC void
xchk_refcountbt_xref(
- struct xfs_scrub *sc,
- xfs_agblock_t agbno,
- xfs_extlen_t len,
- xfs_nlink_t refcount)
+ struct xfs_scrub *sc,
+ const struct xfs_refcount_irec *irec)
{
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return;
- xchk_xref_is_used_space(sc, agbno, len);
- xchk_xref_is_not_inode_chunk(sc, agbno, len);
- xchk_refcountbt_xref_rmap(sc, agbno, len, refcount);
+ xchk_xref_is_used_space(sc, irec->rc_startblock, irec->rc_blockcount);
+ xchk_xref_is_not_inode_chunk(sc, irec->rc_startblock,
+ irec->rc_blockcount);
+ xchk_refcountbt_xref_rmap(sc, irec);
}
/* Scrub a refcountbt record. */
@@ -334,35 +331,27 @@ xchk_refcountbt_rec(
struct xchk_btree *bs,
const union xfs_btree_rec *rec)
{
+ struct xfs_refcount_irec irec;
xfs_agblock_t *cow_blocks = bs->private;
struct xfs_perag *pag = bs->cur->bc_ag.pag;
- xfs_agblock_t bno;
- xfs_extlen_t len;
- xfs_nlink_t refcount;
- bool has_cowflag;
- bno = be32_to_cpu(rec->refc.rc_startblock);
- len = be32_to_cpu(rec->refc.rc_blockcount);
- refcount = be32_to_cpu(rec->refc.rc_refcount);
+ xfs_refcount_btrec_to_irec(rec, &irec);
- /* Only CoW records can have refcount == 1. */
- has_cowflag = (bno & XFS_REFC_COW_START);
- if ((refcount == 1 && !has_cowflag) || (refcount != 1 && has_cowflag))
+ /* Check the domain and refcount are not incompatible. */
+ if (!xfs_refcount_check_domain(&irec))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- if (has_cowflag)
- (*cow_blocks) += len;
+
+ if (irec.rc_domain == XFS_REFC_DOMAIN_COW)
+ (*cow_blocks) += irec.rc_blockcount;
/* Check the extent. */
- bno &= ~XFS_REFC_COW_START;
- if (bno + len <= bno ||
- !xfs_verify_agbno(pag, bno) ||
- !xfs_verify_agbno(pag, bno + len - 1))
+ if (!xfs_verify_agbext(pag, irec.rc_startblock, irec.rc_blockcount))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- if (refcount == 0)
+ if (irec.rc_refcount == 0)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- xchk_refcountbt_xref(bs->sc, bno, len, refcount);
+ xchk_refcountbt_xref(bs->sc, &irec);
return 0;
}
@@ -426,7 +415,6 @@ xchk_xref_is_cow_staging(
xfs_extlen_t len)
{
struct xfs_refcount_irec rc;
- bool has_cowflag;
int has_refcount;
int error;
@@ -434,8 +422,8 @@ xchk_xref_is_cow_staging(
return;
/* Find the CoW staging extent. */
- error = xfs_refcount_lookup_le(sc->sa.refc_cur,
- agbno + XFS_REFC_COW_START, &has_refcount);
+ error = xfs_refcount_lookup_le(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW,
+ agbno, &has_refcount);
if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
return;
if (!has_refcount) {
@@ -451,9 +439,8 @@ xchk_xref_is_cow_staging(
return;
}
- /* CoW flag must be set, refcount must be 1. */
- has_cowflag = (rc.rc_startblock & XFS_REFC_COW_START);
- if (!has_cowflag || rc.rc_refcount != 1)
+ /* CoW lookup returned a shared extent record? */
+ if (rc.rc_domain != XFS_REFC_DOMAIN_COW)
xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
/* Must be at least as long as what was passed in */
@@ -477,7 +464,8 @@ xchk_xref_is_not_shared(
if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm))
return;
- error = xfs_refcount_has_record(sc->sa.refc_cur, agbno, len, &shared);
+ error = xfs_refcount_has_record(sc->sa.refc_cur, XFS_REFC_DOMAIN_SHARED,
+ agbno, len, &shared);
if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
return;
if (shared)
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index cf5ce607dc05..2788a6f2edcd 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -245,28 +245,6 @@ xfs_attri_init(
return attrip;
}
-/*
- * Copy an attr format buffer from the given buf, and into the destination attr
- * format structure.
- */
-STATIC int
-xfs_attri_copy_format(
- struct xfs_log_iovec *buf,
- struct xfs_attri_log_format *dst_attr_fmt)
-{
- struct xfs_attri_log_format *src_attr_fmt = buf->i_addr;
- size_t len;
-
- len = sizeof(struct xfs_attri_log_format);
- if (buf->i_len != len) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
- return -EFSCORRUPTED;
- }
-
- memcpy((char *)dst_attr_fmt, (char *)src_attr_fmt, len);
- return 0;
-}
-
static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_attrd_log_item, attrd_item);
@@ -731,24 +709,50 @@ xlog_recover_attri_commit_pass2(
struct xfs_attri_log_nameval *nv;
const void *attr_value = NULL;
const void *attr_name;
- int error;
+ size_t len;
attri_formatp = item->ri_buf[0].i_addr;
attr_name = item->ri_buf[1].i_addr;
/* Validate xfs_attri_log_format before the large memory allocation */
+ len = sizeof(struct xfs_attri_log_format);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
if (!xfs_attri_validate(mp, attri_formatp)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ /* Validate the attr name */
+ if (item->ri_buf[1].i_len !=
+ xlog_calc_iovec_len(attri_formatp->alfi_name_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
return -EFSCORRUPTED;
}
if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[1].i_addr, item->ri_buf[1].i_len);
return -EFSCORRUPTED;
}
- if (attri_formatp->alfi_value_len)
+ /* Validate the attr value, if present */
+ if (attri_formatp->alfi_value_len != 0) {
+ if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr,
+ item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
attr_value = item->ri_buf[2].i_addr;
+ }
/*
* Memory alloc failure will cause replay to abort. We attach the
@@ -760,9 +764,7 @@ xlog_recover_attri_commit_pass2(
attri_formatp->alfi_value_len);
attrip = xfs_attri_init(mp, nv);
- error = xfs_attri_copy_format(&item->ri_buf[0], &attrip->attri_format);
- if (error)
- goto out;
+ memcpy(&attrip->attri_format, attri_formatp, len);
/*
* The ATTRI has two references. One for the ATTRD and one for ATTRI to
@@ -774,10 +776,6 @@ xlog_recover_attri_commit_pass2(
xfs_attri_release(attrip);
xfs_attri_log_nameval_put(nv);
return 0;
-out:
- xfs_attri_item_free(attrip);
- xfs_attri_log_nameval_put(nv);
- return error;
}
/*
@@ -842,7 +840,8 @@ xlog_recover_attrd_commit_pass2(
attrd_formatp = item->ri_buf[0].i_addr;
if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 51f66e982484..41323da523d1 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -608,28 +608,18 @@ static const struct xfs_item_ops xfs_bui_item_ops = {
.iop_relog = xfs_bui_item_relog,
};
-/*
- * Copy an BUI format buffer from the given buf, and into the destination
- * BUI format structure. The BUI/BUD items were designed not to need any
- * special alignment handling.
- */
-static int
+static inline void
xfs_bui_copy_format(
- struct xfs_log_iovec *buf,
- struct xfs_bui_log_format *dst_bui_fmt)
+ struct xfs_bui_log_format *dst,
+ const struct xfs_bui_log_format *src)
{
- struct xfs_bui_log_format *src_bui_fmt;
- uint len;
+ unsigned int i;
- src_bui_fmt = buf->i_addr;
- len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents);
+ memcpy(dst, src, offsetof(struct xfs_bui_log_format, bui_extents));
- if (buf->i_len == len) {
- memcpy(dst_bui_fmt, src_bui_fmt, len);
- return 0;
- }
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
- return -EFSCORRUPTED;
+ for (i = 0; i < src->bui_nextents; i++)
+ memcpy(&dst->bui_extents[i], &src->bui_extents[i],
+ sizeof(struct xfs_map_extent));
}
/*
@@ -646,23 +636,34 @@ xlog_recover_bui_commit_pass2(
struct xlog_recover_item *item,
xfs_lsn_t lsn)
{
- int error;
struct xfs_mount *mp = log->l_mp;
struct xfs_bui_log_item *buip;
struct xfs_bui_log_format *bui_formatp;
+ size_t len;
bui_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len < xfs_bui_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
return -EFSCORRUPTED;
}
- buip = xfs_bui_init(mp);
- error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
- if (error) {
- xfs_bui_item_free(buip);
- return error;
+
+ len = xfs_bui_log_format_sizeof(bui_formatp->bui_nextents);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
}
+
+ buip = xfs_bui_init(mp);
+ xfs_bui_copy_format(&buip->bui_format, bui_formatp);
atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
/*
* Insert the intent into the AIL directly and drop one reference so
@@ -696,7 +697,8 @@ xlog_recover_bud_commit_pass2(
bud_formatp = item->ri_buf[0].i_addr;
if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 7db588ed0be5..c6b2aabd6f18 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -234,13 +234,18 @@ int
xfs_errortag_init(
struct xfs_mount *mp)
{
+ int ret;
+
mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
KM_MAYFAIL);
if (!mp->m_errortag)
return -ENOMEM;
- return xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype,
- &mp->m_kobj, "errortag");
+ ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype,
+ &mp->m_kobj, "errortag");
+ if (ret)
+ kmem_free(mp->m_errortag);
+ return ret;
}
void
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 27ccfcd82f04..d5130d1fcfae 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -66,27 +66,16 @@ xfs_efi_release(
xfs_efi_item_free(efip);
}
-/*
- * This returns the number of iovecs needed to log the given efi item.
- * We only need 1 iovec for an efi item. It just logs the efi_log_format
- * structure.
- */
-static inline int
-xfs_efi_item_sizeof(
- struct xfs_efi_log_item *efip)
-{
- return sizeof(struct xfs_efi_log_format) +
- (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
-}
-
STATIC void
xfs_efi_item_size(
struct xfs_log_item *lip,
int *nvecs,
int *nbytes)
{
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+
*nvecs += 1;
- *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip));
+ *nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents);
}
/*
@@ -112,7 +101,7 @@ xfs_efi_item_format(
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT,
&efip->efi_format,
- xfs_efi_item_sizeof(efip));
+ xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents));
}
@@ -155,13 +144,11 @@ xfs_efi_init(
{
struct xfs_efi_log_item *efip;
- uint size;
ASSERT(nextents > 0);
if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
- size = (uint)(sizeof(struct xfs_efi_log_item) +
- ((nextents - 1) * sizeof(xfs_extent_t)));
- efip = kmem_zalloc(size, 0);
+ efip = kzalloc(xfs_efi_log_item_sizeof(nextents),
+ GFP_KERNEL | __GFP_NOFAIL);
} else {
efip = kmem_cache_zalloc(xfs_efi_cache,
GFP_KERNEL | __GFP_NOFAIL);
@@ -188,15 +175,17 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
{
xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
uint i;
- uint len = sizeof(xfs_efi_log_format_t) +
- (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);
- uint len32 = sizeof(xfs_efi_log_format_32_t) +
- (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t);
- uint len64 = sizeof(xfs_efi_log_format_64_t) +
- (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t);
+ uint len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents);
+ uint len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents);
+ uint len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents);
if (buf->i_len == len) {
- memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
+ memcpy(dst_efi_fmt, src_efi_fmt,
+ offsetof(struct xfs_efi_log_format, efi_extents));
+ for (i = 0; i < src_efi_fmt->efi_nextents; i++)
+ memcpy(&dst_efi_fmt->efi_extents[i],
+ &src_efi_fmt->efi_extents[i],
+ sizeof(struct xfs_extent));
return 0;
} else if (buf->i_len == len32) {
xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr;
@@ -227,7 +216,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
}
return 0;
}
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->i_addr,
+ buf->i_len);
return -EFSCORRUPTED;
}
@@ -246,27 +236,16 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp)
kmem_cache_free(xfs_efd_cache, efdp);
}
-/*
- * This returns the number of iovecs needed to log the given efd item.
- * We only need 1 iovec for an efd item. It just logs the efd_log_format
- * structure.
- */
-static inline int
-xfs_efd_item_sizeof(
- struct xfs_efd_log_item *efdp)
-{
- return sizeof(xfs_efd_log_format_t) +
- (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
-}
-
STATIC void
xfs_efd_item_size(
struct xfs_log_item *lip,
int *nvecs,
int *nbytes)
{
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+
*nvecs += 1;
- *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip));
+ *nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents);
}
/*
@@ -291,7 +270,7 @@ xfs_efd_item_format(
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT,
&efdp->efd_format,
- xfs_efd_item_sizeof(efdp));
+ xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents));
}
/*
@@ -340,9 +319,8 @@ xfs_trans_get_efd(
ASSERT(nextents > 0);
if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
- efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
- (nextents - 1) * sizeof(struct xfs_extent),
- 0);
+ efdp = kzalloc(xfs_efd_log_item_sizeof(nextents),
+ GFP_KERNEL | __GFP_NOFAIL);
} else {
efdp = kmem_cache_zalloc(xfs_efd_cache,
GFP_KERNEL | __GFP_NOFAIL);
@@ -733,6 +711,12 @@ xlog_recover_efi_commit_pass2(
efi_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
if (error) {
@@ -769,12 +753,24 @@ xlog_recover_efd_commit_pass2(
xfs_lsn_t lsn)
{
struct xfs_efd_log_format *efd_formatp;
+ int buflen = item->ri_buf[0].i_len;
efd_formatp = item->ri_buf[0].i_addr;
- ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
- ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
- (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
- ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
+
+ if (buflen < sizeof(struct xfs_efd_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ efd_formatp, buflen);
+ return -EFSCORRUPTED;
+ }
+
+ if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof(
+ efd_formatp->efd_nextents) &&
+ item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof(
+ efd_formatp->efd_nextents)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ efd_formatp, buflen);
+ return -EFSCORRUPTED;
+ }
xlog_recover_release_intent(log, XFS_LI_EFI, efd_formatp->efd_efi_id);
return 0;
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 186d0f2137f1..da6a5afa607c 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -52,6 +52,14 @@ struct xfs_efi_log_item {
xfs_efi_log_format_t efi_format;
};
+static inline size_t
+xfs_efi_log_item_sizeof(
+ unsigned int nr)
+{
+ return offsetof(struct xfs_efi_log_item, efi_format) +
+ xfs_efi_log_format_sizeof(nr);
+}
+
/*
* This is the "extent free done" log item. It is used to log
* the fact that some extents earlier mentioned in an efi item
@@ -64,6 +72,14 @@ struct xfs_efd_log_item {
xfs_efd_log_format_t efd_format;
};
+static inline size_t
+xfs_efd_log_item_sizeof(
+ unsigned int nr)
+{
+ return offsetof(struct xfs_efd_log_item, efd_format) +
+ xfs_efd_log_format_sizeof(nr);
+}
+
/*
* Max number of extents in fast allocation path.
*/
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c6c80265c0b2..e462d39c840e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1261,7 +1261,7 @@ xfs_file_llseek(
}
#ifdef CONFIG_FS_DAX
-static int
+static inline vm_fault_t
xfs_dax_fault(
struct vm_fault *vmf,
enum page_entry_size pe_size,
@@ -1274,14 +1274,15 @@ xfs_dax_fault(
&xfs_read_iomap_ops);
}
#else
-static int
+static inline vm_fault_t
xfs_dax_fault(
struct vm_fault *vmf,
enum page_entry_size pe_size,
bool write_fault,
pfn_t *pfn)
{
- return 0;
+ ASSERT(0);
+ return VM_FAULT_SIGBUS;
}
#endif
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c000b74dd203..aa303be11576 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2818,7 +2818,7 @@ retry:
* Lock all the participating inodes. Depending upon whether
* the target_name exists in the target directory, and
* whether the target directory is the same as the source
- * directory, we can lock from 2 to 4 inodes.
+ * directory, we can lock from 2 to 5 inodes.
*/
xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 17e923b9c5fa..322eb2ee6c55 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2552,6 +2552,8 @@ xlog_recover_process_intents(
for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
lip != NULL;
lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
+ const struct xfs_item_ops *ops;
+
if (!xlog_item_is_intent(lip))
break;
@@ -2567,13 +2569,17 @@ xlog_recover_process_intents(
* deferred ops, you /must/ attach them to the capture list in
* the recover routine or else those subsequent intents will be
* replayed in the wrong order!
+ *
+ * The recovery function can free the log item, so we must not
+ * access lip after it returns.
*/
spin_unlock(&ailp->ail_lock);
- error = lip->li_ops->iop_recover(lip, &capture_list);
+ ops = lip->li_ops;
+ error = ops->iop_recover(lip, &capture_list);
spin_lock(&ailp->ail_lock);
if (error) {
trace_xlog_intent_recovery_failed(log->l_mp, error,
- lip->li_ops->iop_recover);
+ ops->iop_recover);
break;
}
}
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 758702b9495f..9737b5a9f405 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -118,10 +118,10 @@ xfs_check_ondisk_structs(void)
/* log structures */
XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88);
XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
- XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28);
- XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32);
- XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 28);
- XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_extent_32, 12);
XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176);
@@ -134,6 +134,21 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_attri_log_format, 40);
XFS_CHECK_STRUCT_SIZE(struct xfs_attrd_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bui_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bud_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_cui_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_cud_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rui_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16);
+
+ XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_rui_log_format, rui_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_efi_log_format, efi_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16);
/*
* The v5 superblock format extended several v4 header structures with
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 7e97bf19793d..858e3e9eb4a8 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -523,7 +523,9 @@ xfs_cui_item_recover(
type = refc_type;
break;
default:
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &cuip->cui_format,
+ sizeof(cuip->cui_format));
error = -EFSCORRUPTED;
goto abort_error;
}
@@ -536,7 +538,8 @@ xfs_cui_item_recover(
&new_fsb, &new_len, &rcur);
if (error == -EFSCORRUPTED)
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- refc, sizeof(*refc));
+ &cuip->cui_format,
+ sizeof(cuip->cui_format));
if (error)
goto abort_error;
@@ -622,28 +625,18 @@ static const struct xfs_item_ops xfs_cui_item_ops = {
.iop_relog = xfs_cui_item_relog,
};
-/*
- * Copy an CUI format buffer from the given buf, and into the destination
- * CUI format structure. The CUI/CUD items were designed not to need any
- * special alignment handling.
- */
-static int
+static inline void
xfs_cui_copy_format(
- struct xfs_log_iovec *buf,
- struct xfs_cui_log_format *dst_cui_fmt)
+ struct xfs_cui_log_format *dst,
+ const struct xfs_cui_log_format *src)
{
- struct xfs_cui_log_format *src_cui_fmt;
- uint len;
+ unsigned int i;
- src_cui_fmt = buf->i_addr;
- len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents);
+ memcpy(dst, src, offsetof(struct xfs_cui_log_format, cui_extents));
- if (buf->i_len == len) {
- memcpy(dst_cui_fmt, src_cui_fmt, len);
- return 0;
- }
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
- return -EFSCORRUPTED;
+ for (i = 0; i < src->cui_nextents; i++)
+ memcpy(&dst->cui_extents[i], &src->cui_extents[i],
+ sizeof(struct xfs_phys_extent));
}
/*
@@ -660,19 +653,28 @@ xlog_recover_cui_commit_pass2(
struct xlog_recover_item *item,
xfs_lsn_t lsn)
{
- int error;
struct xfs_mount *mp = log->l_mp;
struct xfs_cui_log_item *cuip;
struct xfs_cui_log_format *cui_formatp;
+ size_t len;
cui_formatp = item->ri_buf[0].i_addr;
- cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
- error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format);
- if (error) {
- xfs_cui_item_free(cuip);
- return error;
+ if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
}
+
+ len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
+ xfs_cui_copy_format(&cuip->cui_format, cui_formatp);
atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
/*
* Insert the intent into the AIL directly and drop one reference so
@@ -706,7 +708,8 @@ xlog_recover_cud_commit_pass2(
cud_formatp = item->ri_buf[0].i_addr;
if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index fef92e02f3bb..534504ede1a3 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -155,31 +155,6 @@ xfs_rui_init(
return ruip;
}
-/*
- * Copy an RUI format buffer from the given buf, and into the destination
- * RUI format structure. The RUI/RUD items were designed not to need any
- * special alignment handling.
- */
-STATIC int
-xfs_rui_copy_format(
- struct xfs_log_iovec *buf,
- struct xfs_rui_log_format *dst_rui_fmt)
-{
- struct xfs_rui_log_format *src_rui_fmt;
- uint len;
-
- src_rui_fmt = buf->i_addr;
- len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents);
-
- if (buf->i_len != len) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
- return -EFSCORRUPTED;
- }
-
- memcpy(dst_rui_fmt, src_rui_fmt, len);
- return 0;
-}
-
static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_rud_log_item, rud_item);
@@ -582,7 +557,9 @@ xfs_rui_item_recover(
type = XFS_RMAP_FREE;
break;
default:
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &ruip->rui_format,
+ sizeof(ruip->rui_format));
error = -EFSCORRUPTED;
goto abort_error;
}
@@ -652,6 +629,20 @@ static const struct xfs_item_ops xfs_rui_item_ops = {
.iop_relog = xfs_rui_item_relog,
};
+static inline void
+xfs_rui_copy_format(
+ struct xfs_rui_log_format *dst,
+ const struct xfs_rui_log_format *src)
+{
+ unsigned int i;
+
+ memcpy(dst, src, offsetof(struct xfs_rui_log_format, rui_extents));
+
+ for (i = 0; i < src->rui_nextents; i++)
+ memcpy(&dst->rui_extents[i], &src->rui_extents[i],
+ sizeof(struct xfs_map_extent));
+}
+
/*
* This routine is called to create an in-core extent rmap update
* item from the rui format structure which was logged on disk.
@@ -666,19 +657,28 @@ xlog_recover_rui_commit_pass2(
struct xlog_recover_item *item,
xfs_lsn_t lsn)
{
- int error;
struct xfs_mount *mp = log->l_mp;
struct xfs_rui_log_item *ruip;
struct xfs_rui_log_format *rui_formatp;
+ size_t len;
rui_formatp = item->ri_buf[0].i_addr;
- ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
- error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format);
- if (error) {
- xfs_rui_item_free(ruip);
- return error;
+ if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
}
+
+ ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
+ xfs_rui_copy_format(&ruip->rui_format, rui_formatp);
atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
/*
* Insert the intent into the AIL directly and drop one reference so
@@ -711,7 +711,11 @@ xlog_recover_rud_commit_pass2(
struct xfs_rud_log_format *rud_formatp;
rud_formatp = item->ri_buf[0].i_addr;
- ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format));
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ rud_formatp, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
xlog_recover_release_intent(log, XFS_LI_RUI, rud_formatp->rud_rui_id);
return 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f029c6702dda..ee4b429a2f2c 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -2028,18 +2028,14 @@ xfs_init_caches(void)
goto out_destroy_trans_cache;
xfs_efd_cache = kmem_cache_create("xfs_efd_item",
- (sizeof(struct xfs_efd_log_item) +
- (XFS_EFD_MAX_FAST_EXTENTS - 1) *
- sizeof(struct xfs_extent)),
- 0, 0, NULL);
+ xfs_efd_log_item_sizeof(XFS_EFD_MAX_FAST_EXTENTS),
+ 0, 0, NULL);
if (!xfs_efd_cache)
goto out_destroy_buf_item_cache;
xfs_efi_cache = kmem_cache_create("xfs_efi_item",
- (sizeof(struct xfs_efi_log_item) +
- (XFS_EFI_MAX_FAST_EXTENTS - 1) *
- sizeof(struct xfs_extent)),
- 0, 0, NULL);
+ xfs_efi_log_item_sizeof(XFS_EFI_MAX_FAST_EXTENTS),
+ 0, 0, NULL);
if (!xfs_efi_cache)
goto out_destroy_efd_cache;
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index 43585850f154..513095e353a5 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -33,10 +33,15 @@ xfs_sysfs_init(
const char *name)
{
struct kobject *parent;
+ int err;
parent = parent_kobj ? &parent_kobj->kobject : NULL;
init_completion(&kobj->complete);
- return kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name);
+ err = kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name);
+ if (err)
+ kobject_put(&kobj->kobject);
+
+ return err;
}
static inline void
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index cb7c81ba7fa3..372d871bccc5 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -799,6 +799,9 @@ TRACE_DEFINE_ENUM(PE_SIZE_PTE);
TRACE_DEFINE_ENUM(PE_SIZE_PMD);
TRACE_DEFINE_ENUM(PE_SIZE_PUD);
+TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
+TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
+
TRACE_EVENT(xfs_filemap_fault,
TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
bool write_fault),
@@ -2925,6 +2928,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, domain)
__field(xfs_agblock_t, startblock)
__field(xfs_extlen_t, blockcount)
__field(xfs_nlink_t, refcount)
@@ -2932,13 +2936,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
+ __entry->domain = irec->rc_domain;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
__entry->startblock,
__entry->blockcount,
__entry->refcount)
@@ -2958,6 +2964,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, domain)
__field(xfs_agblock_t, startblock)
__field(xfs_extlen_t, blockcount)
__field(xfs_nlink_t, refcount)
@@ -2966,14 +2973,16 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
+ __entry->domain = irec->rc_domain;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
__entry->agbno = agbno;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
__entry->startblock,
__entry->blockcount,
__entry->refcount,
@@ -2994,9 +3003,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, i1_domain)
__field(xfs_agblock_t, i1_startblock)
__field(xfs_extlen_t, i1_blockcount)
__field(xfs_nlink_t, i1_refcount)
+ __field(enum xfs_refc_domain, i2_domain)
__field(xfs_agblock_t, i2_startblock)
__field(xfs_extlen_t, i2_blockcount)
__field(xfs_nlink_t, i2_refcount)
@@ -3004,20 +3015,24 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
+ __entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
__entry->i1_blockcount = i1->rc_blockcount;
__entry->i1_refcount = i1->rc_refcount;
+ __entry->i2_domain = i2->rc_domain;
__entry->i2_startblock = i2->rc_startblock;
__entry->i2_blockcount = i2->rc_blockcount;
__entry->i2_refcount = i2->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- "
- "agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i1_startblock,
__entry->i1_blockcount,
__entry->i1_refcount,
+ __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i2_startblock,
__entry->i2_blockcount,
__entry->i2_refcount)
@@ -3038,9 +3053,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, i1_domain)
__field(xfs_agblock_t, i1_startblock)
__field(xfs_extlen_t, i1_blockcount)
__field(xfs_nlink_t, i1_refcount)
+ __field(enum xfs_refc_domain, i2_domain)
__field(xfs_agblock_t, i2_startblock)
__field(xfs_extlen_t, i2_blockcount)
__field(xfs_nlink_t, i2_refcount)
@@ -3049,21 +3066,25 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
+ __entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
__entry->i1_blockcount = i1->rc_blockcount;
__entry->i1_refcount = i1->rc_refcount;
+ __entry->i2_domain = i2->rc_domain;
__entry->i2_startblock = i2->rc_startblock;
__entry->i2_blockcount = i2->rc_blockcount;
__entry->i2_refcount = i2->rc_refcount;
__entry->agbno = agbno;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- "
- "agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i1_startblock,
__entry->i1_blockcount,
__entry->i1_refcount,
+ __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i2_startblock,
__entry->i2_blockcount,
__entry->i2_refcount,
@@ -3086,12 +3107,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, i1_domain)
__field(xfs_agblock_t, i1_startblock)
__field(xfs_extlen_t, i1_blockcount)
__field(xfs_nlink_t, i1_refcount)
+ __field(enum xfs_refc_domain, i2_domain)
__field(xfs_agblock_t, i2_startblock)
__field(xfs_extlen_t, i2_blockcount)
__field(xfs_nlink_t, i2_refcount)
+ __field(enum xfs_refc_domain, i3_domain)
__field(xfs_agblock_t, i3_startblock)
__field(xfs_extlen_t, i3_blockcount)
__field(xfs_nlink_t, i3_refcount)
@@ -3099,27 +3123,33 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
+ __entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
__entry->i1_blockcount = i1->rc_blockcount;
__entry->i1_refcount = i1->rc_refcount;
+ __entry->i2_domain = i2->rc_domain;
__entry->i2_startblock = i2->rc_startblock;
__entry->i2_blockcount = i2->rc_blockcount;
__entry->i2_refcount = i2->rc_refcount;
+ __entry->i3_domain = i3->rc_domain;
__entry->i3_startblock = i3->rc_startblock;
__entry->i3_blockcount = i3->rc_blockcount;
__entry->i3_refcount = i3->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- "
- "agbno 0x%x fsbcount 0x%x refcount %u -- "
- "agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i1_startblock,
__entry->i1_blockcount,
__entry->i1_refcount,
+ __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i2_startblock,
__entry->i2_blockcount,
__entry->i2_refcount,
+ __print_symbolic(__entry->i3_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i3_startblock,
__entry->i3_blockcount,
__entry->i3_refcount)
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 16fbf2a1144c..f51df7d94ef7 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -730,11 +730,10 @@ void
xfs_ail_push_all_sync(
struct xfs_ail *ailp)
{
- struct xfs_log_item *lip;
DEFINE_WAIT(wait);
spin_lock(&ailp->ail_lock);
- while ((lip = xfs_ail_max(ailp)) != NULL) {
+ while (xfs_ail_max(ailp) != NULL) {
prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE);
wake_up_process(ailp->ail_task);
spin_unlock(&ailp->ail_lock);
diff --git a/include/asm-generic/compat.h b/include/asm-generic/compat.h
index aeb257ad3d1a..8392caea398f 100644
--- a/include/asm-generic/compat.h
+++ b/include/asm-generic/compat.h
@@ -15,7 +15,7 @@
#endif
#ifndef compat_arg_u64
-#ifdef CONFIG_CPU_BIG_ENDIAN
+#ifndef CONFIG_CPU_BIG_ENDIAN
#define compat_arg_u64(name) u32 name##_lo, u32 name##_hi
#define compat_arg_u64_dual(name) u32, name##_lo, u32, name##_hi
#else
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index c0b868ce6a8f..628775334d5e 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -11,7 +11,6 @@
#include <asm/perf_event.h>
#define ARMV8_PMU_CYCLE_IDX (ARMV8_PMU_MAX_COUNTERS - 1)
-#define ARMV8_PMU_MAX_COUNTER_PAIRS ((ARMV8_PMU_MAX_COUNTERS + 1) >> 1)
#ifdef CONFIG_HW_PERF_EVENTS
@@ -29,7 +28,6 @@ struct kvm_pmu {
struct irq_work overflow_work;
struct kvm_pmu_events events;
struct kvm_pmc pmc[ARMV8_PMU_MAX_COUNTERS];
- DECLARE_BITMAP(chained, ARMV8_PMU_MAX_COUNTER_PAIRS);
int irq_num;
bool created;
bool irq_level;
@@ -91,6 +89,14 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu);
vcpu->arch.pmu.events = *kvm_get_pmu_events(); \
} while (0)
+/*
+ * Evaluates as true when emulating PMUv3p5, and false otherwise.
+ */
+#define kvm_pmu_is_3p5(vcpu) \
+ (vcpu->kvm->arch.dfr0_pmuver.imp >= ID_AA64DFR0_EL1_PMUVer_V3P5)
+
+u8 kvm_arm_pmu_get_pmuver_limit(void);
+
#else
struct kvm_pmu {
};
@@ -153,9 +159,14 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
}
#define kvm_vcpu_has_pmu(vcpu) ({ false; })
+#define kvm_pmu_is_3p5(vcpu) ({ false; })
static inline void kvm_pmu_update_vcpu_events(struct kvm_vcpu *vcpu) {}
static inline void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) {}
static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {}
+static inline u8 kvm_arm_pmu_get_pmuver_limit(void)
+{
+ return 0;
+}
#endif
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 4df9e73a8bb5..9270cd87da3f 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -263,6 +263,7 @@ struct vgic_dist {
struct vgic_io_device dist_iodev;
bool has_its;
+ bool save_its_tables_in_progress;
/*
* Contains the attributes and gpa of the LPI configuration table.
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 80f3c1c7827d..929d559ad41d 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1222,7 +1222,7 @@ efi_status_t efi_random_get_seed(void);
arch_efi_call_virt_teardown(); \
})
-#define EFI_RANDOM_SEED_SIZE 64U
+#define EFI_RANDOM_SEED_SIZE 32U // BLAKE2S_HASH_SIZE
struct linux_efi_random_seed {
u32 size;
diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index 18a31b125f9d..1067a8450826 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -454,13 +454,18 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size,
#define __fortify_memcpy_chk(p, q, size, p_size, q_size, \
p_size_field, q_size_field, op) ({ \
- size_t __fortify_size = (size_t)(size); \
- WARN_ONCE(fortify_memcpy_chk(__fortify_size, p_size, q_size, \
- p_size_field, q_size_field, #op), \
+ const size_t __fortify_size = (size_t)(size); \
+ const size_t __p_size = (p_size); \
+ const size_t __q_size = (q_size); \
+ const size_t __p_size_field = (p_size_field); \
+ const size_t __q_size_field = (q_size_field); \
+ WARN_ONCE(fortify_memcpy_chk(__fortify_size, __p_size, \
+ __q_size, __p_size_field, \
+ __q_size_field, #op), \
#op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \
__fortify_size, \
"field \"" #p "\" at " __FILE__ ":" __stringify(__LINE__), \
- p_size_field); \
+ __p_size_field); \
__underlying_##op(p, q, __fortify_size); \
})
diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h
index eee1877a354e..859f4b0c1b2b 100644
--- a/include/linux/kernel-page-flags.h
+++ b/include/linux/kernel-page-flags.h
@@ -18,5 +18,6 @@
#define KPF_UNCACHED 39
#define KPF_SOFTDIRTY 40
#define KPF_ARCH_2 41
+#define KPF_ARCH_3 42
#endif /* LINUX_KERNEL_PAGE_FLAGS_H */
diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h
index 906f899813dc..4862c98d80d3 100644
--- a/include/linux/kvm_dirty_ring.h
+++ b/include/linux/kvm_dirty_ring.h
@@ -37,6 +37,11 @@ static inline u32 kvm_dirty_ring_get_rsvd_entries(void)
return 0;
}
+static inline bool kvm_use_dirty_bitmap(struct kvm *kvm)
+{
+ return true;
+}
+
static inline int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring,
int index, u32 size)
{
@@ -49,7 +54,7 @@ static inline int kvm_dirty_ring_reset(struct kvm *kvm,
return 0;
}
-static inline void kvm_dirty_ring_push(struct kvm_dirty_ring *ring,
+static inline void kvm_dirty_ring_push(struct kvm_vcpu *vcpu,
u32 slot, u64 offset)
{
}
@@ -64,13 +69,11 @@ static inline void kvm_dirty_ring_free(struct kvm_dirty_ring *ring)
{
}
-static inline bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring)
-{
- return true;
-}
-
#else /* CONFIG_HAVE_KVM_DIRTY_RING */
+int kvm_cpu_dirty_log_size(void);
+bool kvm_use_dirty_bitmap(struct kvm *kvm);
+bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm);
u32 kvm_dirty_ring_get_rsvd_entries(void);
int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size);
@@ -84,13 +87,14 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring);
* returns =0: successfully pushed
* <0: unable to push, need to wait
*/
-void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset);
+void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset);
+
+bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu);
/* for use in vm_operations_struct */
struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset);
void kvm_dirty_ring_free(struct kvm_dirty_ring *ring);
-bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring);
#endif /* CONFIG_HAVE_KVM_DIRTY_RING */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 00c3448ba7f8..f51eb9419bfc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -153,10 +153,11 @@ static inline bool is_error_page(struct page *page)
* Architecture-independent vcpu->requests bit members
* Bits 3-7 are reserved for more arch-independent bits.
*/
-#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_VM_DEAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_UNBLOCK 2
-#define KVM_REQUEST_ARCH_BASE 8
+#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_VM_DEAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_UNBLOCK 2
+#define KVM_REQ_DIRTY_RING_SOFT_FULL 3
+#define KVM_REQUEST_ARCH_BASE 8
/*
* KVM_REQ_OUTSIDE_GUEST_MODE exists is purely as way to force the vCPU to
@@ -778,6 +779,7 @@ struct kvm {
pid_t userspace_pid;
unsigned int max_halt_poll_ns;
u32 dirty_ring_size;
+ bool dirty_ring_with_bitmap;
bool vm_bugged;
bool vm_dead;
@@ -1240,8 +1242,18 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
/**
- * kvm_gfn_to_pfn_cache_init - prepare a cached kernel mapping and HPA for a
- * given guest physical address.
+ * kvm_gpc_init - initialize gfn_to_pfn_cache.
+ *
+ * @gpc: struct gfn_to_pfn_cache object.
+ *
+ * This sets up a gfn_to_pfn_cache by initializing locks. Note, the cache must
+ * be zero-allocated (or zeroed by the caller before init).
+ */
+void kvm_gpc_init(struct gfn_to_pfn_cache *gpc);
+
+/**
+ * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest
+ * physical address.
*
* @kvm: pointer to kvm instance.
* @gpc: struct gfn_to_pfn_cache object.
@@ -1265,9 +1277,9 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
* kvm_gfn_to_pfn_cache_check() to ensure that the cache is valid before
* accessing the target page.
*/
-int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
- struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
- gpa_t gpa, unsigned long len);
+int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+ struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
+ gpa_t gpa, unsigned long len);
/**
* kvm_gfn_to_pfn_cache_check - check validity of a gfn_to_pfn_cache.
@@ -1324,7 +1336,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
/**
- * kvm_gfn_to_pfn_cache_destroy - destroy and unlink a gfn_to_pfn_cache.
+ * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache.
*
* @kvm: pointer to kvm instance.
* @gpc: struct gfn_to_pfn_cache object.
@@ -1332,7 +1344,7 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
* This removes a cache from the @kvm's list to be processed on MMU notifier
* invocation.
*/
-void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
+void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
void kvm_sigset_activate(struct kvm_vcpu *vcpu);
void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 0b0ae5084e60..c50ce2812f17 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -132,8 +132,9 @@ enum pageflags {
PG_young,
PG_idle,
#endif
-#ifdef CONFIG_64BIT
+#ifdef CONFIG_ARCH_USES_PG_ARCH_X
PG_arch_2,
+ PG_arch_3,
#endif
#ifdef CONFIG_KASAN_HW_TAGS
PG_skip_kasan_poison,
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 4418b1981e31..6bfa972f2fbf 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -181,6 +181,8 @@ enum {
NLA_S64,
NLA_BITFIELD32,
NLA_REJECT,
+ NLA_BE16,
+ NLA_BE32,
__NLA_TYPE_MAX,
};
@@ -231,6 +233,7 @@ enum nla_policy_validation {
* NLA_U32, NLA_U64,
* NLA_S8, NLA_S16,
* NLA_S32, NLA_S64,
+ * NLA_BE16, NLA_BE32,
* NLA_MSECS Leaving the length field zero will verify the
* given type fits, using it verifies minimum length
* just like "All other"
@@ -261,6 +264,8 @@ enum nla_policy_validation {
* NLA_U16,
* NLA_U32,
* NLA_U64,
+ * NLA_BE16,
+ * NLA_BE32,
* NLA_S8,
* NLA_S16,
* NLA_S32,
@@ -317,19 +322,10 @@ struct nla_policy {
u8 validation_type;
u16 len;
union {
- const u32 bitfield32_valid;
- const u32 mask;
- const char *reject_message;
- const struct nla_policy *nested_policy;
- struct netlink_range_validation *range;
- struct netlink_range_validation_signed *range_signed;
- struct {
- s16 min, max;
- u8 network_byte_order:1;
- };
- int (*validate)(const struct nlattr *attr,
- struct netlink_ext_ack *extack);
- /* This entry is special, and used for the attribute at index 0
+ /**
+ * @strict_start_type: first attribute to validate strictly
+ *
+ * This entry is special, and used for the attribute at index 0
* only, and specifies special data about the policy, namely it
* specifies the "boundary type" where strict length validation
* starts for any attribute types >= this value, also, strict
@@ -348,6 +344,19 @@ struct nla_policy {
* was added to enforce strict validation from thereon.
*/
u16 strict_start_type;
+
+ /* private: use NLA_POLICY_*() to set */
+ const u32 bitfield32_valid;
+ const u32 mask;
+ const char *reject_message;
+ const struct nla_policy *nested_policy;
+ struct netlink_range_validation *range;
+ struct netlink_range_validation_signed *range_signed;
+ struct {
+ s16 min, max;
+ };
+ int (*validate)(const struct nlattr *attr,
+ struct netlink_ext_ack *extack);
};
};
@@ -369,6 +378,8 @@ struct nla_policy {
(tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || tp == NLA_U64)
#define __NLA_IS_SINT_TYPE(tp) \
(tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64)
+#define __NLA_IS_BEINT_TYPE(tp) \
+ (tp == NLA_BE16 || tp == NLA_BE32)
#define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition))
#define NLA_ENSURE_UINT_TYPE(tp) \
@@ -382,6 +393,7 @@ struct nla_policy {
#define NLA_ENSURE_INT_OR_BINARY_TYPE(tp) \
(__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \
__NLA_IS_SINT_TYPE(tp) || \
+ __NLA_IS_BEINT_TYPE(tp) || \
tp == NLA_MSECS || \
tp == NLA_BINARY) + tp)
#define NLA_ENSURE_NO_VALIDATION_PTR(tp) \
@@ -389,6 +401,8 @@ struct nla_policy {
tp != NLA_REJECT && \
tp != NLA_NESTED && \
tp != NLA_NESTED_ARRAY) + tp)
+#define NLA_ENSURE_BEINT_TYPE(tp) \
+ (__NLA_ENSURE(__NLA_IS_BEINT_TYPE(tp)) + tp)
#define NLA_POLICY_RANGE(tp, _min, _max) { \
.type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \
@@ -419,14 +433,6 @@ struct nla_policy {
.type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \
.validation_type = NLA_VALIDATE_MAX, \
.max = _max, \
- .network_byte_order = 0, \
-}
-
-#define NLA_POLICY_MAX_BE(tp, _max) { \
- .type = NLA_ENSURE_UINT_TYPE(tp), \
- .validation_type = NLA_VALIDATE_MAX, \
- .max = _max, \
- .network_byte_order = 1, \
}
#define NLA_POLICY_MASK(tp, _mask) { \
diff --git a/include/net/sock.h b/include/net/sock.h
index 22f8bab583dd..5db02546941c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1889,6 +1889,13 @@ void sock_kfree_s(struct sock *sk, void *mem, int size);
void sock_kzfree_s(struct sock *sk, void *mem, int size);
void sk_send_sigurg(struct sock *sk);
+static inline void sock_replace_proto(struct sock *sk, struct proto *proto)
+{
+ if (sk->sk_socket)
+ clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+ WRITE_ONCE(sk->sk_prot, proto);
+}
+
struct sockcm_cookie {
u64 transmit_time;
u32 mark;
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index e87cb2b80ed3..412b5a46374c 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -91,10 +91,10 @@
#define IF_HAVE_PG_IDLE(flag,string)
#endif
-#ifdef CONFIG_64BIT
-#define IF_HAVE_PG_ARCH_2(flag,string) ,{1UL << flag, string}
+#ifdef CONFIG_ARCH_USES_PG_ARCH_X
+#define IF_HAVE_PG_ARCH_X(flag,string) ,{1UL << flag, string}
#else
-#define IF_HAVE_PG_ARCH_2(flag,string)
+#define IF_HAVE_PG_ARCH_X(flag,string)
#endif
#ifdef CONFIG_KASAN_HW_TAGS
@@ -130,7 +130,8 @@ IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \
IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \
IF_HAVE_PG_IDLE(PG_young, "young" ) \
IF_HAVE_PG_IDLE(PG_idle, "idle" ) \
-IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) \
+IF_HAVE_PG_ARCH_X(PG_arch_2, "arch_2" ) \
+IF_HAVE_PG_ARCH_X(PG_arch_3, "arch_3" ) \
IF_HAVE_PG_SKIP_KASAN_POISON(PG_skip_kasan_poison, "skip_kasan_poison")
#define show_page_flags(flags) \
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 0d5d4419139a..c87b5882d7ae 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1178,6 +1178,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_S390_ZPCI_OP 221
#define KVM_CAP_S390_CPU_TOPOLOGY 222
#define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223
+#define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 224
#ifdef KVM_CAP_IRQ_ROUTING
diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c
index 5ced822df788..c57610f52bb4 100644
--- a/kernel/events/hw_breakpoint_test.c
+++ b/kernel/events/hw_breakpoint_test.c
@@ -295,11 +295,11 @@ static int test_init(struct kunit *test)
{
/* Most test cases want 2 distinct CPUs. */
if (num_online_cpus() < 2)
- return -EINVAL;
+ kunit_skip(test, "not enough cpus");
/* Want the system to not use breakpoints elsewhere. */
if (hw_breakpoint_is_used())
- return -EBUSY;
+ kunit_skip(test, "hw breakpoint already in use");
return 0;
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3220b0a2fb4a..cd9f5a66a690 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2429,8 +2429,11 @@ int enable_kprobe(struct kprobe *kp)
if (!kprobes_all_disarmed && kprobe_disabled(p)) {
p->flags &= ~KPROBE_FLAG_DISABLED;
ret = arm_kprobe(p);
- if (ret)
+ if (ret) {
p->flags |= KPROBE_FLAG_DISABLED;
+ if (p != kp)
+ kp->flags |= KPROBE_FLAG_DISABLED;
+ }
}
out:
mutex_unlock(&kprobe_mutex);
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index aac63ca9c3d1..e8143e368074 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -141,6 +141,8 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
return -E2BIG;
fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler);
+ if (!fp->rethook)
+ return -ENOMEM;
for (i = 0; i < size; i++) {
struct fprobe_rethook_node *node;
@@ -301,7 +303,8 @@ int unregister_fprobe(struct fprobe *fp)
{
int ret;
- if (!fp || fp->ops.func != fprobe_handler)
+ if (!fp || (fp->ops.saved_func != fprobe_handler &&
+ fp->ops.saved_func != fprobe_kprobe_handler))
return -EINVAL;
/*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fbf2543111c0..7dc023641bf1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3028,18 +3028,8 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
command |= FTRACE_UPDATE_TRACE_FUNC;
}
- if (!command || !ftrace_enabled) {
- /*
- * If these are dynamic or per_cpu ops, they still
- * need their data freed. Since, function tracing is
- * not currently active, we can just free them
- * without synchronizing all CPUs.
- */
- if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
- goto free_ops;
-
- return 0;
- }
+ if (!command || !ftrace_enabled)
+ goto out;
/*
* If the ops uses a trampoline, then it needs to be
@@ -3076,6 +3066,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
removed_ops = NULL;
ops->flags &= ~FTRACE_OPS_FL_REMOVING;
+out:
/*
* Dynamic ops may be freed, we must make sure that all
* callers are done before leaving this function.
@@ -3103,7 +3094,6 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (IS_ENABLED(CONFIG_PREEMPTION))
synchronize_rcu_tasks();
- free_ops:
ftrace_trampoline_free(ops);
}
diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c
index 80e04a1e1977..d81f7c51025c 100644
--- a/kernel/trace/kprobe_event_gen_test.c
+++ b/kernel/trace/kprobe_event_gen_test.c
@@ -100,20 +100,20 @@ static int __init test_gen_kprobe_cmd(void)
KPROBE_GEN_TEST_FUNC,
KPROBE_GEN_TEST_ARG0, KPROBE_GEN_TEST_ARG1);
if (ret)
- goto free;
+ goto out;
/* Use kprobe_event_add_fields to add the rest of the fields */
ret = kprobe_event_add_fields(&cmd, KPROBE_GEN_TEST_ARG2, KPROBE_GEN_TEST_ARG3);
if (ret)
- goto free;
+ goto out;
/*
* This actually creates the event.
*/
ret = kprobe_event_gen_cmd_end(&cmd);
if (ret)
- goto free;
+ goto out;
/*
* Now get the gen_kprobe_test event file. We need to prevent
@@ -136,13 +136,11 @@ static int __init test_gen_kprobe_cmd(void)
goto delete;
}
out:
+ kfree(buf);
return ret;
delete:
/* We got an error after creating the event, delete it */
ret = kprobe_event_delete("gen_kprobe_test");
- free:
- kfree(buf);
-
goto out;
}
@@ -170,14 +168,14 @@ static int __init test_gen_kretprobe_cmd(void)
KPROBE_GEN_TEST_FUNC,
"$retval");
if (ret)
- goto free;
+ goto out;
/*
* This actually creates the event.
*/
ret = kretprobe_event_gen_cmd_end(&cmd);
if (ret)
- goto free;
+ goto out;
/*
* Now get the gen_kretprobe_test event file. We need to
@@ -201,13 +199,11 @@ static int __init test_gen_kretprobe_cmd(void)
goto delete;
}
out:
+ kfree(buf);
return ret;
delete:
/* We got an error after creating the event, delete it */
ret = kprobe_event_delete("gen_kretprobe_test");
- free:
- kfree(buf);
-
goto out;
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 199759c73519..9712083832f4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -937,6 +937,9 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
struct ring_buffer_per_cpu *cpu_buffer;
struct rb_irq_work *rbwork;
+ if (!buffer)
+ return;
+
if (cpu == RING_BUFFER_ALL_CPUS) {
/* Wake up individual ones too. One level recursion */
@@ -945,7 +948,15 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
rbwork = &buffer->irq_work;
} else {
+ if (WARN_ON_ONCE(!buffer->buffers))
+ return;
+ if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ return;
+
cpu_buffer = buffer->buffers[cpu];
+ /* The CPU buffer may not have been initialized yet */
+ if (!cpu_buffer)
+ return;
rbwork = &cpu_buffer->irq_work;
}
diff --git a/lib/nlattr.c b/lib/nlattr.c
index 40f22b177d69..b67a53e29b8f 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -124,10 +124,12 @@ void nla_get_range_unsigned(const struct nla_policy *pt,
range->max = U8_MAX;
break;
case NLA_U16:
+ case NLA_BE16:
case NLA_BINARY:
range->max = U16_MAX;
break;
case NLA_U32:
+ case NLA_BE32:
range->max = U32_MAX;
break;
case NLA_U64:
@@ -159,31 +161,6 @@ void nla_get_range_unsigned(const struct nla_policy *pt,
}
}
-static u64 nla_get_attr_bo(const struct nla_policy *pt,
- const struct nlattr *nla)
-{
- switch (pt->type) {
- case NLA_U16:
- if (pt->network_byte_order)
- return ntohs(nla_get_be16(nla));
-
- return nla_get_u16(nla);
- case NLA_U32:
- if (pt->network_byte_order)
- return ntohl(nla_get_be32(nla));
-
- return nla_get_u32(nla);
- case NLA_U64:
- if (pt->network_byte_order)
- return be64_to_cpu(nla_get_be64(nla));
-
- return nla_get_u64(nla);
- }
-
- WARN_ON_ONCE(1);
- return 0;
-}
-
static int nla_validate_range_unsigned(const struct nla_policy *pt,
const struct nlattr *nla,
struct netlink_ext_ack *extack,
@@ -197,9 +174,13 @@ static int nla_validate_range_unsigned(const struct nla_policy *pt,
value = nla_get_u8(nla);
break;
case NLA_U16:
+ value = nla_get_u16(nla);
+ break;
case NLA_U32:
+ value = nla_get_u32(nla);
+ break;
case NLA_U64:
- value = nla_get_attr_bo(pt, nla);
+ value = nla_get_u64(nla);
break;
case NLA_MSECS:
value = nla_get_u64(nla);
@@ -207,6 +188,12 @@ static int nla_validate_range_unsigned(const struct nla_policy *pt,
case NLA_BINARY:
value = nla_len(nla);
break;
+ case NLA_BE16:
+ value = ntohs(nla_get_be16(nla));
+ break;
+ case NLA_BE32:
+ value = ntohl(nla_get_be32(nla));
+ break;
default:
return -EINVAL;
}
@@ -334,6 +321,8 @@ static int nla_validate_int_range(const struct nla_policy *pt,
case NLA_U64:
case NLA_MSECS:
case NLA_BINARY:
+ case NLA_BE16:
+ case NLA_BE32:
return nla_validate_range_unsigned(pt, nla, extack, validate);
case NLA_S8:
case NLA_S16:
diff --git a/mm/Kconfig b/mm/Kconfig
index 57e1d8c5b505..807bd7192f51 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1005,6 +1005,14 @@ config ARCH_USES_HIGH_VMA_FLAGS
config ARCH_HAS_PKEYS
bool
+config ARCH_USES_PG_ARCH_X
+ bool
+ help
+ Enable the definition of PG_arch_x page flags with x > 1. Only
+ suitable for 64-bit architectures with CONFIG_FLATMEM or
+ CONFIG_SPARSEMEM_VMEMMAP enabled, otherwise there may not be
+ enough room for additional bits in page->flags.
+
config VM_EVENT_COUNTERS
default y
bool "Enable VM event counters for /proc/vmstat" if EXPERT
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 561a42567477..dfe72ea23c5f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2444,8 +2444,9 @@ static void __split_huge_page_tail(struct page *head, int tail,
(1L << PG_workingset) |
(1L << PG_locked) |
(1L << PG_unevictable) |
-#ifdef CONFIG_64BIT
+#ifdef CONFIG_ARCH_USES_PG_ARCH_X
(1L << PG_arch_2) |
+ (1L << PG_arch_3) |
#endif
(1L << PG_dirty) |
LRU_GEN_MASK | LRU_REFS_MASK));
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 7a59c4487050..a6c12863a253 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1067,10 +1067,21 @@ int hci_conn_del(struct hci_conn *conn)
hdev->acl_cnt += conn->sent;
} else {
struct hci_conn *acl = conn->link;
+
if (acl) {
acl->link = NULL;
hci_conn_drop(acl);
}
+
+ /* Unacked ISO frames */
+ if (conn->type == ISO_LINK) {
+ if (hdev->iso_pkts)
+ hdev->iso_cnt += conn->sent;
+ else if (hdev->le_pkts)
+ hdev->le_cnt += conn->sent;
+ else
+ hdev->acl_cnt += conn->sent;
+ }
}
if (conn->amp_mgr)
@@ -1761,6 +1772,7 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
if (!cis)
return ERR_PTR(-ENOMEM);
cis->cleanup = cis_cleanup;
+ cis->dst_type = dst_type;
}
if (cis->state == BT_CONNECTED)
@@ -2140,12 +2152,6 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
struct hci_conn *le;
struct hci_conn *cis;
- /* Convert from ISO socket address type to HCI address type */
- if (dst_type == BDADDR_LE_PUBLIC)
- dst_type = ADDR_LE_DEV_PUBLIC;
- else
- dst_type = ADDR_LE_DEV_RANDOM;
-
if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
le = hci_connect_le(hdev, dst, dst_type, false,
BT_SECURITY_LOW,
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 613039ba5dbf..f825857db6d0 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -235,6 +235,14 @@ static int iso_chan_add(struct iso_conn *conn, struct sock *sk,
return err;
}
+static inline u8 le_addr_type(u8 bdaddr_type)
+{
+ if (bdaddr_type == BDADDR_LE_PUBLIC)
+ return ADDR_LE_DEV_PUBLIC;
+ else
+ return ADDR_LE_DEV_RANDOM;
+}
+
static int iso_connect_bis(struct sock *sk)
{
struct iso_conn *conn;
@@ -328,14 +336,16 @@ static int iso_connect_cis(struct sock *sk)
/* Just bind if DEFER_SETUP has been set */
if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst,
- iso_pi(sk)->dst_type, &iso_pi(sk)->qos);
+ le_addr_type(iso_pi(sk)->dst_type),
+ &iso_pi(sk)->qos);
if (IS_ERR(hcon)) {
err = PTR_ERR(hcon);
goto done;
}
} else {
hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst,
- iso_pi(sk)->dst_type, &iso_pi(sk)->qos);
+ le_addr_type(iso_pi(sk)->dst_type),
+ &iso_pi(sk)->qos);
if (IS_ERR(hcon)) {
err = PTR_ERR(hcon);
goto done;
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 1f34b82ca0ec..9c24947aa41e 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -1990,7 +1990,7 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm,
if (link_type == LE_LINK && c->src_type == BDADDR_BREDR)
continue;
- if (c->psm == psm) {
+ if (c->chan_type != L2CAP_CHAN_FIXED && c->psm == psm) {
int src_match, dst_match;
int src_any, dst_any;
@@ -3764,7 +3764,8 @@ done:
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
sizeof(rfc), (unsigned long) &rfc, endptr - ptr);
- if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) {
+ if (remote_efs &&
+ test_bit(FLAG_EFS_ENABLE, &chan->flags)) {
chan->remote_id = efs.id;
chan->remote_stype = efs.stype;
chan->remote_msdu = le16_to_cpu(efs.msdu);
@@ -5813,6 +5814,19 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
BT_DBG("psm 0x%2.2x scid 0x%4.4x mtu %u mps %u", __le16_to_cpu(psm),
scid, mtu, mps);
+ /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 3, Part A
+ * page 1059:
+ *
+ * Valid range: 0x0001-0x00ff
+ *
+ * Table 4.15: L2CAP_LE_CREDIT_BASED_CONNECTION_REQ SPSM ranges
+ */
+ if (!psm || __le16_to_cpu(psm) > L2CAP_PSM_LE_DYN_END) {
+ result = L2CAP_CR_LE_BAD_PSM;
+ chan = NULL;
+ goto response;
+ }
+
/* Check if we have socket listening on psm */
pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src,
&conn->hcon->dst, LE_LINK);
@@ -6001,6 +6015,18 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn,
psm = req->psm;
+ /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 3, Part A
+ * page 1059:
+ *
+ * Valid range: 0x0001-0x00ff
+ *
+ * Table 4.15: L2CAP_LE_CREDIT_BASED_CONNECTION_REQ SPSM ranges
+ */
+ if (!psm || __le16_to_cpu(psm) > L2CAP_PSM_LE_DYN_END) {
+ result = L2CAP_CR_LE_BAD_PSM;
+ goto response;
+ }
+
BT_DBG("psm 0x%2.2x mtu %u mps %u", __le16_to_cpu(psm), mtu, mps);
memset(&pdu, 0, sizeof(pdu));
@@ -6885,6 +6911,7 @@ static int l2cap_rx_state_recv(struct l2cap_chan *chan,
struct l2cap_ctrl *control,
struct sk_buff *skb, u8 event)
{
+ struct l2cap_ctrl local_control;
int err = 0;
bool skb_in_use = false;
@@ -6909,15 +6936,32 @@ static int l2cap_rx_state_recv(struct l2cap_chan *chan,
chan->buffer_seq = chan->expected_tx_seq;
skb_in_use = true;
+ /* l2cap_reassemble_sdu may free skb, hence invalidate
+ * control, so make a copy in advance to use it after
+ * l2cap_reassemble_sdu returns and to avoid the race
+ * condition, for example:
+ *
+ * The current thread calls:
+ * l2cap_reassemble_sdu
+ * chan->ops->recv == l2cap_sock_recv_cb
+ * __sock_queue_rcv_skb
+ * Another thread calls:
+ * bt_sock_recvmsg
+ * skb_recv_datagram
+ * skb_free_datagram
+ * Then the current thread tries to access control, but
+ * it was freed by skb_free_datagram.
+ */
+ local_control = *control;
err = l2cap_reassemble_sdu(chan, skb, control);
if (err)
break;
- if (control->final) {
+ if (local_control.final) {
if (!test_and_clear_bit(CONN_REJ_ACT,
&chan->conn_state)) {
- control->final = 0;
- l2cap_retransmit_all(chan, control);
+ local_control.final = 0;
+ l2cap_retransmit_all(chan, &local_control);
l2cap_ertm_send(chan);
}
}
@@ -7297,11 +7341,27 @@ static int l2cap_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
struct sk_buff *skb)
{
+ /* l2cap_reassemble_sdu may free skb, hence invalidate control, so store
+ * the txseq field in advance to use it after l2cap_reassemble_sdu
+ * returns and to avoid the race condition, for example:
+ *
+ * The current thread calls:
+ * l2cap_reassemble_sdu
+ * chan->ops->recv == l2cap_sock_recv_cb
+ * __sock_queue_rcv_skb
+ * Another thread calls:
+ * bt_sock_recvmsg
+ * skb_recv_datagram
+ * skb_free_datagram
+ * Then the current thread tries to access control, but it was freed by
+ * skb_free_datagram.
+ */
+ u16 txseq = control->txseq;
+
BT_DBG("chan %p, control %p, skb %p, state %d", chan, control, skb,
chan->rx_state);
- if (l2cap_classify_txseq(chan, control->txseq) ==
- L2CAP_TXSEQ_EXPECTED) {
+ if (l2cap_classify_txseq(chan, txseq) == L2CAP_TXSEQ_EXPECTED) {
l2cap_pass_to_tx(chan, control);
BT_DBG("buffer_seq %u->%u", chan->buffer_seq,
@@ -7324,8 +7384,8 @@ static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
}
}
- chan->last_acked_seq = control->txseq;
- chan->expected_tx_seq = __next_seq(chan, control->txseq);
+ chan->last_acked_seq = txseq;
+ chan->expected_tx_seq = __next_seq(chan, txseq);
return 0;
}
@@ -7581,6 +7641,7 @@ static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid,
return;
}
+ l2cap_chan_hold(chan);
l2cap_chan_lock(chan);
} else {
BT_DBG("unknown cid 0x%4.4x", cid);
@@ -8426,9 +8487,8 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
* expected length.
*/
if (skb->len < L2CAP_LEN_SIZE) {
- if (l2cap_recv_frag(conn, skb, conn->mtu) < 0)
- goto drop;
- return;
+ l2cap_recv_frag(conn, skb, conn->mtu);
+ break;
}
len = get_unaligned_le16(skb->data) + L2CAP_HDR_SIZE;
@@ -8472,7 +8532,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
/* Header still could not be read just continue */
if (conn->rx_skb->len < L2CAP_LEN_SIZE)
- return;
+ break;
}
if (skb->len > conn->rx_len) {
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 5aeb3646e74c..d087fd4c784a 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1332,7 +1332,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
if (data[IFLA_BR_FDB_FLUSH]) {
struct net_bridge_fdb_flush_desc desc = {
- .flags_mask = BR_FDB_STATIC
+ .flags_mask = BIT(BR_FDB_STATIC)
};
br_fdb_flush(br, &desc);
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 612e367fff20..ea733542244c 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -345,7 +345,7 @@ static int set_flush(struct net_bridge *br, unsigned long val,
struct netlink_ext_ack *extack)
{
struct net_bridge_fdb_flush_desc desc = {
- .flags_mask = BR_FDB_STATIC
+ .flags_mask = BIT(BR_FDB_STATIC)
};
br_fdb_flush(br, &desc);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 3c4786b99907..a77a85e357e0 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -409,7 +409,7 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
write_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev, skip_perm);
pneigh_ifdown_and_unlock(tbl, dev);
- pneigh_queue_purge(&tbl->proxy_queue, dev_net(dev));
+ pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL);
if (skb_queue_empty_lockless(&tbl->proxy_queue))
del_timer_sync(&tbl->proxy_timer);
return 0;
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index af0e2c0394ac..e504a18fc125 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -1409,9 +1409,9 @@ static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp,
static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master,
const char *user_protocol)
{
+ const struct dsa_device_ops *tag_ops = NULL;
struct dsa_switch *ds = dp->ds;
struct dsa_switch_tree *dst = ds->dst;
- const struct dsa_device_ops *tag_ops;
enum dsa_tag_protocol default_proto;
/* Find out which protocol the switch would prefer. */
@@ -1434,10 +1434,17 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master,
}
tag_ops = dsa_find_tagger_by_name(user_protocol);
- } else {
- tag_ops = dsa_tag_driver_get(default_proto);
+ if (IS_ERR(tag_ops)) {
+ dev_warn(ds->dev,
+ "Failed to find a tagging driver for protocol %s, using default\n",
+ user_protocol);
+ tag_ops = NULL;
+ }
}
+ if (!tag_ops)
+ tag_ops = dsa_tag_driver_get(default_proto);
+
if (IS_ERR(tag_ops)) {
if (PTR_ERR(tag_ops) == -ENOPROTOOPT)
return -EPROBE_DEFER;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 3dd02396517d..4728087c42a5 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -754,6 +754,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags,
(TCPF_ESTABLISHED | TCPF_SYN_RECV |
TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+ if (test_bit(SOCK_SUPPORT_ZC, &sock->flags))
+ set_bit(SOCK_SUPPORT_ZC, &newsock->flags);
sock_graft(sk2, newsock);
newsock->state = SS_CONNECTED;
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index a1626afe87a1..c501c329b1db 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -607,7 +607,7 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
} else {
sk->sk_write_space = psock->saved_write_space;
/* Pairs with lockless read in sk_clone_lock() */
- WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+ sock_replace_proto(sk, psock->sk_proto);
}
return 0;
}
@@ -620,7 +620,7 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
}
/* Pairs with lockless read in sk_clone_lock() */
- WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]);
+ sock_replace_proto(sk, &tcp_bpf_prots[family][config]);
return 0;
}
EXPORT_SYMBOL_GPL(tcp_bpf_update_proto);
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 7c27aa629af1..9ae50b1bd844 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -136,6 +136,9 @@ static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops)
if (icsk->icsk_ulp_ops)
goto out_err;
+ if (sk->sk_socket)
+ clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+
err = ulp_ops->init(sk);
if (err)
goto out_err;
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index ff15918b7bdc..e5dc91d0e079 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -141,14 +141,14 @@ int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
if (restore) {
sk->sk_write_space = psock->saved_write_space;
- WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+ sock_replace_proto(sk, psock->sk_proto);
return 0;
}
if (sk->sk_family == AF_INET6)
udp_bpf_check_v6_needs_rebuild(psock->sk_proto);
- WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]);
+ sock_replace_proto(sk, &udp_bpf_prots[family]);
return 0;
}
EXPORT_SYMBOL_GPL(udp_bpf_update_proto);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 69252eb462b2..2f355f0ec32a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -6555,10 +6555,16 @@ static void __net_exit ip6_route_net_exit(struct net *net)
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
- proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
- sizeof(struct ipv6_route_iter));
- proc_create_net_single("rt6_stats", 0444, net->proc_net,
- rt6_stats_seq_show, NULL);
+ if (!proc_create_net("ipv6_route", 0, net->proc_net,
+ &ipv6_route_seq_ops,
+ sizeof(struct ipv6_route_iter)))
+ return -ENOMEM;
+
+ if (!proc_create_net_single("rt6_stats", 0444, net->proc_net,
+ rt6_stats_seq_show, NULL)) {
+ remove_proc_entry("ipv6_route", net->proc_net);
+ return -ENOMEM;
+ }
#endif
return 0;
}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 129ec5a9b0eb..bc65e5b7195b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -66,6 +66,7 @@ int udpv6_init_sock(struct sock *sk)
{
skb_queue_head_init(&udp_sk(sk)->reader_queue);
sk->sk_destruct = udpv6_destruct_sock;
+ set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
return 0;
}
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 6e391308431d..3adc291d9ce1 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -42,31 +42,8 @@
#define AHASH_MAX_SIZE (6 * AHASH_INIT_SIZE)
/* Max muber of elements in the array block when tuned */
#define AHASH_MAX_TUNED 64
-
#define AHASH_MAX(h) ((h)->bucketsize)
-/* Max number of elements can be tuned */
-#ifdef IP_SET_HASH_WITH_MULTI
-static u8
-tune_bucketsize(u8 curr, u32 multi)
-{
- u32 n;
-
- if (multi < curr)
- return curr;
-
- n = curr + AHASH_INIT_SIZE;
- /* Currently, at listing one hash bucket must fit into a message.
- * Therefore we have a hard limit here.
- */
- return n > curr && n <= AHASH_MAX_TUNED ? n : curr;
-}
-#define TUNE_BUCKETSIZE(h, multi) \
- ((h)->bucketsize = tune_bucketsize((h)->bucketsize, multi))
-#else
-#define TUNE_BUCKETSIZE(h, multi)
-#endif
-
/* A hash bucket */
struct hbucket {
struct rcu_head rcu; /* for call_rcu */
@@ -936,7 +913,12 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
goto set_full;
/* Create a new slot */
if (n->pos >= n->size) {
- TUNE_BUCKETSIZE(h, multi);
+#ifdef IP_SET_HASH_WITH_MULTI
+ if (h->bucketsize >= AHASH_MAX_TUNED)
+ goto set_full;
+ else if (h->bucketsize < multi)
+ h->bucketsize += AHASH_INIT_SIZE;
+#endif
if (n->size >= AHASH_MAX(h)) {
/* Trigger rehashing */
mtype_data_next(&h->next, d);
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index f9b16f2b2219..fdacbc3c15be 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -599,13 +599,19 @@ static const struct seq_operations ip_vs_app_seq_ops = {
int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs)
{
INIT_LIST_HEAD(&ipvs->app_list);
- proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_seq_ops,
- sizeof(struct seq_net_private));
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_net("ip_vs_app", 0, ipvs->net->proc_net,
+ &ip_vs_app_seq_ops,
+ sizeof(struct seq_net_private)))
+ return -ENOMEM;
+#endif
return 0;
}
void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs)
{
unregister_ip_vs_app(ipvs, NULL /* all */);
+#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_app", ipvs->net->proc_net);
+#endif
}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 8c04bb57dd6f..13534e02346c 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1265,8 +1265,8 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
* The drop rate array needs tuning for real environments.
* Called from timer bh only => no locking
*/
- static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
- static char todrop_counter[9] = {0};
+ static const signed char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+ static signed char todrop_counter[9] = {0};
int i;
/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
@@ -1447,20 +1447,36 @@ int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs)
{
atomic_set(&ipvs->conn_count, 0);
- proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
- &ip_vs_conn_seq_ops, sizeof(struct ip_vs_iter_state));
- proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
- &ip_vs_conn_sync_seq_ops,
- sizeof(struct ip_vs_iter_state));
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
+ &ip_vs_conn_seq_ops,
+ sizeof(struct ip_vs_iter_state)))
+ goto err_conn;
+
+ if (!proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
+ &ip_vs_conn_sync_seq_ops,
+ sizeof(struct ip_vs_iter_state)))
+ goto err_conn_sync;
+#endif
+
return 0;
+
+#ifdef CONFIG_PROC_FS
+err_conn_sync:
+ remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
+err_conn:
+ return -ENOMEM;
+#endif
}
void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
{
/* flush all the connection entries first */
ip_vs_conn_flush(ipvs);
+#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net);
+#endif
}
int __init ip_vs_conn_init(void)
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 18319a6e6806..e29e4ccb5c5a 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -1152,7 +1152,16 @@ static int __init nf_nat_init(void)
WARN_ON(nf_nat_hook != NULL);
RCU_INIT_POINTER(nf_nat_hook, &nat_hook);
- return register_nf_nat_bpf();
+ ret = register_nf_nat_bpf();
+ if (ret < 0) {
+ RCU_INIT_POINTER(nf_nat_hook, NULL);
+ nf_ct_helper_expectfn_unregister(&follow_master_nat);
+ synchronize_net();
+ unregister_pernet_subsys(&nat_net_ops);
+ kvfree(nf_nat_bysource);
+ }
+
+ return ret;
}
static void __exit nf_nat_cleanup(void)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 58d9cbc9ccdc..76bd4d03dbda 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -8465,9 +8465,6 @@ static void nft_commit_release(struct nft_trans *trans)
nf_tables_chain_destroy(&trans->ctx);
break;
case NFT_MSG_DELRULE:
- if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
- nft_flow_rule_destroy(nft_trans_flow_rule(trans));
-
nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
break;
case NFT_MSG_DELSET:
@@ -8973,6 +8970,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nft_rule_expr_deactivate(&trans->ctx,
nft_trans_rule(trans),
NFT_TRANS_COMMIT);
+
+ if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
break;
case NFT_MSG_NEWSET:
nft_clear(net, nft_trans_set(trans));
@@ -10030,6 +10030,8 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event,
nft_net = nft_pernet(net);
deleted = 0;
mutex_lock(&nft_net->commit_mutex);
+ if (!list_empty(&nf_tables_destroy_list))
+ rcu_barrier();
again:
list_for_each_entry(table, &nft_net->tables, list) {
if (nft_table_has_owner(table) &&
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 088244f9d838..4edd899aeb9b 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -173,10 +173,10 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = {
[NFTA_PAYLOAD_SREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_DREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_BASE] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX_BE(NLA_U32, 255),
- [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX_BE(NLA_U32, 255),
+ [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
+ [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX_BE(NLA_U32, 255),
+ [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_PAYLOAD_CSUM_FLAGS] = { .type = NLA_U32 },
};
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 155263e73512..8b84869eb2ac 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2544,6 +2544,7 @@ struct genl_family dp_vport_genl_family __ro_after_init = {
.parallel_ops = true,
.small_ops = dp_vport_genl_ops,
.n_small_ops = ARRAY_SIZE(dp_vport_genl_ops),
+ .resv_start_op = OVS_VPORT_CMD_SET + 1,
.mcgrps = &ovs_dp_vport_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,
diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c
index 8b96a56d3a49..0f77ae8ef944 100644
--- a/net/rose/rose_link.c
+++ b/net/rose/rose_link.c
@@ -236,6 +236,9 @@ void rose_transmit_clear_request(struct rose_neigh *neigh, unsigned int lci, uns
unsigned char *dptr;
int len;
+ if (!neigh->dev)
+ return;
+
len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 3;
if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL)
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index a5a401f93c1a..98129324e157 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -72,6 +72,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
+ unsigned int len;
int ret;
q->vars.qavg = red_calc_qavg(&q->parms,
@@ -126,9 +127,10 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
break;
}
+ len = qdisc_pkt_len(skb);
ret = qdisc_enqueue(skb, child, to_free);
if (likely(ret == NET_XMIT_SUCCESS)) {
- qdisc_qstats_backlog_inc(sch, skb);
+ sch->qstats.backlog += len;
sch->q.qlen++;
} else if (net_xmit_drop_count(ret)) {
q->stats.pdrop++;
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 3ccbf3c201cd..e12d4fa5aece 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -3380,14 +3380,14 @@ static int __init smc_init(void)
rc = register_pernet_subsys(&smc_net_stat_ops);
if (rc)
- return rc;
+ goto out_pernet_subsys;
smc_ism_init();
smc_clc_init();
rc = smc_nl_init();
if (rc)
- goto out_pernet_subsys;
+ goto out_pernet_subsys_stat;
rc = smc_pnet_init();
if (rc)
@@ -3480,6 +3480,8 @@ out_pnet:
smc_pnet_exit();
out_nl:
smc_nl_exit();
+out_pernet_subsys_stat:
+ unregister_pernet_subsys(&smc_net_stat_ops);
out_pernet_subsys:
unregister_pernet_subsys(&smc_net_ops);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index a31a27816cc0..7bb247c51e2f 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1989,7 +1989,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
goto unwrap_failed;
mic.len = len;
mic.data = kmalloc(len, GFP_KERNEL);
- if (!mic.data)
+ if (ZERO_OR_NULL_PTR(mic.data))
goto unwrap_failed;
if (read_bytes_from_xdr_buf(rcv_buf, offset, mic.data, mic.len))
goto unwrap_failed;
diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c
index c65c90ad626a..c1f559892ae8 100644
--- a/net/sunrpc/sysfs.c
+++ b/net/sunrpc/sysfs.c
@@ -518,13 +518,16 @@ void rpc_sysfs_client_setup(struct rpc_clnt *clnt,
struct net *net)
{
struct rpc_sysfs_client *rpc_client;
+ struct rpc_sysfs_xprt_switch *xswitch =
+ (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs;
+
+ if (!xswitch)
+ return;
rpc_client = rpc_sysfs_client_alloc(rpc_sunrpc_client_kobj,
net, clnt->cl_clid);
if (rpc_client) {
char name[] = "switch";
- struct rpc_sysfs_xprt_switch *xswitch =
- (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs;
int ret;
clnt->cl_sysfs = rpc_client;
@@ -558,6 +561,8 @@ void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch,
rpc_xprt_switch->xprt_switch = xprt_switch;
rpc_xprt_switch->xprt = xprt;
kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_ADD);
+ } else {
+ xprt_switch->xps_sysfs = NULL;
}
}
@@ -569,6 +574,9 @@ void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch,
struct rpc_sysfs_xprt_switch *switch_obj =
(struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs;
+ if (!switch_obj)
+ return;
+
rpc_xprt = rpc_sysfs_xprt_alloc(&switch_obj->kobject, xprt, gfp_flags);
if (rpc_xprt) {
xprt->xprt_sysfs = rpc_xprt;
diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
index 7cf14c6b1725..e9bf15513961 100644
--- a/net/unix/unix_bpf.c
+++ b/net/unix/unix_bpf.c
@@ -145,12 +145,12 @@ int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool re
if (restore) {
sk->sk_write_space = psock->saved_write_space;
- WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+ sock_replace_proto(sk, psock->sk_proto);
return 0;
}
unix_dgram_bpf_check_needs_rebuild(psock->sk_proto);
- WRITE_ONCE(sk->sk_prot, &unix_dgram_bpf_prot);
+ sock_replace_proto(sk, &unix_dgram_bpf_prot);
return 0;
}
@@ -158,12 +158,12 @@ int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool r
{
if (restore) {
sk->sk_write_space = psock->saved_write_space;
- WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+ sock_replace_proto(sk, psock->sk_proto);
return 0;
}
unix_stream_bpf_check_needs_rebuild(psock->sk_proto);
- WRITE_ONCE(sk->sk_prot, &unix_stream_bpf_prot);
+ sock_replace_proto(sk, &unix_stream_bpf_prot);
return 0;
}
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index ee418701cdee..884eca7f6743 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1905,8 +1905,11 @@ static int vsock_connectible_wait_data(struct sock *sk,
err = 0;
transport = vsk->transport;
- while ((data = vsock_connectible_has_data(vsk)) == 0) {
+ while (1) {
prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE);
+ data = vsock_connectible_has_data(vsk);
+ if (data != 0)
+ break;
if (sk->sk_err != 0 ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
@@ -2092,8 +2095,6 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
const struct vsock_transport *transport;
int err;
- DEFINE_WAIT(wait);
-
sk = sock->sk;
vsk = vsock_sk(sk);
err = 0;
diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost
index 8489a3402eb8..e41dee64d429 100644
--- a/scripts/Makefile.modpost
+++ b/scripts/Makefile.modpost
@@ -122,7 +122,7 @@ quiet_cmd_modpost = MODPOST $@
sed 's/ko$$/o/' $(or $(modorder-if-needed), /dev/null) | $(MODPOST) $(modpost-args) -T - $(vmlinux.o-if-present)
targets += $(output-symdump)
-$(output-symdump): $(modorder-if-needed) $(vmlinux.o-if-present) $(moudle.symvers-if-present) $(MODPOST) FORCE
+$(output-symdump): $(modorder-if-needed) $(vmlinux.o-if-present) $(module.symvers-if-present) $(MODPOST) FORCE
$(call if_changed,modpost)
__modpost: $(output-symdump)
diff --git a/scripts/kconfig/menu.c b/scripts/kconfig/menu.c
index 62b6313f51c8..109325f31bef 100644
--- a/scripts/kconfig/menu.c
+++ b/scripts/kconfig/menu.c
@@ -722,8 +722,8 @@ static void get_prompt_str(struct gstr *r, struct property *prop,
if (!expr_eq(prop->menu->dep, prop->visible.expr))
get_dep_str(r, prop->visible.expr, " Visible if: ");
- menu = prop->menu->parent;
- for (i = 0; menu && i < 8; menu = menu->parent) {
+ menu = prop->menu;
+ for (i = 0; menu != &rootmenu && i < 8; menu = menu->parent) {
bool accessible = menu_is_visible(menu);
submenu[i++] = menu;
@@ -733,16 +733,7 @@ static void get_prompt_str(struct gstr *r, struct property *prop,
if (head && location) {
jump = xmalloc(sizeof(struct jump_key));
- if (menu_is_visible(prop->menu)) {
- /*
- * There is not enough room to put the hint at the
- * beginning of the "Prompt" line. Put the hint on the
- * last "Location" line even when it would belong on
- * the former.
- */
- jump->target = prop->menu;
- } else
- jump->target = location;
+ jump->target = location;
if (list_empty(head))
jump->index = 0;
@@ -758,13 +749,7 @@ static void get_prompt_str(struct gstr *r, struct property *prop,
menu = submenu[i];
if (jump && menu == location)
jump->offset = strlen(r->s);
-
- if (menu == &rootmenu)
- /* The real rootmenu prompt is ugly */
- str_printf(r, "%*cMain menu", j, ' ');
- else
- str_printf(r, "%*c-> %s", j, ' ', menu_get_prompt(menu));
-
+ str_printf(r, "%*c-> %s", j, ' ', menu_get_prompt(menu));
if (menu->sym) {
str_printf(r, " (%s [=%s])", menu->sym->name ?
menu->sym->name : "<choice>",
diff --git a/security/commoncap.c b/security/commoncap.c
index 5fc8986c3c77..bc751fa5adad 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -401,8 +401,10 @@ int cap_inode_getsecurity(struct user_namespace *mnt_userns,
&tmpbuf, size, GFP_NOFS);
dput(dentry);
- if (ret < 0 || !tmpbuf)
- return ret;
+ if (ret < 0 || !tmpbuf) {
+ size = ret;
+ goto out_free;
+ }
fs_ns = inode->i_sb->s_user_ns;
cap = (struct vfs_cap_data *) tmpbuf;
diff --git a/tools/include/linux/bitfield.h b/tools/include/linux/bitfield.h
new file mode 100644
index 000000000000..6093fa6db260
--- /dev/null
+++ b/tools/include/linux/bitfield.h
@@ -0,0 +1,176 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2014 Felix Fietkau <nbd@nbd.name>
+ * Copyright (C) 2004 - 2009 Ivo van Doorn <IvDoorn@gmail.com>
+ */
+
+#ifndef _LINUX_BITFIELD_H
+#define _LINUX_BITFIELD_H
+
+#include <linux/build_bug.h>
+#include <asm/byteorder.h>
+
+/*
+ * Bitfield access macros
+ *
+ * FIELD_{GET,PREP} macros take as first parameter shifted mask
+ * from which they extract the base mask and shift amount.
+ * Mask must be a compilation time constant.
+ *
+ * Example:
+ *
+ * #define REG_FIELD_A GENMASK(6, 0)
+ * #define REG_FIELD_B BIT(7)
+ * #define REG_FIELD_C GENMASK(15, 8)
+ * #define REG_FIELD_D GENMASK(31, 16)
+ *
+ * Get:
+ * a = FIELD_GET(REG_FIELD_A, reg);
+ * b = FIELD_GET(REG_FIELD_B, reg);
+ *
+ * Set:
+ * reg = FIELD_PREP(REG_FIELD_A, 1) |
+ * FIELD_PREP(REG_FIELD_B, 0) |
+ * FIELD_PREP(REG_FIELD_C, c) |
+ * FIELD_PREP(REG_FIELD_D, 0x40);
+ *
+ * Modify:
+ * reg &= ~REG_FIELD_C;
+ * reg |= FIELD_PREP(REG_FIELD_C, c);
+ */
+
+#define __bf_shf(x) (__builtin_ffsll(x) - 1)
+
+#define __scalar_type_to_unsigned_cases(type) \
+ unsigned type: (unsigned type)0, \
+ signed type: (unsigned type)0
+
+#define __unsigned_scalar_typeof(x) typeof( \
+ _Generic((x), \
+ char: (unsigned char)0, \
+ __scalar_type_to_unsigned_cases(char), \
+ __scalar_type_to_unsigned_cases(short), \
+ __scalar_type_to_unsigned_cases(int), \
+ __scalar_type_to_unsigned_cases(long), \
+ __scalar_type_to_unsigned_cases(long long), \
+ default: (x)))
+
+#define __bf_cast_unsigned(type, x) ((__unsigned_scalar_typeof(type))(x))
+
+#define __BF_FIELD_CHECK(_mask, _reg, _val, _pfx) \
+ ({ \
+ BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask), \
+ _pfx "mask is not constant"); \
+ BUILD_BUG_ON_MSG((_mask) == 0, _pfx "mask is zero"); \
+ BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \
+ ~((_mask) >> __bf_shf(_mask)) & (_val) : 0, \
+ _pfx "value too large for the field"); \
+ BUILD_BUG_ON_MSG(__bf_cast_unsigned(_mask, _mask) > \
+ __bf_cast_unsigned(_reg, ~0ull), \
+ _pfx "type of reg too small for mask"); \
+ __BUILD_BUG_ON_NOT_POWER_OF_2((_mask) + \
+ (1ULL << __bf_shf(_mask))); \
+ })
+
+/**
+ * FIELD_MAX() - produce the maximum value representable by a field
+ * @_mask: shifted mask defining the field's length and position
+ *
+ * FIELD_MAX() returns the maximum value that can be held in the field
+ * specified by @_mask.
+ */
+#define FIELD_MAX(_mask) \
+ ({ \
+ __BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_MAX: "); \
+ (typeof(_mask))((_mask) >> __bf_shf(_mask)); \
+ })
+
+/**
+ * FIELD_FIT() - check if value fits in the field
+ * @_mask: shifted mask defining the field's length and position
+ * @_val: value to test against the field
+ *
+ * Return: true if @_val can fit inside @_mask, false if @_val is too big.
+ */
+#define FIELD_FIT(_mask, _val) \
+ ({ \
+ __BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_FIT: "); \
+ !((((typeof(_mask))_val) << __bf_shf(_mask)) & ~(_mask)); \
+ })
+
+/**
+ * FIELD_PREP() - prepare a bitfield element
+ * @_mask: shifted mask defining the field's length and position
+ * @_val: value to put in the field
+ *
+ * FIELD_PREP() masks and shifts up the value. The result should
+ * be combined with other fields of the bitfield using logical OR.
+ */
+#define FIELD_PREP(_mask, _val) \
+ ({ \
+ __BF_FIELD_CHECK(_mask, 0ULL, _val, "FIELD_PREP: "); \
+ ((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask); \
+ })
+
+/**
+ * FIELD_GET() - extract a bitfield element
+ * @_mask: shifted mask defining the field's length and position
+ * @_reg: value of entire bitfield
+ *
+ * FIELD_GET() extracts the field specified by @_mask from the
+ * bitfield passed in as @_reg by masking and shifting it down.
+ */
+#define FIELD_GET(_mask, _reg) \
+ ({ \
+ __BF_FIELD_CHECK(_mask, _reg, 0U, "FIELD_GET: "); \
+ (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \
+ })
+
+extern void __compiletime_error("value doesn't fit into mask")
+__field_overflow(void);
+extern void __compiletime_error("bad bitfield mask")
+__bad_mask(void);
+static __always_inline u64 field_multiplier(u64 field)
+{
+ if ((field | (field - 1)) & ((field | (field - 1)) + 1))
+ __bad_mask();
+ return field & -field;
+}
+static __always_inline u64 field_mask(u64 field)
+{
+ return field / field_multiplier(field);
+}
+#define field_max(field) ((typeof(field))field_mask(field))
+#define ____MAKE_OP(type,base,to,from) \
+static __always_inline __##type type##_encode_bits(base v, base field) \
+{ \
+ if (__builtin_constant_p(v) && (v & ~field_mask(field))) \
+ __field_overflow(); \
+ return to((v & field_mask(field)) * field_multiplier(field)); \
+} \
+static __always_inline __##type type##_replace_bits(__##type old, \
+ base val, base field) \
+{ \
+ return (old & ~to(field)) | type##_encode_bits(val, field); \
+} \
+static __always_inline void type##p_replace_bits(__##type *p, \
+ base val, base field) \
+{ \
+ *p = (*p & ~to(field)) | type##_encode_bits(val, field); \
+} \
+static __always_inline base type##_get_bits(__##type v, base field) \
+{ \
+ return (from(v) & field)/field_multiplier(field); \
+}
+#define __MAKE_OP(size) \
+ ____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu) \
+ ____MAKE_OP(be##size,u##size,cpu_to_be##size,be##size##_to_cpu) \
+ ____MAKE_OP(u##size,u##size,,)
+____MAKE_OP(u8,u8,,)
+__MAKE_OP(16)
+__MAKE_OP(32)
+__MAKE_OP(64)
+#undef __MAKE_OP
+#undef ____MAKE_OP
+
+#endif
diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h
index bef35bee9c44..ad97c0d522b8 100644
--- a/tools/include/nolibc/string.h
+++ b/tools/include/nolibc/string.h
@@ -19,9 +19,9 @@ static __attribute__((unused))
int memcmp(const void *s1, const void *s2, size_t n)
{
size_t ofs = 0;
- char c1 = 0;
+ int c1 = 0;
- while (ofs < n && !(c1 = ((char *)s1)[ofs] - ((char *)s2)[ofs])) {
+ while (ofs < n && !(c1 = ((unsigned char *)s1)[ofs] - ((unsigned char *)s2)[ofs])) {
ofs++;
}
return c1;
@@ -125,14 +125,18 @@ char *strcpy(char *dst, const char *src)
}
/* this function is only used with arguments that are not constants or when
- * it's not known because optimizations are disabled.
+ * it's not known because optimizations are disabled. Note that gcc 12
+ * recognizes an strlen() pattern and replaces it with a jump to strlen(),
+ * thus itself, hence the asm() statement below that's meant to disable this
+ * confusing practice.
*/
static __attribute__((unused))
-size_t nolibc_strlen(const char *str)
+size_t strlen(const char *str)
{
size_t len;
- for (len = 0; str[len]; len++);
+ for (len = 0; str[len]; len++)
+ asm("");
return len;
}
@@ -140,13 +144,12 @@ size_t nolibc_strlen(const char *str)
* the two branches, then will rely on an external definition of strlen().
*/
#if defined(__OPTIMIZE__)
+#define nolibc_strlen(x) strlen(x)
#define strlen(str) ({ \
__builtin_constant_p((str)) ? \
__builtin_strlen((str)) : \
nolibc_strlen((str)); \
})
-#else
-#define strlen(str) nolibc_strlen((str))
#endif
static __attribute__((unused))
diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c
index a072b2d3e726..7edce12fd2ce 100644
--- a/tools/testing/cxl/test/cxl.c
+++ b/tools/testing/cxl/test/cxl.c
@@ -12,30 +12,62 @@
#include "mock.h"
#define NR_CXL_HOST_BRIDGES 2
+#define NR_CXL_SINGLE_HOST 1
#define NR_CXL_ROOT_PORTS 2
#define NR_CXL_SWITCH_PORTS 2
#define NR_CXL_PORT_DECODERS 8
static struct platform_device *cxl_acpi;
static struct platform_device *cxl_host_bridge[NR_CXL_HOST_BRIDGES];
-static struct platform_device
- *cxl_root_port[NR_CXL_HOST_BRIDGES * NR_CXL_ROOT_PORTS];
-static struct platform_device
- *cxl_switch_uport[NR_CXL_HOST_BRIDGES * NR_CXL_ROOT_PORTS];
-static struct platform_device
- *cxl_switch_dport[NR_CXL_HOST_BRIDGES * NR_CXL_ROOT_PORTS *
- NR_CXL_SWITCH_PORTS];
-struct platform_device
- *cxl_mem[NR_CXL_HOST_BRIDGES * NR_CXL_ROOT_PORTS * NR_CXL_SWITCH_PORTS];
+#define NR_MULTI_ROOT (NR_CXL_HOST_BRIDGES * NR_CXL_ROOT_PORTS)
+static struct platform_device *cxl_root_port[NR_MULTI_ROOT];
+static struct platform_device *cxl_switch_uport[NR_MULTI_ROOT];
+#define NR_MEM_MULTI \
+ (NR_CXL_HOST_BRIDGES * NR_CXL_ROOT_PORTS * NR_CXL_SWITCH_PORTS)
+static struct platform_device *cxl_switch_dport[NR_MEM_MULTI];
+
+static struct platform_device *cxl_hb_single[NR_CXL_SINGLE_HOST];
+static struct platform_device *cxl_root_single[NR_CXL_SINGLE_HOST];
+static struct platform_device *cxl_swu_single[NR_CXL_SINGLE_HOST];
+#define NR_MEM_SINGLE (NR_CXL_SINGLE_HOST * NR_CXL_SWITCH_PORTS)
+static struct platform_device *cxl_swd_single[NR_MEM_SINGLE];
+
+struct platform_device *cxl_mem[NR_MEM_MULTI];
+struct platform_device *cxl_mem_single[NR_MEM_SINGLE];
+
+
+static inline bool is_multi_bridge(struct device *dev)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(cxl_host_bridge); i++)
+ if (&cxl_host_bridge[i]->dev == dev)
+ return true;
+ return false;
+}
+
+static inline bool is_single_bridge(struct device *dev)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(cxl_hb_single); i++)
+ if (&cxl_hb_single[i]->dev == dev)
+ return true;
+ return false;
+}
static struct acpi_device acpi0017_mock;
-static struct acpi_device host_bridge[NR_CXL_HOST_BRIDGES] = {
+static struct acpi_device host_bridge[NR_CXL_HOST_BRIDGES + NR_CXL_SINGLE_HOST] = {
[0] = {
.handle = &host_bridge[0],
},
[1] = {
.handle = &host_bridge[1],
},
+ [2] = {
+ .handle = &host_bridge[2],
+ },
+
};
static bool is_mock_dev(struct device *dev)
@@ -45,6 +77,9 @@ static bool is_mock_dev(struct device *dev)
for (i = 0; i < ARRAY_SIZE(cxl_mem); i++)
if (dev == &cxl_mem[i]->dev)
return true;
+ for (i = 0; i < ARRAY_SIZE(cxl_mem_single); i++)
+ if (dev == &cxl_mem_single[i]->dev)
+ return true;
if (dev == &cxl_acpi->dev)
return true;
return false;
@@ -66,7 +101,7 @@ static bool is_mock_adev(struct acpi_device *adev)
static struct {
struct acpi_table_cedt cedt;
- struct acpi_cedt_chbs chbs[NR_CXL_HOST_BRIDGES];
+ struct acpi_cedt_chbs chbs[NR_CXL_HOST_BRIDGES + NR_CXL_SINGLE_HOST];
struct {
struct acpi_cedt_cfmws cfmws;
u32 target[1];
@@ -83,6 +118,10 @@ static struct {
struct acpi_cedt_cfmws cfmws;
u32 target[2];
} cfmws3;
+ struct {
+ struct acpi_cedt_cfmws cfmws;
+ u32 target[1];
+ } cfmws4;
} __packed mock_cedt = {
.cedt = {
.header = {
@@ -107,6 +146,14 @@ static struct {
.uid = 1,
.cxl_version = ACPI_CEDT_CHBS_VERSION_CXL20,
},
+ .chbs[2] = {
+ .header = {
+ .type = ACPI_CEDT_TYPE_CHBS,
+ .length = sizeof(mock_cedt.chbs[0]),
+ },
+ .uid = 2,
+ .cxl_version = ACPI_CEDT_CHBS_VERSION_CXL20,
+ },
.cfmws0 = {
.cfmws = {
.header = {
@@ -167,13 +214,29 @@ static struct {
},
.target = { 0, 1, },
},
+ .cfmws4 = {
+ .cfmws = {
+ .header = {
+ .type = ACPI_CEDT_TYPE_CFMWS,
+ .length = sizeof(mock_cedt.cfmws4),
+ },
+ .interleave_ways = 0,
+ .granularity = 4,
+ .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 |
+ ACPI_CEDT_CFMWS_RESTRICT_PMEM,
+ .qtg_id = 4,
+ .window_size = SZ_256M * 4UL,
+ },
+ .target = { 2 },
+ },
};
-struct acpi_cedt_cfmws *mock_cfmws[4] = {
+struct acpi_cedt_cfmws *mock_cfmws[] = {
[0] = &mock_cedt.cfmws0.cfmws,
[1] = &mock_cedt.cfmws1.cfmws,
[2] = &mock_cedt.cfmws2.cfmws,
[3] = &mock_cedt.cfmws3.cfmws,
+ [4] = &mock_cedt.cfmws4.cfmws,
};
struct cxl_mock_res {
@@ -304,6 +367,9 @@ static bool is_mock_bridge(struct device *dev)
for (i = 0; i < ARRAY_SIZE(cxl_host_bridge); i++)
if (dev == &cxl_host_bridge[i]->dev)
return true;
+ for (i = 0; i < ARRAY_SIZE(cxl_hb_single); i++)
+ if (dev == &cxl_hb_single[i]->dev)
+ return true;
return false;
}
@@ -326,6 +392,18 @@ static bool is_mock_port(struct device *dev)
if (dev == &cxl_switch_dport[i]->dev)
return true;
+ for (i = 0; i < ARRAY_SIZE(cxl_root_single); i++)
+ if (dev == &cxl_root_single[i]->dev)
+ return true;
+
+ for (i = 0; i < ARRAY_SIZE(cxl_swu_single); i++)
+ if (dev == &cxl_swu_single[i]->dev)
+ return true;
+
+ for (i = 0; i < ARRAY_SIZE(cxl_swd_single); i++)
+ if (dev == &cxl_swd_single[i]->dev)
+ return true;
+
if (is_cxl_memdev(dev))
return is_mock_dev(dev->parent);
@@ -561,11 +639,31 @@ static int mock_cxl_port_enumerate_dports(struct cxl_port *port)
int i, array_size;
if (port->depth == 1) {
- array_size = ARRAY_SIZE(cxl_root_port);
- array = cxl_root_port;
+ if (is_multi_bridge(port->uport)) {
+ array_size = ARRAY_SIZE(cxl_root_port);
+ array = cxl_root_port;
+ } else if (is_single_bridge(port->uport)) {
+ array_size = ARRAY_SIZE(cxl_root_single);
+ array = cxl_root_single;
+ } else {
+ dev_dbg(&port->dev, "%s: unknown bridge type\n",
+ dev_name(port->uport));
+ return -ENXIO;
+ }
} else if (port->depth == 2) {
- array_size = ARRAY_SIZE(cxl_switch_dport);
- array = cxl_switch_dport;
+ struct cxl_port *parent = to_cxl_port(port->dev.parent);
+
+ if (is_multi_bridge(parent->uport)) {
+ array_size = ARRAY_SIZE(cxl_switch_dport);
+ array = cxl_switch_dport;
+ } else if (is_single_bridge(parent->uport)) {
+ array_size = ARRAY_SIZE(cxl_swd_single);
+ array = cxl_swd_single;
+ } else {
+ dev_dbg(&port->dev, "%s: unknown bridge type\n",
+ dev_name(port->uport));
+ return -ENXIO;
+ }
} else {
dev_WARN_ONCE(&port->dev, 1, "unexpected depth %d\n",
port->depth);
@@ -576,8 +674,12 @@ static int mock_cxl_port_enumerate_dports(struct cxl_port *port)
struct platform_device *pdev = array[i];
struct cxl_dport *dport;
- if (pdev->dev.parent != port->uport)
+ if (pdev->dev.parent != port->uport) {
+ dev_dbg(&port->dev, "%s: mismatch parent %s\n",
+ dev_name(port->uport),
+ dev_name(pdev->dev.parent));
continue;
+ }
dport = devm_cxl_add_dport(port, &pdev->dev, pdev->id,
CXL_RESOURCE_NONE);
@@ -627,6 +729,157 @@ static void mock_companion(struct acpi_device *adev, struct device *dev)
#define SZ_512G (SZ_64G * 8)
#endif
+static __init int cxl_single_init(void)
+{
+ int i, rc;
+
+ for (i = 0; i < ARRAY_SIZE(cxl_hb_single); i++) {
+ struct acpi_device *adev =
+ &host_bridge[NR_CXL_HOST_BRIDGES + i];
+ struct platform_device *pdev;
+
+ pdev = platform_device_alloc("cxl_host_bridge",
+ NR_CXL_HOST_BRIDGES + i);
+ if (!pdev)
+ goto err_bridge;
+
+ mock_companion(adev, &pdev->dev);
+ rc = platform_device_add(pdev);
+ if (rc) {
+ platform_device_put(pdev);
+ goto err_bridge;
+ }
+
+ cxl_hb_single[i] = pdev;
+ rc = sysfs_create_link(&pdev->dev.kobj, &pdev->dev.kobj,
+ "physical_node");
+ if (rc)
+ goto err_bridge;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(cxl_root_single); i++) {
+ struct platform_device *bridge =
+ cxl_hb_single[i % ARRAY_SIZE(cxl_hb_single)];
+ struct platform_device *pdev;
+
+ pdev = platform_device_alloc("cxl_root_port",
+ NR_MULTI_ROOT + i);
+ if (!pdev)
+ goto err_port;
+ pdev->dev.parent = &bridge->dev;
+
+ rc = platform_device_add(pdev);
+ if (rc) {
+ platform_device_put(pdev);
+ goto err_port;
+ }
+ cxl_root_single[i] = pdev;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(cxl_swu_single); i++) {
+ struct platform_device *root_port = cxl_root_single[i];
+ struct platform_device *pdev;
+
+ pdev = platform_device_alloc("cxl_switch_uport",
+ NR_MULTI_ROOT + i);
+ if (!pdev)
+ goto err_uport;
+ pdev->dev.parent = &root_port->dev;
+
+ rc = platform_device_add(pdev);
+ if (rc) {
+ platform_device_put(pdev);
+ goto err_uport;
+ }
+ cxl_swu_single[i] = pdev;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(cxl_swd_single); i++) {
+ struct platform_device *uport =
+ cxl_swu_single[i % ARRAY_SIZE(cxl_swu_single)];
+ struct platform_device *pdev;
+
+ pdev = platform_device_alloc("cxl_switch_dport",
+ i + NR_MEM_MULTI);
+ if (!pdev)
+ goto err_dport;
+ pdev->dev.parent = &uport->dev;
+
+ rc = platform_device_add(pdev);
+ if (rc) {
+ platform_device_put(pdev);
+ goto err_dport;
+ }
+ cxl_swd_single[i] = pdev;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(cxl_mem_single); i++) {
+ struct platform_device *dport = cxl_swd_single[i];
+ struct platform_device *pdev;
+
+ pdev = platform_device_alloc("cxl_mem", NR_MEM_MULTI + i);
+ if (!pdev)
+ goto err_mem;
+ pdev->dev.parent = &dport->dev;
+ set_dev_node(&pdev->dev, i % 2);
+
+ rc = platform_device_add(pdev);
+ if (rc) {
+ platform_device_put(pdev);
+ goto err_mem;
+ }
+ cxl_mem_single[i] = pdev;
+ }
+
+ return 0;
+
+err_mem:
+ for (i = ARRAY_SIZE(cxl_mem_single) - 1; i >= 0; i--)
+ platform_device_unregister(cxl_mem_single[i]);
+err_dport:
+ for (i = ARRAY_SIZE(cxl_swd_single) - 1; i >= 0; i--)
+ platform_device_unregister(cxl_swd_single[i]);
+err_uport:
+ for (i = ARRAY_SIZE(cxl_swu_single) - 1; i >= 0; i--)
+ platform_device_unregister(cxl_swu_single[i]);
+err_port:
+ for (i = ARRAY_SIZE(cxl_root_single) - 1; i >= 0; i--)
+ platform_device_unregister(cxl_root_single[i]);
+err_bridge:
+ for (i = ARRAY_SIZE(cxl_hb_single) - 1; i >= 0; i--) {
+ struct platform_device *pdev = cxl_hb_single[i];
+
+ if (!pdev)
+ continue;
+ sysfs_remove_link(&pdev->dev.kobj, "physical_node");
+ platform_device_unregister(cxl_hb_single[i]);
+ }
+
+ return rc;
+}
+
+static void cxl_single_exit(void)
+{
+ int i;
+
+ for (i = ARRAY_SIZE(cxl_mem_single) - 1; i >= 0; i--)
+ platform_device_unregister(cxl_mem_single[i]);
+ for (i = ARRAY_SIZE(cxl_swd_single) - 1; i >= 0; i--)
+ platform_device_unregister(cxl_swd_single[i]);
+ for (i = ARRAY_SIZE(cxl_swu_single) - 1; i >= 0; i--)
+ platform_device_unregister(cxl_swu_single[i]);
+ for (i = ARRAY_SIZE(cxl_root_single) - 1; i >= 0; i--)
+ platform_device_unregister(cxl_root_single[i]);
+ for (i = ARRAY_SIZE(cxl_hb_single) - 1; i >= 0; i--) {
+ struct platform_device *pdev = cxl_hb_single[i];
+
+ if (!pdev)
+ continue;
+ sysfs_remove_link(&pdev->dev.kobj, "physical_node");
+ platform_device_unregister(cxl_hb_single[i]);
+ }
+}
+
static __init int cxl_test_init(void)
{
int rc, i;
@@ -695,7 +948,7 @@ static __init int cxl_test_init(void)
pdev = platform_device_alloc("cxl_switch_uport", i);
if (!pdev)
- goto err_port;
+ goto err_uport;
pdev->dev.parent = &root_port->dev;
rc = platform_device_add(pdev);
@@ -713,7 +966,7 @@ static __init int cxl_test_init(void)
pdev = platform_device_alloc("cxl_switch_dport", i);
if (!pdev)
- goto err_port;
+ goto err_dport;
pdev->dev.parent = &uport->dev;
rc = platform_device_add(pdev);
@@ -724,7 +977,6 @@ static __init int cxl_test_init(void)
cxl_switch_dport[i] = pdev;
}
- BUILD_BUG_ON(ARRAY_SIZE(cxl_mem) != ARRAY_SIZE(cxl_switch_dport));
for (i = 0; i < ARRAY_SIZE(cxl_mem); i++) {
struct platform_device *dport = cxl_switch_dport[i];
struct platform_device *pdev;
@@ -743,9 +995,13 @@ static __init int cxl_test_init(void)
cxl_mem[i] = pdev;
}
+ rc = cxl_single_init();
+ if (rc)
+ goto err_mem;
+
cxl_acpi = platform_device_alloc("cxl_acpi", 0);
if (!cxl_acpi)
- goto err_mem;
+ goto err_single;
mock_companion(&acpi0017_mock, &cxl_acpi->dev);
acpi0017_mock.dev.bus = &platform_bus_type;
@@ -758,6 +1014,8 @@ static __init int cxl_test_init(void)
err_add:
platform_device_put(cxl_acpi);
+err_single:
+ cxl_single_exit();
err_mem:
for (i = ARRAY_SIZE(cxl_mem) - 1; i >= 0; i--)
platform_device_unregister(cxl_mem[i]);
@@ -793,6 +1051,7 @@ static __exit void cxl_test_exit(void)
int i;
platform_device_unregister(cxl_acpi);
+ cxl_single_exit();
for (i = ARRAY_SIZE(cxl_mem) - 1; i >= 0; i--)
platform_device_unregister(cxl_mem[i]);
for (i = ARRAY_SIZE(cxl_switch_dport) - 1; i >= 0; i--)
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 2f0d705db9db..4a30d684e208 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -4,6 +4,7 @@
/aarch64/debug-exceptions
/aarch64/get-reg-list
/aarch64/hypercalls
+/aarch64/page_fault_test
/aarch64/psci_test
/aarch64/vcpu_width_config
/aarch64/vgic_init
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 0172eb6cb6ee..1d85b8e218a0 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -47,6 +47,7 @@ LIBKVM += lib/perf_test_util.c
LIBKVM += lib/rbtree.c
LIBKVM += lib/sparsebit.c
LIBKVM += lib/test_util.c
+LIBKVM += lib/userfaultfd_util.c
LIBKVM_STRING += lib/string_override.c
@@ -152,10 +153,12 @@ TEST_GEN_PROGS_aarch64 += aarch64/arch_timer
TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
TEST_GEN_PROGS_aarch64 += aarch64/hypercalls
+TEST_GEN_PROGS_aarch64 += aarch64/page_fault_test
TEST_GEN_PROGS_aarch64 += aarch64/psci_test
TEST_GEN_PROGS_aarch64 += aarch64/vcpu_width_config
TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq
+TEST_GEN_PROGS_aarch64 += access_tracking_perf_test
TEST_GEN_PROGS_aarch64 += demand_paging_test
TEST_GEN_PROGS_aarch64 += dirty_log_test
TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
diff --git a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c
index 6f9c1f19c7f6..b6a5e8861b35 100644
--- a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c
+++ b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c
@@ -13,6 +13,7 @@
#include "kvm_util.h"
#include "processor.h"
#include "test_util.h"
+#include <linux/bitfield.h>
#define BAD_ID_REG_VAL 0x1badc0deul
@@ -145,7 +146,7 @@ static bool vcpu_aarch64_only(struct kvm_vcpu *vcpu)
vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &val);
- el0 = (val & ARM64_FEATURE_MASK(ID_AA64PFR0_EL0)) >> ID_AA64PFR0_EL0_SHIFT;
+ el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL0), val);
return el0 == ID_AA64PFR0_ELx_64BIT_ONLY;
}
diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c
index 947bd201435c..b30add3e7726 100644
--- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c
+++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c
@@ -2,6 +2,7 @@
#include <test_util.h>
#include <kvm_util.h>
#include <processor.h>
+#include <linux/bitfield.h>
#define MDSCR_KDE (1 << 13)
#define MDSCR_MDE (1 << 15)
@@ -11,17 +12,24 @@
#define DBGBCR_EXEC (0x0 << 3)
#define DBGBCR_EL1 (0x1 << 1)
#define DBGBCR_E (0x1 << 0)
+#define DBGBCR_LBN_SHIFT 16
+#define DBGBCR_BT_SHIFT 20
+#define DBGBCR_BT_ADDR_LINK_CTX (0x1 << DBGBCR_BT_SHIFT)
+#define DBGBCR_BT_CTX_LINK (0x3 << DBGBCR_BT_SHIFT)
#define DBGWCR_LEN8 (0xff << 5)
#define DBGWCR_RD (0x1 << 3)
#define DBGWCR_WR (0x2 << 3)
#define DBGWCR_EL1 (0x1 << 1)
#define DBGWCR_E (0x1 << 0)
+#define DBGWCR_LBN_SHIFT 16
+#define DBGWCR_WT_SHIFT 20
+#define DBGWCR_WT_LINK (0x1 << DBGWCR_WT_SHIFT)
#define SPSR_D (1 << 9)
#define SPSR_SS (1 << 21)
-extern unsigned char sw_bp, sw_bp2, hw_bp, hw_bp2, bp_svc, bp_brk, hw_wp, ss_start;
+extern unsigned char sw_bp, sw_bp2, hw_bp, hw_bp2, bp_svc, bp_brk, hw_wp, ss_start, hw_bp_ctx;
extern unsigned char iter_ss_begin, iter_ss_end;
static volatile uint64_t sw_bp_addr, hw_bp_addr;
static volatile uint64_t wp_addr, wp_data_addr;
@@ -29,8 +37,74 @@ static volatile uint64_t svc_addr;
static volatile uint64_t ss_addr[4], ss_idx;
#define PC(v) ((uint64_t)&(v))
+#define GEN_DEBUG_WRITE_REG(reg_name) \
+static void write_##reg_name(int num, uint64_t val) \
+{ \
+ switch (num) { \
+ case 0: \
+ write_sysreg(val, reg_name##0_el1); \
+ break; \
+ case 1: \
+ write_sysreg(val, reg_name##1_el1); \
+ break; \
+ case 2: \
+ write_sysreg(val, reg_name##2_el1); \
+ break; \
+ case 3: \
+ write_sysreg(val, reg_name##3_el1); \
+ break; \
+ case 4: \
+ write_sysreg(val, reg_name##4_el1); \
+ break; \
+ case 5: \
+ write_sysreg(val, reg_name##5_el1); \
+ break; \
+ case 6: \
+ write_sysreg(val, reg_name##6_el1); \
+ break; \
+ case 7: \
+ write_sysreg(val, reg_name##7_el1); \
+ break; \
+ case 8: \
+ write_sysreg(val, reg_name##8_el1); \
+ break; \
+ case 9: \
+ write_sysreg(val, reg_name##9_el1); \
+ break; \
+ case 10: \
+ write_sysreg(val, reg_name##10_el1); \
+ break; \
+ case 11: \
+ write_sysreg(val, reg_name##11_el1); \
+ break; \
+ case 12: \
+ write_sysreg(val, reg_name##12_el1); \
+ break; \
+ case 13: \
+ write_sysreg(val, reg_name##13_el1); \
+ break; \
+ case 14: \
+ write_sysreg(val, reg_name##14_el1); \
+ break; \
+ case 15: \
+ write_sysreg(val, reg_name##15_el1); \
+ break; \
+ default: \
+ GUEST_ASSERT(0); \
+ } \
+}
+
+/* Define write_dbgbcr()/write_dbgbvr()/write_dbgwcr()/write_dbgwvr() */
+GEN_DEBUG_WRITE_REG(dbgbcr)
+GEN_DEBUG_WRITE_REG(dbgbvr)
+GEN_DEBUG_WRITE_REG(dbgwcr)
+GEN_DEBUG_WRITE_REG(dbgwvr)
+
static void reset_debug_state(void)
{
+ uint8_t brps, wrps, i;
+ uint64_t dfr0;
+
asm volatile("msr daifset, #8");
write_sysreg(0, osdlr_el1);
@@ -38,11 +112,21 @@ static void reset_debug_state(void)
isb();
write_sysreg(0, mdscr_el1);
- /* This test only uses the first bp and wp slot. */
- write_sysreg(0, dbgbvr0_el1);
- write_sysreg(0, dbgbcr0_el1);
- write_sysreg(0, dbgwcr0_el1);
- write_sysreg(0, dbgwvr0_el1);
+ write_sysreg(0, contextidr_el1);
+
+ /* Reset all bcr/bvr/wcr/wvr registers */
+ dfr0 = read_sysreg(id_aa64dfr0_el1);
+ brps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_BRPS), dfr0);
+ for (i = 0; i <= brps; i++) {
+ write_dbgbcr(i, 0);
+ write_dbgbvr(i, 0);
+ }
+ wrps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_WRPS), dfr0);
+ for (i = 0; i <= wrps; i++) {
+ write_dbgwcr(i, 0);
+ write_dbgwvr(i, 0);
+ }
+
isb();
}
@@ -54,16 +138,10 @@ static void enable_os_lock(void)
GUEST_ASSERT(read_sysreg(oslsr_el1) & 2);
}
-static void install_wp(uint64_t addr)
+static void enable_monitor_debug_exceptions(void)
{
- uint32_t wcr;
uint32_t mdscr;
- wcr = DBGWCR_LEN8 | DBGWCR_RD | DBGWCR_WR | DBGWCR_EL1 | DBGWCR_E;
- write_sysreg(wcr, dbgwcr0_el1);
- write_sysreg(addr, dbgwvr0_el1);
- isb();
-
asm volatile("msr daifclr, #8");
mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_MDE;
@@ -71,21 +149,76 @@ static void install_wp(uint64_t addr)
isb();
}
-static void install_hw_bp(uint64_t addr)
+static void install_wp(uint8_t wpn, uint64_t addr)
+{
+ uint32_t wcr;
+
+ wcr = DBGWCR_LEN8 | DBGWCR_RD | DBGWCR_WR | DBGWCR_EL1 | DBGWCR_E;
+ write_dbgwcr(wpn, wcr);
+ write_dbgwvr(wpn, addr);
+
+ isb();
+
+ enable_monitor_debug_exceptions();
+}
+
+static void install_hw_bp(uint8_t bpn, uint64_t addr)
{
uint32_t bcr;
- uint32_t mdscr;
bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E;
- write_sysreg(bcr, dbgbcr0_el1);
- write_sysreg(addr, dbgbvr0_el1);
+ write_dbgbcr(bpn, bcr);
+ write_dbgbvr(bpn, addr);
isb();
- asm volatile("msr daifclr, #8");
+ enable_monitor_debug_exceptions();
+}
- mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_MDE;
- write_sysreg(mdscr, mdscr_el1);
+static void install_wp_ctx(uint8_t addr_wp, uint8_t ctx_bp, uint64_t addr,
+ uint64_t ctx)
+{
+ uint32_t wcr;
+ uint64_t ctx_bcr;
+
+ /* Setup a context-aware breakpoint for Linked Context ID Match */
+ ctx_bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E |
+ DBGBCR_BT_CTX_LINK;
+ write_dbgbcr(ctx_bp, ctx_bcr);
+ write_dbgbvr(ctx_bp, ctx);
+
+ /* Setup a linked watchpoint (linked to the context-aware breakpoint) */
+ wcr = DBGWCR_LEN8 | DBGWCR_RD | DBGWCR_WR | DBGWCR_EL1 | DBGWCR_E |
+ DBGWCR_WT_LINK | ((uint32_t)ctx_bp << DBGWCR_LBN_SHIFT);
+ write_dbgwcr(addr_wp, wcr);
+ write_dbgwvr(addr_wp, addr);
isb();
+
+ enable_monitor_debug_exceptions();
+}
+
+void install_hw_bp_ctx(uint8_t addr_bp, uint8_t ctx_bp, uint64_t addr,
+ uint64_t ctx)
+{
+ uint32_t addr_bcr, ctx_bcr;
+
+ /* Setup a context-aware breakpoint for Linked Context ID Match */
+ ctx_bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E |
+ DBGBCR_BT_CTX_LINK;
+ write_dbgbcr(ctx_bp, ctx_bcr);
+ write_dbgbvr(ctx_bp, ctx);
+
+ /*
+ * Setup a normal breakpoint for Linked Address Match, and link it
+ * to the context-aware breakpoint.
+ */
+ addr_bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E |
+ DBGBCR_BT_ADDR_LINK_CTX |
+ ((uint32_t)ctx_bp << DBGBCR_LBN_SHIFT);
+ write_dbgbcr(addr_bp, addr_bcr);
+ write_dbgbvr(addr_bp, addr);
+ isb();
+
+ enable_monitor_debug_exceptions();
}
static void install_ss(void)
@@ -101,52 +234,42 @@ static void install_ss(void)
static volatile char write_data;
-static void guest_code(void)
+static void guest_code(uint8_t bpn, uint8_t wpn, uint8_t ctx_bpn)
{
- GUEST_SYNC(0);
+ uint64_t ctx = 0xabcdef; /* a random context number */
/* Software-breakpoint */
reset_debug_state();
asm volatile("sw_bp: brk #0");
GUEST_ASSERT_EQ(sw_bp_addr, PC(sw_bp));
- GUEST_SYNC(1);
-
/* Hardware-breakpoint */
reset_debug_state();
- install_hw_bp(PC(hw_bp));
+ install_hw_bp(bpn, PC(hw_bp));
asm volatile("hw_bp: nop");
GUEST_ASSERT_EQ(hw_bp_addr, PC(hw_bp));
- GUEST_SYNC(2);
-
/* Hardware-breakpoint + svc */
reset_debug_state();
- install_hw_bp(PC(bp_svc));
+ install_hw_bp(bpn, PC(bp_svc));
asm volatile("bp_svc: svc #0");
GUEST_ASSERT_EQ(hw_bp_addr, PC(bp_svc));
GUEST_ASSERT_EQ(svc_addr, PC(bp_svc) + 4);
- GUEST_SYNC(3);
-
/* Hardware-breakpoint + software-breakpoint */
reset_debug_state();
- install_hw_bp(PC(bp_brk));
+ install_hw_bp(bpn, PC(bp_brk));
asm volatile("bp_brk: brk #0");
GUEST_ASSERT_EQ(sw_bp_addr, PC(bp_brk));
GUEST_ASSERT_EQ(hw_bp_addr, PC(bp_brk));
- GUEST_SYNC(4);
-
/* Watchpoint */
reset_debug_state();
- install_wp(PC(write_data));
+ install_wp(wpn, PC(write_data));
write_data = 'x';
GUEST_ASSERT_EQ(write_data, 'x');
GUEST_ASSERT_EQ(wp_data_addr, PC(write_data));
- GUEST_SYNC(5);
-
/* Single-step */
reset_debug_state();
install_ss();
@@ -160,8 +283,6 @@ static void guest_code(void)
GUEST_ASSERT_EQ(ss_addr[1], PC(ss_start) + 4);
GUEST_ASSERT_EQ(ss_addr[2], PC(ss_start) + 8);
- GUEST_SYNC(6);
-
/* OS Lock does not block software-breakpoint */
reset_debug_state();
enable_os_lock();
@@ -169,30 +290,24 @@ static void guest_code(void)
asm volatile("sw_bp2: brk #0");
GUEST_ASSERT_EQ(sw_bp_addr, PC(sw_bp2));
- GUEST_SYNC(7);
-
/* OS Lock blocking hardware-breakpoint */
reset_debug_state();
enable_os_lock();
- install_hw_bp(PC(hw_bp2));
+ install_hw_bp(bpn, PC(hw_bp2));
hw_bp_addr = 0;
asm volatile("hw_bp2: nop");
GUEST_ASSERT_EQ(hw_bp_addr, 0);
- GUEST_SYNC(8);
-
/* OS Lock blocking watchpoint */
reset_debug_state();
enable_os_lock();
write_data = '\0';
wp_data_addr = 0;
- install_wp(PC(write_data));
+ install_wp(wpn, PC(write_data));
write_data = 'x';
GUEST_ASSERT_EQ(write_data, 'x');
GUEST_ASSERT_EQ(wp_data_addr, 0);
- GUEST_SYNC(9);
-
/* OS Lock blocking single-step */
reset_debug_state();
enable_os_lock();
@@ -205,6 +320,27 @@ static void guest_code(void)
: : : "x0");
GUEST_ASSERT_EQ(ss_addr[0], 0);
+ /* Linked hardware-breakpoint */
+ hw_bp_addr = 0;
+ reset_debug_state();
+ install_hw_bp_ctx(bpn, ctx_bpn, PC(hw_bp_ctx), ctx);
+ /* Set context id */
+ write_sysreg(ctx, contextidr_el1);
+ isb();
+ asm volatile("hw_bp_ctx: nop");
+ write_sysreg(0, contextidr_el1);
+ GUEST_ASSERT_EQ(hw_bp_addr, PC(hw_bp_ctx));
+
+ /* Linked watchpoint */
+ reset_debug_state();
+ install_wp_ctx(wpn, ctx_bpn, PC(write_data), ctx);
+ /* Set context id */
+ write_sysreg(ctx, contextidr_el1);
+ isb();
+ write_data = 'x';
+ GUEST_ASSERT_EQ(write_data, 'x');
+ GUEST_ASSERT_EQ(wp_data_addr, PC(write_data));
+
GUEST_DONE();
}
@@ -279,20 +415,16 @@ static void guest_code_ss(int test_cnt)
GUEST_DONE();
}
-static int debug_version(struct kvm_vcpu *vcpu)
+static int debug_version(uint64_t id_aa64dfr0)
{
- uint64_t id_aa64dfr0;
-
- vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1), &id_aa64dfr0);
- return id_aa64dfr0 & 0xf;
+ return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), id_aa64dfr0);
}
-static void test_guest_debug_exceptions(void)
+static void test_guest_debug_exceptions(uint8_t bpn, uint8_t wpn, uint8_t ctx_bpn)
{
struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
struct ucall uc;
- int stage;
vm = vm_create_with_one_vcpu(&vcpu, guest_code);
ucall_init(vm, NULL);
@@ -311,23 +443,19 @@ static void test_guest_debug_exceptions(void)
vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
ESR_EC_SVC64, guest_svc_handler);
- for (stage = 0; stage < 11; stage++) {
- vcpu_run(vcpu);
-
- switch (get_ucall(vcpu, &uc)) {
- case UCALL_SYNC:
- TEST_ASSERT(uc.args[1] == stage,
- "Stage %d: Unexpected sync ucall, got %lx",
- stage, (ulong)uc.args[1]);
- break;
- case UCALL_ABORT:
- REPORT_GUEST_ASSERT_2(uc, "values: %#lx, %#lx");
- break;
- case UCALL_DONE:
- goto done;
- default:
- TEST_FAIL("Unknown ucall %lu", uc.cmd);
- }
+ /* Specify bpn/wpn/ctx_bpn to be tested */
+ vcpu_args_set(vcpu, 3, bpn, wpn, ctx_bpn);
+ pr_debug("Use bpn#%d, wpn#%d and ctx_bpn#%d\n", bpn, wpn, ctx_bpn);
+
+ vcpu_run(vcpu);
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT_2(uc, "values: %#lx, %#lx");
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
}
done:
@@ -400,6 +528,43 @@ void test_single_step_from_userspace(int test_cnt)
kvm_vm_free(vm);
}
+/*
+ * Run debug testing using the various breakpoint#, watchpoint# and
+ * context-aware breakpoint# with the given ID_AA64DFR0_EL1 configuration.
+ */
+void test_guest_debug_exceptions_all(uint64_t aa64dfr0)
+{
+ uint8_t brp_num, wrp_num, ctx_brp_num, normal_brp_num, ctx_brp_base;
+ int b, w, c;
+
+ /* Number of breakpoints */
+ brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_BRPS), aa64dfr0) + 1;
+ __TEST_REQUIRE(brp_num >= 2, "At least two breakpoints are required");
+
+ /* Number of watchpoints */
+ wrp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_WRPS), aa64dfr0) + 1;
+
+ /* Number of context aware breakpoints */
+ ctx_brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_CTX_CMPS), aa64dfr0) + 1;
+
+ pr_debug("%s brp_num:%d, wrp_num:%d, ctx_brp_num:%d\n", __func__,
+ brp_num, wrp_num, ctx_brp_num);
+
+ /* Number of normal (non-context aware) breakpoints */
+ normal_brp_num = brp_num - ctx_brp_num;
+
+ /* Lowest context aware breakpoint number */
+ ctx_brp_base = normal_brp_num;
+
+ /* Run tests with all supported breakpoints/watchpoints */
+ for (c = ctx_brp_base; c < ctx_brp_base + ctx_brp_num; c++) {
+ for (b = 0; b < normal_brp_num; b++) {
+ for (w = 0; w < wrp_num; w++)
+ test_guest_debug_exceptions(b, w, c);
+ }
+ }
+}
+
static void help(char *name)
{
puts("");
@@ -414,9 +579,11 @@ int main(int argc, char *argv[])
struct kvm_vm *vm;
int opt;
int ss_iteration = 10000;
+ uint64_t aa64dfr0;
vm = vm_create_with_one_vcpu(&vcpu, guest_code);
- __TEST_REQUIRE(debug_version(vcpu) >= 6,
+ vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1), &aa64dfr0);
+ __TEST_REQUIRE(debug_version(aa64dfr0) >= 6,
"Armv8 debug architecture not supported.");
kvm_vm_free(vm);
@@ -432,7 +599,7 @@ int main(int argc, char *argv[])
}
}
- test_guest_debug_exceptions();
+ test_guest_debug_exceptions_all(aa64dfr0);
test_single_step_from_userspace(ss_iteration);
return 0;
diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c
new file mode 100644
index 000000000000..05bb6a6369c2
--- /dev/null
+++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c
@@ -0,0 +1,1112 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * page_fault_test.c - Test stage 2 faults.
+ *
+ * This test tries different combinations of guest accesses (e.g., write,
+ * S1PTW), backing source type (e.g., anon) and types of faults (e.g., read on
+ * hugetlbfs with a hole). It checks that the expected handling method is
+ * called (e.g., uffd faults with the right address and write/read flag).
+ */
+
+#define _GNU_SOURCE
+#include <linux/bitmap.h>
+#include <fcntl.h>
+#include <test_util.h>
+#include <kvm_util.h>
+#include <processor.h>
+#include <asm/sysreg.h>
+#include <linux/bitfield.h>
+#include "guest_modes.h"
+#include "userfaultfd_util.h"
+
+/* Guest virtual addresses that point to the test page and its PTE. */
+#define TEST_GVA 0xc0000000
+#define TEST_EXEC_GVA (TEST_GVA + 0x8)
+#define TEST_PTE_GVA 0xb0000000
+#define TEST_DATA 0x0123456789ABCDEF
+
+static uint64_t *guest_test_memory = (uint64_t *)TEST_GVA;
+
+#define CMD_NONE (0)
+#define CMD_SKIP_TEST (1ULL << 1)
+#define CMD_HOLE_PT (1ULL << 2)
+#define CMD_HOLE_DATA (1ULL << 3)
+#define CMD_CHECK_WRITE_IN_DIRTY_LOG (1ULL << 4)
+#define CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG (1ULL << 5)
+#define CMD_CHECK_NO_WRITE_IN_DIRTY_LOG (1ULL << 6)
+#define CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG (1ULL << 7)
+#define CMD_SET_PTE_AF (1ULL << 8)
+
+#define PREPARE_FN_NR 10
+#define CHECK_FN_NR 10
+
+static struct event_cnt {
+ int mmio_exits;
+ int fail_vcpu_runs;
+ int uffd_faults;
+ /* uffd_faults is incremented from multiple threads. */
+ pthread_mutex_t uffd_faults_mutex;
+} events;
+
+struct test_desc {
+ const char *name;
+ uint64_t mem_mark_cmd;
+ /* Skip the test if any prepare function returns false */
+ bool (*guest_prepare[PREPARE_FN_NR])(void);
+ void (*guest_test)(void);
+ void (*guest_test_check[CHECK_FN_NR])(void);
+ uffd_handler_t uffd_pt_handler;
+ uffd_handler_t uffd_data_handler;
+ void (*dabt_handler)(struct ex_regs *regs);
+ void (*iabt_handler)(struct ex_regs *regs);
+ void (*mmio_handler)(struct kvm_vm *vm, struct kvm_run *run);
+ void (*fail_vcpu_run_handler)(int ret);
+ uint32_t pt_memslot_flags;
+ uint32_t data_memslot_flags;
+ bool skip;
+ struct event_cnt expected_events;
+};
+
+struct test_params {
+ enum vm_mem_backing_src_type src_type;
+ struct test_desc *test_desc;
+};
+
+static inline void flush_tlb_page(uint64_t vaddr)
+{
+ uint64_t page = vaddr >> 12;
+
+ dsb(ishst);
+ asm volatile("tlbi vaae1is, %0" :: "r" (page));
+ dsb(ish);
+ isb();
+}
+
+static void guest_write64(void)
+{
+ uint64_t val;
+
+ WRITE_ONCE(*guest_test_memory, TEST_DATA);
+ val = READ_ONCE(*guest_test_memory);
+ GUEST_ASSERT_EQ(val, TEST_DATA);
+}
+
+/* Check the system for atomic instructions. */
+static bool guest_check_lse(void)
+{
+ uint64_t isar0 = read_sysreg(id_aa64isar0_el1);
+ uint64_t atomic;
+
+ atomic = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_ATOMICS), isar0);
+ return atomic >= 2;
+}
+
+static bool guest_check_dc_zva(void)
+{
+ uint64_t dczid = read_sysreg(dczid_el0);
+ uint64_t dzp = FIELD_GET(ARM64_FEATURE_MASK(DCZID_DZP), dczid);
+
+ return dzp == 0;
+}
+
+/* Compare and swap instruction. */
+static void guest_cas(void)
+{
+ uint64_t val;
+
+ GUEST_ASSERT(guest_check_lse());
+ asm volatile(".arch_extension lse\n"
+ "casal %0, %1, [%2]\n"
+ :: "r" (0), "r" (TEST_DATA), "r" (guest_test_memory));
+ val = READ_ONCE(*guest_test_memory);
+ GUEST_ASSERT_EQ(val, TEST_DATA);
+}
+
+static void guest_read64(void)
+{
+ uint64_t val;
+
+ val = READ_ONCE(*guest_test_memory);
+ GUEST_ASSERT_EQ(val, 0);
+}
+
+/* Address translation instruction */
+static void guest_at(void)
+{
+ uint64_t par;
+
+ asm volatile("at s1e1r, %0" :: "r" (guest_test_memory));
+ par = read_sysreg(par_el1);
+ isb();
+
+ /* Bit 1 indicates whether the AT was successful */
+ GUEST_ASSERT_EQ(par & 1, 0);
+}
+
+/*
+ * The size of the block written by "dc zva" is guaranteed to be between (2 <<
+ * 0) and (2 << 9), which is safe in our case as we need the write to happen
+ * for at least a word, and not more than a page.
+ */
+static void guest_dc_zva(void)
+{
+ uint16_t val;
+
+ asm volatile("dc zva, %0" :: "r" (guest_test_memory));
+ dsb(ish);
+ val = READ_ONCE(*guest_test_memory);
+ GUEST_ASSERT_EQ(val, 0);
+}
+
+/*
+ * Pre-indexing loads and stores don't have a valid syndrome (ESR_EL2.ISV==0).
+ * And that's special because KVM must take special care with those: they
+ * should still count as accesses for dirty logging or user-faulting, but
+ * should be handled differently on mmio.
+ */
+static void guest_ld_preidx(void)
+{
+ uint64_t val;
+ uint64_t addr = TEST_GVA - 8;
+
+ /*
+ * This ends up accessing "TEST_GVA + 8 - 8", where "TEST_GVA - 8" is
+ * in a gap between memslots not backing by anything.
+ */
+ asm volatile("ldr %0, [%1, #8]!"
+ : "=r" (val), "+r" (addr));
+ GUEST_ASSERT_EQ(val, 0);
+ GUEST_ASSERT_EQ(addr, TEST_GVA);
+}
+
+static void guest_st_preidx(void)
+{
+ uint64_t val = TEST_DATA;
+ uint64_t addr = TEST_GVA - 8;
+
+ asm volatile("str %0, [%1, #8]!"
+ : "+r" (val), "+r" (addr));
+
+ GUEST_ASSERT_EQ(addr, TEST_GVA);
+ val = READ_ONCE(*guest_test_memory);
+}
+
+static bool guest_set_ha(void)
+{
+ uint64_t mmfr1 = read_sysreg(id_aa64mmfr1_el1);
+ uint64_t hadbs, tcr;
+
+ /* Skip if HA is not supported. */
+ hadbs = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_HADBS), mmfr1);
+ if (hadbs == 0)
+ return false;
+
+ tcr = read_sysreg(tcr_el1) | TCR_EL1_HA;
+ write_sysreg(tcr, tcr_el1);
+ isb();
+
+ return true;
+}
+
+static bool guest_clear_pte_af(void)
+{
+ *((uint64_t *)TEST_PTE_GVA) &= ~PTE_AF;
+ flush_tlb_page(TEST_GVA);
+
+ return true;
+}
+
+static void guest_check_pte_af(void)
+{
+ dsb(ish);
+ GUEST_ASSERT_EQ(*((uint64_t *)TEST_PTE_GVA) & PTE_AF, PTE_AF);
+}
+
+static void guest_check_write_in_dirty_log(void)
+{
+ GUEST_SYNC(CMD_CHECK_WRITE_IN_DIRTY_LOG);
+}
+
+static void guest_check_no_write_in_dirty_log(void)
+{
+ GUEST_SYNC(CMD_CHECK_NO_WRITE_IN_DIRTY_LOG);
+}
+
+static void guest_check_s1ptw_wr_in_dirty_log(void)
+{
+ GUEST_SYNC(CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG);
+}
+
+static void guest_exec(void)
+{
+ int (*code)(void) = (int (*)(void))TEST_EXEC_GVA;
+ int ret;
+
+ ret = code();
+ GUEST_ASSERT_EQ(ret, 0x77);
+}
+
+static bool guest_prepare(struct test_desc *test)
+{
+ bool (*prepare_fn)(void);
+ int i;
+
+ for (i = 0; i < PREPARE_FN_NR; i++) {
+ prepare_fn = test->guest_prepare[i];
+ if (prepare_fn && !prepare_fn())
+ return false;
+ }
+
+ return true;
+}
+
+static void guest_test_check(struct test_desc *test)
+{
+ void (*check_fn)(void);
+ int i;
+
+ for (i = 0; i < CHECK_FN_NR; i++) {
+ check_fn = test->guest_test_check[i];
+ if (check_fn)
+ check_fn();
+ }
+}
+
+static void guest_code(struct test_desc *test)
+{
+ if (!guest_prepare(test))
+ GUEST_SYNC(CMD_SKIP_TEST);
+
+ GUEST_SYNC(test->mem_mark_cmd);
+
+ if (test->guest_test)
+ test->guest_test();
+
+ guest_test_check(test);
+ GUEST_DONE();
+}
+
+static void no_dabt_handler(struct ex_regs *regs)
+{
+ GUEST_ASSERT_1(false, read_sysreg(far_el1));
+}
+
+static void no_iabt_handler(struct ex_regs *regs)
+{
+ GUEST_ASSERT_1(false, regs->pc);
+}
+
+static struct uffd_args {
+ char *copy;
+ void *hva;
+ uint64_t paging_size;
+} pt_args, data_args;
+
+/* Returns true to continue the test, and false if it should be skipped. */
+static int uffd_generic_handler(int uffd_mode, int uffd, struct uffd_msg *msg,
+ struct uffd_args *args, bool expect_write)
+{
+ uint64_t addr = msg->arg.pagefault.address;
+ uint64_t flags = msg->arg.pagefault.flags;
+ struct uffdio_copy copy;
+ int ret;
+
+ TEST_ASSERT(uffd_mode == UFFDIO_REGISTER_MODE_MISSING,
+ "The only expected UFFD mode is MISSING");
+ ASSERT_EQ(!!(flags & UFFD_PAGEFAULT_FLAG_WRITE), expect_write);
+ ASSERT_EQ(addr, (uint64_t)args->hva);
+
+ pr_debug("uffd fault: addr=%p write=%d\n",
+ (void *)addr, !!(flags & UFFD_PAGEFAULT_FLAG_WRITE));
+
+ copy.src = (uint64_t)args->copy;
+ copy.dst = addr;
+ copy.len = args->paging_size;
+ copy.mode = 0;
+
+ ret = ioctl(uffd, UFFDIO_COPY, &copy);
+ if (ret == -1) {
+ pr_info("Failed UFFDIO_COPY in 0x%lx with errno: %d\n",
+ addr, errno);
+ return ret;
+ }
+
+ pthread_mutex_lock(&events.uffd_faults_mutex);
+ events.uffd_faults += 1;
+ pthread_mutex_unlock(&events.uffd_faults_mutex);
+ return 0;
+}
+
+static int uffd_pt_write_handler(int mode, int uffd, struct uffd_msg *msg)
+{
+ return uffd_generic_handler(mode, uffd, msg, &pt_args, true);
+}
+
+static int uffd_data_write_handler(int mode, int uffd, struct uffd_msg *msg)
+{
+ return uffd_generic_handler(mode, uffd, msg, &data_args, true);
+}
+
+static int uffd_data_read_handler(int mode, int uffd, struct uffd_msg *msg)
+{
+ return uffd_generic_handler(mode, uffd, msg, &data_args, false);
+}
+
+static void setup_uffd_args(struct userspace_mem_region *region,
+ struct uffd_args *args)
+{
+ args->hva = (void *)region->region.userspace_addr;
+ args->paging_size = region->region.memory_size;
+
+ args->copy = malloc(args->paging_size);
+ TEST_ASSERT(args->copy, "Failed to allocate data copy.");
+ memcpy(args->copy, args->hva, args->paging_size);
+}
+
+static void setup_uffd(struct kvm_vm *vm, struct test_params *p,
+ struct uffd_desc **pt_uffd, struct uffd_desc **data_uffd)
+{
+ struct test_desc *test = p->test_desc;
+ int uffd_mode = UFFDIO_REGISTER_MODE_MISSING;
+
+ setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_PT), &pt_args);
+ setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_TEST_DATA), &data_args);
+
+ *pt_uffd = NULL;
+ if (test->uffd_pt_handler)
+ *pt_uffd = uffd_setup_demand_paging(uffd_mode, 0,
+ pt_args.hva,
+ pt_args.paging_size,
+ test->uffd_pt_handler);
+
+ *data_uffd = NULL;
+ if (test->uffd_data_handler)
+ *data_uffd = uffd_setup_demand_paging(uffd_mode, 0,
+ data_args.hva,
+ data_args.paging_size,
+ test->uffd_data_handler);
+}
+
+static void free_uffd(struct test_desc *test, struct uffd_desc *pt_uffd,
+ struct uffd_desc *data_uffd)
+{
+ if (test->uffd_pt_handler)
+ uffd_stop_demand_paging(pt_uffd);
+ if (test->uffd_data_handler)
+ uffd_stop_demand_paging(data_uffd);
+
+ free(pt_args.copy);
+ free(data_args.copy);
+}
+
+static int uffd_no_handler(int mode, int uffd, struct uffd_msg *msg)
+{
+ TEST_FAIL("There was no UFFD fault expected.");
+ return -1;
+}
+
+/* Returns false if the test should be skipped. */
+static bool punch_hole_in_backing_store(struct kvm_vm *vm,
+ struct userspace_mem_region *region)
+{
+ void *hva = (void *)region->region.userspace_addr;
+ uint64_t paging_size = region->region.memory_size;
+ int ret, fd = region->fd;
+
+ if (fd != -1) {
+ ret = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ 0, paging_size);
+ TEST_ASSERT(ret == 0, "fallocate failed\n");
+ } else {
+ ret = madvise(hva, paging_size, MADV_DONTNEED);
+ TEST_ASSERT(ret == 0, "madvise failed\n");
+ }
+
+ return true;
+}
+
+static void mmio_on_test_gpa_handler(struct kvm_vm *vm, struct kvm_run *run)
+{
+ struct userspace_mem_region *region;
+ void *hva;
+
+ region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
+ hva = (void *)region->region.userspace_addr;
+
+ ASSERT_EQ(run->mmio.phys_addr, region->region.guest_phys_addr);
+
+ memcpy(hva, run->mmio.data, run->mmio.len);
+ events.mmio_exits += 1;
+}
+
+static void mmio_no_handler(struct kvm_vm *vm, struct kvm_run *run)
+{
+ uint64_t data;
+
+ memcpy(&data, run->mmio.data, sizeof(data));
+ pr_debug("addr=%lld len=%d w=%d data=%lx\n",
+ run->mmio.phys_addr, run->mmio.len,
+ run->mmio.is_write, data);
+ TEST_FAIL("There was no MMIO exit expected.");
+}
+
+static bool check_write_in_dirty_log(struct kvm_vm *vm,
+ struct userspace_mem_region *region,
+ uint64_t host_pg_nr)
+{
+ unsigned long *bmap;
+ bool first_page_dirty;
+ uint64_t size = region->region.memory_size;
+
+ /* getpage_size() is not always equal to vm->page_size */
+ bmap = bitmap_zalloc(size / getpagesize());
+ kvm_vm_get_dirty_log(vm, region->region.slot, bmap);
+ first_page_dirty = test_bit(host_pg_nr, bmap);
+ free(bmap);
+ return first_page_dirty;
+}
+
+/* Returns true to continue the test, and false if it should be skipped. */
+static bool handle_cmd(struct kvm_vm *vm, int cmd)
+{
+ struct userspace_mem_region *data_region, *pt_region;
+ bool continue_test = true;
+
+ data_region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
+ pt_region = vm_get_mem_region(vm, MEM_REGION_PT);
+
+ if (cmd == CMD_SKIP_TEST)
+ continue_test = false;
+
+ if (cmd & CMD_HOLE_PT)
+ continue_test = punch_hole_in_backing_store(vm, pt_region);
+ if (cmd & CMD_HOLE_DATA)
+ continue_test = punch_hole_in_backing_store(vm, data_region);
+ if (cmd & CMD_CHECK_WRITE_IN_DIRTY_LOG)
+ TEST_ASSERT(check_write_in_dirty_log(vm, data_region, 0),
+ "Missing write in dirty log");
+ if (cmd & CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG)
+ TEST_ASSERT(check_write_in_dirty_log(vm, pt_region, 0),
+ "Missing s1ptw write in dirty log");
+ if (cmd & CMD_CHECK_NO_WRITE_IN_DIRTY_LOG)
+ TEST_ASSERT(!check_write_in_dirty_log(vm, data_region, 0),
+ "Unexpected write in dirty log");
+ if (cmd & CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG)
+ TEST_ASSERT(!check_write_in_dirty_log(vm, pt_region, 0),
+ "Unexpected s1ptw write in dirty log");
+
+ return continue_test;
+}
+
+void fail_vcpu_run_no_handler(int ret)
+{
+ TEST_FAIL("Unexpected vcpu run failure\n");
+}
+
+void fail_vcpu_run_mmio_no_syndrome_handler(int ret)
+{
+ TEST_ASSERT(errno == ENOSYS,
+ "The mmio handler should have returned not implemented.");
+ events.fail_vcpu_runs += 1;
+}
+
+typedef uint32_t aarch64_insn_t;
+extern aarch64_insn_t __exec_test[2];
+
+noinline void __return_0x77(void)
+{
+ asm volatile("__exec_test: mov x0, #0x77\n"
+ "ret\n");
+}
+
+/*
+ * Note that this function runs on the host before the test VM starts: there's
+ * no need to sync the D$ and I$ caches.
+ */
+static void load_exec_code_for_test(struct kvm_vm *vm)
+{
+ uint64_t *code;
+ struct userspace_mem_region *region;
+ void *hva;
+
+ region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
+ hva = (void *)region->region.userspace_addr;
+
+ assert(TEST_EXEC_GVA > TEST_GVA);
+ code = hva + TEST_EXEC_GVA - TEST_GVA;
+ memcpy(code, __exec_test, sizeof(__exec_test));
+}
+
+static void setup_abort_handlers(struct kvm_vm *vm, struct kvm_vcpu *vcpu,
+ struct test_desc *test)
+{
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
+ ESR_EC_DABT, no_dabt_handler);
+ vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
+ ESR_EC_IABT, no_iabt_handler);
+}
+
+static void setup_gva_maps(struct kvm_vm *vm)
+{
+ struct userspace_mem_region *region;
+ uint64_t pte_gpa;
+
+ region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
+ /* Map TEST_GVA first. This will install a new PTE. */
+ virt_pg_map(vm, TEST_GVA, region->region.guest_phys_addr);
+ /* Then map TEST_PTE_GVA to the above PTE. */
+ pte_gpa = addr_hva2gpa(vm, virt_get_pte_hva(vm, TEST_GVA));
+ virt_pg_map(vm, TEST_PTE_GVA, pte_gpa);
+}
+
+enum pf_test_memslots {
+ CODE_AND_DATA_MEMSLOT,
+ PAGE_TABLE_MEMSLOT,
+ TEST_DATA_MEMSLOT,
+};
+
+/*
+ * Create a memslot for code and data at pfn=0, and test-data and PT ones
+ * at max_gfn.
+ */
+static void setup_memslots(struct kvm_vm *vm, struct test_params *p)
+{
+ uint64_t backing_src_pagesz = get_backing_src_pagesz(p->src_type);
+ uint64_t guest_page_size = vm->page_size;
+ uint64_t max_gfn = vm_compute_max_gfn(vm);
+ /* Enough for 2M of code when using 4K guest pages. */
+ uint64_t code_npages = 512;
+ uint64_t pt_size, data_size, data_gpa;
+
+ /*
+ * This test requires 1 pgd, 2 pud, 4 pmd, and 6 pte pages when using
+ * VM_MODE_P48V48_4K. Note that the .text takes ~1.6MBs. That's 13
+ * pages. VM_MODE_P48V48_4K is the mode with most PT pages; let's use
+ * twice that just in case.
+ */
+ pt_size = 26 * guest_page_size;
+
+ /* memslot sizes and gpa's must be aligned to the backing page size */
+ pt_size = align_up(pt_size, backing_src_pagesz);
+ data_size = align_up(guest_page_size, backing_src_pagesz);
+ data_gpa = (max_gfn * guest_page_size) - data_size;
+ data_gpa = align_down(data_gpa, backing_src_pagesz);
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0,
+ CODE_AND_DATA_MEMSLOT, code_npages, 0);
+ vm->memslots[MEM_REGION_CODE] = CODE_AND_DATA_MEMSLOT;
+ vm->memslots[MEM_REGION_DATA] = CODE_AND_DATA_MEMSLOT;
+
+ vm_userspace_mem_region_add(vm, p->src_type, data_gpa - pt_size,
+ PAGE_TABLE_MEMSLOT, pt_size / guest_page_size,
+ p->test_desc->pt_memslot_flags);
+ vm->memslots[MEM_REGION_PT] = PAGE_TABLE_MEMSLOT;
+
+ vm_userspace_mem_region_add(vm, p->src_type, data_gpa, TEST_DATA_MEMSLOT,
+ data_size / guest_page_size,
+ p->test_desc->data_memslot_flags);
+ vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT;
+}
+
+static void setup_default_handlers(struct test_desc *test)
+{
+ if (!test->mmio_handler)
+ test->mmio_handler = mmio_no_handler;
+
+ if (!test->fail_vcpu_run_handler)
+ test->fail_vcpu_run_handler = fail_vcpu_run_no_handler;
+}
+
+static void check_event_counts(struct test_desc *test)
+{
+ ASSERT_EQ(test->expected_events.uffd_faults, events.uffd_faults);
+ ASSERT_EQ(test->expected_events.mmio_exits, events.mmio_exits);
+ ASSERT_EQ(test->expected_events.fail_vcpu_runs, events.fail_vcpu_runs);
+}
+
+static void print_test_banner(enum vm_guest_mode mode, struct test_params *p)
+{
+ struct test_desc *test = p->test_desc;
+
+ pr_debug("Test: %s\n", test->name);
+ pr_debug("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+ pr_debug("Testing memory backing src type: %s\n",
+ vm_mem_backing_src_alias(p->src_type)->name);
+}
+
+static void reset_event_counts(void)
+{
+ memset(&events, 0, sizeof(events));
+}
+
+/*
+ * This function either succeeds, skips the test (after setting test->skip), or
+ * fails with a TEST_FAIL that aborts all tests.
+ */
+static void vcpu_run_loop(struct kvm_vm *vm, struct kvm_vcpu *vcpu,
+ struct test_desc *test)
+{
+ struct kvm_run *run;
+ struct ucall uc;
+ int ret;
+
+ run = vcpu->run;
+
+ for (;;) {
+ ret = _vcpu_run(vcpu);
+ if (ret) {
+ test->fail_vcpu_run_handler(ret);
+ goto done;
+ }
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_SYNC:
+ if (!handle_cmd(vm, uc.args[1])) {
+ test->skip = true;
+ goto done;
+ }
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT_2(uc, "values: %#lx, %#lx");
+ break;
+ case UCALL_DONE:
+ goto done;
+ case UCALL_NONE:
+ if (run->exit_reason == KVM_EXIT_MMIO)
+ test->mmio_handler(vm, run);
+ break;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+ }
+
+done:
+ pr_debug(test->skip ? "Skipped.\n" : "Done.\n");
+}
+
+static void run_test(enum vm_guest_mode mode, void *arg)
+{
+ struct test_params *p = (struct test_params *)arg;
+ struct test_desc *test = p->test_desc;
+ struct kvm_vm *vm;
+ struct kvm_vcpu *vcpu;
+ struct uffd_desc *pt_uffd, *data_uffd;
+
+ print_test_banner(mode, p);
+
+ vm = ____vm_create(mode);
+ setup_memslots(vm, p);
+ kvm_vm_elf_load(vm, program_invocation_name);
+ vcpu = vm_vcpu_add(vm, 0, guest_code);
+
+ setup_gva_maps(vm);
+
+ ucall_init(vm, NULL);
+
+ reset_event_counts();
+
+ /*
+ * Set some code in the data memslot for the guest to execute (only
+ * applicable to the EXEC tests). This has to be done before
+ * setup_uffd() as that function copies the memslot data for the uffd
+ * handler.
+ */
+ load_exec_code_for_test(vm);
+ setup_uffd(vm, p, &pt_uffd, &data_uffd);
+ setup_abort_handlers(vm, vcpu, test);
+ setup_default_handlers(test);
+ vcpu_args_set(vcpu, 1, test);
+
+ vcpu_run_loop(vm, vcpu, test);
+
+ ucall_uninit(vm);
+ kvm_vm_free(vm);
+ free_uffd(test, pt_uffd, data_uffd);
+
+ /*
+ * Make sure we check the events after the uffd threads have exited,
+ * which means they updated their respective event counters.
+ */
+ if (!test->skip)
+ check_event_counts(test);
+}
+
+static void help(char *name)
+{
+ puts("");
+ printf("usage: %s [-h] [-s mem-type]\n", name);
+ puts("");
+ guest_modes_help();
+ backing_src_help("-s");
+ puts("");
+}
+
+#define SNAME(s) #s
+#define SCAT2(a, b) SNAME(a ## _ ## b)
+#define SCAT3(a, b, c) SCAT2(a, SCAT2(b, c))
+#define SCAT4(a, b, c, d) SCAT2(a, SCAT3(b, c, d))
+
+#define _CHECK(_test) _CHECK_##_test
+#define _PREPARE(_test) _PREPARE_##_test
+#define _PREPARE_guest_read64 NULL
+#define _PREPARE_guest_ld_preidx NULL
+#define _PREPARE_guest_write64 NULL
+#define _PREPARE_guest_st_preidx NULL
+#define _PREPARE_guest_exec NULL
+#define _PREPARE_guest_at NULL
+#define _PREPARE_guest_dc_zva guest_check_dc_zva
+#define _PREPARE_guest_cas guest_check_lse
+
+/* With or without access flag checks */
+#define _PREPARE_with_af guest_set_ha, guest_clear_pte_af
+#define _PREPARE_no_af NULL
+#define _CHECK_with_af guest_check_pte_af
+#define _CHECK_no_af NULL
+
+/* Performs an access and checks that no faults were triggered. */
+#define TEST_ACCESS(_access, _with_af, _mark_cmd) \
+{ \
+ .name = SCAT3(_access, _with_af, #_mark_cmd), \
+ .guest_prepare = { _PREPARE(_with_af), \
+ _PREPARE(_access) }, \
+ .mem_mark_cmd = _mark_cmd, \
+ .guest_test = _access, \
+ .guest_test_check = { _CHECK(_with_af) }, \
+ .expected_events = { 0 }, \
+}
+
+#define TEST_UFFD(_access, _with_af, _mark_cmd, \
+ _uffd_data_handler, _uffd_pt_handler, _uffd_faults) \
+{ \
+ .name = SCAT4(uffd, _access, _with_af, #_mark_cmd), \
+ .guest_prepare = { _PREPARE(_with_af), \
+ _PREPARE(_access) }, \
+ .guest_test = _access, \
+ .mem_mark_cmd = _mark_cmd, \
+ .guest_test_check = { _CHECK(_with_af) }, \
+ .uffd_data_handler = _uffd_data_handler, \
+ .uffd_pt_handler = _uffd_pt_handler, \
+ .expected_events = { .uffd_faults = _uffd_faults, }, \
+}
+
+#define TEST_DIRTY_LOG(_access, _with_af, _test_check) \
+{ \
+ .name = SCAT3(dirty_log, _access, _with_af), \
+ .data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \
+ .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \
+ .guest_prepare = { _PREPARE(_with_af), \
+ _PREPARE(_access) }, \
+ .guest_test = _access, \
+ .guest_test_check = { _CHECK(_with_af), _test_check, \
+ guest_check_s1ptw_wr_in_dirty_log}, \
+ .expected_events = { 0 }, \
+}
+
+#define TEST_UFFD_AND_DIRTY_LOG(_access, _with_af, _uffd_data_handler, \
+ _uffd_faults, _test_check) \
+{ \
+ .name = SCAT3(uffd_and_dirty_log, _access, _with_af), \
+ .data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \
+ .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \
+ .guest_prepare = { _PREPARE(_with_af), \
+ _PREPARE(_access) }, \
+ .guest_test = _access, \
+ .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \
+ .guest_test_check = { _CHECK(_with_af), _test_check }, \
+ .uffd_data_handler = _uffd_data_handler, \
+ .uffd_pt_handler = uffd_pt_write_handler, \
+ .expected_events = { .uffd_faults = _uffd_faults, }, \
+}
+
+#define TEST_RO_MEMSLOT(_access, _mmio_handler, _mmio_exits) \
+{ \
+ .name = SCAT3(ro_memslot, _access, _with_af), \
+ .data_memslot_flags = KVM_MEM_READONLY, \
+ .guest_prepare = { _PREPARE(_access) }, \
+ .guest_test = _access, \
+ .mmio_handler = _mmio_handler, \
+ .expected_events = { .mmio_exits = _mmio_exits }, \
+}
+
+#define TEST_RO_MEMSLOT_NO_SYNDROME(_access) \
+{ \
+ .name = SCAT2(ro_memslot_no_syndrome, _access), \
+ .data_memslot_flags = KVM_MEM_READONLY, \
+ .guest_test = _access, \
+ .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \
+ .expected_events = { .fail_vcpu_runs = 1 }, \
+}
+
+#define TEST_RO_MEMSLOT_AND_DIRTY_LOG(_access, _mmio_handler, _mmio_exits, \
+ _test_check) \
+{ \
+ .name = SCAT3(ro_memslot, _access, _with_af), \
+ .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \
+ .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \
+ .guest_prepare = { _PREPARE(_access) }, \
+ .guest_test = _access, \
+ .guest_test_check = { _test_check }, \
+ .mmio_handler = _mmio_handler, \
+ .expected_events = { .mmio_exits = _mmio_exits}, \
+}
+
+#define TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(_access, _test_check) \
+{ \
+ .name = SCAT2(ro_memslot_no_syn_and_dlog, _access), \
+ .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \
+ .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \
+ .guest_test = _access, \
+ .guest_test_check = { _test_check }, \
+ .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \
+ .expected_events = { .fail_vcpu_runs = 1 }, \
+}
+
+#define TEST_RO_MEMSLOT_AND_UFFD(_access, _mmio_handler, _mmio_exits, \
+ _uffd_data_handler, _uffd_faults) \
+{ \
+ .name = SCAT2(ro_memslot_uffd, _access), \
+ .data_memslot_flags = KVM_MEM_READONLY, \
+ .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \
+ .guest_prepare = { _PREPARE(_access) }, \
+ .guest_test = _access, \
+ .uffd_data_handler = _uffd_data_handler, \
+ .uffd_pt_handler = uffd_pt_write_handler, \
+ .mmio_handler = _mmio_handler, \
+ .expected_events = { .mmio_exits = _mmio_exits, \
+ .uffd_faults = _uffd_faults }, \
+}
+
+#define TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(_access, _uffd_data_handler, \
+ _uffd_faults) \
+{ \
+ .name = SCAT2(ro_memslot_no_syndrome, _access), \
+ .data_memslot_flags = KVM_MEM_READONLY, \
+ .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \
+ .guest_test = _access, \
+ .uffd_data_handler = _uffd_data_handler, \
+ .uffd_pt_handler = uffd_pt_write_handler, \
+ .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \
+ .expected_events = { .fail_vcpu_runs = 1, \
+ .uffd_faults = _uffd_faults }, \
+}
+
+static struct test_desc tests[] = {
+
+ /* Check that HW is setting the Access Flag (AF) (sanity checks). */
+ TEST_ACCESS(guest_read64, with_af, CMD_NONE),
+ TEST_ACCESS(guest_ld_preidx, with_af, CMD_NONE),
+ TEST_ACCESS(guest_cas, with_af, CMD_NONE),
+ TEST_ACCESS(guest_write64, with_af, CMD_NONE),
+ TEST_ACCESS(guest_st_preidx, with_af, CMD_NONE),
+ TEST_ACCESS(guest_dc_zva, with_af, CMD_NONE),
+ TEST_ACCESS(guest_exec, with_af, CMD_NONE),
+
+ /*
+ * Punch a hole in the data backing store, and then try multiple
+ * accesses: reads should rturn zeroes, and writes should
+ * re-populate the page. Moreover, the test also check that no
+ * exception was generated in the guest. Note that this
+ * reading/writing behavior is the same as reading/writing a
+ * punched page (with fallocate(FALLOC_FL_PUNCH_HOLE)) from
+ * userspace.
+ */
+ TEST_ACCESS(guest_read64, no_af, CMD_HOLE_DATA),
+ TEST_ACCESS(guest_cas, no_af, CMD_HOLE_DATA),
+ TEST_ACCESS(guest_ld_preidx, no_af, CMD_HOLE_DATA),
+ TEST_ACCESS(guest_write64, no_af, CMD_HOLE_DATA),
+ TEST_ACCESS(guest_st_preidx, no_af, CMD_HOLE_DATA),
+ TEST_ACCESS(guest_at, no_af, CMD_HOLE_DATA),
+ TEST_ACCESS(guest_dc_zva, no_af, CMD_HOLE_DATA),
+
+ /*
+ * Punch holes in the data and PT backing stores and mark them for
+ * userfaultfd handling. This should result in 2 faults: the access
+ * on the data backing store, and its respective S1 page table walk
+ * (S1PTW).
+ */
+ TEST_UFFD(guest_read64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ uffd_data_read_handler, uffd_pt_write_handler, 2),
+ /* no_af should also lead to a PT write. */
+ TEST_UFFD(guest_read64, no_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ uffd_data_read_handler, uffd_pt_write_handler, 2),
+ /* Note how that cas invokes the read handler. */
+ TEST_UFFD(guest_cas, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ uffd_data_read_handler, uffd_pt_write_handler, 2),
+ /*
+ * Can't test guest_at with_af as it's IMPDEF whether the AF is set.
+ * The S1PTW fault should still be marked as a write.
+ */
+ TEST_UFFD(guest_at, no_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ uffd_data_read_handler, uffd_pt_write_handler, 1),
+ TEST_UFFD(guest_ld_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ uffd_data_read_handler, uffd_pt_write_handler, 2),
+ TEST_UFFD(guest_write64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ uffd_data_write_handler, uffd_pt_write_handler, 2),
+ TEST_UFFD(guest_dc_zva, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ uffd_data_write_handler, uffd_pt_write_handler, 2),
+ TEST_UFFD(guest_st_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ uffd_data_write_handler, uffd_pt_write_handler, 2),
+ TEST_UFFD(guest_exec, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ uffd_data_read_handler, uffd_pt_write_handler, 2),
+
+ /*
+ * Try accesses when the data and PT memory regions are both
+ * tracked for dirty logging.
+ */
+ TEST_DIRTY_LOG(guest_read64, with_af, guest_check_no_write_in_dirty_log),
+ /* no_af should also lead to a PT write. */
+ TEST_DIRTY_LOG(guest_read64, no_af, guest_check_no_write_in_dirty_log),
+ TEST_DIRTY_LOG(guest_ld_preidx, with_af, guest_check_no_write_in_dirty_log),
+ TEST_DIRTY_LOG(guest_at, no_af, guest_check_no_write_in_dirty_log),
+ TEST_DIRTY_LOG(guest_exec, with_af, guest_check_no_write_in_dirty_log),
+ TEST_DIRTY_LOG(guest_write64, with_af, guest_check_write_in_dirty_log),
+ TEST_DIRTY_LOG(guest_cas, with_af, guest_check_write_in_dirty_log),
+ TEST_DIRTY_LOG(guest_dc_zva, with_af, guest_check_write_in_dirty_log),
+ TEST_DIRTY_LOG(guest_st_preidx, with_af, guest_check_write_in_dirty_log),
+
+ /*
+ * Access when the data and PT memory regions are both marked for
+ * dirty logging and UFFD at the same time. The expected result is
+ * that writes should mark the dirty log and trigger a userfaultfd
+ * write fault. Reads/execs should result in a read userfaultfd
+ * fault, and nothing in the dirty log. Any S1PTW should result in
+ * a write in the dirty log and a userfaultfd write.
+ */
+ TEST_UFFD_AND_DIRTY_LOG(guest_read64, with_af, uffd_data_read_handler, 2,
+ guest_check_no_write_in_dirty_log),
+ /* no_af should also lead to a PT write. */
+ TEST_UFFD_AND_DIRTY_LOG(guest_read64, no_af, uffd_data_read_handler, 2,
+ guest_check_no_write_in_dirty_log),
+ TEST_UFFD_AND_DIRTY_LOG(guest_ld_preidx, with_af, uffd_data_read_handler,
+ 2, guest_check_no_write_in_dirty_log),
+ TEST_UFFD_AND_DIRTY_LOG(guest_at, with_af, 0, 1,
+ guest_check_no_write_in_dirty_log),
+ TEST_UFFD_AND_DIRTY_LOG(guest_exec, with_af, uffd_data_read_handler, 2,
+ guest_check_no_write_in_dirty_log),
+ TEST_UFFD_AND_DIRTY_LOG(guest_write64, with_af, uffd_data_write_handler,
+ 2, guest_check_write_in_dirty_log),
+ TEST_UFFD_AND_DIRTY_LOG(guest_cas, with_af, uffd_data_read_handler, 2,
+ guest_check_write_in_dirty_log),
+ TEST_UFFD_AND_DIRTY_LOG(guest_dc_zva, with_af, uffd_data_write_handler,
+ 2, guest_check_write_in_dirty_log),
+ TEST_UFFD_AND_DIRTY_LOG(guest_st_preidx, with_af,
+ uffd_data_write_handler, 2,
+ guest_check_write_in_dirty_log),
+
+ /*
+ * Try accesses when the data memory region is marked read-only
+ * (with KVM_MEM_READONLY). Writes with a syndrome result in an
+ * MMIO exit, writes with no syndrome (e.g., CAS) result in a
+ * failed vcpu run, and reads/execs with and without syndroms do
+ * not fault.
+ */
+ TEST_RO_MEMSLOT(guest_read64, 0, 0),
+ TEST_RO_MEMSLOT(guest_ld_preidx, 0, 0),
+ TEST_RO_MEMSLOT(guest_at, 0, 0),
+ TEST_RO_MEMSLOT(guest_exec, 0, 0),
+ TEST_RO_MEMSLOT(guest_write64, mmio_on_test_gpa_handler, 1),
+ TEST_RO_MEMSLOT_NO_SYNDROME(guest_dc_zva),
+ TEST_RO_MEMSLOT_NO_SYNDROME(guest_cas),
+ TEST_RO_MEMSLOT_NO_SYNDROME(guest_st_preidx),
+
+ /*
+ * Access when both the data region is both read-only and marked
+ * for dirty logging at the same time. The expected result is that
+ * for writes there should be no write in the dirty log. The
+ * readonly handling is the same as if the memslot was not marked
+ * for dirty logging: writes with a syndrome result in an MMIO
+ * exit, and writes with no syndrome result in a failed vcpu run.
+ */
+ TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_read64, 0, 0,
+ guest_check_no_write_in_dirty_log),
+ TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_ld_preidx, 0, 0,
+ guest_check_no_write_in_dirty_log),
+ TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_at, 0, 0,
+ guest_check_no_write_in_dirty_log),
+ TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_exec, 0, 0,
+ guest_check_no_write_in_dirty_log),
+ TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_write64, mmio_on_test_gpa_handler,
+ 1, guest_check_no_write_in_dirty_log),
+ TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_dc_zva,
+ guest_check_no_write_in_dirty_log),
+ TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_cas,
+ guest_check_no_write_in_dirty_log),
+ TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_st_preidx,
+ guest_check_no_write_in_dirty_log),
+
+ /*
+ * Access when the data region is both read-only and punched with
+ * holes tracked with userfaultfd. The expected result is the
+ * union of both userfaultfd and read-only behaviors. For example,
+ * write accesses result in a userfaultfd write fault and an MMIO
+ * exit. Writes with no syndrome result in a failed vcpu run and
+ * no userfaultfd write fault. Reads result in userfaultfd getting
+ * triggered.
+ */
+ TEST_RO_MEMSLOT_AND_UFFD(guest_read64, 0, 0,
+ uffd_data_read_handler, 2),
+ TEST_RO_MEMSLOT_AND_UFFD(guest_ld_preidx, 0, 0,
+ uffd_data_read_handler, 2),
+ TEST_RO_MEMSLOT_AND_UFFD(guest_at, 0, 0,
+ uffd_no_handler, 1),
+ TEST_RO_MEMSLOT_AND_UFFD(guest_exec, 0, 0,
+ uffd_data_read_handler, 2),
+ TEST_RO_MEMSLOT_AND_UFFD(guest_write64, mmio_on_test_gpa_handler, 1,
+ uffd_data_write_handler, 2),
+ TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_cas,
+ uffd_data_read_handler, 2),
+ TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_dc_zva,
+ uffd_no_handler, 1),
+ TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_st_preidx,
+ uffd_no_handler, 1),
+
+ { 0 }
+};
+
+static void for_each_test_and_guest_mode(enum vm_mem_backing_src_type src_type)
+{
+ struct test_desc *t;
+
+ for (t = &tests[0]; t->name; t++) {
+ if (t->skip)
+ continue;
+
+ struct test_params p = {
+ .src_type = src_type,
+ .test_desc = t,
+ };
+
+ for_each_guest_mode(run_test, &p);
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ enum vm_mem_backing_src_type src_type;
+ int opt;
+
+ setbuf(stdout, NULL);
+
+ src_type = DEFAULT_VM_MEM_SRC;
+
+ while ((opt = getopt(argc, argv, "hm:s:")) != -1) {
+ switch (opt) {
+ case 'm':
+ guest_modes_cmdline(optarg);
+ break;
+ case 's':
+ src_type = parse_backing_src_type(optarg);
+ break;
+ case 'h':
+ default:
+ help(argv[0]);
+ exit(0);
+ }
+ }
+
+ for_each_test_and_guest_mode(src_type);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c
index 76c583a07ea2..942370d57392 100644
--- a/tools/testing/selftests/kvm/access_tracking_perf_test.c
+++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c
@@ -58,9 +58,6 @@ static enum {
ITERATION_MARK_IDLE,
} iteration_work;
-/* Set to true when vCPU threads should exit. */
-static bool done;
-
/* The iteration that was last completed by each vCPU. */
static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
@@ -211,7 +208,7 @@ static bool spin_wait_for_next_iteration(int *current_iteration)
int last_iteration = *current_iteration;
do {
- if (READ_ONCE(done))
+ if (READ_ONCE(perf_test_args.stop_vcpus))
return false;
*current_iteration = READ_ONCE(iteration);
@@ -321,9 +318,6 @@ static void run_test(enum vm_guest_mode mode, void *arg)
mark_memory_idle(vm, nr_vcpus);
access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from idle memory");
- /* Set done to signal the vCPU threads to exit */
- done = true;
-
perf_test_join_vcpu_threads(nr_vcpus);
perf_test_destroy_vm(vm);
}
diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
index 779ae54f89c4..8e1fe4ffcccd 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -22,23 +22,13 @@
#include "test_util.h"
#include "perf_test_util.h"
#include "guest_modes.h"
+#include "userfaultfd_util.h"
#ifdef __NR_userfaultfd
-#ifdef PRINT_PER_PAGE_UPDATES
-#define PER_PAGE_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define PER_PAGE_DEBUG(...) _no_printf(__VA_ARGS__)
-#endif
-
-#ifdef PRINT_PER_VCPU_UPDATES
-#define PER_VCPU_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__)
-#endif
-
static int nr_vcpus = 1;
static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
+
static size_t demand_paging_size;
static char *guest_data_prototype;
@@ -67,9 +57,11 @@ static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args)
ts_diff.tv_sec, ts_diff.tv_nsec);
}
-static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr)
+static int handle_uffd_page_request(int uffd_mode, int uffd,
+ struct uffd_msg *msg)
{
pid_t tid = syscall(__NR_gettid);
+ uint64_t addr = msg->arg.pagefault.address;
struct timespec start;
struct timespec ts_diff;
int r;
@@ -116,174 +108,32 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr)
return 0;
}
-bool quit_uffd_thread;
-
-struct uffd_handler_args {
+struct test_params {
int uffd_mode;
- int uffd;
- int pipefd;
- useconds_t delay;
+ useconds_t uffd_delay;
+ enum vm_mem_backing_src_type src_type;
+ bool partition_vcpu_memory_access;
};
-static void *uffd_handler_thread_fn(void *arg)
-{
- struct uffd_handler_args *uffd_args = (struct uffd_handler_args *)arg;
- int uffd = uffd_args->uffd;
- int pipefd = uffd_args->pipefd;
- useconds_t delay = uffd_args->delay;
- int64_t pages = 0;
- struct timespec start;
- struct timespec ts_diff;
-
- clock_gettime(CLOCK_MONOTONIC, &start);
- while (!quit_uffd_thread) {
- struct uffd_msg msg;
- struct pollfd pollfd[2];
- char tmp_chr;
- int r;
- uint64_t addr;
-
- pollfd[0].fd = uffd;
- pollfd[0].events = POLLIN;
- pollfd[1].fd = pipefd;
- pollfd[1].events = POLLIN;
-
- r = poll(pollfd, 2, -1);
- switch (r) {
- case -1:
- pr_info("poll err");
- continue;
- case 0:
- continue;
- case 1:
- break;
- default:
- pr_info("Polling uffd returned %d", r);
- return NULL;
- }
-
- if (pollfd[0].revents & POLLERR) {
- pr_info("uffd revents has POLLERR");
- return NULL;
- }
-
- if (pollfd[1].revents & POLLIN) {
- r = read(pollfd[1].fd, &tmp_chr, 1);
- TEST_ASSERT(r == 1,
- "Error reading pipefd in UFFD thread\n");
- return NULL;
- }
-
- if (!(pollfd[0].revents & POLLIN))
- continue;
-
- r = read(uffd, &msg, sizeof(msg));
- if (r == -1) {
- if (errno == EAGAIN)
- continue;
- pr_info("Read of uffd got errno %d\n", errno);
- return NULL;
- }
-
- if (r != sizeof(msg)) {
- pr_info("Read on uffd returned unexpected size: %d bytes", r);
- return NULL;
- }
-
- if (!(msg.event & UFFD_EVENT_PAGEFAULT))
- continue;
-
- if (delay)
- usleep(delay);
- addr = msg.arg.pagefault.address;
- r = handle_uffd_page_request(uffd_args->uffd_mode, uffd, addr);
- if (r < 0)
- return NULL;
- pages++;
- }
-
- ts_diff = timespec_elapsed(start);
- PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n",
- pages, ts_diff.tv_sec, ts_diff.tv_nsec,
- pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0));
-
- return NULL;
-}
-
-static void setup_demand_paging(struct kvm_vm *vm,
- pthread_t *uffd_handler_thread, int pipefd,
- int uffd_mode, useconds_t uffd_delay,
- struct uffd_handler_args *uffd_args,
- void *hva, void *alias, uint64_t len)
+static void prefault_mem(void *alias, uint64_t len)
{
- bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR);
- int uffd;
- struct uffdio_api uffdio_api;
- struct uffdio_register uffdio_register;
- uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY;
- int ret;
+ size_t p;
- PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n",
- is_minor ? "MINOR" : "MISSING",
- is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY");
-
- /* In order to get minor faults, prefault via the alias. */
- if (is_minor) {
- size_t p;
-
- expected_ioctls = ((uint64_t) 1) << _UFFDIO_CONTINUE;
-
- TEST_ASSERT(alias != NULL, "Alias required for minor faults");
- for (p = 0; p < (len / demand_paging_size); ++p) {
- memcpy(alias + (p * demand_paging_size),
- guest_data_prototype, demand_paging_size);
- }
+ TEST_ASSERT(alias != NULL, "Alias required for minor faults");
+ for (p = 0; p < (len / demand_paging_size); ++p) {
+ memcpy(alias + (p * demand_paging_size),
+ guest_data_prototype, demand_paging_size);
}
-
- uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
- TEST_ASSERT(uffd >= 0, __KVM_SYSCALL_ERROR("userfaultfd()", uffd));
-
- uffdio_api.api = UFFD_API;
- uffdio_api.features = 0;
- ret = ioctl(uffd, UFFDIO_API, &uffdio_api);
- TEST_ASSERT(ret != -1, __KVM_SYSCALL_ERROR("UFFDIO_API", ret));
-
- uffdio_register.range.start = (uint64_t)hva;
- uffdio_register.range.len = len;
- uffdio_register.mode = uffd_mode;
- ret = ioctl(uffd, UFFDIO_REGISTER, &uffdio_register);
- TEST_ASSERT(ret != -1, __KVM_SYSCALL_ERROR("UFFDIO_REGISTER", ret));
- TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) ==
- expected_ioctls, "missing userfaultfd ioctls");
-
- uffd_args->uffd_mode = uffd_mode;
- uffd_args->uffd = uffd;
- uffd_args->pipefd = pipefd;
- uffd_args->delay = uffd_delay;
- pthread_create(uffd_handler_thread, NULL, uffd_handler_thread_fn,
- uffd_args);
-
- PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n",
- hva, hva + len);
}
-struct test_params {
- int uffd_mode;
- useconds_t uffd_delay;
- enum vm_mem_backing_src_type src_type;
- bool partition_vcpu_memory_access;
-};
-
static void run_test(enum vm_guest_mode mode, void *arg)
{
struct test_params *p = arg;
- pthread_t *uffd_handler_threads = NULL;
- struct uffd_handler_args *uffd_args = NULL;
+ struct uffd_desc **uffd_descs = NULL;
struct timespec start;
struct timespec ts_diff;
- int *pipefds = NULL;
struct kvm_vm *vm;
- int r, i;
+ int i;
vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
p->src_type, p->partition_vcpu_memory_access);
@@ -296,15 +146,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
memset(guest_data_prototype, 0xAB, demand_paging_size);
if (p->uffd_mode) {
- uffd_handler_threads =
- malloc(nr_vcpus * sizeof(*uffd_handler_threads));
- TEST_ASSERT(uffd_handler_threads, "Memory allocation failed");
-
- uffd_args = malloc(nr_vcpus * sizeof(*uffd_args));
- TEST_ASSERT(uffd_args, "Memory allocation failed");
-
- pipefds = malloc(sizeof(int) * nr_vcpus * 2);
- TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd");
+ uffd_descs = malloc(nr_vcpus * sizeof(struct uffd_desc *));
+ TEST_ASSERT(uffd_descs, "Memory allocation failed");
for (i = 0; i < nr_vcpus; i++) {
struct perf_test_vcpu_args *vcpu_args;
@@ -317,19 +160,17 @@ static void run_test(enum vm_guest_mode mode, void *arg)
vcpu_hva = addr_gpa2hva(vm, vcpu_args->gpa);
vcpu_alias = addr_gpa2alias(vm, vcpu_args->gpa);
+ prefault_mem(vcpu_alias,
+ vcpu_args->pages * perf_test_args.guest_page_size);
+
/*
* Set up user fault fd to handle demand paging
* requests.
*/
- r = pipe2(&pipefds[i * 2],
- O_CLOEXEC | O_NONBLOCK);
- TEST_ASSERT(!r, "Failed to set up pipefd");
-
- setup_demand_paging(vm, &uffd_handler_threads[i],
- pipefds[i * 2], p->uffd_mode,
- p->uffd_delay, &uffd_args[i],
- vcpu_hva, vcpu_alias,
- vcpu_args->pages * perf_test_args.guest_page_size);
+ uffd_descs[i] = uffd_setup_demand_paging(
+ p->uffd_mode, p->uffd_delay, vcpu_hva,
+ vcpu_args->pages * perf_test_args.guest_page_size,
+ &handle_uffd_page_request);
}
}
@@ -344,15 +185,9 @@ static void run_test(enum vm_guest_mode mode, void *arg)
pr_info("All vCPU threads joined\n");
if (p->uffd_mode) {
- char c;
-
/* Tell the user fault fd handler threads to quit */
- for (i = 0; i < nr_vcpus; i++) {
- r = write(pipefds[i * 2 + 1], &c, 1);
- TEST_ASSERT(r == 1, "Unable to write to pipefd");
-
- pthread_join(uffd_handler_threads[i], NULL);
- }
+ for (i = 0; i < nr_vcpus; i++)
+ uffd_stop_demand_paging(uffd_descs[i]);
}
pr_info("Total guest execution time: %ld.%.9lds\n",
@@ -364,11 +199,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
perf_test_destroy_vm(vm);
free(guest_data_prototype);
- if (p->uffd_mode) {
- free(uffd_handler_threads);
- free(uffd_args);
- free(pipefds);
- }
+ if (p->uffd_mode)
+ free(uffd_descs);
}
static void help(char *name)
diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
index b5234d6efbe1..a87e5f78ebf1 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -24,6 +24,9 @@
#include "guest_modes.h"
#include "processor.h"
+#define DIRTY_MEM_BITS 30 /* 1G */
+#define PAGE_SHIFT_4K 12
+
/* The memory slot index to track dirty pages */
#define TEST_MEM_SLOT_INDEX 1
@@ -226,13 +229,15 @@ static void clear_log_create_vm_done(struct kvm_vm *vm)
}
static void dirty_log_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot,
- void *bitmap, uint32_t num_pages)
+ void *bitmap, uint32_t num_pages,
+ uint32_t *unused)
{
kvm_vm_get_dirty_log(vcpu->vm, slot, bitmap);
}
static void clear_log_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot,
- void *bitmap, uint32_t num_pages)
+ void *bitmap, uint32_t num_pages,
+ uint32_t *unused)
{
kvm_vm_get_dirty_log(vcpu->vm, slot, bitmap);
kvm_vm_clear_dirty_log(vcpu->vm, slot, bitmap, 0, num_pages);
@@ -271,6 +276,24 @@ static bool dirty_ring_supported(void)
static void dirty_ring_create_vm_done(struct kvm_vm *vm)
{
+ uint64_t pages;
+ uint32_t limit;
+
+ /*
+ * We rely on vcpu exit due to full dirty ring state. Adjust
+ * the ring buffer size to ensure we're able to reach the
+ * full dirty ring state.
+ */
+ pages = (1ul << (DIRTY_MEM_BITS - vm->page_shift)) + 3;
+ pages = vm_adjust_num_guest_pages(vm->mode, pages);
+ if (vm->page_size < getpagesize())
+ pages = vm_num_host_pages(vm->mode, pages);
+
+ limit = 1 << (31 - __builtin_clz(pages));
+ test_dirty_ring_count = 1 << (31 - __builtin_clz(test_dirty_ring_count));
+ test_dirty_ring_count = min(limit, test_dirty_ring_count);
+ pr_info("dirty ring count: 0x%x\n", test_dirty_ring_count);
+
/*
* Switch to dirty ring mode after VM creation but before any
* of the vcpu creation.
@@ -329,10 +352,9 @@ static void dirty_ring_continue_vcpu(void)
}
static void dirty_ring_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot,
- void *bitmap, uint32_t num_pages)
+ void *bitmap, uint32_t num_pages,
+ uint32_t *ring_buf_idx)
{
- /* We only have one vcpu */
- static uint32_t fetch_index = 0;
uint32_t count = 0, cleared;
bool continued_vcpu = false;
@@ -349,7 +371,8 @@ static void dirty_ring_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot,
/* Only have one vcpu */
count = dirty_ring_collect_one(vcpu_map_dirty_ring(vcpu),
- slot, bitmap, num_pages, &fetch_index);
+ slot, bitmap, num_pages,
+ ring_buf_idx);
cleared = kvm_vm_reset_dirty_ring(vcpu->vm);
@@ -406,7 +429,8 @@ struct log_mode {
void (*create_vm_done)(struct kvm_vm *vm);
/* Hook to collect the dirty pages into the bitmap provided */
void (*collect_dirty_pages) (struct kvm_vcpu *vcpu, int slot,
- void *bitmap, uint32_t num_pages);
+ void *bitmap, uint32_t num_pages,
+ uint32_t *ring_buf_idx);
/* Hook to call when after each vcpu run */
void (*after_vcpu_run)(struct kvm_vcpu *vcpu, int ret, int err);
void (*before_vcpu_join) (void);
@@ -471,13 +495,14 @@ static void log_mode_create_vm_done(struct kvm_vm *vm)
}
static void log_mode_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot,
- void *bitmap, uint32_t num_pages)
+ void *bitmap, uint32_t num_pages,
+ uint32_t *ring_buf_idx)
{
struct log_mode *mode = &log_modes[host_log_mode];
TEST_ASSERT(mode->collect_dirty_pages != NULL,
"collect_dirty_pages() is required for any log mode!");
- mode->collect_dirty_pages(vcpu, slot, bitmap, num_pages);
+ mode->collect_dirty_pages(vcpu, slot, bitmap, num_pages, ring_buf_idx);
}
static void log_mode_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err)
@@ -681,9 +706,6 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, struct kvm_vcpu **vcpu,
return vm;
}
-#define DIRTY_MEM_BITS 30 /* 1G */
-#define PAGE_SHIFT_4K 12
-
struct test_params {
unsigned long iterations;
unsigned long interval;
@@ -696,6 +718,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
unsigned long *bmap;
+ uint32_t ring_buf_idx = 0;
if (!log_mode_supported()) {
print_skip("Log mode '%s' not supported",
@@ -771,6 +794,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
host_dirty_count = 0;
host_clear_count = 0;
host_track_next_count = 0;
+ WRITE_ONCE(dirty_ring_vcpu_ring_full, false);
pthread_create(&vcpu_thread, NULL, vcpu_worker, vcpu);
@@ -778,7 +802,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
/* Give the vcpu thread some time to dirty some pages */
usleep(p->interval * 1000);
log_mode_collect_dirty_pages(vcpu, TEST_MEM_SLOT_INDEX,
- bmap, host_num_pages);
+ bmap, host_num_pages,
+ &ring_buf_idx);
/*
* See vcpu_sync_stop_requested definition for details on why
@@ -823,7 +848,7 @@ static void help(char *name)
printf("usage: %s [-h] [-i iterations] [-I interval] "
"[-p offset] [-m mode]\n", name);
puts("");
- printf(" -c: specify dirty ring size, in number of entries\n");
+ printf(" -c: hint to dirty ring size, in number of entries\n");
printf(" (only useful for dirty-ring test; default: %"PRIu32")\n",
TEST_DIRTY_RING_COUNT);
printf(" -i: specify iteration counts (default: %"PRIu64")\n",
diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h
index a8124f9dd68a..5f977528e09c 100644
--- a/tools/testing/selftests/kvm/include/aarch64/processor.h
+++ b/tools/testing/selftests/kvm/include/aarch64/processor.h
@@ -38,12 +38,25 @@
* NORMAL 4 1111:1111
* NORMAL_WT 5 1011:1011
*/
-#define DEFAULT_MAIR_EL1 ((0x00ul << (0 * 8)) | \
- (0x04ul << (1 * 8)) | \
- (0x0cul << (2 * 8)) | \
- (0x44ul << (3 * 8)) | \
- (0xfful << (4 * 8)) | \
- (0xbbul << (5 * 8)))
+
+/* Linux doesn't use these memory types, so let's define them. */
+#define MAIR_ATTR_DEVICE_GRE UL(0x0c)
+#define MAIR_ATTR_NORMAL_WT UL(0xbb)
+
+#define MT_DEVICE_nGnRnE 0
+#define MT_DEVICE_nGnRE 1
+#define MT_DEVICE_GRE 2
+#define MT_NORMAL_NC 3
+#define MT_NORMAL 4
+#define MT_NORMAL_WT 5
+
+#define DEFAULT_MAIR_EL1 \
+ (MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) | \
+ MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) | \
+ MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) | \
+ MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) | \
+ MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) | \
+ MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT))
#define MPIDR_HWID_BITMASK (0xff00fffffful)
@@ -92,11 +105,19 @@ enum {
#define ESR_EC_MASK (ESR_EC_NUM - 1)
#define ESR_EC_SVC64 0x15
+#define ESR_EC_IABT 0x21
+#define ESR_EC_DABT 0x25
#define ESR_EC_HW_BP_CURRENT 0x31
#define ESR_EC_SSTEP_CURRENT 0x33
#define ESR_EC_WP_CURRENT 0x35
#define ESR_EC_BRK_INS 0x3c
+/* Access flag */
+#define PTE_AF (1ULL << 10)
+
+/* Access flag update enable/disable */
+#define TCR_EL1_HA (1ULL << 39)
+
void aarch64_get_supported_page_sizes(uint32_t ipa,
bool *ps4k, bool *ps16k, bool *ps64k);
@@ -109,6 +130,8 @@ void vm_install_exception_handler(struct kvm_vm *vm,
void vm_install_sync_handler(struct kvm_vm *vm,
int vector, int ec, handler_fn handler);
+uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva);
+
static inline void cpu_relax(void)
{
asm volatile("yield" ::: "memory");
diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h
index e42a09cd24a0..b0da75af1ff3 100644
--- a/tools/testing/selftests/kvm/include/kvm_util_base.h
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@ -34,6 +34,7 @@ struct userspace_mem_region {
struct sparsebit *unused_phy_pages;
int fd;
off_t offset;
+ enum vm_mem_backing_src_type backing_src_type;
void *host_mem;
void *host_alias;
void *mmap_start;
@@ -64,6 +65,14 @@ struct userspace_mem_regions {
DECLARE_HASHTABLE(slot_hash, 9);
};
+enum kvm_mem_region_type {
+ MEM_REGION_CODE,
+ MEM_REGION_DATA,
+ MEM_REGION_PT,
+ MEM_REGION_TEST_DATA,
+ NR_MEM_REGIONS,
+};
+
struct kvm_vm {
int mode;
unsigned long type;
@@ -92,6 +101,13 @@ struct kvm_vm {
int stats_fd;
struct kvm_stats_header stats_header;
struct kvm_stats_desc *stats_desc;
+
+ /*
+ * KVM region slots. These are the default memslots used by page
+ * allocators, e.g., lib/elf uses the memslots[MEM_REGION_CODE]
+ * memslot.
+ */
+ uint32_t memslots[NR_MEM_REGIONS];
};
@@ -104,6 +120,13 @@ struct kvm_vm {
struct userspace_mem_region *
memslot2region(struct kvm_vm *vm, uint32_t memslot);
+static inline struct userspace_mem_region *vm_get_mem_region(struct kvm_vm *vm,
+ enum kvm_mem_region_type type)
+{
+ assert(type < NR_MEM_REGIONS);
+ return memslot2region(vm, vm->memslots[type]);
+}
+
/* Minimum allocated guest virtual and physical addresses */
#define KVM_UTIL_MIN_VADDR 0x2000
#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000
@@ -384,7 +407,11 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot);
struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id);
vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min);
+vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
+ enum kvm_mem_region_type type);
vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages);
+vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm,
+ enum kvm_mem_region_type type);
vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm);
void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
@@ -646,13 +673,13 @@ vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm);
* __vm_create() does NOT create vCPUs, @nr_runnable_vcpus is used purely to
* calculate the amount of memory needed for per-vCPU data, e.g. stacks.
*/
-struct kvm_vm *____vm_create(enum vm_guest_mode mode, uint64_t nr_pages);
+struct kvm_vm *____vm_create(enum vm_guest_mode mode);
struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus,
uint64_t nr_extra_pages);
static inline struct kvm_vm *vm_create_barebones(void)
{
- return ____vm_create(VM_MODE_DEFAULT, 0);
+ return ____vm_create(VM_MODE_DEFAULT);
}
static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus)
diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h
index eaa88df0555a..536d7c3c3f14 100644
--- a/tools/testing/selftests/kvm/include/perf_test_util.h
+++ b/tools/testing/selftests/kvm/include/perf_test_util.h
@@ -40,6 +40,9 @@ struct perf_test_args {
/* Run vCPUs in L2 instead of L1, if the architecture supports it. */
bool nested;
+ /* Test is done, stop running vCPUs. */
+ bool stop_vcpus;
+
struct perf_test_vcpu_args vcpu_args[KVM_MAX_VCPUS];
};
diff --git a/tools/testing/selftests/kvm/include/userfaultfd_util.h b/tools/testing/selftests/kvm/include/userfaultfd_util.h
new file mode 100644
index 000000000000..877449c34592
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/userfaultfd_util.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KVM userfaultfd util
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ * Copyright (C) 2019-2022 Google LLC
+ */
+
+#define _GNU_SOURCE /* for pipe2 */
+
+#include <inttypes.h>
+#include <time.h>
+#include <pthread.h>
+#include <linux/userfaultfd.h>
+
+#include "test_util.h"
+
+typedef int (*uffd_handler_t)(int uffd_mode, int uffd, struct uffd_msg *msg);
+
+struct uffd_desc {
+ int uffd_mode;
+ int uffd;
+ int pipefds[2];
+ useconds_t delay;
+ uffd_handler_t handler;
+ pthread_t thread;
+};
+
+struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay,
+ void *hva, uint64_t len,
+ uffd_handler_t handler);
+
+void uffd_stop_demand_paging(struct uffd_desc *uffd);
+
+#ifdef PRINT_PER_PAGE_UPDATES
+#define PER_PAGE_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define PER_PAGE_DEBUG(...) _no_printf(__VA_ARGS__)
+#endif
+
+#ifdef PRINT_PER_VCPU_UPDATES
+#define PER_VCPU_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__)
+#endif
diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c
index 6f5551368944..f79f2e37dc3b 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/processor.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c
@@ -11,6 +11,7 @@
#include "guest_modes.h"
#include "kvm_util.h"
#include "processor.h"
+#include <linux/bitfield.h>
#define DEFAULT_ARM64_GUEST_STACK_VADDR_MIN 0xac0000
@@ -76,13 +77,15 @@ static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm)
void virt_arch_pgd_alloc(struct kvm_vm *vm)
{
- if (!vm->pgd_created) {
- vm_paddr_t paddr = vm_phy_pages_alloc(vm,
- page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size,
- KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0);
- vm->pgd = paddr;
- vm->pgd_created = true;
- }
+ size_t nr_pages = page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size;
+
+ if (vm->pgd_created)
+ return;
+
+ vm->pgd = vm_phy_pages_alloc(vm, nr_pages,
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR,
+ vm->memslots[MEM_REGION_PT]);
+ vm->pgd_created = true;
}
static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
@@ -133,12 +136,12 @@ static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
{
- uint64_t attr_idx = 4; /* NORMAL (See DEFAULT_MAIR_EL1) */
+ uint64_t attr_idx = MT_NORMAL;
_virt_pg_map(vm, vaddr, paddr, attr_idx);
}
-vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
+uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva)
{
uint64_t *ptep;
@@ -169,11 +172,18 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
TEST_FAIL("Page table levels must be 2, 3, or 4");
}
- return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1));
+ return ptep;
unmapped_gva:
TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);
- exit(1);
+ exit(EXIT_FAILURE);
+}
+
+vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+ uint64_t *ptep = virt_get_pte_hva(vm, gva);
+
+ return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1));
}
static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t page, int level)
@@ -318,13 +328,16 @@ void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent)
struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
struct kvm_vcpu_init *init, void *guest_code)
{
- size_t stack_size = vm->page_size == 4096 ?
- DEFAULT_STACK_PGS * vm->page_size :
- vm->page_size;
- uint64_t stack_vaddr = vm_vaddr_alloc(vm, stack_size,
- DEFAULT_ARM64_GUEST_STACK_VADDR_MIN);
+ size_t stack_size;
+ uint64_t stack_vaddr;
struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id);
+ stack_size = vm->page_size == 4096 ? DEFAULT_STACK_PGS * vm->page_size :
+ vm->page_size;
+ stack_vaddr = __vm_vaddr_alloc(vm, stack_size,
+ DEFAULT_ARM64_GUEST_STACK_VADDR_MIN,
+ MEM_REGION_DATA);
+
aarch64_vcpu_setup(vcpu, init);
vcpu_set_reg(vcpu, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size);
@@ -428,8 +441,8 @@ unexpected_exception:
void vm_init_descriptor_tables(struct kvm_vm *vm)
{
- vm->handlers = vm_vaddr_alloc(vm, sizeof(struct handlers),
- vm->page_size);
+ vm->handlers = __vm_vaddr_alloc(vm, sizeof(struct handlers),
+ vm->page_size, MEM_REGION_DATA);
*(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers;
}
@@ -486,9 +499,9 @@ void aarch64_get_supported_page_sizes(uint32_t ipa,
err = ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_GET_ONE_REG, vcpu_fd));
- *ps4k = ((val >> 28) & 0xf) != 0xf;
- *ps64k = ((val >> 24) & 0xf) == 0;
- *ps16k = ((val >> 20) & 0xf) != 0;
+ *ps4k = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_TGRAN4), val) != 0xf;
+ *ps64k = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_TGRAN64), val) == 0;
+ *ps16k = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_TGRAN16), val) != 0;
close(vcpu_fd);
close(vm_fd);
diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c
index 9f54c098d9d0..51f280c412ba 100644
--- a/tools/testing/selftests/kvm/lib/elf.c
+++ b/tools/testing/selftests/kvm/lib/elf.c
@@ -161,7 +161,8 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename)
seg_vend |= vm->page_size - 1;
size_t seg_size = seg_vend - seg_vstart + 1;
- vm_vaddr_t vaddr = vm_vaddr_alloc(vm, seg_size, seg_vstart);
+ vm_vaddr_t vaddr = __vm_vaddr_alloc(vm, seg_size, seg_vstart,
+ MEM_REGION_CODE);
TEST_ASSERT(vaddr == seg_vstart, "Unable to allocate "
"virtual memory for segment at requested min addr,\n"
" segment idx: %u\n"
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index f1cb1627161f..e3daa97ab0f4 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -185,13 +185,10 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = {
_Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
"Missing new mode params?");
-struct kvm_vm *____vm_create(enum vm_guest_mode mode, uint64_t nr_pages)
+struct kvm_vm *____vm_create(enum vm_guest_mode mode)
{
struct kvm_vm *vm;
- pr_debug("%s: mode='%s' pages='%ld'\n", __func__,
- vm_guest_mode_string(mode), nr_pages);
-
vm = calloc(1, sizeof(*vm));
TEST_ASSERT(vm != NULL, "Insufficient Memory");
@@ -287,9 +284,6 @@ struct kvm_vm *____vm_create(enum vm_guest_mode mode, uint64_t nr_pages)
/* Allocate and setup memory for guest. */
vm->vpages_mapped = sparsebit_alloc();
- if (nr_pages != 0)
- vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
- 0, 0, nr_pages, 0);
return vm;
}
@@ -335,8 +329,16 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus,
uint64_t nr_pages = vm_nr_pages_required(mode, nr_runnable_vcpus,
nr_extra_pages);
struct kvm_vm *vm;
+ int i;
+
+ pr_debug("%s: mode='%s' pages='%ld'\n", __func__,
+ vm_guest_mode_string(mode), nr_pages);
- vm = ____vm_create(mode, nr_pages);
+ vm = ____vm_create(mode);
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0);
+ for (i = 0; i < NR_MEM_REGIONS; i++)
+ vm->memslots[i] = 0;
kvm_vm_elf_load(vm, program_invocation_name);
@@ -586,6 +588,12 @@ static void __vm_mem_region_delete(struct kvm_vm *vm,
sparsebit_free(&region->unused_phy_pages);
ret = munmap(region->mmap_start, region->mmap_size);
TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
+ if (region->fd >= 0) {
+ /* There's an extra map when using shared memory. */
+ ret = munmap(region->mmap_alias, region->mmap_size);
+ TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
+ close(region->fd);
+ }
free(region);
}
@@ -923,6 +931,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
vm_mem_backing_src_alias(src_type)->name);
}
+ region->backing_src_type = src_type;
region->unused_phy_pages = sparsebit_alloc();
sparsebit_set_num(region->unused_phy_pages,
guest_paddr >> vm->page_shift, npages);
@@ -1217,32 +1226,15 @@ va_found:
return pgidx_start * vm->page_size;
}
-/*
- * VM Virtual Address Allocate
- *
- * Input Args:
- * vm - Virtual Machine
- * sz - Size in bytes
- * vaddr_min - Minimum starting virtual address
- *
- * Output Args: None
- *
- * Return:
- * Starting guest virtual address
- *
- * Allocates at least sz bytes within the virtual address space of the vm
- * given by vm. The allocated bytes are mapped to a virtual address >=
- * the address given by vaddr_min. Note that each allocation uses a
- * a unique set of pages, with the minimum real allocation being at least
- * a page.
- */
-vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min)
+vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
+ enum kvm_mem_region_type type)
{
uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0);
virt_pgd_alloc(vm);
vm_paddr_t paddr = vm_phy_pages_alloc(vm, pages,
- KVM_UTIL_MIN_PFN * vm->page_size, 0);
+ KVM_UTIL_MIN_PFN * vm->page_size,
+ vm->memslots[type]);
/*
* Find an unused range of virtual page addresses of at least
@@ -1264,6 +1256,30 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min)
}
/*
+ * VM Virtual Address Allocate
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * sz - Size in bytes
+ * vaddr_min - Minimum starting virtual address
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Starting guest virtual address
+ *
+ * Allocates at least sz bytes within the virtual address space of the vm
+ * given by vm. The allocated bytes are mapped to a virtual address >=
+ * the address given by vaddr_min. Note that each allocation uses a
+ * a unique set of pages, with the minimum real allocation being at least
+ * a page. The allocated physical space comes from the TEST_DATA memory region.
+ */
+vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min)
+{
+ return __vm_vaddr_alloc(vm, sz, vaddr_min, MEM_REGION_TEST_DATA);
+}
+
+/*
* VM Virtual Address Allocate Pages
*
* Input Args:
@@ -1282,6 +1298,11 @@ vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages)
return vm_vaddr_alloc(vm, nr_pages * getpagesize(), KVM_UTIL_MIN_VADDR);
}
+vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, enum kvm_mem_region_type type)
+{
+ return __vm_vaddr_alloc(vm, getpagesize(), KVM_UTIL_MIN_VADDR, type);
+}
+
/*
* VM Virtual Address Allocate Page
*
@@ -1506,7 +1527,7 @@ struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu)
void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu)
{
- uint32_t page_size = vcpu->vm->page_size;
+ uint32_t page_size = getpagesize();
uint32_t size = vcpu->vm->dirty_ring_size;
TEST_ASSERT(size > 0, "Should enable dirty ring first");
@@ -1847,7 +1868,8 @@ vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm)
{
- return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0);
+ return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR,
+ vm->memslots[MEM_REGION_PT]);
}
/*
diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
index 9618b37c66f7..ee3f499ccbd2 100644
--- a/tools/testing/selftests/kvm/lib/perf_test_util.c
+++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
@@ -267,6 +267,7 @@ void perf_test_start_vcpu_threads(int nr_vcpus,
vcpu_thread_fn = vcpu_fn;
WRITE_ONCE(all_vcpu_threads_running, false);
+ WRITE_ONCE(perf_test_args.stop_vcpus, false);
for (i = 0; i < nr_vcpus; i++) {
struct vcpu_thread *vcpu = &vcpu_threads[i];
@@ -289,6 +290,8 @@ void perf_test_join_vcpu_threads(int nr_vcpus)
{
int i;
+ WRITE_ONCE(perf_test_args.stop_vcpus, true);
+
for (i = 0; i < nr_vcpus; i++)
pthread_join(vcpu_threads[i].thread, NULL);
}
diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c
index 604478151212..d146ca71e0c0 100644
--- a/tools/testing/selftests/kvm/lib/riscv/processor.c
+++ b/tools/testing/selftests/kvm/lib/riscv/processor.c
@@ -55,13 +55,15 @@ static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level)
void virt_arch_pgd_alloc(struct kvm_vm *vm)
{
- if (!vm->pgd_created) {
- vm_paddr_t paddr = vm_phy_pages_alloc(vm,
- page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size,
- KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0);
- vm->pgd = paddr;
- vm->pgd_created = true;
- }
+ size_t nr_pages = page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size;
+
+ if (vm->pgd_created)
+ return;
+
+ vm->pgd = vm_phy_pages_alloc(vm, nr_pages,
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR,
+ vm->memslots[MEM_REGION_PT]);
+ vm->pgd_created = true;
}
void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
@@ -279,15 +281,18 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
void *guest_code)
{
int r;
- size_t stack_size = vm->page_size == 4096 ?
- DEFAULT_STACK_PGS * vm->page_size :
- vm->page_size;
- unsigned long stack_vaddr = vm_vaddr_alloc(vm, stack_size,
- DEFAULT_RISCV_GUEST_STACK_VADDR_MIN);
+ size_t stack_size;
+ unsigned long stack_vaddr;
unsigned long current_gp = 0;
struct kvm_mp_state mps;
struct kvm_vcpu *vcpu;
+ stack_size = vm->page_size == 4096 ? DEFAULT_STACK_PGS * vm->page_size :
+ vm->page_size;
+ stack_vaddr = __vm_vaddr_alloc(vm, stack_size,
+ DEFAULT_RISCV_GUEST_STACK_VADDR_MIN,
+ MEM_REGION_DATA);
+
vcpu = __vm_vcpu_add(vm, vcpu_id);
riscv_vcpu_mmu_setup(vcpu);
diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c
index 89d7340d9cbd..15945121daf1 100644
--- a/tools/testing/selftests/kvm/lib/s390x/processor.c
+++ b/tools/testing/selftests/kvm/lib/s390x/processor.c
@@ -21,7 +21,8 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm)
return;
paddr = vm_phy_pages_alloc(vm, PAGES_PER_REGION,
- KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0);
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR,
+ vm->memslots[MEM_REGION_PT]);
memset(addr_gpa2hva(vm, paddr), 0xff, PAGES_PER_REGION * vm->page_size);
vm->pgd = paddr;
@@ -167,8 +168,9 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x",
vm->page_size);
- stack_vaddr = vm_vaddr_alloc(vm, stack_size,
- DEFAULT_GUEST_STACK_VADDR_MIN);
+ stack_vaddr = __vm_vaddr_alloc(vm, stack_size,
+ DEFAULT_GUEST_STACK_VADDR_MIN,
+ MEM_REGION_DATA);
vcpu = __vm_vcpu_add(vm, vcpu_id);
diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c
new file mode 100644
index 000000000000..3b44846fc277
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM userfaultfd util
+ * Adapted from demand_paging_test.c
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ * Copyright (C) 2019-2022 Google LLC
+ */
+
+#define _GNU_SOURCE /* for pipe2 */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <poll.h>
+#include <pthread.h>
+#include <linux/userfaultfd.h>
+#include <sys/syscall.h>
+
+#include "kvm_util.h"
+#include "test_util.h"
+#include "perf_test_util.h"
+#include "userfaultfd_util.h"
+
+#ifdef __NR_userfaultfd
+
+static void *uffd_handler_thread_fn(void *arg)
+{
+ struct uffd_desc *uffd_desc = (struct uffd_desc *)arg;
+ int uffd = uffd_desc->uffd;
+ int pipefd = uffd_desc->pipefds[0];
+ useconds_t delay = uffd_desc->delay;
+ int64_t pages = 0;
+ struct timespec start;
+ struct timespec ts_diff;
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ while (1) {
+ struct uffd_msg msg;
+ struct pollfd pollfd[2];
+ char tmp_chr;
+ int r;
+
+ pollfd[0].fd = uffd;
+ pollfd[0].events = POLLIN;
+ pollfd[1].fd = pipefd;
+ pollfd[1].events = POLLIN;
+
+ r = poll(pollfd, 2, -1);
+ switch (r) {
+ case -1:
+ pr_info("poll err");
+ continue;
+ case 0:
+ continue;
+ case 1:
+ break;
+ default:
+ pr_info("Polling uffd returned %d", r);
+ return NULL;
+ }
+
+ if (pollfd[0].revents & POLLERR) {
+ pr_info("uffd revents has POLLERR");
+ return NULL;
+ }
+
+ if (pollfd[1].revents & POLLIN) {
+ r = read(pollfd[1].fd, &tmp_chr, 1);
+ TEST_ASSERT(r == 1,
+ "Error reading pipefd in UFFD thread\n");
+ return NULL;
+ }
+
+ if (!(pollfd[0].revents & POLLIN))
+ continue;
+
+ r = read(uffd, &msg, sizeof(msg));
+ if (r == -1) {
+ if (errno == EAGAIN)
+ continue;
+ pr_info("Read of uffd got errno %d\n", errno);
+ return NULL;
+ }
+
+ if (r != sizeof(msg)) {
+ pr_info("Read on uffd returned unexpected size: %d bytes", r);
+ return NULL;
+ }
+
+ if (!(msg.event & UFFD_EVENT_PAGEFAULT))
+ continue;
+
+ if (delay)
+ usleep(delay);
+ r = uffd_desc->handler(uffd_desc->uffd_mode, uffd, &msg);
+ if (r < 0)
+ return NULL;
+ pages++;
+ }
+
+ ts_diff = timespec_elapsed(start);
+ PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n",
+ pages, ts_diff.tv_sec, ts_diff.tv_nsec,
+ pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0));
+
+ return NULL;
+}
+
+struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay,
+ void *hva, uint64_t len,
+ uffd_handler_t handler)
+{
+ struct uffd_desc *uffd_desc;
+ bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR);
+ int uffd;
+ struct uffdio_api uffdio_api;
+ struct uffdio_register uffdio_register;
+ uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY;
+ int ret;
+
+ PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n",
+ is_minor ? "MINOR" : "MISSING",
+ is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY");
+
+ uffd_desc = malloc(sizeof(struct uffd_desc));
+ TEST_ASSERT(uffd_desc, "malloc failed");
+
+ /* In order to get minor faults, prefault via the alias. */
+ if (is_minor)
+ expected_ioctls = ((uint64_t) 1) << _UFFDIO_CONTINUE;
+
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno);
+
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = 0;
+ TEST_ASSERT(ioctl(uffd, UFFDIO_API, &uffdio_api) != -1,
+ "ioctl UFFDIO_API failed: %" PRIu64,
+ (uint64_t)uffdio_api.api);
+
+ uffdio_register.range.start = (uint64_t)hva;
+ uffdio_register.range.len = len;
+ uffdio_register.mode = uffd_mode;
+ TEST_ASSERT(ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) != -1,
+ "ioctl UFFDIO_REGISTER failed");
+ TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) ==
+ expected_ioctls, "missing userfaultfd ioctls");
+
+ ret = pipe2(uffd_desc->pipefds, O_CLOEXEC | O_NONBLOCK);
+ TEST_ASSERT(!ret, "Failed to set up pipefd");
+
+ uffd_desc->uffd_mode = uffd_mode;
+ uffd_desc->uffd = uffd;
+ uffd_desc->delay = delay;
+ uffd_desc->handler = handler;
+ pthread_create(&uffd_desc->thread, NULL, uffd_handler_thread_fn,
+ uffd_desc);
+
+ PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n",
+ hva, hva + len);
+
+ return uffd_desc;
+}
+
+void uffd_stop_demand_paging(struct uffd_desc *uffd)
+{
+ char c = 0;
+ int ret;
+
+ ret = write(uffd->pipefds[1], &c, 1);
+ TEST_ASSERT(ret == 1, "Unable to write to pipefd");
+
+ ret = pthread_join(uffd->thread, NULL);
+ TEST_ASSERT(ret == 0, "Pthread_join failed.");
+
+ close(uffd->uffd);
+
+ close(uffd->pipefds[1]);
+ close(uffd->pipefds[0]);
+
+ free(uffd);
+}
+
+#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 39c4409ef56a..b199dde90e9f 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -552,7 +552,7 @@ unmapped_gva:
static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt)
{
if (!vm->gdt)
- vm->gdt = vm_vaddr_alloc_page(vm);
+ vm->gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
dt->base = vm->gdt;
dt->limit = getpagesize();
@@ -562,7 +562,7 @@ static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp,
int selector)
{
if (!vm->tss)
- vm->tss = vm_vaddr_alloc_page(vm);
+ vm->tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
memset(segp, 0, sizeof(*segp));
segp->base = vm->tss;
@@ -647,8 +647,9 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
vm_vaddr_t stack_vaddr;
struct kvm_vcpu *vcpu;
- stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(),
- DEFAULT_GUEST_STACK_VADDR_MIN);
+ stack_vaddr = __vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(),
+ DEFAULT_GUEST_STACK_VADDR_MIN,
+ MEM_REGION_DATA);
vcpu = __vm_vcpu_add(vm, vcpu_id);
vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid());
@@ -1145,8 +1146,8 @@ void vm_init_descriptor_tables(struct kvm_vm *vm)
extern void *idt_handlers;
int i;
- vm->idt = vm_vaddr_alloc_page(vm);
- vm->handlers = vm_vaddr_alloc_page(vm);
+ vm->idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
+ vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
/* Handlers have the same address in both address spaces.*/
for (i = 0; i < NUM_INTERRUPTS; i++)
set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0,
diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
index bb1d17a1171b..3a5e4518307c 100644
--- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c
+++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
@@ -34,8 +34,6 @@
static int nr_vcpus = 1;
static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
-static bool run_vcpus = true;
-
static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args)
{
struct kvm_vcpu *vcpu = vcpu_args->vcpu;
@@ -45,7 +43,7 @@ static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args)
run = vcpu->run;
/* Let the guest access its memory until a stop signal is received */
- while (READ_ONCE(run_vcpus)) {
+ while (!READ_ONCE(perf_test_args.stop_vcpus)) {
ret = _vcpu_run(vcpu);
TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
@@ -110,8 +108,6 @@ static void run_test(enum vm_guest_mode mode, void *arg)
add_remove_memslot(vm, p->memslot_modification_delay,
p->nr_memslot_modifications);
- run_vcpus = false;
-
perf_test_join_vcpu_threads(nr_vcpus);
pr_info("All vCPU threads joined\n");
diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c
index 44995446d942..2ad40f7c9c08 100644
--- a/tools/testing/selftests/kvm/memslot_perf_test.c
+++ b/tools/testing/selftests/kvm/memslot_perf_test.c
@@ -20,20 +20,20 @@
#include <unistd.h>
#include <linux/compiler.h>
+#include <linux/sizes.h>
#include <test_util.h>
#include <kvm_util.h>
#include <processor.h>
-#define MEM_SIZE ((512U << 20) + 4096)
-#define MEM_SIZE_PAGES (MEM_SIZE / 4096)
-#define MEM_GPA 0x10000000UL
+#define MEM_EXTRA_SIZE SZ_64K
+
+#define MEM_SIZE (SZ_512M + MEM_EXTRA_SIZE)
+#define MEM_GPA SZ_256M
#define MEM_AUX_GPA MEM_GPA
#define MEM_SYNC_GPA MEM_AUX_GPA
-#define MEM_TEST_GPA (MEM_AUX_GPA + 4096)
-#define MEM_TEST_SIZE (MEM_SIZE - 4096)
-static_assert(MEM_SIZE % 4096 == 0, "invalid mem size");
-static_assert(MEM_TEST_SIZE % 4096 == 0, "invalid mem test size");
+#define MEM_TEST_GPA (MEM_AUX_GPA + MEM_EXTRA_SIZE)
+#define MEM_TEST_SIZE (MEM_SIZE - MEM_EXTRA_SIZE)
/*
* 32 MiB is max size that gets well over 100 iterations on 509 slots.
@@ -41,44 +41,38 @@ static_assert(MEM_TEST_SIZE % 4096 == 0, "invalid mem test size");
* 8194 slots in use can then be tested (although with slightly
* limited resolution).
*/
-#define MEM_SIZE_MAP ((32U << 20) + 4096)
-#define MEM_SIZE_MAP_PAGES (MEM_SIZE_MAP / 4096)
-#define MEM_TEST_MAP_SIZE (MEM_SIZE_MAP - 4096)
-#define MEM_TEST_MAP_SIZE_PAGES (MEM_TEST_MAP_SIZE / 4096)
-static_assert(MEM_SIZE_MAP % 4096 == 0, "invalid map test region size");
-static_assert(MEM_TEST_MAP_SIZE % 4096 == 0, "invalid map test region size");
-static_assert(MEM_TEST_MAP_SIZE_PAGES % 2 == 0, "invalid map test region size");
-static_assert(MEM_TEST_MAP_SIZE_PAGES > 2, "invalid map test region size");
+#define MEM_SIZE_MAP (SZ_32M + MEM_EXTRA_SIZE)
+#define MEM_TEST_MAP_SIZE (MEM_SIZE_MAP - MEM_EXTRA_SIZE)
/*
* 128 MiB is min size that fills 32k slots with at least one page in each
* while at the same time gets 100+ iterations in such test
+ *
+ * 2 MiB chunk size like a typical huge page
*/
-#define MEM_TEST_UNMAP_SIZE (128U << 20)
-#define MEM_TEST_UNMAP_SIZE_PAGES (MEM_TEST_UNMAP_SIZE / 4096)
-/* 2 MiB chunk size like a typical huge page */
-#define MEM_TEST_UNMAP_CHUNK_PAGES (2U << (20 - 12))
-static_assert(MEM_TEST_UNMAP_SIZE <= MEM_TEST_SIZE,
- "invalid unmap test region size");
-static_assert(MEM_TEST_UNMAP_SIZE % 4096 == 0,
- "invalid unmap test region size");
-static_assert(MEM_TEST_UNMAP_SIZE_PAGES %
- (2 * MEM_TEST_UNMAP_CHUNK_PAGES) == 0,
- "invalid unmap test region size");
+#define MEM_TEST_UNMAP_SIZE SZ_128M
+#define MEM_TEST_UNMAP_CHUNK_SIZE SZ_2M
/*
* For the move active test the middle of the test area is placed on
* a memslot boundary: half lies in the memslot being moved, half in
* other memslot(s).
*
- * When running this test with 32k memslots (32764, really) each memslot
- * contains 4 pages.
- * The last one additionally contains the remaining 21 pages of memory,
- * for the total size of 25 pages.
- * Hence, the maximum size here is 50 pages.
+ * We have different number of memory slots, excluding the reserved
+ * memory slot 0, on various architectures and configurations. The
+ * memory size in this test is calculated by picking the maximal
+ * last memory slot's memory size, with alignment to the largest
+ * supported page size (64KB). In this way, the selected memory
+ * size for this test is compatible with test_memslot_move_prepare().
+ *
+ * architecture slots memory-per-slot memory-on-last-slot
+ * --------------------------------------------------------------
+ * x86-4KB 32763 16KB 160KB
+ * arm64-4KB 32766 16KB 112KB
+ * arm64-16KB 32766 16KB 112KB
+ * arm64-64KB 8192 64KB 128KB
*/
-#define MEM_TEST_MOVE_SIZE_PAGES (50)
-#define MEM_TEST_MOVE_SIZE (MEM_TEST_MOVE_SIZE_PAGES * 4096)
+#define MEM_TEST_MOVE_SIZE (3 * SZ_64K)
#define MEM_TEST_MOVE_GPA_DEST (MEM_GPA + MEM_SIZE)
static_assert(MEM_TEST_MOVE_SIZE <= MEM_TEST_SIZE,
"invalid move test region size");
@@ -100,6 +94,7 @@ struct vm_data {
};
struct sync_area {
+ uint32_t guest_page_size;
atomic_bool start_flag;
atomic_bool exit_flag;
atomic_bool sync_flag;
@@ -192,14 +187,15 @@ static void *vm_gpa2hva(struct vm_data *data, uint64_t gpa, uint64_t *rempages)
uint64_t gpage, pgoffs;
uint32_t slot, slotoffs;
void *base;
+ uint32_t guest_page_size = data->vm->page_size;
TEST_ASSERT(gpa >= MEM_GPA, "Too low gpa to translate");
- TEST_ASSERT(gpa < MEM_GPA + data->npages * 4096,
+ TEST_ASSERT(gpa < MEM_GPA + data->npages * guest_page_size,
"Too high gpa to translate");
gpa -= MEM_GPA;
- gpage = gpa / 4096;
- pgoffs = gpa % 4096;
+ gpage = gpa / guest_page_size;
+ pgoffs = gpa % guest_page_size;
slot = min(gpage / data->pages_per_slot, (uint64_t)data->nslots - 1);
slotoffs = gpage - (slot * data->pages_per_slot);
@@ -217,14 +213,16 @@ static void *vm_gpa2hva(struct vm_data *data, uint64_t gpa, uint64_t *rempages)
}
base = data->hva_slots[slot];
- return (uint8_t *)base + slotoffs * 4096 + pgoffs;
+ return (uint8_t *)base + slotoffs * guest_page_size + pgoffs;
}
static uint64_t vm_slot2gpa(struct vm_data *data, uint32_t slot)
{
+ uint32_t guest_page_size = data->vm->page_size;
+
TEST_ASSERT(slot < data->nslots, "Too high slot number");
- return MEM_GPA + slot * data->pages_per_slot * 4096;
+ return MEM_GPA + slot * data->pages_per_slot * guest_page_size;
}
static struct vm_data *alloc_vm(void)
@@ -241,82 +239,110 @@ static struct vm_data *alloc_vm(void)
return data;
}
+static bool check_slot_pages(uint32_t host_page_size, uint32_t guest_page_size,
+ uint64_t pages_per_slot, uint64_t rempages)
+{
+ if (!pages_per_slot)
+ return false;
+
+ if ((pages_per_slot * guest_page_size) % host_page_size)
+ return false;
+
+ if ((rempages * guest_page_size) % host_page_size)
+ return false;
+
+ return true;
+}
+
+
+static uint64_t get_max_slots(struct vm_data *data, uint32_t host_page_size)
+{
+ uint32_t guest_page_size = data->vm->page_size;
+ uint64_t mempages, pages_per_slot, rempages;
+ uint64_t slots;
+
+ mempages = data->npages;
+ slots = data->nslots;
+ while (--slots > 1) {
+ pages_per_slot = mempages / slots;
+ rempages = mempages % pages_per_slot;
+ if (check_slot_pages(host_page_size, guest_page_size,
+ pages_per_slot, rempages))
+ return slots + 1; /* slot 0 is reserved */
+ }
+
+ return 0;
+}
+
static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots,
- void *guest_code, uint64_t mempages,
+ void *guest_code, uint64_t mem_size,
struct timespec *slot_runtime)
{
- uint32_t max_mem_slots;
- uint64_t rempages;
+ uint64_t mempages, rempages;
uint64_t guest_addr;
- uint32_t slot;
+ uint32_t slot, host_page_size, guest_page_size;
struct timespec tstart;
struct sync_area *sync;
- max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS);
- TEST_ASSERT(max_mem_slots > 1,
- "KVM_CAP_NR_MEMSLOTS should be greater than 1");
- TEST_ASSERT(nslots > 1 || nslots == -1,
- "Slot count cap should be greater than 1");
- if (nslots != -1)
- max_mem_slots = min(max_mem_slots, (uint32_t)nslots);
- pr_info_v("Allowed number of memory slots: %"PRIu32"\n", max_mem_slots);
+ host_page_size = getpagesize();
+ guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size;
+ mempages = mem_size / guest_page_size;
- TEST_ASSERT(mempages > 1,
- "Can't test without any memory");
+ data->vm = __vm_create_with_one_vcpu(&data->vcpu, mempages, guest_code);
+ ucall_init(data->vm, NULL);
+ TEST_ASSERT(data->vm->page_size == guest_page_size, "Invalid VM page size");
data->npages = mempages;
- data->nslots = max_mem_slots - 1;
- data->pages_per_slot = mempages / data->nslots;
- if (!data->pages_per_slot) {
- *maxslots = mempages + 1;
+ TEST_ASSERT(data->npages > 1, "Can't test without any memory");
+ data->nslots = nslots;
+ data->pages_per_slot = data->npages / data->nslots;
+ rempages = data->npages % data->nslots;
+ if (!check_slot_pages(host_page_size, guest_page_size,
+ data->pages_per_slot, rempages)) {
+ *maxslots = get_max_slots(data, host_page_size);
return false;
}
- rempages = mempages % data->nslots;
data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots);
TEST_ASSERT(data->hva_slots, "malloc() fail");
- data->vm = __vm_create_with_one_vcpu(&data->vcpu, mempages, guest_code);
- ucall_init(data->vm, NULL);
-
pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n",
- max_mem_slots - 1, data->pages_per_slot, rempages);
+ data->nslots, data->pages_per_slot, rempages);
clock_gettime(CLOCK_MONOTONIC, &tstart);
- for (slot = 1, guest_addr = MEM_GPA; slot < max_mem_slots; slot++) {
+ for (slot = 1, guest_addr = MEM_GPA; slot <= data->nslots; slot++) {
uint64_t npages;
npages = data->pages_per_slot;
- if (slot == max_mem_slots - 1)
+ if (slot == data->nslots)
npages += rempages;
vm_userspace_mem_region_add(data->vm, VM_MEM_SRC_ANONYMOUS,
guest_addr, slot, npages,
0);
- guest_addr += npages * 4096;
+ guest_addr += npages * guest_page_size;
}
*slot_runtime = timespec_elapsed(tstart);
- for (slot = 0, guest_addr = MEM_GPA; slot < max_mem_slots - 1; slot++) {
+ for (slot = 1, guest_addr = MEM_GPA; slot <= data->nslots; slot++) {
uint64_t npages;
uint64_t gpa;
npages = data->pages_per_slot;
- if (slot == max_mem_slots - 2)
+ if (slot == data->nslots)
npages += rempages;
- gpa = vm_phy_pages_alloc(data->vm, npages, guest_addr,
- slot + 1);
+ gpa = vm_phy_pages_alloc(data->vm, npages, guest_addr, slot);
TEST_ASSERT(gpa == guest_addr,
"vm_phy_pages_alloc() failed\n");
- data->hva_slots[slot] = addr_gpa2hva(data->vm, guest_addr);
- memset(data->hva_slots[slot], 0, npages * 4096);
+ data->hva_slots[slot - 1] = addr_gpa2hva(data->vm, guest_addr);
+ memset(data->hva_slots[slot - 1], 0, npages * guest_page_size);
- guest_addr += npages * 4096;
+ guest_addr += npages * guest_page_size;
}
- virt_map(data->vm, MEM_GPA, MEM_GPA, mempages);
+ virt_map(data->vm, MEM_GPA, MEM_GPA, data->npages);
sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL);
atomic_init(&sync->start_flag, false);
@@ -415,6 +441,7 @@ static bool guest_perform_sync(void)
static void guest_code_test_memslot_move(void)
{
struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA;
+ uint32_t page_size = (typeof(page_size))READ_ONCE(sync->guest_page_size);
uintptr_t base = (typeof(base))READ_ONCE(sync->move_area_ptr);
GUEST_SYNC(0);
@@ -425,7 +452,7 @@ static void guest_code_test_memslot_move(void)
uintptr_t ptr;
for (ptr = base; ptr < base + MEM_TEST_MOVE_SIZE;
- ptr += 4096)
+ ptr += page_size)
*(uint64_t *)ptr = MEM_TEST_VAL_1;
/*
@@ -443,6 +470,7 @@ static void guest_code_test_memslot_move(void)
static void guest_code_test_memslot_map(void)
{
struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA;
+ uint32_t page_size = (typeof(page_size))READ_ONCE(sync->guest_page_size);
GUEST_SYNC(0);
@@ -452,14 +480,16 @@ static void guest_code_test_memslot_map(void)
uintptr_t ptr;
for (ptr = MEM_TEST_GPA;
- ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2; ptr += 4096)
+ ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2;
+ ptr += page_size)
*(uint64_t *)ptr = MEM_TEST_VAL_1;
if (!guest_perform_sync())
break;
for (ptr = MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2;
- ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE; ptr += 4096)
+ ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE;
+ ptr += page_size)
*(uint64_t *)ptr = MEM_TEST_VAL_2;
if (!guest_perform_sync())
@@ -506,6 +536,9 @@ static void guest_code_test_memslot_unmap(void)
static void guest_code_test_memslot_rw(void)
{
+ struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA;
+ uint32_t page_size = (typeof(page_size))READ_ONCE(sync->guest_page_size);
+
GUEST_SYNC(0);
guest_spin_until_start();
@@ -514,14 +547,14 @@ static void guest_code_test_memslot_rw(void)
uintptr_t ptr;
for (ptr = MEM_TEST_GPA;
- ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += 4096)
+ ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += page_size)
*(uint64_t *)ptr = MEM_TEST_VAL_1;
if (!guest_perform_sync())
break;
- for (ptr = MEM_TEST_GPA + 4096 / 2;
- ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += 4096) {
+ for (ptr = MEM_TEST_GPA + page_size / 2;
+ ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += page_size) {
uint64_t val = *(uint64_t *)ptr;
GUEST_ASSERT_1(val == MEM_TEST_VAL_2, val);
@@ -539,6 +572,7 @@ static bool test_memslot_move_prepare(struct vm_data *data,
struct sync_area *sync,
uint64_t *maxslots, bool isactive)
{
+ uint32_t guest_page_size = data->vm->page_size;
uint64_t movesrcgpa, movetestgpa;
movesrcgpa = vm_slot2gpa(data, data->nslots - 1);
@@ -547,7 +581,7 @@ static bool test_memslot_move_prepare(struct vm_data *data,
uint64_t lastpages;
vm_gpa2hva(data, movesrcgpa, &lastpages);
- if (lastpages < MEM_TEST_MOVE_SIZE_PAGES / 2) {
+ if (lastpages * guest_page_size < MEM_TEST_MOVE_SIZE / 2) {
*maxslots = 0;
return false;
}
@@ -593,8 +627,9 @@ static void test_memslot_do_unmap(struct vm_data *data,
uint64_t offsp, uint64_t count)
{
uint64_t gpa, ctr;
+ uint32_t guest_page_size = data->vm->page_size;
- for (gpa = MEM_TEST_GPA + offsp * 4096, ctr = 0; ctr < count; ) {
+ for (gpa = MEM_TEST_GPA + offsp * guest_page_size, ctr = 0; ctr < count; ) {
uint64_t npages;
void *hva;
int ret;
@@ -602,12 +637,12 @@ static void test_memslot_do_unmap(struct vm_data *data,
hva = vm_gpa2hva(data, gpa, &npages);
TEST_ASSERT(npages, "Empty memory slot at gptr 0x%"PRIx64, gpa);
npages = min(npages, count - ctr);
- ret = madvise(hva, npages * 4096, MADV_DONTNEED);
+ ret = madvise(hva, npages * guest_page_size, MADV_DONTNEED);
TEST_ASSERT(!ret,
"madvise(%p, MADV_DONTNEED) on VM memory should not fail for gptr 0x%"PRIx64,
hva, gpa);
ctr += npages;
- gpa += npages * 4096;
+ gpa += npages * guest_page_size;
}
TEST_ASSERT(ctr == count,
"madvise(MADV_DONTNEED) should exactly cover all of the requested area");
@@ -618,11 +653,12 @@ static void test_memslot_map_unmap_check(struct vm_data *data,
{
uint64_t gpa;
uint64_t *val;
+ uint32_t guest_page_size = data->vm->page_size;
if (!map_unmap_verify)
return;
- gpa = MEM_TEST_GPA + offsp * 4096;
+ gpa = MEM_TEST_GPA + offsp * guest_page_size;
val = (typeof(val))vm_gpa2hva(data, gpa, NULL);
TEST_ASSERT(*val == valexp,
"Guest written values should read back correctly before unmap (%"PRIu64" vs %"PRIu64" @ %"PRIx64")",
@@ -632,12 +668,14 @@ static void test_memslot_map_unmap_check(struct vm_data *data,
static void test_memslot_map_loop(struct vm_data *data, struct sync_area *sync)
{
+ uint32_t guest_page_size = data->vm->page_size;
+ uint64_t guest_pages = MEM_TEST_MAP_SIZE / guest_page_size;
+
/*
* Unmap the second half of the test area while guest writes to (maps)
* the first half.
*/
- test_memslot_do_unmap(data, MEM_TEST_MAP_SIZE_PAGES / 2,
- MEM_TEST_MAP_SIZE_PAGES / 2);
+ test_memslot_do_unmap(data, guest_pages / 2, guest_pages / 2);
/*
* Wait for the guest to finish writing the first half of the test
@@ -648,10 +686,8 @@ static void test_memslot_map_loop(struct vm_data *data, struct sync_area *sync)
*/
host_perform_sync(sync);
test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1);
- test_memslot_map_unmap_check(data,
- MEM_TEST_MAP_SIZE_PAGES / 2 - 1,
- MEM_TEST_VAL_1);
- test_memslot_do_unmap(data, 0, MEM_TEST_MAP_SIZE_PAGES / 2);
+ test_memslot_map_unmap_check(data, guest_pages / 2 - 1, MEM_TEST_VAL_1);
+ test_memslot_do_unmap(data, 0, guest_pages / 2);
/*
@@ -664,16 +700,16 @@ static void test_memslot_map_loop(struct vm_data *data, struct sync_area *sync)
* the test area.
*/
host_perform_sync(sync);
- test_memslot_map_unmap_check(data, MEM_TEST_MAP_SIZE_PAGES / 2,
- MEM_TEST_VAL_2);
- test_memslot_map_unmap_check(data, MEM_TEST_MAP_SIZE_PAGES - 1,
- MEM_TEST_VAL_2);
+ test_memslot_map_unmap_check(data, guest_pages / 2, MEM_TEST_VAL_2);
+ test_memslot_map_unmap_check(data, guest_pages - 1, MEM_TEST_VAL_2);
}
static void test_memslot_unmap_loop_common(struct vm_data *data,
struct sync_area *sync,
uint64_t chunk)
{
+ uint32_t guest_page_size = data->vm->page_size;
+ uint64_t guest_pages = MEM_TEST_UNMAP_SIZE / guest_page_size;
uint64_t ctr;
/*
@@ -685,42 +721,49 @@ static void test_memslot_unmap_loop_common(struct vm_data *data,
*/
host_perform_sync(sync);
test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1);
- for (ctr = 0; ctr < MEM_TEST_UNMAP_SIZE_PAGES / 2; ctr += chunk)
+ for (ctr = 0; ctr < guest_pages / 2; ctr += chunk)
test_memslot_do_unmap(data, ctr, chunk);
/* Likewise, but for the opposite host / guest areas */
host_perform_sync(sync);
- test_memslot_map_unmap_check(data, MEM_TEST_UNMAP_SIZE_PAGES / 2,
- MEM_TEST_VAL_2);
- for (ctr = MEM_TEST_UNMAP_SIZE_PAGES / 2;
- ctr < MEM_TEST_UNMAP_SIZE_PAGES; ctr += chunk)
+ test_memslot_map_unmap_check(data, guest_pages / 2, MEM_TEST_VAL_2);
+ for (ctr = guest_pages / 2; ctr < guest_pages; ctr += chunk)
test_memslot_do_unmap(data, ctr, chunk);
}
static void test_memslot_unmap_loop(struct vm_data *data,
struct sync_area *sync)
{
- test_memslot_unmap_loop_common(data, sync, 1);
+ uint32_t host_page_size = getpagesize();
+ uint32_t guest_page_size = data->vm->page_size;
+ uint64_t guest_chunk_pages = guest_page_size >= host_page_size ?
+ 1 : host_page_size / guest_page_size;
+
+ test_memslot_unmap_loop_common(data, sync, guest_chunk_pages);
}
static void test_memslot_unmap_loop_chunked(struct vm_data *data,
struct sync_area *sync)
{
- test_memslot_unmap_loop_common(data, sync, MEM_TEST_UNMAP_CHUNK_PAGES);
+ uint32_t guest_page_size = data->vm->page_size;
+ uint64_t guest_chunk_pages = MEM_TEST_UNMAP_CHUNK_SIZE / guest_page_size;
+
+ test_memslot_unmap_loop_common(data, sync, guest_chunk_pages);
}
static void test_memslot_rw_loop(struct vm_data *data, struct sync_area *sync)
{
uint64_t gptr;
+ uint32_t guest_page_size = data->vm->page_size;
- for (gptr = MEM_TEST_GPA + 4096 / 2;
- gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += 4096)
+ for (gptr = MEM_TEST_GPA + guest_page_size / 2;
+ gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += guest_page_size)
*(uint64_t *)vm_gpa2hva(data, gptr, NULL) = MEM_TEST_VAL_2;
host_perform_sync(sync);
for (gptr = MEM_TEST_GPA;
- gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += 4096) {
+ gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += guest_page_size) {
uint64_t *vptr = (typeof(vptr))vm_gpa2hva(data, gptr, NULL);
uint64_t val = *vptr;
@@ -749,7 +792,7 @@ static bool test_execute(int nslots, uint64_t *maxslots,
struct timespec *slot_runtime,
struct timespec *guest_runtime)
{
- uint64_t mem_size = tdata->mem_size ? : MEM_SIZE_PAGES;
+ uint64_t mem_size = tdata->mem_size ? : MEM_SIZE;
struct vm_data *data;
struct sync_area *sync;
struct timespec tstart;
@@ -764,6 +807,7 @@ static bool test_execute(int nslots, uint64_t *maxslots,
sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL);
+ sync->guest_page_size = data->vm->page_size;
if (tdata->prepare &&
!tdata->prepare(data, sync, maxslots)) {
ret = false;
@@ -797,19 +841,19 @@ exit_free:
static const struct test_data tests[] = {
{
.name = "map",
- .mem_size = MEM_SIZE_MAP_PAGES,
+ .mem_size = MEM_SIZE_MAP,
.guest_code = guest_code_test_memslot_map,
.loop = test_memslot_map_loop,
},
{
.name = "unmap",
- .mem_size = MEM_TEST_UNMAP_SIZE_PAGES + 1,
+ .mem_size = MEM_TEST_UNMAP_SIZE + MEM_EXTRA_SIZE,
.guest_code = guest_code_test_memslot_unmap,
.loop = test_memslot_unmap_loop,
},
{
.name = "unmap chunked",
- .mem_size = MEM_TEST_UNMAP_SIZE_PAGES + 1,
+ .mem_size = MEM_TEST_UNMAP_SIZE + MEM_EXTRA_SIZE,
.guest_code = guest_code_test_memslot_unmap,
.loop = test_memslot_unmap_loop_chunked,
},
@@ -867,9 +911,46 @@ static void help(char *name, struct test_args *targs)
pr_info("%d: %s\n", ctr, tests[ctr].name);
}
+static bool check_memory_sizes(void)
+{
+ uint32_t host_page_size = getpagesize();
+ uint32_t guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size;
+
+ if (host_page_size > SZ_64K || guest_page_size > SZ_64K) {
+ pr_info("Unsupported page size on host (0x%x) or guest (0x%x)\n",
+ host_page_size, guest_page_size);
+ return false;
+ }
+
+ if (MEM_SIZE % guest_page_size ||
+ MEM_TEST_SIZE % guest_page_size) {
+ pr_info("invalid MEM_SIZE or MEM_TEST_SIZE\n");
+ return false;
+ }
+
+ if (MEM_SIZE_MAP % guest_page_size ||
+ MEM_TEST_MAP_SIZE % guest_page_size ||
+ (MEM_TEST_MAP_SIZE / guest_page_size) <= 2 ||
+ (MEM_TEST_MAP_SIZE / guest_page_size) % 2) {
+ pr_info("invalid MEM_SIZE_MAP or MEM_TEST_MAP_SIZE\n");
+ return false;
+ }
+
+ if (MEM_TEST_UNMAP_SIZE > MEM_TEST_SIZE ||
+ MEM_TEST_UNMAP_SIZE % guest_page_size ||
+ (MEM_TEST_UNMAP_SIZE / guest_page_size) %
+ (2 * MEM_TEST_UNMAP_CHUNK_SIZE / guest_page_size)) {
+ pr_info("invalid MEM_TEST_UNMAP_SIZE or MEM_TEST_UNMAP_CHUNK_SIZE\n");
+ return false;
+ }
+
+ return true;
+}
+
static bool parse_args(int argc, char *argv[],
struct test_args *targs)
{
+ uint32_t max_mem_slots;
int opt;
while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) {
@@ -886,8 +967,8 @@ static bool parse_args(int argc, char *argv[],
break;
case 's':
targs->nslots = atoi(optarg);
- if (targs->nslots <= 0 && targs->nslots != -1) {
- pr_info("Slot count cap has to be positive or -1 for no cap\n");
+ if (targs->nslots <= 1 && targs->nslots != -1) {
+ pr_info("Slot count cap must be larger than 1 or -1 for no cap\n");
return false;
}
break;
@@ -933,6 +1014,21 @@ static bool parse_args(int argc, char *argv[],
return false;
}
+ max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS);
+ if (max_mem_slots <= 1) {
+ pr_info("KVM_CAP_NR_MEMSLOTS should be greater than 1\n");
+ return false;
+ }
+
+ /* Memory slot 0 is reserved */
+ if (targs->nslots == -1)
+ targs->nslots = max_mem_slots - 1;
+ else
+ targs->nslots = min_t(int, targs->nslots, max_mem_slots) - 1;
+
+ pr_info_v("Allowed Number of memory slots: %"PRIu32"\n",
+ targs->nslots + 1);
+
return true;
}
@@ -1010,6 +1106,9 @@ int main(int argc, char *argv[])
/* Tell stdout not to buffer its content */
setbuf(stdout, NULL);
+ if (!check_memory_sizes())
+ return -1;
+
if (!parse_args(argc, argv, &targs))
return -1;
diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
index 8a5cb800f50e..2a5727188c8d 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -15,9 +15,13 @@
#include <time.h>
#include <sched.h>
#include <signal.h>
+#include <pthread.h>
#include <sys/eventfd.h>
+/* Defined in include/linux/kvm_types.h */
+#define GPA_INVALID (~(ulong)0)
+
#define SHINFO_REGION_GVA 0xc0000000ULL
#define SHINFO_REGION_GPA 0xc0000000ULL
#define SHINFO_REGION_SLOT 10
@@ -44,6 +48,8 @@
#define MIN_STEAL_TIME 50000
+#define SHINFO_RACE_TIMEOUT 2 /* seconds */
+
#define __HYPERVISOR_set_timer_op 15
#define __HYPERVISOR_sched_op 29
#define __HYPERVISOR_event_channel_op 32
@@ -126,7 +132,7 @@ struct {
struct kvm_irq_routing_entry entries[2];
} irq_routes;
-bool guest_saw_irq;
+static volatile bool guest_saw_irq;
static void evtchn_handler(struct ex_regs *regs)
{
@@ -148,6 +154,7 @@ static void guest_wait_for_irq(void)
static void guest_code(void)
{
struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR;
+ int i;
__asm__ __volatile__(
"sti\n"
@@ -325,6 +332,49 @@ static void guest_code(void)
guest_wait_for_irq();
GUEST_SYNC(21);
+ /* Racing host ioctls */
+
+ guest_wait_for_irq();
+
+ GUEST_SYNC(22);
+ /* Racing vmcall against host ioctl */
+
+ ports[0] = 0;
+
+ p = (struct sched_poll) {
+ .ports = ports,
+ .nr_ports = 1,
+ .timeout = 0
+ };
+
+wait_for_timer:
+ /*
+ * Poll for a timer wake event while the worker thread is mucking with
+ * the shared info. KVM XEN drops timer IRQs if the shared info is
+ * invalid when the timer expires. Arbitrarily poll 100 times before
+ * giving up and asking the VMM to re-arm the timer. 100 polls should
+ * consume enough time to beat on KVM without taking too long if the
+ * timer IRQ is dropped due to an invalid event channel.
+ */
+ for (i = 0; i < 100 && !guest_saw_irq; i++)
+ asm volatile("vmcall"
+ : "=a" (rax)
+ : "a" (__HYPERVISOR_sched_op),
+ "D" (SCHEDOP_poll),
+ "S" (&p)
+ : "memory");
+
+ /*
+ * Re-send the timer IRQ if it was (likely) dropped due to the timer
+ * expiring while the event channel was invalid.
+ */
+ if (!guest_saw_irq) {
+ GUEST_SYNC(23);
+ goto wait_for_timer;
+ }
+ guest_saw_irq = false;
+
+ GUEST_SYNC(24);
}
static int cmp_timespec(struct timespec *a, struct timespec *b)
@@ -352,11 +402,36 @@ static void handle_alrm(int sig)
TEST_FAIL("IRQ delivery timed out");
}
+static void *juggle_shinfo_state(void *arg)
+{
+ struct kvm_vm *vm = (struct kvm_vm *)arg;
+
+ struct kvm_xen_hvm_attr cache_init = {
+ .type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+ .u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE
+ };
+
+ struct kvm_xen_hvm_attr cache_destroy = {
+ .type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+ .u.shared_info.gfn = GPA_INVALID
+ };
+
+ for (;;) {
+ __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_init);
+ __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_destroy);
+ pthread_testcancel();
+ };
+
+ return NULL;
+}
+
int main(int argc, char *argv[])
{
struct timespec min_ts, max_ts, vm_ts;
struct kvm_vm *vm;
+ pthread_t thread;
bool verbose;
+ int ret;
verbose = argc > 1 && (!strncmp(argv[1], "-v", 3) ||
!strncmp(argv[1], "--verbose", 10));
@@ -785,6 +860,71 @@ int main(int argc, char *argv[])
case 21:
TEST_ASSERT(!evtchn_irq_expected,
"Expected event channel IRQ but it didn't happen");
+ alarm(0);
+
+ if (verbose)
+ printf("Testing shinfo lock corruption (KVM_XEN_HVM_EVTCHN_SEND)\n");
+
+ ret = pthread_create(&thread, NULL, &juggle_shinfo_state, (void *)vm);
+ TEST_ASSERT(ret == 0, "pthread_create() failed: %s", strerror(ret));
+
+ struct kvm_irq_routing_xen_evtchn uxe = {
+ .port = 1,
+ .vcpu = vcpu->id,
+ .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL
+ };
+
+ evtchn_irq_expected = true;
+ for (time_t t = time(NULL) + SHINFO_RACE_TIMEOUT; time(NULL) < t;)
+ __vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &uxe);
+ break;
+
+ case 22:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+
+ if (verbose)
+ printf("Testing shinfo lock corruption (SCHEDOP_poll)\n");
+
+ shinfo->evtchn_pending[0] = 1;
+
+ evtchn_irq_expected = true;
+ tmr.u.timer.expires_ns = rs->state_entry_time +
+ SHINFO_RACE_TIMEOUT * 1000000000ULL;
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
+ break;
+
+ case 23:
+ /*
+ * Optional and possibly repeated sync point.
+ * Injecting the timer IRQ may fail if the
+ * shinfo is invalid when the timer expires.
+ * If the timer has expired but the IRQ hasn't
+ * been delivered, rearm the timer and retry.
+ */
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
+
+ /* Resume the guest if the timer is still pending. */
+ if (tmr.u.timer.expires_ns)
+ break;
+
+ /* All done if the IRQ was delivered. */
+ if (!evtchn_irq_expected)
+ break;
+
+ tmr.u.timer.expires_ns = rs->state_entry_time +
+ SHINFO_RACE_TIMEOUT * 1000000000ULL;
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
+ break;
+ case 24:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+
+ ret = pthread_cancel(thread);
+ TEST_ASSERT(ret == 0, "pthread_cancel() failed: %s", strerror(ret));
+
+ ret = pthread_join(thread, 0);
+ TEST_ASSERT(ret == 0, "pthread_join() failed: %s", strerror(ret));
goto done;
case 0x20:
diff --git a/tools/testing/selftests/landlock/Makefile b/tools/testing/selftests/landlock/Makefile
index 6632bfff486b..348e2dbdb4e0 100644
--- a/tools/testing/selftests/landlock/Makefile
+++ b/tools/testing/selftests/landlock/Makefile
@@ -3,7 +3,6 @@
# First run: make -C ../../../.. headers_install
CFLAGS += -Wall -O2 $(KHDR_INCLUDES)
-LDLIBS += -lcap
LOCAL_HDRS += common.h
@@ -13,10 +12,12 @@ TEST_GEN_PROGS := $(src_test:.c=)
TEST_GEN_PROGS_EXTENDED := true
-# Static linking for short targets:
+# Short targets:
+$(TEST_GEN_PROGS): LDLIBS += -lcap
$(TEST_GEN_PROGS_EXTENDED): LDFLAGS += -static
include ../lib.mk
-# Static linking for targets with $(OUTPUT)/ prefix:
+# Targets with $(OUTPUT)/ prefix:
+$(TEST_GEN_PROGS): LDLIBS += -lcap
$(TEST_GEN_PROGS_EXTENDED): LDFLAGS += -static
diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile
index f4a2f28f926b..778b6cdc8aed 100644
--- a/tools/testing/selftests/pidfd/Makefile
+++ b/tools/testing/selftests/pidfd/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
-CFLAGS += -g -I../../../../usr/include/ -pthread
+CFLAGS += -g -I../../../../usr/include/ -pthread -Wall
TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \
pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test
diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c
index 9a2d64901d59..e2dd4ed84984 100644
--- a/tools/testing/selftests/pidfd/pidfd_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_test.c
@@ -413,7 +413,7 @@ static void poll_pidfd(const char *test_name, int pidfd)
c = epoll_wait(epoll_fd, events, MAX_EVENTS, 5000);
if (c != 1 || !(events[0].events & EPOLLIN))
- ksft_exit_fail_msg("%s test: Unexpected epoll_wait result (c=%d, events=%x) ",
+ ksft_exit_fail_msg("%s test: Unexpected epoll_wait result (c=%d, events=%x) "
"(errno %d)\n",
test_name, c, events[0].events, errno);
@@ -435,6 +435,8 @@ static int child_poll_exec_test(void *args)
*/
while (1)
sleep(1);
+
+ return 0;
}
static void test_pidfd_poll_exec(int use_waitpid)
diff --git a/tools/testing/selftests/pidfd/pidfd_wait.c b/tools/testing/selftests/pidfd/pidfd_wait.c
index 070c1c876df1..0dcb8365ddc3 100644
--- a/tools/testing/selftests/pidfd/pidfd_wait.c
+++ b/tools/testing/selftests/pidfd/pidfd_wait.c
@@ -95,20 +95,28 @@ TEST(wait_states)
.flags = CLONE_PIDFD | CLONE_PARENT_SETTID,
.exit_signal = SIGCHLD,
};
+ int pfd[2];
pid_t pid;
siginfo_t info = {
.si_signo = 0,
};
+ ASSERT_EQ(pipe(pfd), 0);
pid = sys_clone3(&args);
ASSERT_GE(pid, 0);
if (pid == 0) {
+ char buf[2];
+
+ close(pfd[1]);
kill(getpid(), SIGSTOP);
+ ASSERT_EQ(read(pfd[0], buf, 1), 1);
+ close(pfd[0]);
kill(getpid(), SIGSTOP);
exit(EXIT_SUCCESS);
}
+ close(pfd[0]);
ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_STOPPED);
@@ -117,6 +125,8 @@ TEST(wait_states)
ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0);
ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED, NULL), 0);
+ ASSERT_EQ(write(pfd[1], "C", 1), 1);
+ close(pfd[1]);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_CONTINUED);
ASSERT_EQ(info.si_pid, parent_tid);
@@ -138,7 +148,7 @@ TEST(wait_states)
TEST(wait_nonblock)
{
- int pidfd, status = 0;
+ int pidfd;
unsigned int flags = 0;
pid_t parent_tid = -1;
struct clone_args args = {
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 800f9470e36b..9fb1ff6f19e5 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -33,6 +33,12 @@ config HAVE_KVM_DIRTY_RING_ACQ_REL
bool
select HAVE_KVM_DIRTY_RING
+# Allow enabling both the dirty bitmap and dirty ring. Only architectures
+# that need to dirty memory outside of a vCPU context should select this.
+config NEED_KVM_DIRTY_RING_WITH_BITMAP
+ bool
+ depends on HAVE_KVM_DIRTY_RING
+
config HAVE_KVM_EVENTFD
bool
select EVENTFD
diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
index d6fabf238032..c1cd7dfe4a90 100644
--- a/virt/kvm/dirty_ring.c
+++ b/virt/kvm/dirty_ring.c
@@ -21,12 +21,26 @@ u32 kvm_dirty_ring_get_rsvd_entries(void)
return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size();
}
+bool kvm_use_dirty_bitmap(struct kvm *kvm)
+{
+ lockdep_assert_held(&kvm->slots_lock);
+
+ return !kvm->dirty_ring_size || kvm->dirty_ring_with_bitmap;
+}
+
+#ifndef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
+bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm)
+{
+ return false;
+}
+#endif
+
static u32 kvm_dirty_ring_used(struct kvm_dirty_ring *ring)
{
return READ_ONCE(ring->dirty_index) - READ_ONCE(ring->reset_index);
}
-bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring)
+static bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring)
{
return kvm_dirty_ring_used(ring) >= ring->soft_limit;
}
@@ -142,13 +156,19 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+ /*
+ * The request KVM_REQ_DIRTY_RING_SOFT_FULL will be cleared
+ * by the VCPU thread next time when it enters the guest.
+ */
+
trace_kvm_dirty_ring_reset(ring);
return count;
}
-void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset)
+void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset)
{
+ struct kvm_dirty_ring *ring = &vcpu->dirty_ring;
struct kvm_dirty_gfn *entry;
/* It should never get full */
@@ -166,6 +186,28 @@ void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset)
kvm_dirty_gfn_set_dirtied(entry);
ring->dirty_index++;
trace_kvm_dirty_ring_push(ring, slot, offset);
+
+ if (kvm_dirty_ring_soft_full(ring))
+ kvm_make_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu);
+}
+
+bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu)
+{
+ /*
+ * The VCPU isn't runnable when the dirty ring becomes soft full.
+ * The KVM_REQ_DIRTY_RING_SOFT_FULL event is always set to prevent
+ * the VCPU from running until the dirty pages are harvested and
+ * the dirty ring is reset by userspace.
+ */
+ if (kvm_check_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu) &&
+ kvm_dirty_ring_soft_full(&vcpu->dirty_ring)) {
+ kvm_make_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu);
+ vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
+ trace_kvm_dirty_ring_exit(vcpu);
+ return true;
+ }
+
+ return false;
}
struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1376a47fedee..03e6a38094c1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1617,7 +1617,7 @@ static int kvm_prepare_memory_region(struct kvm *kvm,
new->dirty_bitmap = NULL;
else if (old && old->dirty_bitmap)
new->dirty_bitmap = old->dirty_bitmap;
- else if (!kvm->dirty_ring_size) {
+ else if (kvm_use_dirty_bitmap(kvm)) {
r = kvm_alloc_dirty_bitmap(new);
if (r)
return r;
@@ -2060,8 +2060,8 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
unsigned long n;
unsigned long any = 0;
- /* Dirty ring tracking is exclusive to dirty log tracking */
- if (kvm->dirty_ring_size)
+ /* Dirty ring tracking may be exclusive to dirty log tracking */
+ if (!kvm_use_dirty_bitmap(kvm))
return -ENXIO;
*memslot = NULL;
@@ -2125,8 +2125,8 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
unsigned long *dirty_bitmap_buffer;
bool flush;
- /* Dirty ring tracking is exclusive to dirty log tracking */
- if (kvm->dirty_ring_size)
+ /* Dirty ring tracking may be exclusive to dirty log tracking */
+ if (!kvm_use_dirty_bitmap(kvm))
return -ENXIO;
as_id = log->slot >> 16;
@@ -2237,8 +2237,8 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm,
unsigned long *dirty_bitmap_buffer;
bool flush;
- /* Dirty ring tracking is exclusive to dirty log tracking */
- if (kvm->dirty_ring_size)
+ /* Dirty ring tracking may be exclusive to dirty log tracking */
+ if (!kvm_use_dirty_bitmap(kvm))
return -ENXIO;
as_id = log->slot >> 16;
@@ -3305,18 +3305,19 @@ void mark_page_dirty_in_slot(struct kvm *kvm,
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
#ifdef CONFIG_HAVE_KVM_DIRTY_RING
- if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
+ if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
return;
+
+ WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
#endif
if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
unsigned long rel_gfn = gfn - memslot->base_gfn;
u32 slot = (memslot->as_id << 16) | memslot->id;
- if (kvm->dirty_ring_size)
- kvm_dirty_ring_push(&vcpu->dirty_ring,
- slot, rel_gfn);
- else
+ if (kvm->dirty_ring_size && vcpu)
+ kvm_dirty_ring_push(vcpu, slot, rel_gfn);
+ else if (memslot->dirty_bitmap)
set_bit_le(rel_gfn, memslot->dirty_bitmap);
}
}
@@ -4484,6 +4485,9 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
#else
return 0;
#endif
+#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
+ case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
+#endif
case KVM_CAP_BINARY_STATS_FD:
case KVM_CAP_SYSTEM_EVENT_DATA:
return 1;
@@ -4559,6 +4563,20 @@ int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
return -EINVAL;
}
+static bool kvm_are_all_memslots_empty(struct kvm *kvm)
+{
+ int i;
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
+ return false;
+ }
+
+ return true;
+}
+
static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
@@ -4585,7 +4603,33 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
}
case KVM_CAP_DIRTY_LOG_RING:
case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
+ if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
+ return -EINVAL;
+
return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
+ case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
+ int r = -EINVAL;
+
+ if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
+ !kvm->dirty_ring_size || cap->flags)
+ return r;
+
+ mutex_lock(&kvm->slots_lock);
+
+ /*
+ * For simplicity, allow enabling ring+bitmap if and only if
+ * there are no memslots, e.g. to ensure all memslots allocate
+ * a bitmap after the capability is enabled.
+ */
+ if (kvm_are_all_memslots_empty(kvm)) {
+ kvm->dirty_ring_with_bitmap = true;
+ r = 0;
+ }
+
+ mutex_unlock(&kvm->slots_lock);
+
+ return r;
+ }
default:
return kvm_vm_ioctl_enable_cap(kvm, cap);
}
@@ -5409,6 +5453,7 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file,
int (*get)(void *, u64 *), int (*set)(void *, u64),
const char *fmt)
{
+ int ret;
struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
inode->i_private;
@@ -5420,15 +5465,13 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file,
if (!kvm_get_kvm_safe(stat_data->kvm))
return -ENOENT;
- if (simple_attr_open(inode, file, get,
- kvm_stats_debugfs_mode(stat_data->desc) & 0222
- ? set : NULL,
- fmt)) {
+ ret = simple_attr_open(inode, file, get,
+ kvm_stats_debugfs_mode(stat_data->desc) & 0222
+ ? set : NULL, fmt);
+ if (ret)
kvm_put_kvm(stat_data->kvm);
- return -ENOMEM;
- }
- return 0;
+ return ret;
}
static int kvm_debugfs_release(struct inode *inode, struct file *file)
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 68ff41d39545..346e47f15572 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -81,6 +81,9 @@ bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
{
struct kvm_memslots *slots = kvm_memslots(kvm);
+ if (!gpc->active)
+ return false;
+
if ((gpa & ~PAGE_MASK) + len > PAGE_SIZE)
return false;
@@ -240,10 +243,11 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
{
struct kvm_memslots *slots = kvm_memslots(kvm);
unsigned long page_offset = gpa & ~PAGE_MASK;
- kvm_pfn_t old_pfn, new_pfn;
+ bool unmap_old = false;
unsigned long old_uhva;
+ kvm_pfn_t old_pfn;
void *old_khva;
- int ret = 0;
+ int ret;
/*
* If must fit within a single page. The 'len' argument is
@@ -261,6 +265,11 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
write_lock_irq(&gpc->lock);
+ if (!gpc->active) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
old_pfn = gpc->pfn;
old_khva = gpc->khva - offset_in_page(gpc->khva);
old_uhva = gpc->uhva;
@@ -291,6 +300,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
/* If the HVA→PFN mapping was already valid, don't unmap it. */
old_pfn = KVM_PFN_ERR_FAULT;
old_khva = NULL;
+ ret = 0;
}
out:
@@ -305,14 +315,15 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
gpc->khva = NULL;
}
- /* Snapshot the new pfn before dropping the lock! */
- new_pfn = gpc->pfn;
+ /* Detect a pfn change before dropping the lock! */
+ unmap_old = (old_pfn != gpc->pfn);
+out_unlock:
write_unlock_irq(&gpc->lock);
mutex_unlock(&gpc->refresh_lock);
- if (old_pfn != new_pfn)
+ if (unmap_old)
gpc_unmap_khva(kvm, old_pfn, old_khva);
return ret;
@@ -346,42 +357,61 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
}
EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_unmap);
+void kvm_gpc_init(struct gfn_to_pfn_cache *gpc)
+{
+ rwlock_init(&gpc->lock);
+ mutex_init(&gpc->refresh_lock);
+}
+EXPORT_SYMBOL_GPL(kvm_gpc_init);
-int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
- struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
- gpa_t gpa, unsigned long len)
+int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+ struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
+ gpa_t gpa, unsigned long len)
{
WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage);
if (!gpc->active) {
- rwlock_init(&gpc->lock);
- mutex_init(&gpc->refresh_lock);
-
gpc->khva = NULL;
gpc->pfn = KVM_PFN_ERR_FAULT;
gpc->uhva = KVM_HVA_ERR_BAD;
gpc->vcpu = vcpu;
gpc->usage = usage;
gpc->valid = false;
- gpc->active = true;
spin_lock(&kvm->gpc_lock);
list_add(&gpc->list, &kvm->gpc_list);
spin_unlock(&kvm->gpc_lock);
+
+ /*
+ * Activate the cache after adding it to the list, a concurrent
+ * refresh must not establish a mapping until the cache is
+ * reachable by mmu_notifier events.
+ */
+ write_lock_irq(&gpc->lock);
+ gpc->active = true;
+ write_unlock_irq(&gpc->lock);
}
return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len);
}
-EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_init);
+EXPORT_SYMBOL_GPL(kvm_gpc_activate);
-void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
+void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
{
if (gpc->active) {
+ /*
+ * Deactivate the cache before removing it from the list, KVM
+ * must stall mmu_notifier events until all users go away, i.e.
+ * until gpc->lock is dropped and refresh is guaranteed to fail.
+ */
+ write_lock_irq(&gpc->lock);
+ gpc->active = false;
+ write_unlock_irq(&gpc->lock);
+
spin_lock(&kvm->gpc_lock);
list_del(&gpc->list);
spin_unlock(&kvm->gpc_lock);
kvm_gfn_to_pfn_cache_unmap(kvm, gpc);
- gpc->active = false;
}
}
-EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_destroy);
+EXPORT_SYMBOL_GPL(kvm_gpc_deactivate);