From 0b9e66762aa0cda2a9c2d5542d64e04dac528fa6 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 7 Mar 2022 02:47:17 -0800 Subject: btrfs: zoned: traverse devices under chunk_mutex in btrfs_can_activate_zone btrfs_can_activate_zone() can be called with the device_list_mutex already held, which will lead to a deadlock: insert_dev_extents() // Takes device_list_mutex `-> insert_dev_extent() `-> btrfs_insert_empty_item() `-> btrfs_insert_empty_items() `-> btrfs_search_slot() `-> btrfs_cow_block() `-> __btrfs_cow_block() `-> btrfs_alloc_tree_block() `-> btrfs_reserve_extent() `-> find_free_extent() `-> find_free_extent_update_loop() `-> can_allocate_chunk() `-> btrfs_can_activate_zone() // Takes device_list_mutex again Instead of using the RCU on fs_devices->device_list we can use fs_devices->alloc_list, protected by the chunk_mutex to traverse the list of active devices. We are in the chunk allocation thread. The newer chunk allocation happens from the devices in the fs_device->alloc_list protected by the chunk_mutex. btrfs_create_chunk() lockdep_assert_held(&info->chunk_mutex); gather_device_info list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) Also, a device that reappears after the mount won't join the alloc_list yet and, it will be in the dev_list, which we don't want to consider in the context of the chunk alloc. [15.166572] WARNING: possible recursive locking detected [15.167117] 5.17.0-rc6-dennis #79 Not tainted [15.167487] -------------------------------------------- [15.167733] kworker/u8:3/146 is trying to acquire lock: [15.167733] ffff888102962ee0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: find_free_extent+0x15a/0x14f0 [btrfs] [15.167733] [15.167733] but task is already holding lock: [15.167733] ffff888102962ee0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: btrfs_create_pending_block_groups+0x20a/0x560 [btrfs] [15.167733] [15.167733] other info that might help us debug this: [15.167733] Possible unsafe locking scenario: [15.167733] [15.171834] CPU0 [15.171834] ---- [15.171834] lock(&fs_devs->device_list_mutex); [15.171834] lock(&fs_devs->device_list_mutex); [15.171834] [15.171834] *** DEADLOCK *** [15.171834] [15.171834] May be due to missing lock nesting notation [15.171834] [15.171834] 5 locks held by kworker/u8:3/146: [15.171834] #0: ffff888100050938 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x1c3/0x5a0 [15.171834] #1: ffffc9000067be80 ((work_completion)(&fs_info->async_data_reclaim_work)){+.+.}-{0:0}, at: process_one_work+0x1c3/0x5a0 [15.176244] #2: ffff88810521e620 (sb_internal){.+.+}-{0:0}, at: flush_space+0x335/0x600 [btrfs] [15.176244] #3: ffff888102962ee0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: btrfs_create_pending_block_groups+0x20a/0x560 [btrfs] [15.176244] #4: ffff8881152e4b78 (btrfs-dev-00){++++}-{3:3}, at: __btrfs_tree_lock+0x27/0x130 [btrfs] [15.179641] [15.179641] stack backtrace: [15.179641] CPU: 1 PID: 146 Comm: kworker/u8:3 Not tainted 5.17.0-rc6-dennis #79 [15.179641] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1.fc35 04/01/2014 [15.179641] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs] [15.179641] Call Trace: [15.179641] [15.179641] dump_stack_lvl+0x45/0x59 [15.179641] __lock_acquire.cold+0x217/0x2b2 [15.179641] lock_acquire+0xbf/0x2b0 [15.183838] ? find_free_extent+0x15a/0x14f0 [btrfs] [15.183838] __mutex_lock+0x8e/0x970 [15.183838] ? find_free_extent+0x15a/0x14f0 [btrfs] [15.183838] ? find_free_extent+0x15a/0x14f0 [btrfs] [15.183838] ? lock_is_held_type+0xd7/0x130 [15.183838] ? find_free_extent+0x15a/0x14f0 [btrfs] [15.183838] find_free_extent+0x15a/0x14f0 [btrfs] [15.183838] ? _raw_spin_unlock+0x24/0x40 [15.183838] ? btrfs_get_alloc_profile+0x106/0x230 [btrfs] [15.187601] btrfs_reserve_extent+0x131/0x260 [btrfs] [15.187601] btrfs_alloc_tree_block+0xb5/0x3b0 [btrfs] [15.187601] __btrfs_cow_block+0x138/0x600 [btrfs] [15.187601] btrfs_cow_block+0x10f/0x230 [btrfs] [15.187601] btrfs_search_slot+0x55f/0xbc0 [btrfs] [15.187601] ? lock_is_held_type+0xd7/0x130 [15.187601] btrfs_insert_empty_items+0x2d/0x60 [btrfs] [15.187601] btrfs_create_pending_block_groups+0x2b3/0x560 [btrfs] [15.187601] __btrfs_end_transaction+0x36/0x2a0 [btrfs] [15.192037] flush_space+0x374/0x600 [btrfs] [15.192037] ? find_held_lock+0x2b/0x80 [15.192037] ? btrfs_async_reclaim_data_space+0x49/0x180 [btrfs] [15.192037] ? lock_release+0x131/0x2b0 [15.192037] btrfs_async_reclaim_data_space+0x70/0x180 [btrfs] [15.192037] process_one_work+0x24c/0x5a0 [15.192037] worker_thread+0x4a/0x3d0 Fixes: a85f05e59bc1 ("btrfs: zoned: avoid chunk allocation if active block group has enough space") CC: stable@vger.kernel.org # 5.16+ Reviewed-by: Anand Jain Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b7b5fac1c779..61125aec8723 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1976,18 +1976,19 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) { + struct btrfs_fs_info *fs_info = fs_devices->fs_info; struct btrfs_device *device; bool ret = false; - if (!btrfs_is_zoned(fs_devices->fs_info)) + if (!btrfs_is_zoned(fs_info)) return true; /* Non-single profiles are not supported yet */ ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0); /* Check if there is a device with active zones left */ - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + mutex_lock(&fs_info->chunk_mutex); + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { struct btrfs_zoned_device_info *zinfo = device->zone_info; if (!device->bdev) @@ -1999,7 +2000,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) break; } } - mutex_unlock(&fs_devices->device_list_mutex); + mutex_unlock(&fs_info->chunk_mutex); return ret; } -- cgit v1.2.3 From 62ed0bf7315b524973bb5fb9174b60e353289835 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 7 Mar 2022 02:47:18 -0800 Subject: btrfs: zoned: remove left over ASSERT checking for single profile With commit dcf5652291f6 ("btrfs: zoned: allow DUP on meta-data block groups") we started allowing DUP on metadata block groups, so the ASSERT()s in btrfs_can_activate_zone() and btrfs_zoned_get_device() are no longer valid and in fact even harmful. Fixes: dcf5652291f6 ("btrfs: zoned: allow DUP on meta-data block groups") CC: stable@vger.kernel.org # 5.17 Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 61125aec8723..1b1b310c3c51 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1801,7 +1801,6 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, map = em->map_lookup; /* We only support single profile for now */ - ASSERT(map->num_stripes == 1); device = map->stripes[0].dev; free_extent_map(em); @@ -1983,9 +1982,6 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) if (!btrfs_is_zoned(fs_info)) return true; - /* Non-single profiles are not supported yet */ - ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0); - /* Check if there is a device with active zones left */ mutex_lock(&fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { -- cgit v1.2.3 From b642b52d0b50f4d398cb4293f64992d0eed2e2ce Mon Sep 17 00:00:00 2001 From: Ethan Lien Date: Mon, 7 Mar 2022 18:00:04 +0800 Subject: btrfs: fix qgroup reserve overflow the qgroup limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We use extent_changeset->bytes_changed in qgroup_reserve_data() to record how many bytes we set for EXTENT_QGROUP_RESERVED state. Currently the bytes_changed is set as "unsigned int", and it will overflow if we try to fallocate a range larger than 4GiB. The result is we reserve less bytes and eventually break the qgroup limit. Unlike regular buffered/direct write, which we use one changeset for each ordered extent, which can never be larger than 256M. For fallocate, we use one changeset for the whole range, thus it no longer respects the 256M per extent limit, and caused the problem. The following example test script reproduces the problem: $ cat qgroup-overflow.sh #!/bin/bash DEV=/dev/sdj MNT=/mnt/sdj mkfs.btrfs -f $DEV mount $DEV $MNT # Set qgroup limit to 2GiB. btrfs quota enable $MNT btrfs qgroup limit 2G $MNT # Try to fallocate a 3GiB file. This should fail. echo echo "Try to fallocate a 3GiB file..." fallocate -l 3G $MNT/3G.file # Try to fallocate a 5GiB file. echo echo "Try to fallocate a 5GiB file..." fallocate -l 5G $MNT/5G.file # See we break the qgroup limit. echo sync btrfs qgroup show -r $MNT umount $MNT When running the test: $ ./qgroup-overflow.sh (...) Try to fallocate a 3GiB file... fallocate: fallocate failed: Disk quota exceeded Try to fallocate a 5GiB file... qgroupid         rfer         excl     max_rfer --------         ----         ----     -------- 0/5           5.00GiB      5.00GiB      2.00GiB Since we have no control of how bytes_changed is used, it's better to set it to u64. CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Qu Wenruo Signed-off-by: Ethan Lien Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 0399cf8e3c32..151e9da5da2d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -118,7 +118,7 @@ struct btrfs_bio_ctrl { */ struct extent_changeset { /* How many bytes are set/cleared in this operation */ - unsigned int bytes_changed; + u64 bytes_changed; /* Changed ranges */ struct ulist range_changed; -- cgit v1.2.3 From bbac58698a55cc0a6f0c0d69a6dcd3f9f3134c11 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 8 Mar 2022 13:36:38 +0800 Subject: btrfs: remove device item and update super block in the same transaction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [BUG] There is a report that a btrfs has a bad super block num devices. This makes btrfs to reject the fs completely. BTRFS error (device sdd3): super_num_devices 3 mismatch with num_devices 2 found here BTRFS error (device sdd3): failed to read chunk tree: -22 BTRFS error (device sdd3): open_ctree failed [CAUSE] During btrfs device removal, chunk tree and super block num devs are updated in two different transactions: btrfs_rm_device() |- btrfs_rm_dev_item(device) | |- trans = btrfs_start_transaction() | | Now we got transaction X | | | |- btrfs_del_item() | | Now device item is removed from chunk tree | | | |- btrfs_commit_transaction() | Transaction X got committed, super num devs untouched, | but device item removed from chunk tree. | (AKA, super num devs is already incorrect) | |- cur_devices->num_devices--; |- cur_devices->total_devices--; |- btrfs_set_super_num_devices() All those operations are not in transaction X, thus it will only be written back to disk in next transaction. So after the transaction X in btrfs_rm_dev_item() committed, but before transaction X+1 (which can be minutes away), a power loss happen, then we got the super num mismatch. [FIX] Instead of starting and committing a transaction inside btrfs_rm_dev_item(), start a transaction in side btrfs_rm_device() and pass it to btrfs_rm_dev_item(). And only commit the transaction after everything is done. Reported-by: Luca Béla Palkovics Link: https://lore.kernel.org/linux-btrfs/CA+8xDSpvdm_U0QLBAnrH=zqDq_cWCOH5TiV46CKmp3igr44okQ@mail.gmail.com/ CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 65 +++++++++++++++++++++++------------------------------- 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1be7cb2f955f..2cfbc74a3b4e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1896,23 +1896,18 @@ static void update_dev_time(const char *device_path) path_put(&path); } -static int btrfs_rm_dev_item(struct btrfs_device *device) +static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, + struct btrfs_device *device) { struct btrfs_root *root = device->fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_trans_handle *trans; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1923,21 +1918,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device) if (ret) { if (ret > 0) ret = -ENOENT; - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); goto out; } ret = btrfs_del_item(trans, root, path); - if (ret) { - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - } - out: btrfs_free_path(path); - if (!ret) - ret = btrfs_commit_transaction(trans); return ret; } @@ -2078,6 +2064,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, struct block_device **bdev, fmode_t *mode) { + struct btrfs_trans_handle *trans; struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; @@ -2098,7 +2085,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) - goto out; + return ret; device = btrfs_find_device(fs_info->fs_devices, args); if (!device) { @@ -2106,27 +2093,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else ret = -ENOENT; - goto out; + return ret; } if (btrfs_pinned_by_swapfile(fs_info, device)) { btrfs_warn_in_rcu(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", rcu_str_deref(device->name), device->devid); - ret = -ETXTBSY; - goto out; + return -ETXTBSY; } - if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { - ret = BTRFS_ERROR_DEV_TGT_REPLACE; - goto out; - } + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) + return BTRFS_ERROR_DEV_TGT_REPLACE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && - fs_info->fs_devices->rw_devices == 1) { - ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; - goto out; - } + fs_info->fs_devices->rw_devices == 1) + return BTRFS_ERROR_DEV_ONLY_WRITABLE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); @@ -2139,14 +2121,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, if (ret) goto error_undo; - /* - * TODO: the superblock still includes this device in its num_devices - * counter although write_all_supers() is not locked out. This - * could give a filesystem state which requires a degraded mount. - */ - ret = btrfs_rm_dev_item(device); - if (ret) + trans = btrfs_start_transaction(fs_info->chunk_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); goto error_undo; + } + + ret = btrfs_rm_dev_item(trans, device); + if (ret) { + /* Any error in dev item removal is critical */ + btrfs_crit(fs_info, + "failed to remove device item for devid %llu: %d", + device->devid, ret); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); btrfs_scrub_cancel_dev(device); @@ -2229,7 +2219,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, free_fs_devices(cur_devices); } -out: + ret = btrfs_commit_transaction(trans); + return ret; error_undo: @@ -2240,7 +2231,7 @@ error_undo: device->fs_devices->rw_devices++; mutex_unlock(&fs_info->chunk_mutex); } - goto out; + return ret; } void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) -- cgit v1.2.3 From 05fd9564e9faf0f23b4676385e27d9405cef6637 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 14 Mar 2022 10:55:32 -0700 Subject: btrfs: fix fallocate to use file_modified to update permissions consistently Since the initial introduction of (posix) fallocate back at the turn of the century, it has been possible to use this syscall to change the user-visible contents of files. This can happen by extending the file size during a preallocation, or through any of the newer modes (punch, zero range). Because the call can be used to change file contents, we should treat it like we do any other modification to a file -- update the mtime, and drop set[ug]id privileges/capabilities. The VFS function file_modified() does all this for us if pass it a locked inode, so let's make fallocate drop permissions correctly. Reviewed-by: Filipe Manana Signed-off-by: Darrick J. Wong Signed-off-by: David Sterba --- fs/btrfs/file.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9f455c96c974..380054c94e4b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2957,8 +2957,9 @@ out: return ret; } -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; @@ -2990,6 +2991,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } + ret = file_modified(file); + if (ret) + goto out_only_mutex; + lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); lockend = round_down(offset + len, btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; @@ -3430,7 +3435,7 @@ static long btrfs_fallocate(struct file *file, int mode, return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) - return btrfs_punch_hole(inode, offset, len); + return btrfs_punch_hole(file, offset, len); /* * Only trigger disk allocation, don't trigger qgroup reserve @@ -3452,6 +3457,10 @@ static long btrfs_fallocate(struct file *file, int mode, goto out; } + ret = file_modified(file); + if (ret) + goto out; + /* * TODO: Move these two operations after we have checked * accurate reserved space, or fallocate can still fail but -- cgit v1.2.3 From 75a36a7d3ea904cef2e5b56af0c58cc60dcf947a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 15 Mar 2022 19:28:05 +0800 Subject: btrfs: avoid defragging extents whose next extents are not targets [BUG] There is a report that autodefrag is defragging single sector, which is completely waste of IO, and no help for defragging: btrfs-cleaner-808 defrag_one_locked_range: root=256 ino=651122 start=0 len=4096 [CAUSE] In defrag_collect_targets(), we check if the current range (A) can be merged with next one (B). If mergeable, we will add range A into target for defrag. However there is a catch for autodefrag, when checking mergeability against range B, we intentionally pass 0 as @newer_than, hoping to get a higher chance to merge with the next extent. But in the next iteration, range B will looked up by defrag_lookup_extent(), with non-zero @newer_than. And if range B is not really newer, it will rejected directly, causing only range A being defragged, while we expect to defrag both range A and B. [FIX] Since the root cause is the difference in check condition of defrag_check_next_extent() and defrag_collect_targets(), we fix it by: 1. Pass @newer_than to defrag_check_next_extent() 2. Pass @extent_thresh to defrag_check_next_extent() This makes the check between defrag_collect_targets() and defrag_check_next_extent() more consistent. While there is still some minor difference, the remaining checks are focus on runtime flags like writeback/delalloc, which are mostly transient and safe to be checked only in defrag_collect_targets(). Link: https://github.com/btrfs/linux/issues/423#issuecomment-1066981856 CC: stable@vger.kernel.org # 5.16+ Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 238cee5b5254..f46e71061942 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1239,7 +1239,7 @@ static u32 get_extent_max_capacity(const struct extent_map *em) } static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, - bool locked) + u32 extent_thresh, u64 newer_than, bool locked) { struct extent_map *next; bool ret = false; @@ -1249,11 +1249,12 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, return false; /* - * We want to check if the next extent can be merged with the current - * one, which can be an extent created in a past generation, so we pass - * a minimum generation of 0 to defrag_lookup_extent(). + * Here we need to pass @newer_then when checking the next extent, or + * we will hit a case we mark current extent for defrag, but the next + * one will not be a target. + * This will just cause extra IO without really reducing the fragments. */ - next = defrag_lookup_extent(inode, em->start + em->len, 0, locked); + next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); /* No more em or hole */ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) goto out; @@ -1265,6 +1266,13 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, */ if (next->len >= get_extent_max_capacity(em)) goto out; + /* Skip older extent */ + if (next->generation < newer_than) + goto out; + /* Also check extent size */ + if (next->len >= extent_thresh) + goto out; + ret = true; out: free_extent_map(next); @@ -1470,7 +1478,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, goto next; next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, - locked); + extent_thresh, newer_than, locked); if (!next_mergeable) { struct defrag_target_range *last; -- cgit v1.2.3 From a7d16d9a07bbcb7dcd5214a1bea75c808830bc0d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 23 Mar 2022 11:30:36 -0400 Subject: btrfs: do not warn for free space inode in cow_file_range This is a long time leftover from when I originally added the free space inode, the point was to catch cases where we weren't honoring the NOCOW flag. However there exists a race with relocation, if we allocate our free space inode in a block group that is about to be relocated, we could trigger the COW path before the relocation has the opportunity to find the extents and delete the free space cache. In production where we have auto-relocation enabled we're seeing this WARN_ON_ONCE() around 5k times in a 2 week period, so not super common but enough that it's at the top of our metrics. We're properly handling the error here, and with us phasing out v1 space cache anyway just drop the WARN_ON_ONCE. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2e7143ff5523..b976f757571f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1128,7 +1128,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, int ret = 0; if (btrfs_is_free_space_inode(inode)) { - WARN_ON_ONCE(1); ret = -EINVAL; goto out_unlock; } -- cgit v1.2.3 From 60021bd754c6ca0addc6817994f20290a321d8d6 Mon Sep 17 00:00:00 2001 From: Kaiwen Hu Date: Wed, 23 Mar 2022 15:10:32 +0800 Subject: btrfs: prevent subvol with swapfile from being deleted A subvolume with an active swapfile must not be deleted otherwise it would not be possible to deactivate it. After the subvolume is deleted, we cannot swapoff the swapfile in this deleted subvolume because the path is unreachable. The swapfile is still active and holding references, the filesystem cannot be unmounted. The test looks like this: mkfs.btrfs -f $dev > /dev/null mount $dev $mnt btrfs sub create $mnt/subvol touch $mnt/subvol/swapfile chmod 600 $mnt/subvol/swapfile chattr +C $mnt/subvol/swapfile dd if=/dev/zero of=$mnt/subvol/swapfile bs=1K count=4096 mkswap $mnt/subvol/swapfile swapon $mnt/subvol/swapfile btrfs sub delete $mnt/subvol swapoff $mnt/subvol/swapfile # failed: No such file or directory swapoff --all unmount $mnt # target is busy. To prevent above issue, we simply check that whether the subvolume contains any active swapfile, and stop the deleting process. This behavior is like snapshot ioctl dealing with a swapfile. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Robbie Ko Reviewed-by: Qu Wenruo Reviewed-by: Filipe Manana Signed-off-by: Kaiwen Hu Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b976f757571f..5aab6af88349 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4487,6 +4487,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) dest->root_key.objectid); return -EPERM; } + if (atomic_read(&dest->nr_swapfiles)) { + spin_unlock(&dest->root_item_lock); + btrfs_warn(fs_info, + "attempt to delete subvolume %llu with active swapfile", + root->root_key.objectid); + return -EPERM; + } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); @@ -11110,8 +11117,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, * set. We use this counter to prevent snapshots. We must increment it * before walking the extents because we don't want a concurrent * snapshot to run after we've already checked the extents. + * + * It is possible that subvolume is marked for deletion but still not + * removed yet. To prevent this race, we check the root status before + * activating the swapfile. */ + spin_lock(&root->root_item_lock); + if (btrfs_root_dead(root)) { + spin_unlock(&root->root_item_lock); + + btrfs_exclop_finish(fs_info); + btrfs_warn(fs_info, + "cannot activate swapfile because subvolume %llu is being deleted", + root->root_key.objectid); + return -EPERM; + } atomic_inc(&root->nr_swapfiles); + spin_unlock(&root->root_item_lock); isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); -- cgit v1.2.3 From 9f8b577f7b43b2170628d6c537252785dcc2dcea Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Tue, 1 Mar 2022 15:11:35 +0100 Subject: Drivers: hv: vmbus: Deactivate sysctl_record_panic_msg by default in isolated guests hv_panic_page might contain guest-sensitive information, do not dump it over to Hyper-V by default in isolated guests. While at it, update some comments in hyperv_{panic,die}_event(). Reported-by: Dexuan Cui Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Dexuan Cui Link: https://lore.kernel.org/r/20220301141135.2232-1-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 12a2b37e87f3..a963b970ffb2 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -77,8 +77,8 @@ static int hyperv_panic_event(struct notifier_block *nb, unsigned long val, /* * Hyper-V should be notified only once about a panic. If we will be - * doing hyperv_report_panic_msg() later with kmsg data, don't do - * the notification here. + * doing hv_kmsg_dump() with kmsg data later, don't do the notification + * here. */ if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE && hyperv_report_reg()) { @@ -100,8 +100,8 @@ static int hyperv_die_event(struct notifier_block *nb, unsigned long val, /* * Hyper-V should be notified only once about a panic. If we will be - * doing hyperv_report_panic_msg() later with kmsg data, don't do - * the notification here. + * doing hv_kmsg_dump() with kmsg data later, don't do the notification + * here. */ if (hyperv_report_reg()) hyperv_report_panic(regs, val, true); @@ -1546,14 +1546,20 @@ static int vmbus_bus_init(void) if (ret) goto err_connect; + if (hv_is_isolation_supported()) + sysctl_record_panic_msg = 0; + /* * Only register if the crash MSRs are available */ if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { u64 hyperv_crash_ctl; /* - * Sysctl registration is not fatal, since by default - * reporting is enabled. + * Panic message recording (sysctl_record_panic_msg) + * is enabled by default in non-isolated guests and + * disabled by default in isolated guests; the panic + * message recording won't be available in isolated + * guests should the following registration fail. */ hv_ctl_table_hdr = register_sysctl_table(hv_root_table); if (!hv_ctl_table_hdr) -- cgit v1.2.3 From 3a5469582c241abca22500f36a9cb8e9331969cf Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Tue, 15 Mar 2022 15:10:53 +0100 Subject: Drivers: hv: vmbus: Fix initialization of device object in vmbus_device_register() Initialize the device's dma_{mask,parms} pointers and the device's dma_mask value before invoking device_register(). Address the following trace with 5.17-rc7: [ 49.646839] WARNING: CPU: 0 PID: 189 at include/linux/dma-mapping.h:543 netvsc_probe+0x37a/0x3a0 [hv_netvsc] [ 49.646928] Call Trace: [ 49.646930] [ 49.646935] vmbus_probe+0x40/0x60 [hv_vmbus] [ 49.646942] really_probe+0x1ce/0x3b0 [ 49.646948] __driver_probe_device+0x109/0x180 [ 49.646952] driver_probe_device+0x23/0xa0 [ 49.646955] __device_attach_driver+0x76/0xe0 [ 49.646958] ? driver_allows_async_probing+0x50/0x50 [ 49.646961] bus_for_each_drv+0x84/0xd0 [ 49.646964] __device_attach+0xed/0x170 [ 49.646967] device_initial_probe+0x13/0x20 [ 49.646970] bus_probe_device+0x8f/0xa0 [ 49.646973] device_add+0x41a/0x8e0 [ 49.646975] ? hrtimer_init+0x28/0x80 [ 49.646981] device_register+0x1b/0x20 [ 49.646983] vmbus_device_register+0x5e/0xf0 [hv_vmbus] [ 49.646991] vmbus_add_channel_work+0x12d/0x190 [hv_vmbus] [ 49.646999] process_one_work+0x21d/0x3f0 [ 49.647002] worker_thread+0x4a/0x3b0 [ 49.647005] ? process_one_work+0x3f0/0x3f0 [ 49.647007] kthread+0xff/0x130 [ 49.647011] ? kthread_complete_and_exit+0x20/0x20 [ 49.647015] ret_from_fork+0x22/0x30 [ 49.647020] [ 49.647021] ---[ end trace 0000000000000000 ]--- Fixes: 743b237c3a7b0 ("scsi: storvsc: Add Isolation VM support for storvsc driver") Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220315141053.3223-1-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index a963b970ffb2..df22fbdc2ae2 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2103,6 +2103,10 @@ int vmbus_device_register(struct hv_device *child_device_obj) child_device_obj->device.parent = &hv_acpi_dev->dev; child_device_obj->device.release = vmbus_device_release; + child_device_obj->device.dma_parms = &child_device_obj->dma_parms; + child_device_obj->device.dma_mask = &child_device_obj->dma_mask; + dma_set_mask(&child_device_obj->device, DMA_BIT_MASK(64)); + /* * Register with the LDM. This will kick off the driver/device * binding...which will eventually call vmbus_match() and vmbus_probe() @@ -2128,9 +2132,6 @@ int vmbus_device_register(struct hv_device *child_device_obj) } hv_debug_add_dev_dir(child_device_obj); - child_device_obj->device.dma_parms = &child_device_obj->dma_parms; - child_device_obj->device.dma_mask = &child_device_obj->dma_mask; - dma_set_mask(&child_device_obj->device, DMA_BIT_MASK(64)); return 0; err_kset_unregister: -- cgit v1.2.3 From 792f232d57ff28bbd5f9c4abe0466b23d5879dc8 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Tue, 15 Mar 2022 17:35:35 -0300 Subject: Drivers: hv: vmbus: Fix potential crash on module unload The vmbus driver relies on the panic notifier infrastructure to perform some operations when a panic event is detected. Since vmbus can be built as module, it is required that the driver handles both registering and unregistering such panic notifier callback. After commit 74347a99e73a ("x86/Hyper-V: Unload vmbus channel in hv panic callback") though, the panic notifier registration is done unconditionally in the module initialization routine whereas the unregistering procedure is conditionally guarded and executes only if HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE capability is set. This patch fixes that by unconditionally unregistering the panic notifier in the module's exit routine as well. Fixes: 74347a99e73a ("x86/Hyper-V: Unload vmbus channel in hv panic callback") Signed-off-by: Guilherme G. Piccoli Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220315203535.682306-1-gpiccoli@igalia.com Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index df22fbdc2ae2..6c057c76c2ca 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2787,10 +2787,15 @@ static void __exit vmbus_exit(void) if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { kmsg_dump_unregister(&hv_kmsg_dumper); unregister_die_notifier(&hyperv_die_block); - atomic_notifier_chain_unregister(&panic_notifier_list, - &hyperv_panic_block); } + /* + * The panic notifier is always registered, hence we should + * also unconditionally unregister it here as well. + */ + atomic_notifier_chain_unregister(&panic_notifier_list, + &hyperv_panic_block); + free_page((unsigned long)hv_panic_page); unregister_sysctl_table(hv_ctl_table_hdr); hv_ctl_table_hdr = NULL; -- cgit v1.2.3 From 37200078ed6aa2ac3c88a01a64996133dccfdd34 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Thu, 24 Mar 2022 09:14:51 -0700 Subject: Drivers: hv: vmbus: Propagate VMbus coherence to each VMbus device VMbus synthetic devices are not represented in the ACPI DSDT -- only the top level VMbus device is represented. As a result, on ARM64 coherence information in the _CCA method is not specified for synthetic devices, so they default to not hardware coherent. Drivers for some of these synthetic devices have been recently updated to use the standard DMA APIs, and they are incurring extra overhead of unneeded software coherence management. Fix this by propagating coherence information from the VMbus node in ACPI to the individual synthetic devices. There's no effect on x86/x64 where devices are always hardware coherent. Signed-off-by: Michael Kelley Acked-by: Robin Murphy Link: https://lore.kernel.org/r/1648138492-2191-2-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- drivers/hv/hv_common.c | 11 +++++++++++ drivers/hv/vmbus_drv.c | 31 +++++++++++++++++++++++++++++++ include/asm-generic/mshyperv.h | 1 + 3 files changed, 43 insertions(+) diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 181d16bbf49d..820e81406251 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -216,6 +217,16 @@ bool hv_query_ext_cap(u64 cap_query) } EXPORT_SYMBOL_GPL(hv_query_ext_cap); +void hv_setup_dma_ops(struct device *dev, bool coherent) +{ + /* + * Hyper-V does not offer a vIOMMU in the guest + * VM, so pass 0/NULL for the IOMMU settings + */ + arch_setup_dma_ops(dev, 0, 0, NULL, coherent); +} +EXPORT_SYMBOL_GPL(hv_setup_dma_ops); + bool hv_is_hibernation_supported(void) { return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 6c057c76c2ca..3cd0d3a44fa2 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -920,6 +920,21 @@ static int vmbus_probe(struct device *child_device) return ret; } +/* + * vmbus_dma_configure -- Configure DMA coherence for VMbus device + */ +static int vmbus_dma_configure(struct device *child_device) +{ + /* + * On ARM64, propagate the DMA coherence setting from the top level + * VMbus ACPI device to the child VMbus device being added here. + * On x86/x64 coherence is assumed and these calls have no effect. + */ + hv_setup_dma_ops(child_device, + device_get_dma_attr(&hv_acpi_dev->dev) == DEV_DMA_COHERENT); + return 0; +} + /* * vmbus_remove - Remove a vmbus device */ @@ -1040,6 +1055,7 @@ static struct bus_type hv_bus = { .remove = vmbus_remove, .probe = vmbus_probe, .uevent = vmbus_uevent, + .dma_configure = vmbus_dma_configure, .dev_groups = vmbus_dev_groups, .drv_groups = vmbus_drv_groups, .bus_groups = vmbus_bus_groups, @@ -2435,6 +2451,21 @@ static int vmbus_acpi_add(struct acpi_device *device) hv_acpi_dev = device; + /* + * Older versions of Hyper-V for ARM64 fail to include the _CCA + * method on the top level VMbus device in the DSDT. But devices + * are hardware coherent in all current Hyper-V use cases, so fix + * up the ACPI device to behave as if _CCA is present and indicates + * hardware coherence. + */ + ACPI_COMPANION_SET(&device->dev, device); + if (IS_ENABLED(CONFIG_ACPI_CCA_REQUIRED) && + device_get_dma_attr(&device->dev) == DEV_DMA_NOT_SUPPORTED) { + pr_info("No ACPI _CCA found; assuming coherent device I/O\n"); + device->flags.cca_seen = true; + device->flags.coherent_dma = true; + } + result = acpi_walk_resources(device->handle, METHOD_NAME__CRS, vmbus_walk_resources, NULL); diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index c08758b6b364..c05d2ce9b6cd 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -269,6 +269,7 @@ bool hv_isolation_type_snp(void); u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size); void hyperv_cleanup(void); bool hv_query_ext_cap(u64 cap_query); +void hv_setup_dma_ops(struct device *dev, bool coherent); void *hv_map_memory(void *addr, unsigned long size); void hv_unmap_memory(void *addr); #else /* CONFIG_HYPERV */ -- cgit v1.2.3 From 8d21732475c637c7efcdb91dc927a4c594e97898 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Thu, 24 Mar 2022 09:14:52 -0700 Subject: PCI: hv: Propagate coherence from VMbus device to PCI device PCI pass-thru devices in a Hyper-V VM are represented as a VMBus device and as a PCI device. The coherence of the VMbus device is set based on the VMbus node in ACPI, but the PCI device has no ACPI node and defaults to not hardware coherent. This results in extra software coherence management overhead on ARM64 when devices are hardware coherent. Fix this by setting up the PCI host bus so that normal PCI mechanisms will propagate the coherence of the VMbus device to the PCI device. There's no effect on x86/x64 where devices are always hardware coherent. Signed-off-by: Michael Kelley Acked-by: Boqun Feng Acked-by: Robin Murphy Link: https://lore.kernel.org/r/1648138492-2191-3-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index ae0bc2fee4ca..88b3b56d0522 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -3404,6 +3404,15 @@ static int hv_pci_probe(struct hv_device *hdev, hbus->bridge->domain_nr = dom; #ifdef CONFIG_X86 hbus->sysdata.domain = dom; +#elif defined(CONFIG_ARM64) + /* + * Set the PCI bus parent to be the corresponding VMbus + * device. Then the VMbus device will be assigned as the + * ACPI companion in pcibios_root_bridge_prepare() and + * pci_dma_configure() will propagate device coherence + * information to devices created on the bus. + */ + hbus->sysdata.parent = hdev->device.parent; #endif hbus->hdev = hdev; -- cgit v1.2.3 From b6cae15b5710c8097aad26a2e5e752c323ee5348 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 27 Mar 2022 08:25:10 -0700 Subject: Drivers: hv: vmbus: Prevent load re-ordering when reading ring buffer When reading a packet from a host-to-guest ring buffer, there is no memory barrier between reading the write index (to see if there is a packet to read) and reading the contents of the packet. The Hyper-V host uses store-release when updating the write index to ensure that writes of the packet data are completed first. On the guest side, the processor can reorder and read the packet data before the write index, and sometimes get stale packet data. Getting such stale packet data has been observed in a reproducible case in a VM on ARM64. Fix this by using virt_load_acquire() to read the write index, ensuring that reads of the packet data cannot be reordered before it. Preventing such reordering is logically correct, and with this change, getting stale data can no longer be reproduced. Signed-off-by: Michael Kelley Reviewed-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/1648394710-33480-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- drivers/hv/ring_buffer.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 71efacb90965..3d215d9dec43 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -439,7 +439,16 @@ int hv_ringbuffer_read(struct vmbus_channel *channel, static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi) { u32 priv_read_loc = rbi->priv_read_index; - u32 write_loc = READ_ONCE(rbi->ring_buffer->write_index); + u32 write_loc; + + /* + * The Hyper-V host writes the packet data, then uses + * store_release() to update the write_index. Use load_acquire() + * here to prevent loads of the packet data from being re-ordered + * before the read of the write_index and potentially getting + * stale data. + */ + write_loc = virt_load_acquire(&rbi->ring_buffer->write_index); if (write_loc >= priv_read_loc) return write_loc - priv_read_loc; -- cgit v1.2.3 From 31818213170caa51d116eb5dc1167b88523b4fe1 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Sun, 27 Mar 2022 23:36:25 +0100 Subject: netfilter: bitwise: fix reduce comparisons The `nft_bitwise_reduce` and `nft_bitwise_fast_reduce` functions should compare the bitwise operation in `expr` with the tracked operation associated with the destination register of `expr`. However, instead of being called on `expr` and `track->regs[priv->dreg].selector`, `nft_expr_priv` is called on `expr` twice, so both reduce functions return true even when the operations differ. Fixes: be5650f8f47e ("netfilter: nft_bitwise: track register operations") Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_bitwise.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 38caa66632b4..f590ee1c8a1b 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -290,7 +290,7 @@ static bool nft_bitwise_reduce(struct nft_regs_track *track, if (!track->regs[priv->sreg].selector) return false; - bitwise = nft_expr_priv(expr); + bitwise = nft_expr_priv(track->regs[priv->dreg].selector); if (track->regs[priv->sreg].selector == track->regs[priv->dreg].selector && track->regs[priv->sreg].num_reg == 0 && track->regs[priv->dreg].bitwise && @@ -442,7 +442,7 @@ static bool nft_bitwise_fast_reduce(struct nft_regs_track *track, if (!track->regs[priv->sreg].selector) return false; - bitwise = nft_expr_priv(expr); + bitwise = nft_expr_priv(track->regs[priv->dreg].selector); if (track->regs[priv->sreg].selector == track->regs[priv->dreg].selector && track->regs[priv->dreg].bitwise && track->regs[priv->dreg].bitwise->ops == expr->ops && -- cgit v1.2.3 From 7414539c5f2e43bad67ae88a3612455d01583429 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 30 Mar 2022 02:19:16 -0400 Subject: Revert "virtio: use virtio_device_ready() in virtio_device_restore()" This reverts commit 8d65bc9a5be3f23c5e2ab36b6b8ef40095165b18. We reverted the problematic changes, no more need for work arounds on restore. Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/virtio/virtio.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 75c8d560bbd3..22f15f444f75 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -526,9 +526,8 @@ int virtio_device_restore(struct virtio_device *dev) goto err; } - /* If restore didn't do it, mark device DRIVER_OK ourselves. */ - if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK)) - virtio_device_ready(dev); + /* Finally, tell the device we're all set */ + virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); virtio_config_enable(dev); -- cgit v1.2.3 From c18c86808b78c4c2dc69f27f37c57abab14ee387 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 30 Mar 2022 02:22:17 -0400 Subject: Revert "virtio_config: introduce a new .enable_cbs method" This reverts commit d50497eb4e554e1f0351e1836ee7241c059592e6. The new callback ended up not being used, and it's asymmetrical: just enable, no disable. Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- include/linux/virtio_config.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index dafdc7f48c01..b341dd62aa4d 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -23,8 +23,6 @@ struct virtio_shm_region { * any of @get/@set, @get_status/@set_status, or @get_features/ * @finalize_features are NOT safe to be called from an atomic * context. - * @enable_cbs: enable the callbacks - * vdev: the virtio_device * @get: read the value of a configuration field * vdev: the virtio_device * offset: the offset of the configuration field @@ -78,7 +76,6 @@ struct virtio_shm_region { */ typedef void vq_callback_t(struct virtqueue *); struct virtio_config_ops { - void (*enable_cbs)(struct virtio_device *vdev); void (*get)(struct virtio_device *vdev, unsigned offset, void *buf, unsigned len); void (*set)(struct virtio_device *vdev, unsigned offset, @@ -233,9 +230,6 @@ void virtio_device_ready(struct virtio_device *dev) { unsigned status = dev->config->get_status(dev); - if (dev->config->enable_cbs) - dev->config->enable_cbs(dev); - BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK); dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK); } -- cgit v1.2.3 From 55ebf0d60e3cc6c9e8593399e185842c00e12f36 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 29 Mar 2022 12:21:07 +0800 Subject: vdpa: mlx5: prevent cvq work from hogging CPU A userspace triggerable infinite loop could happen in mlx5_cvq_kick_handler() if userspace keeps sending a huge amount of cvq requests. Fixing this by introducing a quota and re-queue the work if we're out of the budget (currently the implicit budget is one) . While at it, using a per device work struct to avoid on demand memory allocation for cvq. Fixes: 5262912ef3cfc ("vdpa/mlx5: Add support for control VQ and MAC setting") Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20220329042109.4029-1-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin Acked-by: Eli Cohen --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 2f4fb09f1e89..da21e5430434 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -163,6 +163,7 @@ struct mlx5_vdpa_net { u32 cur_num_vqs; struct notifier_block nb; struct vdpa_callback config_cb; + struct mlx5_vdpa_wq_ent cvq_ent; }; static void free_resources(struct mlx5_vdpa_net *ndev); @@ -1659,10 +1660,10 @@ static void mlx5_cvq_kick_handler(struct work_struct *work) ndev = to_mlx5_vdpa_ndev(mvdev); cvq = &mvdev->cvq; if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) - goto out; + return; if (!cvq->ready) - goto out; + return; while (true) { err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head, @@ -1696,9 +1697,10 @@ static void mlx5_cvq_kick_handler(struct work_struct *work) if (vringh_need_notify_iotlb(&cvq->vring)) vringh_notify(&cvq->vring); + + queue_work(mvdev->wq, &wqent->work); + break; } -out: - kfree(wqent); } static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx) @@ -1706,7 +1708,6 @@ static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx) struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); struct mlx5_vdpa_virtqueue *mvq; - struct mlx5_vdpa_wq_ent *wqent; if (!is_index_valid(mvdev, idx)) return; @@ -1715,13 +1716,7 @@ static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx) if (!mvdev->wq || !mvdev->cvq.ready) return; - wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC); - if (!wqent) - return; - - wqent->mvdev = mvdev; - INIT_WORK(&wqent->work, mlx5_cvq_kick_handler); - queue_work(mvdev->wq, &wqent->work); + queue_work(mvdev->wq, &ndev->cvq_ent.work); return; } @@ -2740,6 +2735,8 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, if (err) goto err_mr; + ndev->cvq_ent.mvdev = mvdev; + INIT_WORK(&ndev->cvq_ent.work, mlx5_cvq_kick_handler); mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq"); if (!mvdev->wq) { err = -ENOMEM; -- cgit v1.2.3 From 1c80cf031e0204fde471558ee40183695773ce13 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 29 Mar 2022 12:21:08 +0800 Subject: vdpa: mlx5: synchronize driver status with CVQ Currently, CVQ doesn't have any synchronization with the driver status. Then CVQ emulation code run in the middle of: 1) device reset 2) device status changed 3) map updating The will lead several unexpected issue like trying to execute CVQ command after the driver has been teared down. Fixing this by using reslock to synchronize CVQ emulation code with the driver status changing: - protect the whole device reset, status changing and set_map() updating with reslock - protect the CVQ handler with the reslock and check VIRTIO_CONFIG_S_DRIVER_OK in the CVQ handler This will guarantee that: 1) CVQ handler won't work if VIRTIO_CONFIG_S_DRIVER_OK is not set 2) CVQ handler will see a consistent state of the driver instead of the partial one when it is running in the middle of the teardown_driver() or setup_driver(). Cc: 5262912ef3cfc ("vdpa/mlx5: Add support for control VQ and MAC setting") Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20220329042109.4029-2-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin Acked-by: Eli Cohen --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 51 ++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index da21e5430434..79001301b383 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1659,11 +1659,17 @@ static void mlx5_cvq_kick_handler(struct work_struct *work) mvdev = wqent->mvdev; ndev = to_mlx5_vdpa_ndev(mvdev); cvq = &mvdev->cvq; + + mutex_lock(&ndev->reslock); + + if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) + goto out; + if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) - return; + goto out; if (!cvq->ready) - return; + goto out; while (true) { err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head, @@ -1701,6 +1707,9 @@ static void mlx5_cvq_kick_handler(struct work_struct *work) queue_work(mvdev->wq, &wqent->work); break; } + +out: + mutex_unlock(&ndev->reslock); } static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx) @@ -2175,7 +2184,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb goto err_mr; if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) - return 0; + goto err_mr; restore_channels_info(ndev); err = setup_driver(mvdev); @@ -2190,12 +2199,14 @@ err_mr: return err; } +/* reslock must be held for this function */ static int setup_driver(struct mlx5_vdpa_dev *mvdev) { struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); int err; - mutex_lock(&ndev->reslock); + WARN_ON(!mutex_is_locked(&ndev->reslock)); + if (ndev->setup) { mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n"); err = 0; @@ -2225,7 +2236,6 @@ static int setup_driver(struct mlx5_vdpa_dev *mvdev) goto err_fwd; } ndev->setup = true; - mutex_unlock(&ndev->reslock); return 0; @@ -2236,23 +2246,23 @@ err_tir: err_rqt: teardown_virtqueues(ndev); out: - mutex_unlock(&ndev->reslock); return err; } +/* reslock must be held for this function */ static void teardown_driver(struct mlx5_vdpa_net *ndev) { - mutex_lock(&ndev->reslock); + + WARN_ON(!mutex_is_locked(&ndev->reslock)); + if (!ndev->setup) - goto out; + return; remove_fwd_to_tir(ndev); destroy_tir(ndev); destroy_rqt(ndev); teardown_virtqueues(ndev); ndev->setup = false; -out: - mutex_unlock(&ndev->reslock); } static void clear_vqs_ready(struct mlx5_vdpa_net *ndev) @@ -2273,6 +2283,8 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) print_status(mvdev, status, true); + mutex_lock(&ndev->reslock); + if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) { if (status & VIRTIO_CONFIG_S_DRIVER_OK) { err = setup_driver(mvdev); @@ -2282,16 +2294,19 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) } } else { mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n"); - return; + goto err_clear; } } ndev->mvdev.status = status; + mutex_unlock(&ndev->reslock); return; err_setup: mlx5_vdpa_destroy_mr(&ndev->mvdev); ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED; +err_clear: + mutex_unlock(&ndev->reslock); } static int mlx5_vdpa_reset(struct vdpa_device *vdev) @@ -2301,6 +2316,8 @@ static int mlx5_vdpa_reset(struct vdpa_device *vdev) print_status(mvdev, 0, true); mlx5_vdpa_info(mvdev, "performing device reset\n"); + + mutex_lock(&ndev->reslock); teardown_driver(ndev); clear_vqs_ready(ndev); mlx5_vdpa_destroy_mr(&ndev->mvdev); @@ -2313,6 +2330,7 @@ static int mlx5_vdpa_reset(struct vdpa_device *vdev) if (mlx5_vdpa_create_mr(mvdev, NULL)) mlx5_vdpa_warn(mvdev, "create MR failed\n"); } + mutex_unlock(&ndev->reslock); return 0; } @@ -2348,19 +2366,24 @@ static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev) static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); bool change_map; int err; + mutex_lock(&ndev->reslock); + err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map); if (err) { mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err); - return err; + goto err; } if (change_map) - return mlx5_vdpa_change_map(mvdev, iotlb); + err = mlx5_vdpa_change_map(mvdev, iotlb); - return 0; +err: + mutex_unlock(&ndev->reslock); + return err; } static void mlx5_vdpa_free(struct vdpa_device *vdev) -- cgit v1.2.3 From 522574fd7864e091d473765102e866414979b2ab Mon Sep 17 00:00:00 2001 From: Delyan Kratunov Date: Mon, 21 Mar 2022 23:29:18 +0000 Subject: bpftool: Explicit errno handling in skeletons Andrii noticed that since f97b8b9bd630 ("bpftool: Fix a bug in subskeleton code generation") the subskeleton code allows bpf_object__destroy_subskeleton to overwrite the errno that subskeleton__open would return with. While this is not currently an issue, let's make it future-proof. This patch explicitly tracks err in subskeleton__open and skeleton__create (i.e. calloc failure is explicitly ENOMEM) and ensures that errno is -err on the error return path. The skeleton code had to be changed since maps and progs codegen is shared with subskeletons. Fixes: f97b8b9bd630 ("bpftool: Fix a bug in subskeleton code generation") Signed-off-by: Delyan Kratunov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/3b6bfbb770c79ae64d8de26c1c1bd9d53a4b85f8.camel@fb.com --- tools/bpf/bpftool/gen.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 91af2850b505..7678af364793 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -828,8 +828,10 @@ codegen_maps_skeleton(struct bpf_object *obj, size_t map_cnt, bool mmaped) s->map_cnt = %zu; \n\ s->map_skel_sz = sizeof(*s->maps); \n\ s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz);\n\ - if (!s->maps) \n\ + if (!s->maps) { \n\ + err = -ENOMEM; \n\ goto err; \n\ + } \n\ ", map_cnt ); @@ -870,8 +872,10 @@ codegen_progs_skeleton(struct bpf_object *obj, size_t prog_cnt, bool populate_li s->prog_cnt = %zu; \n\ s->prog_skel_sz = sizeof(*s->progs); \n\ s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz);\n\ - if (!s->progs) \n\ + if (!s->progs) { \n\ + err = -ENOMEM; \n\ goto err; \n\ + } \n\ ", prog_cnt ); @@ -1182,10 +1186,13 @@ static int do_skeleton(int argc, char **argv) %1$s__create_skeleton(struct %1$s *obj) \n\ { \n\ struct bpf_object_skeleton *s; \n\ + int err; \n\ \n\ s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s));\n\ - if (!s) \n\ + if (!s) { \n\ + err = -ENOMEM; \n\ goto err; \n\ + } \n\ \n\ s->sz = sizeof(*s); \n\ s->name = \"%1$s\"; \n\ @@ -1206,7 +1213,7 @@ static int do_skeleton(int argc, char **argv) return 0; \n\ err: \n\ bpf_object__destroy_skeleton(s); \n\ - return -ENOMEM; \n\ + return err; \n\ } \n\ \n\ static inline const void *%2$s__elf_bytes(size_t *sz) \n\ @@ -1466,12 +1473,12 @@ static int do_subskeleton(int argc, char **argv) \n\ obj = (struct %1$s *)calloc(1, sizeof(*obj)); \n\ if (!obj) { \n\ - errno = ENOMEM; \n\ + err = -ENOMEM; \n\ goto err; \n\ } \n\ s = (struct bpf_object_subskeleton *)calloc(1, sizeof(*s));\n\ if (!s) { \n\ - errno = ENOMEM; \n\ + err = -ENOMEM; \n\ goto err; \n\ } \n\ s->sz = sizeof(*s); \n\ @@ -1483,7 +1490,7 @@ static int do_subskeleton(int argc, char **argv) s->var_cnt = %2$d; \n\ s->vars = (struct bpf_var_skeleton *)calloc(%2$d, sizeof(*s->vars));\n\ if (!s->vars) { \n\ - errno = ENOMEM; \n\ + err = -ENOMEM; \n\ goto err; \n\ } \n\ ", @@ -1538,6 +1545,7 @@ static int do_subskeleton(int argc, char **argv) return obj; \n\ err: \n\ %1$s__destroy(obj); \n\ + errno = -err; \n\ return NULL; \n\ } \n\ \n\ -- cgit v1.2.3 From d31e0386a2f122b40b605eb0120a2fbcfca77868 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 30 Mar 2022 13:05:10 +0200 Subject: bpf: Fix sparse warnings in kprobe_multi_resolve_syms Adding missing __user tags to fix sparse warnings: kernel/trace/bpf_trace.c:2370:34: warning: incorrect type in argument 2 (different address spaces) kernel/trace/bpf_trace.c:2370:34: expected void const [noderef] __user *from kernel/trace/bpf_trace.c:2370:34: got void const *usyms kernel/trace/bpf_trace.c:2376:51: warning: incorrect type in argument 2 (different address spaces) kernel/trace/bpf_trace.c:2376:51: expected char const [noderef] __user *src kernel/trace/bpf_trace.c:2376:51: got char const * kernel/trace/bpf_trace.c:2443:49: warning: incorrect type in argument 1 (different address spaces) kernel/trace/bpf_trace.c:2443:49: expected void const *usyms kernel/trace/bpf_trace.c:2443:49: got void [noderef] __user *[assigned] usyms Fixes: 0dcac2725406 ("bpf: Add multi kprobe link") Reported-by: Alexei Starovoitov Reported-by: Jakub Kicinski Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220330110510.398558-1-jolsa@kernel.org --- kernel/trace/bpf_trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7fa2ebc07f60..d8553f46caa2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2349,11 +2349,11 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip, } static int -kprobe_multi_resolve_syms(const void *usyms, u32 cnt, +kprobe_multi_resolve_syms(const void __user *usyms, u32 cnt, unsigned long *addrs) { unsigned long addr, size; - const char **syms; + const char __user **syms; int err = -ENOMEM; unsigned int i; char *func; -- cgit v1.2.3 From 2609f635a20d3691e7b5725edc3bdadb7bedf8fb Mon Sep 17 00:00:00 2001 From: Haowen Bai Date: Wed, 30 Mar 2022 09:59:48 +0800 Subject: selftests/bpf: Fix warning comparing pointer to 0 Avoid pointer type value compared with 0 to make code clear. Reported by coccicheck: tools/testing/selftests/bpf/progs/map_ptr_kern.c:370:21-22: WARNING comparing pointer to 0 tools/testing/selftests/bpf/progs/map_ptr_kern.c:397:21-22: WARNING comparing pointer to 0 Signed-off-by: Haowen Bai Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/1648605588-19269-1-git-send-email-baihaowen@meizu.com --- tools/testing/selftests/bpf/progs/map_ptr_kern.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c index b64df94ec476..db388f593d0a 100644 --- a/tools/testing/selftests/bpf/progs/map_ptr_kern.c +++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c @@ -367,7 +367,7 @@ static inline int check_array_of_maps(void) VERIFY(check_default(&array_of_maps->map, map)); inner_map = bpf_map_lookup_elem(array_of_maps, &key); - VERIFY(inner_map != 0); + VERIFY(inner_map != NULL); VERIFY(inner_map->map.max_entries == INNER_MAX_ENTRIES); return 1; @@ -394,7 +394,7 @@ static inline int check_hash_of_maps(void) VERIFY(check_default(&hash_of_maps->map, map)); inner_map = bpf_map_lookup_elem(hash_of_maps, &key); - VERIFY(inner_map != 0); + VERIFY(inner_map != NULL); VERIFY(inner_map->map.max_entries == INNER_MAX_ENTRIES); return 1; -- cgit v1.2.3 From a2fb49833cad07a53651c23dce508127085fae2d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 31 Mar 2022 10:11:17 +0900 Subject: rethook: Fix to use WRITE_ONCE() for rethook:: Handler Since the function pointered by rethook::handler never be removed when the rethook is alive, it doesn't need to use rcu_assign_pointer() to update it. Just use WRITE_ONCE(). Reported-by: Alexei Starovoitov Signed-off-by: Masami Hiramatsu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164868907688.21983.1606862921419988152.stgit@devnote2 --- kernel/trace/rethook.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index ab463a4d2b23..b56833700d23 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -65,7 +65,7 @@ static void rethook_free_rcu(struct rcu_head *head) */ void rethook_free(struct rethook *rh) { - rcu_assign_pointer(rh->handler, NULL); + WRITE_ONCE(rh->handler, NULL); call_rcu(&rh->rcu, rethook_free_rcu); } -- cgit v1.2.3 From 4a9c7bbe2ed4d2b240674b1fb606c41d3940c412 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 29 Mar 2022 18:14:56 -0700 Subject: bpf: Resolve to prog->aux->dst_prog->type only for BPF_PROG_TYPE_EXT The commit 7e40781cc8b7 ("bpf: verifier: Use target program's type for access verifications") fixes the verifier checking for BPF_PROG_TYPE_EXT (extension) prog such that the verifier looks for things based on the target prog type that it is extending instead of the BPF_PROG_TYPE_EXT itself. The current resolve_prog_type() returns the target prog type. It checks for nullness on prog->aux->dst_prog. However, when loading a BPF_PROG_TYPE_TRACING prog and it is tracing another bpf prog instead of a kernel function, prog->aux->dst_prog is not NULL also. In this case, the verifier should still verify as the BPF_PROG_TYPE_TRACING type instead of the traced prog type in prog->aux->dst_prog->type. An oops has been reported when tracing a struct_ops prog. A NULL dereference happened in check_return_code() when accessing the prog->aux->attach_func_proto->type and prog->aux->attach_func_proto is NULL here because the traced struct_ops prog has the "unreliable" set. This patch is to change the resolve_prog_type() to only return the target prog type if the prog being verified is BPF_PROG_TYPE_EXT. Fixes: 7e40781cc8b7 ("bpf: verifier: Use target program's type for access verifications") Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220330011456.2984509-1-kafai@fb.com --- include/linux/bpf_verifier.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c1fc4af47f69..3a9d2d7cc6b7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -570,9 +570,11 @@ static inline u32 type_flag(u32 type) return type & ~BPF_BASE_TYPE_MASK; } +/* only use after check_attach_btf_id() */ static inline enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) { - return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type; + return prog->type == BPF_PROG_TYPE_EXT ? + prog->aux->dst_prog->type : prog->type; } #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From 0a210af6d0a0595fef566e7eeb072f10f37774be Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 29 Mar 2022 18:15:02 -0700 Subject: bpf: selftests: Test fentry tracing a struct_ops program This patch tests attaching an fentry prog to a struct_ops prog. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220330011502.2985292-1-kafai@fb.com --- .../selftests/bpf/prog_tests/dummy_st_ops.c | 23 ++++++++++++++++++++++ .../selftests/bpf/progs/trace_dummy_st_ops.c | 21 ++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/trace_dummy_st_ops.c diff --git a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c index 5aa52cc31dc2..c11832657d2b 100644 --- a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c +++ b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c @@ -2,6 +2,7 @@ /* Copyright (C) 2021. Huawei Technologies Co., Ltd */ #include #include "dummy_st_ops.skel.h" +#include "trace_dummy_st_ops.skel.h" /* Need to keep consistent with definition in include/linux/bpf.h */ struct bpf_dummy_ops_state { @@ -56,6 +57,7 @@ static void test_dummy_init_ptr_arg(void) .ctx_in = args, .ctx_size_in = sizeof(args), ); + struct trace_dummy_st_ops *trace_skel; struct dummy_st_ops *skel; int fd, err; @@ -64,12 +66,33 @@ static void test_dummy_init_ptr_arg(void) return; fd = bpf_program__fd(skel->progs.test_1); + + trace_skel = trace_dummy_st_ops__open(); + if (!ASSERT_OK_PTR(trace_skel, "trace_dummy_st_ops__open")) + goto done; + + err = bpf_program__set_attach_target(trace_skel->progs.fentry_test_1, + fd, "test_1"); + if (!ASSERT_OK(err, "set_attach_target(fentry_test_1)")) + goto done; + + err = trace_dummy_st_ops__load(trace_skel); + if (!ASSERT_OK(err, "load(trace_skel)")) + goto done; + + err = trace_dummy_st_ops__attach(trace_skel); + if (!ASSERT_OK(err, "attach(trace_skel)")) + goto done; + err = bpf_prog_test_run_opts(fd, &attr); ASSERT_OK(err, "test_run"); ASSERT_EQ(in_state.val, 0x5a, "test_ptr_ret"); ASSERT_EQ(attr.retval, exp_retval, "test_ret"); + ASSERT_EQ(trace_skel->bss->val, exp_retval, "fentry_val"); +done: dummy_st_ops__destroy(skel); + trace_dummy_st_ops__destroy(trace_skel); } static void test_dummy_multiple_args(void) diff --git a/tools/testing/selftests/bpf/progs/trace_dummy_st_ops.c b/tools/testing/selftests/bpf/progs/trace_dummy_st_ops.c new file mode 100644 index 000000000000..00a4be9d3074 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/trace_dummy_st_ops.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +int val = 0; + +SEC("fentry/test_1") +int BPF_PROG(fentry_test_1, __u64 *st_ops_ctx) +{ + __u64 state; + + /* Read the traced st_ops arg1 which is a pointer */ + bpf_probe_read_kernel(&state, sizeof(__u64), (void *)st_ops_ctx); + /* Read state->val */ + bpf_probe_read_kernel(&val, sizeof(__u32), (void *)state); + + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 059a47f1da93811d37533556d67e72f2261b1127 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 30 Mar 2022 16:37:03 +0000 Subject: net: sfc: add missing xdp queue reinitialization After rx/tx ring buffer size is changed, kernel panic occurs when it acts XDP_TX or XDP_REDIRECT. When tx/rx ring buffer size is changed(ethtool -G), sfc driver reallocates and reinitializes rx and tx queues and their buffer (tx_queue->buffer). But it misses reinitializing xdp queues(efx->xdp_tx_queues). So, while it is acting XDP_TX or XDP_REDIRECT, it uses the uninitialized tx_queue->buffer. A new function efx_set_xdp_channels() is separated from efx_set_channels() to handle only xdp queues. Splat looks like: BUG: kernel NULL pointer dereference, address: 000000000000002a #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#4] PREEMPT SMP NOPTI RIP: 0010:efx_tx_map_chunk+0x54/0x90 [sfc] CPU: 2 PID: 0 Comm: swapper/2 Tainted: G D 5.17.0+ #55 e8beeee8289528f11357029357cf Code: 48 8b 8d a8 01 00 00 48 8d 14 52 4c 8d 2c d0 44 89 e0 48 85 c9 74 0e 44 89 e2 4c 89 f6 48 80 RSP: 0018:ffff92f121e45c60 EFLAGS: 00010297 RIP: 0010:efx_tx_map_chunk+0x54/0x90 [sfc] RAX: 0000000000000040 RBX: ffff92ea506895c0 RCX: ffffffffc0330870 RDX: 0000000000000001 RSI: 00000001139b10ce RDI: ffff92ea506895c0 RBP: ffffffffc0358a80 R08: 00000001139b110d R09: 0000000000000000 R10: 0000000000000001 R11: ffff92ea414c0088 R12: 0000000000000040 R13: 0000000000000018 R14: 00000001139b10ce R15: ffff92ea506895c0 FS: 0000000000000000(0000) GS:ffff92f121ec0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 Code: 48 8b 8d a8 01 00 00 48 8d 14 52 4c 8d 2c d0 44 89 e0 48 85 c9 74 0e 44 89 e2 4c 89 f6 48 80 CR2: 000000000000002a CR3: 00000003e6810004 CR4: 00000000007706e0 RSP: 0018:ffff92f121e85c60 EFLAGS: 00010297 PKRU: 55555554 RAX: 0000000000000040 RBX: ffff92ea50689700 RCX: ffffffffc0330870 RDX: 0000000000000001 RSI: 00000001145a90ce RDI: ffff92ea50689700 RBP: ffffffffc0358a80 R08: 00000001145a910d R09: 0000000000000000 R10: 0000000000000001 R11: ffff92ea414c0088 R12: 0000000000000040 R13: 0000000000000018 R14: 00000001145a90ce R15: ffff92ea50689700 FS: 0000000000000000(0000) GS:ffff92f121e80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000002a CR3: 00000003e6810005 CR4: 00000000007706e0 PKRU: 55555554 Call Trace: efx_xdp_tx_buffers+0x12b/0x3d0 [sfc 84c94b8e32d44d296c17e10a634d3ad454de4ba5] __efx_rx_packet+0x5c3/0x930 [sfc 84c94b8e32d44d296c17e10a634d3ad454de4ba5] efx_rx_packet+0x28c/0x2e0 [sfc 84c94b8e32d44d296c17e10a634d3ad454de4ba5] efx_ef10_ev_process+0x5f8/0xf40 [sfc 84c94b8e32d44d296c17e10a634d3ad454de4ba5] ? enqueue_task_fair+0x95/0x550 efx_poll+0xc4/0x360 [sfc 84c94b8e32d44d296c17e10a634d3ad454de4ba5] Fixes: 3990a8fffbda ("sfc: allocate channels for XDP tx queues") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/efx_channels.c | 146 ++++++++++++++++++-------------- 1 file changed, 81 insertions(+), 65 deletions(-) diff --git a/drivers/net/ethernet/sfc/efx_channels.c b/drivers/net/ethernet/sfc/efx_channels.c index f9064532beb6..83e27231fbe6 100644 --- a/drivers/net/ethernet/sfc/efx_channels.c +++ b/drivers/net/ethernet/sfc/efx_channels.c @@ -786,6 +786,85 @@ void efx_remove_channels(struct efx_nic *efx) kfree(efx->xdp_tx_queues); } +static int efx_set_xdp_tx_queue(struct efx_nic *efx, int xdp_queue_number, + struct efx_tx_queue *tx_queue) +{ + if (xdp_queue_number >= efx->xdp_tx_queue_count) + return -EINVAL; + + netif_dbg(efx, drv, efx->net_dev, + "Channel %u TXQ %u is XDP %u, HW %u\n", + tx_queue->channel->channel, tx_queue->label, + xdp_queue_number, tx_queue->queue); + efx->xdp_tx_queues[xdp_queue_number] = tx_queue; + return 0; +} + +static void efx_set_xdp_channels(struct efx_nic *efx) +{ + struct efx_tx_queue *tx_queue; + struct efx_channel *channel; + unsigned int next_queue = 0; + int xdp_queue_number = 0; + int rc; + + /* We need to mark which channels really have RX and TX + * queues, and adjust the TX queue numbers if we have separate + * RX-only and TX-only channels. + */ + efx_for_each_channel(channel, efx) { + if (channel->channel < efx->tx_channel_offset) + continue; + + if (efx_channel_is_xdp_tx(channel)) { + efx_for_each_channel_tx_queue(tx_queue, channel) { + tx_queue->queue = next_queue++; + rc = efx_set_xdp_tx_queue(efx, xdp_queue_number, + tx_queue); + if (rc == 0) + xdp_queue_number++; + } + } else { + efx_for_each_channel_tx_queue(tx_queue, channel) { + tx_queue->queue = next_queue++; + netif_dbg(efx, drv, efx->net_dev, + "Channel %u TXQ %u is HW %u\n", + channel->channel, tx_queue->label, + tx_queue->queue); + } + + /* If XDP is borrowing queues from net stack, it must + * use the queue with no csum offload, which is the + * first one of the channel + * (note: tx_queue_by_type is not initialized yet) + */ + if (efx->xdp_txq_queues_mode == + EFX_XDP_TX_QUEUES_BORROWED) { + tx_queue = &channel->tx_queue[0]; + rc = efx_set_xdp_tx_queue(efx, xdp_queue_number, + tx_queue); + if (rc == 0) + xdp_queue_number++; + } + } + } + WARN_ON(efx->xdp_txq_queues_mode == EFX_XDP_TX_QUEUES_DEDICATED && + xdp_queue_number != efx->xdp_tx_queue_count); + WARN_ON(efx->xdp_txq_queues_mode != EFX_XDP_TX_QUEUES_DEDICATED && + xdp_queue_number > efx->xdp_tx_queue_count); + + /* If we have more CPUs than assigned XDP TX queues, assign the already + * existing queues to the exceeding CPUs + */ + next_queue = 0; + while (xdp_queue_number < efx->xdp_tx_queue_count) { + tx_queue = efx->xdp_tx_queues[next_queue++]; + rc = efx_set_xdp_tx_queue(efx, xdp_queue_number, tx_queue); + if (rc == 0) + xdp_queue_number++; + } +} + int efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries) { struct efx_channel *other_channel[EFX_MAX_CHANNELS], *channel; @@ -857,6 +936,7 @@ int efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries) efx_init_napi_channel(efx->channel[i]); } + efx_set_xdp_channels(efx); out: /* Destroy unused channel structures */ for (i = 0; i < efx->n_channels; i++) { @@ -889,26 +969,9 @@ rollback: goto out; } -static inline int -efx_set_xdp_tx_queue(struct efx_nic *efx, int xdp_queue_number, - struct efx_tx_queue *tx_queue) -{ - if (xdp_queue_number >= efx->xdp_tx_queue_count) - return -EINVAL; - - netif_dbg(efx, drv, efx->net_dev, "Channel %u TXQ %u is XDP %u, HW %u\n", - tx_queue->channel->channel, tx_queue->label, - xdp_queue_number, tx_queue->queue); - efx->xdp_tx_queues[xdp_queue_number] = tx_queue; - return 0; -} - int efx_set_channels(struct efx_nic *efx) { - struct efx_tx_queue *tx_queue; struct efx_channel *channel; - unsigned int next_queue = 0; - int xdp_queue_number; int rc; efx->tx_channel_offset = @@ -926,61 +989,14 @@ int efx_set_channels(struct efx_nic *efx) return -ENOMEM; } - /* We need to mark which channels really have RX and TX - * queues, and adjust the TX queue numbers if we have separate - * RX-only and TX-only channels. - */ - xdp_queue_number = 0; efx_for_each_channel(channel, efx) { if (channel->channel < efx->n_rx_channels) channel->rx_queue.core_index = channel->channel; else channel->rx_queue.core_index = -1; - - if (channel->channel >= efx->tx_channel_offset) { - if (efx_channel_is_xdp_tx(channel)) { - efx_for_each_channel_tx_queue(tx_queue, channel) { - tx_queue->queue = next_queue++; - rc = efx_set_xdp_tx_queue(efx, xdp_queue_number, tx_queue); - if (rc == 0) - xdp_queue_number++; - } - } else { - efx_for_each_channel_tx_queue(tx_queue, channel) { - tx_queue->queue = next_queue++; - netif_dbg(efx, drv, efx->net_dev, "Channel %u TXQ %u is HW %u\n", - channel->channel, tx_queue->label, - tx_queue->queue); - } - - /* If XDP is borrowing queues from net stack, it must use the queue - * with no csum offload, which is the first one of the channel - * (note: channel->tx_queue_by_type is not initialized yet) - */ - if (efx->xdp_txq_queues_mode == EFX_XDP_TX_QUEUES_BORROWED) { - tx_queue = &channel->tx_queue[0]; - rc = efx_set_xdp_tx_queue(efx, xdp_queue_number, tx_queue); - if (rc == 0) - xdp_queue_number++; - } - } - } } - WARN_ON(efx->xdp_txq_queues_mode == EFX_XDP_TX_QUEUES_DEDICATED && - xdp_queue_number != efx->xdp_tx_queue_count); - WARN_ON(efx->xdp_txq_queues_mode != EFX_XDP_TX_QUEUES_DEDICATED && - xdp_queue_number > efx->xdp_tx_queue_count); - /* If we have more CPUs than assigned XDP TX queues, assign the already - * existing queues to the exceeding CPUs - */ - next_queue = 0; - while (xdp_queue_number < efx->xdp_tx_queue_count) { - tx_queue = efx->xdp_tx_queues[next_queue++]; - rc = efx_set_xdp_tx_queue(efx, xdp_queue_number, tx_queue); - if (rc == 0) - xdp_queue_number++; - } + efx_set_xdp_channels(efx); rc = netif_set_real_num_tx_queues(efx->net_dev, efx->n_tx_channels); if (rc) -- cgit v1.2.3 From 9381fe8c849cfbe50245ac01fc077554f6eaa0e2 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Thu, 31 Mar 2022 15:04:28 +0800 Subject: net/tls: fix slab-out-of-bounds bug in decrypt_internal The memory size of tls_ctx->rx.iv for AES128-CCM is 12 setting in tls_set_sw_offload(). The return value of crypto_aead_ivsize() for "ccm(aes)" is 16. So memcpy() require 16 bytes from 12 bytes memory space will trigger slab-out-of-bounds bug as following: ================================================================== BUG: KASAN: slab-out-of-bounds in decrypt_internal+0x385/0xc40 [tls] Read of size 16 at addr ffff888114e84e60 by task tls/10911 Call Trace: dump_stack_lvl+0x34/0x44 print_report.cold+0x5e/0x5db ? decrypt_internal+0x385/0xc40 [tls] kasan_report+0xab/0x120 ? decrypt_internal+0x385/0xc40 [tls] kasan_check_range+0xf9/0x1e0 memcpy+0x20/0x60 decrypt_internal+0x385/0xc40 [tls] ? tls_get_rec+0x2e0/0x2e0 [tls] ? process_rx_list+0x1a5/0x420 [tls] ? tls_setup_from_iter.constprop.0+0x2e0/0x2e0 [tls] decrypt_skb_update+0x9d/0x400 [tls] tls_sw_recvmsg+0x3c8/0xb50 [tls] Allocated by task 10911: kasan_save_stack+0x1e/0x40 __kasan_kmalloc+0x81/0xa0 tls_set_sw_offload+0x2eb/0xa20 [tls] tls_setsockopt+0x68c/0x700 [tls] __sys_setsockopt+0xfe/0x1b0 Replace the crypto_aead_ivsize() with prot->iv_size + prot->salt_size when memcpy() iv value in TLS_1_3_VERSION scenario. Fixes: f295b3ae9f59 ("net/tls: Add support of AES128-CCM based ciphers") Signed-off-by: Ziyang Xuan Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 0024a692f0f8..a8976ef95528 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1496,7 +1496,7 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, if (prot->version == TLS_1_3_VERSION || prot->cipher_type == TLS_CIPHER_CHACHA20_POLY1305) memcpy(iv + iv_offset, tls_ctx->rx.iv, - crypto_aead_ivsize(ctx->aead_recv)); + prot->iv_size + prot->salt_size); else memcpy(iv + iv_offset, tls_ctx->rx.iv, prot->salt_size); -- cgit v1.2.3 From 012d69fbfcc739f846766c1da56ef8b493b803b5 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Thu, 31 Mar 2022 10:26:43 +0300 Subject: vrf: fix packet sniffing for traffic originating from ip tunnels in commit 048939088220 ("vrf: add mac header for tunneled packets when sniffer is attached") an Ethernet header was cooked for traffic originating from tunnel devices. However, the header is added based on whether the mac_header is unset and ignores cases where the device doesn't expose a mac header to upper layers, such as in ip tunnels like ipip and gre. Traffic originating from such devices still appears garbled when capturing on the vrf device. Fix by observing whether the original device exposes a header to upper layers, similar to the logic done in af_packet. In addition, skb->mac_len needs to be adjusted after adding the Ethernet header for the skb_push/pull() surrounding dev_queue_xmit_nit() to work on these packets. Fixes: 048939088220 ("vrf: add mac header for tunneled packets when sniffer is attached") Signed-off-by: Eyal Birger Reviewed-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 85e362461d71..cfc30ce4c6e1 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1265,6 +1265,7 @@ static int vrf_prepare_mac_header(struct sk_buff *skb, eth = (struct ethhdr *)skb->data; skb_reset_mac_header(skb); + skb_reset_mac_len(skb); /* we set the ethernet destination and the source addresses to the * address of the VRF device. @@ -1294,9 +1295,9 @@ static int vrf_prepare_mac_header(struct sk_buff *skb, */ static int vrf_add_mac_header_if_unset(struct sk_buff *skb, struct net_device *vrf_dev, - u16 proto) + u16 proto, struct net_device *orig_dev) { - if (skb_mac_header_was_set(skb)) + if (skb_mac_header_was_set(skb) && dev_has_header(orig_dev)) return 0; return vrf_prepare_mac_header(skb, vrf_dev, proto); @@ -1402,6 +1403,8 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, /* if packet is NDISC then keep the ingress interface */ if (!is_ndisc) { + struct net_device *orig_dev = skb->dev; + vrf_rx_stats(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; @@ -1410,7 +1413,8 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, int err; err = vrf_add_mac_header_if_unset(skb, vrf_dev, - ETH_P_IPV6); + ETH_P_IPV6, + orig_dev); if (likely(!err)) { skb_push(skb, skb->mac_len); dev_queue_xmit_nit(skb, vrf_dev); @@ -1440,6 +1444,8 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { + struct net_device *orig_dev = skb->dev; + skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IPCB(skb)->flags |= IPSKB_L3SLAVE; @@ -1460,7 +1466,8 @@ static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, if (!list_empty(&vrf_dev->ptype_all)) { int err; - err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IP); + err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IP, + orig_dev); if (likely(!err)) { skb_push(skb, skb->mac_len); dev_queue_xmit_nit(skb, vrf_dev); -- cgit v1.2.3 From 1effe8ca4e34c34cdd9318436a4232dcb582ebf4 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 31 Mar 2022 11:24:41 +0100 Subject: skbuff: fix coalescing for page_pool fragment recycling Fix a use-after-free when using page_pool with page fragments. We encountered this problem during normal RX in the hns3 driver: (1) Initially we have three descriptors in the RX queue. The first one allocates PAGE1 through page_pool, and the other two allocate one half of PAGE2 each. Page references look like this: RX_BD1 _______ PAGE1 RX_BD2 _______ PAGE2 RX_BD3 _________/ (2) Handle RX on the first descriptor. Allocate SKB1, eventually added to the receive queue by tcp_queue_rcv(). (3) Handle RX on the second descriptor. Allocate SKB2 and pass it to netif_receive_skb(): netif_receive_skb(SKB2) ip_rcv(SKB2) SKB3 = skb_clone(SKB2) SKB2 and SKB3 share a reference to PAGE2 through skb_shinfo()->dataref. The other ref to PAGE2 is still held by RX_BD3: SKB2 ---+- PAGE2 SKB3 __/ / RX_BD3 _________/ (3b) Now while handling TCP, coalesce SKB3 with SKB1: tcp_v4_rcv(SKB3) tcp_try_coalesce(to=SKB1, from=SKB3) // succeeds kfree_skb_partial(SKB3) skb_release_data(SKB3) // drops one dataref SKB1 _____ PAGE1 \____ SKB2 _____ PAGE2 / RX_BD3 _________/ In skb_try_coalesce(), __skb_frag_ref() takes a page reference to PAGE2, where it should instead have increased the page_pool frag reference, pp_frag_count. Without coalescing, when releasing both SKB2 and SKB3, a single reference to PAGE2 would be dropped. Now when releasing SKB1 and SKB2, two references to PAGE2 will be dropped, resulting in underflow. (3c) Drop SKB2: af_packet_rcv(SKB2) consume_skb(SKB2) skb_release_data(SKB2) // drops second dataref page_pool_return_skb_page(PAGE2) // drops one pp_frag_count SKB1 _____ PAGE1 \____ PAGE2 / RX_BD3 _________/ (4) Userspace calls recvmsg() Copies SKB1 and releases it. Since SKB3 was coalesced with SKB1, we release the SKB3 page as well: tcp_eat_recv_skb(SKB1) skb_release_data(SKB1) page_pool_return_skb_page(PAGE1) page_pool_return_skb_page(PAGE2) // drops second pp_frag_count (5) PAGE2 is freed, but the third RX descriptor was still using it! In our case this causes IOMMU faults, but it would silently corrupt memory if the IOMMU was disabled. Change the logic that checks whether pp_recycle SKBs can be coalesced. We still reject differing pp_recycle between 'from' and 'to' SKBs, but in order to avoid the situation described above, we also reject coalescing when both 'from' and 'to' are pp_recycled and 'from' is cloned. The new logic allows coalescing a cloned pp_recycle SKB into a page refcounted one, because in this case the release (4) will drop the right reference, the one taken by skb_try_coalesce(). Fixes: 53e0961da1c7 ("page_pool: add frag page recycling support in page pool") Suggested-by: Alexander Duyck Signed-off-by: Jean-Philippe Brucker Reviewed-by: Yunsheng Lin Reviewed-by: Alexander Duyck Acked-by: Ilias Apalodimas Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/skbuff.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 10bde7c6db44..30b523fa4ad2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5276,11 +5276,18 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_cloned(to)) return false; - /* The page pool signature of struct page will eventually figure out - * which pages can be recycled or not but for now let's prohibit slab - * allocated and page_pool allocated SKBs from being coalesced. + /* In general, avoid mixing slab allocated and page_pool allocated + * pages within the same SKB. However when @to is not pp_recycle and + * @from is cloned, we can transition frag pages from page_pool to + * reference counted. + * + * On the other hand, don't allow coalescing two pp_recycle SKBs if + * @from is cloned, in case the SKB is using page_pool fragment + * references (PP_FLAG_PAGE_FRAG). Since we only take full page + * references for cloned SKBs at the moment that would result in + * inconsistent reference counts. */ - if (to->pp_recycle != from->pp_recycle) + if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from))) return false; if (len <= skb_tailroom(to)) { -- cgit v1.2.3 From 066dfc4290406b1b0b014ae3267d4266a344efd1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 31 Mar 2022 16:28:54 +0300 Subject: Revert "net: dsa: stop updating master MTU from master.c" This reverts commit a1ff94c2973c43bc1e2677ac63ebb15b1d1ff846. Switch drivers that don't implement ->port_change_mtu() will cause the DSA master to remain with an MTU of 1500, since we've deleted the other code path. In turn, this causes a regression for those systems, where MTU-sized traffic can no longer be terminated. Revert the change taking into account the fact that rtnl_lock() is now taken top-level from the callers of dsa_master_setup() and dsa_master_teardown(). Also add a comment in order for it to be absolutely clear why it is still needed. Fixes: a1ff94c2973c ("net: dsa: stop updating master MTU from master.c") Reported-by: Luiz Angelo Daros de Luca Signed-off-by: Vladimir Oltean Tested-by: Luiz Angelo Daros de Luca Signed-off-by: David S. Miller --- net/dsa/master.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/net/dsa/master.c b/net/dsa/master.c index 991c2930d631..2851e44c4cf0 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -335,11 +335,24 @@ static const struct attribute_group dsa_group = { .attrs = dsa_slave_attrs, }; +static void dsa_master_reset_mtu(struct net_device *dev) +{ + int err; + + err = dev_set_mtu(dev, ETH_DATA_LEN); + if (err) + netdev_dbg(dev, + "Unable to reset MTU to exclude DSA overheads\n"); +} + int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { + const struct dsa_device_ops *tag_ops = cpu_dp->tag_ops; struct dsa_switch *ds = cpu_dp->ds; struct device_link *consumer_link; - int ret; + int mtu, ret; + + mtu = ETH_DATA_LEN + dsa_tag_protocol_overhead(tag_ops); /* The DSA master must use SET_NETDEV_DEV for this to work. */ consumer_link = device_link_add(ds->dev, dev->dev.parent, @@ -349,6 +362,15 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) "Failed to create a device link to DSA switch %s\n", dev_name(ds->dev)); + /* The switch driver may not implement ->port_change_mtu(), case in + * which dsa_slave_change_mtu() will not update the master MTU either, + * so we need to do that here. + */ + ret = dev_set_mtu(dev, mtu); + if (ret) + netdev_warn(dev, "error %d setting MTU to %d to include DSA overhead\n", + ret, mtu); + /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get * sent to the tag format's receive function. @@ -384,6 +406,7 @@ void dsa_master_teardown(struct net_device *dev) sysfs_remove_group(&dev->dev.kobj, &dsa_group); dsa_netdev_ops_set(dev, NULL); dsa_master_ethtool_teardown(dev); + dsa_master_reset_mtu(dev); dsa_master_set_promiscuity(dev, -1); dev->dsa_ptr = NULL; -- cgit v1.2.3 From bd8c624c0cd59de0032752ba3001c107bba97f7b Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 31 Mar 2022 09:20:06 -0700 Subject: ice: Clear default forwarding VSI during VSI release VSI is set as default forwarding one when promisc mode is set for PF interface, when PF is switched to switchdev mode or when VF driver asks to enable allmulticast or promisc mode for the VF interface (when vf-true-promisc-support priv flag is off). The third case is buggy because in that case VSI associated with VF remains as default one after VF removal. Reproducer: 1. Create VF echo 1 > sys/class/net/ens7f0/device/sriov_numvfs 2. Enable allmulticast or promisc mode on VF ip link set ens7f0v0 allmulticast on ip link set ens7f0v0 promisc on 3. Delete VF echo 0 > sys/class/net/ens7f0/device/sriov_numvfs 4. Try to enable promisc mode on PF ip link set ens7f0 promisc on Although it looks that promisc mode on PF is enabled the opposite is true because ice_vsi_sync_fltr() responsible for IFF_PROMISC handling first checks if any other VSI is set as default forwarding one and if so the function does not do anything. At this point it is not possible to enable promisc mode on PF without re-probe device. To resolve the issue this patch clear default forwarding VSI during ice_vsi_release() when the VSI to be released is the default one. Fixes: 01b5e89aab49 ("ice: Add VF promiscuous support") Signed-off-by: Ivan Vecera Reviewed-by: Michal Swiatkowski Reviewed-by: Maciej Fijalkowski Signed-off-by: Alice Michael Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ice/ice_lib.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index b897926f817d..6d6233204388 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2983,6 +2983,8 @@ int ice_vsi_release(struct ice_vsi *vsi) } } + if (ice_is_vsi_dflt_vsi(pf->first_sw, vsi)) + ice_clear_dflt_vsi(pf->first_sw); ice_fltr_remove_all(vsi); ice_rm_vsi_lan_cfg(vsi->port_info, vsi->idx); err = ice_rm_vsi_rdma_cfg(vsi->port_info, vsi->idx); -- cgit v1.2.3 From 2c0069f3f91f125b1b2ce66cc6bea8eb134723c3 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 31 Mar 2022 09:20:07 -0700 Subject: ice: Fix MAC address setting Commit 2ccc1c1ccc671b ("ice: Remove excess error variables") merged the usage of 'status' and 'err' variables into single one in function ice_set_mac_address(). Unfortunately this causes a regression when call of ice_fltr_add_mac() returns -EEXIST because this return value does not indicate an error in this case but value of 'err' remains to be -EEXIST till the end of the function and is returned to caller. Prior mentioned commit this does not happen because return value of ice_fltr_add_mac() was stored to 'status' variable first and if it was -EEXIST then 'err' remains to be zero. Fix the problem by reset 'err' to zero when ice_fltr_add_mac() returns -EEXIST. Fixes: 2ccc1c1ccc671b ("ice: Remove excess error variables") Signed-off-by: Ivan Vecera Reviewed-by: Jacob Keller Acked-by: Alexander Lobakin Signed-off-by: Alice Michael Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ice/ice_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index b588d7995631..d755ce07869f 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -5475,16 +5475,19 @@ static int ice_set_mac_address(struct net_device *netdev, void *pi) /* Add filter for new MAC. If filter exists, return success */ err = ice_fltr_add_mac(vsi, mac, ICE_FWD_TO_VSI); - if (err == -EEXIST) + if (err == -EEXIST) { /* Although this MAC filter is already present in hardware it's * possible in some cases (e.g. bonding) that dev_addr was * modified outside of the driver and needs to be restored back * to this value. */ netdev_dbg(netdev, "filter for MAC %pM already exists\n", mac); - else if (err) + + return 0; + } else if (err) { /* error if the new filter addition failed */ err = -EADDRNOTAVAIL; + } err_update_filters: if (err) { -- cgit v1.2.3 From 1273f89578f268ea705ddbad60c4bd2dcff80611 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 31 Mar 2022 09:20:08 -0700 Subject: ice: Fix broken IFF_ALLMULTI handling Handling of all-multicast flag and associated multicast promiscuous mode is broken in ice driver. When an user switches allmulticast flag on or off the driver checks whether any VLANs are configured over the interface (except default VLAN 0). If any extra VLANs are registered it enables multicast promiscuous mode for all these VLANs (including default VLAN 0) using ICE_SW_LKUP_PROMISC_VLAN look-up type. In this situation all multicast packets tagged with known VLAN ID or untagged are received and multicast packets tagged with unknown VLAN ID ignored. If no extra VLANs are registered (so only VLAN 0 exists) it enables multicast promiscuous mode for VLAN 0 and uses ICE_SW_LKUP_PROMISC look-up type. In this situation any multicast packets including tagged ones are received. The driver handles IFF_ALLMULTI in ice_vsi_sync_fltr() this way: ice_vsi_sync_fltr() { ... if (changed_flags & IFF_ALLMULTI) { if (netdev->flags & IFF_ALLMULTI) { if (vsi->num_vlans > 1) ice_set_promisc(..., ICE_MCAST_VLAN_PROMISC_BITS); else ice_set_promisc(..., ICE_MCAST_PROMISC_BITS); } else { if (vsi->num_vlans > 1) ice_clear_promisc(..., ICE_MCAST_VLAN_PROMISC_BITS); else ice_clear_promisc(..., ICE_MCAST_PROMISC_BITS); } } ... } The code above depends on value vsi->num_vlan that specifies number of VLANs configured over the interface (including VLAN 0) and this is problem because that value is modified in NDO callbacks ice_vlan_rx_add_vid() and ice_vlan_rx_kill_vid(). Scenario 1: 1. ip link set ens7f0 allmulticast on 2. ip link add vlan10 link ens7f0 type vlan id 10 3. ip link set ens7f0 allmulticast off 4. ip link set ens7f0 allmulticast on [1] In this scenario IFF_ALLMULTI is enabled and the driver calls ice_set_promisc(..., ICE_MCAST_PROMISC_BITS) that installs multicast promisc rule with non-VLAN look-up type. [2] Then VLAN with ID 10 is added and vsi->num_vlan incremented to 2 [3] Command switches IFF_ALLMULTI off and the driver calls ice_clear_promisc(..., ICE_MCAST_VLAN_PROMISC_BITS) but this call is effectively NOP because it looks for multicast promisc rules for VLAN 0 and VLAN 10 with VLAN look-up type but no such rules exist. So the all-multicast remains enabled silently in hardware. [4] Command tries to switch IFF_ALLMULTI on and the driver calls ice_clear_promisc(..., ICE_MCAST_PROMISC_BITS) but this call fails (-EEXIST) because non-VLAN multicast promisc rule already exists. Scenario 2: 1. ip link add vlan10 link ens7f0 type vlan id 10 2. ip link set ens7f0 allmulticast on 3. ip link add vlan20 link ens7f0 type vlan id 20 4. ip link del vlan10 ; ip link del vlan20 5. ip link set ens7f0 allmulticast off [1] VLAN with ID 10 is added and vsi->num_vlan==2 [2] Command switches IFF_ALLMULTI on and driver installs multicast promisc rules with VLAN look-up type for VLAN 0 and 10 [3] VLAN with ID 20 is added and vsi->num_vlan==3 but no multicast promisc rules is added for this new VLAN so the interface does not receive MC packets from VLAN 20 [4] Both VLANs are removed but multicast rule for VLAN 10 remains installed so interface receives multicast packets from VLAN 10 [5] Command switches IFF_ALLMULTI off and because vsi->num_vlan is 1 the driver tries to remove multicast promisc rule for VLAN 0 with non-VLAN look-up that does not exist. All-multicast looks disabled from user point of view but it is partially enabled in HW (interface receives all multicast packets either untagged or tagged with VLAN ID 10) To resolve these issues the patch introduces these changes: 1. Adds handling for IFF_ALLMULTI to ice_vlan_rx_add_vid() and ice_vlan_rx_kill_vid() callbacks. So when VLAN is added/removed and IFF_ALLMULTI is enabled an appropriate multicast promisc rule for that VLAN ID is added/removed. 2. In ice_vlan_rx_add_vid() when first VLAN besides VLAN 0 is added so (vsi->num_vlan == 2) and IFF_ALLMULTI is enabled then look-up type for existing multicast promisc rule for VLAN 0 is updated to ICE_MCAST_VLAN_PROMISC_BITS. 3. In ice_vlan_rx_kill_vid() when last VLAN besides VLAN 0 is removed so (vsi->num_vlan == 1) and IFF_ALLMULTI is enabled then look-up type for existing multicast promisc rule for VLAN 0 is updated to ICE_MCAST_PROMISC_BITS. 4. Both ice_vlan_rx_{add,kill}_vid() have to run under ICE_CFG_BUSY bit protection to avoid races with ice_vsi_sync_fltr() that runs in ice_service_task() context. 5. Bit ICE_VSI_VLAN_FLTR_CHANGED is use-less and can be removed. 6. Error messages added to ice_fltr_*_vsi_promisc() helper functions to avoid them in their callers 7. Small improvements to increase readability Fixes: 5eda8afd6bcc ("ice: Add support for PF/VF promiscuous mode") Signed-off-by: Ivan Vecera Reviewed-by: Jacob Keller Signed-off-by: Alice Michael Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ice/ice.h | 1 - drivers/net/ethernet/intel/ice/ice_fltr.c | 44 ++++++++++-- drivers/net/ethernet/intel/ice/ice_main.c | 114 +++++++++++++++++++++--------- 3 files changed, 121 insertions(+), 38 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index d4f1874df7d0..26eaee0b6503 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -301,7 +301,6 @@ enum ice_vsi_state { ICE_VSI_NETDEV_REGISTERED, ICE_VSI_UMAC_FLTR_CHANGED, ICE_VSI_MMAC_FLTR_CHANGED, - ICE_VSI_VLAN_FLTR_CHANGED, ICE_VSI_PROMISC_CHANGED, ICE_VSI_STATE_NBITS /* must be last */ }; diff --git a/drivers/net/ethernet/intel/ice/ice_fltr.c b/drivers/net/ethernet/intel/ice/ice_fltr.c index af57eb114966..85a94483c2ed 100644 --- a/drivers/net/ethernet/intel/ice/ice_fltr.c +++ b/drivers/net/ethernet/intel/ice/ice_fltr.c @@ -58,7 +58,16 @@ int ice_fltr_set_vlan_vsi_promisc(struct ice_hw *hw, struct ice_vsi *vsi, u8 promisc_mask) { - return ice_set_vlan_vsi_promisc(hw, vsi->idx, promisc_mask, false); + struct ice_pf *pf = hw->back; + int result; + + result = ice_set_vlan_vsi_promisc(hw, vsi->idx, promisc_mask, false); + if (result) + dev_err(ice_pf_to_dev(pf), + "Error setting promisc mode on VSI %i (rc=%d)\n", + vsi->vsi_num, result); + + return result; } /** @@ -73,7 +82,16 @@ int ice_fltr_clear_vlan_vsi_promisc(struct ice_hw *hw, struct ice_vsi *vsi, u8 promisc_mask) { - return ice_set_vlan_vsi_promisc(hw, vsi->idx, promisc_mask, true); + struct ice_pf *pf = hw->back; + int result; + + result = ice_set_vlan_vsi_promisc(hw, vsi->idx, promisc_mask, true); + if (result) + dev_err(ice_pf_to_dev(pf), + "Error clearing promisc mode on VSI %i (rc=%d)\n", + vsi->vsi_num, result); + + return result; } /** @@ -87,7 +105,16 @@ int ice_fltr_clear_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, u16 vid) { - return ice_clear_vsi_promisc(hw, vsi_handle, promisc_mask, vid); + struct ice_pf *pf = hw->back; + int result; + + result = ice_clear_vsi_promisc(hw, vsi_handle, promisc_mask, vid); + if (result) + dev_err(ice_pf_to_dev(pf), + "Error clearing promisc mode on VSI %i for VID %u (rc=%d)\n", + ice_get_hw_vsi_num(hw, vsi_handle), vid, result); + + return result; } /** @@ -101,7 +128,16 @@ int ice_fltr_set_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, u16 vid) { - return ice_set_vsi_promisc(hw, vsi_handle, promisc_mask, vid); + struct ice_pf *pf = hw->back; + int result; + + result = ice_set_vsi_promisc(hw, vsi_handle, promisc_mask, vid); + if (result) + dev_err(ice_pf_to_dev(pf), + "Error setting promisc mode on VSI %i for VID %u (rc=%d)\n", + ice_get_hw_vsi_num(hw, vsi_handle), vid, result); + + return result; } /** diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index d755ce07869f..1d2ca39add95 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -243,8 +243,7 @@ static int ice_add_mac_to_unsync_list(struct net_device *netdev, const u8 *addr) static bool ice_vsi_fltr_changed(struct ice_vsi *vsi) { return test_bit(ICE_VSI_UMAC_FLTR_CHANGED, vsi->state) || - test_bit(ICE_VSI_MMAC_FLTR_CHANGED, vsi->state) || - test_bit(ICE_VSI_VLAN_FLTR_CHANGED, vsi->state); + test_bit(ICE_VSI_MMAC_FLTR_CHANGED, vsi->state); } /** @@ -260,10 +259,15 @@ static int ice_set_promisc(struct ice_vsi *vsi, u8 promisc_m) if (vsi->type != ICE_VSI_PF) return 0; - if (ice_vsi_has_non_zero_vlans(vsi)) - status = ice_fltr_set_vlan_vsi_promisc(&vsi->back->hw, vsi, promisc_m); - else - status = ice_fltr_set_vsi_promisc(&vsi->back->hw, vsi->idx, promisc_m, 0); + if (ice_vsi_has_non_zero_vlans(vsi)) { + promisc_m |= (ICE_PROMISC_VLAN_RX | ICE_PROMISC_VLAN_TX); + status = ice_fltr_set_vlan_vsi_promisc(&vsi->back->hw, vsi, + promisc_m); + } else { + status = ice_fltr_set_vsi_promisc(&vsi->back->hw, vsi->idx, + promisc_m, 0); + } + return status; } @@ -280,10 +284,15 @@ static int ice_clear_promisc(struct ice_vsi *vsi, u8 promisc_m) if (vsi->type != ICE_VSI_PF) return 0; - if (ice_vsi_has_non_zero_vlans(vsi)) - status = ice_fltr_clear_vlan_vsi_promisc(&vsi->back->hw, vsi, promisc_m); - else - status = ice_fltr_clear_vsi_promisc(&vsi->back->hw, vsi->idx, promisc_m, 0); + if (ice_vsi_has_non_zero_vlans(vsi)) { + promisc_m |= (ICE_PROMISC_VLAN_RX | ICE_PROMISC_VLAN_TX); + status = ice_fltr_clear_vlan_vsi_promisc(&vsi->back->hw, vsi, + promisc_m); + } else { + status = ice_fltr_clear_vsi_promisc(&vsi->back->hw, vsi->idx, + promisc_m, 0); + } + return status; } @@ -302,7 +311,6 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) struct ice_pf *pf = vsi->back; struct ice_hw *hw = &pf->hw; u32 changed_flags = 0; - u8 promisc_m; int err; if (!vsi->netdev) @@ -320,7 +328,6 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) if (ice_vsi_fltr_changed(vsi)) { clear_bit(ICE_VSI_UMAC_FLTR_CHANGED, vsi->state); clear_bit(ICE_VSI_MMAC_FLTR_CHANGED, vsi->state); - clear_bit(ICE_VSI_VLAN_FLTR_CHANGED, vsi->state); /* grab the netdev's addr_list_lock */ netif_addr_lock_bh(netdev); @@ -369,29 +376,15 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) /* check for changes in promiscuous modes */ if (changed_flags & IFF_ALLMULTI) { if (vsi->current_netdev_flags & IFF_ALLMULTI) { - if (ice_vsi_has_non_zero_vlans(vsi)) - promisc_m = ICE_MCAST_VLAN_PROMISC_BITS; - else - promisc_m = ICE_MCAST_PROMISC_BITS; - - err = ice_set_promisc(vsi, promisc_m); + err = ice_set_promisc(vsi, ICE_MCAST_PROMISC_BITS); if (err) { - netdev_err(netdev, "Error setting Multicast promiscuous mode on VSI %i\n", - vsi->vsi_num); vsi->current_netdev_flags &= ~IFF_ALLMULTI; goto out_promisc; } } else { /* !(vsi->current_netdev_flags & IFF_ALLMULTI) */ - if (ice_vsi_has_non_zero_vlans(vsi)) - promisc_m = ICE_MCAST_VLAN_PROMISC_BITS; - else - promisc_m = ICE_MCAST_PROMISC_BITS; - - err = ice_clear_promisc(vsi, promisc_m); + err = ice_clear_promisc(vsi, ICE_MCAST_PROMISC_BITS); if (err) { - netdev_err(netdev, "Error clearing Multicast promiscuous mode on VSI %i\n", - vsi->vsi_num); vsi->current_netdev_flags |= IFF_ALLMULTI; goto out_promisc; } @@ -3488,6 +3481,20 @@ ice_vlan_rx_add_vid(struct net_device *netdev, __be16 proto, u16 vid) if (!vid) return 0; + while (test_and_set_bit(ICE_CFG_BUSY, vsi->state)) + usleep_range(1000, 2000); + + /* Add multicast promisc rule for the VLAN ID to be added if + * all-multicast is currently enabled. + */ + if (vsi->current_netdev_flags & IFF_ALLMULTI) { + ret = ice_fltr_set_vsi_promisc(&vsi->back->hw, vsi->idx, + ICE_MCAST_VLAN_PROMISC_BITS, + vid); + if (ret) + goto finish; + } + vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); /* Add a switch rule for this VLAN ID so its corresponding VLAN tagged @@ -3495,8 +3502,23 @@ ice_vlan_rx_add_vid(struct net_device *netdev, __be16 proto, u16 vid) */ vlan = ICE_VLAN(be16_to_cpu(proto), vid, 0); ret = vlan_ops->add_vlan(vsi, &vlan); - if (!ret) - set_bit(ICE_VSI_VLAN_FLTR_CHANGED, vsi->state); + if (ret) + goto finish; + + /* If all-multicast is currently enabled and this VLAN ID is only one + * besides VLAN-0 we have to update look-up type of multicast promisc + * rule for VLAN-0 from ICE_SW_LKUP_PROMISC to ICE_SW_LKUP_PROMISC_VLAN. + */ + if ((vsi->current_netdev_flags & IFF_ALLMULTI) && + ice_vsi_num_non_zero_vlans(vsi) == 1) { + ice_fltr_clear_vsi_promisc(&vsi->back->hw, vsi->idx, + ICE_MCAST_PROMISC_BITS, 0); + ice_fltr_set_vsi_promisc(&vsi->back->hw, vsi->idx, + ICE_MCAST_VLAN_PROMISC_BITS, 0); + } + +finish: + clear_bit(ICE_CFG_BUSY, vsi->state); return ret; } @@ -3522,6 +3544,9 @@ ice_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid) if (!vid) return 0; + while (test_and_set_bit(ICE_CFG_BUSY, vsi->state)) + usleep_range(1000, 2000); + vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); /* Make sure VLAN delete is successful before updating VLAN @@ -3530,10 +3555,33 @@ ice_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid) vlan = ICE_VLAN(be16_to_cpu(proto), vid, 0); ret = vlan_ops->del_vlan(vsi, &vlan); if (ret) - return ret; + goto finish; - set_bit(ICE_VSI_VLAN_FLTR_CHANGED, vsi->state); - return 0; + /* Remove multicast promisc rule for the removed VLAN ID if + * all-multicast is enabled. + */ + if (vsi->current_netdev_flags & IFF_ALLMULTI) + ice_fltr_clear_vsi_promisc(&vsi->back->hw, vsi->idx, + ICE_MCAST_VLAN_PROMISC_BITS, vid); + + if (!ice_vsi_has_non_zero_vlans(vsi)) { + /* Update look-up type of multicast promisc rule for VLAN 0 + * from ICE_SW_LKUP_PROMISC_VLAN to ICE_SW_LKUP_PROMISC when + * all-multicast is enabled and VLAN 0 is the only VLAN rule. + */ + if (vsi->current_netdev_flags & IFF_ALLMULTI) { + ice_fltr_clear_vsi_promisc(&vsi->back->hw, vsi->idx, + ICE_MCAST_VLAN_PROMISC_BITS, + 0); + ice_fltr_set_vsi_promisc(&vsi->back->hw, vsi->idx, + ICE_MCAST_PROMISC_BITS, 0); + } + } + +finish: + clear_bit(ICE_CFG_BUSY, vsi->state); + + return ret; } /** -- cgit v1.2.3 From 60be976ac45137657b7b505d7e0d44d0e51accb7 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Fri, 1 Apr 2022 10:48:42 +0800 Subject: mctp: Fix check for dev_hard_header() result dev_hard_header() returns the length of the header, so we need to test for negative errors rather than non-zero. Fixes: 889b7da23abf ("mctp: Add initial routing framework") Signed-off-by: Matt Johnston Signed-off-by: David S. Miller --- net/mctp/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mctp/route.c b/net/mctp/route.c index d5e7db83fe9d..ee548c46c78f 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -512,7 +512,7 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) rc = dev_hard_header(skb, skb->dev, ntohs(skb->protocol), daddr, skb->dev->dev_addr, skb->len); - if (rc) { + if (rc < 0) { kfree_skb(skb); return -EHOSTUNREACH; } -- cgit v1.2.3 From 8ce40a2fd350769e94877b53d353a3b11d85f43b Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Fri, 1 Apr 2022 10:48:43 +0800 Subject: mctp i2c: correct mctp_i2c_header_create result header_ops.create should return the length of the header, instead mctp_i2c_head_create() returned 0. This didn't cause any problem because the MCTP stack accepted 0 as success. Fixes: f5b8abf9fc3d ("mctp i2c: MCTP I2C binding driver") Signed-off-by: Matt Johnston Signed-off-by: David S. Miller --- drivers/net/mctp/mctp-i2c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/mctp/mctp-i2c.c b/drivers/net/mctp/mctp-i2c.c index baf7afac7857..53846c6b56ca 100644 --- a/drivers/net/mctp/mctp-i2c.c +++ b/drivers/net/mctp/mctp-i2c.c @@ -553,7 +553,7 @@ static int mctp_i2c_header_create(struct sk_buff *skb, struct net_device *dev, hdr->source_slave = ((llsrc << 1) & 0xff) | 0x01; mhdr->ver = 0x01; - return 0; + return sizeof(struct mctp_i2c_hdr); } static int mctp_i2c_tx_thread(void *data) -- cgit v1.2.3 From 4a9dda1c1da65beee994f0977a56a9a21c5db2a7 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Fri, 1 Apr 2022 10:48:44 +0800 Subject: mctp: Use output netdev to allocate skb headroom Previously the skb was allocated with headroom MCTP_HEADER_MAXLEN, but that isn't sufficient if we are using devs that are not MCTP specific. This also adds a check that the smctp_halen provided to sendmsg for extended addressing is the correct size for the netdev. Fixes: 833ef3b91de6 ("mctp: Populate socket implementation") Reported-by: Matthew Rinaldi Signed-off-by: Matt Johnston Signed-off-by: David S. Miller --- include/net/mctp.h | 2 -- net/mctp/af_mctp.c | 46 +++++++++++++++++++++++++++++++++------------- net/mctp/route.c | 14 +++++++++++--- 3 files changed, 44 insertions(+), 18 deletions(-) diff --git a/include/net/mctp.h b/include/net/mctp.h index d37268fe6825..82800d521c3d 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -36,8 +36,6 @@ struct mctp_hdr { #define MCTP_HDR_TAG_SHIFT 0 #define MCTP_HDR_TAG_MASK GENMASK(2, 0) -#define MCTP_HEADER_MAXLEN 4 - #define MCTP_INITIAL_DEFAULT_NET 1 static inline bool mctp_address_unicast(mctp_eid_t eid) diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index f0702d920d8d..e22b0cbb2f35 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -93,13 +93,13 @@ out_release: static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_mctp *, addr, msg->msg_name); - const int hlen = MCTP_HEADER_MAXLEN + sizeof(struct mctp_hdr); int rc, addrlen = msg->msg_namelen; struct sock *sk = sock->sk; struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); struct mctp_skb_cb *cb; struct mctp_route *rt; - struct sk_buff *skb; + struct sk_buff *skb = NULL; + int hlen; if (addr) { const u8 tagbits = MCTP_TAG_MASK | MCTP_TAG_OWNER | @@ -129,6 +129,34 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (addr->smctp_network == MCTP_NET_ANY) addr->smctp_network = mctp_default_net(sock_net(sk)); + /* direct addressing */ + if (msk->addr_ext && addrlen >= sizeof(struct sockaddr_mctp_ext)) { + DECLARE_SOCKADDR(struct sockaddr_mctp_ext *, + extaddr, msg->msg_name); + struct net_device *dev; + + rc = -EINVAL; + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), extaddr->smctp_ifindex); + /* check for correct halen */ + if (dev && extaddr->smctp_halen == dev->addr_len) { + hlen = LL_RESERVED_SPACE(dev) + sizeof(struct mctp_hdr); + rc = 0; + } + rcu_read_unlock(); + if (rc) + goto err_free; + rt = NULL; + } else { + rt = mctp_route_lookup(sock_net(sk), addr->smctp_network, + addr->smctp_addr.s_addr); + if (!rt) { + rc = -EHOSTUNREACH; + goto err_free; + } + hlen = LL_RESERVED_SPACE(rt->dev->dev) + sizeof(struct mctp_hdr); + } + skb = sock_alloc_send_skb(sk, hlen + 1 + len, msg->msg_flags & MSG_DONTWAIT, &rc); if (!skb) @@ -147,8 +175,8 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) cb = __mctp_cb(skb); cb->net = addr->smctp_network; - /* direct addressing */ - if (msk->addr_ext && addrlen >= sizeof(struct sockaddr_mctp_ext)) { + if (!rt) { + /* fill extended address in cb */ DECLARE_SOCKADDR(struct sockaddr_mctp_ext *, extaddr, msg->msg_name); @@ -159,17 +187,9 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) } cb->ifindex = extaddr->smctp_ifindex; + /* smctp_halen is checked above */ cb->halen = extaddr->smctp_halen; memcpy(cb->haddr, extaddr->smctp_haddr, cb->halen); - - rt = NULL; - } else { - rt = mctp_route_lookup(sock_net(sk), addr->smctp_network, - addr->smctp_addr.s_addr); - if (!rt) { - rc = -EHOSTUNREACH; - goto err_free; - } } rc = mctp_local_output(sk, rt, skb, addr->smctp_addr.s_addr, diff --git a/net/mctp/route.c b/net/mctp/route.c index ee548c46c78f..3b24b8d18b5b 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -503,6 +503,11 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) if (cb->ifindex) { /* direct route; use the hwaddr we stashed in sendmsg */ + if (cb->halen != skb->dev->addr_len) { + /* sanity check, sendmsg should have already caught this */ + kfree_skb(skb); + return -EMSGSIZE; + } daddr = cb->haddr; } else { /* If lookup fails let the device handle daddr==NULL */ @@ -756,7 +761,7 @@ static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, { const unsigned int hlen = sizeof(struct mctp_hdr); struct mctp_hdr *hdr, *hdr2; - unsigned int pos, size; + unsigned int pos, size, headroom; struct sk_buff *skb2; int rc; u8 seq; @@ -770,6 +775,9 @@ static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, return -EMSGSIZE; } + /* keep same headroom as the original skb */ + headroom = skb_headroom(skb); + /* we've got the header */ skb_pull(skb, hlen); @@ -777,7 +785,7 @@ static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, /* size of message payload */ size = min(mtu - hlen, skb->len - pos); - skb2 = alloc_skb(MCTP_HEADER_MAXLEN + hlen + size, GFP_KERNEL); + skb2 = alloc_skb(headroom + hlen + size, GFP_KERNEL); if (!skb2) { rc = -ENOMEM; break; @@ -793,7 +801,7 @@ static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, skb_set_owner_w(skb2, skb->sk); /* establish packet */ - skb_reserve(skb2, MCTP_HEADER_MAXLEN); + skb_reserve(skb2, headroom); skb_reset_network_header(skb2); skb_put(skb2, hlen + size); skb2->transport_header = skb2->network_header + hlen; -- cgit v1.2.3 From c3efcedd272aa6dd5929e20cf902a52ddaa1197a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 31 Mar 2022 22:42:44 -0700 Subject: net: micrel: fix KS8851_MLL Kconfig KS8851_MLL selects MICREL_PHY, which depends on PTP_1588_CLOCK_OPTIONAL, so make KS8851_MLL also depend on PTP_1588_CLOCK_OPTIONAL since 'select' does not follow any dependency chains. Fixes kconfig warning and build errors: WARNING: unmet direct dependencies detected for MICREL_PHY Depends on [m]: NETDEVICES [=y] && PHYLIB [=y] && PTP_1588_CLOCK_OPTIONAL [=m] Selected by [y]: - KS8851_MLL [=y] && NETDEVICES [=y] && ETHERNET [=y] && NET_VENDOR_MICREL [=y] && HAS_IOMEM [=y] ld: drivers/net/phy/micrel.o: in function `lan8814_ts_info': micrel.c:(.text+0xb35): undefined reference to `ptp_clock_index' ld: drivers/net/phy/micrel.o: in function `lan8814_probe': micrel.c:(.text+0x2586): undefined reference to `ptp_clock_register' Signed-off-by: Randy Dunlap Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Paolo Abeni Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/micrel/Kconfig b/drivers/net/ethernet/micrel/Kconfig index 93df3049cdc0..1b632cdd7630 100644 --- a/drivers/net/ethernet/micrel/Kconfig +++ b/drivers/net/ethernet/micrel/Kconfig @@ -39,6 +39,7 @@ config KS8851 config KS8851_MLL tristate "Micrel KS8851 MLL" depends on HAS_IOMEM + depends on PTP_1588_CLOCK_OPTIONAL select MII select CRC32 select EEPROM_93CX6 -- cgit v1.2.3 From 6bf92d70e690b7ff12b24f4bfff5e5434d019b82 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 1 Apr 2022 10:33:42 +0300 Subject: net: ipv4: fix route with nexthop object delete warning FRR folks have hit a kernel warning[1] while deleting routes[2] which is caused by trying to delete a route pointing to a nexthop id without specifying nhid but matching on an interface. That is, a route is found but we hit a warning while matching it. The warning is from fib_info_nh() in include/net/nexthop.h because we run it on a fib_info with nexthop object. The call chain is: inet_rtm_delroute -> fib_table_delete -> fib_nh_match (called with a nexthop fib_info and also with fc_oif set thus calling fib_info_nh on the fib_info and triggering the warning). The fix is to not do any matching in that branch if the fi has a nexthop object because those are managed separately. I.e. we should match when deleting without nh spec and should fail when deleting a nexthop route with old-style nh spec because nexthop objects are managed separately, e.g.: $ ip r show 1.2.3.4/32 1.2.3.4 nhid 12 via 192.168.11.2 dev dummy0 $ ip r del 1.2.3.4/32 $ ip r del 1.2.3.4/32 nhid 12 $ ip r del 1.2.3.4/32 dev dummy0 [1] [ 523.462226] ------------[ cut here ]------------ [ 523.462230] WARNING: CPU: 14 PID: 22893 at include/net/nexthop.h:468 fib_nh_match+0x210/0x460 [ 523.462236] Modules linked in: dummy rpcsec_gss_krb5 xt_socket nf_socket_ipv4 nf_socket_ipv6 ip6table_raw iptable_raw bpf_preload xt_statistic ip_set ip_vs_sh ip_vs_wrr ip_vs_rr ip_vs xt_mark nf_tables xt_nat veth nf_conntrack_netlink nfnetlink xt_addrtype br_netfilter overlay dm_crypt nfsv3 nfs fscache netfs vhost_net vhost vhost_iotlb tap tun xt_CHECKSUM xt_MASQUERADE xt_conntrack 8021q garp mrp ipt_REJECT nf_reject_ipv4 ip6table_mangle ip6table_nat iptable_mangle iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_filter bridge stp llc rfcomm snd_seq_dummy snd_hrtimer rpcrdma rdma_cm iw_cm ib_cm ib_core ip6table_filter xt_comment ip6_tables vboxnetadp(OE) vboxnetflt(OE) vboxdrv(OE) qrtr bnep binfmt_misc xfs vfat fat squashfs loop nvidia_drm(POE) nvidia_modeset(POE) nvidia_uvm(POE) nvidia(POE) intel_rapl_msr intel_rapl_common snd_hda_codec_realtek snd_hda_codec_generic ledtrig_audio snd_hda_codec_hdmi btusb btrtl iwlmvm uvcvideo btbcm snd_hda_intel edac_mce_amd [ 523.462274] videobuf2_vmalloc videobuf2_memops btintel snd_intel_dspcfg videobuf2_v4l2 snd_intel_sdw_acpi bluetooth snd_usb_audio snd_hda_codec mac80211 snd_usbmidi_lib joydev snd_hda_core videobuf2_common kvm_amd snd_rawmidi snd_hwdep snd_seq videodev ccp snd_seq_device libarc4 ecdh_generic mc snd_pcm kvm iwlwifi snd_timer drm_kms_helper snd cfg80211 cec soundcore irqbypass rapl wmi_bmof i2c_piix4 rfkill k10temp pcspkr acpi_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc drm zram ip_tables crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel nvme sp5100_tco r8169 nvme_core wmi ipmi_devintf ipmi_msghandler fuse [ 523.462300] CPU: 14 PID: 22893 Comm: ip Tainted: P OE 5.16.18-200.fc35.x86_64 #1 [ 523.462302] Hardware name: Micro-Star International Co., Ltd. MS-7C37/MPG X570 GAMING EDGE WIFI (MS-7C37), BIOS 1.C0 10/29/2020 [ 523.462303] RIP: 0010:fib_nh_match+0x210/0x460 [ 523.462304] Code: 7c 24 20 48 8b b5 90 00 00 00 e8 bb ee f4 ff 48 8b 7c 24 20 41 89 c4 e8 ee eb f4 ff 45 85 e4 0f 85 2e fe ff ff e9 4c ff ff ff <0f> 0b e9 17 ff ff ff 3c 0a 0f 85 61 fe ff ff 48 8b b5 98 00 00 00 [ 523.462306] RSP: 0018:ffffaa53d4d87928 EFLAGS: 00010286 [ 523.462307] RAX: 0000000000000000 RBX: ffffaa53d4d87a90 RCX: ffffaa53d4d87bb0 [ 523.462308] RDX: ffff9e3d2ee6be80 RSI: ffffaa53d4d87a90 RDI: ffffffff920ed380 [ 523.462309] RBP: ffff9e3d2ee6be80 R08: 0000000000000064 R09: 0000000000000000 [ 523.462310] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000031 [ 523.462310] R13: 0000000000000020 R14: 0000000000000000 R15: ffff9e3d331054e0 [ 523.462311] FS: 00007f245517c1c0(0000) GS:ffff9e492ed80000(0000) knlGS:0000000000000000 [ 523.462313] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 523.462313] CR2: 000055e5dfdd8268 CR3: 00000003ef488000 CR4: 0000000000350ee0 [ 523.462315] Call Trace: [ 523.462316] [ 523.462320] fib_table_delete+0x1a9/0x310 [ 523.462323] inet_rtm_delroute+0x93/0x110 [ 523.462325] rtnetlink_rcv_msg+0x133/0x370 [ 523.462327] ? _copy_to_iter+0xb5/0x6f0 [ 523.462330] ? rtnl_calcit.isra.0+0x110/0x110 [ 523.462331] netlink_rcv_skb+0x50/0xf0 [ 523.462334] netlink_unicast+0x211/0x330 [ 523.462336] netlink_sendmsg+0x23f/0x480 [ 523.462338] sock_sendmsg+0x5e/0x60 [ 523.462340] ____sys_sendmsg+0x22c/0x270 [ 523.462341] ? import_iovec+0x17/0x20 [ 523.462343] ? sendmsg_copy_msghdr+0x59/0x90 [ 523.462344] ? __mod_lruvec_page_state+0x85/0x110 [ 523.462348] ___sys_sendmsg+0x81/0xc0 [ 523.462350] ? netlink_seq_start+0x70/0x70 [ 523.462352] ? __dentry_kill+0x13a/0x180 [ 523.462354] ? __fput+0xff/0x250 [ 523.462356] __sys_sendmsg+0x49/0x80 [ 523.462358] do_syscall_64+0x3b/0x90 [ 523.462361] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 523.462364] RIP: 0033:0x7f24552aa337 [ 523.462365] Code: 0e 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b9 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10 [ 523.462366] RSP: 002b:00007fff7f05a838 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 523.462368] RAX: ffffffffffffffda RBX: 000000006245bf91 RCX: 00007f24552aa337 [ 523.462368] RDX: 0000000000000000 RSI: 00007fff7f05a8a0 RDI: 0000000000000003 [ 523.462369] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000 [ 523.462370] R10: 0000000000000008 R11: 0000000000000246 R12: 0000000000000001 [ 523.462370] R13: 00007fff7f05ce08 R14: 0000000000000000 R15: 000055e5dfdd1040 [ 523.462373] [ 523.462374] ---[ end trace ba537bc16f6bf4ed ]--- [2] https://github.com/FRRouting/frr/issues/6412 Fixes: 4c7e8084fd46 ("ipv4: Plumb support for nexthop object in a fib_info") Signed-off-by: Nikolay Aleksandrov Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index cc8e84ef2ae4..ccb62038f6a4 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -889,8 +889,13 @@ int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, } if (cfg->fc_oif || cfg->fc_gw_family) { - struct fib_nh *nh = fib_info_nh(fi, 0); + struct fib_nh *nh; + + /* cannot match on nexthop object attributes */ + if (fi->nh) + return 1; + nh = fib_info_nh(fi, 0); if (cfg->fc_encap) { if (fib_encap_match(net, cfg->fc_encap_type, cfg->fc_encap, nh, cfg, extack)) -- cgit v1.2.3 From 392baa339c6a42a2cb088e5e5df2b59b8f89be24 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 1 Apr 2022 10:33:43 +0300 Subject: selftests: net: add delete nexthop route warning test Add a test which causes a WARNING on kernels which treat a nexthop route like a normal route when comparing for deletion and a device is specified. That is, a route is found but we hit a warning while matching it. The warning is from fib_info_nh() in include/net/nexthop.h because we run it on a fib_info with nexthop object. The call chain is: inet_rtm_delroute -> fib_table_delete -> fib_nh_match (called with a nexthop fib_info and also with fc_oif set thus calling fib_info_nh on the fib_info and triggering the warning). Repro steps: $ ip nexthop add id 12 via 172.16.1.3 dev veth1 $ ip route add 172.16.101.1/32 nhid 12 $ ip route delete 172.16.101.1/32 dev veth1 Signed-off-by: Nikolay Aleksandrov Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_nexthops.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index d444ee6aa3cb..d8ede0c81ac1 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -1208,6 +1208,20 @@ ipv4_fcnal() set +e check_nexthop "dev veth1" "" log_test $? 0 "Nexthops removed on admin down" + + # nexthop route delete warning: route add with nhid and delete + # using device + run_cmd "$IP li set dev veth1 up" + run_cmd "$IP nexthop add id 12 via 172.16.1.3 dev veth1" + out1=`dmesg | grep "WARNING:.*fib_nh_match.*" | wc -l` + run_cmd "$IP route add 172.16.101.1/32 nhid 12" + run_cmd "$IP route delete 172.16.101.1/32 dev veth1" + out2=`dmesg | grep "WARNING:.*fib_nh_match.*" | wc -l` + [ $out1 -eq $out2 ] + rc=$? + log_test $rc 0 "Delete nexthop route warning" + run_cmd "$IP ip route delete 172.16.101.1/32 nhid 12" + run_cmd "$IP ip nexthop del id 12" } ipv4_grp_fcnal() -- cgit v1.2.3 From 31ac3bcee47b8628d676bf4080c56b238d0222d1 Mon Sep 17 00:00:00 2001 From: Dimitris Michailidis Date: Fri, 1 Apr 2022 16:24:11 -0700 Subject: net/fungible: Fix reference to __udivdi3 on 32b builds 32b builds with CONFIG_PHYS_ADDR_T_64BIT=y, such as i386 PAE, raise a linker error due to a 64b division: ld: drivers/net/ethernet/fungible/funcore/fun_dev.o: in function `fun_dev_enable': (.text+0xe1a): undefined reference to `__udivdi3' The divisor in the offendinng expression is a power of 2. Change it to use an explicit right shift. Fixes: e1ffcc66818f ("net/fungible: Add service module for Fungible drivers") Reported-by: Randy Dunlap Signed-off-by: Dimitris Michailidis Link: https://lore.kernel.org/r/20220401232411.313881-1-dmichail@fungible.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/fungible/funcore/fun_dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/fungible/funcore/fun_dev.c b/drivers/net/ethernet/fungible/funcore/fun_dev.c index 5d7aef73df61..fb5120d90f26 100644 --- a/drivers/net/ethernet/fungible/funcore/fun_dev.c +++ b/drivers/net/ethernet/fungible/funcore/fun_dev.c @@ -586,8 +586,8 @@ static int fun_get_dev_limits(struct fun_dev *fdev) /* Calculate the max QID based on SQ/CQ/doorbell counts. * SQ/CQ doorbells alternate. */ - num_dbs = (pci_resource_len(pdev, 0) - NVME_REG_DBS) / - (fdev->db_stride * 4); + num_dbs = (pci_resource_len(pdev, 0) - NVME_REG_DBS) >> + (2 + NVME_CAP_STRIDE(fdev->cap_reg)); fdev->max_qid = min3(cq_count, sq_count, num_dbs / 2) - 1; fdev->kern_end_qid = fdev->max_qid + 1; return 0; -- cgit v1.2.3 From c21cabb0fd0b54b8b54235fc1ecfe1195a23bcb2 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Fri, 1 Apr 2022 02:48:32 +0800 Subject: net: stmmac: Fix unset max_speed difference between DT and non-DT platforms In commit 9cbadf094d9d ("net: stmmac: support max-speed device tree property"), when DT platforms don't set "max-speed", max_speed is set to -1; for non-DT platforms, it stays the default 0. Prior to commit eeef2f6b9f6e ("net: stmmac: Start adding phylink support"), the check for a valid max_speed setting was to check if it was greater than zero. This commit got it right, but subsequent patches just checked for non-zero, which is incorrect for DT platforms. In commit 92c3807b9ac3 ("net: stmmac: convert to phylink_get_linkmodes()") the conversion switched completely to checking for non-zero value as a valid value, which caused 1000base-T to stop getting advertised by default. Instead of trying to fix all the checks, simply leave max_speed alone if DT property parsing fails. Fixes: 9cbadf094d9d ("net: stmmac: support max-speed device tree property") Fixes: 92c3807b9ac3 ("net: stmmac: convert to phylink_get_linkmodes()") Signed-off-by: Chen-Yu Tsai Acked-by: Russell King (Oracle) Reviewed-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220331184832.16316-1-wens@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 5d29f336315b..11e1055e8260 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -431,8 +431,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) plat->phylink_node = np; /* Get max speed of operation from device tree */ - if (of_property_read_u32(np, "max-speed", &plat->max_speed)) - plat->max_speed = -1; + of_property_read_u32(np, "max-speed", &plat->max_speed); plat->bus_id = of_alias_get_id(np, "ethernet"); if (plat->bus_id < 0) -- cgit v1.2.3 From 5a48b7433a5aee719ab242d2feadaf4c9e065989 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Sat, 2 Apr 2022 09:46:23 -0500 Subject: docs: net: dsa: fix minor grammar and punctuation issues Fix a few typos and minor grammatical issues. Signed-off-by: Bjorn Helgaas Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 64 ++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index 89bb4fa4c362..ddc1dd039337 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -10,21 +10,21 @@ in joining the effort. Design principles ================= -The Distributed Switch Architecture is a subsystem which was primarily designed -to support Marvell Ethernet switches (MV88E6xxx, a.k.a Linkstreet product line) -using Linux, but has since evolved to support other vendors as well. +The Distributed Switch Architecture subsystem was primarily designed to +support Marvell Ethernet switches (MV88E6xxx, a.k.a. Link Street product +line) using Linux, but has since evolved to support other vendors as well. The original philosophy behind this design was to be able to use unmodified Linux tools such as bridge, iproute2, ifconfig to work transparently whether they configured/queried a switch port network device or a regular network device. -An Ethernet switch is typically comprised of multiple front-panel ports, and one -or more CPU or management port. The DSA subsystem currently relies on the +An Ethernet switch typically comprises multiple front-panel ports and one +or more CPU or management ports. The DSA subsystem currently relies on the presence of a management port connected to an Ethernet controller capable of receiving Ethernet frames from the switch. This is a very common setup for all kinds of Ethernet switches found in Small Home and Office products: routers, -gateways, or even top-of-the rack switches. This host Ethernet controller will +gateways, or even top-of-rack switches. This host Ethernet controller will be later referred to as "master" and "cpu" in DSA terminology and code. The D in DSA stands for Distributed, because the subsystem has been designed @@ -33,14 +33,14 @@ using upstream and downstream Ethernet links between switches. These specific ports are referred to as "dsa" ports in DSA terminology and code. A collection of multiple switches connected to each other is called a "switch tree". -For each front-panel port, DSA will create specialized network devices which are +For each front-panel port, DSA creates specialized network devices which are used as controlling and data-flowing endpoints for use by the Linux networking stack. These specialized network interfaces are referred to as "slave" network interfaces in DSA terminology and code. The ideal case for using DSA is when an Ethernet switch supports a "switch tag" which is a hardware feature making the switch insert a specific tag for each -Ethernet frames it received to/from specific ports to help the management +Ethernet frame it receives to/from specific ports to help the management interface figure out: - what port is this frame coming from @@ -125,7 +125,7 @@ other switches from the same fabric, and in this case, the outermost switch ports must decapsulate the packet. Note that in certain cases, it might be the case that the tagging format used -by a leaf switch (not connected directly to the CPU) to not be the same as what +by a leaf switch (not connected directly to the CPU) is not the same as what the network stack sees. This can be seen with Marvell switch trees, where the CPU port can be configured to use either the DSA or the Ethertype DSA (EDSA) format, but the DSA links are configured to use the shorter (without Ethertype) @@ -270,21 +270,21 @@ These interfaces are specialized in order to: to/from specific switch ports - query the switch for ethtool operations: statistics, link state, Wake-on-LAN, register dumps... -- external/internal PHY management: link, auto-negotiation etc. +- manage external/internal PHY: link, auto-negotiation, etc. These slave network devices have custom net_device_ops and ethtool_ops function pointers which allow DSA to introduce a level of layering between the networking -stack/ethtool, and the switch driver implementation. +stack/ethtool and the switch driver implementation. Upon frame transmission from these slave network devices, DSA will look up which -switch tagging protocol is currently registered with these network devices, and +switch tagging protocol is currently registered with these network devices and invoke a specific transmit routine which takes care of adding the relevant switch tag in the Ethernet frames. These frames are then queued for transmission using the master network device -``ndo_start_xmit()`` function, since they contain the appropriate switch tag, the +``ndo_start_xmit()`` function. Since they contain the appropriate switch tag, the Ethernet switch will be able to process these incoming frames from the -management interface and delivers these frames to the physical switch port. +management interface and deliver them to the physical switch port. Graphical representation ------------------------ @@ -330,9 +330,9 @@ MDIO reads/writes towards specific PHY addresses. In most MDIO-connected switches, these functions would utilize direct or indirect PHY addressing mode to return standard MII registers from the switch builtin PHYs, allowing the PHY library and/or to return link status, link partner pages, auto-negotiation -results etc.. +results, etc. -For Ethernet switches which have both external and internal MDIO busses, the +For Ethernet switches which have both external and internal MDIO buses, the slave MII bus can be utilized to mux/demux MDIO reads and writes towards either internal or external MDIO devices this switch might be connected to: internal PHYs, external PHYs, or even external switches. @@ -349,7 +349,7 @@ DSA data structures are defined in ``include/net/dsa.h`` as well as table indication (when cascading switches) - ``dsa_platform_data``: platform device configuration data which can reference - a collection of dsa_chip_data structure if multiples switches are cascaded, + a collection of dsa_chip_data structures if multiple switches are cascaded, the master network device this switch tree is attached to needs to be referenced @@ -426,7 +426,7 @@ logic basically looks like this: "phy-handle" property, if found, this PHY device is created and registered using ``of_phy_connect()`` -- if Device Tree is used, and the PHY device is "fixed", that is, conforms to +- if Device Tree is used and the PHY device is "fixed", that is, conforms to the definition of a non-MDIO managed PHY as defined in ``Documentation/devicetree/bindings/net/fixed-link.txt``, the PHY is registered and connected transparently using the special fixed MDIO bus driver @@ -481,7 +481,7 @@ Device Tree DSA features a standardized binding which is documented in ``Documentation/devicetree/bindings/net/dsa/dsa.txt``. PHY/MDIO library helper functions such as ``of_get_phy_mode()``, ``of_phy_connect()`` are also used to query -per-port PHY specific details: interface connection, MDIO bus location etc.. +per-port PHY specific details: interface connection, MDIO bus location, etc. Driver development ================== @@ -509,7 +509,7 @@ Switch configuration - ``setup``: setup function for the switch, this function is responsible for setting up the ``dsa_switch_ops`` private structure with all it needs: register maps, - interrupts, mutexes, locks etc.. This function is also expected to properly + interrupts, mutexes, locks, etc. This function is also expected to properly configure the switch to separate all network interfaces from each other, that is, they should be isolated by the switch hardware itself, typically by creating a Port-based VLAN ID for each port and allowing only the CPU port and the @@ -526,13 +526,13 @@ PHY devices and link management - ``get_phy_flags``: Some switches are interfaced to various kinds of Ethernet PHYs, if the PHY library PHY driver needs to know about information it cannot obtain on its own (e.g.: coming from switch memory mapped registers), this function - should return a 32-bits bitmask of "flags", that is private between the switch + should return a 32-bit bitmask of "flags" that is private between the switch driver and the Ethernet PHY driver in ``drivers/net/phy/\*``. - ``phy_read``: Function invoked by the DSA slave MDIO bus when attempting to read the switch port MDIO registers. If unavailable, return 0xffff for each read. For builtin switch Ethernet PHYs, this function should allow reading the link - status, auto-negotiation results, link partner pages etc.. + status, auto-negotiation results, link partner pages, etc. - ``phy_write``: Function invoked by the DSA slave MDIO bus when attempting to write to the switch port MDIO registers. If unavailable return a negative error @@ -554,7 +554,7 @@ Ethtool operations ------------------ - ``get_strings``: ethtool function used to query the driver's strings, will - typically return statistics strings, private flags strings etc. + typically return statistics strings, private flags strings, etc. - ``get_ethtool_stats``: ethtool function used to query per-port statistics and return their values. DSA overlays slave network devices general statistics: @@ -564,7 +564,7 @@ Ethtool operations - ``get_sset_count``: ethtool function used to query the number of statistics items - ``get_wol``: ethtool function used to obtain Wake-on-LAN settings per-port, this - function may, for certain implementations also query the master network device + function may for certain implementations also query the master network device Wake-on-LAN settings if this interface needs to participate in Wake-on-LAN - ``set_wol``: ethtool function used to configure Wake-on-LAN settings per-port, @@ -607,14 +607,14 @@ Power management in a fully active state - ``port_enable``: function invoked by the DSA slave network device ndo_open - function when a port is administratively brought up, this function should be - fully enabling a given switch port. DSA takes care of marking the port with + function when a port is administratively brought up, this function should + fully enable a given switch port. DSA takes care of marking the port with ``BR_STATE_BLOCKING`` if the port is a bridge member, or ``BR_STATE_FORWARDING`` if it was not, and propagating these changes down to the hardware - ``port_disable``: function invoked by the DSA slave network device ndo_close - function when a port is administratively brought down, this function should be - fully disabling a given switch port. DSA takes care of marking the port with + function when a port is administratively brought down, this function should + fully disable a given switch port. DSA takes care of marking the port with ``BR_STATE_DISABLED`` and propagating changes to the hardware if this port is disabled while being a bridge member @@ -622,12 +622,12 @@ Bridge layer ------------ - ``port_bridge_join``: bridge layer function invoked when a given switch port is - added to a bridge, this function should be doing the necessary at the switch - level to permit the joining port from being added to the relevant logical + added to a bridge, this function should do what's necessary at the switch + level to permit the joining port to be added to the relevant logical domain for it to ingress/egress traffic with other members of the bridge. - ``port_bridge_leave``: bridge layer function invoked when a given switch port is - removed from a bridge, this function should be doing the necessary at the + removed from a bridge, this function should do what's necessary at the switch level to deny the leaving port from ingress/egress traffic from the remaining bridge members. When the port leaves the bridge, it should be aged out at the switch hardware for the switch to (re) learn MAC addresses behind @@ -663,7 +663,7 @@ Bridge layer point for drivers that need to configure the hardware for enabling this feature. -- ``port_bridge_tx_fwd_unoffload``: bridge layer function invoken when a driver +- ``port_bridge_tx_fwd_unoffload``: bridge layer function invoked when a driver leaves a bridge port which had the TX forwarding offload feature enabled. Bridge VLAN filtering -- cgit v1.2.3 From 692930cc435099580a4b9e32fa781b0688c18439 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 1 Apr 2022 18:54:27 +0300 Subject: selftests: net: fix nexthop warning cleanup double ip typo I made a stupid typo when adding the nexthop route warning selftest and added both $IP and ip after it (double ip) on the cleanup path. The error doesn't show up when running the test, but obviously it doesn't cleanup properly after it. Fixes: 392baa339c6a ("selftests: net: add delete nexthop route warning test") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_nexthops.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index d8ede0c81ac1..b3bf5319bb0e 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -1220,8 +1220,8 @@ ipv4_fcnal() [ $out1 -eq $out2 ] rc=$? log_test $rc 0 "Delete nexthop route warning" - run_cmd "$IP ip route delete 172.16.101.1/32 nhid 12" - run_cmd "$IP ip nexthop del id 12" + run_cmd "$IP route delete 172.16.101.1/32 nhid 12" + run_cmd "$IP nexthop del id 12" } ipv4_grp_fcnal() -- cgit v1.2.3 From 76ed2f61ae3ee5ca6e3ef155a703ab3eee1eb295 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 22 Mar 2022 16:01:37 +0900 Subject: ata: libata-sff: Fix compilation warning in ata_sff_lost_interrupt() When returning false, ata_sff_altstatus() does not return any status value, resulting in a compilation warning in ata_sff_lost_interrupt() ("uninitialized symbol 'status'"). Fix this by initializing the local variable "status" to 0. Fixes: 03c0e84f9c1e ("ata: libata-sff: refactor ata_sff_altstatus()") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal --- drivers/ata/libata-sff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index b3be7a8f5bea..b1666adc1c3a 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -1634,7 +1634,7 @@ EXPORT_SYMBOL_GPL(ata_sff_interrupt); void ata_sff_lost_interrupt(struct ata_port *ap) { - u8 status; + u8 status = 0; struct ata_queued_cmd *qc; /* Only one outstanding command per SFF channel */ -- cgit v1.2.3 From 7aa8104a554713b685db729e66511b93d989dd6a Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Sat, 19 Mar 2022 21:11:02 +0100 Subject: ata: sata_dwc_460ex: Fix crash due to OOB write the driver uses libata's "tag" values from in various arrays. Since the mentioned patch bumped the ATA_TAG_INTERNAL to 32, the value of the SATA_DWC_QCMD_MAX needs to account for that. Otherwise ATA_TAG_INTERNAL usage cause similar crashes like this as reported by Tice Rex on the OpenWrt Forum and reproduced (with symbols) here: | BUG: Kernel NULL pointer dereference at 0x00000000 | Faulting instruction address: 0xc03ed4b8 | Oops: Kernel access of bad area, sig: 11 [#1] | BE PAGE_SIZE=4K PowerPC 44x Platform | CPU: 0 PID: 362 Comm: scsi_eh_1 Not tainted 5.4.163 #0 | NIP: c03ed4b8 LR: c03d27e8 CTR: c03ed36c | REGS: cfa59950 TRAP: 0300 Not tainted (5.4.163) | MSR: 00021000 CR: 42000222 XER: 00000000 | DEAR: 00000000 ESR: 00000000 | GPR00: c03d27e8 cfa59a08 cfa55fe0 00000000 0fa46bc0 [...] | [..] | NIP [c03ed4b8] sata_dwc_qc_issue+0x14c/0x254 | LR [c03d27e8] ata_qc_issue+0x1c8/0x2dc | Call Trace: | [cfa59a08] [c003f4e0] __cancel_work_timer+0x124/0x194 (unreliable) | [cfa59a78] [c03d27e8] ata_qc_issue+0x1c8/0x2dc | [cfa59a98] [c03d2b3c] ata_exec_internal_sg+0x240/0x524 | [cfa59b08] [c03d2e98] ata_exec_internal+0x78/0xe0 | [cfa59b58] [c03d30fc] ata_read_log_page.part.38+0x1dc/0x204 | [cfa59bc8] [c03d324c] ata_identify_page_supported+0x68/0x130 | [...] This is because sata_dwc_dma_xfer_complete() NULLs the dma_pending's next neighbour "chan" (a *dma_chan struct) in this '32' case right here (line ~735): > hsdevp->dma_pending[tag] = SATA_DWC_DMA_PENDING_NONE; Then the next time, a dma gets issued; dma_dwc_xfer_setup() passes the NULL'd hsdevp->chan to the dmaengine_slave_config() which then causes the crash. With this patch, SATA_DWC_QCMD_MAX is now set to ATA_MAX_QUEUE + 1. This avoids the OOB. But please note, there was a worthwhile discussion on what ATA_TAG_INTERNAL and ATA_MAX_QUEUE is. And why there should not be a "fake" 33 command-long queue size. Ideally, the dw driver should account for the ATA_TAG_INTERNAL. In Damien Le Moal's words: "... having looked at the driver, it is a bigger change than just faking a 33rd "tag" that is in fact not a command tag at all." Fixes: 28361c403683c ("libata: add extra internal command") Cc: stable@kernel.org # 4.18+ BugLink: https://github.com/openwrt/openwrt/issues/9505 Signed-off-by: Christian Lamparter Signed-off-by: Damien Le Moal --- drivers/ata/sata_dwc_460ex.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/ata/sata_dwc_460ex.c b/drivers/ata/sata_dwc_460ex.c index bec33d781ae0..e3263e961045 100644 --- a/drivers/ata/sata_dwc_460ex.c +++ b/drivers/ata/sata_dwc_460ex.c @@ -137,7 +137,11 @@ struct sata_dwc_device { #endif }; -#define SATA_DWC_QCMD_MAX 32 +/* + * Allow one extra special slot for commands and DMA management + * to account for libata internal commands. + */ +#define SATA_DWC_QCMD_MAX (ATA_MAX_QUEUE + 1) struct sata_dwc_device_port { struct sata_dwc_device *hsdev; -- cgit v1.2.3 From 5399752299396a3c9df6617f4b3c907d7aa4ded8 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Sat, 19 Mar 2022 21:11:03 +0100 Subject: ata: libata-core: Disable READ LOG DMA EXT for Samsung 840 EVOs Samsung' 840 EVO with the latest firmware (EXT0DB6Q) locks up with the a message: "READ LOG DMA EXT failed, trying PIO" during boot. Initially this was discovered because it caused a crash with the sata_dwc_460ex controller on a WD MyBook Live DUO. The reporter "Tice Rex" which has the unique opportunity that he has two Samsung 840 EVO SSD! One with the older firmware "EXT0BB0Q" which booted fine and didn't expose "READ LOG DMA EXT". But the newer/latest firmware "EXT0DB6Q" caused the headaches. BugLink: https://github.com/openwrt/openwrt/issues/9505 Signed-off-by: Christian Lamparter Signed-off-by: Damien Le Moal --- drivers/ata/libata-core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index cceedde51126..ca64837641be 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4014,6 +4014,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { ATA_HORKAGE_ZERO_AFTER_TRIM, }, { "Crucial_CT*MX100*", "MU01", ATA_HORKAGE_NO_NCQ_TRIM | ATA_HORKAGE_ZERO_AFTER_TRIM, }, + { "Samsung SSD 840 EVO*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | + ATA_HORKAGE_NO_DMA_LOG | + ATA_HORKAGE_ZERO_AFTER_TRIM, }, { "Samsung SSD 840*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | ATA_HORKAGE_ZERO_AFTER_TRIM, }, { "Samsung SSD 850*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | -- cgit v1.2.3 From b117c88df0e3b48903c36f97be92bac6a9e03df7 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Fri, 1 Apr 2022 13:05:20 +0200 Subject: dt-bindings: net: micrel: Revert latency support and timestamping check Revert latency support from binding. Based on the discussion[1], the DT is the wrong place to have the lantecies for the PHY. [1] https://lkml.org/lkml/2022/3/4/325 Fixes: 2358dd3fd325fc ("dt-bindings: net: micrel: Configure latency values and timestamping check for LAN8814 phy") Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/micrel.txt | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/Documentation/devicetree/bindings/net/micrel.txt b/Documentation/devicetree/bindings/net/micrel.txt index c5ab62c39133..8d157f0295a5 100644 --- a/Documentation/devicetree/bindings/net/micrel.txt +++ b/Documentation/devicetree/bindings/net/micrel.txt @@ -45,20 +45,3 @@ Optional properties: In fiber mode, auto-negotiation is disabled and the PHY can only work in 100base-fx (full and half duplex) modes. - - - lan8814,ignore-ts: If present the PHY will not support timestamping. - - This option acts as check whether Timestamping is supported by - hardware or not. LAN8814 phy support hardware tmestamping. - - - lan8814,latency_rx_10: Configures Latency value of phy in ingress at 10 Mbps. - - - lan8814,latency_tx_10: Configures Latency value of phy in egress at 10 Mbps. - - - lan8814,latency_rx_100: Configures Latency value of phy in ingress at 100 Mbps. - - - lan8814,latency_tx_100: Configures Latency value of phy in egress at 100 Mbps. - - - lan8814,latency_rx_1000: Configures Latency value of phy in ingress at 1000 Mbps. - - - lan8814,latency_tx_1000: Configures Latency value of phy in egress at 1000 Mbps. -- cgit v1.2.3 From b814403a8cd8b28a2c0497e211f029786394531d Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Fri, 1 Apr 2022 13:05:21 +0200 Subject: net: phy: micrel: Remove latency from driver Based on the discussions here[1], the PHY driver is the wrong place to set the latencies, therefore remove them. [1] https://lkml.org/lkml/2022/3/4/325 Fixes: ece19502834d84 ("net: phy: micrel: 1588 support for LAN8814 phy") Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 102 +---------------------------------------------- 1 file changed, 1 insertion(+), 101 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 19b11e896460..a873df07ad24 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -99,15 +99,6 @@ #define PTP_TIMESTAMP_EN_PDREQ_ BIT(2) #define PTP_TIMESTAMP_EN_PDRES_ BIT(3) -#define PTP_RX_LATENCY_1000 0x0224 -#define PTP_TX_LATENCY_1000 0x0225 - -#define PTP_RX_LATENCY_100 0x0222 -#define PTP_TX_LATENCY_100 0x0223 - -#define PTP_RX_LATENCY_10 0x0220 -#define PTP_TX_LATENCY_10 0x0221 - #define PTP_TX_PARSE_L2_ADDR_EN 0x0284 #define PTP_RX_PARSE_L2_ADDR_EN 0x0244 @@ -268,15 +259,6 @@ struct lan8814_ptp_rx_ts { u16 seq_id; }; -struct kszphy_latencies { - u16 rx_10; - u16 tx_10; - u16 rx_100; - u16 tx_100; - u16 rx_1000; - u16 tx_1000; -}; - struct kszphy_ptp_priv { struct mii_timestamper mii_ts; struct phy_device *phydev; @@ -296,7 +278,6 @@ struct kszphy_ptp_priv { struct kszphy_priv { struct kszphy_ptp_priv ptp_priv; - struct kszphy_latencies latencies; const struct kszphy_type *type; int led_mode; bool rmii_ref_clk_sel; @@ -304,14 +285,6 @@ struct kszphy_priv { u64 stats[ARRAY_SIZE(kszphy_hw_stats)]; }; -static struct kszphy_latencies lan8814_latencies = { - .rx_10 = 0x22AA, - .tx_10 = 0x2E4A, - .rx_100 = 0x092A, - .tx_100 = 0x02C1, - .rx_1000 = 0x01AD, - .tx_1000 = 0x00C9, -}; static const struct kszphy_type ksz8021_type = { .led_mode_reg = MII_KSZPHY_CTRL_2, .has_broadcast_disable = true, @@ -2618,55 +2591,6 @@ static int lan8814_ptp_probe_once(struct phy_device *phydev) return 0; } -static int lan8814_read_status(struct phy_device *phydev) -{ - struct kszphy_priv *priv = phydev->priv; - struct kszphy_latencies *latencies = &priv->latencies; - int err; - int regval; - - err = genphy_read_status(phydev); - if (err) - return err; - - switch (phydev->speed) { - case SPEED_1000: - lanphy_write_page_reg(phydev, 5, PTP_RX_LATENCY_1000, - latencies->rx_1000); - lanphy_write_page_reg(phydev, 5, PTP_TX_LATENCY_1000, - latencies->tx_1000); - break; - case SPEED_100: - lanphy_write_page_reg(phydev, 5, PTP_RX_LATENCY_100, - latencies->rx_100); - lanphy_write_page_reg(phydev, 5, PTP_TX_LATENCY_100, - latencies->tx_100); - break; - case SPEED_10: - lanphy_write_page_reg(phydev, 5, PTP_RX_LATENCY_10, - latencies->rx_10); - lanphy_write_page_reg(phydev, 5, PTP_TX_LATENCY_10, - latencies->tx_10); - break; - default: - break; - } - - /* Make sure the PHY is not broken. Read idle error count, - * and reset the PHY if it is maxed out. - */ - regval = phy_read(phydev, MII_STAT1000); - if ((regval & 0xFF) == 0xFF) { - phy_init_hw(phydev); - phydev->link = 0; - if (phydev->drv->config_intr && phy_interrupt_is_valid(phydev)) - phydev->drv->config_intr(phydev); - return genphy_config_aneg(phydev); - } - - return 0; -} - static int lan8814_config_init(struct phy_device *phydev) { int val; @@ -2690,27 +2614,6 @@ static int lan8814_config_init(struct phy_device *phydev) return 0; } -static void lan8814_parse_latency(struct phy_device *phydev) -{ - const struct device_node *np = phydev->mdio.dev.of_node; - struct kszphy_priv *priv = phydev->priv; - struct kszphy_latencies *latency = &priv->latencies; - u32 val; - - if (!of_property_read_u32(np, "lan8814,latency_rx_10", &val)) - latency->rx_10 = val; - if (!of_property_read_u32(np, "lan8814,latency_tx_10", &val)) - latency->tx_10 = val; - if (!of_property_read_u32(np, "lan8814,latency_rx_100", &val)) - latency->rx_100 = val; - if (!of_property_read_u32(np, "lan8814,latency_tx_100", &val)) - latency->tx_100 = val; - if (!of_property_read_u32(np, "lan8814,latency_rx_1000", &val)) - latency->rx_1000 = val; - if (!of_property_read_u32(np, "lan8814,latency_tx_1000", &val)) - latency->tx_1000 = val; -} - static int lan8814_probe(struct phy_device *phydev) { const struct device_node *np = phydev->mdio.dev.of_node; @@ -2724,8 +2627,6 @@ static int lan8814_probe(struct phy_device *phydev) priv->led_mode = -1; - priv->latencies = lan8814_latencies; - phydev->priv = priv; if (!IS_ENABLED(CONFIG_PTP_1588_CLOCK) || @@ -2746,7 +2647,6 @@ static int lan8814_probe(struct phy_device *phydev) return err; } - lan8814_parse_latency(phydev); lan8814_ptp_init(phydev); return 0; @@ -2928,7 +2828,7 @@ static struct phy_driver ksphy_driver[] = { .config_init = lan8814_config_init, .probe = lan8814_probe, .soft_reset = genphy_soft_reset, - .read_status = lan8814_read_status, + .read_status = ksz9031_read_status, .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, -- cgit v1.2.3 From 76e9ccd6894377bc3cf7fbdea90b0af2cb4eb12a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Fri, 1 Apr 2022 13:05:22 +0200 Subject: net: phy: micrel: Remove DT option lan8814,ignore-ts When the PHY and the MAC are capable of doing timestamping, the PHY has priority. Therefore the DT option lan8814,ignore-ts was added such that the PHY will not expose a PHC so then the timestamping was done in the MAC. This is not the correct approach of doing it, therefore remove this. Fixes: ece19502834d84 ("net: phy: micrel: 1588 support for LAN8814 phy") Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index a873df07ad24..fc53b71dc872 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -2616,7 +2616,6 @@ static int lan8814_config_init(struct phy_device *phydev) static int lan8814_probe(struct phy_device *phydev) { - const struct device_node *np = phydev->mdio.dev.of_node; struct kszphy_priv *priv; u16 addr; int err; @@ -2630,8 +2629,7 @@ static int lan8814_probe(struct phy_device *phydev) phydev->priv = priv; if (!IS_ENABLED(CONFIG_PTP_1588_CLOCK) || - !IS_ENABLED(CONFIG_NETWORK_PHY_TIMESTAMPING) || - of_property_read_bool(np, "lan8814,ignore-ts")) + !IS_ENABLED(CONFIG_NETWORK_PHY_TIMESTAMPING)) return 0; /* Strap-in value for PHY address, below register read gives starting -- cgit v1.2.3 From 20921c0c86092b4082c91bd7c88305da74e5520b Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Fri, 1 Apr 2022 11:53:04 -0700 Subject: qed: fix ethtool register dump To fix a coverity complain, commit d5ac07dfbd2b ("qed: Initialize debug string array") removed "sw-platform" (one of the common global parameters) from the dump as this was used in the dump with an uninitialized string, however it did not reduce the number of common global parameters which caused the incorrect (unable to parse) register dump this patch fixes it with reducing NUM_COMMON_GLOBAL_PARAMS bye one. Cc: stable@vger.kernel.org Cc: Tim Gardner Cc: "David S. Miller" Fixes: d5ac07dfbd2b ("qed: Initialize debug string array") Signed-off-by: Prabhakar Kushwaha Signed-off-by: Alok Prasad Signed-off-by: Ariel Elior Signed-off-by: Manish Chopra Reviewed-by: Tim Gardner Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c index e3edca187ddf..5250d1d1e49c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_debug.c +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -489,7 +489,7 @@ struct split_type_defs { #define STATIC_DEBUG_LINE_DWORDS 9 -#define NUM_COMMON_GLOBAL_PARAMS 11 +#define NUM_COMMON_GLOBAL_PARAMS 10 #define MAX_RECURSION_DEPTH 10 -- cgit v1.2.3 From 4f81def272de17dc4bbd89ac38f49b2676c9b3d2 Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Fri, 1 Apr 2022 20:21:10 -0400 Subject: bnxt_en: Synchronize tx when xdp redirects happen on same ring If there are more CPUs than the number of TX XDP rings, multiple XDP redirects can select the same TX ring based on the CPU on which XDP redirect is called. Add locking when needed and use static key to decide whether to take the lock. Fixes: f18c2b77b2e4 ("bnxt_en: optimized XDP_REDIRECT support") Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 7 +++++++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 8 ++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h | 2 ++ 4 files changed, 19 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 1c28495875cf..874fad0a5cf8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3253,6 +3253,7 @@ static int bnxt_alloc_tx_rings(struct bnxt *bp) } qidx = bp->tc_to_qidx[j]; ring->queue_id = bp->q_info[qidx].queue_id; + spin_lock_init(&txr->xdp_tx_lock); if (i < bp->tx_nr_rings_xdp) continue; if (i % bp->tx_nr_rings_per_tc == (bp->tx_nr_rings_per_tc - 1)) @@ -10338,6 +10339,12 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) if (irq_re_init) udp_tunnel_nic_reset_ntf(bp->dev); + if (bp->tx_nr_rings_xdp < num_possible_cpus()) { + if (!static_key_enabled(&bnxt_xdp_locking_key)) + static_branch_enable(&bnxt_xdp_locking_key); + } else if (static_key_enabled(&bnxt_xdp_locking_key)) { + static_branch_disable(&bnxt_xdp_locking_key); + } set_bit(BNXT_STATE_OPEN, &bp->state); bnxt_enable_int(bp); /* Enable TX queues */ diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 61aa3e8c5952..b4d3d051463b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -800,6 +800,8 @@ struct bnxt_tx_ring_info { u32 dev_state; struct bnxt_ring_struct tx_ring_struct; + /* Synchronize simultaneous xdp_xmit on same ring */ + spinlock_t xdp_tx_lock; }; #define BNXT_LEGACY_COAL_CMPL_PARAMS \ diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 52fad0fdeacf..c0541ff00ac8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -20,6 +20,8 @@ #include "bnxt.h" #include "bnxt_xdp.h" +DEFINE_STATIC_KEY_FALSE(bnxt_xdp_locking_key); + struct bnxt_sw_tx_bd *bnxt_xmit_bd(struct bnxt *bp, struct bnxt_tx_ring_info *txr, dma_addr_t mapping, u32 len) @@ -227,6 +229,9 @@ int bnxt_xdp_xmit(struct net_device *dev, int num_frames, ring = smp_processor_id() % bp->tx_nr_rings_xdp; txr = &bp->tx_ring[ring]; + if (static_branch_unlikely(&bnxt_xdp_locking_key)) + spin_lock(&txr->xdp_tx_lock); + for (i = 0; i < num_frames; i++) { struct xdp_frame *xdp = frames[i]; @@ -250,6 +255,9 @@ int bnxt_xdp_xmit(struct net_device *dev, int num_frames, bnxt_db_write(bp, &txr->tx_db, txr->tx_prod); } + if (static_branch_unlikely(&bnxt_xdp_locking_key)) + spin_unlock(&txr->xdp_tx_lock); + return nxmit; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h index 0df40c3beb05..067bb5e821f5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h @@ -10,6 +10,8 @@ #ifndef BNXT_XDP_H #define BNXT_XDP_H +DECLARE_STATIC_KEY_FALSE(bnxt_xdp_locking_key); + struct bnxt_sw_tx_bd *bnxt_xmit_bd(struct bnxt *bp, struct bnxt_tx_ring_info *txr, dma_addr_t mapping, u32 len); -- cgit v1.2.3 From facc173cf700e55b2ad249ecbd3a7537f7315691 Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Fri, 1 Apr 2022 20:21:11 -0400 Subject: bnxt_en: reserve space inside receive page for skb_shared_info Insufficient space was being reserved in the page used for packet reception, so the interface MTU could be set too large to still have room for the contents of the packet when doing XDP redirect. This resulted in the following message when redirecting a packet between 3520 and 3822 bytes with an MTU of 3822: [311815.561880] XDP_WARN: xdp_update_frame_from_buff(line:200): Driver BUG: missing reserved tailroom Fixes: f18c2b77b2e4 ("bnxt_en: optimized XDP_REDIRECT support") Reviewed-by: Somnath Kotur Reviewed-by: Pavan Chebbi Signed-off-by: Andy Gospodarek Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index b4d3d051463b..98453a78cbd0 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -593,7 +593,8 @@ struct nqe_cn { #define BNXT_MAX_MTU 9500 #define BNXT_MAX_PAGE_MODE_MTU \ ((unsigned int)PAGE_SIZE - VLAN_ETH_HLEN - NET_IP_ALIGN - \ - XDP_PACKET_HEADROOM) + XDP_PACKET_HEADROOM - \ + SKB_DATA_ALIGN((unsigned int)sizeof(struct skb_shared_info))) #define BNXT_MIN_PKT_SIZE 52 -- cgit v1.2.3 From 27d4073f8d9af0340362554414f4961643a4f4de Mon Sep 17 00:00:00 2001 From: Ray Jui Date: Fri, 1 Apr 2022 20:21:12 -0400 Subject: bnxt_en: Prevent XDP redirect from running when stopping TX queue Add checks in the XDP redirect callback to prevent XDP from running when the TX ring is undergoing shutdown. Also remove redundant checks in the XDP redirect callback to validate the txr and the flag that indicates the ring supports XDP. The modulo arithmetic on 'tx_nr_rings_xdp' already guarantees the derived TX ring is an XDP ring. txr is also guaranteed to be valid after checking BNXT_STATE_OPEN and within RCU grace period. Fixes: f18c2b77b2e4 ("bnxt_en: optimized XDP_REDIRECT support") Reviewed-by: Vladimir Olovyannikov Signed-off-by: Ray Jui Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index c0541ff00ac8..03b1d6c04504 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -229,14 +229,16 @@ int bnxt_xdp_xmit(struct net_device *dev, int num_frames, ring = smp_processor_id() % bp->tx_nr_rings_xdp; txr = &bp->tx_ring[ring]; + if (READ_ONCE(txr->dev_state) == BNXT_DEV_STATE_CLOSING) + return -EINVAL; + if (static_branch_unlikely(&bnxt_xdp_locking_key)) spin_lock(&txr->xdp_tx_lock); for (i = 0; i < num_frames; i++) { struct xdp_frame *xdp = frames[i]; - if (!txr || !bnxt_tx_avail(bp, txr) || - !(bp->bnapi[ring]->flags & BNXT_NAPI_FLAG_XDP)) + if (!bnxt_tx_avail(bp, txr)) break; mapping = dma_map_single(&pdev->dev, xdp->data, xdp->len, -- cgit v1.2.3 From 2baed4f9b085724a8a34add832d4763f3d83f877 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 3 Apr 2022 10:02:02 -0400 Subject: stmmac: dwmac-loongson: change loongson_dwmac_driver from global to static Smatch reports this issue dwmac-loongson.c:208:19: warning: symbol 'loongson_dwmac_driver' was not declared. Should it be static? loongson_dwmac_driver is only used in dwmac-loongson.c. File scope variables used only in one file should be static. Change loongson_dwmac_driver's storage-class-specifier from global to static. Signed-off-by: Tom Rix Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index ecf759ee1c9f..017dbbda0c1c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -205,7 +205,7 @@ static const struct pci_device_id loongson_dwmac_id_table[] = { }; MODULE_DEVICE_TABLE(pci, loongson_dwmac_id_table); -struct pci_driver loongson_dwmac_driver = { +static struct pci_driver loongson_dwmac_driver = { .name = "dwmac-loongson-pci", .id_table = loongson_dwmac_id_table, .probe = loongson_dwmac_probe, -- cgit v1.2.3 From 458f5d92df4807e2a7c803ed928369129996bf96 Mon Sep 17 00:00:00 2001 From: Martin Habets Date: Mon, 4 Apr 2022 11:48:51 +0100 Subject: sfc: Do not free an empty page_ring When the page_ring is not used page_ptr_mask is 0. Do not dereference page_ring[0] in this case. Fixes: 2768935a4660 ("sfc: reuse pages to avoid DMA mapping/unmapping costs") Reported-by: Taehee Yoo Signed-off-by: Martin Habets Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/rx_common.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c index 1b22c7be0088..fa8b9aacca11 100644 --- a/drivers/net/ethernet/sfc/rx_common.c +++ b/drivers/net/ethernet/sfc/rx_common.c @@ -150,6 +150,9 @@ static void efx_fini_rx_recycle_ring(struct efx_rx_queue *rx_queue) struct efx_nic *efx = rx_queue->efx; int i; + if (unlikely(!rx_queue->page_ring)) + return; + /* Unmap and release the pages in the recycle ring. Remove the ring. */ for (i = 0; i <= rx_queue->page_ptr_mask; i++) { struct page *page = rx_queue->page_ring[i]; -- cgit v1.2.3 From 527a9867af29ff89f278d037db704e0ed50fb666 Mon Sep 17 00:00:00 2001 From: Jan Varho Date: Mon, 4 Apr 2022 19:42:30 +0300 Subject: random: do not split fast init input in add_hwgenerator_randomness() add_hwgenerator_randomness() tries to only use the required amount of input for fast init, but credits all the entropy, rather than a fraction of it. Since it's hard to determine how much entropy is left over out of a non-unformly random sample, either give it all to fast init or credit it, but don't attempt to do both. In the process, we can clean up the injection code to no longer need to return a value. Signed-off-by: Jan Varho [Jason: expanded commit message] Fixes: 73c7733f122e ("random: do not throw away excess input to crng_fast_load") Cc: stable@vger.kernel.org # 5.17+, requires af704c856e88 Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 1d8242969751..ee3ad2ba0942 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -437,11 +437,8 @@ static void crng_make_state(u32 chacha_state[CHACHA_STATE_WORDS], * This shouldn't be set by functions like add_device_randomness(), * where we can't trust the buffer passed to it is guaranteed to be * unpredictable (so it might not have any entropy at all). - * - * Returns the number of bytes processed from input, which is bounded - * by CRNG_INIT_CNT_THRESH if account is true. */ -static size_t crng_pre_init_inject(const void *input, size_t len, bool account) +static void crng_pre_init_inject(const void *input, size_t len, bool account) { static int crng_init_cnt = 0; struct blake2s_state hash; @@ -452,18 +449,15 @@ static size_t crng_pre_init_inject(const void *input, size_t len, bool account) spin_lock_irqsave(&base_crng.lock, flags); if (crng_init != 0) { spin_unlock_irqrestore(&base_crng.lock, flags); - return 0; + return; } - if (account) - len = min_t(size_t, len, CRNG_INIT_CNT_THRESH - crng_init_cnt); - blake2s_update(&hash, base_crng.key, sizeof(base_crng.key)); blake2s_update(&hash, input, len); blake2s_final(&hash, base_crng.key); if (account) { - crng_init_cnt += len; + crng_init_cnt += min_t(size_t, len, CRNG_INIT_CNT_THRESH - crng_init_cnt); if (crng_init_cnt >= CRNG_INIT_CNT_THRESH) { ++base_crng.generation; crng_init = 1; @@ -474,8 +468,6 @@ static size_t crng_pre_init_inject(const void *input, size_t len, bool account) if (crng_init == 1) pr_notice("fast init done\n"); - - return len; } static void _get_random_bytes(void *buf, size_t nbytes) @@ -1141,12 +1133,9 @@ void add_hwgenerator_randomness(const void *buffer, size_t count, size_t entropy) { if (unlikely(crng_init == 0 && entropy < POOL_MIN_BITS)) { - size_t ret = crng_pre_init_inject(buffer, count, true); - mix_pool_bytes(buffer, ret); - count -= ret; - buffer += ret; - if (!count || crng_init == 0) - return; + crng_pre_init_inject(buffer, count, true); + mix_pool_bytes(buffer, count); + return; } /* -- cgit v1.2.3 From e3d37210df5c41c51147a2d5d465de1a4d77be7a Mon Sep 17 00:00:00 2001 From: Jamie Bainbridge Date: Mon, 4 Apr 2022 09:47:48 +1000 Subject: sctp: count singleton chunks in assoc user stats Singleton chunks (INIT, HEARTBEAT PMTU probes, and SHUTDOWN- COMPLETE) are not counted in SCTP_GET_ASOC_STATS "sas_octrlchunks" counter available to the assoc owner. These are all control chunks so they should be counted as such. Add counting of singleton chunks so they are properly accounted for. Fixes: 196d67593439 ("sctp: Add support to per-association statistics via a new SCTP_GET_ASSOC_STATS call") Signed-off-by: Jamie Bainbridge Acked-by: Marcelo Ricardo Leitner Link: https://lore.kernel.org/r/c9ba8785789880cf07923b8a5051e174442ea9ee.1649029663.git.jamie.bainbridge@gmail.com Signed-off-by: Paolo Abeni --- net/sctp/outqueue.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index a18609f608fb..e213aaf45d67 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -914,6 +914,7 @@ static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx) ctx->asoc->base.sk->sk_err = -error; return; } + ctx->asoc->stats.octrlchunks++; break; case SCTP_CID_ABORT: @@ -938,7 +939,10 @@ static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx) case SCTP_CID_HEARTBEAT: if (chunk->pmtu_probe) { - sctp_packet_singleton(ctx->transport, chunk, ctx->gfp); + error = sctp_packet_singleton(ctx->transport, + chunk, ctx->gfp); + if (!error) + ctx->asoc->stats.octrlchunks++; break; } fallthrough; -- cgit v1.2.3 From 42193ffd79bd3acd91bd947e53f3548a3661d0a1 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sat, 2 Apr 2022 12:50:37 +0300 Subject: netfilter: nf_tables: memcg accounting for dynamically allocated objects nft_*.c files whose NFT_EXPR_STATEFUL flag is set on need to use __GFP_ACCOUNT flag for objects that are dynamically allocated from the packet path. Such objects are allocated inside nft_expr_ops->init() callbacks executed in task context while processing netlink messages. In addition, this patch adds accounting to nft_set_elem_expr_clone() used for the same purposes. Signed-off-by: Vasily Averin Acked-by: Roman Gushchin Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- net/netfilter/nft_connlimit.c | 2 +- net/netfilter/nft_counter.c | 2 +- net/netfilter/nft_last.c | 2 +- net/netfilter/nft_limit.c | 2 +- net/netfilter/nft_quota.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5ddfdb2adaf1..128ee3b300d6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5526,7 +5526,7 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, int err, i, k; for (i = 0; i < set->num_exprs; i++) { - expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL); + expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL_ACCOUNT); if (!expr) goto err_expr; diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 9de1462e4ac4..d657f999a11b 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -77,7 +77,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx, invert = true; } - priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL); + priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL_ACCOUNT); if (!priv->list) return -ENOMEM; diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index da9083605a61..f4d3573e8782 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -62,7 +62,7 @@ static int nft_counter_do_init(const struct nlattr * const tb[], struct nft_counter __percpu *cpu_stats; struct nft_counter *this_cpu; - cpu_stats = alloc_percpu(struct nft_counter); + cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_KERNEL_ACCOUNT); if (cpu_stats == NULL) return -ENOMEM; diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c index 43d0d4aadb1f..bb15a55dad5c 100644 --- a/net/netfilter/nft_last.c +++ b/net/netfilter/nft_last.c @@ -30,7 +30,7 @@ static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr, u64 last_jiffies; int err; - last = kzalloc(sizeof(*last), GFP_KERNEL); + last = kzalloc(sizeof(*last), GFP_KERNEL_ACCOUNT); if (!last) return -ENOMEM; diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index d4a6cf3cd697..04ea8b9bf202 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -90,7 +90,7 @@ static int nft_limit_init(struct nft_limit_priv *priv, priv->rate); } - priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL); + priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT); if (!priv->limit) return -ENOMEM; diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index d7db57ed3bc1..e6b0df68feea 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -90,7 +90,7 @@ static int nft_quota_do_init(const struct nlattr * const tb[], return -EOPNOTSUPP; } - priv->consumed = kmalloc(sizeof(*priv->consumed), GFP_KERNEL); + priv->consumed = kmalloc(sizeof(*priv->consumed), GFP_KERNEL_ACCOUNT); if (!priv->consumed) return -ENOMEM; -- cgit v1.2.3 From 2b04bd4f03bba021959ca339314f6739710f0954 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Mon, 4 Apr 2022 12:53:36 +0000 Subject: dpaa2-ptp: Fix refcount leak in dpaa2_ptp_probe This node pointer is returned by of_find_compatible_node() with refcount incremented. Calling of_node_put() to aovid the refcount leak. Fixes: d346c9e86d86 ("dpaa2-ptp: reuse ptp_qoriq driver") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220404125336.13427-1-linmq006@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c index 5f5f8c53c4a0..c8cb541572ff 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c @@ -167,7 +167,7 @@ static int dpaa2_ptp_probe(struct fsl_mc_device *mc_dev) base = of_iomap(node, 0); if (!base) { err = -ENOMEM; - goto err_close; + goto err_put; } err = fsl_mc_allocate_irqs(mc_dev); @@ -210,6 +210,8 @@ err_free_mc_irq: fsl_mc_free_irqs(mc_dev); err_unmap: iounmap(base); +err_put: + of_node_put(node); err_close: dprtc_close(mc_dev->mc_io, 0, mc_dev->mc_handle); err_free_mcp: -- cgit v1.2.3 From ccfee1822042b87e5135d33cad8ea353e64612d2 Mon Sep 17 00:00:00 2001 From: Anatolii Gerasymenko Date: Mon, 4 Apr 2022 11:35:47 -0700 Subject: ice: Set txq_teid to ICE_INVAL_TEID on ring creation When VF is freshly created, but not brought up, ring->txq_teid value is by default set to 0. But 0 is a valid TEID. On some platforms the Root Node of Tx scheduler has a TEID = 0. This can cause issues as shown below. The proper way is to set ring->txq_teid to ICE_INVAL_TEID (0xFFFFFFFF). Testing Hints: echo 1 > /sys/class/net/ens785f0/device/sriov_numvfs ip link set dev ens785f0v0 up ip link set dev ens785f0v0 down If we have freshly created VF and quickly turn it on and off, so there would be no time to reach VIRTCHNL_OP_CONFIG_VSI_QUEUES stage, then VIRTCHNL_OP_DISABLE_QUEUES stage will fail with error: [ 639.531454] disable queue 89 failed 14 [ 639.532233] Failed to disable LAN Tx queues, error: ICE_ERR_AQ_ERROR [ 639.533107] ice 0000:02:00.0: Failed to stop Tx ring 0 on VSI 5 The reason for the fail is that we are trying to send AQ command to delete queue 89, which has never been created and receive an "invalid argument" error from firmware. As this queue has never been created, it's teid and ring->txq_teid have default value 0. ice_dis_vsi_txq has a check against non-existent queues: node = ice_sched_find_node_by_teid(pi->root, q_teids[i]); if (!node) continue; But on some platforms the Root Node of Tx scheduler has a teid = 0. Hence, ice_sched_find_node_by_teid finds a node with teid = 0 (it is pi->root), and we go further to submit an erroneous request to firmware. Fixes: 37bb83901286 ("ice: Move common functions out of ice_main.c part 7/7") Signed-off-by: Anatolii Gerasymenko Reviewed-by: Maciej Fijalkowski Tested-by: Konrad Jankowski Signed-off-by: Alice Michael Signed-off-by: Tony Nguyen Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/ice/ice_lib.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 6d6233204388..2774cbd5b12a 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -1480,6 +1480,7 @@ static int ice_vsi_alloc_rings(struct ice_vsi *vsi) ring->tx_tstamps = &pf->ptp.port.tx; ring->dev = dev; ring->count = vsi->num_tx_desc; + ring->txq_teid = ICE_INVAL_TEID; if (dvm_ena) ring->flags |= ICE_TX_FLAGS_RING_VLAN_L2TAG2; else -- cgit v1.2.3 From 05ef6813b234db3196f083b91db3963f040b65bb Mon Sep 17 00:00:00 2001 From: Anatolii Gerasymenko Date: Mon, 4 Apr 2022 11:35:48 -0700 Subject: ice: Do not skip not enabled queues in ice_vc_dis_qs_msg Disable check for queue being enabled in ice_vc_dis_qs_msg, because there could be a case when queues were created, but were not enabled. We still need to delete those queues. Normal workflow for VF looks like: Enable path: VIRTCHNL_OP_ADD_ETH_ADDR (opcode 10) VIRTCHNL_OP_CONFIG_VSI_QUEUES (opcode 6) VIRTCHNL_OP_ENABLE_QUEUES (opcode 8) Disable path: VIRTCHNL_OP_DISABLE_QUEUES (opcode 9) VIRTCHNL_OP_DEL_ETH_ADDR (opcode 11) The issue appears only in stress conditions when VF is enabled and disabled very fast. Eventually there will be a case, when queues are created by VIRTCHNL_OP_CONFIG_VSI_QUEUES, but are not enabled by VIRTCHNL_OP_ENABLE_QUEUES. In turn, these queues are not deleted by VIRTCHNL_OP_DISABLE_QUEUES, because there is a check whether queues are enabled in ice_vc_dis_qs_msg. When we bring up the VF again, we will see the "Failed to set LAN Tx queue context" error during VIRTCHNL_OP_CONFIG_VSI_QUEUES step. This happens because old 16 queues were not deleted and VF requests to create 16 more, but ice_sched_get_free_qparent in ice_ena_vsi_txq would fail to find a parent node for first newly requested queue (because all nodes are allocated to 16 old queues). Testing Hints: Just enable and disable VF fast enough, so it would be disabled before reaching VIRTCHNL_OP_ENABLE_QUEUES. while true; do ip link set dev ens785f0v0 up sleep 0.065 # adjust delay value for you machine ip link set dev ens785f0v0 down done Fixes: 77ca27c41705 ("ice: add support for virtchnl_queue_select.[tx|rx]_queues bitmap") Signed-off-by: Anatolii Gerasymenko Tested-by: Konrad Jankowski Signed-off-by: Alice Michael Signed-off-by: Tony Nguyen Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/ice/ice_virtchnl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c index 3f1a63815bac..69ff4b929772 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c @@ -1358,9 +1358,9 @@ static int ice_vc_dis_qs_msg(struct ice_vf *vf, u8 *msg) goto error_param; } - /* Skip queue if not enabled */ if (!test_bit(vf_q_id, vf->txq_ena)) - continue; + dev_dbg(ice_pf_to_dev(vsi->back), "Queue %u on VSI %u is not enabled, but stopping it anyway\n", + vf_q_id, vsi->vsi_num); ice_fill_txq_meta(vsi, ring, &txq_meta); -- cgit v1.2.3 From 1158f79f82d437093aeed87d57df0548bdd68146 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 4 Apr 2022 09:09:08 -0600 Subject: ipv6: Fix stats accounting in ip6_pkt_drop VRF devices are the loopbacks for VRFs, and a loopback can not be assigned to a VRF. Accordingly, the condition in ip6_pkt_drop should be '||' not '&&'. Fixes: 1d3fd8a10bed ("vrf: Use orig netdev to count Ip6InNoRoutes and a fresh route lookup when sending dest unreach") Reported-by: Pudak, Filip Reported-by: Xiao, Jiguang Signed-off-by: David Ahern Link: https://lore.kernel.org/r/20220404150908.2937-1-dsahern@kernel.org Signed-off-by: Paolo Abeni --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2fa10e60cccd..169e9df6d172 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4484,7 +4484,7 @@ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) struct inet6_dev *idev; int type; - if (netif_is_l3_master(skb->dev) && + if (netif_is_l3_master(skb->dev) || dst->dev == net->loopback_dev) idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); else -- cgit v1.2.3 From 48bff1053c172e6c7f340e506027d118147c8b7f Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 5 Apr 2022 15:57:05 +0200 Subject: random: opportunistically initialize on /dev/urandom reads In 6f98a4bfee72 ("random: block in /dev/urandom"), we tried to make a successful try_to_generate_entropy() call *required* if the RNG was not already initialized. Unfortunately, weird architectures and old userspaces combined in TCG test harnesses, making that change still not realistic, so it was reverted in 0313bc278dac ("Revert "random: block in /dev/urandom""). However, rather than making a successful try_to_generate_entropy() call *required*, we can instead make it *best-effort*. If try_to_generate_entropy() fails, it fails, and nothing changes from the current behavior. If it succeeds, then /dev/urandom becomes safe to use for free. This way, we don't risk the regression potential that led to us reverting the required-try_to_generate_entropy() call before. Practically speaking, this means that at least on x86, /dev/urandom becomes safe. Probably other architectures with working cycle counters will also become safe. And architectures with slow or broken cycle counters at least won't be affected at all by this change. So it may not be the glorious "all things are unified!" change we were hoping for initially, but practically speaking, it makes a positive impact. Cc: Theodore Ts'o Cc: Dominik Brodowski Cc: Linus Torvalds Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/char/random.c b/drivers/char/random.c index ee3ad2ba0942..388025d6d38d 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1534,6 +1534,13 @@ static ssize_t urandom_read(struct file *file, char __user *buf, size_t nbytes, { static int maxwarn = 10; + /* + * Opportunistically attempt to initialize the RNG on platforms that + * have fast cycle counters, but don't (for now) require it to succeed. + */ + if (!crng_ready()) + try_to_generate_entropy(); + if (!crng_ready() && maxwarn > 0) { maxwarn--; if (__ratelimit(&urandom_warning)) -- cgit v1.2.3 From f9124c68f05ffdb87a47e3ea6d5fae9dad7cb6eb Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 17 Mar 2022 19:36:27 +0100 Subject: ice: synchronize_rcu() when terminating rings Unfortunately, the ice driver doesn't respect the RCU critical section that XSK wakeup is surrounded with. To fix this, add synchronize_rcu() calls to paths that destroy resources that might be in use. This was addressed in other AF_XDP ZC enabled drivers, for reference see for example commit b3873a5be757 ("net/i40e: Fix concurrency issues between config flow and XSK") Fixes: efc2214b6047 ("ice: Add support for XDP") Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Signed-off-by: Maciej Fijalkowski Tested-by: Shwetha Nagaraju Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 2 +- drivers/net/ethernet/intel/ice/ice_main.c | 4 +++- drivers/net/ethernet/intel/ice/ice_xsk.c | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 26eaee0b6503..8ed3c9ab7ff7 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -671,7 +671,7 @@ static inline struct ice_pf *ice_netdev_to_pf(struct net_device *netdev) static inline bool ice_is_xdp_ena_vsi(struct ice_vsi *vsi) { - return !!vsi->xdp_prog; + return !!READ_ONCE(vsi->xdp_prog); } static inline void ice_set_ring_xdp(struct ice_tx_ring *ring) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 1d2ca39add95..d2039a9306b8 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2758,8 +2758,10 @@ free_qmap: ice_for_each_xdp_txq(vsi, i) if (vsi->xdp_rings[i]) { - if (vsi->xdp_rings[i]->desc) + if (vsi->xdp_rings[i]->desc) { + synchronize_rcu(); ice_free_tx_ring(vsi->xdp_rings[i]); + } kfree_rcu(vsi->xdp_rings[i], rcu); vsi->xdp_rings[i] = NULL; } diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index dfbcaf08520e..33b28a72ffcb 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -41,8 +41,10 @@ static void ice_qp_reset_stats(struct ice_vsi *vsi, u16 q_idx) static void ice_qp_clean_rings(struct ice_vsi *vsi, u16 q_idx) { ice_clean_tx_ring(vsi->tx_rings[q_idx]); - if (ice_is_xdp_ena_vsi(vsi)) + if (ice_is_xdp_ena_vsi(vsi)) { + synchronize_rcu(); ice_clean_tx_ring(vsi->xdp_rings[q_idx]); + } ice_clean_rx_ring(vsi->rx_rings[q_idx]); } -- cgit v1.2.3 From 72b915a2b444e9247c9d424a840e94263db07c27 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 17 Mar 2022 19:36:28 +0100 Subject: ice: xsk: fix VSI state check in ice_xsk_wakeup() ICE_DOWN is dedicated for pf->state. Check for ICE_VSI_DOWN being set on vsi->state in ice_xsk_wakeup(). Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Signed-off-by: Maciej Fijalkowski Tested-by: Shwetha Nagaraju Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 33b28a72ffcb..866ee4df9671 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -920,7 +920,7 @@ ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, struct ice_vsi *vsi = np->vsi; struct ice_tx_ring *ring; - if (test_bit(ICE_DOWN, vsi->state)) + if (test_bit(ICE_VSI_DOWN, vsi->state)) return -ENETDOWN; if (!ice_is_xdp_ena_vsi(vsi)) -- cgit v1.2.3 From e19778e6c911691856447c3bf9617f00b3e1347f Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 17 Mar 2022 19:36:29 +0100 Subject: ice: clear cmd_type_offset_bsz for TX rings Currently when XDP rings are created, each descriptor gets its DD bit set, which turns out to be the wrong approach as it can lead to a situation where more descriptors get cleaned than it was supposed to, e.g. when AF_XDP busy poll is run with a large batch size. In this situation, the driver would request for more buffers than it is able to handle. Fix this by not setting the DD bits in ice_xdp_alloc_setup_rings(). They should be initialized to zero instead. Fixes: 9610bd988df9 ("ice: optimize XDP_TX workloads") Signed-off-by: Maciej Fijalkowski Tested-by: Shwetha Nagaraju Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index d2039a9306b8..d768925785ca 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2562,7 +2562,7 @@ static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi) spin_lock_init(&xdp_ring->tx_lock); for (j = 0; j < xdp_ring->count; j++) { tx_desc = ICE_TX_DESC(xdp_ring, j); - tx_desc->cmd_type_offset_bsz = cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE); + tx_desc->cmd_type_offset_bsz = 0; } } -- cgit v1.2.3 From 73924ec4d560257004d5b5116b22a3647661e364 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 4 Apr 2022 17:34:19 -0700 Subject: x86/pm: Save the MSR validity status at context setup The mechanism to save/restore MSRs during S3 suspend/resume checks for the MSR validity during suspend, and only restores the MSR if its a valid MSR. This is not optimal, as an invalid MSR will unnecessarily throw an exception for every suspend cycle. The more invalid MSRs, higher the impact will be. Check and save the MSR validity at setup. This ensures that only valid MSRs that are guaranteed to not throw an exception will be attempted during suspend. Fixes: 7a9c2dd08ead ("x86/pm: Introduce quirk framework to save/restore extra MSR registers around suspend/resume") Suggested-by: Dave Hansen Signed-off-by: Pawan Gupta Reviewed-by: Dave Hansen Acked-by: Borislav Petkov Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- arch/x86/power/cpu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 9f2b251e83c5..eaec0cb3fe04 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -40,7 +40,8 @@ static void msr_save_context(struct saved_context *ctxt) struct saved_msr *end = msr + ctxt->saved_msrs.num; while (msr < end) { - msr->valid = !rdmsrl_safe(msr->info.msr_no, &msr->info.reg.q); + if (msr->valid) + rdmsrl(msr->info.msr_no, msr->info.reg.q); msr++; } } @@ -424,8 +425,10 @@ static int msr_build_context(const u32 *msr_id, const int num) } for (i = saved_msrs->num, j = 0; i < total_num; i++, j++) { + u64 dummy; + msr_array[i].info.msr_no = msr_id[j]; - msr_array[i].valid = false; + msr_array[i].valid = !rdmsrl_safe(msr_id[j], &dummy); msr_array[i].info.reg.q = 0; } saved_msrs->num = total_num; -- cgit v1.2.3 From e2a1256b17b16f9b9adf1b6fea56819e7b68e463 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 4 Apr 2022 17:35:45 -0700 Subject: x86/speculation: Restore speculation related MSRs during S3 resume After resuming from suspend-to-RAM, the MSRs that control CPU's speculative execution behavior are not being restored on the boot CPU. These MSRs are used to mitigate speculative execution vulnerabilities. Not restoring them correctly may leave the CPU vulnerable. Secondary CPU's MSRs are correctly being restored at S3 resume by identify_secondary_cpu(). During S3 resume, restore these MSRs for boot CPU when restoring its processor state. Fixes: 772439717dbf ("x86/bugs/intel: Set proper CPU features and setup RDS") Reported-by: Neelima Krishnan Signed-off-by: Pawan Gupta Tested-by: Neelima Krishnan Acked-by: Borislav Petkov Reviewed-by: Dave Hansen Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- arch/x86/power/cpu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index eaec0cb3fe04..3822666fb73d 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -503,10 +503,24 @@ static int pm_cpu_check(const struct x86_cpu_id *c) return ret; } +static void pm_save_spec_msr(void) +{ + u32 spec_msr_id[] = { + MSR_IA32_SPEC_CTRL, + MSR_IA32_TSX_CTRL, + MSR_TSX_FORCE_ABORT, + MSR_IA32_MCU_OPT_CTRL, + MSR_AMD64_LS_CFG, + }; + + msr_build_context(spec_msr_id, ARRAY_SIZE(spec_msr_id)); +} + static int pm_check_save_msr(void) { dmi_check_system(msr_save_dmi_table); pm_cpu_check(msr_save_cpu_table); + pm_save_spec_msr(); return 0; } -- cgit v1.2.3 From 1d7e4fd72bb9be080c23a099b0dff1007109fc2b Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 5 Apr 2022 08:59:36 +0200 Subject: net: micrel: Fix KS8851 Kconfig KS8851 selects MICREL_PHY, which depends on PTP_1588_CLOCK_OPTIONAL, so make KS8851 also depend on PTP_1588_CLOCK_OPTIONAL. Fixes kconfig warning and build errors: WARNING: unmet direct dependencies detected for MICREL_PHY Depends on [m]: NETDEVICES [=y] && PHYLIB [=y] && PTP_1588_CLOCK_OPTIONAL [=m] Selected by [y]: - KS8851 [=y] && NETDEVICES [=y] && ETHERNET [=y] && NET_VENDOR_MICREL [=y] && SPI [=y] ld.lld: error: undefined symbol: ptp_clock_register referenced by micrel.c net/phy/micrel.o:(lan8814_probe) in archive drivers/built-in.a ld.lld: error: undefined symbol: ptp_clock_index referenced by micrel.c net/phy/micrel.o:(lan8814_ts_info) in archive drivers/built-in.a Reported-by: kernel test robot Fixes: ece19502834d ("net: phy: micrel: 1588 support for LAN8814 phy") Signed-off-by: Horatiu Vultur Tested-by: Randy Dunlap Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20220405065936.4105272-1-horatiu.vultur@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/micrel/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/micrel/Kconfig b/drivers/net/ethernet/micrel/Kconfig index 1b632cdd7630..830363bafcce 100644 --- a/drivers/net/ethernet/micrel/Kconfig +++ b/drivers/net/ethernet/micrel/Kconfig @@ -28,6 +28,7 @@ config KS8842 config KS8851 tristate "Micrel KS8851 SPI" depends on SPI + depends on PTP_1588_CLOCK_OPTIONAL select MII select CRC32 select EEPROM_93CX6 -- cgit v1.2.3 From 3f2a3050b4a3e7f32fc0ea3c9b0183090ae00522 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 4 Apr 2022 12:41:50 +0200 Subject: net: openvswitch: don't send internal clone attribute to the userspace. 'OVS_CLONE_ATTR_EXEC' is an internal attribute that is used for performance optimization inside the kernel. It's added by the kernel while parsing user-provided actions and should not be sent during the flow dump as it's not part of the uAPI. The issue doesn't cause any significant problems to the ovs-vswitchd process, because reported actions are not really used in the application lifecycle and only supposed to be shown to a human via ovs-dpctl flow dump. However, the action list is still incorrect and causes the following error if the user wants to look at the datapath flows: # ovs-dpctl add-dp system@ovs-system # ovs-dpctl add-flow "" "clone(ct(commit),0)" # ovs-dpctl dump-flows , packets:0, bytes:0, used:never, actions:clone(bad length 4, expected -1 for: action0(01 00 00 00), ct(commit),0) With the fix: # ovs-dpctl dump-flows , packets:0, bytes:0, used:never, actions:clone(ct(commit),0) Additionally fixed an incorrect attribute name in the comment. Fixes: b233504033db ("openvswitch: kernel datapath clone action") Signed-off-by: Ilya Maximets Acked-by: Aaron Conole Link: https://lore.kernel.org/r/20220404104150.2865736-1-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- net/openvswitch/actions.c | 2 +- net/openvswitch/flow_netlink.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 7056cb1b8ba0..1b5d73079dc9 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1051,7 +1051,7 @@ static int clone(struct datapath *dp, struct sk_buff *skb, int rem = nla_len(attr); bool dont_clone_flow_key; - /* The first action is always 'OVS_CLONE_ATTR_ARG'. */ + /* The first action is always 'OVS_CLONE_ATTR_EXEC'. */ clone_arg = nla_data(attr); dont_clone_flow_key = nla_get_u32(clone_arg); actions = nla_next(clone_arg, &rem); diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index cc282a58b75b..dbdcaaa27f5b 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -3458,7 +3458,9 @@ static int clone_action_to_attr(const struct nlattr *attr, if (!start) return -EMSGSIZE; - err = ovs_nla_put_actions(nla_data(attr), rem, skb); + /* Skipping the OVS_CLONE_ATTR_EXEC that is always the first attribute. */ + attr = nla_next(nla_data(attr), &rem); + err = ovs_nla_put_actions(attr, rem, skb); if (err) nla_nest_cancel(skb, start); -- cgit v1.2.3 From 11f8e7c122ce013fa745029fa8c94c6db69c2e54 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 5 Apr 2022 02:04:04 +0200 Subject: net: ethernet: mv643xx: Fix over zealous checking of_get_mac_address() There is often not a MAC address available in an EEPROM accessible by Linux with Marvell devices. Instead the bootload has the MAC address and directly programs it into the hardware. So don't consider an error from of_get_mac_address() has fatal. However, the check was added for the case where there is a MAC address in an the EEPROM, but the EEPROM has not probed yet, and -EPROBE_DEFER is returned. In that case the error should be returned. So make the check specific to this error code. Cc: Mauri Sandberg Reported-by: Thomas Walther Fixes: 42404d8f1c01 ("net: mv643xx_eth: process retval from of_get_mac_address") Signed-off-by: Andrew Lunn Link: https://lore.kernel.org/r/20220405000404.3374734-1-andrew@lunn.ch Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mv643xx_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 5f9ab1842d49..c18801490649 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -2751,7 +2751,7 @@ static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev, } ret = of_get_mac_address(pnp, ppd.mac_addr); - if (ret) + if (ret == -EPROBE_DEFER) return ret; mv643xx_eth_property(pnp, "tx-queue-size", ppd.tx_queue_size); -- cgit v1.2.3 From 55b014159ee7af63770cd7f2b6fe926f6dd99335 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 6 Apr 2022 10:57:51 +0900 Subject: ata: ahci: Rename CONFIG_SATA_LPM_POLICY configuration item back CONFIG_SATA_LPM_MOBILE_POLICY was renamed to CONFIG_SATA_LPM_POLICY in commit 4dd4d3deb502 ("ata: ahci: Rename CONFIG_SATA_LPM_MOBILE_POLICY configuration item"). This can potentially cause problems as users would invisibly lose configuration policy defaults when they built the new kernel. To avoid such problems, switch back to the old name (even if it's wrong). Suggested-by: Christoph Hellwig Suggested-by: Damien Le Moal Signed-off-by: Mario Limonciello Signed-off-by: Damien Le Moal --- drivers/ata/Kconfig | 6 ++++-- drivers/ata/ahci.c | 2 +- drivers/ata/ahci.h | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig index e5641e6c52ee..bb45a9c00514 100644 --- a/drivers/ata/Kconfig +++ b/drivers/ata/Kconfig @@ -115,14 +115,16 @@ config SATA_AHCI If unsure, say N. -config SATA_LPM_POLICY +config SATA_MOBILE_LPM_POLICY int "Default SATA Link Power Management policy for low power chipsets" range 0 4 default 0 depends on SATA_AHCI help Select the Default SATA Link Power Management (LPM) policy to use - for chipsets / "South Bridges" designated as supporting low power. + for chipsets / "South Bridges" supporting low-power modes. Such + chipsets are typically found on most laptops but desktops and + servers now also widely use chipsets supporting low power modes. The value set has the following meanings: 0 => Keep firmware settings diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 84456c05e845..397dfd27c90d 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1595,7 +1595,7 @@ static int ahci_init_msi(struct pci_dev *pdev, unsigned int n_ports, static void ahci_update_initial_lpm_policy(struct ata_port *ap, struct ahci_host_priv *hpriv) { - int policy = CONFIG_SATA_LPM_POLICY; + int policy = CONFIG_SATA_MOBILE_LPM_POLICY; /* Ignore processing for chipsets that don't use policy */ diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index 6ead58c1b6e5..ad11a4c52fbe 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -236,7 +236,7 @@ enum { AHCI_HFLAG_NO_WRITE_TO_RO = (1 << 24), /* don't write to read only registers */ AHCI_HFLAG_USE_LPM_POLICY = (1 << 25), /* chipset that should use - SATA_LPM_POLICY + SATA_MOBILE_LPM_POLICY as default lpm_policy */ AHCI_HFLAG_SUSPEND_PHYS = (1 << 26), /* handle PHYs during suspend/resume */ -- cgit v1.2.3 From 1f30fb9166d4f15a1aa19449b9da871fe0ed4796 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 4 Apr 2022 17:43:45 +0200 Subject: net: openvswitch: fix leak of nested actions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While parsing user-provided actions, openvswitch module may dynamically allocate memory and store pointers in the internal copy of the actions. So this memory has to be freed while destroying the actions. Currently there are only two such actions: ct() and set(). However, there are many actions that can hold nested lists of actions and ovs_nla_free_flow_actions() just jumps over them leaking the memory. For example, removal of the flow with the following actions will lead to a leak of the memory allocated by nf_ct_tmpl_alloc(): actions:clone(ct(commit),0) Non-freed set() action may also leak the 'dst' structure for the tunnel info including device references. Under certain conditions with a high rate of flow rotation that may cause significant memory leak problem (2MB per second in reporter's case). The problem is also hard to mitigate, because the user doesn't have direct control over the datapath flows generated by OVS. Fix that by iterating over all the nested actions and freeing everything that needs to be freed recursively. New build time assertion should protect us from this problem if new actions will be added in the future. Unfortunately, openvswitch module doesn't use NLA_F_NESTED, so all attributes has to be explicitly checked. sample() and clone() actions are mixing extra attributes into the user-provided action list. That prevents some code generalization too. Fixes: 34ae932a4036 ("openvswitch: Make tunnel set action attach a metadata dst") Link: https://mail.openvswitch.org/pipermail/ovs-dev/2022-March/392922.html Reported-by: Stéphane Graber Signed-off-by: Ilya Maximets Acked-by: Aaron Conole Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 95 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 5 deletions(-) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index dbdcaaa27f5b..7176156d3844 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2317,6 +2317,62 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size) return sfa; } +static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len); + +static void ovs_nla_free_check_pkt_len_action(const struct nlattr *action) +{ + const struct nlattr *a; + int rem; + + nla_for_each_nested(a, action, rem) { + switch (nla_type(a)) { + case OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL: + case OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER: + ovs_nla_free_nested_actions(nla_data(a), nla_len(a)); + break; + } + } +} + +static void ovs_nla_free_clone_action(const struct nlattr *action) +{ + const struct nlattr *a = nla_data(action); + int rem = nla_len(action); + + switch (nla_type(a)) { + case OVS_CLONE_ATTR_EXEC: + /* The real list of actions follows this attribute. */ + a = nla_next(a, &rem); + ovs_nla_free_nested_actions(a, rem); + break; + } +} + +static void ovs_nla_free_dec_ttl_action(const struct nlattr *action) +{ + const struct nlattr *a = nla_data(action); + + switch (nla_type(a)) { + case OVS_DEC_TTL_ATTR_ACTION: + ovs_nla_free_nested_actions(nla_data(a), nla_len(a)); + break; + } +} + +static void ovs_nla_free_sample_action(const struct nlattr *action) +{ + const struct nlattr *a = nla_data(action); + int rem = nla_len(action); + + switch (nla_type(a)) { + case OVS_SAMPLE_ATTR_ARG: + /* The real list of actions follows this attribute. */ + a = nla_next(a, &rem); + ovs_nla_free_nested_actions(a, rem); + break; + } +} + static void ovs_nla_free_set_action(const struct nlattr *a) { const struct nlattr *ovs_key = nla_data(a); @@ -2330,25 +2386,54 @@ static void ovs_nla_free_set_action(const struct nlattr *a) } } -void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len) { const struct nlattr *a; int rem; - if (!sf_acts) + /* Whenever new actions are added, the need to update this + * function should be considered. + */ + BUILD_BUG_ON(OVS_ACTION_ATTR_MAX != 23); + + if (!actions) return; - nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) { + nla_for_each_attr(a, actions, len, rem) { switch (nla_type(a)) { - case OVS_ACTION_ATTR_SET: - ovs_nla_free_set_action(a); + case OVS_ACTION_ATTR_CHECK_PKT_LEN: + ovs_nla_free_check_pkt_len_action(a); + break; + + case OVS_ACTION_ATTR_CLONE: + ovs_nla_free_clone_action(a); break; + case OVS_ACTION_ATTR_CT: ovs_ct_free_action(a); break; + + case OVS_ACTION_ATTR_DEC_TTL: + ovs_nla_free_dec_ttl_action(a); + break; + + case OVS_ACTION_ATTR_SAMPLE: + ovs_nla_free_sample_action(a); + break; + + case OVS_ACTION_ATTR_SET: + ovs_nla_free_set_action(a); + break; } } +} + +void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +{ + if (!sf_acts) + return; + ovs_nla_free_nested_actions(sf_acts->actions, sf_acts->actions_len); kfree(sf_acts); } -- cgit v1.2.3 From 1946014ca3b19be9e485e780e862c375c6f98bad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Apr 2022 11:34:39 -0700 Subject: rxrpc: fix a race in rxrpc_exit_net() Current code can lead to the following race: CPU0 CPU1 rxrpc_exit_net() rxrpc_peer_keepalive_worker() if (rxnet->live) rxnet->live = false; del_timer_sync(&rxnet->peer_keepalive_timer); timer_reduce(&rxnet->peer_keepalive_timer, jiffies + delay); cancel_work_sync(&rxnet->peer_keepalive_work); rxrpc_exit_net() exits while peer_keepalive_timer is still armed, leading to use-after-free. syzbot report was: ODEBUG: free active (active state 0) object type: timer_list hint: rxrpc_peer_keepalive_timeout+0x0/0xb0 WARNING: CPU: 0 PID: 3660 at lib/debugobjects.c:505 debug_print_object+0x16e/0x250 lib/debugobjects.c:505 Modules linked in: CPU: 0 PID: 3660 Comm: kworker/u4:6 Not tainted 5.17.0-syzkaller-13993-g88e6c0207623 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: netns cleanup_net RIP: 0010:debug_print_object+0x16e/0x250 lib/debugobjects.c:505 Code: ff df 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 af 00 00 00 48 8b 14 dd 00 1c 26 8a 4c 89 ee 48 c7 c7 00 10 26 8a e8 b1 e7 28 05 <0f> 0b 83 05 15 eb c5 09 01 48 83 c4 18 5b 5d 41 5c 41 5d 41 5e c3 RSP: 0018:ffffc9000353fb00 EFLAGS: 00010082 RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000000 RDX: ffff888029196140 RSI: ffffffff815efad8 RDI: fffff520006a7f52 RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff815ea4ae R11: 0000000000000000 R12: ffffffff89ce23e0 R13: ffffffff8a2614e0 R14: ffffffff816628c0 R15: dffffc0000000000 FS: 0000000000000000(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe1f2908924 CR3: 0000000043720000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __debug_check_no_obj_freed lib/debugobjects.c:992 [inline] debug_check_no_obj_freed+0x301/0x420 lib/debugobjects.c:1023 kfree+0xd6/0x310 mm/slab.c:3809 ops_free_list.part.0+0x119/0x370 net/core/net_namespace.c:176 ops_free_list net/core/net_namespace.c:174 [inline] cleanup_net+0x591/0xb00 net/core/net_namespace.c:598 process_one_work+0x996/0x1610 kernel/workqueue.c:2289 worker_thread+0x665/0x1080 kernel/workqueue.c:2436 kthread+0x2e9/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:298 Fixes: ace45bec6d77 ("rxrpc: Fix firewall route keepalive") Signed-off-by: Eric Dumazet Cc: David Howells Cc: Marc Dionne Cc: linux-afs@lists.infradead.org Reported-by: syzbot Signed-off-by: David S. Miller --- net/rxrpc/net_ns.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 25bbc4cc8b13..f15d6942da45 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -113,8 +113,8 @@ static __net_exit void rxrpc_exit_net(struct net *net) struct rxrpc_net *rxnet = rxrpc_net(net); rxnet->live = false; - del_timer_sync(&rxnet->peer_keepalive_timer); cancel_work_sync(&rxnet->peer_keepalive_work); + del_timer_sync(&rxnet->peer_keepalive_timer); rxrpc_destroy_all_calls(rxnet); rxrpc_destroy_all_connections(rxnet); rxrpc_destroy_all_peers(rxnet); -- cgit v1.2.3 From fb5833d81e4333294add35d3ac7f7f52a7bf107f Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 5 Apr 2022 08:45:44 +0000 Subject: net: sfc: fix using uninitialized xdp tx_queue In some cases, xdp tx_queue can get used before initialization. 1. interface up/down 2. ring buffer size change When CPU cores are lower than maximum number of channels of sfc driver, it creates new channels only for XDP. When an interface is up or ring buffer size is changed, all channels are initialized. But xdp channels are always initialized later. So, the below scenario is possible. Packets are received to rx queue of normal channels and it is acted XDP_TX and tx_queue of xdp channels get used. But these tx_queues are not initialized yet. If so, TX DMA or queue error occurs. In order to avoid this problem. 1. initializes xdp tx_queues earlier than other rx_queue in efx_start_channels(). 2. checks whether tx_queue is initialized or not in efx_xdp_tx_buffers(). Splat looks like: sfc 0000:08:00.1 enp8s0f1np1: TX queue 10 spurious TX completion id 250 sfc 0000:08:00.1 enp8s0f1np1: resetting (RECOVER_OR_ALL) sfc 0000:08:00.1 enp8s0f1np1: MC command 0x80 inlen 100 failed rc=-22 (raw=22) arg=789 sfc 0000:08:00.1 enp8s0f1np1: has been disabled Fixes: f28100cb9c96 ("sfc: fix lack of XDP TX queues - error XDP TX failed (-22)") Acked-by: Martin Habets Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/efx_channels.c | 2 +- drivers/net/ethernet/sfc/tx.c | 3 +++ drivers/net/ethernet/sfc/tx_common.c | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/efx_channels.c b/drivers/net/ethernet/sfc/efx_channels.c index 83e27231fbe6..377df8b7f015 100644 --- a/drivers/net/ethernet/sfc/efx_channels.c +++ b/drivers/net/ethernet/sfc/efx_channels.c @@ -1140,7 +1140,7 @@ void efx_start_channels(struct efx_nic *efx) struct efx_rx_queue *rx_queue; struct efx_channel *channel; - efx_for_each_channel(channel, efx) { + efx_for_each_channel_rev(channel, efx) { efx_for_each_channel_tx_queue(tx_queue, channel) { efx_init_tx_queue(tx_queue); atomic_inc(&efx->active_queues); diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c index d16e031e95f4..6983799e1c05 100644 --- a/drivers/net/ethernet/sfc/tx.c +++ b/drivers/net/ethernet/sfc/tx.c @@ -443,6 +443,9 @@ int efx_xdp_tx_buffers(struct efx_nic *efx, int n, struct xdp_frame **xdpfs, if (unlikely(!tx_queue)) return -EINVAL; + if (!tx_queue->initialised) + return -EINVAL; + if (efx->xdp_txq_queues_mode != EFX_XDP_TX_QUEUES_DEDICATED) HARD_TX_LOCK(efx->net_dev, tx_queue->core_txq, cpu); diff --git a/drivers/net/ethernet/sfc/tx_common.c b/drivers/net/ethernet/sfc/tx_common.c index d530cde2b864..9bc8281b7f5b 100644 --- a/drivers/net/ethernet/sfc/tx_common.c +++ b/drivers/net/ethernet/sfc/tx_common.c @@ -101,6 +101,8 @@ void efx_fini_tx_queue(struct efx_tx_queue *tx_queue) netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev, "shutting down TX queue %d\n", tx_queue->queue); + tx_queue->initialised = false; + if (!tx_queue->buffer) return; -- cgit v1.2.3 From d1c4f93e3f0a023024a6f022a61528c06cf1daa9 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Tue, 5 Apr 2022 17:19:26 +0800 Subject: net: axienet: setup mdio unconditionally The call to axienet_mdio_setup should not depend on whether "phy-node" pressents on the DT. Besides, since `lp->phy_node` is used if PHY is in SGMII or 100Base-X modes, move it into the if statement. And the next patch will remove `lp->phy_node` from driver's private structure and do an of_node_put on it right away after use since it is not used elsewhere. Signed-off-by: Andy Chiu Reviewed-by: Greentime Hu Reviewed-by: Robert Hancock Reviewed-by: Radhey Shyam Pandey Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index c7eb05e4a6bf..78a991bbbcf9 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -2064,15 +2064,14 @@ static int axienet_probe(struct platform_device *pdev) if (ret) goto cleanup_clk; - lp->phy_node = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); - if (lp->phy_node) { - ret = axienet_mdio_setup(lp); - if (ret) - dev_warn(&pdev->dev, - "error registering MDIO bus: %d\n", ret); - } + ret = axienet_mdio_setup(lp); + if (ret) + dev_warn(&pdev->dev, + "error registering MDIO bus: %d\n", ret); + if (lp->phy_mode == PHY_INTERFACE_MODE_SGMII || lp->phy_mode == PHY_INTERFACE_MODE_1000BASEX) { + lp->phy_node = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); if (!lp->phy_node) { dev_err(&pdev->dev, "phy-handle required for 1000BaseX/SGMII\n"); ret = -EINVAL; -- cgit v1.2.3 From ab3a5d4c6081dbcfd90d19cc9849af89c6985d0f Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Tue, 5 Apr 2022 17:19:27 +0800 Subject: net: axienet: factor out phy_node in struct axienet_local the struct member `phy_node` of struct axienet_local is not used by the driver anymore after initialization. It might be a remnent of old code and could be removed. Signed-off-by: Andy Chiu Reviewed-by: Greentime Hu Reviewed-by: Robert Hancock Reviewed-by: Radhey Shyam Pandey Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_axienet.h | 2 -- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 13 +++++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet.h b/drivers/net/ethernet/xilinx/xilinx_axienet.h index 0f9c88dd1a4a..d5c1e5c4a508 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet.h +++ b/drivers/net/ethernet/xilinx/xilinx_axienet.h @@ -433,8 +433,6 @@ struct axienet_local { struct net_device *ndev; struct device *dev; - struct device_node *phy_node; - struct phylink *phylink; struct phylink_config phylink_config; diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 78a991bbbcf9..3daef64a85bd 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -2071,17 +2071,19 @@ static int axienet_probe(struct platform_device *pdev) if (lp->phy_mode == PHY_INTERFACE_MODE_SGMII || lp->phy_mode == PHY_INTERFACE_MODE_1000BASEX) { - lp->phy_node = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); - if (!lp->phy_node) { + np = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); + if (!np) { dev_err(&pdev->dev, "phy-handle required for 1000BaseX/SGMII\n"); ret = -EINVAL; goto cleanup_mdio; } - lp->pcs_phy = of_mdio_find_device(lp->phy_node); + lp->pcs_phy = of_mdio_find_device(np); if (!lp->pcs_phy) { ret = -EPROBE_DEFER; + of_node_put(np); goto cleanup_mdio; } + of_node_put(np); lp->pcs.ops = &axienet_pcs_ops; lp->pcs.poll = true; } @@ -2124,8 +2126,6 @@ cleanup_mdio: put_device(&lp->pcs_phy->dev); if (lp->mii_bus) axienet_mdio_teardown(lp); - of_node_put(lp->phy_node); - cleanup_clk: clk_bulk_disable_unprepare(XAE_NUM_MISC_CLOCKS, lp->misc_clks); clk_disable_unprepare(lp->axi_clk); @@ -2154,9 +2154,6 @@ static int axienet_remove(struct platform_device *pdev) clk_bulk_disable_unprepare(XAE_NUM_MISC_CLOCKS, lp->misc_clks); clk_disable_unprepare(lp->axi_clk); - of_node_put(lp->phy_node); - lp->phy_node = NULL; - free_netdev(ndev); return 0; -- cgit v1.2.3 From dc48f04fd6562de6019e9fc7ed9ed539d632babb Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Tue, 5 Apr 2022 17:19:28 +0800 Subject: dt-bindings: net: add pcs-handle attribute Document the new pcs-handle attribute to support connecting to an external PHY. For Xilinx's AXI Ethernet, this is used when the core operates in SGMII or 1000Base-X modes and links through the internal PCS/PMA PHY. Signed-off-by: Andy Chiu Reviewed-by: Greentime Hu Reviewed-by: Rob Herring Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/ethernet-controller.yaml | 6 ++++++ Documentation/devicetree/bindings/net/xilinx_axienet.txt | 8 +++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml index 817794e56227..4f15463611f8 100644 --- a/Documentation/devicetree/bindings/net/ethernet-controller.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml @@ -106,6 +106,12 @@ properties: phy-mode: $ref: "#/properties/phy-connection-type" + pcs-handle: + $ref: /schemas/types.yaml#/definitions/phandle + description: + Specifies a reference to a node representing a PCS PHY device on a MDIO + bus to link with an external PHY (phy-handle) if exists. + phy-handle: $ref: /schemas/types.yaml#/definitions/phandle description: diff --git a/Documentation/devicetree/bindings/net/xilinx_axienet.txt b/Documentation/devicetree/bindings/net/xilinx_axienet.txt index b8e4894bc634..1aa4c6006cd0 100644 --- a/Documentation/devicetree/bindings/net/xilinx_axienet.txt +++ b/Documentation/devicetree/bindings/net/xilinx_axienet.txt @@ -26,7 +26,8 @@ Required properties: specified, the TX/RX DMA interrupts should be on that node instead, and only the Ethernet core interrupt is optionally specified here. -- phy-handle : Should point to the external phy device. +- phy-handle : Should point to the external phy device if exists. Pointing + this to the PCS/PMA PHY is deprecated and should be avoided. See ethernet.txt file in the same directory. - xlnx,rxmem : Set to allocated memory buffer for Rx/Tx in the hardware @@ -68,6 +69,11 @@ Optional properties: required through the core's MDIO interface (i.e. always, unless the PHY is accessed through a different bus). + - pcs-handle: Phandle to the internal PCS/PMA PHY in SGMII or 1000Base-X + modes, where "pcs-handle" should be used to point + to the PCS/PMA PHY, and "phy-handle" should point to an + external PHY if exists. + Example: axi_ethernet_eth: ethernet@40c00000 { compatible = "xlnx,axi-ethernet-1.00.a"; -- cgit v1.2.3 From 19c7a43912c61a3bcc09f220cd8681d35c1bec79 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Tue, 5 Apr 2022 17:19:29 +0800 Subject: net: axiemac: use a phandle to reference pcs_phy In some SGMII use cases where both a fixed link external PHY and the internal PCS/PMA PHY need to be configured, we should explicitly use a phandle "pcs-phy" to get the reference to the PCS/PMA PHY. Otherwise, the driver would use "phy-handle" in the DT as the reference to both the external and the internal PCS/PMA PHY. In other cases where the core is connected to a SFP cage, we could still point phy-handle to the intenal PCS/PMA PHY, and let the driver connect to the SFP module, if exist, via phylink. Signed-off-by: Andy Chiu Reviewed-by: Greentime Hu Reviewed-by: Robert Hancock Reviewed-by: Andrew Lunn Reviewed-by: Radhey Shyam Pandey Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 3daef64a85bd..d6fc3f7acdf0 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -2071,9 +2071,16 @@ static int axienet_probe(struct platform_device *pdev) if (lp->phy_mode == PHY_INTERFACE_MODE_SGMII || lp->phy_mode == PHY_INTERFACE_MODE_1000BASEX) { - np = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); + np = of_parse_phandle(pdev->dev.of_node, "pcs-handle", 0); if (!np) { - dev_err(&pdev->dev, "phy-handle required for 1000BaseX/SGMII\n"); + /* Deprecated: Always use "pcs-handle" for pcs_phy. + * Falling back to "phy-handle" here is only for + * backward compatibility with old device trees. + */ + np = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); + } + if (!np) { + dev_err(&pdev->dev, "pcs-handle (preferred) or phy-handle required for 1000BaseX/SGMII\n"); ret = -EINVAL; goto cleanup_mdio; } -- cgit v1.2.3 From 8d90991e5bf7fdb9f264f5f579d18969913054b7 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 5 Apr 2022 14:02:33 +0200 Subject: net: phy: mscc-miim: reject clause 45 register accesses The driver doesn't support clause 45 register access yet, but doesn't check if the access is a c45 one either. This leads to spurious register reads and writes. Add the check. Fixes: 542671fe4d86 ("net: phy: mscc-miim: Add MDIO driver") Signed-off-by: Michael Walle Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/mdio/mdio-mscc-miim.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/mdio/mdio-mscc-miim.c b/drivers/net/mdio/mdio-mscc-miim.c index c483ba67c21f..582969751b4c 100644 --- a/drivers/net/mdio/mdio-mscc-miim.c +++ b/drivers/net/mdio/mdio-mscc-miim.c @@ -102,6 +102,9 @@ static int mscc_miim_read(struct mii_bus *bus, int mii_id, int regnum) u32 val; int ret; + if (regnum & MII_ADDR_C45) + return -EOPNOTSUPP; + ret = mscc_miim_wait_pending(bus); if (ret) goto out; @@ -145,6 +148,9 @@ static int mscc_miim_write(struct mii_bus *bus, int mii_id, struct mscc_miim_dev *miim = bus->priv; int ret; + if (regnum & MII_ADDR_C45) + return -EOPNOTSUPP; + ret = mscc_miim_wait_pending(bus); if (ret < 0) goto out; -- cgit v1.2.3 From aba120cc101788544aa3e2c30c8da88513892350 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 5 Apr 2022 16:40:51 +0200 Subject: random: do not allow user to keep crng key around on stack The fast key erasure RNG design relies on the key that's used to be used and then discarded. We do this, making judicious use of memzero_explicit(). However, reads to /dev/urandom and calls to getrandom() involve a copy_to_user(), and userspace can use FUSE or userfaultfd, or make a massive call, dynamically remap memory addresses as it goes, and set the process priority to idle, in order to keep a kernel stack alive indefinitely. By probing /proc/sys/kernel/random/entropy_avail to learn when the crng key is refreshed, a malicious userspace could mount this attack every 5 minutes thereafter, breaking the crng's forward secrecy. In order to fix this, we just overwrite the stack's key with the first 32 bytes of the "free" fast key erasure output. If we're returning <= 32 bytes to the user, then we can still return those bytes directly, so that short reads don't become slower. And for long reads, the difference is hopefully lost in the amortization, so it doesn't change much, with that amortization helping variously for medium reads. We don't need to do this for get_random_bytes() and the various kernel-space callers, and later, if we ever switch to always batching, this won't be necessary either, so there's no need to change the API of these functions. Cc: Theodore Ts'o Reviewed-by: Jann Horn Fixes: c92e040d575a ("random: add backtracking protection to the CRNG") Fixes: 186873c549df ("random: use simpler fast key erasure flow on per-cpu keys") Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 388025d6d38d..47f01b1482a9 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -532,19 +532,29 @@ static ssize_t get_random_bytes_user(void __user *buf, size_t nbytes) if (!nbytes) return 0; - len = min_t(size_t, 32, nbytes); - crng_make_state(chacha_state, output, len); - - if (copy_to_user(buf, output, len)) - return -EFAULT; - nbytes -= len; - buf += len; - ret += len; + /* + * Immediately overwrite the ChaCha key at index 4 with random + * bytes, in case userspace causes copy_to_user() below to sleep + * forever, so that we still retain forward secrecy in that case. + */ + crng_make_state(chacha_state, (u8 *)&chacha_state[4], CHACHA_KEY_SIZE); + /* + * However, if we're doing a read of len <= 32, we don't need to + * use chacha_state after, so we can simply return those bytes to + * the user directly. + */ + if (nbytes <= CHACHA_KEY_SIZE) { + ret = copy_to_user(buf, &chacha_state[4], nbytes) ? -EFAULT : nbytes; + goto out_zero_chacha; + } - while (nbytes) { + do { if (large_request && need_resched()) { - if (signal_pending(current)) + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; break; + } schedule(); } @@ -561,10 +571,11 @@ static ssize_t get_random_bytes_user(void __user *buf, size_t nbytes) nbytes -= len; buf += len; ret += len; - } + } while (nbytes); - memzero_explicit(chacha_state, sizeof(chacha_state)); memzero_explicit(output, sizeof(output)); +out_zero_chacha: + memzero_explicit(chacha_state, sizeof(chacha_state)); return ret; } -- cgit v1.2.3 From 1448769c9cdb69ad65287f4f7ab58bc5f2f5d7ba Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 5 Apr 2022 18:39:31 +0200 Subject: random: check for signal_pending() outside of need_resched() check signal_pending() checks TIF_NOTIFY_SIGNAL and TIF_SIGPENDING, which signal that the task should bail out of the syscall when possible. This is a separate concept from need_resched(), which checks TIF_NEED_RESCHED, signaling that the task should preempt. In particular, with the current code, the signal_pending() bailout probably won't work reliably. Change this to look like other functions that read lots of data, such as read_zero(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Jann Horn Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 47f01b1482a9..394cbd814a0b 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -549,13 +549,13 @@ static ssize_t get_random_bytes_user(void __user *buf, size_t nbytes) } do { - if (large_request && need_resched()) { + if (large_request) { if (signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; break; } - schedule(); + cond_resched(); } chacha20_block(chacha_state, output); -- cgit v1.2.3 From b3d6dd09ff00fdcf4f7c0cb54700ffd5dd343502 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 25 Mar 2022 10:32:11 +0800 Subject: Drivers: hv: balloon: Support status report for larger page sizes DM_STATUS_REPORT expects the numbers of pages in the unit of 4k pages (HV_HYP_PAGE) instead of guest pages, so to make it work when guest page sizes are larger than 4k, convert the numbers of guest pages into the numbers of HV_HYP_PAGEs. Note that the numbers of guest pages are still used for tracing because tracing is internal to the guest kernel. Reported-by: Vitaly Kuznetsov Signed-off-by: Boqun Feng Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220325023212.1570049-2-boqun.feng@gmail.com Signed-off-by: Wei Liu --- drivers/hv/hv_balloon.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index f2d05bff4245..062156b88a87 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1130,6 +1131,7 @@ static void post_status(struct hv_dynmem_device *dm) struct dm_status status; unsigned long now = jiffies; unsigned long last_post = last_post_time; + unsigned long num_pages_avail, num_pages_committed; if (pressure_report_delay > 0) { --pressure_report_delay; @@ -1154,16 +1156,21 @@ static void post_status(struct hv_dynmem_device *dm) * num_pages_onlined) as committed to the host, otherwise it can try * asking us to balloon them out. */ - status.num_avail = si_mem_available(); - status.num_committed = vm_memory_committed() + + num_pages_avail = si_mem_available(); + num_pages_committed = vm_memory_committed() + dm->num_pages_ballooned + (dm->num_pages_added > dm->num_pages_onlined ? dm->num_pages_added - dm->num_pages_onlined : 0) + compute_balloon_floor(); - trace_balloon_status(status.num_avail, status.num_committed, + trace_balloon_status(num_pages_avail, num_pages_committed, vm_memory_committed(), dm->num_pages_ballooned, dm->num_pages_added, dm->num_pages_onlined); + + /* Convert numbers of pages into numbers of HV_HYP_PAGEs. */ + status.num_avail = num_pages_avail * NR_HV_HYP_PAGES_IN_PAGE; + status.num_committed = num_pages_committed * NR_HV_HYP_PAGES_IN_PAGE; + /* * If our transaction ID is no longer current, just don't * send the status. This can happen if we were interrupted -- cgit v1.2.3 From be5802795cf8d0b881745fa9ba7790293b382280 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 25 Mar 2022 10:32:12 +0800 Subject: Drivers: hv: balloon: Disable balloon and hot-add accordingly Currently there are known potential issues for balloon and hot-add on ARM64: * Unballoon requests from Hyper-V should only unballoon ranges that are guest page size aligned, otherwise guests cannot handle because it's impossible to partially free a page. This is a problem when guest page size > 4096 bytes. * Memory hot-add requests from Hyper-V should provide the NUMA node id of the added ranges or ARM64 should have a functional memory_add_physaddr_to_nid(), otherwise the node id is missing for add_memory(). These issues require discussions on design and implementation. In the meanwhile, post_status() is working and essential to guest monitoring. Therefore instead of disabling the entire hv_balloon driver, the ballooning (when page size > 4096 bytes) and hot-add are disabled accordingly for now. Once the issues are fixed, they can be re-enable in these cases. Signed-off-by: Boqun Feng Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220325023212.1570049-3-boqun.feng@gmail.com Signed-off-by: Wei Liu --- drivers/hv/hv_balloon.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 062156b88a87..eee7402cfc02 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -1660,6 +1660,38 @@ static void disable_page_reporting(void) } } +static int ballooning_enabled(void) +{ + /* + * Disable ballooning if the page size is not 4k (HV_HYP_PAGE_SIZE), + * since currently it's unclear to us whether an unballoon request can + * make sure all page ranges are guest page size aligned. + */ + if (PAGE_SIZE != HV_HYP_PAGE_SIZE) { + pr_info("Ballooning disabled because page size is not 4096 bytes\n"); + return 0; + } + + return 1; +} + +static int hot_add_enabled(void) +{ + /* + * Disable hot add on ARM64, because we currently rely on + * memory_add_physaddr_to_nid() to get a node id of a hot add range, + * however ARM64's memory_add_physaddr_to_nid() always return 0 and + * DM_MEM_HOT_ADD_REQUEST doesn't have the NUMA node information for + * add_memory(). + */ + if (IS_ENABLED(CONFIG_ARM64)) { + pr_info("Memory hot add disabled on ARM64\n"); + return 0; + } + + return 1; +} + static int balloon_connect_vsp(struct hv_device *dev) { struct dm_version_request version_req; @@ -1731,8 +1763,8 @@ static int balloon_connect_vsp(struct hv_device *dev) * currently still requires the bits to be set, so we have to add code * to fail the host's hot-add and balloon up/down requests, if any. */ - cap_msg.caps.cap_bits.balloon = 1; - cap_msg.caps.cap_bits.hot_add = 1; + cap_msg.caps.cap_bits.balloon = ballooning_enabled(); + cap_msg.caps.cap_bits.hot_add = hot_add_enabled(); /* * Specify our alignment requirements as it relates -- cgit v1.2.3 From eaa03d34535872d29004cb5cf77dc9dec1ba9a25 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 28 Mar 2022 17:44:57 +0200 Subject: Drivers: hv: vmbus: Replace smp_store_mb() with virt_store_mb() Following the recommendation in Documentation/memory-barriers.txt for virtual machine guests. Fixes: 8b6a877c060ed ("Drivers: hv: vmbus: Replace the per-CPU channel lists with a global array of channels") Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20220328154457.100872-1-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 60375879612f..67be81208a2d 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -380,7 +380,7 @@ void vmbus_channel_map_relid(struct vmbus_channel *channel) * execute: * * (a) In the "normal (i.e., not resuming from hibernation)" path, - * the full barrier in smp_store_mb() guarantees that the store + * the full barrier in virt_store_mb() guarantees that the store * is propagated to all CPUs before the add_channel_work work * is queued. In turn, add_channel_work is queued before the * channel's ring buffer is allocated/initialized and the @@ -392,14 +392,14 @@ void vmbus_channel_map_relid(struct vmbus_channel *channel) * recv_int_page before retrieving the channel pointer from the * array of channels. * - * (b) In the "resuming from hibernation" path, the smp_store_mb() + * (b) In the "resuming from hibernation" path, the virt_store_mb() * guarantees that the store is propagated to all CPUs before * the VMBus connection is marked as ready for the resume event * (cf. check_ready_for_resume_event()). The interrupt handler * of the VMBus driver and vmbus_chan_sched() can not run before * vmbus_bus_resume() has completed execution (cf. resume_noirq). */ - smp_store_mb( + virt_store_mb( vmbus_connection.channels[channel->offermsg.child_relid], channel); } -- cgit v1.2.3 From a3ebe92a0f2dfaeac257b685531decf8c9cd8eee Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Apr 2022 12:04:45 +0200 Subject: net: ipv6mr: fix unused variable warning with CONFIG_IPV6_PIMSM_V2=n net/ipv6/ip6mr.c:1656:14: warning: unused variable 'do_wrmifwhole' Move it to the CONFIG_IPV6_PIMSM_V2 scope where its used. Fixes: 4b340a5a726d ("net: ip6mr: add support for passing full packet on wrong mif") Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/ip6mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index a9775c830194..4e74bc61a3db 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1653,7 +1653,6 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, mifi_t mifi; struct net *net = sock_net(sk); struct mr_table *mrt; - bool do_wrmifwhole; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) @@ -1761,6 +1760,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: { + bool do_wrmifwhole; int v; if (optlen != sizeof(v)) -- cgit v1.2.3 From 4e910dbe36508654a896d5735b318c0b88172570 Mon Sep 17 00:00:00 2001 From: Jamie Bainbridge Date: Wed, 6 Apr 2022 21:19:19 +1000 Subject: qede: confirm skb is allocated before using qede_build_skb() assumes build_skb() always works and goes straight to skb_reserve(). However, build_skb() can fail under memory pressure. This results in a kernel panic because the skb to reserve is NULL. Add a check in case build_skb() failed to allocate and return NULL. The NULL return is handled correctly in callers to qede_build_skb(). Fixes: 8a8633978b842 ("qede: Add build_skb() support.") Signed-off-by: Jamie Bainbridge Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qede/qede_fp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index b242000a77fd..b7cc36589f59 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -748,6 +748,9 @@ qede_build_skb(struct qede_rx_queue *rxq, buf = page_address(bd->data) + bd->page_offset; skb = build_skb(buf, rxq->rx_buf_seg_size); + if (unlikely(!skb)) + return NULL; + skb_reserve(skb, pad); skb_put(skb, len); -- cgit v1.2.3 From afb8e246527536848b9b4025b40e613edf776a9d Mon Sep 17 00:00:00 2001 From: Marcin Kozlowski Date: Wed, 6 Apr 2022 10:05:37 +0200 Subject: net: usb: aqc111: Fix out-of-bounds accesses in RX fixup aqc111_rx_fixup() contains several out-of-bounds accesses that can be triggered by a malicious (or defective) USB device, in particular: - The metadata array (desc_offset..desc_offset+2*pkt_count) can be out of bounds, causing OOB reads and (on big-endian systems) OOB endianness flips. - A packet can overlap the metadata array, causing a later OOB endianness flip to corrupt data used by a cloned SKB that has already been handed off into the network stack. - A packet SKB can be constructed whose tail is far beyond its end, causing out-of-bounds heap data to be considered part of the SKB's data. Found doing variant analysis. Tested it with another driver (ax88179_178a), since I don't have a aqc111 device to test it, but the code looks very similar. Signed-off-by: Marcin Kozlowski Signed-off-by: David S. Miller --- drivers/net/usb/aqc111.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/aqc111.c b/drivers/net/usb/aqc111.c index ea06d10e1c21..ca409d450a29 100644 --- a/drivers/net/usb/aqc111.c +++ b/drivers/net/usb/aqc111.c @@ -1102,10 +1102,15 @@ static int aqc111_rx_fixup(struct usbnet *dev, struct sk_buff *skb) if (start_of_descs != desc_offset) goto err; - /* self check desc_offset from header*/ - if (desc_offset >= skb_len) + /* self check desc_offset from header and make sure that the + * bounds of the metadata array are inside the SKB + */ + if (pkt_count * 2 + desc_offset >= skb_len) goto err; + /* Packets must not overlap the metadata array */ + skb_trim(skb, desc_offset); + if (pkt_count == 0) goto err; -- cgit v1.2.3 From b423e54ba965b4469b48e46fd16941f1e1701697 Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Wed, 6 Apr 2022 11:55:56 +0800 Subject: myri10ge: fix an incorrect free for skb in myri10ge_sw_tso All remaining skbs should be released when myri10ge_xmit fails to transmit a packet. Fix it within another skb_list_walk_safe. Signed-off-by: Xiaomeng Tong Signed-off-by: David S. Miller --- drivers/net/ethernet/myricom/myri10ge/myri10ge.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index 50ac3ee2577a..21d2645885ce 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -2903,11 +2903,9 @@ static netdev_tx_t myri10ge_sw_tso(struct sk_buff *skb, status = myri10ge_xmit(curr, dev); if (status != 0) { dev_kfree_skb_any(curr); - if (segs != NULL) { - curr = segs; - segs = next; + skb_list_walk_safe(next, curr, next) { curr->next = NULL; - dev_kfree_skb_any(segs); + dev_kfree_skb_any(curr); } goto drop; } -- cgit v1.2.3 From 2e8702cc0cfa1080f29fd64003c00a3e24ac38de Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 6 Apr 2022 15:41:12 +0300 Subject: bpf: Support dual-stack sockets in bpf_tcp_check_syncookie bpf_tcp_gen_syncookie looks at the IP version in the IP header and validates the address family of the socket. It supports IPv4 packets in AF_INET6 dual-stack sockets. On the other hand, bpf_tcp_check_syncookie looks only at the address family of the socket, ignoring the real IP version in headers, and validates only the packet size. This implementation has some drawbacks: 1. Packets are not validated properly, allowing a BPF program to trick bpf_tcp_check_syncookie into handling an IPv6 packet on an IPv4 socket. 2. Dual-stack sockets fail the checks on IPv4 packets. IPv4 clients end up receiving a SYNACK with the cookie, but the following ACK gets dropped. This patch fixes these issues by changing the checks in bpf_tcp_check_syncookie to match the ones in bpf_tcp_gen_syncookie. IP version from the header is taken into account, and it is validated properly with address family. Fixes: 399040847084 ("bpf: add helper to check for a valid SYN cookie") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Alexei Starovoitov Reviewed-by: Tariq Toukan Acked-by: Arthur Fabre Link: https://lore.kernel.org/bpf/20220406124113.2795730-1-maximmi@nvidia.com --- net/core/filter.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index a7044e98765e..64470a727ef7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7016,24 +7016,33 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (!th->ack || th->rst || th->syn) return -ENOENT; + if (unlikely(iph_len < sizeof(struct iphdr))) + return -EINVAL; + if (tcp_synq_no_recent_overflow(sk)) return -ENOENT; cookie = ntohl(th->ack_seq) - 1; - switch (sk->sk_family) { - case AF_INET: - if (unlikely(iph_len < sizeof(struct iphdr))) + /* Both struct iphdr and struct ipv6hdr have the version field at the + * same offset so we can cast to the shorter header (struct iphdr). + */ + switch (((struct iphdr *)iph)->version) { + case 4: + if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; ret = __cookie_v4_check((struct iphdr *)iph, th, cookie); break; #if IS_BUILTIN(CONFIG_IPV6) - case AF_INET6: + case 6: if (unlikely(iph_len < sizeof(struct ipv6hdr))) return -EINVAL; + if (sk->sk_family != AF_INET6) + return -EINVAL; + ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie); break; #endif /* CONFIG_IPV6 */ -- cgit v1.2.3 From 53968dafc4a6061c1e01d884f1f9a4b8c4b0d5bc Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 6 Apr 2022 15:41:13 +0300 Subject: bpf: Adjust bpf_tcp_check_syncookie selftest to test dual-stack sockets The previous commit fixed support for dual-stack sockets in bpf_tcp_check_syncookie. This commit adjusts the selftest to verify the fixed functionality. Signed-off-by: Maxim Mikityanskiy Signed-off-by: Alexei Starovoitov Acked-by: Arthur Fabre Link: https://lore.kernel.org/bpf/20220406124113.2795730-2-maximmi@nvidia.com --- .../selftests/bpf/test_tcp_check_syncookie_user.c | 78 ++++++++++++++++------ 1 file changed, 59 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c index b9e991d43155..e7775d3bbe08 100644 --- a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c +++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c @@ -18,8 +18,9 @@ #include "bpf_rlimit.h" #include "cgroup_helpers.h" -static int start_server(const struct sockaddr *addr, socklen_t len) +static int start_server(const struct sockaddr *addr, socklen_t len, bool dual) { + int mode = !dual; int fd; fd = socket(addr->sa_family, SOCK_STREAM, 0); @@ -28,6 +29,14 @@ static int start_server(const struct sockaddr *addr, socklen_t len) goto out; } + if (addr->sa_family == AF_INET6) { + if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, (char *)&mode, + sizeof(mode)) == -1) { + log_err("Failed to set the dual-stack mode"); + goto close_out; + } + } + if (bind(fd, addr, len) == -1) { log_err("Failed to bind server socket"); goto close_out; @@ -47,24 +56,17 @@ out: return fd; } -static int connect_to_server(int server_fd) +static int connect_to_server(const struct sockaddr *addr, socklen_t len) { - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); int fd = -1; - if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { - log_err("Failed to get server addr"); - goto out; - } - - fd = socket(addr.ss_family, SOCK_STREAM, 0); + fd = socket(addr->sa_family, SOCK_STREAM, 0); if (fd == -1) { log_err("Failed to create client socket"); goto out; } - if (connect(fd, (const struct sockaddr *)&addr, len) == -1) { + if (connect(fd, (const struct sockaddr *)addr, len) == -1) { log_err("Fail to connect to server"); goto close_out; } @@ -116,7 +118,8 @@ err: return map_fd; } -static int run_test(int server_fd, int results_fd, bool xdp) +static int run_test(int server_fd, int results_fd, bool xdp, + const struct sockaddr *addr, socklen_t len) { int client = -1, srv_client = -1; int ret = 0; @@ -142,7 +145,7 @@ static int run_test(int server_fd, int results_fd, bool xdp) goto err; } - client = connect_to_server(server_fd); + client = connect_to_server(addr, len); if (client == -1) goto err; @@ -199,12 +202,30 @@ out: return ret; } +static bool get_port(int server_fd, in_port_t *port) +{ + struct sockaddr_in addr; + socklen_t len = sizeof(addr); + + if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get server addr"); + return false; + } + + /* sin_port and sin6_port are located at the same offset. */ + *port = addr.sin_port; + return true; +} + int main(int argc, char **argv) { struct sockaddr_in addr4; struct sockaddr_in6 addr6; + struct sockaddr_in addr4dual; + struct sockaddr_in6 addr6dual; int server = -1; int server_v6 = -1; + int server_dual = -1; int results = -1; int err = 0; bool xdp; @@ -224,25 +245,43 @@ int main(int argc, char **argv) addr4.sin_family = AF_INET; addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); addr4.sin_port = 0; + memcpy(&addr4dual, &addr4, sizeof(addr4dual)); memset(&addr6, 0, sizeof(addr6)); addr6.sin6_family = AF_INET6; addr6.sin6_addr = in6addr_loopback; addr6.sin6_port = 0; - server = start_server((const struct sockaddr *)&addr4, sizeof(addr4)); - if (server == -1) + memset(&addr6dual, 0, sizeof(addr6dual)); + addr6dual.sin6_family = AF_INET6; + addr6dual.sin6_addr = in6addr_any; + addr6dual.sin6_port = 0; + + server = start_server((const struct sockaddr *)&addr4, sizeof(addr4), + false); + if (server == -1 || !get_port(server, &addr4.sin_port)) goto err; server_v6 = start_server((const struct sockaddr *)&addr6, - sizeof(addr6)); - if (server_v6 == -1) + sizeof(addr6), false); + if (server_v6 == -1 || !get_port(server_v6, &addr6.sin6_port)) + goto err; + + server_dual = start_server((const struct sockaddr *)&addr6dual, + sizeof(addr6dual), true); + if (server_dual == -1 || !get_port(server_dual, &addr4dual.sin_port)) + goto err; + + if (run_test(server, results, xdp, + (const struct sockaddr *)&addr4, sizeof(addr4))) goto err; - if (run_test(server, results, xdp)) + if (run_test(server_v6, results, xdp, + (const struct sockaddr *)&addr6, sizeof(addr6))) goto err; - if (run_test(server_v6, results, xdp)) + if (run_test(server_dual, results, xdp, + (const struct sockaddr *)&addr4dual, sizeof(addr4dual))) goto err; printf("ok\n"); @@ -252,6 +291,7 @@ err: out: close(server); close(server_v6); + close(server_dual); close(results); return err; } -- cgit v1.2.3 From e3c1c4fd9e6d14059ed93ebfe15e1c57793b1a05 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 6 Apr 2022 02:36:16 +0200 Subject: random: check for signals every PAGE_SIZE chunk of /dev/[u]random In 1448769c9cdb ("random: check for signal_pending() outside of need_resched() check"), Jann pointed out that we previously were only checking the TIF_NOTIFY_SIGNAL and TIF_SIGPENDING flags if the process had TIF_NEED_RESCHED set, which meant in practice, super long reads to /dev/[u]random would delay signal handling by a long time. I tried this using the below program, and indeed I wasn't able to interrupt a /dev/urandom read until after several megabytes had been read. The bug he fixed has always been there, and so code that reads from /dev/urandom without checking the return value of read() has mostly worked for a long time, for most sizes, not just for <= 256. Maybe it makes sense to keep that code working. The reason it was so small prior, ignoring the fact that it didn't work anyway, was likely because /dev/random used to block, and that could happen for pretty large lengths of time while entropy was gathered. But now, it's just a chacha20 call, which is extremely fast and is just operating on pure data, without having to wait for some external event. In that sense, /dev/[u]random is a lot more like /dev/zero. Taking a page out of /dev/zero's read_zero() function, it always returns at least one chunk, and then checks for signals after each chunk. Chunk sizes there are of length PAGE_SIZE. Let's just copy the same thing for /dev/[u]random, and check for signals and cond_resched() for every PAGE_SIZE amount of data. This makes the behavior more consistent with expectations, and should mitigate the impact of Jann's fix for the age-old signal check bug. ---- test program ---- #include #include #include #include static unsigned char x[~0U]; static void handle(int) { } int main(int argc, char *argv[]) { pid_t pid = getpid(), child; signal(SIGUSR1, handle); if (!(child = fork())) { for (;;) kill(pid, SIGUSR1); } pause(); printf("interrupted after reading %zd bytes\n", getrandom(x, sizeof(x), 0)); kill(child, SIGTERM); return 0; } Cc: Jann Horn Cc: Theodore Ts'o Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 394cbd814a0b..e15063d61460 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -523,7 +523,6 @@ EXPORT_SYMBOL(get_random_bytes); static ssize_t get_random_bytes_user(void __user *buf, size_t nbytes) { - bool large_request = nbytes > 256; ssize_t ret = 0; size_t len; u32 chacha_state[CHACHA_STATE_WORDS]; @@ -549,15 +548,6 @@ static ssize_t get_random_bytes_user(void __user *buf, size_t nbytes) } do { - if (large_request) { - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } - cond_resched(); - } - chacha20_block(chacha_state, output); if (unlikely(chacha_state[12] == 0)) ++chacha_state[13]; @@ -571,6 +561,13 @@ static ssize_t get_random_bytes_user(void __user *buf, size_t nbytes) nbytes -= len; buf += len; ret += len; + + BUILD_BUG_ON(PAGE_SIZE % CHACHA_BLOCK_SIZE != 0); + if (!(ret % PAGE_SIZE) && nbytes) { + if (signal_pending(current)) + break; + cond_resched(); + } } while (nbytes); memzero_explicit(output, sizeof(output)); -- cgit v1.2.3 From ec4eb8a86ade4d22633e1da2a7d85a846b7d1798 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Tue, 5 Apr 2022 21:22:06 +0800 Subject: drivers: net: slip: fix NPD bug in sl_tx_timeout() When a slip driver is detaching, the slip_close() will act to cleanup necessary resources and sl->tty is set to NULL in slip_close(). Meanwhile, the packet we transmit is blocked, sl_tx_timeout() will be called. Although slip_close() and sl_tx_timeout() use sl->lock to synchronize, we don`t judge whether sl->tty equals to NULL in sl_tx_timeout() and the null pointer dereference bug will happen. (Thread 1) | (Thread 2) | slip_close() | spin_lock_bh(&sl->lock) | ... ... | sl->tty = NULL //(1) sl_tx_timeout() | spin_unlock_bh(&sl->lock) spin_lock(&sl->lock); | ... | ... tty_chars_in_buffer(sl->tty)| if (tty->ops->..) //(2) | ... | synchronize_rcu() We set NULL to sl->tty in position (1) and dereference sl->tty in position (2). This patch adds check in sl_tx_timeout(). If sl->tty equals to NULL, sl_tx_timeout() will goto out. Signed-off-by: Duoming Zhou Reviewed-by: Jiri Slaby Link: https://lore.kernel.org/r/20220405132206.55291-1-duoming@zju.edu.cn Signed-off-by: Jakub Kicinski --- drivers/net/slip/slip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c index 88396ff99f03..6865d32270e5 100644 --- a/drivers/net/slip/slip.c +++ b/drivers/net/slip/slip.c @@ -469,7 +469,7 @@ static void sl_tx_timeout(struct net_device *dev, unsigned int txqueue) spin_lock(&sl->lock); if (netif_queue_stopped(dev)) { - if (!netif_running(dev)) + if (!netif_running(dev) || !sl->tty) goto out; /* May be we must check transmitter timeout here ? -- cgit v1.2.3