From 7877e7a5a52e1c9716326b94dfe6c7e6cd7bce5a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Oct 2021 08:25:25 +0200 Subject: block: remove __sync_blockdev [ Upstream commit 70164eb6ccb76ab679b016b4b60123bf4ec6c162 ] Instead offer a new sync_blockdev_nowait helper for the !wait case. This new helper is exported as it will grow modular callers in a bit. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211019062530.2174626-3-hch@lst.de Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 413c0148c0ce..6bbd393e6bcc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1999,6 +1999,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); +int sync_blockdev_nowait(struct block_device *bdev); #else static inline void invalidate_bdev(struct block_device *bdev) { @@ -2007,6 +2008,10 @@ static inline int sync_blockdev(struct block_device *bdev) { return 0; } +static inline int sync_blockdev_nowait(struct block_device *bdev) +{ + return 0; +} #endif int fsync_bdev(struct block_device *bdev); -- cgit v1.2.3 From 6eb927ee189f39746bcb02123d270ef04457eab6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Oct 2021 08:25:30 +0200 Subject: block: simplify the block device syncing code [ Upstream commit 1e03a36bdff4709c1bbf0f57f60ae3f776d51adf ] Get rid of the indirections and just provide a sync_bdevs helper for the generic sync code. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211019062530.2174626-8-hch@lst.de Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- block/bdev.c | 17 ++++++++++++++--- fs/internal.h | 6 ------ fs/sync.c | 23 ++++------------------- include/linux/blkdev.h | 4 ++++ 4 files changed, 22 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/block/bdev.c b/block/bdev.c index 33cac289302e..18abafb135e0 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -1017,7 +1017,7 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty) } EXPORT_SYMBOL(__invalidate_device); -void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) +void sync_bdevs(bool wait) { struct inode *inode, *old_inode = NULL; @@ -1048,8 +1048,19 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) bdev = I_BDEV(inode); mutex_lock(&bdev->bd_disk->open_mutex); - if (bdev->bd_openers) - func(bdev, arg); + if (!bdev->bd_openers) { + ; /* skip */ + } else if (wait) { + /* + * We keep the error status of individual mapping so + * that applications can catch the writeback error using + * fsync(2). See filemap_fdatawait_keep_errors() for + * details. 
+ */ + filemap_fdatawait_keep_errors(inode->i_mapping); + } else { + filemap_fdatawrite(inode->i_mapping); + } mutex_unlock(&bdev->bd_disk->open_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); diff --git a/fs/internal.h b/fs/internal.h index b5caa16f4645..cdd83d4899bb 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -23,17 +23,11 @@ struct pipe_inode_info; #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); -void iterate_bdevs(void (*)(struct block_device *, void *), void *); void emergency_thaw_bdev(struct super_block *sb); #else static inline void bdev_cache_init(void) { } - -static inline void iterate_bdevs(void (*f)(struct block_device *, void *), - void *arg) -{ -} static inline int emergency_thaw_bdev(struct super_block *sb) { return 0; diff --git a/fs/sync.c b/fs/sync.c index a621089eb07e..3ce8e2137f31 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -78,21 +78,6 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg) sb->s_op->sync_fs(sb, *(int *)arg); } -static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) -{ - filemap_fdatawrite(bdev->bd_inode->i_mapping); -} - -static void fdatawait_one_bdev(struct block_device *bdev, void *arg) -{ - /* - * We keep the error status of individual mapping so that - * applications can catch the writeback error using fsync(2). - * See filemap_fdatawait_keep_errors() for details. - */ - filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping); -} - /* * Sync everything. We start by waking flusher threads so that most of * writeback runs on all devices in parallel. Then we sync all inodes reliably @@ -111,8 +96,8 @@ void ksys_sync(void) iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &wait); - iterate_bdevs(fdatawrite_one_bdev, NULL); - iterate_bdevs(fdatawait_one_bdev, NULL); + sync_bdevs(false); + sync_bdevs(true); if (unlikely(laptop_mode)) laptop_sync_completion(); } @@ -133,10 +118,10 @@ static void do_sync_work(struct work_struct *work) */ iterate_supers(sync_inodes_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &nowait); - iterate_bdevs(fdatawrite_one_bdev, NULL); + sync_bdevs(false); iterate_supers(sync_inodes_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &nowait); - iterate_bdevs(fdatawrite_one_bdev, NULL); + sync_bdevs(false); printk("Emergency Sync complete\n"); kfree(work); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6bbd393e6bcc..aebe67ed7a73 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -2000,6 +2000,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); int sync_blockdev_nowait(struct block_device *bdev); +void sync_bdevs(bool wait); #else static inline void invalidate_bdev(struct block_device *bdev) { @@ -2012,6 +2013,9 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) { return 0; } +static inline void sync_bdevs(bool wait) +{ +} #endif int fsync_bdev(struct block_device *bdev); -- cgit v1.2.3 From 3177d047e58a56b2df9e9d59283c3121f1888fe8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 12 Feb 2022 09:14:49 -0800 Subject: etherdevice: Adjust ether_addr* prototypes to silence -Wstringop-overead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 2618a0dae09ef37728dab89ff60418cbe25ae6bd upstream. With GCC 12, -Wstringop-overread was warning about an implicit cast from char[6] to char[8]. 
However, the extra 2 bytes are always thrown away, alignment doesn't matter, and the risk of hitting the edge of unallocated memory has been accepted, so this prototype can just be converted to a regular char *. Silences: net/core/dev.c: In function ‘bpf_prog_run_generic_xdp’: net/core/dev.c:4618:21: warning: ‘ether_addr_equal_64bits’ reading 8 bytes from a region of size 6 [-Wstringop-overread] 4618 | orig_host = ether_addr_equal_64bits(eth->h_dest, > skb->dev->dev_addr); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ net/core/dev.c:4618:21: note: referencing argument 1 of type ‘const u8[8]’ {aka ‘const unsigned char[8]’} net/core/dev.c:4618:21: note: referencing argument 2 of type ‘const u8[8]’ {aka ‘const unsigned char[8]’} In file included from net/core/dev.c:91: include/linux/etherdevice.h:375:20: note: in a call to function ‘ether_addr_equal_64bits’ 375 | static inline bool ether_addr_equal_64bits(const u8 addr1[6+2], | ^~~~~~~~~~~~~~~~~~~~~~~ Reported-by: Marc Kleine-Budde Tested-by: Marc Kleine-Budde Link: https://lore.kernel.org/netdev/20220212090811.uuzk6d76agw2vv73@pengutronix.de Cc: Jakub Kicinski Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller Cc: Khem Raj Signed-off-by: Greg Kroah-Hartman --- include/linux/etherdevice.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index c58d50451485..7f28fa702bb7 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -127,7 +127,7 @@ static inline bool is_multicast_ether_addr(const u8 *addr) #endif } -static inline bool is_multicast_ether_addr_64bits(const u8 addr[6+2]) +static inline bool is_multicast_ether_addr_64bits(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #ifdef __BIG_ENDIAN @@ -364,8 +364,7 @@ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits. */ -static inline bool ether_addr_equal_64bits(const u8 addr1[6+2], - const u8 addr2[6+2]) +static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2); -- cgit v1.2.3 From a52e73bef25486cf2aa786ca5c83bcc940625103 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 14 Apr 2022 19:13:40 -0700 Subject: mm, kfence: support kmem_dump_obj() for KFENCE objects commit 2dfe63e61cc31ee59ce951672b0850b5229cd5b0 upstream. Calling kmem_obj_info() via kmem_dump_obj() on KFENCE objects has been producing garbage data due to the object not actually being maintained by SLAB or SLUB. Fix this by implementing __kfence_obj_info() that copies relevant information to struct kmem_obj_info when the object was allocated by KFENCE; this is called by a common kmem_obj_info(), which also calls the slab/slub/slob specific variant now called __kmem_obj_info(). For completeness, kmem_dump_obj() now displays if the object was allocated by KFENCE. 
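A usage sketch, not part of this patch: how a debugging path might combine kmem_valid_obj() and kmem_dump_obj() once KFENCE objects are handled. The helper name debug_dump_suspect() and its argument are illustrative assumptions.

#include <linux/printk.h>
#include <linux/slab.h>

/* Hypothetical helper: print provenance for a suspect pointer. With
 * this patch applied, KFENCE-allocated objects are reported too and
 * tagged " (kfence)" in the output. */
static void debug_dump_suspect(void *obj)
{
	if (kmem_valid_obj(obj))	/* slab-cache (or KFENCE) object? */
		kmem_dump_obj(obj);	/* cache name, offset, alloc/free stacks */
	else
		pr_info("%px is not a slab object\n", obj);
}
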
Link: https://lore.kernel.org/all/20220323090520.GG16885@xsang-OptiPlex-9020/ Link: https://lkml.kernel.org/r/20220406131558.3558585-1-elver@google.com Fixes: b89fb5ef0ce6 ("mm, kfence: insert KFENCE hooks for SLUB") Fixes: d3fb45f370d9 ("mm, kfence: insert KFENCE hooks for SLAB") Signed-off-by: Marco Elver Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reported-by: kernel test robot Acked-by: Vlastimil Babka [slab] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/kfence.h | 24 ++++++++++++++++++++++++ mm/kfence/core.c | 21 --------------------- mm/kfence/kfence.h | 21 +++++++++++++++++++++ mm/kfence/report.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ mm/slab.c | 2 +- mm/slab.h | 2 +- mm/slab_common.c | 9 +++++++++ mm/slob.c | 2 +- mm/slub.c | 2 +- 9 files changed, 105 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 4b5e3679a72c..3c75209a545e 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -202,6 +202,22 @@ static __always_inline __must_check bool kfence_free(void *addr) */ bool __must_check kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs); +#ifdef CONFIG_PRINTK +struct kmem_obj_info; +/** + * __kfence_obj_info() - fill kmem_obj_info struct + * @kpp: kmem_obj_info to be filled + * @object: the object + * + * Return: + * * false - not a KFENCE object + * * true - a KFENCE object, filled @kpp + * + * Copies information to @kpp for KFENCE objects. + */ +bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); +#endif + #else /* CONFIG_KFENCE */ static inline bool is_kfence_address(const void *addr) { return false; } @@ -219,6 +235,14 @@ static inline bool __must_check kfence_handle_page_fault(unsigned long addr, boo return false; } +#ifdef CONFIG_PRINTK +struct kmem_obj_info; +static inline bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +{ + return false; +} +#endif + #endif #endif /* _LINUX_KFENCE_H */ diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 51ea9193cecb..86260e8f2830 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -221,27 +221,6 @@ static bool kfence_unprotect(unsigned long addr) return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false)); } -static inline struct kfence_metadata *addr_to_metadata(unsigned long addr) -{ - long index; - - /* The checks do not affect performance; only called from slow-paths. */ - - if (!is_kfence_address((void *)addr)) - return NULL; - - /* - * May be an invalid index if called with an address at the edge of - * __kfence_pool, in which case we would report an "invalid access" - * error. - */ - index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1; - if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS) - return NULL; - - return &kfence_metadata[index]; -} - static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta) { unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2; diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 2a2d5de9d379..92bf6eff6060 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -93,6 +93,27 @@ struct kfence_metadata { extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; +static inline struct kfence_metadata *addr_to_metadata(unsigned long addr) +{ + long index; + + /* The checks do not affect performance; only called from slow-paths. 
*/ + + if (!is_kfence_address((void *)addr)) + return NULL; + + /* + * May be an invalid index if called with an address at the edge of + * __kfence_pool, in which case we would report an "invalid access" + * error. + */ + index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1; + if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS) + return NULL; + + return &kfence_metadata[index]; +} + /* KFENCE error types for report generation. */ enum kfence_error_type { KFENCE_ERROR_OOB, /* Detected a out-of-bounds access. */ diff --git a/mm/kfence/report.c b/mm/kfence/report.c index f93a7b2a338b..37e140e7f201 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -273,3 +273,50 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r /* We encountered a memory safety error, taint the kernel! */ add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK); } + +#ifdef CONFIG_PRINTK +static void kfence_to_kp_stack(const struct kfence_track *track, void **kp_stack) +{ + int i, j; + + i = get_stack_skipnr(track->stack_entries, track->num_stack_entries, NULL); + for (j = 0; i < track->num_stack_entries && j < KS_ADDRS_COUNT; ++i, ++j) + kp_stack[j] = (void *)track->stack_entries[i]; + if (j < KS_ADDRS_COUNT) + kp_stack[j] = NULL; +} + +bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +{ + struct kfence_metadata *meta = addr_to_metadata((unsigned long)object); + unsigned long flags; + + if (!meta) + return false; + + /* + * If state is UNUSED at least show the pointer requested; the rest + * would be garbage data. + */ + kpp->kp_ptr = object; + + /* Requesting info an a never-used object is almost certainly a bug. */ + if (WARN_ON(meta->state == KFENCE_OBJECT_UNUSED)) + return true; + + raw_spin_lock_irqsave(&meta->lock, flags); + + kpp->kp_page = page; + kpp->kp_slab_cache = meta->cache; + kpp->kp_objp = (void *)meta->addr; + kfence_to_kp_stack(&meta->alloc_track, kpp->kp_stack); + if (meta->state == KFENCE_OBJECT_FREED) + kfence_to_kp_stack(&meta->free_track, kpp->kp_free_stack); + /* get_stack_skipnr() ensures the first entry is outside allocator. 
*/ + kpp->kp_ret = kpp->kp_stack[0]; + + raw_spin_unlock_irqrestore(&meta->lock, flags); + + return true; +} +#endif diff --git a/mm/slab.c b/mm/slab.c index 03d3074d0bb0..1bd283e98c58 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3658,7 +3658,7 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller); #endif /* CONFIG_NUMA */ #ifdef CONFIG_PRINTK -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) { struct kmem_cache *cachep; unsigned int objnr; diff --git a/mm/slab.h b/mm/slab.h index 56ad7eea3ddf..1ae1bdd485c1 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -643,7 +643,7 @@ struct kmem_obj_info { void *kp_stack[KS_ADDRS_COUNT]; void *kp_free_stack[KS_ADDRS_COUNT]; }; -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); #endif #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index ec2bb0beed75..022319e7deaf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -568,6 +568,13 @@ bool kmem_valid_obj(void *object) } EXPORT_SYMBOL_GPL(kmem_valid_obj); +static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +{ + if (__kfence_obj_info(kpp, object, page)) + return; + __kmem_obj_info(kpp, object, page); +} + /** * kmem_dump_obj - Print available slab provenance information * @object: slab object for which to find provenance information. @@ -603,6 +610,8 @@ void kmem_dump_obj(void *object) pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name); else pr_cont(" slab%s", cp); + if (is_kfence_address(object)) + pr_cont(" (kfence)"); if (kp.kp_objp) pr_cont(" start %px", kp.kp_objp); if (kp.kp_data_offset) diff --git a/mm/slob.c b/mm/slob.c index 74d3f6e60666..f3fc15df971a 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -462,7 +462,7 @@ out: } #ifdef CONFIG_PRINTK -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) { kpp->kp_ptr = object; kpp->kp_page = page; diff --git a/mm/slub.c b/mm/slub.c index ca6ba6bdf27b..b75eebc0350e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4299,7 +4299,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) } #ifdef CONFIG_PRINTK -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) { void *base; int __maybe_unused i; -- cgit v1.2.3 From a583f2f3c8788bffd7fd7baeb76bd6d80543d7ea Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 13 Apr 2022 10:10:50 +0200 Subject: esp: limit skb_page_frag_refill use to a single page [ Upstream commit 5bd8baab087dff657e05387aee802e70304cc813 ] Commit ebe48d368e97 ("esp: Fix possible buffer overflow in ESP transformation") tried to fix skb_page_frag_refill usage in ESP by capping allocsize to 32k, but that doesn't completely solve the issue, as skb_page_frag_refill may return a single page. If that happens, we will write out of bounds, despite the check introduced in the previous patch. This patch forces COW in cases where we would end up calling skb_page_frag_refill with a size larger than a page (first in esp_output_head with tailen, then in esp_output_tail with skb->data_len). 
Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin --- include/net/esp.h | 2 -- net/ipv4/esp4.c | 5 ++--- net/ipv6/esp6.c | 5 ++--- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/esp.h b/include/net/esp.h index 90cd02ff77ef..9c5637d41d95 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,8 +4,6 @@ #include -#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) - struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 70e6c87fbe3d..d747166bb291 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -446,7 +446,6 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; - unsigned int allocsz; /* this is non-NULL only with TCP/UDP Encapsulation */ if (x->encap) { @@ -456,8 +455,8 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } - allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); - if (allocsz > ESP_SKB_FRAG_MAXSIZE) + if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || + ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 5023f59a5b96..6219d97cac7a 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -483,7 +483,6 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; - unsigned int allocsz; if (x->encap) { int err = esp6_output_encap(x, skb, esp); @@ -492,8 +491,8 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info return err; } - allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); - if (allocsz > ESP_SKB_FRAG_MAXSIZE) + if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || + ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { -- cgit v1.2.3 From 652a5405396dd37ea4af66a711c97e66f8192ae0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Apr 2022 11:13:33 -0700 Subject: ipv6: make ip6_rt_gc_expire an atomic_t [ Upstream commit 9cb7c013420f98fa6fd12fc6a5dc055170c108db ] Reads and Writes to ip6_rt_gc_expire always have been racy, as syzbot reported lately [1] There is a possible risk of under-flow, leading to unexpected high value passed to fib6_run_gc(), although I have not observed this in the field. Hosts hitting ip6_dst_gc() very hard are under pretty bad state anyway. 
[1] BUG: KCSAN: data-race in ip6_dst_gc / ip6_dst_gc read-write to 0xffff888102110744 of 4 bytes by task 13165 on cpu 1: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 read-write to 0xffff888102110744 of 4 bytes by task 11607 on cpu 0: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 value changed: 0x00000bb3 -> 0x00000ba9 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 11607 Comm: kworker/0:21 Not tainted 5.18.0-rc1-syzkaller-00037-g42e7a03d3bad-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: mld mld_ifc_work Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220413181333.649424-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- include/net/netns/ipv6.h | 4 ++-- net/ipv6/route.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 6bd7e5a85ce7..ff82983b7ab4 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -75,8 +75,8 @@ struct netns_ipv6 { struct list_head fib6_walkers; rwlock_t fib6_walker_lock; spinlock_t fib6_gc_lock; - unsigned int ip6_rt_gc_expire; - unsigned long ip6_rt_last_gc; + atomic_t ip6_rt_gc_expire; + unsigned long ip6_rt_last_gc; unsigned char flowlabel_has_excl; #ifdef CONFIG_IPV6_MULTIPLE_TABLES bool fib6_has_custom_rules; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6b269595efaa..0ca7c780d97a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3303,6 +3303,7 @@ static int ip6_dst_gc(struct dst_ops *ops) int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + unsigned int val; int entries; entries = dst_entries_get_fast(ops); @@ -3313,13 +3314,13 @@ static int ip6_dst_gc(struct dst_ops *ops) entries <= rt_max_size) goto out; - net->ipv6.ip6_rt_gc_expire++; - fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); + fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) - net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; + atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); out: - net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; + val = atomic_read(&net->ipv6.ip6_rt_gc_expire); + atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); return entries > rt_max_size; } @@ -6528,7 +6529,7 @@ static int __net_init ip6_route_net_init(struct net *net) 
net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; net->ipv6.sysctl.skip_notify_on_dev_down = 0; - net->ipv6.ip6_rt_gc_expire = 30*HZ; + atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); ret = 0; out: -- cgit v1.2.3 From 740411ee2f94632127338decbdd4e9994778573b Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 7 Apr 2022 19:13:07 -0500 Subject: scsi: iscsi: Release endpoint ID when its freed [ Upstream commit 3c6ae371b8a1ffba1fc415989fd581ebf841ed0a ] We can't release the endpoint ID until all references to the endpoint have been dropped or it could be allocated while in use. This has us use an idr instead of looping over all conns to find a free ID and then free the ID when all references have been dropped instead of when the device is only deleted. Link: https://lore.kernel.org/r/20220408001314.5014-4-michael.christie@oracle.com Tested-by: Manish Rangankar Reviewed-by: Lee Duncan Reviewed-by: Chris Leech Reviewed-by: Wu Bo Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/scsi_transport_iscsi.c | 71 ++++++++++++++++++------------------- include/scsi/scsi_transport_iscsi.h | 2 +- 2 files changed, 36 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index c7b1b2e8bb02..bcdfcb25349a 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -86,6 +86,9 @@ struct iscsi_internal { struct transport_container session_cont; }; +static DEFINE_IDR(iscsi_ep_idr); +static DEFINE_MUTEX(iscsi_ep_idr_mutex); + static atomic_t iscsi_session_nr; /* sysfs session id for next new session */ static struct workqueue_struct *iscsi_eh_timer_workq; @@ -169,6 +172,11 @@ struct device_attribute dev_attr_##_prefix##_##_name = \ static void iscsi_endpoint_release(struct device *dev) { struct iscsi_endpoint *ep = iscsi_dev_to_endpoint(dev); + + mutex_lock(&iscsi_ep_idr_mutex); + idr_remove(&iscsi_ep_idr, ep->id); + mutex_unlock(&iscsi_ep_idr_mutex); + kfree(ep); } @@ -181,7 +189,7 @@ static ssize_t show_ep_handle(struct device *dev, struct device_attribute *attr, char *buf) { struct iscsi_endpoint *ep = iscsi_dev_to_endpoint(dev); - return sysfs_emit(buf, "%llu\n", (unsigned long long) ep->id); + return sysfs_emit(buf, "%d\n", ep->id); } static ISCSI_ATTR(ep, handle, S_IRUGO, show_ep_handle, NULL); @@ -194,48 +202,32 @@ static struct attribute_group iscsi_endpoint_group = { .attrs = iscsi_endpoint_attrs, }; -#define ISCSI_MAX_EPID -1 - -static int iscsi_match_epid(struct device *dev, const void *data) -{ - struct iscsi_endpoint *ep = iscsi_dev_to_endpoint(dev); - const uint64_t *epid = data; - - return *epid == ep->id; -} - struct iscsi_endpoint * iscsi_create_endpoint(int dd_size) { - struct device *dev; struct iscsi_endpoint *ep; - uint64_t id; - int err; - - for (id = 1; id < ISCSI_MAX_EPID; id++) { - dev = class_find_device(&iscsi_endpoint_class, NULL, &id, - iscsi_match_epid); - if (!dev) - break; - else - put_device(dev); - } - if (id == ISCSI_MAX_EPID) { - printk(KERN_ERR "Too many connections. Max supported %u\n", - ISCSI_MAX_EPID - 1); - return NULL; - } + int err, id; ep = kzalloc(sizeof(*ep) + dd_size, GFP_KERNEL); if (!ep) return NULL; + mutex_lock(&iscsi_ep_idr_mutex); + id = idr_alloc(&iscsi_ep_idr, ep, 0, -1, GFP_NOIO); + if (id < 0) { + mutex_unlock(&iscsi_ep_idr_mutex); + printk(KERN_ERR "Could not allocate endpoint ID. 
Error %d.\n", + id); + goto free_ep; + } + mutex_unlock(&iscsi_ep_idr_mutex); + ep->id = id; ep->dev.class = &iscsi_endpoint_class; - dev_set_name(&ep->dev, "ep-%llu", (unsigned long long) id); + dev_set_name(&ep->dev, "ep-%d", id); err = device_register(&ep->dev); if (err) - goto free_ep; + goto free_id; err = sysfs_create_group(&ep->dev.kobj, &iscsi_endpoint_group); if (err) @@ -249,6 +241,10 @@ unregister_dev: device_unregister(&ep->dev); return NULL; +free_id: + mutex_lock(&iscsi_ep_idr_mutex); + idr_remove(&iscsi_ep_idr, id); + mutex_unlock(&iscsi_ep_idr_mutex); free_ep: kfree(ep); return NULL; @@ -276,14 +272,17 @@ EXPORT_SYMBOL_GPL(iscsi_put_endpoint); */ struct iscsi_endpoint *iscsi_lookup_endpoint(u64 handle) { - struct device *dev; + struct iscsi_endpoint *ep; - dev = class_find_device(&iscsi_endpoint_class, NULL, &handle, - iscsi_match_epid); - if (!dev) - return NULL; + mutex_lock(&iscsi_ep_idr_mutex); + ep = idr_find(&iscsi_ep_idr, handle); + if (!ep) + goto unlock; - return iscsi_dev_to_endpoint(dev); + get_device(&ep->dev); +unlock: + mutex_unlock(&iscsi_ep_idr_mutex); + return ep; } EXPORT_SYMBOL_GPL(iscsi_lookup_endpoint); diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 037c77fb5dc5..3ecf9702287b 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -296,7 +296,7 @@ extern void iscsi_host_for_each_session(struct Scsi_Host *shost, struct iscsi_endpoint { void *dd_data; /* LLD private data */ struct device dev; - uint64_t id; + int id; struct iscsi_cls_conn *conn; }; -- cgit v1.2.3 From e4efe868aa14c8557841ecf30b13667f96a12111 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 7 Apr 2022 19:13:11 -0500 Subject: scsi: iscsi: Merge suspend fields [ Upstream commit 5bd856256f8c03e329f8ff36d8c8efcb111fe6df ] Move the tx and rx suspend fields into one flags field. Link: https://lore.kernel.org/r/20220408001314.5014-8-michael.christie@oracle.com Tested-by: Manish Rangankar Reviewed-by: Lee Duncan Reviewed-by: Chris Leech Signed-off-by: Mike Christie Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin --- drivers/scsi/bnx2i/bnx2i_hwi.c | 2 +- drivers/scsi/bnx2i/bnx2i_iscsi.c | 2 +- drivers/scsi/cxgbi/libcxgbi.c | 6 +++--- drivers/scsi/libiscsi.c | 20 ++++++++++---------- drivers/scsi/libiscsi_tcp.c | 2 +- include/scsi/libiscsi.h | 9 +++++---- 6 files changed, 21 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/scsi/bnx2i/bnx2i_hwi.c b/drivers/scsi/bnx2i/bnx2i_hwi.c index 5521469ce678..e16327a4b4c9 100644 --- a/drivers/scsi/bnx2i/bnx2i_hwi.c +++ b/drivers/scsi/bnx2i/bnx2i_hwi.c @@ -1977,7 +1977,7 @@ static int bnx2i_process_new_cqes(struct bnx2i_conn *bnx2i_conn) if (nopin->cq_req_sn != qp->cqe_exp_seq_sn) break; - if (unlikely(test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_rx))) { + if (unlikely(test_bit(ISCSI_CONN_FLAG_SUSPEND_RX, &conn->flags))) { if (nopin->op_code == ISCSI_OP_NOOP_IN && nopin->itt == (u16) RESERVED_ITT) { printk(KERN_ALERT "bnx2i: Unsolicited " diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c index 1b5f3e143f07..2e5241d12dc3 100644 --- a/drivers/scsi/bnx2i/bnx2i_iscsi.c +++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c @@ -1721,7 +1721,7 @@ static int bnx2i_tear_down_conn(struct bnx2i_hba *hba, struct iscsi_conn *conn = ep->conn->cls_conn->dd_data; /* Must suspend all rx queue activity for this ep */ - set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_rx); + set_bit(ISCSI_CONN_FLAG_SUSPEND_RX, &conn->flags); } /* CONN_DISCONNECT timeout may or may not be an issue depending * on what transcribed in TCP layer, different targets behave diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c index 8c7d4dda4cf2..4365d52c6430 100644 --- a/drivers/scsi/cxgbi/libcxgbi.c +++ b/drivers/scsi/cxgbi/libcxgbi.c @@ -1634,11 +1634,11 @@ void cxgbi_conn_pdu_ready(struct cxgbi_sock *csk) log_debug(1 << CXGBI_DBG_PDU_RX, "csk 0x%p, conn 0x%p.\n", csk, conn); - if (unlikely(!conn || conn->suspend_rx)) { + if (unlikely(!conn || test_bit(ISCSI_CONN_FLAG_SUSPEND_RX, &conn->flags))) { log_debug(1 << CXGBI_DBG_PDU_RX, - "csk 0x%p, conn 0x%p, id %d, suspend_rx %lu!\n", + "csk 0x%p, conn 0x%p, id %d, conn flags 0x%lx!\n", csk, conn, conn ? conn->id : 0xFF, - conn ? conn->suspend_rx : 0xFF); + conn ? conn->flags : 0xFF); return; } diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index cbc263ec9d66..a4f26431b033 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -1392,8 +1392,8 @@ static bool iscsi_set_conn_failed(struct iscsi_conn *conn) if (conn->stop_stage == 0) session->state = ISCSI_STATE_FAILED; - set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx); - set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_rx); + set_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags); + set_bit(ISCSI_CONN_FLAG_SUSPEND_RX, &conn->flags); return true; } @@ -1454,7 +1454,7 @@ static int iscsi_xmit_task(struct iscsi_conn *conn, struct iscsi_task *task, * Do this after dropping the extra ref because if this was a requeue * it's removed from that list and cleanup_queued_task would miss it. */ - if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) { + if (test_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags)) { /* * Save the task and ref in case we weren't cleaning up this * task and get woken up again. 
@@ -1532,7 +1532,7 @@ static int iscsi_data_xmit(struct iscsi_conn *conn) int rc = 0; spin_lock_bh(&conn->session->frwd_lock); - if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) { + if (test_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags)) { ISCSI_DBG_SESSION(conn->session, "Tx suspended!\n"); spin_unlock_bh(&conn->session->frwd_lock); return -ENODATA; @@ -1746,7 +1746,7 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc) goto fault; } - if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) { + if (test_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags)) { reason = FAILURE_SESSION_IN_RECOVERY; sc->result = DID_REQUEUE << 16; goto fault; @@ -1935,7 +1935,7 @@ static void fail_scsi_tasks(struct iscsi_conn *conn, u64 lun, int error) void iscsi_suspend_queue(struct iscsi_conn *conn) { spin_lock_bh(&conn->session->frwd_lock); - set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx); + set_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags); spin_unlock_bh(&conn->session->frwd_lock); } EXPORT_SYMBOL_GPL(iscsi_suspend_queue); @@ -1953,7 +1953,7 @@ void iscsi_suspend_tx(struct iscsi_conn *conn) struct Scsi_Host *shost = conn->session->host; struct iscsi_host *ihost = shost_priv(shost); - set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx); + set_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags); if (ihost->workq) flush_workqueue(ihost->workq); } @@ -1961,7 +1961,7 @@ EXPORT_SYMBOL_GPL(iscsi_suspend_tx); static void iscsi_start_tx(struct iscsi_conn *conn) { - clear_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx); + clear_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags); iscsi_conn_queue_work(conn); } @@ -3324,8 +3324,8 @@ int iscsi_conn_bind(struct iscsi_cls_session *cls_session, /* * Unblock xmitworker(), Login Phase will pass through. */ - clear_bit(ISCSI_SUSPEND_BIT, &conn->suspend_rx); - clear_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx); + clear_bit(ISCSI_CONN_FLAG_SUSPEND_RX, &conn->flags); + clear_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags); return 0; } EXPORT_SYMBOL_GPL(iscsi_conn_bind); diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c index 2e9ffe3d1a55..883005757ddb 100644 --- a/drivers/scsi/libiscsi_tcp.c +++ b/drivers/scsi/libiscsi_tcp.c @@ -927,7 +927,7 @@ int iscsi_tcp_recv_skb(struct iscsi_conn *conn, struct sk_buff *skb, */ conn->last_recv = jiffies; - if (unlikely(conn->suspend_rx)) { + if (unlikely(test_bit(ISCSI_CONN_FLAG_SUSPEND_RX, &conn->flags))) { ISCSI_DBG_TCP(conn, "Rx suspended!\n"); *status = ISCSI_TCP_SUSPENDED; return 0; diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h index 4ee233e5a6ff..bdb0ae11682d 100644 --- a/include/scsi/libiscsi.h +++ b/include/scsi/libiscsi.h @@ -52,8 +52,10 @@ enum { #define ISID_SIZE 6 -/* Connection suspend "bit" */ -#define ISCSI_SUSPEND_BIT 1 +/* Connection flags */ +#define ISCSI_CONN_FLAG_SUSPEND_TX BIT(0) +#define ISCSI_CONN_FLAG_SUSPEND_RX BIT(1) + #define ISCSI_ITT_MASK 0x1fff #define ISCSI_TOTAL_CMDS_MAX 4096 @@ -199,8 +201,7 @@ struct iscsi_conn { struct list_head cmdqueue; /* data-path cmd queue */ struct list_head requeue; /* tasks needing another run */ struct work_struct xmitwork; /* per-conn. 
xmit workqueue */ - unsigned long suspend_tx; /* suspend Tx */ - unsigned long suspend_rx; /* suspend Rx */ + unsigned long flags; /* ISCSI_CONN_FLAGs */ /* negotiated params */ unsigned max_recv_dlength; /* initiator_max_recv_dsl*/ -- cgit v1.2.3 From c7f4f3016fea68faba6c8846cc6b35717b253e7a Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 7 Apr 2022 19:13:12 -0500 Subject: scsi: iscsi: Fix NOP handling during conn recovery [ Upstream commit 44ac97109e42f87b1a34954704b81b6c8eca80c4 ] If a offload driver doesn't use the xmit workqueue, then when we are doing ep_disconnect libiscsi can still inject PDUs to the driver. This adds a check for if the connection is bound before trying to inject PDUs. Link: https://lore.kernel.org/r/20220408001314.5014-9-michael.christie@oracle.com Tested-by: Manish Rangankar Reviewed-by: Lee Duncan Reviewed-by: Chris Leech Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/libiscsi.c | 7 ++++++- include/scsi/libiscsi.h | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index a4f26431b033..0f2c7098f9d6 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -678,7 +678,8 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr, struct iscsi_task *task; itt_t itt; - if (session->state == ISCSI_STATE_TERMINATE) + if (session->state == ISCSI_STATE_TERMINATE || + !test_bit(ISCSI_CONN_FLAG_BOUND, &conn->flags)) return NULL; if (opcode == ISCSI_OP_LOGIN || opcode == ISCSI_OP_TEXT) { @@ -2214,6 +2215,8 @@ void iscsi_conn_unbind(struct iscsi_cls_conn *cls_conn, bool is_active) iscsi_suspend_tx(conn); spin_lock_bh(&session->frwd_lock); + clear_bit(ISCSI_CONN_FLAG_BOUND, &conn->flags); + if (!is_active) { /* * if logout timed out before userspace could even send a PDU @@ -3312,6 +3315,8 @@ int iscsi_conn_bind(struct iscsi_cls_session *cls_session, spin_lock_bh(&session->frwd_lock); if (is_leading) session->leadconn = conn; + + set_bit(ISCSI_CONN_FLAG_BOUND, &conn->flags); spin_unlock_bh(&session->frwd_lock); /* diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h index bdb0ae11682d..d1e282f0d6f1 100644 --- a/include/scsi/libiscsi.h +++ b/include/scsi/libiscsi.h @@ -55,7 +55,7 @@ enum { /* Connection flags */ #define ISCSI_CONN_FLAG_SUSPEND_TX BIT(0) #define ISCSI_CONN_FLAG_SUSPEND_RX BIT(1) - +#define ISCSI_CONN_FLAG_BOUND BIT(2) #define ISCSI_ITT_MASK 0x1fff #define ISCSI_TOTAL_CMDS_MAX 4096 -- cgit v1.2.3 From 07bdd207774c7d4ed0a5e8486a2ee6157271cad1 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 21 Apr 2022 16:35:40 -0700 Subject: memcg: sync flush only if periodic flush is delayed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 9b3016154c913b2e7ec5ae5c9a42eb9e732d86aa upstream. Daniel Dao has reported [1] a regression on workloads that may trigger a lot of refaults (anon and file). The underlying issue is that flushing rstat is expensive. Although rstat flush are batched with (nr_cpus * MEMCG_BATCH) stat updates, it seems like there are workloads which genuinely do stat updates larger than batch value within short amount of time. Since the rstat flush can happen in the performance critical codepaths like page faults, such workload can suffer greatly. This patch fixes this regression by making the rstat flushing conditional in the performance critical codepaths. 
More specifically, the kernel relies on the async periodic rstat flusher to flush the stats and only if the periodic flusher is delayed by more than twice the amount of its normal time window then the kernel allows rstat flushing from the performance critical codepaths. Now the question: what are the side-effects of this change? The worst that can happen is the refault codepath will see 4sec old lruvec stats and may cause false (or missed) activations of the refaulted page which may under-or-overestimate the workingset size. Though that is not very concerning as the kernel can already miss or do false activations. There are two more codepaths whose flushing behavior is not changed by this patch and we may need to come to them in future. One is the writeback stats used by dirty throttling and second is the deactivation heuristic in the reclaim. For now keeping an eye on them and if there is report of regression due to these codepaths, we will reevaluate then. Link: https://lore.kernel.org/all/CA+wXwBSyO87ZX5PVwdHm-=dBjZYECGmfnydUicUyrQqndgX2MQ@mail.gmail.com [1] Link: https://lkml.kernel.org/r/20220304184040.1304781-1-shakeelb@google.com Fixes: 1f828223b799 ("memcg: flush lruvec stats in the refault") Signed-off-by: Shakeel Butt Reported-by: Daniel Dao Tested-by: Ivan Babrou Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Michal Koutný Cc: Frank Hofmann Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/memcontrol.h | 5 +++++ mm/memcontrol.c | 12 +++++++++++- mm/workingset.c | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d9b8df5ef212..d35439db047c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1002,6 +1002,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, } void mem_cgroup_flush_stats(void); +void mem_cgroup_flush_stats_delayed(void); void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); @@ -1422,6 +1423,10 @@ static inline void mem_cgroup_flush_stats(void) { } +static inline void mem_cgroup_flush_stats_delayed(void) +{ +} + static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8cdeb33d2cf9..971546bb99e0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -650,6 +650,9 @@ static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); static DEFINE_SPINLOCK(stats_flush_lock); static DEFINE_PER_CPU(unsigned int, stats_updates); static atomic_t stats_flush_threshold = ATOMIC_INIT(0); +static u64 flush_next_time; + +#define FLUSH_TIME (2UL*HZ) static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { @@ -671,6 +674,7 @@ static void __mem_cgroup_flush_stats(void) if (!spin_trylock_irqsave(&stats_flush_lock, flag)) return; + flush_next_time = jiffies_64 + 2*FLUSH_TIME; cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); atomic_set(&stats_flush_threshold, 0); spin_unlock_irqrestore(&stats_flush_lock, flag); @@ -682,10 +686,16 @@ void mem_cgroup_flush_stats(void) __mem_cgroup_flush_stats(); } +void mem_cgroup_flush_stats_delayed(void) +{ + if (time_after64(jiffies_64, flush_next_time)) + mem_cgroup_flush_stats(); +} + static void flush_memcg_stats_dwork(struct work_struct *w) { __mem_cgroup_flush_stats(); - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); + 
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } /** diff --git a/mm/workingset.c b/mm/workingset.c index d5b81e4f4cbe..880d882f3325 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -352,7 +352,7 @@ void workingset_refault(struct page *page, void *shadow) inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats_delayed(); /* * Compare the distance to the existing workingset size. We * don't activate pages that couldn't stay resident even if -- cgit v1.2.3 From 9dcb65cdf3128113ce9a48b05a72b6f8ef2bc257 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Apr 2022 16:35:46 -0700 Subject: mm, hugetlb: allow for "high" userspace addresses commit 5f24d5a579d1eace79d505b148808a850b417d4c upstream. This is a fix for commit f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses") for hugetlb. This patch adds support for "high" userspace addresses that are optionally supported on the system and have to be requested via a hint mechanism ("high" addr parameter to mmap). Architectures such as powerpc and x86 achieve this by making changes to their architectural versions of hugetlb_get_unmapped_area() function. However, arm64 uses the generic version of that function. So take into account arch_get_mmap_base() and arch_get_mmap_end() in hugetlb_get_unmapped_area(). To allow that, move those two macros out of mm/mmap.c into include/linux/sched/mm.h If these macros are not defined in architectural code then they default to (TASK_SIZE) and (base) so should not introduce any behavioural changes to architectures that do not define them. For the time being, only ARM64 is affected by this change. Catalin (ARM64) said "We should have fixed hugetlb_get_unmapped_area() as well when we added support for 52-bit VA. The reason for commit f6795053dac8 was to prevent normal mmap() from returning addresses above 48-bit by default as some user-space had hard assumptions about this. It's a slight ABI change if you do this for hugetlb_get_unmapped_area() but I doubt anyone would notice. It's more likely that the current behaviour would cause issues, so I'd rather have them consistent. Basically when arm64 gained support for 52-bit addresses we did not want user-space calling mmap() to suddenly get such high addresses, otherwise we could have inadvertently broken some programs (similar behaviour to x86 here). Hence we added commit f6795053dac8. But we missed hugetlbfs which could still get such high mmap() addresses. 
So in theory that's a potential regression that should have bee addressed at the same time as commit f6795053dac8 (and before arm64 enabled 52-bit addresses)" Link: https://lkml.kernel.org/r/ab847b6edb197bffdfe189e70fb4ac76bfe79e0d.1650033747.git.christophe.leroy@csgroup.eu Fixes: f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses") Signed-off-by: Christophe Leroy Reviewed-by: Catalin Marinas Cc: Steve Capper Cc: Will Deacon Cc: [5.0.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/hugetlbfs/inode.c | 9 +++++---- include/linux/sched/mm.h | 8 ++++++++ mm/mmap.c | 8 -------- 3 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 54c4e0b0dda4..bb0651a4a128 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); @@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = current->mm->mmap_base; + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); @@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); addr = vm_unmapped_area(&info); } @@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); + const unsigned long mmap_end = arch_get_mmap_end(addr); if (len & ~huge_page_mask(h)) return -EINVAL; @@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (mmap_end - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 5561486fddef..95fb7aaaec8d 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -106,6 +106,14 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU +#ifndef arch_get_mmap_end +#define arch_get_mmap_end(addr) (TASK_SIZE) +#endif + +#ifndef arch_get_mmap_base +#define arch_get_mmap_base(addr, base) (base) +#endif + extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); extern unsigned long diff --git a/mm/mmap.c b/mm/mmap.c index 049b8e5c18f0..6bb553ed5c55 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2113,14 +2113,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) return addr; } -#ifndef arch_get_mmap_end -#define arch_get_mmap_end(addr) (TASK_SIZE) -#endif - -#ifndef arch_get_mmap_base -#define arch_get_mmap_base(addr, base) (base) -#endif - /* Get an address range which is currently unmapped. * For shmat() with addr=0. 
* -- cgit v1.2.3 From 41ba681c63731872beab7bd86d8246e2ea121381 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Thu, 21 Apr 2022 16:36:01 -0700 Subject: oom_kill.c: futex: delay the OOM reaper to allow time for proper futex cleanup commit e4a38402c36e42df28eb1a5394be87e6571fb48a upstream. The pthread struct is allocated on PRIVATE|ANONYMOUS memory [1] which can be targeted by the oom reaper. This mapping is used to store the futex robust list head; the kernel does not keep a copy of the robust list and instead references a userspace address to maintain the robustness during a process death. A race can occur between exit_mm and the oom reaper that allows the oom reaper to free the memory of the futex robust list before the exit path has handled the futex death: CPU1 CPU2 -------------------------------------------------------------------- page_fault do_exit "signal" wake_oom_reaper oom_reaper oom_reap_task_mm (invalidates mm) exit_mm exit_mm_release futex_exit_release futex_cleanup exit_robust_list get_user (EFAULT- can't access memory) If the get_user EFAULT's, the kernel will be unable to recover the waiters on the robust_list, leaving userspace mutexes hung indefinitely. Delay the OOM reaper, allowing more time for the exit path to perform the futex cleanup. Reproducer: https://gitlab.com/jsavitz/oom_futex_reproducer Based on a patch by Michal Hocko. Link: https://elixir.bootlin.com/glibc/glibc-2.35/source/nptl/allocatestack.c#L370 [1] Link: https://lkml.kernel.org/r/20220414144042.677008-1-npache@redhat.com Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") Signed-off-by: Joel Savitz Signed-off-by: Nico Pache Co-developed-by: Joel Savitz Suggested-by: Thomas Gleixner Acked-by: Thomas Gleixner Acked-by: Michal Hocko Cc: Rafael Aquini Cc: Waiman Long Cc: Herton R. 
Krzesinski Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Daniel Bristot de Oliveira Cc: David Rientjes Cc: Andrea Arcangeli Cc: Davidlohr Bueso Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Joel Savitz Cc: Darren Hart Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/sched.h | 1 + mm/oom_kill.c | 54 ++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 41 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9016bbacedf3..ad7ff332a0ac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1436,6 +1436,7 @@ struct task_struct { int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; + struct timer_list oom_reaper_timer; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index bfa9e348c3a3..262f752d3d51 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -635,7 +635,7 @@ done: */ set_bit(MMF_OOM_SKIP, &mm->flags); - /* Drop a reference taken by wake_oom_reaper */ + /* Drop a reference taken by queue_oom_reaper */ put_task_struct(tsk); } @@ -645,12 +645,12 @@ static int oom_reaper(void *unused) struct task_struct *tsk = NULL; wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); - spin_lock(&oom_reaper_lock); + spin_lock_irq(&oom_reaper_lock); if (oom_reaper_list != NULL) { tsk = oom_reaper_list; oom_reaper_list = tsk->oom_reaper_list; } - spin_unlock(&oom_reaper_lock); + spin_unlock_irq(&oom_reaper_lock); if (tsk) oom_reap_task(tsk); @@ -659,22 +659,48 @@ static int oom_reaper(void *unused) return 0; } -static void wake_oom_reaper(struct task_struct *tsk) +static void wake_oom_reaper(struct timer_list *timer) { - /* mm is already queued? */ - if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) - return; + struct task_struct *tsk = container_of(timer, struct task_struct, + oom_reaper_timer); + struct mm_struct *mm = tsk->signal->oom_mm; + unsigned long flags; - get_task_struct(tsk); + /* The victim managed to terminate on its own - see exit_mmap */ + if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + put_task_struct(tsk); + return; + } - spin_lock(&oom_reaper_lock); + spin_lock_irqsave(&oom_reaper_lock, flags); tsk->oom_reaper_list = oom_reaper_list; oom_reaper_list = tsk; - spin_unlock(&oom_reaper_lock); + spin_unlock_irqrestore(&oom_reaper_lock, flags); trace_wake_reaper(tsk->pid); wake_up(&oom_reaper_wait); } +/* + * Give the OOM victim time to exit naturally before invoking the oom_reaping. + * The timers timeout is arbitrary... the longer it is, the longer the worst + * case scenario for the OOM can take. If it is too small, the oom_reaper can + * get in the way and release resources needed by the process exit path. + * e.g. The futex robust list can sit in Anon|Private memory that gets reaped + * before the exit path is able to wake the futex waiters. + */ +#define OOM_REAPER_DELAY (2*HZ) +static void queue_oom_reaper(struct task_struct *tsk) +{ + /* mm is already queued? 
*/ + if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) + return; + + get_task_struct(tsk); + timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0); + tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY; + add_timer(&tsk->oom_reaper_timer); +} + static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); @@ -682,7 +708,7 @@ static int __init oom_init(void) } subsys_initcall(oom_init) #else -static inline void wake_oom_reaper(struct task_struct *tsk) +static inline void queue_oom_reaper(struct task_struct *tsk) { } #endif /* CONFIG_MMU */ @@ -933,7 +959,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) rcu_read_unlock(); if (can_oom_reap) - wake_oom_reaper(victim); + queue_oom_reaper(victim); mmdrop(mm); put_task_struct(victim); @@ -969,7 +995,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) task_lock(victim); if (task_will_free_mem(victim)) { mark_oom_victim(victim); - wake_oom_reaper(victim); + queue_oom_reaper(victim); task_unlock(victim); put_task_struct(victim); return; @@ -1067,7 +1093,7 @@ bool out_of_memory(struct oom_control *oc) */ if (task_will_free_mem(current)) { mark_oom_victim(current); - wake_oom_reaper(current); + queue_oom_reaper(current); return true; } -- cgit v1.2.3 From bcba40bd36d705aba9c5fd4622e35118c2a46ed2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:22 +0100 Subject: netfilter: conntrack: convert to refcount_t api commit 719774377622bc4025d2a74f551b5dc2158c6c30 upstream. Convert nf_conn reference counting from atomic_t to refcount_t based api. refcount_t api provides more runtime sanity checks and will warn on certain constructs, e.g. refcount_inc() on a zero reference count, which usually indicates use-after-free. For this reason template allocation is changed to init the refcount to 1, the subsequenct add operations are removed. Likewise, init_conntrack() is changed to set the initial refcount to 1 instead refcount_inc(). This is safe because the new entry is not (yet) visible to other cpus. 
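As a standalone illustration of what the conversion buys (a sketch, not code from this patch; struct obj is a made-up stand-in for struct nf_conn):

#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
	refcount_t use;
};

static void obj_get(struct obj *o)
{
	/* Unlike atomic_inc(), refcount_inc() WARNs and saturates when
	 * the count is already zero -- the use-after-free signature the
	 * commit message describes. */
	refcount_inc(&o->use);
}

static bool obj_get_unless_zero(struct obj *o)
{
	/* Lookup-side pattern used throughout the conversion: take a
	 * reference only if the object is still live. */
	return refcount_inc_not_zero(&o->use);
}

static void obj_put(struct obj *o)
{
	if (refcount_dec_and_test(&o->use))
		kfree(o);	/* last reference dropped */
}
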
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- include/linux/netfilter/nf_conntrack_common.h | 8 ++++---- net/netfilter/nf_conntrack_core.c | 26 +++++++++++++------------- net/netfilter/nf_conntrack_expect.c | 4 ++-- net/netfilter/nf_conntrack_netlink.c | 6 +++--- net/netfilter/nf_conntrack_standalone.c | 4 ++-- net/netfilter/nf_flow_table_core.c | 2 +- net/netfilter/nf_synproxy_core.c | 1 - net/netfilter/nft_ct.c | 4 +--- net/netfilter/xt_CT.c | 3 +-- net/openvswitch/conntrack.c | 1 - net/sched/act_ct.c | 1 - 11 files changed, 27 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 700ea077ce2d..a03f7a80b9ab 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -2,7 +2,7 @@ #ifndef _NF_CONNTRACK_COMMON_H #define _NF_CONNTRACK_COMMON_H -#include <linux/atomic.h> +#include <linux/refcount.h> #include <uapi/linux/netfilter/nf_conntrack_common.h> struct ip_conntrack_stat { @@ -25,19 +25,19 @@ struct ip_conntrack_stat { #define NFCT_PTRMASK ~(NFCT_INFOMASK) struct nf_conntrack { - atomic_t use; + refcount_t use; }; void nf_conntrack_destroy(struct nf_conntrack *nfct); static inline void nf_conntrack_put(struct nf_conntrack *nfct) { - if (nfct && atomic_dec_and_test(&nfct->use)) + if (nfct && refcount_dec_and_test(&nfct->use)) nf_conntrack_destroy(nfct); } static inline void nf_conntrack_get(struct nf_conntrack *nfct) { if (nfct) - atomic_inc(&nfct->use); + refcount_inc(&nfct->use); } #endif /* _NF_CONNTRACK_COMMON_H */ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3a98a1316307..6d7840b8457b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -598,7 +598,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, tmpl->status = IPS_TEMPLATE; write_pnet(&tmpl->ct_net, net); nf_ct_zone_add(tmpl, zone); - atomic_set(&tmpl->ct_general.use, 0); + refcount_set(&tmpl->ct_general.use, 1); return tmpl; } @@ -631,7 +631,7 @@ destroy_conntrack(struct nf_conntrack *nfct) struct nf_conn *ct = (struct nf_conn *)nfct; pr_debug("destroy_conntrack(%p)\n", ct); - WARN_ON(atomic_read(&nfct->use) != 0); + WARN_ON(refcount_read(&nfct->use) != 0); if (unlikely(nf_ct_is_template(ct))) { nf_ct_tmpl_free(ct); @@ -755,7 +755,7 @@ nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2) /* caller must hold rcu readlock and none of the nf_conntrack_locks */ static void nf_ct_gc_expired(struct nf_conn *ct) { - if (!atomic_inc_not_zero(&ct->ct_general.use)) + if (!refcount_inc_not_zero(&ct->ct_general.use)) return; if (nf_ct_should_gc(ct)) @@ -823,7 +823,7 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, * in, try to obtain a reference and re-check tuple */ ct = nf_ct_tuplehash_to_ctrack(h); - if (likely(atomic_inc_not_zero(&ct->ct_general.use))) { + if (likely(refcount_inc_not_zero(&ct->ct_general.use))) { if (likely(nf_ct_key_equal(h, tuple, zone, net))) goto found; @@ -920,7 +920,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) smp_wmb(); /* The caller holds a reference to this object */ - atomic_set(&ct->ct_general.use, 2); + refcount_set(&ct->ct_general.use, 2); __nf_conntrack_hash_insert(ct, hash, reply_hash); nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert); @@ -971,7 +971,7 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct) { struct nf_conn_tstamp *tstamp; - atomic_inc(&ct->ct_general.use); +
refcount_inc(&ct->ct_general.use); ct->status |= IPS_CONFIRMED; /* set conntrack timestamp, if enabled. */ @@ -1364,7 +1364,7 @@ static unsigned int early_drop_list(struct net *net, nf_ct_is_dying(tmp)) continue; - if (!atomic_inc_not_zero(&tmp->ct_general.use)) + if (!refcount_inc_not_zero(&tmp->ct_general.use)) continue; /* kill only if still in same netns -- might have moved due to @@ -1513,7 +1513,7 @@ static void gc_worker(struct work_struct *work) continue; /* need to take reference to avoid possible races */ - if (!atomic_inc_not_zero(&tmp->ct_general.use)) + if (!refcount_inc_not_zero(&tmp->ct_general.use)) continue; if (gc_worker_skip_ct(tmp)) { @@ -1622,7 +1622,7 @@ __nf_conntrack_alloc(struct net *net, /* Because we use RCU lookups, we set ct_general.use to zero before * this is inserted in any list. */ - atomic_set(&ct->ct_general.use, 0); + refcount_set(&ct->ct_general.use, 0); return ct; out: atomic_dec(&cnet->count); @@ -1647,7 +1647,7 @@ void nf_conntrack_free(struct nf_conn *ct) /* A freed object has refcnt == 0, that's * the golden rule for SLAB_TYPESAFE_BY_RCU */ - WARN_ON(atomic_read(&ct->ct_general.use) != 0); + WARN_ON(refcount_read(&ct->ct_general.use) != 0); nf_ct_ext_destroy(ct); kmem_cache_free(nf_conntrack_cachep, ct); @@ -1739,8 +1739,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, if (!exp) __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); - /* Now it is inserted into the unconfirmed list, bump refcount */ - nf_conntrack_get(&ct->ct_general); + /* Now it is inserted into the unconfirmed list, set refcount to 1. */ + refcount_set(&ct->ct_general.use, 1); nf_ct_add_to_unconfirmed_list(ct); local_bh_enable(); @@ -2352,7 +2352,7 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), return NULL; found: - atomic_inc(&ct->ct_general.use); + refcount_inc(&ct->ct_general.use); spin_unlock(lockp); local_bh_enable(); return ct; diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index f562eeef4234..6d056ebba57c 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -203,12 +203,12 @@ nf_ct_find_expectation(struct net *net, * about to invoke ->destroy(), or nf_ct_delete() via timeout * or early_drop(). * - * The atomic_inc_not_zero() check tells: If that fails, we + * The refcount_inc_not_zero() check tells: If that fails, we * know that the ct is being destroyed. If it succeeds, we * can be sure the ct cannot disappear underneath. */ if (unlikely(nf_ct_is_dying(exp->master) || - !atomic_inc_not_zero(&exp->master->ct_general.use))) + !refcount_inc_not_zero(&exp->master->ct_general.use))) return NULL; if (exp->flags & NF_CT_EXPECT_PERMANENT) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 1c02be04aaf5..ef0a78aa9ba9 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -508,7 +508,7 @@ nla_put_failure: static int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) { - if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use)))) + if (nla_put_be32(skb, CTA_USE, htonl(refcount_read(&ct->ct_general.use)))) goto nla_put_failure; return 0; @@ -1200,7 +1200,7 @@ restart: ct = nf_ct_tuplehash_to_ctrack(h); if (nf_ct_is_expired(ct)) { if (i < ARRAY_SIZE(nf_ct_evict) && - atomic_inc_not_zero(&ct->ct_general.use)) + refcount_inc_not_zero(&ct->ct_general.use)) nf_ct_evict[i++] = ct; continue; } @@ -1748,7 +1748,7 @@ restart: NFNL_MSG_TYPE(cb->nlh->nlmsg_type), ct, dying ? 
true : false, 0); if (res < 0) { - if (!atomic_inc_not_zero(&ct->ct_general.use)) + if (!refcount_inc_not_zero(&ct->ct_general.use)) continue; cb->args[0] = cpu; cb->args[1] = (unsigned long)ct; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 80f675d884b2..3e1afd10a9b6 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -303,7 +303,7 @@ static int ct_seq_show(struct seq_file *s, void *v) int ret = 0; WARN_ON(!ct); - if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + if (unlikely(!refcount_inc_not_zero(&ct->ct_general.use))) return 0; if (nf_ct_should_gc(ct)) { @@ -370,7 +370,7 @@ static int ct_seq_show(struct seq_file *s, void *v) ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR); ct_show_delta_time(s, ct); - seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)); + seq_printf(s, "use=%u\n", refcount_read(&ct->ct_general.use)); if (seq_has_overflowed(s)) goto release; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index ed37bb9b4e58..b90eca7a2f22 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -48,7 +48,7 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct) struct flow_offload *flow; if (unlikely(nf_ct_is_dying(ct) || - !atomic_inc_not_zero(&ct->ct_general.use))) + !refcount_inc_not_zero(&ct->ct_general.use))) return NULL; flow = kzalloc(sizeof(*flow), GFP_ATOMIC); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 3d6d49420db8..2dfc5dae0656 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -349,7 +349,6 @@ static int __net_init synproxy_net_init(struct net *net) goto err2; __set_bit(IPS_CONFIRMED_BIT, &ct->status); - nf_conntrack_get(&ct->ct_general); snet->tmpl = ct; snet->stats = alloc_percpu(struct synproxy_stats); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 54ecb9fbf2de..ee69c692056f 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -259,7 +259,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, ct = this_cpu_read(nft_ct_pcpu_template); - if (likely(atomic_read(&ct->ct_general.use) == 1)) { + if (likely(refcount_read(&ct->ct_general.use) == 1)) { nf_ct_zone_add(ct, &zone); } else { /* previous skb got queued to userspace */ @@ -270,7 +270,6 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, } } - atomic_inc(&ct->ct_general.use); nf_ct_set(skb, ct, IP_CT_NEW); } #endif @@ -375,7 +374,6 @@ static bool nft_ct_tmpl_alloc_pcpu(void) return false; } - atomic_set(&tmp->ct_general.use, 1); per_cpu(nft_ct_pcpu_template, cpu) = tmp; } diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 0a913ce07425..267757b0392a 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -24,7 +24,7 @@ static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct) return XT_CONTINUE; if (ct) { - atomic_inc(&ct->ct_general.use); + refcount_inc(&ct->ct_general.use); nf_ct_set(skb, ct, IP_CT_NEW); } else { nf_ct_set(skb, ct, IP_CT_UNTRACKED); @@ -201,7 +201,6 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, goto err4; } __set_bit(IPS_CONFIRMED_BIT, &ct->status); - nf_conntrack_get(&ct->ct_general); out: info->ct = ct; return 0; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index f2b64cab9af7..815916056e0d 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1722,7 +1722,6 @@ int 
ovs_ct_copy_action(struct net *net, const struct nlattr *attr, goto err_free_ct; __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); - nf_conntrack_get(&ct_info.ct->ct_general); return 0; err_free_ct: __ovs_ct_free_action(&ct_info); diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 553bf41671a6..f4fd584fba08 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1232,7 +1232,6 @@ static int tcf_ct_fill_params(struct net *net, return -ENOMEM; } __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); - nf_conntrack_get(&tmpl->ct_general); p->tmpl = tmpl; return 0; -- cgit v1.2.3 From 67e4860eeed86a1eec0a86467723f95cbd785076 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:25 +0100 Subject: netfilter: conntrack: avoid useless indirection during conntrack destruction commit 6ae7989c9af0d98ab64196f4f4c6f6499454bd23 upstream. nf_ct_put() results in a useless indirection: nf_ct_put -> nf_conntrack_put -> nf_conntrack_destroy -> rcu readlock + indirect call of ct_hooks->destroy(). There are two _put helpers: nf_ct_put and nf_conntrack_put. The latter is what should be used in code that MUST NOT cause a linker dependency on the conntrack module (e.g. calls from core network stack). Everyone else should call nf_ct_put() instead. A followup patch will convert a few nf_conntrack_put() calls to nf_ct_put(), in particular from modules that already have a conntrack dependency such as act_ct or even nf_conntrack itself. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- include/linux/netfilter/nf_conntrack_common.h | 2 ++ include/net/netfilter/nf_conntrack.h | 8 ++++++-- net/netfilter/nf_conntrack_core.c | 12 ++++++------ 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index a03f7a80b9ab..2770db2fa080 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -29,6 +29,8 @@ struct nf_conntrack { }; void nf_conntrack_destroy(struct nf_conntrack *nfct); + +/* like nf_ct_put, but without module dependency on nf_conntrack */ static inline void nf_conntrack_put(struct nf_conntrack *nfct) { if (nfct && refcount_dec_and_test(&nfct->use)) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index d24b0a34c8f0..34c266502a50 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -76,6 +76,8 @@ struct nf_conn { * Hint, SKB address this struct and refcnt via skb->_nfct and * helpers nf_conntrack_get() and nf_conntrack_put(). * Helper nf_ct_put() equals nf_conntrack_put() by dec refcnt, + * except that the latter uses internal indirection and does not + * result in a conntrack module dependency. * beware nf_ct_get() is different and don't inc refcnt.
*/ struct nf_conntrack ct_general; @@ -169,11 +171,13 @@ nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo) return (struct nf_conn *)(nfct & NFCT_PTRMASK); } +void nf_ct_destroy(struct nf_conntrack *nfct); + /* decrement reference count on a conntrack */ static inline void nf_ct_put(struct nf_conn *ct) { - WARN_ON(!ct); - nf_conntrack_put(&ct->ct_general); + if (ct && refcount_dec_and_test(&ct->ct_general.use)) + nf_ct_destroy(&ct->ct_general); } /* Protocol module loading */ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 6d7840b8457b..31399c53dfb1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -571,7 +571,7 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK) -/* Released via destroy_conntrack() */ +/* Released via nf_ct_destroy() */ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, const struct nf_conntrack_zone *zone, gfp_t flags) @@ -625,12 +625,11 @@ static void destroy_gre_conntrack(struct nf_conn *ct) #endif } -static void -destroy_conntrack(struct nf_conntrack *nfct) +void nf_ct_destroy(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; - pr_debug("destroy_conntrack(%p)\n", ct); + pr_debug("%s(%p)\n", __func__, ct); WARN_ON(refcount_read(&nfct->use) != 0); if (unlikely(nf_ct_is_template(ct))) { @@ -656,9 +655,10 @@ destroy_conntrack(struct nf_conntrack *nfct) if (ct->master) nf_ct_put(ct->master); - pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct); + pr_debug("%s: returning ct=%p to slab\n", __func__, ct); nf_conntrack_free(ct); } +EXPORT_SYMBOL(nf_ct_destroy); static void nf_ct_delete_from_lists(struct nf_conn *ct) { @@ -2825,7 +2825,7 @@ err_cachep: static struct nf_ct_hook nf_conntrack_hook = { .update = nf_conntrack_update, - .destroy = destroy_conntrack, + .destroy = nf_ct_destroy, .get_tuple_skb = nf_conntrack_get_tuple_skb, }; -- cgit v1.2.3
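To illustrate the indirection this last patch removes, here is a small self-contained C sketch (hypothetical names, not the kernel API). The indirect put has to reach the destructor through a hook table, which is what keeps core code free of a link-time dependency on the conntrack module; the direct put, usable by code that already depends on conntrack, calls the destructor straight away:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct conn {
	atomic_int use;
};

static void conn_destroy(struct conn *c)
{
	printf("destroying %p\n", (void *)c);
	free(c);
}

/* Hook table standing in for ct_hooks; core code sees only the pointer. */
struct conn_hooks {
	void (*destroy)(struct conn *);
};
static struct conn_hooks registered_hooks = { .destroy = conn_destroy };

/* Analogue of nf_conntrack_put(): indirect call, no link-time dependency. */
static void conn_put_indirect(struct conn *c)
{
	if (c && atomic_fetch_sub(&c->use, 1) == 1)
		registered_hooks.destroy(c);
}

/* Analogue of the reworked nf_ct_put(): calls the destructor directly. */
static void conn_put_direct(struct conn *c)
{
	if (c && atomic_fetch_sub(&c->use, 1) == 1)
		conn_destroy(c);
}

int main(void)
{
	struct conn *a = malloc(sizeof(*a));
	struct conn *b = malloc(sizeof(*b));

	if (!a || !b) {
		free(a);
		free(b);
		return 1;
	}
	atomic_init(&a->use, 1);
	atomic_init(&b->use, 1);
	conn_put_indirect(a);	/* last put through the hook table */
	conn_put_direct(b);	/* last put, direct call */
	return 0;
}

Both paths drop the last reference and destroy the object; the difference is purely whether the call can be resolved at link time, which is also why the real nf_ct_destroy() is exported while the hook-based path remains for core callers.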