From 510c321b557121861601f9d259aadd65aa274f35 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Feb 2018 19:06:02 +0800 Subject: xfrm: reuse uncached_list to track xdsts In early time, when freeing a xdst, it would be inserted into dst_garbage.list first. Then if it's refcnt was still held somewhere, later it would be put into dst_busy_list in dst_gc_task(). When one dev was being unregistered, the dev of these dsts in dst_busy_list would be set with loopback_dev and put this dev. So that this dev's removal wouldn't get blocked, and avoid the kmsg warning: kernel:unregister_netdevice: waiting for veth0 to become \ free. Usage count = 2 However after Commit 52df157f17e5 ("xfrm: take refcnt of dst when creating struct xfrm_dst bundle"), the xdst will not be freed with dst gc, and this warning happens. To fix it, we need to find these xdsts that are still held by others when removing the dev, and free xdst's dev and set it with loopback_dev. But unfortunately after flow_cache for xfrm was deleted, no list tracks them anymore. So we need to save these xdsts somewhere to release the xdst's dev later. To make this easier, this patch is to reuse uncached_list to track xdsts, so that the dev refcnt can be released in the event NETDEV_UNREGISTER process of fib_netdev_notifier. Thanks to Florian, we could move forward this fix quickly. Fixes: 52df157f17e5 ("xfrm: take refcnt of dst when creating struct xfrm_dst bundle") Reported-by: Jianlin Shi Reported-by: Hangbin Liu Tested-by: Eyal Birger Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- include/net/ip6_route.h | 3 +++ include/net/route.h | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 27d23a65f3cd..ac0866bb9e93 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -179,6 +179,9 @@ void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); void rt6_multipath_rebalance(struct rt6_info *rt); +void rt6_uncached_list_add(struct rt6_info *rt); +void rt6_uncached_list_del(struct rt6_info *rt); + static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { const struct dst_entry *dst = skb_dst(skb); diff --git a/include/net/route.h b/include/net/route.h index 1eb9ce470e25..40b870d58f38 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -227,6 +227,9 @@ struct in_ifaddr; void fib_add_ifaddr(struct in_ifaddr *); void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *); +void rt_add_uncached_list(struct rtable *rt); +void rt_del_uncached_list(struct rtable *rt); + static inline void ip_rt_put(struct rtable *rt) { /* dst_release() accepts a NULL parameter. -- cgit v1.2.3 From 28b0f8a6962a24ed21737578f3b1b07424635c9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Feb 2018 07:38:08 -0800 Subject: tty: make n_tty_read() always abort if hangup is in progress A tty is hung up by __tty_hangup() setting file->f_op to hung_up_tty_fops, which is skipped on ttys whose write operation isn't tty_write(). This means that, for example, /dev/console whose write op is redirected_tty_write() is never actually marked hung up. Because n_tty_read() uses the hung up status to decide whether to abort the waiting readers, the lack of hung-up marking can lead to the following scenario. 1. A session contains two processes. The leader and its child. The child ignores SIGHUP. 2. The leader exits and starts disassociating from the controlling terminal (/dev/console). 3. __tty_hangup() skips setting f_op to hung_up_tty_fops. 4. SIGHUP is delivered and ignored. 5. tty_ldisc_hangup() is invoked. It wakes up the waits which should clear the read lockers of tty->ldisc_sem. 6. The reader wakes up but because tty_hung_up_p() is false, it doesn't abort and goes back to sleep while read-holding tty->ldisc_sem. 7. The leader progresses to tty_ldisc_lock() in tty_ldisc_hangup() and is now stuck in D sleep indefinitely waiting for tty->ldisc_sem. The following is Alan's explanation on why some ttys aren't hung up. http://lkml.kernel.org/r/20171101170908.6ad08580@alans-desktop 1. It broke the serial consoles because they would hang up and close down the hardware. With tty_port that *should* be fixable properly for any cases remaining. 2. The console layer was (and still is) completely broken and doens't refcount properly. So if you turn on console hangups it breaks (as indeed does freeing consoles and half a dozen other things). As neither can be fixed quickly, this patch works around the problem by introducing a new flag, TTY_HUPPING, which is used solely to tell n_tty_read() that hang-up is in progress for the console and the readers should be aborted regardless of the hung-up status of the device. The following is a sample hung task warning caused by this issue. INFO: task agetty:2662 blocked for more than 120 seconds. Not tainted 4.11.3-dbg-tty-lockup-02478-gfd6c7ee-dirty #28 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 0 2662 1 0x00000086 Call Trace: __schedule+0x267/0x890 schedule+0x36/0x80 schedule_timeout+0x23c/0x2e0 ldsem_down_write+0xce/0x1f6 tty_ldisc_lock+0x16/0x30 tty_ldisc_hangup+0xb3/0x1b0 __tty_hangup+0x300/0x410 disassociate_ctty+0x6c/0x290 do_exit+0x7ef/0xb00 do_group_exit+0x3f/0xa0 get_signal+0x1b3/0x5d0 do_signal+0x28/0x660 exit_to_usermode_loop+0x46/0x86 do_syscall_64+0x9c/0xb0 entry_SYSCALL64_slow_path+0x25/0x25 The following is the repro. Run "$PROG /dev/console". The parent process hangs in D state. #include #include #include #include #include #include #include #include #include #include #include #include int main(int argc, char **argv) { struct sigaction sact = { .sa_handler = SIG_IGN }; struct timespec ts1s = { .tv_sec = 1 }; pid_t pid; int fd; if (argc < 2) { fprintf(stderr, "test-hung-tty /dev/$TTY\n"); return 1; } /* fork a child to ensure that it isn't already the session leader */ pid = fork(); if (pid < 0) { perror("fork"); return 1; } if (pid > 0) { /* top parent, wait for everyone */ while (waitpid(-1, NULL, 0) >= 0) ; if (errno != ECHILD) perror("waitpid"); return 0; } /* new session, start a new session and set the controlling tty */ if (setsid() < 0) { perror("setsid"); return 1; } fd = open(argv[1], O_RDWR); if (fd < 0) { perror("open"); return 1; } if (ioctl(fd, TIOCSCTTY, 1) < 0) { perror("ioctl"); return 1; } /* fork a child, sleep a bit and exit */ pid = fork(); if (pid < 0) { perror("fork"); return 1; } if (pid > 0) { nanosleep(&ts1s, NULL); printf("Session leader exiting\n"); exit(0); } /* * The child ignores SIGHUP and keeps reading from the controlling * tty. Because SIGHUP is ignored, the child doesn't get killed on * parent exit and the bug in n_tty makes the read(2) block the * parent's control terminal hangup attempt. The parent ends up in * D sleep until the child is explicitly killed. */ sigaction(SIGHUP, &sact, NULL); printf("Child reading tty\n"); while (1) { char buf[1024]; if (read(fd, buf, sizeof(buf)) < 0) { perror("read"); return 1; } } return 0; } Signed-off-by: Tejun Heo Cc: Alan Cox Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_tty.c | 6 ++++++ drivers/tty/tty_io.c | 9 +++++++++ include/linux/tty.h | 1 + 3 files changed, 16 insertions(+) (limited to 'include') diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 5c0e59e8fe46..cbe98bc2b998 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -2180,6 +2180,12 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file, } if (tty_hung_up_p(file)) break; + /* + * Abort readers for ttys which never actually + * get hung up. See __tty_hangup(). + */ + if (test_bit(TTY_HUPPING, &tty->flags)) + break; if (!timeout) break; if (file->f_flags & O_NONBLOCK) { diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index eb9133b472f4..63114ea35ec1 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -586,6 +586,14 @@ static void __tty_hangup(struct tty_struct *tty, int exit_session) return; } + /* + * Some console devices aren't actually hung up for technical and + * historical reasons, which can lead to indefinite interruptible + * sleep in n_tty_read(). The following explicitly tells + * n_tty_read() to abort readers. + */ + set_bit(TTY_HUPPING, &tty->flags); + /* inuse_filps is protected by the single tty lock, this really needs to change if we want to flush the workqueue with the lock held */ @@ -640,6 +648,7 @@ static void __tty_hangup(struct tty_struct *tty, int exit_session) * from the ldisc side, which is now guaranteed. */ set_bit(TTY_HUPPED, &tty->flags); + clear_bit(TTY_HUPPING, &tty->flags); tty_unlock(tty); if (f) diff --git a/include/linux/tty.h b/include/linux/tty.h index 0a6c71e0ad01..47f8af22f216 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -364,6 +364,7 @@ struct tty_file_private { #define TTY_PTY_LOCK 16 /* pty private */ #define TTY_NO_WRITE_SPLIT 17 /* Preserve write boundaries to driver */ #define TTY_HUPPED 18 /* Post driver->hangup() */ +#define TTY_HUPPING 19 /* Hangup in progress */ #define TTY_LDISC_HALTED 22 /* Line discipline is halted */ /* Values for tty->flow_change */ -- cgit v1.2.3 From d02f51cbcf12b09ab945873e35046045875eed9a Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Sat, 3 Mar 2018 03:03:46 +0100 Subject: bpf: fix bpf_skb_adjust_net/bpf_skb_proto_xlat to deal with gso sctp skbs SCTP GSO skbs have a gso_size of GSO_BY_FRAGS, so any sort of unconditionally mangling of that will result in nonsense value and would corrupt the skb later on. Therefore, i) add two helpers skb_increase_gso_size() and skb_decrease_gso_size() that would throw a one time warning and bail out for such skbs and ii) refuse and return early with an error in those BPF helpers that are affected. We do need to bail out as early as possible from there before any changes on the skb have been performed. Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper") Co-authored-by: Daniel Borkmann Signed-off-by: Daniel Axtens Cc: Marcelo Ricardo Leitner Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- Documentation/networking/segmentation-offloads.txt | 11 +++- include/linux/skbuff.h | 22 ++++++++ net/core/filter.c | 60 +++++++++++++++------- 3 files changed, 73 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/Documentation/networking/segmentation-offloads.txt b/Documentation/networking/segmentation-offloads.txt index d47480b61ac6..23a8dd91a9ec 100644 --- a/Documentation/networking/segmentation-offloads.txt +++ b/Documentation/networking/segmentation-offloads.txt @@ -153,8 +153,15 @@ To signal this, gso_size is set to the special value GSO_BY_FRAGS. Therefore, any code in the core networking stack must be aware of the possibility that gso_size will be GSO_BY_FRAGS and handle that case -appropriately. (For size checks, the skb_gso_validate_*_len family of -helpers do this automatically.) +appropriately. + +There are a couple of helpers to make this easier: + + - For size checks, the skb_gso_validate_*_len family of helpers correctly + considers GSO_BY_FRAGS. + + - For manipulating packets, skb_increase_gso_size and skb_decrease_gso_size + will check for GSO_BY_FRAGS and WARN if asked to manipulate these skbs. This also affects drivers with the NETIF_F_FRAGLIST & NETIF_F_GSO_SCTP bits set. Note also that NETIF_F_GSO_SCTP is included in NETIF_F_GSO_SOFTWARE. diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c1e66bdcf583..8c67c33f40c9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4038,6 +4038,12 @@ static inline bool skb_is_gso_v6(const struct sk_buff *skb) return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6; } +/* Note: Should be called only if skb_is_gso(skb) is true */ +static inline bool skb_is_gso_sctp(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP; +} + static inline void skb_gso_reset(struct sk_buff *skb) { skb_shinfo(skb)->gso_size = 0; @@ -4045,6 +4051,22 @@ static inline void skb_gso_reset(struct sk_buff *skb) skb_shinfo(skb)->gso_type = 0; } +static inline void skb_increase_gso_size(struct skb_shared_info *shinfo, + u16 increment) +{ + if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS)) + return; + shinfo->gso_size += increment; +} + +static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo, + u16 decrement) +{ + if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS)) + return; + shinfo->gso_size -= decrement; +} + void __skb_warn_lro_forwarding(const struct sk_buff *skb); static inline bool skb_warn_if_lro(const struct sk_buff *skb) diff --git a/net/core/filter.c b/net/core/filter.c index 0c121adbdbaa..48aa7c7320db 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2087,6 +2087,10 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; + /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ + if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + return -ENOTSUPP; + ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; @@ -2096,19 +2100,21 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) return ret; if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + /* SKB_GSO_TCPV4 needs to be changed into * SKB_GSO_TCPV6. */ - if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { - skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4; - skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; + if (shinfo->gso_type & SKB_GSO_TCPV4) { + shinfo->gso_type &= ~SKB_GSO_TCPV4; + shinfo->gso_type |= SKB_GSO_TCPV6; } /* Due to IPv6 header, MSS needs to be downgraded. */ - skb_shinfo(skb)->gso_size -= len_diff; + skb_decrease_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; + shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_segs = 0; } skb->protocol = htons(ETH_P_IPV6); @@ -2123,6 +2129,10 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; + /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ + if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + return -ENOTSUPP; + ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; @@ -2132,19 +2142,21 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) return ret; if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + /* SKB_GSO_TCPV6 needs to be changed into * SKB_GSO_TCPV4. */ - if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) { - skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6; - skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; + if (shinfo->gso_type & SKB_GSO_TCPV6) { + shinfo->gso_type &= ~SKB_GSO_TCPV6; + shinfo->gso_type |= SKB_GSO_TCPV4; } /* Due to IPv4 header, MSS can be upgraded. */ - skb_shinfo(skb)->gso_size += len_diff; + skb_increase_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; + shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_segs = 0; } skb->protocol = htons(ETH_P_IP); @@ -2243,6 +2255,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; + /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ + if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + return -ENOTSUPP; + ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; @@ -2252,11 +2268,13 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) return ret; if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + /* Due to header grow, MSS needs to be downgraded. */ - skb_shinfo(skb)->gso_size -= len_diff; + skb_decrease_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; + shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_segs = 0; } return 0; @@ -2267,6 +2285,10 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; + /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ + if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + return -ENOTSUPP; + ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; @@ -2276,11 +2298,13 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) return ret; if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + /* Due to header shrink, MSS can be upgraded. */ - skb_shinfo(skb)->gso_size += len_diff; + skb_increase_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; + shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_segs = 0; } return 0; -- cgit v1.2.3 From 2fe4c22c53fc2e3f35be2cd0033cb3d15ebd41b1 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 27 Feb 2018 08:03:56 -0500 Subject: media: rc: lirc does not use LIRC_CAN_SEND_SCANCODE feature Since commit 02d742f4b209 ("media: lirc: lirc daemon fails to detect raw IR device"), the feature LIRC_CAN_SEND_SCANCODE is no longer used as it tripped up lircd. The ability to send scancodes for IR Tx is implied by LIRC_CAN_SEND_PULSE (i.e. any device that can send can use IR Tx encoders). So, remove LIRC_CAN_SEND_SCANCODE since it never used. This fixes: Documentation/output/lirc.h.rst:6: WARNING: undefined label: lirc-can-send-scancode (if the link has no caption the label must precede a section header As this flag was added for kernel 4.16, let's remove it, while not too late. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/lirc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h index 4fe580d36e41..f5bf06ecd87d 100644 --- a/include/uapi/linux/lirc.h +++ b/include/uapi/linux/lirc.h @@ -54,7 +54,6 @@ #define LIRC_CAN_SEND_RAW LIRC_MODE2SEND(LIRC_MODE_RAW) #define LIRC_CAN_SEND_PULSE LIRC_MODE2SEND(LIRC_MODE_PULSE) #define LIRC_CAN_SEND_MODE2 LIRC_MODE2SEND(LIRC_MODE_MODE2) -#define LIRC_CAN_SEND_SCANCODE LIRC_MODE2SEND(LIRC_MODE_SCANCODE) #define LIRC_CAN_SEND_LIRCCODE LIRC_MODE2SEND(LIRC_MODE_LIRCCODE) #define LIRC_CAN_SEND_MASK 0x0000003f -- cgit v1.2.3 From cb88a0588717ba6c756cb5972d75766b273a6817 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 6 Mar 2018 09:38:49 +0100 Subject: usb: quirks: add control message delay for 1b1c:1b20 Corsair Strafe RGB keyboard does not respond to usb control messages sometimes and hence generates timeouts. Commit de3af5bf259d ("usb: quirks: add delay init quirk for Corsair Strafe RGB keyboard") tried to fix those timeouts by adding USB_QUIRK_DELAY_INIT. Unfortunately, even with this quirk timeouts of usb_control_msg() can still be seen, but with a lower frequency (approx. 1 out of 15): [ 29.103520] usb 1-8: string descriptor 0 read error: -110 [ 34.363097] usb 1-8: can't set config #1, error -110 Adding further delays to different locations where usb control messages are issued just moves the timeouts to other locations, e.g.: [ 35.400533] usbhid 1-8:1.0: can't add hid device: -110 [ 35.401014] usbhid: probe of 1-8:1.0 failed with error -110 The only way to reliably avoid those issues is having a pause after each usb control message. In approx. 200 boot cycles no more timeouts were seen. Addionaly, keep USB_QUIRK_DELAY_INIT as it turned out to be necessary to have the delay in hub_port_connect() after hub_port_init(). The overall boot time seems not to be influenced by these additional delays, even on fast machines and lightweight distributions. Fixes: de3af5bf259d ("usb: quirks: add delay init quirk for Corsair Strafe RGB keyboard") Cc: stable@vger.kernel.org Signed-off-by: Danilo Krummrich Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/message.c | 4 ++++ drivers/usb/core/quirks.c | 3 ++- include/linux/usb/quirks.h | 3 +++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index c64cf6c4a83d..0c11d40a12bc 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -151,6 +151,10 @@ int usb_control_msg(struct usb_device *dev, unsigned int pipe, __u8 request, ret = usb_internal_control_msg(dev, pipe, dr, data, size, timeout); + /* Linger a bit, prior to the next control message. */ + if (dev->quirks & USB_QUIRK_DELAY_CTRL_MSG) + msleep(200); + kfree(dr); return ret; diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index f4a548471f0f..54b019e267c5 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -230,7 +230,8 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x1b1c, 0x1b13), .driver_info = USB_QUIRK_DELAY_INIT }, /* Corsair Strafe RGB */ - { USB_DEVICE(0x1b1c, 0x1b20), .driver_info = USB_QUIRK_DELAY_INIT }, + { USB_DEVICE(0x1b1c, 0x1b20), .driver_info = USB_QUIRK_DELAY_INIT | + USB_QUIRK_DELAY_CTRL_MSG }, /* Corsair K70 LUX */ { USB_DEVICE(0x1b1c, 0x1b36), .driver_info = USB_QUIRK_DELAY_INIT }, diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index f1fcec2fd5f8..b7a99ce56bc9 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -63,4 +63,7 @@ */ #define USB_QUIRK_DISCONNECT_SUSPEND BIT(12) +/* Device needs a pause after every control message. */ +#define USB_QUIRK_DELAY_CTRL_MSG BIT(13) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3 From d3dcf8eb615537526bd42ff27a081d46d337816e Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Sun, 4 Mar 2018 17:29:48 +0200 Subject: rhashtable: Fix rhlist duplicates insertion When inserting duplicate objects (those with the same key), current rhlist implementation messes up the chain pointers by updating the bucket pointer instead of prev next pointer to the newly inserted node. This causes missing elements on removal and travesal. Fix that by properly updating pprev pointer to point to the correct rhash_head next pointer. Issue: 1241076 Change-Id: I86b2c140bcb4aeb10b70a72a267ff590bb2b17e7 Fixes: ca26893f05e8 ('rhashtable: Add rhlist interface') Signed-off-by: Paul Blakey Acked-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 4 +++- lib/rhashtable.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index c9df2527e0cd..668a21f04b09 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -766,8 +766,10 @@ slow_path: if (!key || (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, head)) : - rhashtable_compare(&arg, rht_obj(ht, head)))) + rhashtable_compare(&arg, rht_obj(ht, head)))) { + pprev = &head->next; continue; + } data = rht_obj(ht, head); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 3825c30aaa36..47de025b6245 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -506,8 +506,10 @@ static void *rhashtable_lookup_one(struct rhashtable *ht, if (!key || (ht->p.obj_cmpfn ? ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : - rhashtable_compare(&arg, rht_obj(ht, head)))) + rhashtable_compare(&arg, rht_obj(ht, head)))) { + pprev = &head->next; continue; + } if (!ht->rhlist) return rht_obj(ht, head); -- cgit v1.2.3 From 35d889d10b649fda66121891ec05eca88150059d Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Mon, 5 Mar 2018 20:52:54 +0300 Subject: sch_netem: fix skb leak in netem_enqueue() When we exceed current packets limit and we have more than one segment in the list returned by skb_gso_segment(), netem drops only the first one, skipping the rest, hence kmemleak reports: unreferenced object 0xffff880b5d23b600 (size 1024): comm "softirq", pid 0, jiffies 4384527763 (age 2770.629s) hex dump (first 32 bytes): 00 80 23 5d 0b 88 ff ff 00 00 00 00 00 00 00 00 ..#]............ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000d8a19b9d>] __alloc_skb+0xc9/0x520 [<000000001709b32f>] skb_segment+0x8c8/0x3710 [<00000000c7b9bb88>] tcp_gso_segment+0x331/0x1830 [<00000000c921cba1>] inet_gso_segment+0x476/0x1370 [<000000008b762dd4>] skb_mac_gso_segment+0x1f9/0x510 [<000000002182660a>] __skb_gso_segment+0x1dd/0x620 [<00000000412651b9>] netem_enqueue+0x1536/0x2590 [sch_netem] [<0000000005d3b2a9>] __dev_queue_xmit+0x1167/0x2120 [<00000000fc5f7327>] ip_finish_output2+0x998/0xf00 [<00000000d309e9d3>] ip_output+0x1aa/0x2c0 [<000000007ecbd3a4>] tcp_transmit_skb+0x18db/0x3670 [<0000000042d2a45f>] tcp_write_xmit+0x4d4/0x58c0 [<0000000056a44199>] tcp_tasklet_func+0x3d9/0x540 [<0000000013d06d02>] tasklet_action+0x1ca/0x250 [<00000000fcde0b8b>] __do_softirq+0x1b4/0x5a3 [<00000000e7ed027c>] irq_exit+0x1e2/0x210 Fix it by adding the rest of the segments, if any, to skb 'to_free' list. Add new __qdisc_drop_all() and qdisc_drop_all() functions because they can be useful in the future if we need to drop segmented GSO packets in other places. Fixes: 6071bd1aa13e ("netem: Segment GSO packets on enqueue") Signed-off-by: Alexey Kodanev Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sch_generic.h | 19 +++++++++++++++++++ net/sched/sch_netem.c | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e2ab13687fb9..2092d33194dd 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -824,6 +824,16 @@ static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free) *to_free = skb; } +static inline void __qdisc_drop_all(struct sk_buff *skb, + struct sk_buff **to_free) +{ + if (skb->prev) + skb->prev->next = *to_free; + else + skb->next = *to_free; + *to_free = skb; +} + static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch, struct qdisc_skb_head *qh, struct sk_buff **to_free) @@ -956,6 +966,15 @@ static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_DROP; } +static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + __qdisc_drop_all(skb, to_free); + qdisc_qstats_drop(sch); + + return NET_XMIT_DROP; +} + /* Length to Time (L2T) lookup in a qdisc_rate_table, to determine how long it will take to send a packet given its size. */ diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 7c179addebcd..7d6801fc5340 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -509,7 +509,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, } if (unlikely(sch->q.qlen >= sch->limit)) - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_all(skb, sch, to_free); qdisc_qstats_backlog_inc(sch, skb); -- cgit v1.2.3 From 2695578b896aea472b2c0dcbe9d92daa71738484 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Mar 2018 11:41:13 -0800 Subject: net: usbnet: fix potential deadlock on 32bit hosts Marek reported a LOCKDEP issue occurring on 32bit host, that we tracked down to the fact that usbnet could either run from soft or hard irqs. This patch adds u64_stats_update_begin_irqsave() and u64_stats_update_end_irqrestore() helpers to solve this case. [ 17.768040] ================================ [ 17.772239] WARNING: inconsistent lock state [ 17.776511] 4.16.0-rc3-next-20180227-00007-g876c53a7493c #453 Not tainted [ 17.783329] -------------------------------- [ 17.787580] inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. [ 17.793607] swapper/0/0 [HC0[0]:SC1[1]:HE1:SE0] takes: [ 17.798751] (&syncp->seq#5){?.-.}, at: [<9b22e5f0>] asix_rx_fixup_internal+0x188/0x288 [ 17.806790] {IN-HARDIRQ-W} state was registered at: [ 17.811677] tx_complete+0x100/0x208 [ 17.815319] __usb_hcd_giveback_urb+0x60/0xf0 [ 17.819770] xhci_giveback_urb_in_irq+0xa8/0x240 [ 17.824469] xhci_td_cleanup+0xf4/0x16c [ 17.828367] xhci_irq+0xe74/0x2240 [ 17.831827] usb_hcd_irq+0x24/0x38 [ 17.835343] __handle_irq_event_percpu+0x98/0x510 [ 17.840111] handle_irq_event_percpu+0x1c/0x58 [ 17.844623] handle_irq_event+0x38/0x5c [ 17.848519] handle_fasteoi_irq+0xa4/0x138 [ 17.852681] generic_handle_irq+0x18/0x28 [ 17.856760] __handle_domain_irq+0x6c/0xe4 [ 17.860941] gic_handle_irq+0x54/0xa0 [ 17.864666] __irq_svc+0x70/0xb0 [ 17.867964] arch_cpu_idle+0x20/0x3c [ 17.871578] arch_cpu_idle+0x20/0x3c [ 17.875190] do_idle+0x144/0x218 [ 17.878468] cpu_startup_entry+0x18/0x1c [ 17.882454] start_kernel+0x394/0x400 [ 17.886177] irq event stamp: 161912 [ 17.889616] hardirqs last enabled at (161912): [<7bedfacf>] __netdev_alloc_skb+0xcc/0x140 [ 17.897893] hardirqs last disabled at (161911): [] __netdev_alloc_skb+0x94/0x140 [ 17.904903] exynos5-hsi2c 12ca0000.i2c: tx timeout [ 17.906116] softirqs last enabled at (161904): [<387102ff>] irq_enter+0x78/0x80 [ 17.906123] softirqs last disabled at (161905): [] irq_exit+0x134/0x158 [ 17.925722]. [ 17.925722] other info that might help us debug this: [ 17.933435] Possible unsafe locking scenario: [ 17.933435]. [ 17.940331] CPU0 [ 17.942488] ---- [ 17.944894] lock(&syncp->seq#5); [ 17.948274] [ 17.950847] lock(&syncp->seq#5); [ 17.954386]. [ 17.954386] *** DEADLOCK *** [ 17.954386]. [ 17.962422] no locks held by swapper/0/0. Fixes: c8b5d129ee29 ("net: usbnet: support 64bit stats") Signed-off-by: Eric Dumazet Reported-by: Marek Szyprowski Cc: Greg Ungerer Signed-off-by: David S. Miller --- drivers/net/usb/usbnet.c | 10 ++++++---- include/linux/u64_stats_sync.h | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 8a22ff67b026..d9eea8cfe6cb 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -315,6 +315,7 @@ static void __usbnet_status_stop_force(struct usbnet *dev) void usbnet_skb_return (struct usbnet *dev, struct sk_buff *skb) { struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->stats64); + unsigned long flags; int status; if (test_bit(EVENT_RX_PAUSED, &dev->flags)) { @@ -326,10 +327,10 @@ void usbnet_skb_return (struct usbnet *dev, struct sk_buff *skb) if (skb->protocol == 0) skb->protocol = eth_type_trans (skb, dev->net); - u64_stats_update_begin(&stats64->syncp); + flags = u64_stats_update_begin_irqsave(&stats64->syncp); stats64->rx_packets++; stats64->rx_bytes += skb->len; - u64_stats_update_end(&stats64->syncp); + u64_stats_update_end_irqrestore(&stats64->syncp, flags); netif_dbg(dev, rx_status, dev->net, "< rx, len %zu, type 0x%x\n", skb->len + sizeof (struct ethhdr), skb->protocol); @@ -1248,11 +1249,12 @@ static void tx_complete (struct urb *urb) if (urb->status == 0) { struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->stats64); + unsigned long flags; - u64_stats_update_begin(&stats64->syncp); + flags = u64_stats_update_begin_irqsave(&stats64->syncp); stats64->tx_packets += entry->packets; stats64->tx_bytes += entry->length; - u64_stats_update_end(&stats64->syncp); + u64_stats_update_end_irqrestore(&stats64->syncp, flags); } else { dev->net->stats.tx_errors++; diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index 5bdbd9f49395..07ee0f84a46c 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -90,6 +90,28 @@ static inline void u64_stats_update_end(struct u64_stats_sync *syncp) #endif } +static inline unsigned long +u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) +{ + unsigned long flags = 0; + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + local_irq_save(flags); + write_seqcount_begin(&syncp->seq); +#endif + return flags; +} + +static inline void +u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, + unsigned long flags) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_end(&syncp->seq); + local_irq_restore(flags); +#endif +} + static inline void u64_stats_update_begin_raw(struct u64_stats_sync *syncp) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) -- cgit v1.2.3 From 3a4030761ea88ff439030ca98e3094b9900e96b7 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 9 Mar 2018 14:50:34 +0800 Subject: vhost_net: examine pointer types during un-producing After commit fc72d1d54dd9 ("tuntap: XDP transmission"), we can actually queueing XDP pointers in the pointer ring, so we should examine the pointer type before freeing the pointer. Fixes: fc72d1d54dd9 ("tuntap: XDP transmission") Reported-by: Michael S. Tsirkin Acked-by: Michael S. Tsirkin Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 3 ++- drivers/vhost/net.c | 2 +- include/linux/if_tun.h | 4 ++++ 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 7433bb2e4451..28cfa642e39a 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -655,7 +655,7 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile) return tun; } -static void tun_ptr_free(void *ptr) +void tun_ptr_free(void *ptr) { if (!ptr) return; @@ -667,6 +667,7 @@ static void tun_ptr_free(void *ptr) __skb_array_destroy_skb(ptr); } } +EXPORT_SYMBOL_GPL(tun_ptr_free); static void tun_queue_purge(struct tun_file *tfile) { diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index efb93063fda1..8139bc70ad7d 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -170,7 +170,7 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) { ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head, vhost_net_buf_get_size(rxq), - __skb_array_destroy_skb); + tun_ptr_free); rxq->head = rxq->tail = 0; } } diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index c5b0a75a7812..fd00170b494f 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -25,6 +25,7 @@ struct ptr_ring *tun_get_tx_ring(struct file *file); bool tun_is_xdp_buff(void *ptr); void *tun_xdp_to_ptr(void *ptr); void *tun_ptr_to_xdp(void *ptr); +void tun_ptr_free(void *ptr); #else #include #include @@ -50,5 +51,8 @@ static inline void *tun_ptr_to_xdp(void *ptr) { return NULL; } +static inline void tun_ptr_free(void *ptr) +{ +} #endif /* CONFIG_TUN */ #endif /* __IF_TUN_H */ -- cgit v1.2.3 From b1d0a5d0cba4597c0394997b2d5fced3e3841b4e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 10 Mar 2018 01:15:45 +0100 Subject: netfilter: x_tables: add and use xt_check_proc_name recent and hashlimit both create /proc files, but only check that name is 0 terminated. This can trigger WARN() from procfs when name is "" or "/". Add helper for this and then use it for both. Cc: Eric Dumazet Reported-by: Eric Dumazet Reported-by: Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 ++ net/netfilter/x_tables.c | 30 ++++++++++++++++++++++++++++++ net/netfilter/xt_hashlimit.c | 16 ++++++++++------ net/netfilter/xt_recent.c | 6 +++--- 4 files changed, 45 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 1313b35c3ab7..14529511c4b8 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -285,6 +285,8 @@ unsigned int *xt_alloc_entry_offsets(unsigned int size); bool xt_find_jump_offset(const unsigned int *offsets, unsigned int target, unsigned int size); +int xt_check_proc_name(const char *name, unsigned int size); + int xt_check_match(struct xt_mtchk_param *, unsigned int size, u_int8_t proto, bool inv_proto); int xt_check_target(struct xt_tgchk_param *, unsigned int size, u_int8_t proto, diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index fa1655aff8d3..4aa01c90e9d1 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -423,6 +423,36 @@ textify_hooks(char *buf, size_t size, unsigned int mask, uint8_t nfproto) return buf; } +/** + * xt_check_proc_name - check that name is suitable for /proc file creation + * + * @name: file name candidate + * @size: length of buffer + * + * some x_tables modules wish to create a file in /proc. + * This function makes sure that the name is suitable for this + * purpose, it checks that name is NUL terminated and isn't a 'special' + * name, like "..". + * + * returns negative number on error or 0 if name is useable. + */ +int xt_check_proc_name(const char *name, unsigned int size) +{ + if (name[0] == '\0') + return -EINVAL; + + if (strnlen(name, size) == size) + return -ENAMETOOLONG; + + if (strcmp(name, ".") == 0 || + strcmp(name, "..") == 0 || + strchr(name, '/')) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(xt_check_proc_name); + int xt_check_match(struct xt_mtchk_param *par, unsigned int size, u_int8_t proto, bool inv_proto) { diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 66f5aca62a08..3360f13dc208 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -917,8 +917,9 @@ static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) struct hashlimit_cfg3 cfg = {}; int ret; - if (info->name[sizeof(info->name) - 1] != '\0') - return -EINVAL; + ret = xt_check_proc_name(info->name, sizeof(info->name)); + if (ret) + return ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 1); @@ -935,8 +936,9 @@ static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par) struct hashlimit_cfg3 cfg = {}; int ret; - if (info->name[sizeof(info->name) - 1] != '\0') - return -EINVAL; + ret = xt_check_proc_name(info->name, sizeof(info->name)); + if (ret) + return ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 2); @@ -950,9 +952,11 @@ static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par) static int hashlimit_mt_check(const struct xt_mtchk_param *par) { struct xt_hashlimit_mtinfo3 *info = par->matchinfo; + int ret; - if (info->name[sizeof(info->name) - 1] != '\0') - return -EINVAL; + ret = xt_check_proc_name(info->name, sizeof(info->name)); + if (ret) + return ret; return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg, info->name, 3); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 6d232d18faff..81ee1d6543b2 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -361,9 +361,9 @@ static int recent_mt_check(const struct xt_mtchk_param *par, info->hit_count, XT_RECENT_MAX_NSTAMPS - 1); return -EINVAL; } - if (info->name[0] == '\0' || - strnlen(info->name, XT_RECENT_NAME_LEN) == XT_RECENT_NAME_LEN) - return -EINVAL; + ret = xt_check_proc_name(info->name, sizeof(info->name)); + if (ret) + return ret; if (ip_pkt_list_tot && info->hit_count < ip_pkt_list_tot) nstamp_mask = roundup_pow_of_two(ip_pkt_list_tot) - 1; -- cgit v1.2.3 From a2c054a896b8ac794ddcfc7c92e2dc7ec4ed4ed5 Mon Sep 17 00:00:00 2001 From: Brad Mouring Date: Thu, 8 Mar 2018 16:23:03 -0600 Subject: net: phy: Tell caller result of phy_change() In 664fcf123a30e (net: phy: Threaded interrupts allow some simplification) the phy_interrupt system was changed to use a traditional threaded interrupt scheme instead of a workqueue approach. With this change, the phy status check moved into phy_change, which did not report back to the caller whether or not the interrupt was handled. This means that, in the case of a shared phy interrupt, only the first phydev's interrupt registers are checked (since phy_interrupt() would always return IRQ_HANDLED). This leads to interrupt storms when it is a secondary device that's actually the interrupt source. Signed-off-by: Brad Mouring Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 145 +++++++++++++++++++++++++------------------------- include/linux/phy.h | 1 - 2 files changed, 72 insertions(+), 74 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index a6f924fee584..9aabfa1a455a 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -617,6 +617,77 @@ static void phy_error(struct phy_device *phydev) phy_trigger_machine(phydev, false); } +/** + * phy_disable_interrupts - Disable the PHY interrupts from the PHY side + * @phydev: target phy_device struct + */ +static int phy_disable_interrupts(struct phy_device *phydev) +{ + int err; + + /* Disable PHY interrupts */ + err = phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED); + if (err) + goto phy_err; + + /* Clear the interrupt */ + err = phy_clear_interrupt(phydev); + if (err) + goto phy_err; + + return 0; + +phy_err: + phy_error(phydev); + + return err; +} + +/** + * phy_change - Called by the phy_interrupt to handle PHY changes + * @phydev: phy_device struct that interrupted + */ +static irqreturn_t phy_change(struct phy_device *phydev) +{ + if (phy_interrupt_is_valid(phydev)) { + if (phydev->drv->did_interrupt && + !phydev->drv->did_interrupt(phydev)) + return IRQ_NONE; + + if (phydev->state == PHY_HALTED) + if (phy_disable_interrupts(phydev)) + goto phy_err; + } + + mutex_lock(&phydev->lock); + if ((PHY_RUNNING == phydev->state) || (PHY_NOLINK == phydev->state)) + phydev->state = PHY_CHANGELINK; + mutex_unlock(&phydev->lock); + + /* reschedule state queue work to run as soon as possible */ + phy_trigger_machine(phydev, true); + + if (phy_interrupt_is_valid(phydev) && phy_clear_interrupt(phydev)) + goto phy_err; + return IRQ_HANDLED; + +phy_err: + phy_error(phydev); + return IRQ_NONE; +} + +/** + * phy_change_work - Scheduled by the phy_mac_interrupt to handle PHY changes + * @work: work_struct that describes the work to be done + */ +void phy_change_work(struct work_struct *work) +{ + struct phy_device *phydev = + container_of(work, struct phy_device, phy_queue); + + phy_change(phydev); +} + /** * phy_interrupt - PHY interrupt handler * @irq: interrupt line @@ -632,9 +703,7 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat) if (PHY_HALTED == phydev->state) return IRQ_NONE; /* It can't be ours. */ - phy_change(phydev); - - return IRQ_HANDLED; + return phy_change(phydev); } /** @@ -651,32 +720,6 @@ static int phy_enable_interrupts(struct phy_device *phydev) return phy_config_interrupt(phydev, PHY_INTERRUPT_ENABLED); } -/** - * phy_disable_interrupts - Disable the PHY interrupts from the PHY side - * @phydev: target phy_device struct - */ -static int phy_disable_interrupts(struct phy_device *phydev) -{ - int err; - - /* Disable PHY interrupts */ - err = phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED); - if (err) - goto phy_err; - - /* Clear the interrupt */ - err = phy_clear_interrupt(phydev); - if (err) - goto phy_err; - - return 0; - -phy_err: - phy_error(phydev); - - return err; -} - /** * phy_start_interrupts - request and enable interrupts for a PHY device * @phydev: target phy_device struct @@ -719,50 +762,6 @@ int phy_stop_interrupts(struct phy_device *phydev) } EXPORT_SYMBOL(phy_stop_interrupts); -/** - * phy_change - Called by the phy_interrupt to handle PHY changes - * @phydev: phy_device struct that interrupted - */ -void phy_change(struct phy_device *phydev) -{ - if (phy_interrupt_is_valid(phydev)) { - if (phydev->drv->did_interrupt && - !phydev->drv->did_interrupt(phydev)) - return; - - if (phydev->state == PHY_HALTED) - if (phy_disable_interrupts(phydev)) - goto phy_err; - } - - mutex_lock(&phydev->lock); - if ((PHY_RUNNING == phydev->state) || (PHY_NOLINK == phydev->state)) - phydev->state = PHY_CHANGELINK; - mutex_unlock(&phydev->lock); - - /* reschedule state queue work to run as soon as possible */ - phy_trigger_machine(phydev, true); - - if (phy_interrupt_is_valid(phydev) && phy_clear_interrupt(phydev)) - goto phy_err; - return; - -phy_err: - phy_error(phydev); -} - -/** - * phy_change_work - Scheduled by the phy_mac_interrupt to handle PHY changes - * @work: work_struct that describes the work to be done - */ -void phy_change_work(struct work_struct *work) -{ - struct phy_device *phydev = - container_of(work, struct phy_device, phy_queue); - - phy_change(phydev); -} - /** * phy_stop - Bring down the PHY link, and stop checking the status * @phydev: target phy_device struct diff --git a/include/linux/phy.h b/include/linux/phy.h index d7069539f351..b260fb336b25 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1012,7 +1012,6 @@ int phy_driver_register(struct phy_driver *new_driver, struct module *owner); int phy_drivers_register(struct phy_driver *new_driver, int n, struct module *owner); void phy_state_machine(struct work_struct *work); -void phy_change(struct phy_device *phydev); void phy_change_work(struct work_struct *work); void phy_mac_interrupt(struct phy_device *phydev); void phy_start_machine(struct phy_device *phydev); -- cgit v1.2.3 From bf2ae2e4bf9360e07c0cdfa166bcdc0afd92f4ce Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 10 Mar 2018 18:57:50 +0800 Subject: sock_diag: request _diag module only when the family or proto has been registered Now when using 'ss' in iproute, kernel would try to load all _diag modules, which also causes corresponding family and proto modules to be loaded as well due to module dependencies. Like after running 'ss', sctp, dccp, af_packet (if it works as a module) would be loaded. For example: $ lsmod|grep sctp $ ss $ lsmod|grep sctp sctp_diag 16384 0 sctp 323584 5 sctp_diag inet_diag 24576 4 raw_diag,tcp_diag,sctp_diag,udp_diag libcrc32c 16384 3 nf_conntrack,nf_nat,sctp As these family and proto modules are loaded unintentionally, it could cause some problems, like: - Some debug tools use 'ss' to collect the socket info, which loads all those diag and family and protocol modules. It's noisy for identifying issues. - Users usually expect to drop sctp init packet silently when they have no sense of sctp protocol instead of sending abort back. - It wastes resources (especially with multiple netns), and SCTP module can't be unloaded once it's loaded. ... In short, it's really inappropriate to have these family and proto modules loaded unexpectedly when just doing debugging with inet_diag. This patch is to introduce sock_load_diag_module() where it loads the _diag module only when it's corresponding family or proto has been already registered. Note that we can't just load _diag module without the family or proto loaded, as some symbols used in _diag module are from the family or proto module. v1->v2: - move inet proto check to inet_diag to avoid a compiling err. v2->v3: - define sock_load_diag_module in sock.c and export one symbol only. - improve the changelog. Reported-by: Sabrina Dubroca Acked-by: Marcelo Ricardo Leitner Acked-by: Phil Sutter Acked-by: Sabrina Dubroca Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/net.h | 1 + include/net/sock.h | 1 + net/core/sock.c | 21 +++++++++++++++++++++ net/core/sock_diag.c | 12 ++++-------- net/ipv4/inet_diag.c | 3 +-- net/socket.c | 5 +++++ 6 files changed, 33 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index 91216b16feb7..2a0391eea05c 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -222,6 +222,7 @@ enum { int sock_wake_async(struct socket_wq *sk_wq, int how, int band); int sock_register(const struct net_proto_family *fam); void sock_unregister(int family); +bool sock_is_registered(int family); int __sock_create(struct net *net, int family, int type, int proto, struct socket **res, int kern); int sock_create(int family, int type, int proto, struct socket **res); diff --git a/include/net/sock.h b/include/net/sock.h index 169c92afcafa..ae23f3b389ca 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1137,6 +1137,7 @@ struct proto { int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); +int sock_load_diag_module(int family, int protocol); #ifdef SOCK_REFCNT_DEBUG static inline void sk_refcnt_debug_inc(struct sock *sk) diff --git a/net/core/sock.c b/net/core/sock.c index c501499a04fe..85b0b64e7f9d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3261,6 +3261,27 @@ void proto_unregister(struct proto *prot) } EXPORT_SYMBOL(proto_unregister); +int sock_load_diag_module(int family, int protocol) +{ + if (!protocol) { + if (!sock_is_registered(family)) + return -ENOENT; + + return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, family); + } + +#ifdef CONFIG_INET + if (family == AF_INET && + !rcu_access_pointer(inet_protos[protocol])) + return -ENOENT; +#endif + + return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, family, protocol); +} +EXPORT_SYMBOL(sock_load_diag_module); + #ifdef CONFIG_PROC_FS static void *proto_seq_start(struct seq_file *seq, loff_t *pos) __acquires(proto_list_mutex) diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 146b50e30659..c37b5be7c5e4 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -220,8 +220,7 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; if (sock_diag_handlers[req->sdiag_family] == NULL) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, req->sdiag_family); + sock_load_diag_module(req->sdiag_family, 0); mutex_lock(&sock_diag_table_mutex); hndl = sock_diag_handlers[req->sdiag_family]; @@ -247,8 +246,7 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, case TCPDIAG_GETSOCK: case DCCPDIAG_GETSOCK: if (inet_rcv_compat == NULL) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET); + sock_load_diag_module(AF_INET, 0); mutex_lock(&sock_diag_table_mutex); if (inet_rcv_compat != NULL) @@ -281,14 +279,12 @@ static int sock_diag_bind(struct net *net, int group) case SKNLGRP_INET_TCP_DESTROY: case SKNLGRP_INET_UDP_DESTROY: if (!sock_diag_handlers[AF_INET]) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET); + sock_load_diag_module(AF_INET, 0); break; case SKNLGRP_INET6_TCP_DESTROY: case SKNLGRP_INET6_UDP_DESTROY: if (!sock_diag_handlers[AF_INET6]) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET6); + sock_load_diag_module(AF_INET6, 0); break; } return 0; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index a383f299ce24..4e5bc4b2f14e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -53,8 +53,7 @@ static DEFINE_MUTEX(inet_diag_table_mutex); static const struct inet_diag_handler *inet_diag_lock_handler(int proto) { if (!inet_diag_table[proto]) - request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET, proto); + sock_load_diag_module(AF_INET, proto); mutex_lock(&inet_diag_table_mutex); if (!inet_diag_table[proto]) diff --git a/net/socket.c b/net/socket.c index a93c99b518ca..08847c3b8c39 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2587,6 +2587,11 @@ void sock_unregister(int family) } EXPORT_SYMBOL(sock_unregister); +bool sock_is_registered(int family) +{ + return family < NPROTO && rcu_access_pointer(net_families[family]); +} + static int __init sock_init(void) { int err; -- cgit v1.2.3 From c2b37f76485f073f020e60b5954b6dc4e55f693c Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Thu, 8 Mar 2018 15:51:41 +0200 Subject: IB/mlx5: Fix integer overflows in mlx5_ib_create_srq This patch validates user provided input to prevent integer overflow due to integer manipulation in the mlx5_ib_create_srq function. Cc: syzkaller Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Boris Pismenny Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/srq.c | 15 +++++++++------ include/linux/mlx5/driver.h | 4 ++-- 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 6d5fadad9090..3c7522d025f2 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -241,8 +241,8 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_srq *srq; - int desc_size; - int buf_size; + size_t desc_size; + size_t buf_size; int err; struct mlx5_srq_attr in = {0}; __u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); @@ -266,15 +266,18 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, desc_size = sizeof(struct mlx5_wqe_srq_next_seg) + srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg); + if (desc_size == 0 || srq->msrq.max_gs > desc_size) + return ERR_PTR(-EINVAL); desc_size = roundup_pow_of_two(desc_size); - desc_size = max_t(int, 32, desc_size); + desc_size = max_t(size_t, 32, desc_size); + if (desc_size < sizeof(struct mlx5_wqe_srq_next_seg)) + return ERR_PTR(-EINVAL); srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) / sizeof(struct mlx5_wqe_data_seg); srq->msrq.wqe_shift = ilog2(desc_size); buf_size = srq->msrq.max * desc_size; - mlx5_ib_dbg(dev, "desc_size 0x%x, req wr 0x%x, srq size 0x%x, max_gs 0x%x, max_avail_gather 0x%x\n", - desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs, - srq->msrq.max_avail_gather); + if (buf_size < desc_size) + return ERR_PTR(-EINVAL); in.type = init_attr->srq_type; if (pd->uobject) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6ed79a8a8318..9d3a03364e6e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -453,8 +453,8 @@ struct mlx5_core_srq { struct mlx5_core_rsc_common common; /* must be first */ u32 srqn; int max; - int max_gs; - int max_avail_gather; + size_t max_gs; + size_t max_avail_gather; int wqe_shift; void (*event) (struct mlx5_core_srq *, enum mlx5_event); -- cgit v1.2.3 From 6417250d3f894e66a68ba1cd93676143f2376a6f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 6 Mar 2018 19:34:42 -0800 Subject: workqueue: remove unused cancel_work() Found this by accident. There are no usages of bare cancel_work() in current kernel source. Signed-off-by: Stephen Hemminger Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 1 - kernel/workqueue.c | 8 -------- 2 files changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index bc0cda180c8b..0c3301421c57 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -456,7 +456,6 @@ extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); extern bool flush_work(struct work_struct *work); -extern bool cancel_work(struct work_struct *work); extern bool cancel_work_sync(struct work_struct *work); extern bool flush_delayed_work(struct delayed_work *dwork); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ccd1080dd6e7..6ec6ba65127b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3018,14 +3018,6 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork) return ret; } -/* - * See cancel_delayed_work() - */ -bool cancel_work(struct work_struct *work) -{ - return __cancel_work(work, false); -} - /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel -- cgit v1.2.3 From 4dcb31d4649df36297296b819437709f5407059c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Mar 2018 09:04:16 -0700 Subject: net: use skb_to_full_sk() in skb_update_prio() Andrei Vagin reported a KASAN: slab-out-of-bounds error in skb_update_prio() Since SYNACK might be attached to a request socket, we need to get back to the listener socket. Since this listener is manipulated without locks, add const qualifiers to sock_cgroup_prioidx() so that the const can also be used in skb_update_prio() Also add the const qualifier to sock_cgroup_classid() for consistency. Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener") Signed-off-by: Eric Dumazet Reported-by: Andrei Vagin Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 4 ++-- net/core/dev.c | 22 +++++++++++++++------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 9f242b876fde..f8e76d01a5ad 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -755,13 +755,13 @@ struct sock_cgroup_data { * updaters and return part of the previous pointer as the prioidx or * classid. Such races are short-lived and the result isn't critical. */ -static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd) +static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd) { /* fallback to 1 which is always the ID of the root cgroup */ return (skcd->is_data & 1) ? skcd->prioidx : 1; } -static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) +static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd) { /* fallback to 0 which is the unconfigured default classid */ return (skcd->is_data & 1) ? skcd->classid : 0; diff --git a/net/core/dev.c b/net/core/dev.c index 2cedf520cb28..12be20535714 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3278,15 +3278,23 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) static void skb_update_prio(struct sk_buff *skb) { - struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); + const struct netprio_map *map; + const struct sock *sk; + unsigned int prioidx; - if (!skb->priority && skb->sk && map) { - unsigned int prioidx = - sock_cgroup_prioidx(&skb->sk->sk_cgrp_data); + if (skb->priority) + return; + map = rcu_dereference_bh(skb->dev->priomap); + if (!map) + return; + sk = skb_to_full_sk(skb); + if (!sk) + return; - if (prioidx < map->priomap_len) - skb->priority = map->priomap[prioidx]; - } + prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); + + if (prioidx < map->priomap_len) + skb->priority = map->priomap[prioidx]; } #else #define skb_update_prio(skb) -- cgit v1.2.3 From d52e5a7e7ca49457dd31fc8b42fb7c0d58a31221 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 14 Mar 2018 10:21:14 +0100 Subject: ipv4: lock mtu in fnhe when received PMTU < net.ipv4.route.min_pmtu Prior to the rework of PMTU information storage in commit 2c8cec5c10bc ("ipv4: Cache learned PMTU information in inetpeer."), when a PMTU event advertising a PMTU smaller than net.ipv4.route.min_pmtu was received, we would disable setting the DF flag on packets by locking the MTU metric, and set the PMTU to net.ipv4.route.min_pmtu. Since then, we don't disable DF, and set PMTU to net.ipv4.route.min_pmtu, so the intermediate router that has this link with a small MTU will have to drop the packets. This patch reestablishes pre-2.6.39 behavior by splitting rtable->rt_pmtu into a bitfield with rt_mtu_locked and rt_pmtu. rt_mtu_locked indicates that we shouldn't set the DF bit on that path, and is checked in ip_dont_fragment(). One possible workaround is to set net.ipv4.route.min_pmtu to a value low enough to accommodate the lowest MTU encountered. Fixes: 2c8cec5c10bc ("ipv4: Cache learned PMTU information in inetpeer.") Signed-off-by: Sabrina Dubroca Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- include/net/ip.h | 11 +++++++++-- include/net/ip_fib.h | 1 + include/net/route.h | 3 ++- net/ipv4/route.c | 26 +++++++++++++++++++------- net/ipv4/xfrm4_policy.c | 1 + 5 files changed, 32 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 746abff9ce51..f49b3a576bec 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -328,6 +328,13 @@ int ip_decrease_ttl(struct iphdr *iph) return --iph->ttl; } +static inline int ip_mtu_locked(const struct dst_entry *dst) +{ + const struct rtable *rt = (const struct rtable *)dst; + + return rt->rt_mtu_locked || dst_metric_locked(dst, RTAX_MTU); +} + static inline int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst) { @@ -335,7 +342,7 @@ int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst) return pmtudisc == IP_PMTUDISC_DO || (pmtudisc == IP_PMTUDISC_WANT && - !(dst_metric_locked(dst, RTAX_MTU))); + !ip_mtu_locked(dst)); } static inline bool ip_sk_accept_pmtu(const struct sock *sk) @@ -361,7 +368,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, struct net *net = dev_net(dst->dev); if (net->ipv4.sysctl_ip_fwd_use_pmtu || - dst_metric_locked(dst, RTAX_MTU) || + ip_mtu_locked(dst) || !forwarding) return dst_mtu(dst); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index f80524396c06..77d0a78cf7d2 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -59,6 +59,7 @@ struct fib_nh_exception { int fnhe_genid; __be32 fnhe_daddr; u32 fnhe_pmtu; + bool fnhe_mtu_locked; __be32 fnhe_gw; unsigned long fnhe_expires; struct rtable __rcu *fnhe_rth_input; diff --git a/include/net/route.h b/include/net/route.h index 40b870d58f38..20a92ca9e115 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -63,7 +63,8 @@ struct rtable { __be32 rt_gateway; /* Miscellaneous cached information */ - u32 rt_pmtu; + u32 rt_mtu_locked:1, + rt_pmtu:31; u32 rt_table_id; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f9dbb8cb66bf..299e247b2032 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -634,6 +634,7 @@ static inline u32 fnhe_hashfun(__be32 daddr) static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) { rt->rt_pmtu = fnhe->fnhe_pmtu; + rt->rt_mtu_locked = fnhe->fnhe_mtu_locked; rt->dst.expires = fnhe->fnhe_expires; if (fnhe->fnhe_gw) { @@ -644,7 +645,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh } static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, - u32 pmtu, unsigned long expires) + u32 pmtu, bool lock, unsigned long expires) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe; @@ -681,8 +682,10 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, fnhe->fnhe_genid = genid; if (gw) fnhe->fnhe_gw = gw; - if (pmtu) + if (pmtu) { fnhe->fnhe_pmtu = pmtu; + fnhe->fnhe_mtu_locked = lock; + } fnhe->fnhe_expires = max(1UL, expires); /* Update all cached dsts too */ rt = rcu_dereference(fnhe->fnhe_rth_input); @@ -706,6 +709,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; fnhe->fnhe_pmtu = pmtu; + fnhe->fnhe_mtu_locked = lock; fnhe->fnhe_expires = expires; /* Exception created; mark the cached routes for the nexthop @@ -787,7 +791,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, new_gw, - 0, jiffies + ip_rt_gc_timeout); + 0, false, + jiffies + ip_rt_gc_timeout); } if (kill_route) rt->dst.obsolete = DST_OBSOLETE_KILL; @@ -1009,15 +1014,18 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; struct fib_result res; + bool lock = false; - if (dst_metric_locked(dst, RTAX_MTU)) + if (ip_mtu_locked(dst)) return; if (ipv4_mtu(dst) < mtu) return; - if (mtu < ip_rt_min_pmtu) + if (mtu < ip_rt_min_pmtu) { + lock = true; mtu = ip_rt_min_pmtu; + } if (rt->rt_pmtu == mtu && time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2)) @@ -1027,7 +1035,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); - update_or_create_fnhe(nh, fl4->daddr, 0, mtu, + update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock, jiffies + ip_rt_mtu_expires); } rcu_read_unlock(); @@ -1280,7 +1288,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = READ_ONCE(dst->dev->mtu); - if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { + if (unlikely(ip_mtu_locked(dst))) { if (rt->rt_uses_gateway && mtu > 576) mtu = 576; } @@ -1521,6 +1529,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt->rt_is_input = 0; rt->rt_iif = 0; rt->rt_pmtu = 0; + rt->rt_mtu_locked = 0; rt->rt_gateway = 0; rt->rt_uses_gateway = 0; rt->rt_table_id = 0; @@ -2546,6 +2555,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_is_input = ort->rt_is_input; rt->rt_iif = ort->rt_iif; rt->rt_pmtu = ort->rt_pmtu; + rt->rt_mtu_locked = ort->rt_mtu_locked; rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; @@ -2648,6 +2658,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); if (rt->rt_pmtu && expires) metrics[RTAX_MTU - 1] = rt->rt_pmtu; + if (rt->rt_mtu_locked && expires) + metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU); if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 8d33f7b311f4..fbebda67ac1b 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -100,6 +100,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_gateway = rt->rt_gateway; xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_pmtu = rt->rt_pmtu; + xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked; xdst->u.rt.rt_table_id = rt->rt_table_id; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); rt_add_uncached_list(&xdst->u.rt); -- cgit v1.2.3 From 413aa807ae39fed7e387c175d2d0ae9fcf6c0c9d Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 5 Mar 2018 11:36:38 +0100 Subject: KVM: arm/arm64: Reset mapped IRQs on VM reset We currently don't allow resetting mapped IRQs from userspace, because their state is controlled by the hardware. But we do need to reset the state when the VM is reset, so we provide a function for the 'owner' of the mapped interrupt to reset the interrupt state. Currently only the timer uses mapped interrupts, so we call this function from the timer reset logic. Cc: stable@vger.kernel.org Fixes: 4c60e360d6df ("KVM: arm/arm64: Provide a get_input_level for the arch timer") Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/kvm/arm_vgic.h | 1 + virt/kvm/arm/arch_timer.c | 4 ++++ virt/kvm/arm/vgic/vgic.c | 26 ++++++++++++++++++++++++++ 3 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index cdbd142ca7f2..02924ae2527e 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -360,6 +360,7 @@ void kvm_vgic_put(struct kvm_vcpu *vcpu); bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu); void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); +void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid); void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 70f4c30918eb..3945021510a9 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -581,6 +581,7 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) { + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); @@ -594,6 +595,9 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) ptimer->cnt_ctl = 0; kvm_timer_update_state(vcpu); + if (timer->enabled && irqchip_in_kernel(vcpu->kvm)) + kvm_vgic_reset_mapped_irq(vcpu, vtimer->irq.irq); + return 0; } diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index c7c5ef190afa..0001858a2c23 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -495,6 +495,32 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, return ret; } +/** + * kvm_vgic_reset_mapped_irq - Reset a mapped IRQ + * @vcpu: The VCPU pointer + * @vintid: The INTID of the interrupt + * + * Reset the active and pending states of a mapped interrupt. Kernel + * subsystems injecting mapped interrupts should reset their interrupt lines + * when we are doing a reset of the VM. + */ +void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid) +{ + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + unsigned long flags; + + if (!irq->hw) + goto out; + + spin_lock_irqsave(&irq->irq_lock, flags); + irq->active = false; + irq->pending_latch = false; + irq->line_level = false; + spin_unlock_irqrestore(&irq->irq_lock, flags); +out: + vgic_put_irq(vcpu->kvm, irq); +} + int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid) { struct vgic_irq *irq; -- cgit v1.2.3 From 16ca6a607d84bef0129698d8d808f501afd08d43 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Mar 2018 21:48:01 +0000 Subject: KVM: arm/arm64: vgic: Don't populate multiple LRs with the same vintid The vgic code is trying to be clever when injecting GICv2 SGIs, and will happily populate LRs with the same interrupt number if they come from multiple vcpus (after all, they are distinct interrupt sources). Unfortunately, this is against the letter of the architecture, and the GICv2 architecture spec says "Each valid interrupt stored in the List registers must have a unique VirtualID for that virtual CPU interface.". GICv3 has similar (although slightly ambiguous) restrictions. This results in guests locking up when using GICv2-on-GICv3, for example. The obvious fix is to stop trying so hard, and inject a single vcpu per SGI per guest entry. After all, pending SGIs with multiple source vcpus are pretty rare, and are mostly seen in scenario where the physical CPUs are severely overcomitted. But as we now only inject a single instance of a multi-source SGI per vcpu entry, we may delay those interrupts for longer than strictly necessary, and run the risk of injecting lower priority interrupts in the meantime. In order to address this, we adopt a three stage strategy: - If we encounter a multi-source SGI in the AP list while computing its depth, we force the list to be sorted - When populating the LRs, we prevent the injection of any interrupt of lower priority than that of the first multi-source SGI we've injected. - Finally, the injection of a multi-source SGI triggers the request of a maintenance interrupt when there will be no pending interrupt in the LRs (HCR_NPIE). At the point where the last pending interrupt in the LRs switches from Pending to Active, the maintenance interrupt will be delivered, allowing us to add the remaining SGIs using the same process. Cc: stable@vger.kernel.org Fixes: 0919e84c0fc1 ("KVM: arm/arm64: vgic-new: Add IRQ sync/flush framework") Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 1 + include/linux/irqchip/arm-gic.h | 1 + virt/kvm/arm/vgic/vgic-v2.c | 9 +++++- virt/kvm/arm/vgic/vgic-v3.c | 9 +++++- virt/kvm/arm/vgic/vgic.c | 61 +++++++++++++++++++++++++++++--------- virt/kvm/arm/vgic/vgic.h | 2 ++ 6 files changed, 67 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index c00c4c33e432..b26eccc78fb1 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -503,6 +503,7 @@ #define ICH_HCR_EN (1 << 0) #define ICH_HCR_UIE (1 << 1) +#define ICH_HCR_NPIE (1 << 3) #define ICH_HCR_TC (1 << 10) #define ICH_HCR_TALL0 (1 << 11) #define ICH_HCR_TALL1 (1 << 12) diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index d3453ee072fc..68d8b1f73682 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -84,6 +84,7 @@ #define GICH_HCR_EN (1 << 0) #define GICH_HCR_UIE (1 << 1) +#define GICH_HCR_NPIE (1 << 3) #define GICH_LR_VIRTUALID (0x3ff << 0) #define GICH_LR_PHYSID_CPUID_SHIFT (10) diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index e9d840a75e7b..29556f71b691 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -37,6 +37,13 @@ void vgic_v2_init_lrs(void) vgic_v2_write_lr(i, 0); } +void vgic_v2_set_npie(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; + + cpuif->vgic_hcr |= GICH_HCR_NPIE; +} + void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) { struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; @@ -64,7 +71,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) int lr; unsigned long flags; - cpuif->vgic_hcr &= ~GICH_HCR_UIE; + cpuif->vgic_hcr &= ~(GICH_HCR_UIE | GICH_HCR_NPIE); for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { u32 val = cpuif->vgic_lr[lr]; diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 6b329414e57a..0ff2006f3781 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -26,6 +26,13 @@ static bool group1_trap; static bool common_trap; static bool gicv4_enable; +void vgic_v3_set_npie(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; + + cpuif->vgic_hcr |= ICH_HCR_NPIE; +} + void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; @@ -47,7 +54,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) int lr; unsigned long flags; - cpuif->vgic_hcr &= ~ICH_HCR_UIE; + cpuif->vgic_hcr &= ~(ICH_HCR_UIE | ICH_HCR_NPIE); for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { u64 val = cpuif->vgic_lr[lr]; diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 0001858a2c23..8201899126f6 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -710,22 +710,37 @@ static inline void vgic_set_underflow(struct kvm_vcpu *vcpu) vgic_v3_set_underflow(vcpu); } +static inline void vgic_set_npie(struct kvm_vcpu *vcpu) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_set_npie(vcpu); + else + vgic_v3_set_npie(vcpu); +} + /* Requires the ap_list_lock to be held. */ -static int compute_ap_list_depth(struct kvm_vcpu *vcpu) +static int compute_ap_list_depth(struct kvm_vcpu *vcpu, + bool *multi_sgi) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq; int count = 0; + *multi_sgi = false; + DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { spin_lock(&irq->irq_lock); /* GICv2 SGIs can count for more than one... */ - if (vgic_irq_is_sgi(irq->intid) && irq->source) - count += hweight8(irq->source); - else + if (vgic_irq_is_sgi(irq->intid) && irq->source) { + int w = hweight8(irq->source); + + count += w; + *multi_sgi |= (w > 1); + } else { count++; + } spin_unlock(&irq->irq_lock); } return count; @@ -736,28 +751,43 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq; - int count = 0; + int count; + bool npie = false; + bool multi_sgi; + u8 prio = 0xff; DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); - if (compute_ap_list_depth(vcpu) > kvm_vgic_global_state.nr_lr) + count = compute_ap_list_depth(vcpu, &multi_sgi); + if (count > kvm_vgic_global_state.nr_lr || multi_sgi) vgic_sort_ap_list(vcpu); + count = 0; + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { spin_lock(&irq->irq_lock); - if (unlikely(vgic_target_oracle(irq) != vcpu)) - goto next; - /* - * If we get an SGI with multiple sources, try to get - * them in all at once. + * If we have multi-SGIs in the pipeline, we need to + * guarantee that they are all seen before any IRQ of + * lower priority. In that case, we need to filter out + * these interrupts by exiting early. This is easy as + * the AP list has been sorted already. */ - do { + if (multi_sgi && irq->priority > prio) { + spin_unlock(&irq->irq_lock); + break; + } + + if (likely(vgic_target_oracle(irq) == vcpu)) { vgic_populate_lr(vcpu, irq, count++); - } while (irq->source && count < kvm_vgic_global_state.nr_lr); -next: + if (irq->source) { + npie = true; + prio = irq->priority; + } + } + spin_unlock(&irq->irq_lock); if (count == kvm_vgic_global_state.nr_lr) { @@ -768,6 +798,9 @@ next: } } + if (npie) + vgic_set_npie(vcpu); + vcpu->arch.vgic_cpu.used_lrs = count; /* Nuke remaining LRs */ diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 5b11859a1a1e..f5b8519e5546 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -160,6 +160,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); void vgic_v2_set_underflow(struct kvm_vcpu *vcpu); +void vgic_v2_set_npie(struct kvm_vcpu *vcpu); int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); @@ -189,6 +190,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); void vgic_v3_set_underflow(struct kvm_vcpu *vcpu); +void vgic_v3_set_npie(struct kvm_vcpu *vcpu); void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_enable(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 2f31115e940c4afd49b99c33123534e2ac924ffb Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 13 Mar 2018 17:42:41 +0800 Subject: scsi: core: introduce force_blk_mq This patch introduces 'force_blk_mq' to the scsi_host_template so that drivers that have no desire to support the legacy I/O path can signal blk-mq only support. [mkp: commit desc] Cc: Omar Sandoval , Cc: "Martin K. Petersen" , Cc: James Bottomley , Cc: Christoph Hellwig , Cc: Don Brace Cc: Kashyap Desai Cc: Mike Snitzer Cc: Laurence Oberman Signed-off-by: Ming Lei Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/hosts.c | 1 + include/scsi/scsi_host.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index dd9464920456..ef22b275d050 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -474,6 +474,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) shost->dma_boundary = 0xffffffff; shost->use_blk_mq = scsi_use_blk_mq; + shost->use_blk_mq = scsi_use_blk_mq || shost->hostt->force_blk_mq; device_initialize(&shost->shost_gendev); dev_set_name(&shost->shost_gendev, "host%d", shost->host_no); diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index a8b7bf879ced..9c1e4bad6581 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -452,6 +452,9 @@ struct scsi_host_template { /* True if the controller does not support WRITE SAME */ unsigned no_write_same:1; + /* True if the low-level driver supports blk-mq only */ + unsigned force_blk_mq:1; + /* * Countdown for host blocking with no commands outstanding. */ -- cgit v1.2.3 From c658dc58c7eaa8569ceb0edd1ddbdfda84fe8aa5 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 15 Mar 2018 11:22:28 +0200 Subject: mmc: core: Fix tracepoint print of blk_addr and blksz Swap the positions of blk_addr and blksz in the tracepoint print arguments so that they match the print format. Signed-off-by: Adrian Hunter Fixes: d2f82254e4e8 ("mmc: core: Add members to mmc_request and mmc_data for CQE's") Cc: # 4.14+ Signed-off-by: Ulf Hansson --- include/trace/events/mmc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/mmc.h b/include/trace/events/mmc.h index 200f731be557..7b706ff21335 100644 --- a/include/trace/events/mmc.h +++ b/include/trace/events/mmc.h @@ -86,8 +86,8 @@ TRACE_EVENT(mmc_request_start, __entry->stop_flags, __entry->stop_retries, __entry->sbc_opcode, __entry->sbc_arg, __entry->sbc_flags, __entry->sbc_retries, - __entry->blocks, __entry->blk_addr, - __entry->blksz, __entry->data_flags, __entry->tag, + __entry->blocks, __entry->blksz, + __entry->blk_addr, __entry->data_flags, __entry->tag, __entry->can_retune, __entry->doing_retune, __entry->retune_now, __entry->need_retune, __entry->hold_retune, __entry->retune_period) -- cgit v1.2.3 From 95dd77580ccd66a0da96e6d4696945b8cea39431 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 14 Mar 2018 18:20:29 -0500 Subject: fs: Teach path_connected to handle nfs filesystems with multiple roots. On nfsv2 and nfsv3 the nfs server can export subsets of the same filesystem and report the same filesystem identifier, so that the nfs client can know they are the same filesystem. The subsets can be from disjoint directory trees. The nfsv2 and nfsv3 filesystems provides no way to find the common root of all directory trees exported form the server with the same filesystem identifier. The practical result is that in struct super s_root for nfs s_root is not necessarily the root of the filesystem. The nfs mount code sets s_root to the root of the first subset of the nfs filesystem that the kernel mounts. This effects the dcache invalidation code in generic_shutdown_super currently called shrunk_dcache_for_umount and that code for years has gone through an additional list of dentries that might be dentry trees that need to be freed to accomodate nfs. When I wrote path_connected I did not realize nfs was so special, and it's hueristic for avoiding calling is_subdir can fail. The practical case where this fails is when there is a move of a directory from the subtree exposed by one nfs mount to the subtree exposed by another nfs mount. This move can happen either locally or remotely. With the remote case requiring that the move directory be cached before the move and that after the move someone walks the path to where the move directory now exists and in so doing causes the already cached directory to be moved in the dcache through the magic of d_splice_alias. If someone whose working directory is in the move directory or a subdirectory and now starts calling .. from the initial mount of nfs (where s_root == mnt_root), then path_connected as a heuristic will not bother with the is_subdir check. As s_root really is not the root of the nfs filesystem this heuristic is wrong, and the path may actually not be connected and path_connected can fail. The is_subdir function might be cheap enough that we can call it unconditionally. Verifying that will take some benchmarking and the result may not be the same on all kernels this fix needs to be backported to. So I am avoiding that for now. Filesystems with snapshots such as nilfs and btrfs do something similar. But as the directory tree of the snapshots are disjoint from one another and from the main directory tree rename won't move things between them and this problem will not occur. Cc: stable@vger.kernel.org Reported-by: Al Viro Fixes: 397d425dc26d ("vfs: Test for and handle paths that are unreachable from their mnt_root") Signed-off-by: "Eric W. Biederman" Signed-off-by: Al Viro --- fs/namei.c | 5 +++-- fs/nfs/super.c | 2 ++ include/linux/fs.h | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/namei.c b/fs/namei.c index 921ae32dbc80..cafa365eeb70 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -559,9 +559,10 @@ static int __nd_alloc_stack(struct nameidata *nd) static bool path_connected(const struct path *path) { struct vfsmount *mnt = path->mnt; + struct super_block *sb = mnt->mnt_sb; - /* Only bind mounts can have disconnected paths */ - if (mnt->mnt_root == mnt->mnt_sb->s_root) + /* Bind mounts and multi-root filesystems can have disconnected paths */ + if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root)) return true; return is_subdir(path->dentry, mnt->mnt_root); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 29bacdc56f6a..5e470e233c83 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2631,6 +2631,8 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, /* initial superblock/root creation */ mount_info->fill_super(s, mount_info); nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned); + if (!(server->flags & NFS_MOUNT_UNSHARED)) + s->s_iflags |= SB_I_MULTIROOT; } mntroot = nfs_get_root(s, mount_info->mntfh, dev_name); diff --git a/include/linux/fs.h b/include/linux/fs.h index 2a815560fda0..0430e03febaa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1317,6 +1317,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ #define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ +#define SB_I_MULTIROOT 0x00000008 /* Multiple roots to the dentry tree */ /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ -- cgit v1.2.3 From 4bbb3e0e8239f9079bf1fe20b3c0cb598714ae61 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Tue, 13 Mar 2018 14:51:27 +0900 Subject: net: Fix vlan untag for bridge and vlan_dev with reorder_hdr off When we have a bridge with vlan_filtering on and a vlan device on top of it, packets would be corrupted in skb_vlan_untag() called from br_dev_xmit(). The problem sits in skb_reorder_vlan_header() used in skb_vlan_untag(), which makes use of skb->mac_len. In this function mac_len is meant for handling rx path with vlan devices with reorder_header disabled, but in tx path mac_len is typically 0 and cannot be used, which is the problem in this case. The current code even does not properly handle rx path (skb_vlan_untag() called from __netif_receive_skb_core()) with reorder_header off actually. In rx path single tag case, it works as follows: - Before skb_reorder_vlan_header() mac_header data v v +-------------------+-------------+------+---- | ETH | VLAN | ETH | | ADDRS | TPID | TCI | TYPE | +-------------------+-------------+------+---- <-------- mac_len ---------> <-------------> to be removed - After skb_reorder_vlan_header() mac_header data v v +-------------------+------+---- | ETH | ETH | | ADDRS | TYPE | +-------------------+------+---- <-------- mac_len ---------> This is ok, but in rx double tag case, it corrupts packets: - Before skb_reorder_vlan_header() mac_header data v v +-------------------+-------------+-------------+------+---- | ETH | VLAN | VLAN | ETH | | ADDRS | TPID | TCI | TPID | TCI | TYPE | +-------------------+-------------+-------------+------+---- <--------------- mac_len ----------------> <-------------> should be removed <---------------------------> actually will be removed - After skb_reorder_vlan_header() mac_header data v v +-------------------+------+---- | ETH | ETH | | ADDRS | TYPE | +-------------------+------+---- <--------------- mac_len ----------------> So, two of vlan tags are both removed while only inner one should be removed and mac_header (and mac_len) is broken. skb_vlan_untag() is meant for removing the vlan header at (skb->data - 2), so use skb->data and skb->mac_header to calculate the right offset. Reported-by: Brandon Carpenter Fixes: a6e18ff11170 ("vlan: Fix untag operations of stacked vlans with REORDER_HEADER off") Signed-off-by: Toshiaki Makita Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 1 + net/core/skbuff.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 8bbbcb5cd94b..820de5d222d2 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -30,6 +30,7 @@ */ #define ETH_ALEN 6 /* Octets in one ethernet addr */ +#define ETH_TLEN 2 /* Octets in ethernet type field */ #define ETH_HLEN 14 /* Total octets in header. */ #define ETH_ZLEN 60 /* Min. octets in frame sans FCS */ #define ETH_DATA_LEN 1500 /* Max. octets in payload */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index baf990528943..b103f46ec512 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5020,13 +5020,16 @@ EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) { + int mac_len; + if (skb_cow(skb, skb_headroom(skb)) < 0) { kfree_skb(skb); return NULL; } - memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len - VLAN_HLEN, - 2 * ETH_ALEN); + mac_len = skb->data - skb_mac_header(skb); + memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), + mac_len - VLAN_HLEN - ETH_TLEN); skb->mac_header += VLAN_HLEN; return skb; } -- cgit v1.2.3 From cbe7128c4b92e2004984f477fd38dfa81662f02e Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Tue, 13 Mar 2018 14:51:28 +0900 Subject: vlan: Fix out of order vlan headers with reorder header off With reorder header off, received packets are untagged in skb_vlan_untag() called from within __netif_receive_skb_core(), and later the tag will be inserted back in vlan_do_receive(). This caused out of order vlan headers when we create a vlan device on top of another vlan device, because vlan_do_receive() inserts a tag as the outermost vlan tag. E.g. the outer tag is first removed in skb_vlan_untag() and inserted back in vlan_do_receive(), then the inner tag is next removed and inserted back as the outermost tag. This patch fixes the behaviour by inserting the inner tag at the right position. Signed-off-by: Toshiaki Makita Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 66 ++++++++++++++++++++++++++++++++++++++++--------- net/8021q/vlan_core.c | 4 +-- 2 files changed, 57 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 5e6a2d4dc366..c4a1cff9c768 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -300,30 +300,34 @@ static inline bool vlan_hw_offload_capable(netdev_features_t features, } /** - * __vlan_insert_tag - regular VLAN tag inserting + * __vlan_insert_inner_tag - inner VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert + * @mac_len: MAC header length including outer vlan headers * - * Inserts the VLAN tag into @skb as part of the payload + * Inserts the VLAN tag into @skb as part of the payload at offset mac_len * Returns error if skb_cow_head failes. * * Does not change skb->protocol so this function can be used during receive. */ -static inline int __vlan_insert_tag(struct sk_buff *skb, - __be16 vlan_proto, u16 vlan_tci) +static inline int __vlan_insert_inner_tag(struct sk_buff *skb, + __be16 vlan_proto, u16 vlan_tci, + unsigned int mac_len) { struct vlan_ethhdr *veth; if (skb_cow_head(skb, VLAN_HLEN) < 0) return -ENOMEM; - veth = skb_push(skb, VLAN_HLEN); + skb_push(skb, VLAN_HLEN); - /* Move the mac addresses to the beginning of the new header. */ - memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN); + /* Move the mac header sans proto to the beginning of the new header. */ + memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN); skb->mac_header -= VLAN_HLEN; + veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN); + /* first, the ethernet type */ veth->h_vlan_proto = vlan_proto; @@ -334,12 +338,30 @@ static inline int __vlan_insert_tag(struct sk_buff *skb, } /** - * vlan_insert_tag - regular VLAN tag inserting + * __vlan_insert_tag - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload + * Returns error if skb_cow_head failes. + * + * Does not change skb->protocol so this function can be used during receive. + */ +static inline int __vlan_insert_tag(struct sk_buff *skb, + __be16 vlan_proto, u16 vlan_tci) +{ + return __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN); +} + +/** + * vlan_insert_inner_tag - inner VLAN tag inserting + * @skb: skbuff to tag + * @vlan_proto: VLAN encapsulation protocol + * @vlan_tci: VLAN TCI to insert + * @mac_len: MAC header length including outer vlan headers + * + * Inserts the VLAN tag into @skb as part of the payload at offset mac_len * Returns a VLAN tagged skb. If a new skb is created, @skb is freed. * * Following the skb_unshare() example, in case of error, the calling function @@ -347,12 +369,14 @@ static inline int __vlan_insert_tag(struct sk_buff *skb, * * Does not change skb->protocol so this function can be used during receive. */ -static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, - __be16 vlan_proto, u16 vlan_tci) +static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb, + __be16 vlan_proto, + u16 vlan_tci, + unsigned int mac_len) { int err; - err = __vlan_insert_tag(skb, vlan_proto, vlan_tci); + err = __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, mac_len); if (err) { dev_kfree_skb_any(skb); return NULL; @@ -360,6 +384,26 @@ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, return skb; } +/** + * vlan_insert_tag - regular VLAN tag inserting + * @skb: skbuff to tag + * @vlan_proto: VLAN encapsulation protocol + * @vlan_tci: VLAN TCI to insert + * + * Inserts the VLAN tag into @skb as part of the payload + * Returns a VLAN tagged skb. If a new skb is created, @skb is freed. + * + * Following the skb_unshare() example, in case of error, the calling function + * doesn't have to worry about freeing the original skb. + * + * Does not change skb->protocol so this function can be used during receive. + */ +static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, + __be16 vlan_proto, u16 vlan_tci) +{ + return vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN); +} + /** * vlan_insert_tag_set_proto - regular VLAN tag inserting * @skb: skbuff to tag diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 64aa9f755e1d..45c9bf5ff3a0 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -48,8 +48,8 @@ bool vlan_do_receive(struct sk_buff **skbp) * original position later */ skb_push(skb, offset); - skb = *skbp = vlan_insert_tag(skb, skb->vlan_proto, - skb->vlan_tci); + skb = *skbp = vlan_insert_inner_tag(skb, skb->vlan_proto, + skb->vlan_tci, skb->mac_len); if (!skb) return false; skb_pull(skb, offset + VLAN_HLEN); -- cgit v1.2.3 From a6618f4aedb2b60932d766bd82ae7ce866e842aa Mon Sep 17 00:00:00 2001 From: Kirill Marinushkin Date: Mon, 19 Mar 2018 07:11:08 +0100 Subject: ALSA: usb-audio: Fix parsing descriptor of UAC2 processing unit Currently, the offsets in the UAC2 processing unit descriptor are calculated incorrectly. It causes an issue when connecting the device which provides such a feature: ~~~~ [84126.724420] usb 1-1.3.1: invalid Processing Unit descriptor (id 18) ~~~~ After this patch is applied, the UAC2 processing unit inits w/o this error. Fixes: 23caaf19b11e ("ALSA: usb-mixer: Add support for Audio Class v2.0") Signed-off-by: Kirill Marinushkin Cc: Signed-off-by: Takashi Iwai --- include/uapi/linux/usb/audio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/usb/audio.h b/include/uapi/linux/usb/audio.h index 17a022c5b414..da3315ed1bcd 100644 --- a/include/uapi/linux/usb/audio.h +++ b/include/uapi/linux/usb/audio.h @@ -370,7 +370,7 @@ static inline __u8 uac_processing_unit_bControlSize(struct uac_processing_unit_d { return (protocol == UAC_VERSION_1) ? desc->baSourceID[desc->bNrInPins + 4] : - desc->baSourceID[desc->bNrInPins + 6]; + 2; /* in UAC2, this value is constant */ } static inline __u8 *uac_processing_unit_bmControls(struct uac_processing_unit_descriptor *desc, @@ -378,7 +378,7 @@ static inline __u8 *uac_processing_unit_bmControls(struct uac_processing_unit_de { return (protocol == UAC_VERSION_1) ? &desc->baSourceID[desc->bNrInPins + 5] : - &desc->baSourceID[desc->bNrInPins + 7]; + &desc->baSourceID[desc->bNrInPins + 6]; } static inline __u8 uac_processing_unit_iProcessing(struct uac_processing_unit_descriptor *desc, -- cgit v1.2.3 From b3a5d111994450909158929560906f2c1c6c1d85 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Mar 2018 12:45:12 -0700 Subject: percpu_ref: Update doc to dissuade users from depending on internal RCU grace periods percpu_ref internally uses sched-RCU to implement the percpu -> atomic mode switching and the documentation suggested that this could be depended upon. This doesn't seem like a good idea. * percpu_ref uses sched-RCU which has different grace periods regular RCU. Users may combine percpu_ref with regular RCU usage and incorrectly believe that regular RCU grace periods are performed by percpu_ref. This can lead to, for example, use-after-free due to premature freeing. * percpu_ref has a grace period when switching from percpu to atomic mode. It doesn't have one between the last put and release. This distinction is subtle and can lead to surprising bugs. * percpu_ref allows starting in and switching to atomic mode manually for debugging and other purposes. This means that there may not be any grace periods from kill to release. This patch makes it clear that the grace periods are percpu_ref's internal implementation detail and can't be depended upon by the users. Signed-off-by: Tejun Heo Cc: Kent Overstreet Cc: Linus Torvalds Signed-off-by: Tejun Heo --- include/linux/percpu-refcount.h | 18 ++++++++++++------ lib/percpu-refcount.c | 2 ++ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 864d167a1073..009cdf3d65b6 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -30,10 +30,14 @@ * calls io_destroy() or the process exits. * * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it - * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove - * the kioctx from the proccess's list of kioctxs - after that, there can't be - * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop - * the initial ref with percpu_ref_put(). + * removes the kioctx from the proccess's table of kioctxs and kills percpu_ref. + * After that, there can't be any new users of the kioctx (from lookup_ioctx()) + * and it's then safe to drop the initial ref with percpu_ref_put(). + * + * Note that the free path, free_ioctx(), needs to go through explicit call_rcu() + * to synchronize with RCU protected lookup_ioctx(). percpu_ref operations don't + * imply RCU grace periods of any kind and if a user wants to combine percpu_ref + * with RCU protection, it must be done explicitly. * * Code that does a two stage shutdown like this often needs some kind of * explicit synchronization to ensure the initial refcount can only be dropped @@ -113,8 +117,10 @@ void percpu_ref_reinit(struct percpu_ref *ref); * Must be used to drop the initial ref on a percpu refcount; must be called * precisely once before shutdown. * - * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the - * percpu counters and dropping the initial ref. + * Switches @ref into atomic mode before gathering up the percpu counters + * and dropping the initial ref. + * + * There are no implied RCU grace periods between kill and release. */ static inline void percpu_ref_kill(struct percpu_ref *ref) { diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 30e7dd88148b..9f96fa7bc000 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -322,6 +322,8 @@ EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu); * This function normally doesn't block and can be called from any context * but it may block if @confirm_kill is specified and @ref is in the * process of switching to atomic mode by percpu_ref_switch_to_atomic(). + * + * There are no implied RCU grace periods between kill and release. */ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) -- cgit v1.2.3 From 80cf79ae4f688ead1dca8878990709c0446c3992 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 19 Mar 2018 12:21:43 +0200 Subject: RDMA/verbs: Remove restrack entry from XRCD structure XRCD object is not implemented in the restrack, so lets remove it. Fixes: 02d8883f520e ("RDMA/restrack: Add general infrastructure to track RDMA resources") Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 73b2387e3f74..ff3ed435701f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1537,10 +1537,6 @@ struct ib_xrcd { struct mutex tgt_qp_mutex; struct list_head tgt_qp_list; - /* - * Implementation details of the RDMA core, don't use in drivers: - */ - struct rdma_restrack_entry res; }; struct ib_ah { -- cgit v1.2.3 From 578ae447e7e5d78c90ac40a06406c1741f79ba96 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 19 Mar 2018 13:18:57 -0500 Subject: jump_label: Disable jump labels in __exit code With the following commit: 333522447063 ("jump_label: Explicitly disable jump labels in __init code") ... we explicitly disabled jump labels in __init code, so they could be detected and not warned about in the following commit: dc1dd184c2f0 ("jump_label: Warn on failed jump_label patching attempt") In-kernel __exit code has the same issue. It's never used, so it's freed along with the rest of initmem. But jump label entries in __exit code aren't explicitly disabled, so we get the following warning when enabling pr_debug() in __exit code: can't patch jump_label at dmi_sysfs_exit+0x0/0x2d WARNING: CPU: 0 PID: 22572 at kernel/jump_label.c:376 __jump_label_update+0x9d/0xb0 Fix the warning by disabling all jump labels in initmem (which includes both __init and __exit code). Reported-and-tested-by: Li Wang Signed-off-by: Josh Poimboeuf Cc: Borislav Petkov Cc: Jason Baron Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: dc1dd184c2f0 ("jump_label: Warn on failed jump_label patching attempt") Link: http://lkml.kernel.org/r/7121e6e595374f06616c505b6e690e275c0054d1.1521483452.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 4 ++-- init/main.c | 2 +- kernel/jump_label.c | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 2168cc6b8b30..b46b541c67c4 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -151,7 +151,7 @@ extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; extern void jump_label_init(void); -extern void jump_label_invalidate_init(void); +extern void jump_label_invalidate_initmem(void); extern void jump_label_lock(void); extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, @@ -199,7 +199,7 @@ static __always_inline void jump_label_init(void) static_key_initialized = true; } -static inline void jump_label_invalidate_init(void) {} +static inline void jump_label_invalidate_initmem(void) {} static __always_inline bool static_key_false(struct static_key *key) { diff --git a/init/main.c b/init/main.c index 969eaf140ef0..21efbf6ace93 100644 --- a/init/main.c +++ b/init/main.c @@ -1001,7 +1001,7 @@ static int __ref kernel_init(void *unused) /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); ftrace_free_init_mem(); - jump_label_invalidate_init(); + jump_label_invalidate_initmem(); free_initmem(); mark_readonly(); system_state = SYSTEM_RUNNING; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index e7214093dcd1..01ebdf1f9f40 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef HAVE_JUMP_LABEL @@ -421,15 +422,15 @@ void __init jump_label_init(void) cpus_read_unlock(); } -/* Disable any jump label entries in __init code */ -void __init jump_label_invalidate_init(void) +/* Disable any jump label entries in __init/__exit code */ +void __init jump_label_invalidate_initmem(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; struct jump_entry *iter; for (iter = iter_start; iter < iter_stop; iter++) { - if (init_kernel_text(iter->code)) + if (init_section_contains((void *)(unsigned long)iter->code, 1)) iter->code = 0; } } -- cgit v1.2.3 From 6b00c35138b404be98b85f4a703be594cbed501c Mon Sep 17 00:00:00 2001 From: Jagdish Gediya Date: Thu, 22 Mar 2018 01:08:10 +0530 Subject: mtd: nand: fsl_ifc: Read ECCSTAT0 and ECCSTAT1 registers for IFC 2.0 Due to missing information in Hardware manual, current implementation doesn't read ECCSTAT0 and ECCSTAT1 registers for IFC 2.0. Add support to read ECCSTAT0 and ECCSTAT1 registers during ecccheck for IFC 2.0. Fixes: 656441478ed5 ("mtd: nand: ifc: Fix location of eccstat registers for IFC V1.0") Cc: stable@vger.kernel.org # v3.18+ Signed-off-by: Jagdish Gediya Reviewed-by: Prabhakar Kushwaha Signed-off-by: Boris Brezillon --- drivers/mtd/nand/fsl_ifc_nand.c | 6 +----- include/linux/fsl_ifc.h | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/mtd/nand/fsl_ifc_nand.c b/drivers/mtd/nand/fsl_ifc_nand.c index fae01b04abc7..5a9c2f0020c2 100644 --- a/drivers/mtd/nand/fsl_ifc_nand.c +++ b/drivers/mtd/nand/fsl_ifc_nand.c @@ -227,11 +227,7 @@ static void fsl_ifc_run_command(struct mtd_info *mtd) int sector_end = sector_start + chip->ecc.steps - 1; __be32 *eccstat_regs; - if (ctrl->version >= FSL_IFC_VERSION_2_0_0) - eccstat_regs = ifc->ifc_nand.v2_nand_eccstat; - else - eccstat_regs = ifc->ifc_nand.v1_nand_eccstat; - + eccstat_regs = ifc->ifc_nand.nand_eccstat; eccstat = ifc_in32(&eccstat_regs[sector_start / 4]); for (i = sector_start; i <= sector_end; i++) { diff --git a/include/linux/fsl_ifc.h b/include/linux/fsl_ifc.h index c332f0a45607..3fdfede2f0f3 100644 --- a/include/linux/fsl_ifc.h +++ b/include/linux/fsl_ifc.h @@ -734,11 +734,7 @@ struct fsl_ifc_nand { u32 res19[0x10]; __be32 nand_fsr; u32 res20; - /* The V1 nand_eccstat is actually 4 words that overlaps the - * V2 nand_eccstat. - */ - __be32 v1_nand_eccstat[2]; - __be32 v2_nand_eccstat[6]; + __be32 nand_eccstat[8]; u32 res21[0x1c]; __be32 nanndcr; u32 res22[0x2]; -- cgit v1.2.3 From 7c181f4fcdc62e5dc7a87fd33387d322262c3b52 Mon Sep 17 00:00:00 2001 From: Ben Caradoc-Davies Date: Mon, 19 Mar 2018 12:57:44 +1300 Subject: mac80211: add ieee80211_hw flag for QoS NDP support Commit 7b6ddeaf27ec ("mac80211: use QoS NDP for AP probing") added an argument qos_ok to ieee80211_nullfunc_get to support QoS NDP. Despite the claim in the commit log "Change all the drivers to *not* allow QoS NDP for now, even though it looks like most of them should be OK with that", this commit enables QoS NDP in response to beacons (see change to mlme.c:ieee80211_send_nullfunc), causing ath9k_htc to lose IP connectivity. See: https://patchwork.kernel.org/patch/10241109/ https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=891060 Introduce a hardware flag to allow such buggy drivers to override the correct default behaviour of mac80211 of sending QoS NDP packets. Signed-off-by: Ben Caradoc-Davies Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ net/mac80211/debugfs.c | 1 + net/mac80211/mlme.c | 3 ++- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c96511fa9198..2b581bd93812 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2063,6 +2063,9 @@ struct ieee80211_txq { * @IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA: Hardware supports buffer STA on * TDLS links. * + * @IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP: The driver (or firmware) doesn't + * support QoS NDP for AP probing - that's most likely a driver bug. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2106,6 +2109,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_REPORTS_LOW_ACK, IEEE80211_HW_SUPPORTS_TX_FRAG, IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA, + IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 1f466d12a6bc..94c7ee9df33b 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -212,6 +212,7 @@ static const char *hw_flag_names[] = { FLAG(REPORTS_LOW_ACK), FLAG(SUPPORTS_TX_FRAG), FLAG(SUPPORTS_TDLS_BUFFER_STA), + FLAG(DOESNT_SUPPORT_QOS_NDP), #undef FLAG }; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 39b660b9a908..5f303abac5ad 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -896,7 +896,8 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, true); + skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, + !ieee80211_hw_check(&local->hw, DOESNT_SUPPORT_QOS_NDP)); if (!skb) return; -- cgit v1.2.3 From 5df7af85ecd88e8b5f1f31d6456c3cf38a8bbdda Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Tue, 20 Mar 2018 09:44:52 +0800 Subject: net: phy: Add general dummy stubs for MMD register access For some phy devices, even though they don't support the MMD extended register access, it does have some side effect if we are trying to read/write the MMD registers via indirect method. So introduce general dummy stubs for MMD register access which these devices can use to avoid such side effect. Fixes: b6b5e8a69118 ("gianfar: Disable EEE autoneg by default") Signed-off-by: Kevin Hao Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 17 +++++++++++++++++ include/linux/phy.h | 4 ++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index fe16f5894092..74664a6c0cdc 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1673,6 +1673,23 @@ int genphy_config_init(struct phy_device *phydev) } EXPORT_SYMBOL(genphy_config_init); +/* This is used for the phy device which doesn't support the MMD extended + * register access, but it does have side effect when we are trying to access + * the MMD register via indirect method. + */ +int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad, u16 regnum) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(genphy_read_mmd_unsupported); + +int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum, + u16 regnum, u16 val) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(genphy_write_mmd_unsupported); + int genphy_suspend(struct phy_device *phydev) { return phy_set_bits(phydev, MII_BMCR, BMCR_PDOWN); diff --git a/include/linux/phy.h b/include/linux/phy.h index b260fb336b25..7c4c2379e010 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -984,6 +984,10 @@ static inline int genphy_no_soft_reset(struct phy_device *phydev) { return 0; } +int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad, + u16 regnum); +int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum, + u16 regnum, u16 val); /* Clause 45 PHY */ int genphy_c45_restart_aneg(struct phy_device *phydev); -- cgit v1.2.3 From b6bdb7517c3d3f41f20e5c2948d6bc3f8897394e Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 22 Mar 2018 16:17:20 -0700 Subject: mm/vmalloc: add interfaces to free unmapped page table On architectures with CONFIG_HAVE_ARCH_HUGE_VMAP set, ioremap() may create pud/pmd mappings. A kernel panic was observed on arm64 systems with Cortex-A75 in the following steps as described by Hanjun Guo. 1. ioremap a 4K size, valid page table will build, 2. iounmap it, pte0 will set to 0; 3. ioremap the same address with 2M size, pgd/pmd is unchanged, then set the a new value for pmd; 4. pte0 is leaked; 5. CPU may meet exception because the old pmd is still in TLB, which will lead to kernel panic. This panic is not reproducible on x86. INVLPG, called from iounmap, purges all levels of entries associated with purged address on x86. x86 still has memory leak. The patch changes the ioremap path to free unmapped page table(s) since doing so in the unmap path has the following issues: - The iounmap() path is shared with vunmap(). Since vmap() only supports pte mappings, making vunmap() to free a pte page is an overhead for regular vmap users as they do not need a pte page freed up. - Checking if all entries in a pte page are cleared in the unmap path is racy, and serializing this check is expensive. - The unmap path calls free_vmap_area_noflush() to do lazy TLB purges. Clearing a pud/pmd entry before the lazy TLB purges needs extra TLB purge. Add two interfaces, pud_free_pmd_page() and pmd_free_pte_page(), which clear a given pud/pmd entry and free up a page for the lower level entries. This patch implements their stub functions on x86 and arm64, which work as workaround. [akpm@linux-foundation.org: fix typo in pmd_free_pte_page() stub] Link: http://lkml.kernel.org/r/20180314180155.19492-2-toshi.kani@hpe.com Fixes: e61ce6ade404e ("mm: change ioremap to set up huge I/O mappings") Reported-by: Lei Li Signed-off-by: Toshi Kani Cc: Catalin Marinas Cc: Wang Xuefeng Cc: Will Deacon Cc: Hanjun Guo Cc: Michal Hocko Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Borislav Petkov Cc: Matthew Wilcox Cc: Chintan Pandya Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 10 ++++++++++ arch/x86/mm/pgtable.c | 24 ++++++++++++++++++++++++ include/asm-generic/pgtable.h | 10 ++++++++++ lib/ioremap.c | 6 ++++-- 4 files changed, 48 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8c704f1e53c2..2dbb2c9f1ec1 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -972,3 +972,13 @@ int pmd_clear_huge(pmd_t *pmdp) pmd_clear(pmdp); return 1; } + +int pud_free_pmd_page(pud_t *pud) +{ + return pud_none(*pud); +} + +int pmd_free_pte_page(pmd_t *pmd) +{ + return pmd_none(*pmd); +} diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 004abf9ebf12..1eed7ed518e6 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -702,4 +702,28 @@ int pmd_clear_huge(pmd_t *pmd) return 0; } + +/** + * pud_free_pmd_page - Clear pud entry and free pmd page. + * @pud: Pointer to a PUD. + * + * Context: The pud range has been unmaped and TLB purged. + * Return: 1 if clearing the entry succeeded. 0 otherwise. + */ +int pud_free_pmd_page(pud_t *pud) +{ + return pud_none(*pud); +} + +/** + * pmd_free_pte_page - Clear pmd entry and free pte page. + * @pmd: Pointer to a PMD. + * + * Context: The pmd range has been unmaped and TLB purged. + * Return: 1 if clearing the entry succeeded. 0 otherwise. + */ +int pmd_free_pte_page(pmd_t *pmd) +{ + return pmd_none(*pmd); +} #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 2cfa3075d148..bfbb44a5ad38 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -983,6 +983,8 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); +int pud_free_pmd_page(pud_t *pud); +int pmd_free_pte_page(pmd_t *pmd); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { @@ -1008,6 +1010,14 @@ static inline int pmd_clear_huge(pmd_t *pmd) { return 0; } +static inline int pud_free_pmd_page(pud_t *pud) +{ + return 0; +} +static inline int pmd_free_pte_page(pmd_t *pmd) +{ + return 0; +} #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE diff --git a/lib/ioremap.c b/lib/ioremap.c index b808a390e4c3..54e5bbaa3200 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -91,7 +91,8 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, if (ioremap_pmd_enabled() && ((next - addr) == PMD_SIZE) && - IS_ALIGNED(phys_addr + addr, PMD_SIZE)) { + IS_ALIGNED(phys_addr + addr, PMD_SIZE) && + pmd_free_pte_page(pmd)) { if (pmd_set_huge(pmd, phys_addr + addr, prot)) continue; } @@ -117,7 +118,8 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, if (ioremap_pud_enabled() && ((next - addr) == PUD_SIZE) && - IS_ALIGNED(phys_addr + addr, PUD_SIZE)) { + IS_ALIGNED(phys_addr + addr, PUD_SIZE) && + pud_free_pmd_page(pud)) { if (pud_set_huge(pud, phys_addr + addr, prot)) continue; } -- cgit v1.2.3 From f59f1caf72ba00d519c793c3deb32cd3be32edc2 Mon Sep 17 00:00:00 2001 From: Daniel Vacek Date: Thu, 22 Mar 2018 16:17:38 -0700 Subject: Revert "mm: page_alloc: skip over regions of invalid pfns where possible" This reverts commit b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible"). The commit is meant to be a boot init speed up skipping the loop in memmap_init_zone() for invalid pfns. But given some specific memory mapping on x86_64 (or more generally theoretically anywhere but on arm with CONFIG_HAVE_ARCH_PFN_VALID) the implementation also skips valid pfns which is plain wrong and causes 'kernel BUG at mm/page_alloc.c:1389!' crash> log | grep -e BUG -e RIP -e Call.Trace -e move_freepages_block -e rmqueue -e freelist -A1 kernel BUG at mm/page_alloc.c:1389! invalid opcode: 0000 [#1] SMP -- RIP: 0010: move_freepages+0x15e/0x160 -- Call Trace: move_freepages_block+0x73/0x80 __rmqueue+0x263/0x460 get_page_from_freelist+0x7e1/0x9e0 __alloc_pages_nodemask+0x176/0x420 -- crash> page_init_bug -v | grep RAM 1000 - 9bfff System RAM (620.00 KiB) 100000 - 430bffff System RAM ( 1.05 GiB = 1071.75 MiB = 1097472.00 KiB) 4b0c8000 - 4bf9cfff System RAM ( 14.83 MiB = 15188.00 KiB) 4bfac000 - 646b1fff System RAM (391.02 MiB = 400408.00 KiB) 7b788000 - 7b7fffff System RAM (480.00 KiB) 100000000 - 67fffffff System RAM ( 22.00 GiB) crash> page_init_bug | head -6 7b788000 - 7b7fffff System RAM (480.00 KiB) 1fffff00000000 0 1 DMA32 4096 1048575 505736 505344 505855 0 0 0 DMA 1 4095 1fffff00000400 0 1 DMA32 4096 1048575 BUG, zones differ! crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b787000 7b788000 PAGE PHYSICAL MAPPING INDEX CNT FLAGS ffffea0001e00000 78000000 0 0 0 0 ffffea0001ed7fc0 7b5ff000 0 0 0 0 ffffea0001ed8000 7b600000 0 0 0 0 <<<< ffffea0001ede1c0 7b787000 0 0 0 0 ffffea0001ede200 7b788000 0 0 1 1fffff00000000 Link: http://lkml.kernel.org/r/20180316143855.29838-1-neelx@redhat.com Fixes: b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible") Signed-off-by: Daniel Vacek Acked-by: Ard Biesheuvel Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Vlastimil Babka Cc: Mel Gorman Cc: Pavel Tatashin Cc: Paul Burton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 1 - mm/memblock.c | 28 ---------------------------- mm/page_alloc.c | 11 +---------- 3 files changed, 1 insertion(+), 39 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 8be5077efb5f..f92ea7783652 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -187,7 +187,6 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, unsigned long *out_end_pfn, int *out_nid); -unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn); /** * for_each_mem_pfn_range - early memory pfn range iterator diff --git a/mm/memblock.c b/mm/memblock.c index b6ba6b7adadc..48376bd33274 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1101,34 +1101,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, *out_nid = r->nid; } -unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn, - unsigned long max_pfn) -{ - struct memblock_type *type = &memblock.memory; - unsigned int right = type->cnt; - unsigned int mid, left = 0; - phys_addr_t addr = PFN_PHYS(++pfn); - - do { - mid = (right + left) / 2; - - if (addr < type->regions[mid].base) - right = mid; - else if (addr >= (type->regions[mid].base + - type->regions[mid].size)) - left = mid + 1; - else { - /* addr is within the region, so pfn is valid */ - return pfn; - } - } while (left < right); - - if (right == type->cnt) - return -1UL; - else - return PHYS_PFN(type->regions[right].base); -} - /** * memblock_set_node - set node ID on memblock regions * @base: base of area to set node ID for diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 010dee0f089e..1741dd23e7c1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5356,17 +5356,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (context != MEMMAP_EARLY) goto not_early; - if (!early_pfn_valid(pfn)) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - /* - * Skip to the pfn preceding the next valid one (or - * end_pfn), such that we hit a valid pfn (or end_pfn) - * on our next iteration of the loop. - */ - pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; -#endif + if (!early_pfn_valid(pfn)) continue; - } if (!early_pfn_in_nid(pfn, nid)) continue; if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) -- cgit v1.2.3 From 84652aefb347297aa08e91e283adf7b18f77c2d5 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 28 Mar 2018 11:27:22 -0700 Subject: RDMA/ucma: Introduce safer rdma_addr_size() variants There are several places in the ucma ABI where userspace can pass in a sockaddr but set the address family to AF_IB. When that happens, rdma_addr_size() will return a size bigger than sizeof struct sockaddr_in6, and the ucma kernel code might end up copying past the end of a buffer not sized for a struct sockaddr_ib. Fix this by introducing new variants int rdma_addr_size_in6(struct sockaddr_in6 *addr); int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr); that are type-safe for the types used in the ucma ABI and return 0 if the size computed is bigger than the size of the type passed in. We can use these new variants to check what size userspace has passed in before copying any addresses. Reported-by: Signed-off-by: Roland Dreier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 16 ++++++++++++++++ drivers/infiniband/core/ucma.c | 34 +++++++++++++++++----------------- include/rdma/ib_addr.h | 2 ++ 3 files changed, 35 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index c3e6811bb1bd..cb1d2ab13c66 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -207,6 +207,22 @@ int rdma_addr_size(struct sockaddr *addr) } EXPORT_SYMBOL(rdma_addr_size); +int rdma_addr_size_in6(struct sockaddr_in6 *addr) +{ + int ret = rdma_addr_size((struct sockaddr *) addr); + + return ret <= sizeof(*addr) ? ret : 0; +} +EXPORT_SYMBOL(rdma_addr_size_in6); + +int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr) +{ + int ret = rdma_addr_size((struct sockaddr *) addr); + + return ret <= sizeof(*addr) ? ret : 0; +} +EXPORT_SYMBOL(rdma_addr_size_kss); + static struct rdma_addr_client self; void rdma_addr_register_client(struct rdma_addr_client *client) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 21585055cf32..d933336d7e01 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -632,6 +632,9 @@ static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; + if (!rdma_addr_size_in6(&cmd.addr)) + return -EINVAL; + ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); @@ -645,22 +648,21 @@ static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_bind cmd; - struct sockaddr *addr; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; - addr = (struct sockaddr *) &cmd.addr; - if (cmd.reserved || !cmd.addr_size || (cmd.addr_size != rdma_addr_size(addr))) + if (cmd.reserved || !cmd.addr_size || + cmd.addr_size != rdma_addr_size_kss(&cmd.addr)) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ret = rdma_bind_addr(ctx->cm_id, addr); + ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); ucma_put_ctx(ctx); return ret; } @@ -670,23 +672,22 @@ static ssize_t ucma_resolve_ip(struct ucma_file *file, int in_len, int out_len) { struct rdma_ucm_resolve_ip cmd; - struct sockaddr *src, *dst; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; - src = (struct sockaddr *) &cmd.src_addr; - dst = (struct sockaddr *) &cmd.dst_addr; - if (!rdma_addr_size(src) || !rdma_addr_size(dst)) + if (!rdma_addr_size_in6(&cmd.src_addr) || + !rdma_addr_size_in6(&cmd.dst_addr)) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ret = rdma_resolve_addr(ctx->cm_id, src, dst, cmd.timeout_ms); + ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, + (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); ucma_put_ctx(ctx); return ret; } @@ -696,24 +697,23 @@ static ssize_t ucma_resolve_addr(struct ucma_file *file, int in_len, int out_len) { struct rdma_ucm_resolve_addr cmd; - struct sockaddr *src, *dst; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; - src = (struct sockaddr *) &cmd.src_addr; - dst = (struct sockaddr *) &cmd.dst_addr; - if (cmd.reserved || (cmd.src_size && (cmd.src_size != rdma_addr_size(src))) || - !cmd.dst_size || (cmd.dst_size != rdma_addr_size(dst))) + if (cmd.reserved || + (cmd.src_size && (cmd.src_size != rdma_addr_size_kss(&cmd.src_addr))) || + !cmd.dst_size || (cmd.dst_size != rdma_addr_size_kss(&cmd.dst_addr))) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ret = rdma_resolve_addr(ctx->cm_id, src, dst, cmd.timeout_ms); + ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, + (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); ucma_put_ctx(ctx); return ret; } @@ -1433,7 +1433,7 @@ static ssize_t ucma_join_ip_multicast(struct ucma_file *file, join_cmd.response = cmd.response; join_cmd.uid = cmd.uid; join_cmd.id = cmd.id; - join_cmd.addr_size = rdma_addr_size((struct sockaddr *) &cmd.addr); + join_cmd.addr_size = rdma_addr_size_in6(&cmd.addr); if (!join_cmd.addr_size) return -EINVAL; @@ -1452,7 +1452,7 @@ static ssize_t ucma_join_multicast(struct ucma_file *file, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; - if (!rdma_addr_size((struct sockaddr *)&cmd.addr)) + if (!rdma_addr_size_kss(&cmd.addr)) return -EINVAL; return ucma_process_join(file, &cmd, out_len); diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index d656809f1217..415e09960017 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -130,6 +130,8 @@ void rdma_copy_addr(struct rdma_dev_addr *dev_addr, const unsigned char *dst_dev_addr); int rdma_addr_size(struct sockaddr *addr); +int rdma_addr_size_in6(struct sockaddr_in6 *addr); +int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr); int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, -- cgit v1.2.3