summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
authorKuniyuki Iwashima <kuniyu@amazon.com>2022-07-13 20:52:07 +0300
committerJakub Kicinski <kuba@kernel.org>2022-07-16 04:50:35 +0300
commit11052589cf5c0bab3b4884d423d5f60c38fcf25d (patch)
tree80de70b884d5defb7e23909e9ed7c5973bf5042a /net/ipv4
parentdf254d4508f96f298be8aff08da4d7e33d8584a0 (diff)
downloadlinux-11052589cf5c0bab3b4884d423d5f60c38fcf25d.tar.xz
tcp/udp: Make early_demux back namespacified.
Commit e21145a9871a ("ipv4: namespacify ip_early_demux sysctl knob") made it possible to enable/disable early_demux on a per-netns basis. Then, we introduced two knobs, tcp_early_demux and udp_early_demux, to switch it for TCP/UDP in commit dddb64bcb346 ("net: Add sysctl to toggle early demux for tcp and udp"). However, the .proc_handler() was wrong and actually disabled us from changing the behaviour in each netns. We can execute early_demux if net.ipv4.ip_early_demux is on and each proto .early_demux() handler is not NULL. When we toggle (tcp|udp)_early_demux, the change itself is saved in each netns variable, but the .early_demux() handler is a global variable, so the handler is switched based on the init_net's sysctl variable. Thus, netns (tcp|udp)_early_demux knobs have nothing to do with the logic. Whether we CAN execute proto .early_demux() is always decided by init_net's sysctl knob, and whether we DO it or not is by each netns ip_early_demux knob. This patch namespacifies (tcp|udp)_early_demux again. For now, the users of the .early_demux() handler are TCP and UDP only, and they are called directly to avoid retpoline. So, we can remove the .early_demux() handler from inet6?_protos and need not dereference them in ip6?_rcv_finish_core(). If another proto needs .early_demux(), we can restore it at that time. Fixes: dddb64bcb346 ("net: Add sysctl to toggle early demux for tcp and udp") Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> Link: https://lore.kernel.org/r/20220713175207.7727-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c14
-rw-r--r--net/ipv4/ip_input.c37
-rw-r--r--net/ipv4/sysctl_net_ipv4.c59
3 files changed, 26 insertions, 84 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 4bc24f9e38b3..8baef2f3fc4b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1710,24 +1710,14 @@ static const struct net_protocol igmp_protocol = {
};
#endif
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct net_protocol tcp_protocol = {
- .early_demux = tcp_v4_early_demux,
- .early_demux_handler = tcp_v4_early_demux,
+static const struct net_protocol tcp_protocol = {
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
.no_policy = 1,
.icmp_strict_tag_validation = 1,
};
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct net_protocol udp_protocol = {
- .early_demux = udp_v4_early_demux,
- .early_demux_handler = udp_v4_early_demux,
+static const struct net_protocol udp_protocol = {
.handler = udp_rcv,
.err_handler = udp_err,
.no_policy = 1,
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index b1165f717cd1..1b512390b3cf 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -312,14 +312,13 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
ip_hdr(hint)->tos == iph->tos;
}
-INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *));
-INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *));
+int tcp_v4_early_demux(struct sk_buff *skb);
+int udp_v4_early_demux(struct sk_buff *skb);
static int ip_rcv_finish_core(struct net *net, struct sock *sk,
struct sk_buff *skb, struct net_device *dev,
const struct sk_buff *hint)
{
const struct iphdr *iph = ip_hdr(skb);
- int (*edemux)(struct sk_buff *skb);
int err, drop_reason;
struct rtable *rt;
@@ -332,21 +331,29 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
goto drop_error;
}
- if (net->ipv4.sysctl_ip_early_demux &&
+ if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
!skb_dst(skb) &&
!skb->sk &&
!ip_is_fragment(iph)) {
- const struct net_protocol *ipprot;
- int protocol = iph->protocol;
-
- ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
- err = INDIRECT_CALL_2(edemux, tcp_v4_early_demux,
- udp_v4_early_demux, skb);
- if (unlikely(err))
- goto drop_error;
- /* must reload iph, skb->head might have changed */
- iph = ip_hdr(skb);
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) {
+ tcp_v4_early_demux(skb);
+
+ /* must reload iph, skb->head might have changed */
+ iph = ip_hdr(skb);
+ }
+ break;
+ case IPPROTO_UDP:
+ if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) {
+ err = udp_v4_early_demux(skb);
+ if (unlikely(err))
+ goto drop_error;
+
+ /* must reload iph, skb->head might have changed */
+ iph = ip_hdr(skb);
+ }
+ break;
}
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 108fd86f2718..130e9c130311 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -350,61 +350,6 @@ bad_key:
return ret;
}
-static void proc_configure_early_demux(int enabled, int protocol)
-{
- struct net_protocol *ipprot;
-#if IS_ENABLED(CONFIG_IPV6)
- struct inet6_protocol *ip6prot;
-#endif
-
- rcu_read_lock();
-
- ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot)
- ipprot->early_demux = enabled ? ipprot->early_demux_handler :
- NULL;
-
-#if IS_ENABLED(CONFIG_IPV6)
- ip6prot = rcu_dereference(inet6_protos[protocol]);
- if (ip6prot)
- ip6prot->early_demux = enabled ? ip6prot->early_demux_handler :
- NULL;
-#endif
- rcu_read_unlock();
-}
-
-static int proc_tcp_early_demux(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- int ret = 0;
-
- ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
-
- if (write && !ret) {
- int enabled = init_net.ipv4.sysctl_tcp_early_demux;
-
- proc_configure_early_demux(enabled, IPPROTO_TCP);
- }
-
- return ret;
-}
-
-static int proc_udp_early_demux(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- int ret = 0;
-
- ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
-
- if (write && !ret) {
- int enabled = init_net.ipv4.sysctl_udp_early_demux;
-
- proc_configure_early_demux(enabled, IPPROTO_UDP);
- }
-
- return ret;
-}
-
static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
int write, void *buffer,
size_t *lenp, loff_t *ppos)
@@ -707,14 +652,14 @@ static struct ctl_table ipv4_net_table[] = {
.data = &init_net.ipv4.sysctl_udp_early_demux,
.maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_udp_early_demux
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_early_demux",
.data = &init_net.ipv4.sysctl_tcp_early_demux,
.maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_tcp_early_demux
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "nexthop_compat_mode",