summary | refs | log | tree | commit | diff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r-- net/ipv4/devinet.c 10
-rw-r--r-- net/ipv4/fib_semantics.c 6
-rw-r--r-- net/ipv4/fib_trie.c 7
-rw-r--r-- net/ipv4/igmp.c 3
-rw-r--r-- net/ipv4/inet_hashtables.c 36
-rw-r--r-- net/ipv4/ip_forward.c 1
-rw-r--r-- net/ipv4/ip_input.c 3
-rw-r--r-- net/ipv4/ip_output.c 9
-rw-r--r-- net/ipv4/ip_sockglue.c 2
-rw-r--r-- net/ipv4/ipmr.c 1
-rw-r--r-- net/ipv4/route.c 7
-rw-r--r-- net/ipv4/tcp.c 14
-rw-r--r-- net/ipv4/tcp_bpf.c 4
-rw-r--r-- net/ipv4/tcp_input.c 13
-rw-r--r-- net/ipv4/tcp_output.c 9
-rw-r--r-- net/ipv4/udp.c 6
16 files changed, 78 insertions, 53 deletions
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 9cf64ee47dd2..ca0ff15dc8fa 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -355,14 +355,14 @@ static void __inet_del_ifa(struct in_device *in_dev,
{
struct in_ifaddr *promote = NULL;
struct in_ifaddr *ifa, *ifa1;
- struct in_ifaddr *last_prim;
+ struct in_ifaddr __rcu **last_prim;
struct in_ifaddr *prev_prom = NULL;
int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev);
ASSERT_RTNL();
ifa1 = rtnl_dereference(*ifap);
- last_prim = rtnl_dereference(in_dev->ifa_list);
+ last_prim = ifap;
if (in_dev->dead)
goto no_promotions;
@@ -376,7 +376,7 @@ static void __inet_del_ifa(struct in_device *in_dev,
while ((ifa = rtnl_dereference(*ifap1)) != NULL) {
if (!(ifa->ifa_flags & IFA_F_SECONDARY) &&
ifa1->ifa_scope <= ifa->ifa_scope)
- last_prim = ifa;
+ last_prim = &ifa->ifa_next;
if (!(ifa->ifa_flags & IFA_F_SECONDARY) ||
ifa1->ifa_mask != ifa->ifa_mask ||
@@ -440,9 +440,9 @@ no_promotions:
rcu_assign_pointer(prev_prom->ifa_next, next_sec);
- last_sec = rtnl_dereference(last_prim->ifa_next);
+ last_sec = rtnl_dereference(*last_prim);
rcu_assign_pointer(promote->ifa_next, last_sec);
- rcu_assign_pointer(last_prim->ifa_next, promote);
+ rcu_assign_pointer(*last_prim, promote);
}
promote->ifa_flags &= ~IFA_F_SECONDARY;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 65ba18a91865..1ea82bc33ef1 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -278,7 +278,8 @@ void fib_release_info(struct fib_info *fi)
hlist_del(&nexthop_nh->nh_hash);
} endfor_nexthops(fi)
}
- fi->fib_dead = 1;
+ /* Paired with READ_ONCE() from fib_table_lookup() */
+ WRITE_ONCE(fi->fib_dead, 1);
fib_info_put(fi);
}
spin_unlock_bh(&fib_info_lock);
@@ -1581,6 +1582,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
link_it:
ofi = fib_find_info(fi);
if (ofi) {
+ /* fib_table_lookup() should not see @fi yet. */
fi->fib_dead = 1;
free_fib_info(fi);
refcount_inc(&ofi->fib_treeref);
@@ -1619,6 +1621,7 @@ err_inval:
failure:
if (fi) {
+ /* fib_table_lookup() should not see @fi yet. */
fi->fib_dead = 1;
free_fib_info(fi);
}
@@ -1884,6 +1887,7 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
continue;
if (fi->fib_prefsrc == local) {
fi->fib_flags |= RTNH_F_DEAD;
+ fi->pfsrc_removed = true;
ret++;
}
}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 74d403dbd2b4..9bdfdab906fe 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1582,7 +1582,8 @@ found:
if (fa->fa_dscp &&
inet_dscp_to_dsfield(fa->fa_dscp) != flp->flowi4_tos)
continue;
- if (fi->fib_dead)
+ /* Paired with WRITE_ONCE() in fib_release_info() */
+ if (READ_ONCE(fi->fib_dead))
continue;
if (fa->fa_info->fib_scope < flp->flowi4_scope)
continue;
@@ -2026,6 +2027,7 @@ void fib_table_flush_external(struct fib_table *tb)
int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
{
struct trie *t = (struct trie *)tb->tb_data;
+ struct nl_info info = { .nl_net = net };
struct key_vector *pn = t->kv;
unsigned long cindex = 1;
struct hlist_node *tmp;
@@ -2088,6 +2090,9 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
fib_notify_alias_delete(net, n->key, &n->leaf, fa,
NULL);
+ if (fi->pfsrc_removed)
+ rtmsg_fib(RTM_DELROUTE, htonl(n->key), fa,
+ KEYLENGTH - fa->fa_slen, tb->tb_id, &info, 0);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 0c9e768e5628..418e5fb58fd3 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -353,8 +353,9 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
struct flowi4 fl4;
int hlen = LL_RESERVED_SPACE(dev);
int tlen = dev->needed_tailroom;
- unsigned int size = mtu;
+ unsigned int size;
+ size = min(mtu, IP_MAX_MTU);
while (1) {
skb = alloc_skb(size + hlen + tlen,
GFP_ATOMIC | __GFP_NOWARN);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 7876b7d703cb..c32f5e28758b 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -815,41 +815,45 @@ static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb,
const struct net *net, unsigned short port,
int l3mdev, const struct sock *sk)
{
+ if (!net_eq(ib2_net(tb), net) || tb->port != port ||
+ tb->l3mdev != l3mdev)
+ return false;
+
#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family != tb->family)
+ if (sk->sk_family != tb->family) {
+ if (sk->sk_family == AF_INET)
+ return ipv6_addr_v4mapped(&tb->v6_rcv_saddr) &&
+ tb->v6_rcv_saddr.s6_addr32[3] == sk->sk_rcv_saddr;
+
return false;
+ }
if (sk->sk_family == AF_INET6)
- return net_eq(ib2_net(tb), net) && tb->port == port &&
- tb->l3mdev == l3mdev &&
- ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);
- else
+ return ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);
#endif
- return net_eq(ib2_net(tb), net) && tb->port == port &&
- tb->l3mdev == l3mdev && tb->rcv_saddr == sk->sk_rcv_saddr;
+ return tb->rcv_saddr == sk->sk_rcv_saddr;
}
bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net,
unsigned short port, int l3mdev, const struct sock *sk)
{
+ if (!net_eq(ib2_net(tb), net) || tb->port != port ||
+ tb->l3mdev != l3mdev)
+ return false;
+
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family != tb->family) {
if (sk->sk_family == AF_INET)
- return net_eq(ib2_net(tb), net) && tb->port == port &&
- tb->l3mdev == l3mdev &&
- ipv6_addr_any(&tb->v6_rcv_saddr);
+ return ipv6_addr_any(&tb->v6_rcv_saddr) ||
+ ipv6_addr_v4mapped_any(&tb->v6_rcv_saddr);
return false;
}
if (sk->sk_family == AF_INET6)
- return net_eq(ib2_net(tb), net) && tb->port == port &&
- tb->l3mdev == l3mdev &&
- ipv6_addr_any(&tb->v6_rcv_saddr);
- else
+ return ipv6_addr_any(&tb->v6_rcv_saddr);
#endif
- return net_eq(ib2_net(tb), net) && tb->port == port &&
- tb->l3mdev == l3mdev && tb->rcv_saddr == 0;
+ return tb->rcv_saddr == 0;
}
/* The socket's bhash2 hashbucket spinlock must be held when this is called */
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index e18931a6d153..66fac1216d46 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -67,7 +67,6 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s
struct ip_options *opt = &(IPCB(skb)->opt);
__IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
- __IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
#ifdef CONFIG_NET_SWITCHDEV
if (skb->offload_l3_fwd_mark) {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index fe9ead9ee863..5e9c8156656a 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -584,7 +584,8 @@ static void ip_sublist_rcv_finish(struct list_head *head)
static struct sk_buff *ip_extract_route_hint(const struct net *net,
struct sk_buff *skb, int rt_type)
{
- if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST)
+ if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST ||
+ IPCB(skb)->flags & IPSKB_MULTIPATH)
return NULL;
return skb;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 43ba4b77b248..4ab877cf6d35 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -207,6 +207,9 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
} else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
+ /* OUTOCTETS should be counted after fragment */
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
+
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
skb = skb_expand_head(skb, hh_len);
if (!skb)
@@ -366,8 +369,6 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
/*
* If the indicated interface is up and running, send the packet.
*/
- IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
-
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
@@ -424,8 +425,6 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
- IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
-
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
@@ -982,7 +981,7 @@ static int __ip_append_data(struct sock *sk,
paged = !!cork->gso_size;
if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
- sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
+ READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
tskey = atomic_inc_return(&sk->sk_tskey) - 1;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index d1c73660b844..cce9cb25f3b3 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -511,7 +511,7 @@ static bool ipv4_datagram_support_cmsg(const struct sock *sk,
* or without payload (SOF_TIMESTAMPING_OPT_TSONLY).
*/
info = PKTINFO_SKB_CB(skb);
- if (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG) ||
+ if (!(READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_CMSG) ||
!info->ipi_ifindex)
return false;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3f0c6d602fb7..9e222a57bc2b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1804,7 +1804,6 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
struct ip_options *opt = &(IPCB(skb)->opt);
IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
- IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
if (unlikely(opt->optlen))
ip_forward_options(skb);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d8c99bdc6170..b214b5a2e045 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1213,6 +1213,7 @@ EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
+ struct net_device *dev;
struct ip_options opt;
int res;
@@ -1230,7 +1231,8 @@ static void ipv4_send_dest_unreach(struct sk_buff *skb)
opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
rcu_read_lock();
- res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
+ dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
+ res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
rcu_read_unlock();
if (res)
@@ -2144,6 +2146,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
fib_select_multipath(res, h);
+ IPCB(skb)->flags |= IPSKB_MULTIPATH;
}
#endif
@@ -3414,6 +3417,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fa->fa_type == fri.type) {
fri.offload = READ_ONCE(fa->offload);
fri.trap = READ_ONCE(fa->trap);
+ fri.offload_failed =
+ READ_ONCE(fa->offload_failed);
break;
}
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b1559481898d..3f66cdeef7de 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1621,16 +1621,13 @@ EXPORT_SYMBOL(tcp_read_sock);
int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
- struct tcp_sock *tp = tcp_sk(sk);
- u32 seq = tp->copied_seq;
struct sk_buff *skb;
int copied = 0;
- u32 offset;
if (sk->sk_state == TCP_LISTEN)
return -ENOTCONN;
- while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
u8 tcp_flags;
int used;
@@ -1643,13 +1640,10 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
copied = used;
break;
}
- seq += used;
copied += used;
- if (tcp_flags & TCPHDR_FIN) {
- ++seq;
+ if (tcp_flags & TCPHDR_FIN)
break;
- }
}
return copied;
}
@@ -2256,14 +2250,14 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
}
}
- if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
+ if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_SOFTWARE)
has_timestamping = true;
else
tss->ts[0] = (struct timespec64) {0};
}
if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
- if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
+ if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_RAW_HARDWARE)
has_timestamping = true;
else
tss->ts[2] = (struct timespec64) {0};
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 81f0dff69e0b..327268203001 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -222,6 +222,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
int *addr_len)
{
struct tcp_sock *tcp = tcp_sk(sk);
+ int peek = flags & MSG_PEEK;
u32 seq = tcp->copied_seq;
struct sk_psock *psock;
int copied = 0;
@@ -311,7 +312,8 @@ msg_bytes_ready:
copied = -EAGAIN;
}
out:
- WRITE_ONCE(tcp->copied_seq, seq);
+ if (!peek)
+ WRITE_ONCE(tcp->copied_seq, seq);
tcp_rcv_space_adjust(sk);
if (copied > 0)
__tcp_cleanup_rbuf(sk, copied);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 06fe1cf645d5..8afb0950a697 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -253,6 +253,19 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
if (unlikely(len > icsk->icsk_ack.rcv_mss +
MAX_TCP_OPTION_SPACE))
tcp_gro_dev_warn(sk, skb, len);
+ /* If the skb has a len of exactly 1*MSS and has the PSH bit
+ * set then it is likely the end of an application write. So
+ * more data may not be arriving soon, and yet the data sender
+ * may be waiting for an ACK if cwnd-bound or using TX zero
+ * copy. So we set ICSK_ACK_PUSHED here so that
+ * tcp_cleanup_rbuf() will send an ACK immediately if the app
+ * reads all of the data and is not ping-pong. If len > MSS
+ * then this logic does not matter (and does not hurt) because
+ * tcp_cleanup_rbuf() will always ACK immediately if the app
+ * reads data and there is more than an MSS of unACKed data.
+ */
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH)
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
} else {
/* Otherwise, we make more careful check taking into account,
* that SACKs block is variable.
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e6b4fbd642f7..aa0fc8c766e5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -177,8 +177,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
}
/* Account for an ACK we sent. */
-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
- u32 rcv_nxt)
+static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -192,7 +191,7 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
if (unlikely(rcv_nxt != tp->rcv_nxt))
return; /* Special ACK sent by DCTCP to reflect ECN */
- tcp_dec_quickack_mode(sk, pkts);
+ tcp_dec_quickack_mode(sk);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
@@ -1387,7 +1386,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
sk, skb);
if (likely(tcb->tcp_flags & TCPHDR_ACK))
- tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
+ tcp_event_ack_sent(sk, rcv_nxt);
if (skb->len != tcp_header_size) {
tcp_event_data_sent(tp, sk);
@@ -3474,7 +3473,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
if (delta <= 0)
return;
amt = sk_mem_pages(delta);
- sk->sk_forward_alloc += amt << PAGE_SHIFT;
+ sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
sk_memory_allocated_add(sk, amt);
if (mem_cgroup_sockets_enabled && sk->sk_memcg)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0794a2c46a56..f39b9c844580 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1414,9 +1414,9 @@ static void udp_rmem_release(struct sock *sk, int size, int partial,
spin_lock(&sk_queue->lock);
- sk->sk_forward_alloc += size;
+ sk_forward_alloc_add(sk, size);
amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
- sk->sk_forward_alloc -= amt;
+ sk_forward_alloc_add(sk, -amt);
if (amt)
__sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT);
@@ -1527,7 +1527,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
goto uncharge_drop;
}
- sk->sk_forward_alloc -= size;
+ sk_forward_alloc_add(sk, -size);
/* no need to setup a destructor, we will explicitly release the
* forward allocated memory on dequeue