summaryrefslogtreecommitdiff
path: root/include/net
diff options
context:
space:
mode:
Diffstat (limited to 'include/net')
-rw-r--r--include/net/act_api.h2
-rw-r--r--include/net/addrconf.h12
-rw-r--r--include/net/af_unix.h1
-rw-r--r--include/net/af_vsock.h2
-rw-r--r--include/net/cfg80211.h46
-rw-r--r--include/net/dropreason-core.h6
-rw-r--r--include/net/genetlink.h2
-rw-r--r--include/net/if_inet6.h4
-rw-r--r--include/net/inet_sock.h5
-rw-r--r--include/net/ip.h10
-rw-r--r--include/net/ip6_fib.h4
-rw-r--r--include/net/mana/gdma.h7
-rw-r--r--include/net/neighbour.h2
-rw-r--r--include/net/netdev_rx_queue.h4
-rw-r--r--include/net/netfilter/nf_flow_table.h10
-rw-r--r--include/net/netlink.h2
-rw-r--r--include/net/netns/core.h1
-rw-r--r--include/net/netns/ipv4.h50
-rw-r--r--include/net/page_pool/helpers.h77
-rw-r--r--include/net/page_pool/types.h6
-rw-r--r--include/net/sock.h1
-rw-r--r--include/net/tcp.h9
-rw-r--r--include/net/tcp_ao.h6
-rw-r--r--include/net/tcp_states.h2
-rw-r--r--include/net/xdp_sock.h111
-rw-r--r--include/net/xdp_sock_drv.h34
-rw-r--r--include/net/xsk_buff_pool.h8
27 files changed, 350 insertions, 74 deletions
diff --git a/include/net/act_api.h b/include/net/act_api.h
index 4ae0580b63ca..ea13e1e4a7c2 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -191,7 +191,7 @@ int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index,
struct nlattr *est, struct tc_action **a,
const struct tc_action_ops *ops, int bind,
u32 flags);
-void tcf_idr_insert_many(struct tc_action *actions[]);
+void tcf_idr_insert_many(struct tc_action *actions[], int init_res[]);
void tcf_idr_cleanup(struct tc_action_net *tn, u32 index);
int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index,
struct tc_action **a, int bind);
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 82da55101b5a..61ebe723ee4d 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -31,17 +31,22 @@ struct prefix_info {
__u8 length;
__u8 prefix_len;
+ union __packed {
+ __u8 flags;
+ struct __packed {
#if defined(__BIG_ENDIAN_BITFIELD)
- __u8 onlink : 1,
+ __u8 onlink : 1,
autoconf : 1,
reserved : 6;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
- __u8 reserved : 6,
+ __u8 reserved : 6,
autoconf : 1,
onlink : 1;
#else
#error "Please fix <asm/byteorder.h>"
#endif
+ };
+ };
__be32 valid;
__be32 prefered;
__be32 reserved2;
@@ -49,6 +54,9 @@ struct prefix_info {
struct in6_addr prefix;
};
+/* rfc4861 4.6.2: IPv6 PIO is 32 bytes in size */
+static_assert(sizeof(struct prefix_info) == 32);
+
#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <net/if_inet6.h>
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 824c258143a3..49c4640027d8 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -75,6 +75,7 @@ struct unix_sock {
};
#define unix_sk(ptr) container_of_const(ptr, struct unix_sock, sk)
+#define unix_peer(sk) (unix_sk(sk)->peer)
#define peer_wait peer_wq.wait
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index e302c0e804d0..535701efc1e5 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -137,7 +137,6 @@ struct vsock_transport {
u64 (*stream_rcvhiwat)(struct vsock_sock *);
bool (*stream_is_active)(struct vsock_sock *);
bool (*stream_allow)(u32 cid, u32 port);
- int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
/* SEQ_PACKET. */
ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
@@ -168,6 +167,7 @@ struct vsock_transport {
struct vsock_transport_send_notify_data *);
/* sk_lock held by the caller */
void (*notify_buffer_size)(struct vsock_sock *, u64 *);
+ int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
/* Shutdown. */
int (*shutdown)(struct vsock_sock *, int);
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 602960dafe0f..ac1fb326dcda 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -9375,4 +9375,50 @@ bool cfg80211_valid_disable_subchannel_bitmap(u16 *bitmap,
*/
void cfg80211_links_removed(struct net_device *dev, u16 link_mask);
+#ifdef CONFIG_CFG80211_DEBUGFS
+/**
+ * wiphy_locked_debugfs_read - do a locked read in debugfs
+ * @wiphy: the wiphy to use
+ * @file: the file being read
+ * @buf: the buffer to fill and then read from
+ * @bufsize: size of the buffer
+ * @userbuf: the user buffer to copy to
+ * @count: read count
+ * @ppos: read position
+ * @handler: the read handler to call (under wiphy lock)
+ * @data: additional data to pass to the read handler
+ */
+ssize_t wiphy_locked_debugfs_read(struct wiphy *wiphy, struct file *file,
+ char *buf, size_t bufsize,
+ char __user *userbuf, size_t count,
+ loff_t *ppos,
+ ssize_t (*handler)(struct wiphy *wiphy,
+ struct file *file,
+ char *buf,
+ size_t bufsize,
+ void *data),
+ void *data);
+
+/**
+ * wiphy_locked_debugfs_write - do a locked write in debugfs
+ * @wiphy: the wiphy to use
+ * @file: the file being written to
+ * @buf: the buffer to copy the user data to
+ * @bufsize: size of the buffer
+ * @userbuf: the user buffer to copy from
+ * @count: read count
+ * @handler: the write handler to call (under wiphy lock)
+ * @data: additional data to pass to the write handler
+ */
+ssize_t wiphy_locked_debugfs_write(struct wiphy *wiphy, struct file *file,
+ char *buf, size_t bufsize,
+ const char __user *userbuf, size_t count,
+ ssize_t (*handler)(struct wiphy *wiphy,
+ struct file *file,
+ char *buf,
+ size_t count,
+ void *data),
+ void *data);
+#endif
+
#endif /* __NET_CFG80211_H */
diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index 3c70ad53a49c..278e4c7d465c 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -86,6 +86,7 @@
FN(IPV6_NDISC_NS_OTHERHOST) \
FN(QUEUE_PURGE) \
FN(TC_ERROR) \
+ FN(PACKET_SOCK_ERROR) \
FNe(MAX)
/**
@@ -379,6 +380,11 @@ enum skb_drop_reason {
/** @SKB_DROP_REASON_TC_ERROR: generic internal tc error. */
SKB_DROP_REASON_TC_ERROR,
/**
+ * @SKB_DROP_REASON_PACKET_SOCK_ERROR: generic packet socket errors
+ * after its filter matches an incoming packet.
+ */
+ SKB_DROP_REASON_PACKET_SOCK_ERROR,
+ /**
* @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
* shouldn't be used as a real 'reason' - only for tracing code gen
*/
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index e18a4c0d69ee..c53244f20437 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -12,10 +12,12 @@
* struct genl_multicast_group - generic netlink multicast group
* @name: name of the multicast group, names are per-family
* @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM)
+ * @cap_sys_admin: whether %CAP_SYS_ADMIN is required for binding
*/
struct genl_multicast_group {
char name[GENL_NAMSIZ];
u8 flags;
+ u8 cap_sys_admin:1;
};
struct genl_split_ops;
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index 3e454c4d7ba6..f07642264c1e 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -22,10 +22,6 @@
#define IF_RS_SENT 0x10
#define IF_READY 0x80000000
-/* prefix flags */
-#define IF_PREFIX_ONLINK 0x01
-#define IF_PREFIX_AUTOCONF 0x02
-
enum {
INET6_IFADDR_STATE_PREDAD,
INET6_IFADDR_STATE_DAD,
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 74db6d97cae1..aa86453f6b9b 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -234,10 +234,7 @@ struct inet_sock {
int uc_index;
int mc_index;
__be32 mc_addr;
- struct {
- __u16 lo;
- __u16 hi;
- } local_port_range;
+ u32 local_port_range; /* high << 16 | low */
struct ip_mc_socklist __rcu *mc_list;
struct inet_cork_full cork;
diff --git a/include/net/ip.h b/include/net/ip.h
index 1fc4c8d69e33..de0c69c57e3c 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -349,8 +349,14 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o
} \
}
-void inet_get_local_port_range(const struct net *net, int *low, int *high);
-void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);
+static inline void inet_get_local_port_range(const struct net *net, int *low, int *high)
+{
+ u32 range = READ_ONCE(net->ipv4.ip_local_ports.range);
+
+ *low = range & 0xffff;
+ *high = range >> 16;
+}
+bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);
#ifdef CONFIG_SYSCTL
static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 1ba9f4ddf2f6..95ed495c3a40 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -328,8 +328,10 @@ static inline bool fib6_info_hold_safe(struct fib6_info *f6i)
static inline void fib6_info_release(struct fib6_info *f6i)
{
- if (f6i && refcount_dec_and_test(&f6i->fib6_ref))
+ if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) {
+ DEBUG_NET_WARN_ON_ONCE(!hlist_unhashed(&f6i->gc_link));
call_rcu(&f6i->rcu, fib6_info_destroy_rcu);
+ }
}
enum fib6_walk_state {
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 88b6ef7ce1a6..76f2fd2645b7 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -293,6 +293,7 @@ struct gdma_queue {
u32 head;
u32 tail;
+ struct list_head entry;
/* Extra fields specific to EQ/CQ. */
union {
@@ -328,6 +329,7 @@ struct gdma_queue_spec {
void *context;
unsigned long log2_throttle_limit;
+ unsigned int msix_index;
} eq;
struct {
@@ -344,7 +346,9 @@ struct gdma_queue_spec {
struct gdma_irq_context {
void (*handler)(void *arg);
- void *arg;
+ /* Protect the eq_list */
+ spinlock_t lock;
+ struct list_head eq_list;
char name[MANA_IRQ_NAME_SZ];
};
@@ -355,7 +359,6 @@ struct gdma_context {
unsigned int max_num_queues;
unsigned int max_num_msix;
unsigned int num_msix_usable;
- struct gdma_resource msix_resource;
struct gdma_irq_context *irq_contexts;
/* L2 MTU */
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 07022bb0d44d..0d28172193fa 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -162,7 +162,7 @@ struct neighbour {
struct rcu_head rcu;
struct net_device *dev;
netdevice_tracker dev_tracker;
- u8 primary_key[0];
+ u8 primary_key[];
} __randomize_layout;
struct neigh_ops {
diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h
index cdcafb30d437..aa1716fb0e53 100644
--- a/include/net/netdev_rx_queue.h
+++ b/include/net/netdev_rx_queue.h
@@ -21,6 +21,10 @@ struct netdev_rx_queue {
#ifdef CONFIG_XDP_SOCKETS
struct xsk_buff_pool *pool;
#endif
+ /* NAPI instance for the queue
+ * Readers and writers must hold RTNL
+ */
+ struct napi_struct *napi;
} ____cacheline_aligned_in_smp;
/*
diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index fe1507c1db82..692d5955911c 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -62,6 +62,8 @@ struct nf_flowtable_type {
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule);
void (*free)(struct nf_flowtable *ft);
+ void (*get)(struct nf_flowtable *ft);
+ void (*put)(struct nf_flowtable *ft);
nf_hookfn *hook;
struct module *owner;
};
@@ -240,6 +242,11 @@ nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table,
}
list_add_tail(&block_cb->list, &block->cb_list);
+ up_write(&flow_table->flow_block_lock);
+
+ if (flow_table->type->get)
+ flow_table->type->get(flow_table);
+ return 0;
unlock:
up_write(&flow_table->flow_block_lock);
@@ -262,6 +269,9 @@ nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table,
WARN_ON(true);
}
up_write(&flow_table->flow_block_lock);
+
+ if (flow_table->type->put)
+ flow_table->type->put(flow_table);
}
void flow_offload_route_init(struct flow_offload *flow,
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 167b91348e57..28039e57070a 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -1214,7 +1214,7 @@ static inline void *nla_data(const struct nlattr *nla)
* nla_len - length of payload
* @nla: netlink attribute
*/
-static inline int nla_len(const struct nlattr *nla)
+static inline u16 nla_len(const struct nlattr *nla)
{
return nla->nla_len - NLA_HDRLEN;
}
diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index a91ef9f8de60..78214f1b43a2 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -13,6 +13,7 @@ struct netns_core {
struct ctl_table_header *sysctl_hdr;
int sysctl_somaxconn;
+ int sysctl_optmem_max;
u8 sysctl_txrehash;
#ifdef CONFIG_PROC_FS
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 73f43f699199..c356c458b340 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -19,8 +19,7 @@ struct hlist_head;
struct fib_table;
struct sock;
struct local_ports {
- seqlock_t lock;
- int range[2];
+ u32 range; /* high << 16 | low */
bool warned;
};
@@ -42,6 +41,38 @@ struct inet_timewait_death_row {
struct tcp_fastopen_context;
struct netns_ipv4 {
+ /* Cacheline organization can be found documented in
+ * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst.
+ * Please update the document when adding new fields.
+ */
+
+ /* TX readonly hotpath cache lines */
+ __cacheline_group_begin(netns_ipv4_read_tx);
+ u8 sysctl_tcp_early_retrans;
+ u8 sysctl_tcp_tso_win_divisor;
+ u8 sysctl_tcp_tso_rtt_log;
+ u8 sysctl_tcp_autocorking;
+ int sysctl_tcp_min_snd_mss;
+ unsigned int sysctl_tcp_notsent_lowat;
+ int sysctl_tcp_limit_output_bytes;
+ int sysctl_tcp_min_rtt_wlen;
+ int sysctl_tcp_wmem[3];
+ u8 sysctl_ip_fwd_use_pmtu;
+ __cacheline_group_end(netns_ipv4_read_tx);
+
+ /* TXRX readonly hotpath cache lines */
+ __cacheline_group_begin(netns_ipv4_read_txrx);
+ u8 sysctl_tcp_moderate_rcvbuf;
+ __cacheline_group_end(netns_ipv4_read_txrx);
+
+ /* RX readonly hotpath cache line */
+ __cacheline_group_begin(netns_ipv4_read_rx);
+ u8 sysctl_ip_early_demux;
+ u8 sysctl_tcp_early_demux;
+ int sysctl_tcp_reordering;
+ int sysctl_tcp_rmem[3];
+ __cacheline_group_end(netns_ipv4_read_rx);
+
struct inet_timewait_death_row tcp_death_row;
struct udp_table *udp_table;
@@ -96,17 +127,14 @@ struct netns_ipv4 {
u8 sysctl_ip_default_ttl;
u8 sysctl_ip_no_pmtu_disc;
- u8 sysctl_ip_fwd_use_pmtu;
u8 sysctl_ip_fwd_update_priority;
u8 sysctl_ip_nonlocal_bind;
u8 sysctl_ip_autobind_reuse;
/* Shall we try to damage output packets if routing dev changes? */
u8 sysctl_ip_dynaddr;
- u8 sysctl_ip_early_demux;
#ifdef CONFIG_NET_L3_MASTER_DEV
u8 sysctl_raw_l3mdev_accept;
#endif
- u8 sysctl_tcp_early_demux;
u8 sysctl_udp_early_demux;
u8 sysctl_nexthop_compat_mode;
@@ -119,7 +147,6 @@ struct netns_ipv4 {
u8 sysctl_tcp_mtu_probing;
int sysctl_tcp_mtu_probe_floor;
int sysctl_tcp_base_mss;
- int sysctl_tcp_min_snd_mss;
int sysctl_tcp_probe_threshold;
u32 sysctl_tcp_probe_interval;
@@ -135,17 +162,14 @@ struct netns_ipv4 {
u8 sysctl_tcp_backlog_ack_defer;
u8 sysctl_tcp_pingpong_thresh;
- int sysctl_tcp_reordering;
u8 sysctl_tcp_retries1;
u8 sysctl_tcp_retries2;
u8 sysctl_tcp_orphan_retries;
u8 sysctl_tcp_tw_reuse;
int sysctl_tcp_fin_timeout;
- unsigned int sysctl_tcp_notsent_lowat;
u8 sysctl_tcp_sack;
u8 sysctl_tcp_window_scaling;
u8 sysctl_tcp_timestamps;
- u8 sysctl_tcp_early_retrans;
u8 sysctl_tcp_recovery;
u8 sysctl_tcp_thin_linear_timeouts;
u8 sysctl_tcp_slow_start_after_idle;
@@ -161,21 +185,13 @@ struct netns_ipv4 {
u8 sysctl_tcp_frto;
u8 sysctl_tcp_nometrics_save;
u8 sysctl_tcp_no_ssthresh_metrics_save;
- u8 sysctl_tcp_moderate_rcvbuf;
- u8 sysctl_tcp_tso_win_divisor;
u8 sysctl_tcp_workaround_signed_windows;
- int sysctl_tcp_limit_output_bytes;
int sysctl_tcp_challenge_ack_limit;
- int sysctl_tcp_min_rtt_wlen;
u8 sysctl_tcp_min_tso_segs;
- u8 sysctl_tcp_tso_rtt_log;
- u8 sysctl_tcp_autocorking;
u8 sysctl_tcp_reflect_tos;
int sysctl_tcp_invalid_ratelimit;
int sysctl_tcp_pacing_ss_ratio;
int sysctl_tcp_pacing_ca_ratio;
- int sysctl_tcp_wmem[3];
- int sysctl_tcp_rmem[3];
unsigned int sysctl_tcp_child_ehash_entries;
unsigned long sysctl_tcp_comp_sack_delay_ns;
unsigned long sysctl_tcp_comp_sack_slack_ns;
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 7dc65774cde5..1d397c1a0043 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -11,7 +11,7 @@
* The page_pool allocator is optimized for recycling page or page fragment used
* by skb packet and xdp frame.
*
- * Basic use involves replacing and alloc_pages() calls with page_pool_alloc(),
+ * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(),
* which allocate memory with or without page splitting depending on the
* requested memory size.
*
@@ -29,7 +29,7 @@
* page allocated from page pool. Page splitting enables memory saving and thus
* avoids TLB/cache miss for data access, but there also is some cost to
* implement page splitting, mainly some cache line dirtying/bouncing for
- * 'struct page' and atomic operation for page->pp_frag_count.
+ * 'struct page' and atomic operation for page->pp_ref_count.
*
* The API keeps track of in-flight pages, in order to let API users know when
* it is safe to free a page_pool object, the API users must call
@@ -37,15 +37,15 @@
* attach the page_pool object to a page_pool-aware object like skbs marked with
* skb_mark_for_recycle().
*
- * page_pool_put_page() may be called multi times on the same page if a page is
- * split into multi fragments. For the last fragment, it will either recycle the
- * page, or in case of page->_refcount > 1, it will release the DMA mapping and
- * in-flight state accounting.
+ * page_pool_put_page() may be called multiple times on the same page if a page
+ * is split into multiple fragments. For the last fragment, it will either
+ * recycle the page, or in case of page->_refcount > 1, it will release the DMA
+ * mapping and in-flight state accounting.
*
* dma_sync_single_range_for_device() is only called for the last fragment when
* page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the
* last freed fragment to do the sync_for_device operation for all fragments in
- * the same page when a page is split, the API user must setup pool->p.max_len
+ * the same page when a page is split. The API user must setup pool->p.max_len
* and pool->p.offset correctly and ensure that page_pool_put_page() is called
* with dma_sync_size being -1 for fragment API.
*/
@@ -210,69 +210,82 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
return pool->p.dma_dir;
}
-/* pp_frag_count represents the number of writers who can update the page
- * either by updating skb->data or via DMA mappings for the device.
- * We can't rely on the page refcnt for that as we don't know who might be
- * holding page references and we can't reliably destroy or sync DMA mappings
- * of the fragments.
+/**
+ * page_pool_fragment_page() - split a fresh page into fragments
+ * @page: page to split
+ * @nr: references to set
+ *
+ * pp_ref_count represents the number of outstanding references to the page,
+ * which will be freed using page_pool APIs (rather than page allocator APIs
+ * like put_page()). Such references are usually held by page_pool-aware
+ * objects like skbs marked for page pool recycling.
*
- * When pp_frag_count reaches 0 we can either recycle the page if the page
- * refcnt is 1 or return it back to the memory allocator and destroy any
- * mappings we have.
+ * This helper allows the caller to take (set) multiple references to a
+ * freshly allocated page. The page must be freshly allocated (have a
+ * pp_ref_count of 1). This is commonly done by drivers and
+ * "fragment allocators" to save atomic operations - either when they know
+ * upfront how many references they will need; or to take MAX references and
+ * return the unused ones with a single atomic dec(), instead of performing
+ * multiple atomic inc() operations.
*/
static inline void page_pool_fragment_page(struct page *page, long nr)
{
- atomic_long_set(&page->pp_frag_count, nr);
+ atomic_long_set(&page->pp_ref_count, nr);
}
-static inline long page_pool_defrag_page(struct page *page, long nr)
+static inline long page_pool_unref_page(struct page *page, long nr)
{
long ret;
- /* If nr == pp_frag_count then we have cleared all remaining
+ /* If nr == pp_ref_count then we have cleared all remaining
* references to the page:
* 1. 'n == 1': no need to actually overwrite it.
* 2. 'n != 1': overwrite it with one, which is the rare case
- * for pp_frag_count draining.
+ * for pp_ref_count draining.
*
* The main advantage to doing this is that not only we avoid a atomic
* update, as an atomic_read is generally a much cheaper operation than
* an atomic update, especially when dealing with a page that may be
- * partitioned into only 2 or 3 pieces; but also unify the pp_frag_count
+ * referenced by only 2 or 3 users; but also unify the pp_ref_count
* handling by ensuring all pages have partitioned into only 1 piece
* initially, and only overwrite it when the page is partitioned into
* more than one piece.
*/
- if (atomic_long_read(&page->pp_frag_count) == nr) {
+ if (atomic_long_read(&page->pp_ref_count) == nr) {
/* As we have ensured nr is always one for constant case using
* the BUILD_BUG_ON(), only need to handle the non-constant case
- * here for pp_frag_count draining, which is a rare case.
+ * here for pp_ref_count draining, which is a rare case.
*/
BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1);
if (!__builtin_constant_p(nr))
- atomic_long_set(&page->pp_frag_count, 1);
+ atomic_long_set(&page->pp_ref_count, 1);
return 0;
}
- ret = atomic_long_sub_return(nr, &page->pp_frag_count);
+ ret = atomic_long_sub_return(nr, &page->pp_ref_count);
WARN_ON(ret < 0);
- /* We are the last user here too, reset pp_frag_count back to 1 to
+ /* We are the last user here too, reset pp_ref_count back to 1 to
* ensure all pages have been partitioned into 1 piece initially,
* this should be the rare case when the last two fragment users call
- * page_pool_defrag_page() currently.
+ * page_pool_unref_page() currently.
*/
if (unlikely(!ret))
- atomic_long_set(&page->pp_frag_count, 1);
+ atomic_long_set(&page->pp_ref_count, 1);
return ret;
}
-static inline bool page_pool_is_last_frag(struct page *page)
+static inline void page_pool_ref_page(struct page *page)
+{
+ atomic_long_inc(&page->pp_ref_count);
+}
+
+static inline bool page_pool_is_last_ref(struct page *page)
{
- /* If page_pool_defrag_page() returns 0, we were the last user */
- return page_pool_defrag_page(page, 1) == 0;
+ /* If page_pool_unref_page() returns 0, we were the last user */
+ return page_pool_unref_page(page, 1) == 0;
}
/**
@@ -297,10 +310,10 @@ static inline void page_pool_put_page(struct page_pool *pool,
* allow registering MEM_TYPE_PAGE_POOL, but shield linker.
*/
#ifdef CONFIG_PAGE_POOL
- if (!page_pool_is_last_frag(page))
+ if (!page_pool_is_last_ref(page))
return;
- page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct);
+ page_pool_put_unrefed_page(pool, page, dma_sync_size, allow_direct);
#endif
}
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index ac286ea8ce2d..76481c465375 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -234,9 +234,9 @@ static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data,
}
#endif
-void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
- unsigned int dma_sync_size,
- bool allow_direct);
+void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size,
+ bool allow_direct);
static inline bool is_page_pool_compiled_in(void)
{
diff --git a/include/net/sock.h b/include/net/sock.h
index 1d6931caf0c3..8b6fe164b218 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2920,7 +2920,6 @@ extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;
extern int sysctl_tstamp_allow_data;
-extern int sysctl_optmem_max;
extern __u32 sysctl_wmem_default;
extern __u32 sysctl_rmem_default;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 973555cb1d3f..f5ca4abaee8b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1518,17 +1518,22 @@ static inline int tcp_full_space(const struct sock *sk)
return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
}
-static inline void tcp_adjust_rcv_ssthresh(struct sock *sk)
+static inline void __tcp_adjust_rcv_ssthresh(struct sock *sk, u32 new_ssthresh)
{
int unused_mem = sk_unused_reserved_mem(sk);
struct tcp_sock *tp = tcp_sk(sk);
- tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh, new_ssthresh);
if (unused_mem)
tp->rcv_ssthresh = max_t(u32, tp->rcv_ssthresh,
tcp_win_from_space(sk, unused_mem));
}
+static inline void tcp_adjust_rcv_ssthresh(struct sock *sk)
+{
+ __tcp_adjust_rcv_ssthresh(sk, 4U * tcp_sk(sk)->advmss);
+}
+
void tcp_cleanup_rbuf(struct sock *sk, int copied);
void __tcp_cleanup_rbuf(struct sock *sk, int copied);
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index 4d900ef8dfc3..0f2dcc9e8d46 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -62,11 +62,17 @@ static inline int tcp_ao_maclen(const struct tcp_ao_key *key)
return key->maclen;
}
+/* Use tcp_ao_len_aligned() for TCP header calculations */
static inline int tcp_ao_len(const struct tcp_ao_key *key)
{
return tcp_ao_maclen(key) + sizeof(struct tcp_ao_hdr);
}
+static inline int tcp_ao_len_aligned(const struct tcp_ao_key *key)
+{
+ return round_up(tcp_ao_len(key), 4);
+}
+
static inline unsigned int tcp_ao_digest_size(struct tcp_ao_key *key)
{
return key->digest_size;
diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h
index cc00118acca1..d60e8148ff4c 100644
--- a/include/net/tcp_states.h
+++ b/include/net/tcp_states.h
@@ -22,6 +22,7 @@ enum {
TCP_LISTEN,
TCP_CLOSING, /* Now a valid state */
TCP_NEW_SYN_RECV,
+ TCP_BOUND_INACTIVE, /* Pseudo-state for inet_diag */
TCP_MAX_STATES /* Leave at the end! */
};
@@ -43,6 +44,7 @@ enum {
TCPF_LISTEN = (1 << TCP_LISTEN),
TCPF_CLOSING = (1 << TCP_CLOSING),
TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV),
+ TCPF_BOUND_INACTIVE = (1 << TCP_BOUND_INACTIVE),
};
#endif /* _LINUX_TCP_STATES_H */
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index f83128007fb0..3cb4dc9bd70e 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -30,6 +30,7 @@ struct xdp_umem {
struct user_struct *user;
refcount_t users;
u8 flags;
+ u8 tx_metadata_len;
bool zc;
struct page **pgs;
int id;
@@ -92,12 +93,105 @@ struct xdp_sock {
struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */
};
+/*
+ * AF_XDP TX metadata hooks for network devices.
+ * The following hooks can be defined; unless noted otherwise, they are
+ * optional and can be filled with a null pointer.
+ *
+ * void (*tmo_request_timestamp)(void *priv)
+ * Called when AF_XDP frame requested egress timestamp.
+ *
+ * u64 (*tmo_fill_timestamp)(void *priv)
+ * Called when AF_XDP frame, that had requested egress timestamp,
+ * received a completion. The hook needs to return the actual HW timestamp.
+ *
+ * void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv)
+ * Called when AF_XDP frame requested HW checksum offload. csum_start
+ * indicates position where checksumming should start.
+ * csum_offset indicates position where checksum should be stored.
+ *
+ */
+struct xsk_tx_metadata_ops {
+ void (*tmo_request_timestamp)(void *priv);
+ u64 (*tmo_fill_timestamp)(void *priv);
+ void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv);
+};
+
#ifdef CONFIG_XDP_SOCKETS
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
void __xsk_map_flush(void);
+/**
+ * xsk_tx_metadata_to_compl - Save enough relevant metadata information
+ * to perform tx completion in the future.
+ * @meta: pointer to AF_XDP metadata area
+ * @compl: pointer to output struct xsk_tx_metadata_to_compl
+ *
+ * This function should be called by the networking device when
+ * it prepares AF_XDP egress packet. The value of @compl should be stored
+ * and passed to xsk_tx_metadata_complete upon TX completion.
+ */
+static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta,
+ struct xsk_tx_metadata_compl *compl)
+{
+ if (!meta)
+ return;
+
+ if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP)
+ compl->tx_timestamp = &meta->completion.tx_timestamp;
+ else
+ compl->tx_timestamp = NULL;
+}
+
+/**
+ * xsk_tx_metadata_request - Evaluate AF_XDP TX metadata at submission
+ * and call appropriate xsk_tx_metadata_ops operation.
+ * @meta: pointer to AF_XDP metadata area
+ * @ops: pointer to struct xsk_tx_metadata_ops
+ * @priv: pointer to driver-private aread
+ *
+ * This function should be called by the networking device when
+ * it prepares AF_XDP egress packet.
+ */
+static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta,
+ const struct xsk_tx_metadata_ops *ops,
+ void *priv)
+{
+ if (!meta)
+ return;
+
+ if (ops->tmo_request_timestamp)
+ if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP)
+ ops->tmo_request_timestamp(priv);
+
+ if (ops->tmo_request_checksum)
+ if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM)
+ ops->tmo_request_checksum(meta->request.csum_start,
+ meta->request.csum_offset, priv);
+}
+
+/**
+ * xsk_tx_metadata_complete - Evaluate AF_XDP TX metadata at completion
+ * and call appropriate xsk_tx_metadata_ops operation.
+ * @compl: pointer to completion metadata produced from xsk_tx_metadata_to_compl
+ * @ops: pointer to struct xsk_tx_metadata_ops
+ * @priv: pointer to driver-private aread
+ *
+ * This function should be called by the networking device upon
+ * AF_XDP egress completion.
+ */
+static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl,
+ const struct xsk_tx_metadata_ops *ops,
+ void *priv)
+{
+ if (!compl)
+ return;
+
+ *compl->tx_timestamp = ops->tmo_fill_timestamp(priv);
+}
+
#else
static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
@@ -114,6 +208,23 @@ static inline void __xsk_map_flush(void)
{
}
+static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta,
+ struct xsk_tx_metadata_compl *compl)
+{
+}
+
+static inline void xsk_tx_metadata_request(struct xsk_tx_metadata *meta,
+ const struct xsk_tx_metadata_ops *ops,
+ void *priv)
+{
+}
+
+static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl,
+ const struct xsk_tx_metadata_ops *ops,
+ void *priv)
+{
+}
+
#endif /* CONFIG_XDP_SOCKETS */
#if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET)
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 1f6fc8c7a84c..81e02de3f453 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -165,6 +165,30 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
return xp_raw_get_data(pool, addr);
}
+#define XDP_TXMD_FLAGS_VALID ( \
+ XDP_TXMD_FLAGS_TIMESTAMP | \
+ XDP_TXMD_FLAGS_CHECKSUM | \
+ 0)
+
+static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta)
+{
+ return !(meta->flags & ~XDP_TXMD_FLAGS_VALID);
+}
+
+static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
+{
+ struct xsk_tx_metadata *meta;
+
+ if (!pool->tx_metadata_len)
+ return NULL;
+
+ meta = xp_raw_get_data(pool, addr) - pool->tx_metadata_len;
+ if (unlikely(!xsk_buff_valid_tx_metadata(meta)))
+ return NULL; /* no way to signal the error to the user */
+
+ return meta;
+}
+
static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
{
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
@@ -324,6 +348,16 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
return NULL;
}
+static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta)
+{
+ return false;
+}
+
+static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
+{
+ return NULL;
+}
+
static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
{
}
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index b0bdff26fc88..8d48d37ab7c0 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -33,6 +33,7 @@ struct xdp_buff_xsk {
};
#define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb))
+#define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t))
struct xsk_dma_map {
dma_addr_t *dma_pages;
@@ -77,10 +78,12 @@ struct xsk_buff_pool {
u32 chunk_size;
u32 chunk_shift;
u32 frame_len;
+ u8 tx_metadata_len; /* inherited from umem */
u8 cached_need_wakeup;
bool uses_need_wakeup;
bool dma_need_sync;
bool unaligned;
+ bool tx_sw_csum;
void *addrs;
/* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect:
* NAPI TX thread and sendmsg error paths in the SKB destructor callback and when
@@ -233,4 +236,9 @@ static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb)
return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
}
+static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool)
+{
+ return pool->tx_metadata_len > 0;
+}
+
#endif /* XSK_BUFF_POOL_H_ */