summaryrefslogtreecommitdiff
path: root/include/net/sock.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/net/sock.h')
-rw-r--r--include/net/sock.h159
1 files changed, 113 insertions, 46 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index 8741988e6880..c4f5e6fca17c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -252,6 +252,7 @@ struct sock_common {
* @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
* @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
* @sk_sndbuf: size of send buffer in bytes
+ * @sk_padding: unused element for alignment
* @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
* @sk_no_check_rx: allow zero checksum in RX packets
* @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
@@ -302,7 +303,8 @@ struct sock_common {
* @sk_backlog_rcv: callback to process the backlog
* @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
* @sk_reuseport_cb: reuseport group container
- */
+ * @sk_rcu: used during RCU grace period
+ */
struct sock {
/*
* Now struct inet_timewait_sock also uses sock_common, so please just
@@ -341,6 +343,9 @@ struct sock {
#define sk_rxhash __sk_common.skc_rxhash
socket_lock_t sk_lock;
+ atomic_t sk_drops;
+ int sk_rcvlowat;
+ struct sk_buff_head sk_error_queue;
struct sk_buff_head sk_receive_queue;
/*
* The backlog queue is special, it is always used with
@@ -357,14 +362,13 @@ struct sock {
struct sk_buff *tail;
} sk_backlog;
#define sk_rmem_alloc sk_backlog.rmem_alloc
- int sk_forward_alloc;
- __u32 sk_txhash;
+ int sk_forward_alloc;
#ifdef CONFIG_NET_RX_BUSY_POLL
- unsigned int sk_napi_id;
unsigned int sk_ll_usec;
+ /* ===== mostly read cache line ===== */
+ unsigned int sk_napi_id;
#endif
- atomic_t sk_drops;
int sk_rcvbuf;
struct sk_filter __rcu *sk_filter;
@@ -377,16 +381,50 @@ struct sock {
#endif
struct dst_entry *sk_rx_dst;
struct dst_entry __rcu *sk_dst_cache;
- /* Note: 32bit hole on 64bit arches */
- atomic_t sk_wmem_alloc;
atomic_t sk_omem_alloc;
int sk_sndbuf;
+
+ /* ===== cache line for TX ===== */
+ int sk_wmem_queued;
+ atomic_t sk_wmem_alloc;
+ unsigned long sk_tsq_flags;
+ struct sk_buff *sk_send_head;
struct sk_buff_head sk_write_queue;
+ __s32 sk_peek_off;
+ int sk_write_pending;
+ long sk_sndtimeo;
+ struct timer_list sk_timer;
+ __u32 sk_priority;
+ __u32 sk_mark;
+ u32 sk_pacing_rate; /* bytes per second */
+ u32 sk_max_pacing_rate;
+ struct page_frag sk_frag;
+ netdev_features_t sk_route_caps;
+ netdev_features_t sk_route_nocaps;
+ int sk_gso_type;
+ unsigned int sk_gso_max_size;
+ gfp_t sk_allocation;
+ __u32 sk_txhash;
/*
* Because of non atomicity rules, all
* changes are protected by socket lock.
*/
+ unsigned int __sk_flags_offset[0];
+#ifdef __BIG_ENDIAN_BITFIELD
+#define SK_FL_PROTO_SHIFT 16
+#define SK_FL_PROTO_MASK 0x00ff0000
+
+#define SK_FL_TYPE_SHIFT 0
+#define SK_FL_TYPE_MASK 0x0000ffff
+#else
+#define SK_FL_PROTO_SHIFT 8
+#define SK_FL_PROTO_MASK 0x0000ff00
+
+#define SK_FL_TYPE_SHIFT 16
+#define SK_FL_TYPE_MASK 0xffff0000
+#endif
+
kmemcheck_bitfield_begin(flags);
unsigned int sk_padding : 2,
sk_no_check_tx : 1,
@@ -397,41 +435,24 @@ struct sock {
#define SK_PROTOCOL_MAX U8_MAX
kmemcheck_bitfield_end(flags);
- int sk_wmem_queued;
- gfp_t sk_allocation;
- u32 sk_pacing_rate; /* bytes per second */
- u32 sk_max_pacing_rate;
- netdev_features_t sk_route_caps;
- netdev_features_t sk_route_nocaps;
- int sk_gso_type;
- unsigned int sk_gso_max_size;
u16 sk_gso_max_segs;
- int sk_rcvlowat;
unsigned long sk_lingertime;
- struct sk_buff_head sk_error_queue;
struct proto *sk_prot_creator;
rwlock_t sk_callback_lock;
int sk_err,
sk_err_soft;
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
- __u32 sk_priority;
- __u32 sk_mark;
+ kuid_t sk_uid;
struct pid *sk_peer_pid;
const struct cred *sk_peer_cred;
long sk_rcvtimeo;
- long sk_sndtimeo;
- struct timer_list sk_timer;
ktime_t sk_stamp;
u16 sk_tsflags;
u8 sk_shutdown;
u32 sk_tskey;
struct socket *sk_socket;
void *sk_user_data;
- struct page_frag sk_frag;
- struct sk_buff *sk_send_head;
- __s32 sk_peek_off;
- int sk_write_pending;
#ifdef CONFIG_SECURITY
void *sk_security;
#endif
@@ -892,7 +913,20 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
static inline void sock_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
- sock_rps_record_flow_hash(sk->sk_rxhash);
+ if (static_key_false(&rfs_needed)) {
+ /* Reading sk->sk_rxhash might incur an expensive cache line
+ * miss.
+ *
+ * TCP_ESTABLISHED does cover almost all states where RFS
+ * might be useful, and is cheaper [1] than testing :
+ * IPv4: inet_sk(sk)->inet_daddr
+ * IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
+ * OR an additional socket flag
+ * [1] : sk_state and sk_prot are in the same cache line.
+ */
+ if (sk->sk_state == TCP_ESTABLISHED)
+ sock_rps_record_flow_hash(sk->sk_rxhash);
+ }
#endif
}
@@ -912,14 +946,16 @@ static inline void sock_rps_reset_rxhash(struct sock *sk)
#endif
}
-#define sk_wait_event(__sk, __timeo, __condition) \
+#define sk_wait_event(__sk, __timeo, __condition, __wait) \
({ int __rc; \
release_sock(__sk); \
__rc = __condition; \
if (!__rc) { \
- *(__timeo) = schedule_timeout(*(__timeo)); \
+ *(__timeo) = wait_woken(__wait, \
+ TASK_INTERRUPTIBLE, \
+ *(__timeo)); \
} \
- sched_annotate_sleep(); \
+ sched_annotate_sleep(); \
lock_sock(__sk); \
__rc = __condition; \
__rc; \
@@ -1020,7 +1056,6 @@ struct proto {
void (*unhash)(struct sock *sk);
void (*rehash)(struct sock *sk);
int (*get_port)(struct sock *sk, unsigned short snum);
- void (*clear_sk)(struct sock *sk, int size);
/* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
@@ -1114,6 +1149,16 @@ static inline bool sk_stream_is_writeable(const struct sock *sk)
sk_stream_memory_free(sk);
}
+static inline int sk_under_cgroup_hierarchy(struct sock *sk,
+ struct cgroup *ancestor)
+{
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data),
+ ancestor);
+#else
+ return -ENOTSUPP;
+#endif
+}
static inline bool sk_has_memory_pressure(const struct sock *sk)
{
@@ -1151,11 +1196,6 @@ static inline void sk_enter_memory_pressure(struct sock *sk)
sk->sk_prot->enter_memory_pressure(sk);
}
-static inline long sk_prot_mem_limits(const struct sock *sk, int index)
-{
- return sk->sk_prot->sysctl_mem[index];
-}
-
static inline long
sk_memory_allocated(const struct sock *sk)
{
@@ -1232,8 +1272,6 @@ static inline int __sk_prot_rehash(struct sock *sk)
return sk->sk_prot->hash(sk);
}
-void sk_prot_clear_portaddr_nulls(struct sock *sk, int size);
-
/* About 10 seconds */
#define SOCK_DESTROY_TIME (10*HZ)
@@ -1267,14 +1305,32 @@ static inline struct inode *SOCK_INODE(struct socket *socket)
/*
* Functions for memory accounting
*/
+int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind);
int __sk_mem_schedule(struct sock *sk, int size, int kind);
+void __sk_mem_reduce_allocated(struct sock *sk, int amount);
void __sk_mem_reclaim(struct sock *sk, int amount);
-#define SK_MEM_QUANTUM ((int)PAGE_SIZE)
+/* We used to have PAGE_SIZE here, but systems with 64KB pages
+ * do not necessarily have 16x time more memory than 4KB ones.
+ */
+#define SK_MEM_QUANTUM 4096
#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
#define SK_MEM_SEND 0
#define SK_MEM_RECV 1
+/* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */
+static inline long sk_prot_mem_limits(const struct sock *sk, int index)
+{
+ long val = sk->sk_prot->sysctl_mem[index];
+
+#if PAGE_SIZE > SK_MEM_QUANTUM
+ val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
+#elif PAGE_SIZE < SK_MEM_QUANTUM
+ val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT;
+#endif
+ return val;
+}
+
static inline int sk_mem_pages(int amt)
{
return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
@@ -1587,11 +1643,11 @@ static inline void sock_put(struct sock *sk)
void sock_gen_put(struct sock *sk);
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested,
- unsigned int trim_cap);
+ unsigned int trim_cap, bool refcounted);
static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb,
const int nested)
{
- return __sk_receive_skb(sk, skb, nested, 1);
+ return __sk_receive_skb(sk, skb, nested, 1, true);
}
static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
@@ -1642,6 +1698,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
sk->sk_wq = parent->wq;
parent->sk = sk;
sk_set_socket(sk, parent);
+ sk->sk_uid = SOCK_INODE(parent)->i_uid;
security_sock_graft(sk, parent);
write_unlock_bh(&sk->sk_callback_lock);
}
@@ -1649,6 +1706,11 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
kuid_t sock_i_uid(struct sock *sk);
unsigned long sock_i_ino(struct sock *sk);
+static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
+{
+ return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
+}
+
static inline u32 net_tx_rndhash(void)
{
u32 v = prandom_u32();
@@ -1774,13 +1836,13 @@ static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
{
if (skb->ip_summed == CHECKSUM_NONE) {
__wsum csum = 0;
- if (csum_and_copy_from_iter(to, copy, &csum, from) != copy)
+ if (!csum_and_copy_from_iter_full(to, copy, &csum, from))
return -EFAULT;
skb->csum = csum_block_add(skb->csum, csum, offset);
} else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
- if (copy_from_iter_nocache(to, copy, from) != copy)
+ if (!copy_from_iter_full_nocache(to, copy, from))
return -EFAULT;
- } else if (copy_from_iter(to, copy, from) != copy)
+ } else if (!copy_from_iter_full(to, copy, from))
return -EFAULT;
return 0;
@@ -1943,6 +2005,10 @@ void sk_reset_timer(struct sock *sk, struct timer_list *timer,
void sk_stop_timer(struct sock *sk, struct timer_list *timer);
+int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
+ unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb));
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
@@ -2099,7 +2165,8 @@ struct sock_skb_cb {
static inline void
sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
{
- SOCK_SKB_CB(skb)->dropcount = atomic_read(&sk->sk_drops);
+ SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
+ atomic_read(&sk->sk_drops) : 0;
}
static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
@@ -2128,8 +2195,8 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
*/
if (sock_flag(sk, SOCK_RCVTSTAMP) ||
(sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
- (kt.tv64 && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
- (hwtstamps->hwtstamp.tv64 &&
+ (kt && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
+ (hwtstamps->hwtstamp &&
(sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
__sock_recv_timestamp(msg, sk, skb);
else