diff options
author | Stefan Schmidt <stefan@datenfreihafen.org> | 2021-02-24 15:36:58 +0300 |
---|---|---|
committer | Stefan Schmidt <stefan@datenfreihafen.org> | 2021-02-24 15:36:58 +0300 |
commit | cdd38c5f1ce4398ec58fec95904b75824daab7b5 (patch) | |
tree | 639cf51fe8ee120a13e61b13d448aeaf4d044c74 /net | |
parent | 04052a318fb93491f1f3b4d282cb806f588e9326 (diff) | |
parent | fcb3007371e1a4afb03280af1b336a83287fe115 (diff) | |
download | linux-cdd38c5f1ce4398ec58fec95904b75824daab7b5.tar.xz |
Merge remote-tracking branch 'net/master'
Diffstat (limited to 'net')
724 files changed, 35741 insertions, 20421 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index f292e0267bb9..8b644113715e 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -284,8 +284,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) return 0; out_free_newdev: - if (new_dev->reg_state == NETREG_UNINITIALIZED) - free_netdev(new_dev); + free_netdev(new_dev); return err; } diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ec8408d1638f..dc1a197792e6 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -510,9 +510,17 @@ static void vlan_dev_set_lockdep_class(struct net_device *dev) netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, NULL); } +static __be16 vlan_parse_protocol(const struct sk_buff *skb) +{ + struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); + + return __vlan_get_protocol(skb, veth->h_vlan_proto, NULL); +} + static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .parse = eth_header_parse, + .parse_protocol = vlan_parse_protocol, }; static int vlan_passthru_hard_header(struct sk_buff *skb, struct net_device *dev, @@ -532,6 +540,7 @@ static int vlan_passthru_hard_header(struct sk_buff *skb, struct net_device *dev static const struct header_ops vlan_passthru_header_ops = { .create = vlan_passthru_hard_header, .parse = eth_header_parse, + .parse_protocol = vlan_parse_protocol, }; static struct device_type vlan_type = { diff --git a/net/9p/Kconfig b/net/9p/Kconfig index 3d11fec3a8dc..64468c49791f 100644 --- a/net/9p/Kconfig +++ b/net/9p/Kconfig @@ -4,7 +4,6 @@ # menuconfig NET_9P - depends on NET tristate "Plan 9 Resource Sharing Support (9P2000)" help If you say Y here, you will get experimental support for diff --git a/net/9p/client.c b/net/9p/client.c index 09f1ec589b80..4f62f299da0c 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -412,8 +412,9 @@ static void p9_tag_cleanup(struct p9_client *c) /** * p9_client_cb - call back from transport to client - * c: client state - * req: request 
received + * @c: client state + * @req: request received + * @status: request status, one of REQ_STATUS_* * */ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) @@ -555,6 +556,7 @@ out_err: * p9_check_zc_errors - check 9p packet for error return and process it * @c: current client instance * @req: request to parse and check for error conditions + * @uidata: external buffer containing error * @in_hdrlen: Size of response protocol buffer. * * returns error code if one is discovered, otherwise returns 0 @@ -901,6 +903,7 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) fid->clnt = clnt; fid->rdir = NULL; fid->fid = 0; + refcount_set(&fid->count, 1); idr_preload(GFP_KERNEL); spin_lock_irq(&clnt->lock); @@ -908,7 +911,6 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) GFP_NOWAIT); spin_unlock_irq(&clnt->lock); idr_preload_end(); - if (!ret) return fid; @@ -1187,7 +1189,6 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, p9_debug(P9_DEBUG_9P, ">>> TWALK fids %d,%d nwname %ud wname[0] %s\n", oldfid->fid, fid->fid, nwname, wnames ? wnames[0] : NULL); - req = p9_client_rpc(clnt, P9_TWALK, "ddT", oldfid->fid, fid->fid, nwname, wnames); if (IS_ERR(req)) { @@ -1219,7 +1220,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, if (nwname) memmove(&fid->qid, &wqids[nwqids - 1], sizeof(struct p9_qid)); else - fid->qid = oldfid->qid; + memmove(&fid->qid, &oldfid->qid, sizeof(struct p9_qid)); kfree(wqids); return fid; @@ -1272,6 +1273,7 @@ int p9_client_open(struct p9_fid *fid, int mode) p9_is_proto_dotl(clnt) ? 
"RLOPEN" : "ROPEN", qid.type, (unsigned long long)qid.path, qid.version, iounit); + memmove(&fid->qid, &qid, sizeof(struct p9_qid)); fid->mode = mode; fid->iounit = iounit; @@ -1317,6 +1319,7 @@ int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32 (unsigned long long)qid->path, qid->version, iounit); + memmove(&ofid->qid, qid, sizeof(struct p9_qid)); ofid->mode = mode; ofid->iounit = iounit; @@ -1362,6 +1365,7 @@ int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode, (unsigned long long)qid.path, qid.version, iounit); + memmove(&fid->qid, &qid, sizeof(struct p9_qid)); fid->mode = mode; fid->iounit = iounit; @@ -1458,12 +1462,14 @@ int p9_client_clunk(struct p9_fid *fid) struct p9_req_t *req; int retries = 0; - if (!fid) { - pr_warn("%s (%d): Trying to clunk with NULL fid\n", + if (!fid || IS_ERR(fid)) { + pr_warn("%s (%d): Trying to clunk with invalid fid\n", __func__, task_pid_nr(current)); dump_stack(); return 0; } + if (!refcount_dec_and_test(&fid->count)) + return 0; again: p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d (try %d)\n", fid->fid, diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c index 3dff68f05fb9..6ea5ea548cd4 100644 --- a/net/9p/trans_common.c +++ b/net/9p/trans_common.c @@ -17,7 +17,9 @@ #include "trans_common.h" /** - * p9_release_pages - Release pages after the transaction. + * p9_release_pages - Release pages after the transaction. 
+ * @pages: array of pages to be put + * @nr_pages: size of array */ void p9_release_pages(struct page **pages, int nr_pages) { diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 8f528e783a6c..fa158397bb63 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -45,7 +45,7 @@ static struct p9_trans_module p9_fd_trans; * @rfd: file descriptor for reading (trans=fd) * @wfd: file descriptor for writing (trans=fd) * @port: port to connect to (trans=tcp) - * + * @privport: port is privileged */ struct p9_fd_opts { @@ -95,6 +95,8 @@ struct p9_poll_wait { * @err: error state * @req_list: accounting for requests which have been sent * @unsent_req_list: accounting for requests that haven't been sent + * @rreq: read request + * @wreq: write request * @req: current request being processed (if any) * @tmp_buf: temporary buffer to read in header * @rc: temporary fcall for reading current frame diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 2885ff9c76f0..af0a8a6cd3fd 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -99,6 +99,7 @@ struct p9_rdma_req; /** * struct p9_rdma_context - Keeps track of in-process WR * + * @cqe: completion queue entry * @busa: Bus address to unmap when the WR completes * @req: Keeps track of requests (send) * @rc: Keepts track of replies (receive) @@ -115,6 +116,7 @@ struct p9_rdma_context { /** * struct p9_rdma_opts - Collection of mount options * @port: port of connection + * @privport: Whether a privileged port may be used * @sq_depth: The requested depth of the SQ. This really doesn't need * to be any deeper than the number of threads used in the client * @rq_depth: The depth of the RQ. 
Should be greater than or equal to SQ depth diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index a3cd90a74012..93f2f8654882 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -50,7 +50,11 @@ static atomic_t vp_pinned = ATOMIC_INIT(0); * @client: client instance * @vdev: virtio dev associated with this channel * @vq: virtio queue associated with this channel + * @ring_bufs_avail: flag to indicate there is some available in the ring buf + * @vc_wq: wait queue for waiting for thing to be added to ring buf + * @p9_max_pages: maximum number of pinned pages * @sg: scatter gather list which is used to pack a request (protected?) + * @chan_list: linked list of channels * * We keep all per-channel information in a structure. * This structure is allocated within the devices dev->mem space. @@ -74,8 +78,8 @@ struct virtio_chan { unsigned long p9_max_pages; /* Scatterlist: can be too big for stack. */ struct scatterlist sg[VIRTQUEUE_NUM]; - /* - * tag name to identify a mount null terminated + /** + * @tag: name to identify a mount null terminated */ char *tag; @@ -204,6 +208,7 @@ static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req) * this takes a list of pages. * @sg: scatter/gather list to pack into * @start: which segment of the sg_list to start at + * @limit: maximum number of pages in sg list. * @pdata: a list of pages to add into sg. 
* @nr_pages: number of pages to pack into the scatter/gather list * @offs: amount of data in the beginning of first page _not_ to pack diff --git a/net/Kconfig b/net/Kconfig index d6567162c1cf..8cea808ad9e8 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -256,9 +256,13 @@ config RFS_ACCEL select CPU_RMAP default y +config SOCK_RX_QUEUE_MAPPING + bool + config XPS bool depends on SMP + select SOCK_RX_QUEUE_MAPPING default y config HWBM @@ -386,8 +390,6 @@ source "net/mac80211/Kconfig" endif # WIRELESS -source "net/wimax/Kconfig" - source "net/rfkill/Kconfig" source "net/9p/Kconfig" source "net/caif/Kconfig" diff --git a/net/Makefile b/net/Makefile index 5744bf1997fd..9ca9572188fe 100644 --- a/net/Makefile +++ b/net/Makefile @@ -6,20 +6,19 @@ # Rewritten to use lists instead of if-statements. # -obj-$(CONFIG_NET) := devres.o socket.o core/ +obj-y := devres.o socket.o core/ -tmp-$(CONFIG_COMPAT) := compat.o -obj-$(CONFIG_NET) += $(tmp-y) +obj-$(CONFIG_COMPAT) += compat.o # LLC has to be linked before the files in net/802/ obj-$(CONFIG_LLC) += llc/ -obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ bpf/ ethtool/ +obj-y += ethernet/ 802/ sched/ netlink/ bpf/ ethtool/ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX_SCM) += unix/ -obj-$(CONFIG_NET) += ipv6/ +obj-y += ipv6/ obj-$(CONFIG_BPFILTER) += bpfilter/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ @@ -56,17 +55,12 @@ obj-$(CONFIG_SMC) += smc/ obj-$(CONFIG_RFKILL) += rfkill/ obj-$(CONFIG_NET_9P) += 9p/ obj-$(CONFIG_CAIF) += caif/ -ifneq ($(CONFIG_DCB),) -obj-y += dcb/ -endif +obj-$(CONFIG_DCB) += dcb/ obj-$(CONFIG_6LOWPAN) += 6lowpan/ obj-$(CONFIG_IEEE802154) += ieee802154/ obj-$(CONFIG_MAC802154) += mac802154/ -ifeq ($(CONFIG_NET),y) obj-$(CONFIG_SYSCTL) += sysctl_net.o -endif -obj-$(CONFIG_WIMAX) += wimax/ obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ obj-$(CONFIG_CEPH_LIB) += ceph/ obj-$(CONFIG_BATMAN_ADV) += 
batman-adv/ @@ -78,12 +72,8 @@ obj-$(CONFIG_VSOCKETS) += vmw_vsock/ obj-$(CONFIG_MPLS) += mpls/ obj-$(CONFIG_NET_NSH) += nsh/ obj-$(CONFIG_HSR) += hsr/ -ifneq ($(CONFIG_NET_SWITCHDEV),) -obj-y += switchdev/ -endif -ifneq ($(CONFIG_NET_L3_MASTER_DEV),) -obj-y += l3mdev/ -endif +obj-$(CONFIG_NET_SWITCHDEV) += switchdev/ +obj-$(CONFIG_NET_L3_MASTER_DEV) += l3mdev/ obj-$(CONFIG_QRTR) += qrtr/ obj-$(CONFIG_NET_NCSI) += ncsi/ obj-$(CONFIG_XDP_SOCKETS) += xdp/ diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 45f584171de7..be18af481d7d 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -44,15 +44,15 @@ int sysctl_aarp_resolve_time = AARP_RESOLVE_TIME; /* Lists of aarp entries */ /** * struct aarp_entry - AARP entry - * @last_sent - Last time we xmitted the aarp request - * @packet_queue - Queue of frames wait for resolution - * @status - Used for proxy AARP - * expires_at - Entry expiry time - * target_addr - DDP Address - * dev - Device to use - * hwaddr - Physical i/f address of target/router - * xmit_count - When this hits 10 we give up - * next - Next entry in chain + * @last_sent: Last time we xmitted the aarp request + * @packet_queue: Queue of frames wait for resolution + * @status: Used for proxy AARP + * @expires_at: Entry expiry time + * @target_addr: DDP Address + * @dev: Device to use + * @hwaddr: Physical i/f address of target/router + * @xmit_count: When this hits 10 we give up + * @next: Next entry in chain */ struct aarp_entry { /* These first two are only used for unresolved entries */ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 1d48708c5a2e..ebda397fa95a 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1407,9 +1407,10 @@ drop: /** * atalk_rcv - Receive a packet (in skb) from device dev - * @skb - packet received - * @dev - network device where the packet comes from - * @pt - packet type + * @skb: packet received + * @dev: network device where the packet comes from + * @pt: packet type + * 
@orig_dev: the original receive net device * * Receive a packet (in skb) from device dev. This has come from the SNAP * decoder, and on entry skb->transport_header is the DDP header, skb->len @@ -1576,8 +1577,8 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) struct sk_buff *skb; struct net_device *dev; struct ddpehdr *ddp; - int size; - struct atalk_route *rt; + int size, hard_header_len; + struct atalk_route *rt, *rt_lo = NULL; int err; if (flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) @@ -1640,7 +1641,22 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) SOCK_DEBUG(sk, "SK %p: Size needed %d, device %s\n", sk, size, dev->name); - size += dev->hard_header_len; + hard_header_len = dev->hard_header_len; + /* Leave room for loopback hardware header if necessary */ + if (usat->sat_addr.s_node == ATADDR_BCAST && + (dev->flags & IFF_LOOPBACK || !(rt->flags & RTF_GATEWAY))) { + struct atalk_addr at_lo; + + at_lo.s_node = 0; + at_lo.s_net = 0; + + rt_lo = atrtr_find(&at_lo); + + if (rt_lo && rt_lo->dev->hard_header_len > hard_header_len) + hard_header_len = rt_lo->dev->hard_header_len; + } + + size += hard_header_len; release_sock(sk); skb = sock_alloc_send_skb(sk, size, (flags & MSG_DONTWAIT), &err); lock_sock(sk); @@ -1648,7 +1664,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) goto out; skb_reserve(skb, ddp_dl->header_length); - skb_reserve(skb, dev->hard_header_len); + skb_reserve(skb, hard_header_len); skb->dev = dev; SOCK_DEBUG(sk, "SK %p: Begin build.\n", sk); @@ -1699,18 +1715,12 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) /* loop back */ skb_orphan(skb); if (ddp->deh_dnode == ATADDR_BCAST) { - struct atalk_addr at_lo; - - at_lo.s_node = 0; - at_lo.s_net = 0; - - rt = atrtr_find(&at_lo); - if (!rt) { + if (!rt_lo) { kfree_skb(skb); err = -ENETUNREACH; goto out; } - dev = rt->dev; + dev = rt_lo->dev; skb->dev = dev; } 
ddp_dl->request(ddp_dl, skb, dev->dev_addr); diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c index 579b66da1d95..3e4f17d335fe 100644 --- a/net/atm/pppoatm.c +++ b/net/atm/pppoatm.c @@ -101,9 +101,11 @@ static inline struct pppoatm_vcc *chan_to_pvcc(const struct ppp_channel *chan) * doesn't want to be called in interrupt context, so we do it from * a tasklet */ -static void pppoatm_wakeup_sender(unsigned long arg) +static void pppoatm_wakeup_sender(struct tasklet_struct *t) { - ppp_output_wakeup((struct ppp_channel *) arg); + struct pppoatm_vcc *pvcc = from_tasklet(pvcc, t, wakeup_tasklet); + + ppp_output_wakeup(&pvcc->chan); } static void pppoatm_release_cb(struct atm_vcc *atmvcc) @@ -389,11 +391,7 @@ static int pppoatm_assign_vcc(struct atm_vcc *atmvcc, void __user *arg) struct atm_backend_ppp be; struct pppoatm_vcc *pvcc; int err; - /* - * Each PPPoATM instance has its own tasklet - this is just a - * prototypical one used to initialize them - */ - static const DECLARE_TASKLET_OLD(tasklet_proto, pppoatm_wakeup_sender); + if (copy_from_user(&be, arg, sizeof be)) return -EFAULT; if (be.encaps != PPPOATM_ENCAPS_AUTODETECT && @@ -415,8 +413,7 @@ static int pppoatm_assign_vcc(struct atm_vcc *atmvcc, void __user *arg) pvcc->chan.ops = &pppoatm_ops; pvcc->chan.mtu = atmvcc->qos.txtp.max_sdu - PPP_HDRLEN - (be.encaps == e_vc ? 
0 : LLC_LEN); - pvcc->wakeup_tasklet = tasklet_proto; - pvcc->wakeup_tasklet.data = (unsigned long) &pvcc->chan; + tasklet_setup(&pvcc->wakeup_tasklet, pppoatm_wakeup_sender); err = ppp_register_channel(&pvcc->chan); if (err != 0) { kfree(pvcc); diff --git a/net/atm/raw.c b/net/atm/raw.c index b3ba44aab0ee..2b5f78a7ec3e 100644 --- a/net/atm/raw.c +++ b/net/atm/raw.c @@ -54,6 +54,8 @@ static int atm_send_aal0(struct atm_vcc *vcc, struct sk_buff *skb) kfree_skb(skb); return -EADDRNOTAVAIL; } + if (vcc->dev->ops->send_bh) + return vcc->dev->ops->send_bh(vcc, skb); return vcc->dev->ops->send(vcc, skb); } @@ -71,7 +73,10 @@ int atm_init_aal34(struct atm_vcc *vcc) vcc->push = atm_push_raw; vcc->pop = atm_pop_raw; vcc->push_oam = NULL; - vcc->send = vcc->dev->ops->send; + if (vcc->dev->ops->send_bh) + vcc->send = vcc->dev->ops->send_bh; + else + vcc->send = vcc->dev->ops->send; return 0; } @@ -80,7 +85,10 @@ int atm_init_aal5(struct atm_vcc *vcc) vcc->push = atm_push_raw; vcc->pop = atm_pop_raw; vcc->push_oam = NULL; - vcc->send = vcc->dev->ops->send; + if (vcc->dev->ops->send_bh) + vcc->send = vcc->dev->ops->send_bh; + else + vcc->send = vcc->dev->ops->send; return 0; } EXPORT_SYMBOL(atm_init_aal5); diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index c762758a4649..860a0786bc1e 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +# Copyright (C) B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich @@ -9,7 +9,6 @@ config BATMAN_ADV tristate "B.A.T.M.A.N. Advanced Meshing Protocol" - depends on NET select LIBCRC32C help B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is @@ -76,37 +75,14 @@ config BATMAN_ADV_MCAST reduce the air overhead while improving the reliability of multicast messages. 
-config BATMAN_ADV_DEBUGFS - bool "batman-adv debugfs entries" - depends on BATMAN_ADV - depends on DEBUG_FS - help - Enable this to export routing related debug tables via debugfs. - The information for each soft-interface and used hard-interface can be - found under batman_adv/ - - If unsure, say N. - config BATMAN_ADV_DEBUG bool "B.A.T.M.A.N. debugging" depends on BATMAN_ADV help This is an option for use by developers; most people should say N here. This enables compilation of support for - outputting debugging information to the debugfs log or tracing - buffer. The output is controlled via the batadv netdev specific - log_level setting. - -config BATMAN_ADV_SYSFS - bool "batman-adv sysfs entries" - depends on BATMAN_ADV - help - Say Y here if you want to enable batman-adv device configuration and - status interface through sysfs attributes. It is replaced by the - batadv generic netlink family but still used by various userspace - tools and scripts. - - If unsure, say Y. + outputting debugging information to the tracing buffer. The output is + controlled via the batadv netdev specific log_level setting. config BATMAN_ADV_TRACING bool "B.A.T.M.A.N. tracing support" diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index daa49af7ff40..3bd0760c76a2 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +# Copyright (C) B.A.T.M.A.N. 
contributors: # # Marek Lindner, Simon Wunderlich @@ -11,14 +11,12 @@ batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_elp.o batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_ogm.o batman-adv-y += bitarray.o batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o -batman-adv-$(CONFIG_BATMAN_ADV_DEBUGFS) += debugfs.o batman-adv-$(CONFIG_BATMAN_ADV_DAT) += distributed-arp-table.o batman-adv-y += fragmentation.o batman-adv-y += gateway_client.o batman-adv-y += gateway_common.o batman-adv-y += hard-interface.o batman-adv-y += hash.o -batman-adv-$(CONFIG_BATMAN_ADV_DEBUGFS) += icmp_socket.o batman-adv-$(CONFIG_BATMAN_ADV_DEBUG) += log.o batman-adv-y += main.o batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o @@ -28,7 +26,6 @@ batman-adv-y += originator.o batman-adv-y += routing.o batman-adv-y += send.o batman-adv-y += soft-interface.o -batman-adv-$(CONFIG_BATMAN_ADV_SYSFS) += sysfs.o batman-adv-$(CONFIG_BATMAN_ADV_TRACING) += trace.o batman-adv-y += tp_meter.o batman-adv-y += translation-table.o diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index 382fbe51fd34..4eee53d19eb0 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich */ @@ -11,7 +11,6 @@ #include <linux/moduleparam.h> #include <linux/netlink.h> #include <linux/printk.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/string.h> @@ -34,7 +33,13 @@ void batadv_algo_init(void) INIT_HLIST_HEAD(&batadv_algo_list); } -static struct batadv_algo_ops *batadv_algo_get(char *name) +/** + * batadv_algo_get() - Search for algorithm with specific name + * @name: algorithm name to find + * + * Return: Pointer to batadv_algo_ops on success, NULL otherwise + */ +struct batadv_algo_ops *batadv_algo_get(const char *name) { struct batadv_algo_ops *bat_algo_ops = NULL, *bat_algo_ops_tmp; @@ -97,7 +102,7 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) * * Return: 0 on success or negative error number in case of failure */ -int batadv_algo_select(struct batadv_priv *bat_priv, char *name) +int batadv_algo_select(struct batadv_priv *bat_priv, const char *name) { struct batadv_algo_ops *bat_algo_ops; @@ -110,29 +115,6 @@ int batadv_algo_select(struct batadv_priv *bat_priv, char *name) return 0; } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - -/** - * batadv_algo_seq_print_text() - Print the supported algorithms in a seq file - * @seq: seq file to print on - * @offset: not used - * - * Return: always 0 - */ -int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) -{ - struct batadv_algo_ops *bat_algo_ops; - - seq_puts(seq, "Available routing algorithms:\n"); - - hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) { - seq_printf(seq, " * %s\n", bat_algo_ops->name); - } - - return 0; -} -#endif - static int batadv_param_set_ra(const char *val, const struct kernel_param *kp) { struct batadv_algo_ops *bat_algo_ops; diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 686a60bc9492..2c486374af58 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 
*/ -/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing */ @@ -10,7 +10,6 @@ #include "main.h" #include <linux/netlink.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/types.h> @@ -18,9 +17,9 @@ extern char batadv_routing_algo[]; extern struct list_head batadv_hardif_list; void batadv_algo_init(void); +struct batadv_algo_ops *batadv_algo_get(const char *name); int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops); -int batadv_algo_select(struct batadv_priv *bat_priv, char *name); -int batadv_algo_seq_print_text(struct seq_file *seq, void *offset); +int batadv_algo_select(struct batadv_priv *bat_priv, const char *name); int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb); #endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 206d0b424712..a5e313cd6f44 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -32,7 +32,6 @@ #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -1780,106 +1779,6 @@ free_skb: return ret; } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS -/** - * batadv_iv_ogm_orig_print_neigh() - print neighbors for the originator table - * @orig_node: the orig_node for which the neighbors are printed - * @if_outgoing: outgoing interface for these entries - * @seq: debugfs table seq_file struct - * - * Must be called while holding an rcu lock. 
- */ -static void -batadv_iv_ogm_orig_print_neigh(struct batadv_orig_node *orig_node, - struct batadv_hard_iface *if_outgoing, - struct seq_file *seq) -{ - struct batadv_neigh_node *neigh_node; - struct batadv_neigh_ifinfo *n_ifinfo; - - hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { - n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); - if (!n_ifinfo) - continue; - - seq_printf(seq, " %pM (%3i)", - neigh_node->addr, - n_ifinfo->bat_iv.tq_avg); - - batadv_neigh_ifinfo_put(n_ifinfo); - } -} - -/** - * batadv_iv_ogm_orig_print() - print the originator table - * @bat_priv: the bat priv with all the soft interface information - * @seq: debugfs table seq_file struct - * @if_outgoing: the outgoing interface for which this should be printed - */ -static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv, - struct seq_file *seq, - struct batadv_hard_iface *if_outgoing) -{ - struct batadv_neigh_node *neigh_node; - struct batadv_hashtable *hash = bat_priv->orig_hash; - int last_seen_msecs, last_seen_secs; - struct batadv_orig_node *orig_node; - struct batadv_neigh_ifinfo *n_ifinfo; - unsigned long last_seen_jiffies; - struct hlist_head *head; - int batman_count = 0; - u32 i; - - seq_puts(seq, - " Originator last-seen (#/255) Nexthop [outgoingIF]: Potential nexthops ...\n"); - - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, head, hash_entry) { - neigh_node = batadv_orig_router_get(orig_node, - if_outgoing); - if (!neigh_node) - continue; - - n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, - if_outgoing); - if (!n_ifinfo) - goto next; - - if (n_ifinfo->bat_iv.tq_avg == 0) - goto next; - - last_seen_jiffies = jiffies - orig_node->last_seen; - last_seen_msecs = jiffies_to_msecs(last_seen_jiffies); - last_seen_secs = last_seen_msecs / 1000; - last_seen_msecs = last_seen_msecs % 1000; - - seq_printf(seq, "%pM %4i.%03is (%3i) %pM [%10s]:", - orig_node->orig, 
last_seen_secs, - last_seen_msecs, n_ifinfo->bat_iv.tq_avg, - neigh_node->addr, - neigh_node->if_incoming->net_dev->name); - - batadv_iv_ogm_orig_print_neigh(orig_node, if_outgoing, - seq); - seq_putc(seq, '\n'); - batman_count++; - -next: - batadv_neigh_node_put(neigh_node); - if (n_ifinfo) - batadv_neigh_ifinfo_put(n_ifinfo); - } - rcu_read_unlock(); - } - - if (batman_count == 0) - seq_puts(seq, "No batman nodes in range ...\n"); -} -#endif - /** * batadv_iv_ogm_neigh_get_tq_avg() - Get the TQ average for a neighbour on a * given outgoing interface. @@ -2109,59 +2008,6 @@ batadv_iv_ogm_orig_dump(struct sk_buff *msg, struct netlink_callback *cb, cb->args[2] = sub; } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS -/** - * batadv_iv_hardif_neigh_print() - print a single hop neighbour node - * @seq: neighbour table seq_file struct - * @hardif_neigh: hardif neighbour information - */ -static void -batadv_iv_hardif_neigh_print(struct seq_file *seq, - struct batadv_hardif_neigh_node *hardif_neigh) -{ - int last_secs, last_msecs; - - last_secs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) / 1000; - last_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) % 1000; - - seq_printf(seq, " %10s %pM %4i.%03is\n", - hardif_neigh->if_incoming->net_dev->name, - hardif_neigh->addr, last_secs, last_msecs); -} - -/** - * batadv_iv_ogm_neigh_print() - print the single hop neighbour list - * @bat_priv: the bat priv with all the soft interface information - * @seq: neighbour table seq_file struct - */ -static void batadv_iv_neigh_print(struct batadv_priv *bat_priv, - struct seq_file *seq) -{ - struct net_device *net_dev = (struct net_device *)seq->private; - struct batadv_hardif_neigh_node *hardif_neigh; - struct batadv_hard_iface *hard_iface; - int batman_count = 0; - - seq_puts(seq, " IF Neighbor last-seen\n"); - - rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->soft_iface != net_dev) - continue; - - 
hlist_for_each_entry_rcu(hardif_neigh, - &hard_iface->neigh_list, list) { - batadv_iv_hardif_neigh_print(seq, hardif_neigh); - batman_count++; - } - } - rcu_read_unlock(); - - if (batman_count == 0) - seq_puts(seq, "No batman nodes in range ...\n"); -} -#endif - /** * batadv_iv_ogm_neigh_diff() - calculate tq difference of two neighbors * @neigh1: the first neighbor object of the comparison @@ -2557,72 +2403,6 @@ out: return ret; } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS -/* fails if orig_node has no router */ -static int batadv_iv_gw_write_buffer_text(struct batadv_priv *bat_priv, - struct seq_file *seq, - const struct batadv_gw_node *gw_node) -{ - struct batadv_gw_node *curr_gw; - struct batadv_neigh_node *router; - struct batadv_neigh_ifinfo *router_ifinfo = NULL; - int ret = -1; - - router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); - if (!router) - goto out; - - router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); - if (!router_ifinfo) - goto out; - - curr_gw = batadv_gw_get_selected_gw_node(bat_priv); - - seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %u.%u/%u.%u MBit\n", - (curr_gw == gw_node ? "=>" : " "), - gw_node->orig_node->orig, - router_ifinfo->bat_iv.tq_avg, router->addr, - router->if_incoming->net_dev->name, - gw_node->bandwidth_down / 10, - gw_node->bandwidth_down % 10, - gw_node->bandwidth_up / 10, - gw_node->bandwidth_up % 10); - ret = seq_has_overflowed(seq) ? 
-1 : 0; - - if (curr_gw) - batadv_gw_node_put(curr_gw); -out: - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); - if (router) - batadv_neigh_node_put(router); - return ret; -} - -static void batadv_iv_gw_print(struct batadv_priv *bat_priv, - struct seq_file *seq) -{ - struct batadv_gw_node *gw_node; - int gw_count = 0; - - seq_puts(seq, - " Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth\n"); - - rcu_read_lock(); - hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) { - /* fails if orig_node has no router */ - if (batadv_iv_gw_write_buffer_text(bat_priv, seq, gw_node) < 0) - continue; - - gw_count++; - } - rcu_read_unlock(); - - if (gw_count == 0) - seq_puts(seq, "No gateways in range ...\n"); -} -#endif - /** * batadv_iv_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into @@ -2747,24 +2527,15 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .neigh = { .cmp = batadv_iv_ogm_neigh_cmp, .is_similar_or_better = batadv_iv_ogm_neigh_is_sob, -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - .print = batadv_iv_neigh_print, -#endif .dump = batadv_iv_ogm_neigh_dump, }, .orig = { -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - .print = batadv_iv_ogm_orig_print, -#endif .dump = batadv_iv_ogm_orig_dump, }, .gw = { .init_sel_class = batadv_iv_init_sel_class, .get_best_gw_node = batadv_iv_gw_get_best_gw_node, .is_eligible = batadv_iv_gw_is_eligible, -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - .print = batadv_iv_gw_print, -#endif .dump = batadv_iv_gw_dump, }, }; diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index 0c57c1000c64..04b01bd684e8 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 0ecaf1bb0068..e1ca2b8c3152 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner */ @@ -13,14 +13,13 @@ #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jiffies.h> -#include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> +#include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/stddef.h> @@ -119,92 +118,6 @@ batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) batadv_v_elp_throughput_metric_update); } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS -/** - * batadv_v_orig_print_neigh() - print neighbors for the originator table - * @orig_node: the orig_node for which the neighbors are printed - * @if_outgoing: outgoing interface for these entries - * @seq: debugfs table seq_file struct - * - * Must be called while holding an rcu lock. 
- */ -static void -batadv_v_orig_print_neigh(struct batadv_orig_node *orig_node, - struct batadv_hard_iface *if_outgoing, - struct seq_file *seq) -{ - struct batadv_neigh_node *neigh_node; - struct batadv_neigh_ifinfo *n_ifinfo; - - hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { - n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); - if (!n_ifinfo) - continue; - - seq_printf(seq, " %pM (%9u.%1u)", - neigh_node->addr, - n_ifinfo->bat_v.throughput / 10, - n_ifinfo->bat_v.throughput % 10); - - batadv_neigh_ifinfo_put(n_ifinfo); - } -} - -/** - * batadv_v_hardif_neigh_print() - print a single ELP neighbour node - * @seq: neighbour table seq_file struct - * @hardif_neigh: hardif neighbour information - */ -static void -batadv_v_hardif_neigh_print(struct seq_file *seq, - struct batadv_hardif_neigh_node *hardif_neigh) -{ - int last_secs, last_msecs; - u32 throughput; - - last_secs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) / 1000; - last_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) % 1000; - throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput); - - seq_printf(seq, "%pM %4i.%03is (%9u.%1u) [%10s]\n", - hardif_neigh->addr, last_secs, last_msecs, throughput / 10, - throughput % 10, hardif_neigh->if_incoming->net_dev->name); -} - -/** - * batadv_v_neigh_print() - print the single hop neighbour list - * @bat_priv: the bat priv with all the soft interface information - * @seq: neighbour table seq_file struct - */ -static void batadv_v_neigh_print(struct batadv_priv *bat_priv, - struct seq_file *seq) -{ - struct net_device *net_dev = (struct net_device *)seq->private; - struct batadv_hardif_neigh_node *hardif_neigh; - struct batadv_hard_iface *hard_iface; - int batman_count = 0; - - seq_puts(seq, - " Neighbor last-seen ( throughput) [ IF]\n"); - - rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->soft_iface != net_dev) - continue; - - 
hlist_for_each_entry_rcu(hardif_neigh, - &hard_iface->neigh_list, list) { - batadv_v_hardif_neigh_print(seq, hardif_neigh); - batman_count++; - } - } - rcu_read_unlock(); - - if (batman_count == 0) - seq_puts(seq, "No batman nodes in range ...\n"); -} -#endif - /** * batadv_v_neigh_dump_neigh() - Dump a neighbour into a message * @msg: Netlink message to dump into @@ -337,75 +250,6 @@ batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, cb->args[1] = idx; } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS -/** - * batadv_v_orig_print() - print the originator table - * @bat_priv: the bat priv with all the soft interface information - * @seq: debugfs table seq_file struct - * @if_outgoing: the outgoing interface for which this should be printed - */ -static void batadv_v_orig_print(struct batadv_priv *bat_priv, - struct seq_file *seq, - struct batadv_hard_iface *if_outgoing) -{ - struct batadv_neigh_node *neigh_node; - struct batadv_hashtable *hash = bat_priv->orig_hash; - int last_seen_msecs, last_seen_secs; - struct batadv_orig_node *orig_node; - struct batadv_neigh_ifinfo *n_ifinfo; - unsigned long last_seen_jiffies; - struct hlist_head *head; - int batman_count = 0; - u32 i; - - seq_puts(seq, - " Originator last-seen ( throughput) Nexthop [outgoingIF]: Potential nexthops ...\n"); - - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, head, hash_entry) { - neigh_node = batadv_orig_router_get(orig_node, - if_outgoing); - if (!neigh_node) - continue; - - n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, - if_outgoing); - if (!n_ifinfo) - goto next; - - last_seen_jiffies = jiffies - orig_node->last_seen; - last_seen_msecs = jiffies_to_msecs(last_seen_jiffies); - last_seen_secs = last_seen_msecs / 1000; - last_seen_msecs = last_seen_msecs % 1000; - - seq_printf(seq, "%pM %4i.%03is (%9u.%1u) %pM [%10s]:", - orig_node->orig, last_seen_secs, - last_seen_msecs, - n_ifinfo->bat_v.throughput / 10, - 
n_ifinfo->bat_v.throughput % 10, - neigh_node->addr, - neigh_node->if_incoming->net_dev->name); - - batadv_v_orig_print_neigh(orig_node, if_outgoing, seq); - seq_putc(seq, '\n'); - batman_count++; - -next: - batadv_neigh_node_put(neigh_node); - if (n_ifinfo) - batadv_neigh_ifinfo_put(n_ifinfo); - } - rcu_read_unlock(); - } - - if (batman_count == 0) - seq_puts(seq, "No batman nodes in range ...\n"); -} -#endif - /** * batadv_v_orig_dump_subentry() - Dump an originator subentry into a message * @msg: Netlink message to dump into @@ -685,13 +529,6 @@ static ssize_t batadv_v_store_sel_class(struct batadv_priv *bat_priv, return count; } -static ssize_t batadv_v_show_sel_class(struct batadv_priv *bat_priv, char *buff) -{ - u32 class = atomic_read(&bat_priv->gw.sel_class); - - return sprintf(buff, "%u.%u MBit\n", class / 10, class % 10); -} - /** * batadv_v_gw_throughput_get() - retrieve the GW-bandwidth for a given GW * @gw_node: the GW to retrieve the metric for @@ -829,78 +666,6 @@ out: return ret; } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS -/* fails if orig_node has no router */ -static int batadv_v_gw_write_buffer_text(struct batadv_priv *bat_priv, - struct seq_file *seq, - const struct batadv_gw_node *gw_node) -{ - struct batadv_gw_node *curr_gw; - struct batadv_neigh_node *router; - struct batadv_neigh_ifinfo *router_ifinfo = NULL; - int ret = -1; - - router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); - if (!router) - goto out; - - router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); - if (!router_ifinfo) - goto out; - - curr_gw = batadv_gw_get_selected_gw_node(bat_priv); - - seq_printf(seq, "%s %pM (%9u.%1u) %pM [%10s]: %u.%u/%u.%u MBit\n", - (curr_gw == gw_node ? 
"=>" : " "), - gw_node->orig_node->orig, - router_ifinfo->bat_v.throughput / 10, - router_ifinfo->bat_v.throughput % 10, router->addr, - router->if_incoming->net_dev->name, - gw_node->bandwidth_down / 10, - gw_node->bandwidth_down % 10, - gw_node->bandwidth_up / 10, - gw_node->bandwidth_up % 10); - ret = seq_has_overflowed(seq) ? -1 : 0; - - if (curr_gw) - batadv_gw_node_put(curr_gw); -out: - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); - if (router) - batadv_neigh_node_put(router); - return ret; -} - -/** - * batadv_v_gw_print() - print the gateway list - * @bat_priv: the bat priv with all the soft interface information - * @seq: gateway table seq_file struct - */ -static void batadv_v_gw_print(struct batadv_priv *bat_priv, - struct seq_file *seq) -{ - struct batadv_gw_node *gw_node; - int gw_count = 0; - - seq_puts(seq, - " Gateway ( throughput) Nexthop [outgoingIF]: advertised uplink bandwidth\n"); - - rcu_read_lock(); - hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) { - /* fails if orig_node has no router */ - if (batadv_v_gw_write_buffer_text(bat_priv, seq, gw_node) < 0) - continue; - - gw_count++; - } - rcu_read_unlock(); - - if (gw_count == 0) - seq_puts(seq, "No gateways in range ...\n"); -} -#endif - /** * batadv_v_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into @@ -1046,26 +811,16 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = { .hardif_init = batadv_v_hardif_neigh_init, .cmp = batadv_v_neigh_cmp, .is_similar_or_better = batadv_v_neigh_is_sob, -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - .print = batadv_v_neigh_print, -#endif .dump = batadv_v_neigh_dump, }, .orig = { -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - .print = batadv_v_orig_print, -#endif .dump = batadv_v_orig_dump, }, .gw = { .init_sel_class = batadv_v_init_sel_class, .store_sel_class = batadv_v_store_sel_class, - .show_sel_class = batadv_v_show_sel_class, .get_best_gw_node = batadv_v_gw_get_best_gw_node, .is_eligible = 
batadv_v_gw_is_eligible, -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - .print = batadv_v_gw_print, -#endif .dump = batadv_v_gw_dump, }, }; diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h index 5e0be10bc84e..964431f4dc8d 100644 --- a/net/batman-adv/bat_v.h +++ b/net/batman-adv/bat_v.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing */ diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 79a7dfc32e76..423c2d171703 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner */ @@ -18,6 +18,7 @@ #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/kref.h> +#include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/nl80211.h> #include <linux/prandom.h> diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 4358d436be2a..9e2740195fa2 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner */ diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 8c1148fc73d7..a0a9636d1740 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Antonio Quartulli */ @@ -18,6 +18,7 @@ #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> +#include <linux/minmax.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/prandom.h> diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index 0ae2575f70bb..edeffedecade 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Antonio Quartulli */ diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 4bc695cda397..649c41f393e1 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index 533c6d44cb58..37f7ae413bc6 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index ba0027d1f2df..360bdbf44748 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Simon Wunderlich */ @@ -28,7 +28,6 @@ #include <linux/preempt.h> #include <linux/rculist.h> #include <linux/rcupdate.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -2115,69 +2114,6 @@ out: return ret; } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS -/** - * batadv_bla_claim_table_seq_print_text() - print the claim table in a seq file - * @seq: seq file to print on - * @offset: not used - * - * Return: always 0 - */ -int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) -{ - struct net_device *net_dev = (struct net_device *)seq->private; - struct batadv_priv *bat_priv = netdev_priv(net_dev); - struct batadv_hashtable *hash = bat_priv->bla.claim_hash; - struct batadv_bla_backbone_gw *backbone_gw; - struct batadv_bla_claim *claim; - struct batadv_hard_iface *primary_if; - struct hlist_head *head; - u16 backbone_crc; - u32 i; - bool is_own; - u8 *primary_addr; - - primary_if = batadv_seq_print_text_primary_if_get(seq); - if (!primary_if) - goto out; - - primary_addr = primary_if->net_dev->dev_addr; - seq_printf(seq, - "Claims announced for the mesh %s (orig %pM, group id %#.4x)\n", - net_dev->name, primary_addr, - ntohs(bat_priv->bla.claim_dest.group)); - seq_puts(seq, - " Client VID Originator [o] (CRC )\n"); - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(claim, head, hash_entry) { - backbone_gw = batadv_bla_claim_get_backbone_gw(claim); - - is_own = batadv_compare_eth(backbone_gw->orig, - primary_addr); - - spin_lock_bh(&backbone_gw->crc_lock); - backbone_crc = backbone_gw->crc; - spin_unlock_bh(&backbone_gw->crc_lock); - seq_printf(seq, " * %pM on %5d by %pM [%c] (%#.4x)\n", - claim->addr, batadv_print_vid(claim->vid), - backbone_gw->orig, - (is_own ? 
'x' : ' '), - backbone_crc); - - batadv_backbone_gw_put(backbone_gw); - } - rcu_read_unlock(); - } -out: - if (primary_if) - batadv_hardif_put(primary_if); - return 0; -} -#endif - /** * batadv_bla_claim_dump_entry() - dump one entry of the claim table * to a netlink socket @@ -2348,72 +2284,6 @@ out: return ret; } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS -/** - * batadv_bla_backbone_table_seq_print_text() - print the backbone table in a - * seq file - * @seq: seq file to print on - * @offset: not used - * - * Return: always 0 - */ -int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) -{ - struct net_device *net_dev = (struct net_device *)seq->private; - struct batadv_priv *bat_priv = netdev_priv(net_dev); - struct batadv_hashtable *hash = bat_priv->bla.backbone_hash; - struct batadv_bla_backbone_gw *backbone_gw; - struct batadv_hard_iface *primary_if; - struct hlist_head *head; - int secs, msecs; - u16 backbone_crc; - u32 i; - bool is_own; - u8 *primary_addr; - - primary_if = batadv_seq_print_text_primary_if_get(seq); - if (!primary_if) - goto out; - - primary_addr = primary_if->net_dev->dev_addr; - seq_printf(seq, - "Backbones announced for the mesh %s (orig %pM, group id %#.4x)\n", - net_dev->name, primary_addr, - ntohs(bat_priv->bla.claim_dest.group)); - seq_puts(seq, " Originator VID last seen (CRC )\n"); - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { - msecs = jiffies_to_msecs(jiffies - - backbone_gw->lasttime); - secs = msecs / 1000; - msecs = msecs % 1000; - - is_own = batadv_compare_eth(backbone_gw->orig, - primary_addr); - if (is_own) - continue; - - spin_lock_bh(&backbone_gw->crc_lock); - backbone_crc = backbone_gw->crc; - spin_unlock_bh(&backbone_gw->crc_lock); - - seq_printf(seq, " * %pM on %5d %4i.%03is (%#.4x)\n", - backbone_gw->orig, - batadv_print_vid(backbone_gw->vid), secs, - msecs, backbone_crc); - } - rcu_read_unlock(); - } 
-out: - if (primary_if) - batadv_hardif_put(primary_if); - return 0; -} -#endif - /** * batadv_bla_backbone_dump_entry() - dump one entry of the backbone table to a * netlink socket diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index a81c41b636f9..5c22955bb9d5 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich */ @@ -12,7 +12,6 @@ #include <linux/compiler.h> #include <linux/netdevice.h> #include <linux/netlink.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/types.h> @@ -41,10 +40,7 @@ bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, bool batadv_bla_is_backbone_gw(struct sk_buff *skb, struct batadv_orig_node *orig_node, int hdr_size); -int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset); int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb); -int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, - void *offset); int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb); bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid); @@ -84,18 +80,6 @@ static inline bool batadv_bla_is_backbone_gw(struct sk_buff *skb, return false; } -static inline int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, - void *offset) -{ - return 0; -} - -static inline int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, - void *offset) -{ - return 0; -} - static inline bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid) { diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c deleted file mode 100644 index 452856c27d20..000000000000 --- 
a/net/batman-adv/debugfs.c +++ /dev/null @@ -1,442 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: - * - * Marek Lindner - */ - -#include "debugfs.h" -#include "main.h" - -#include <asm/current.h> -#include <linux/dcache.h> -#include <linux/debugfs.h> -#include <linux/errno.h> -#include <linux/export.h> -#include <linux/fs.h> -#include <linux/netdevice.h> -#include <linux/printk.h> -#include <linux/sched.h> -#include <linux/seq_file.h> -#include <linux/stat.h> -#include <linux/stddef.h> -#include <linux/stringify.h> -#include <linux/sysfs.h> -#include <net/net_namespace.h> - -#include "bat_algo.h" -#include "bridge_loop_avoidance.h" -#include "distributed-arp-table.h" -#include "gateway_client.h" -#include "icmp_socket.h" -#include "log.h" -#include "multicast.h" -#include "network-coding.h" -#include "originator.h" -#include "translation-table.h" - -static struct dentry *batadv_debugfs; - -/** - * batadv_debugfs_deprecated() - Log use of deprecated batadv debugfs access - * @file: file which was accessed - * @alt: explanation what can be used as alternative - */ -void batadv_debugfs_deprecated(struct file *file, const char *alt) -{ - struct dentry *dentry = file_dentry(file); - const char *name = dentry->d_name.name; - - pr_warn_ratelimited(DEPRECATED "%s (pid %d) Use of debugfs file \"%s\".\n%s", - current->comm, task_pid_nr(current), name, alt); -} - -static int batadv_algorithms_open(struct inode *inode, struct file *file) -{ - batadv_debugfs_deprecated(file, - "Use genl command BATADV_CMD_GET_ROUTING_ALGOS instead\n"); - return single_open(file, batadv_algo_seq_print_text, NULL); -} - -static int neighbors_open(struct inode *inode, struct file *file) -{ - struct net_device *net_dev = (struct net_device *)inode->i_private; - - batadv_debugfs_deprecated(file, - "Use genl command BATADV_CMD_GET_NEIGHBORS instead\n"); - return single_open(file, batadv_hardif_neigh_seq_print_text, net_dev); -} - -static int 
batadv_originators_open(struct inode *inode, struct file *file) -{ - struct net_device *net_dev = (struct net_device *)inode->i_private; - - batadv_debugfs_deprecated(file, - "Use genl command BATADV_CMD_GET_ORIGINATORS instead\n"); - return single_open(file, batadv_orig_seq_print_text, net_dev); -} - -/** - * batadv_originators_hardif_open() - handles debugfs output for the originator - * table of an hard interface - * @inode: inode pointer to debugfs file - * @file: pointer to the seq_file - * - * Return: 0 on success or negative error number in case of failure - */ -static int batadv_originators_hardif_open(struct inode *inode, - struct file *file) -{ - struct net_device *net_dev = (struct net_device *)inode->i_private; - - batadv_debugfs_deprecated(file, - "Use genl command BATADV_CMD_GET_HARDIFS instead\n"); - return single_open(file, batadv_orig_hardif_seq_print_text, net_dev); -} - -static int batadv_gateways_open(struct inode *inode, struct file *file) -{ - struct net_device *net_dev = (struct net_device *)inode->i_private; - - batadv_debugfs_deprecated(file, - "Use genl command BATADV_CMD_GET_GATEWAYS instead\n"); - return single_open(file, batadv_gw_client_seq_print_text, net_dev); -} - -static int batadv_transtable_global_open(struct inode *inode, struct file *file) -{ - struct net_device *net_dev = (struct net_device *)inode->i_private; - - batadv_debugfs_deprecated(file, - "Use genl command BATADV_CMD_GET_TRANSTABLE_GLOBAL instead\n"); - return single_open(file, batadv_tt_global_seq_print_text, net_dev); -} - -#ifdef CONFIG_BATMAN_ADV_BLA -static int batadv_bla_claim_table_open(struct inode *inode, struct file *file) -{ - struct net_device *net_dev = (struct net_device *)inode->i_private; - - batadv_debugfs_deprecated(file, - "Use genl command BATADV_CMD_GET_BLA_CLAIM instead\n"); - return single_open(file, batadv_bla_claim_table_seq_print_text, - net_dev); -} - -static int batadv_bla_backbone_table_open(struct inode *inode, - struct file *file) -{ - 
struct net_device *net_dev = (struct net_device *)inode->i_private; - - batadv_debugfs_deprecated(file, - "Use genl command BATADV_CMD_GET_BLA_BACKBONE instead\n"); - return single_open(file, batadv_bla_backbone_table_seq_print_text, - net_dev); -} - -#endif - -#ifdef CONFIG_BATMAN_ADV_DAT -/** - * batadv_dat_cache_open() - Prepare file handler for reads from dat_cache - * @inode: inode which was opened - * @file: file handle to be initialized - * - * Return: 0 on success or negative error number in case of failure - */ -static int batadv_dat_cache_open(struct inode *inode, struct file *file) -{ - struct net_device *net_dev = (struct net_device *)inode->i_private; - - batadv_debugfs_deprecated(file, - "Use genl command BATADV_CMD_GET_DAT_CACHE instead\n"); - return single_open(file, batadv_dat_cache_seq_print_text, net_dev); -} -#endif - -static int batadv_transtable_local_open(struct inode *inode, struct file *file) -{ - struct net_device *net_dev = (struct net_device *)inode->i_private; - - batadv_debugfs_deprecated(file, - "Use genl command BATADV_CMD_GET_TRANSTABLE_LOCAL instead\n"); - return single_open(file, batadv_tt_local_seq_print_text, net_dev); -} - -struct batadv_debuginfo { - struct attribute attr; - const struct file_operations fops; -}; - -#ifdef CONFIG_BATMAN_ADV_NC -static int batadv_nc_nodes_open(struct inode *inode, struct file *file) -{ - struct net_device *net_dev = (struct net_device *)inode->i_private; - - batadv_debugfs_deprecated(file, ""); - return single_open(file, batadv_nc_nodes_seq_print_text, net_dev); -} -#endif - -#ifdef CONFIG_BATMAN_ADV_MCAST -/** - * batadv_mcast_flags_open() - prepare file handler for reads from mcast_flags - * @inode: inode which was opened - * @file: file handle to be initialized - * - * Return: 0 on success or negative error number in case of failure - */ -static int batadv_mcast_flags_open(struct inode *inode, struct file *file) -{ - struct net_device *net_dev = (struct net_device *)inode->i_private; - - 
batadv_debugfs_deprecated(file, - "Use genl command BATADV_CMD_GET_MCAST_FLAGS instead\n"); - return single_open(file, batadv_mcast_flags_seq_print_text, net_dev); -} -#endif - -#define BATADV_DEBUGINFO(_name, _mode, _open) \ -struct batadv_debuginfo batadv_debuginfo_##_name = { \ - .attr = { \ - .name = __stringify(_name), \ - .mode = _mode, \ - }, \ - .fops = { \ - .owner = THIS_MODULE, \ - .open = _open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ - }, \ -} - -/* the following attributes are general and therefore they will be directly - * placed in the BATADV_DEBUGFS_SUBDIR subdirectory of debugfs - */ -static BATADV_DEBUGINFO(routing_algos, 0444, batadv_algorithms_open); - -static struct batadv_debuginfo *batadv_general_debuginfos[] = { - &batadv_debuginfo_routing_algos, - NULL, -}; - -/* The following attributes are per soft interface */ -static BATADV_DEBUGINFO(neighbors, 0444, neighbors_open); -static BATADV_DEBUGINFO(originators, 0444, batadv_originators_open); -static BATADV_DEBUGINFO(gateways, 0444, batadv_gateways_open); -static BATADV_DEBUGINFO(transtable_global, 0444, batadv_transtable_global_open); -#ifdef CONFIG_BATMAN_ADV_BLA -static BATADV_DEBUGINFO(bla_claim_table, 0444, batadv_bla_claim_table_open); -static BATADV_DEBUGINFO(bla_backbone_table, 0444, - batadv_bla_backbone_table_open); -#endif -#ifdef CONFIG_BATMAN_ADV_DAT -static BATADV_DEBUGINFO(dat_cache, 0444, batadv_dat_cache_open); -#endif -static BATADV_DEBUGINFO(transtable_local, 0444, batadv_transtable_local_open); -#ifdef CONFIG_BATMAN_ADV_NC -static BATADV_DEBUGINFO(nc_nodes, 0444, batadv_nc_nodes_open); -#endif -#ifdef CONFIG_BATMAN_ADV_MCAST -static BATADV_DEBUGINFO(mcast_flags, 0444, batadv_mcast_flags_open); -#endif - -static struct batadv_debuginfo *batadv_mesh_debuginfos[] = { - &batadv_debuginfo_neighbors, - &batadv_debuginfo_originators, - &batadv_debuginfo_gateways, - &batadv_debuginfo_transtable_global, -#ifdef CONFIG_BATMAN_ADV_BLA - 
&batadv_debuginfo_bla_claim_table, - &batadv_debuginfo_bla_backbone_table, -#endif -#ifdef CONFIG_BATMAN_ADV_DAT - &batadv_debuginfo_dat_cache, -#endif - &batadv_debuginfo_transtable_local, -#ifdef CONFIG_BATMAN_ADV_NC - &batadv_debuginfo_nc_nodes, -#endif -#ifdef CONFIG_BATMAN_ADV_MCAST - &batadv_debuginfo_mcast_flags, -#endif - NULL, -}; - -#define BATADV_HARDIF_DEBUGINFO(_name, _mode, _open) \ -struct batadv_debuginfo batadv_hardif_debuginfo_##_name = { \ - .attr = { \ - .name = __stringify(_name), \ - .mode = _mode, \ - }, \ - .fops = { \ - .owner = THIS_MODULE, \ - .open = _open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ - }, \ -} - -static BATADV_HARDIF_DEBUGINFO(originators, 0444, - batadv_originators_hardif_open); - -static struct batadv_debuginfo *batadv_hardif_debuginfos[] = { - &batadv_hardif_debuginfo_originators, - NULL, -}; - -/** - * batadv_debugfs_init() - Initialize soft interface independent debugfs entries - */ -void batadv_debugfs_init(void) -{ - struct batadv_debuginfo **bat_debug; - - batadv_debugfs = debugfs_create_dir(BATADV_DEBUGFS_SUBDIR, NULL); - - for (bat_debug = batadv_general_debuginfos; *bat_debug; ++bat_debug) - debugfs_create_file(((*bat_debug)->attr).name, - S_IFREG | ((*bat_debug)->attr).mode, - batadv_debugfs, NULL, &(*bat_debug)->fops); -} - -/** - * batadv_debugfs_destroy() - Remove all debugfs entries - */ -void batadv_debugfs_destroy(void) -{ - debugfs_remove_recursive(batadv_debugfs); - batadv_debugfs = NULL; -} - -/** - * batadv_debugfs_add_hardif() - creates the base directory for a hard interface - * in debugfs. - * @hard_iface: hard interface which should be added. 
- */ -void batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) -{ - struct net *net = dev_net(hard_iface->net_dev); - struct batadv_debuginfo **bat_debug; - - if (net != &init_net) - return; - - hard_iface->debug_dir = debugfs_create_dir(hard_iface->net_dev->name, - batadv_debugfs); - - for (bat_debug = batadv_hardif_debuginfos; *bat_debug; ++bat_debug) - debugfs_create_file(((*bat_debug)->attr).name, - S_IFREG | ((*bat_debug)->attr).mode, - hard_iface->debug_dir, hard_iface->net_dev, - &(*bat_debug)->fops); -} - -/** - * batadv_debugfs_rename_hardif() - Fix debugfs path for renamed hardif - * @hard_iface: hard interface which was renamed - */ -void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface) -{ - const char *name = hard_iface->net_dev->name; - struct dentry *dir; - - dir = hard_iface->debug_dir; - if (!dir) - return; - - debugfs_rename(dir->d_parent, dir, dir->d_parent, name); -} - -/** - * batadv_debugfs_del_hardif() - delete the base directory for a hard interface - * in debugfs. - * @hard_iface: hard interface which is deleted. 
- */ -void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface) -{ - struct net *net = dev_net(hard_iface->net_dev); - - if (net != &init_net) - return; - - if (batadv_debugfs) { - debugfs_remove_recursive(hard_iface->debug_dir); - hard_iface->debug_dir = NULL; - } -} - -/** - * batadv_debugfs_add_meshif() - Initialize interface dependent debugfs entries - * @dev: netdev struct of the soft interface - * - * Return: 0 on success or negative error number in case of failure - */ -int batadv_debugfs_add_meshif(struct net_device *dev) -{ - struct batadv_priv *bat_priv = netdev_priv(dev); - struct batadv_debuginfo **bat_debug; - struct net *net = dev_net(dev); - - if (net != &init_net) - return 0; - - bat_priv->debug_dir = debugfs_create_dir(dev->name, batadv_debugfs); - - batadv_socket_setup(bat_priv); - - if (batadv_debug_log_setup(bat_priv) < 0) - goto rem_attr; - - for (bat_debug = batadv_mesh_debuginfos; *bat_debug; ++bat_debug) - debugfs_create_file(((*bat_debug)->attr).name, - S_IFREG | ((*bat_debug)->attr).mode, - bat_priv->debug_dir, dev, - &(*bat_debug)->fops); - - batadv_nc_init_debugfs(bat_priv); - - return 0; -rem_attr: - debugfs_remove_recursive(bat_priv->debug_dir); - bat_priv->debug_dir = NULL; - return -ENOMEM; -} - -/** - * batadv_debugfs_rename_meshif() - Fix debugfs path for renamed softif - * @dev: net_device which was renamed - */ -void batadv_debugfs_rename_meshif(struct net_device *dev) -{ - struct batadv_priv *bat_priv = netdev_priv(dev); - const char *name = dev->name; - struct dentry *dir; - - dir = bat_priv->debug_dir; - if (!dir) - return; - - debugfs_rename(dir->d_parent, dir, dir->d_parent, name); -} - -/** - * batadv_debugfs_del_meshif() - Remove interface dependent debugfs entries - * @dev: netdev struct of the soft interface - */ -void batadv_debugfs_del_meshif(struct net_device *dev) -{ - struct batadv_priv *bat_priv = netdev_priv(dev); - struct net *net = dev_net(dev); - - if (net != &init_net) - return; - - 
batadv_debug_log_cleanup(bat_priv); - - if (batadv_debugfs) { - debugfs_remove_recursive(bat_priv->debug_dir); - bat_priv->debug_dir = NULL; - } -} diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h deleted file mode 100644 index 7e2e8f586f42..000000000000 --- a/net/batman-adv/debugfs.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: - * - * Marek Lindner - */ - -#ifndef _NET_BATMAN_ADV_DEBUGFS_H_ -#define _NET_BATMAN_ADV_DEBUGFS_H_ - -#include "main.h" - -#include <linux/fs.h> -#include <linux/netdevice.h> - -#define BATADV_DEBUGFS_SUBDIR "batman_adv" - -#if IS_ENABLED(CONFIG_BATMAN_ADV_DEBUGFS) - -void batadv_debugfs_deprecated(struct file *file, const char *alt); -void batadv_debugfs_init(void); -void batadv_debugfs_destroy(void); -int batadv_debugfs_add_meshif(struct net_device *dev); -void batadv_debugfs_rename_meshif(struct net_device *dev); -void batadv_debugfs_del_meshif(struct net_device *dev); -void batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface); -void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface); -void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface); - -#else - -static inline void batadv_debugfs_deprecated(struct file *file, const char *alt) -{ -} - -static inline void batadv_debugfs_init(void) -{ -} - -static inline void batadv_debugfs_destroy(void) -{ -} - -static inline int batadv_debugfs_add_meshif(struct net_device *dev) -{ - return 0; -} - -static inline void batadv_debugfs_rename_meshif(struct net_device *dev) -{ -} - -static inline void batadv_debugfs_del_meshif(struct net_device *dev) -{ -} - -static inline -void batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) -{ -} - -static inline -void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface) -{ -} - -static inline -void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface) -{ -} - -#endif - -#endif /* 
_NET_BATMAN_ADV_DEBUGFS_H_ */ diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 0e6e53e9b5f3..8c95a11a830a 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Antonio Quartulli */ @@ -26,7 +26,6 @@ #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -88,7 +87,7 @@ struct batadv_dhcp_packet { __u8 sname[64]; __u8 file[128]; __be32 magic; - __u8 options[]; + /* __u8 options[]; */ }; #define BATADV_DHCP_YIADDR_LEN sizeof(((struct batadv_dhcp_packet *)0)->yiaddr) @@ -842,60 +841,6 @@ void batadv_dat_free(struct batadv_priv *bat_priv) batadv_dat_hash_free(bat_priv); } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS -/** - * batadv_dat_cache_seq_print_text() - print the local DAT hash table - * @seq: seq file to print on - * @offset: not used - * - * Return: always 0 - */ -int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset) -{ - struct net_device *net_dev = (struct net_device *)seq->private; - struct batadv_priv *bat_priv = netdev_priv(net_dev); - struct batadv_hashtable *hash = bat_priv->dat.hash; - struct batadv_dat_entry *dat_entry; - struct batadv_hard_iface *primary_if; - struct hlist_head *head; - unsigned long last_seen_jiffies; - int last_seen_msecs, last_seen_secs, last_seen_mins; - u32 i; - - primary_if = batadv_seq_print_text_primary_if_get(seq); - if (!primary_if) - goto out; - - seq_printf(seq, "Distributed ARP Table (%s):\n", net_dev->name); - seq_puts(seq, - " IPv4 MAC VID last-seen\n"); - - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(dat_entry, head, hash_entry) { - last_seen_jiffies = jiffies - 
dat_entry->last_update; - last_seen_msecs = jiffies_to_msecs(last_seen_jiffies); - last_seen_mins = last_seen_msecs / 60000; - last_seen_msecs = last_seen_msecs % 60000; - last_seen_secs = last_seen_msecs / 1000; - - seq_printf(seq, " * %15pI4 %pM %4i %6i:%02i\n", - &dat_entry->ip, dat_entry->mac_addr, - batadv_print_vid(dat_entry->vid), - last_seen_mins, last_seen_secs); - } - rcu_read_unlock(); - } - -out: - if (primary_if) - batadv_hardif_put(primary_if); - return 0; -} -#endif - /** * batadv_dat_cache_dump_entry() - dump one entry of the DAT cache table to a * netlink socket @@ -1619,7 +1564,7 @@ static int batadv_dat_get_dhcp_message_type(struct sk_buff *skb) } /** - * batadv_dat_get_dhcp_yiaddr() - get yiaddr from a DHCP packet + * batadv_dat_dhcp_get_yiaddr() - get yiaddr from a DHCP packet * @skb: the DHCP packet to parse * @buf: a buffer to store the yiaddr in * diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 4e031661682a..bed7f3d20844 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Antonio Quartulli */ @@ -12,7 +12,6 @@ #include <linux/compiler.h> #include <linux/netdevice.h> #include <linux/netlink.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> @@ -74,7 +73,6 @@ batadv_dat_init_own_addr(struct batadv_priv *bat_priv, int batadv_dat_init(struct batadv_priv *bat_priv); void batadv_dat_free(struct batadv_priv *bat_priv); -int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset); int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb); /** diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 9a47ef8b95c4..a5d9d800082b 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> */ @@ -14,8 +14,8 @@ #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/jiffies.h> -#include <linux/kernel.h> #include <linux/lockdep.h> +#include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> @@ -27,7 +27,6 @@ #include "originator.h" #include "routing.h" #include "send.h" -#include "soft-interface.h" /** * batadv_frag_clear_chain() - delete entries in the fragment buffer chain @@ -391,6 +390,7 @@ out: /** * batadv_frag_create() - create a fragment from skb + * @net_dev: outgoing device for fragment * @skb: skb to create fragment from * @frag_head: header to use in new fragment * @fragment_size: size of new fragment @@ -401,22 +401,25 @@ out: * * Return: the new fragment, NULL on error. 
*/ -static struct sk_buff *batadv_frag_create(struct sk_buff *skb, +static struct sk_buff *batadv_frag_create(struct net_device *net_dev, + struct sk_buff *skb, struct batadv_frag_packet *frag_head, unsigned int fragment_size) { + unsigned int ll_reserved = LL_RESERVED_SPACE(net_dev); + unsigned int tailroom = net_dev->needed_tailroom; struct sk_buff *skb_fragment; unsigned int header_size = sizeof(*frag_head); unsigned int mtu = fragment_size + header_size; - skb_fragment = netdev_alloc_skb(NULL, mtu + ETH_HLEN); + skb_fragment = dev_alloc_skb(ll_reserved + mtu + tailroom); if (!skb_fragment) goto err; skb_fragment->priority = skb->priority; /* Eat the last mtu-bytes of the skb */ - skb_reserve(skb_fragment, header_size + ETH_HLEN); + skb_reserve(skb_fragment, ll_reserved + header_size); skb_split(skb, skb_fragment, skb->len - fragment_size); /* Add the header */ @@ -439,11 +442,12 @@ int batadv_frag_send_packet(struct sk_buff *skb, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node) { + struct net_device *net_dev = neigh_node->if_incoming->net_dev; struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; struct batadv_frag_packet frag_header; struct sk_buff *skb_fragment; - unsigned int mtu = neigh_node->if_incoming->net_dev->mtu; + unsigned int mtu = net_dev->mtu; unsigned int header_size = sizeof(frag_header); unsigned int max_fragment_size, num_fragments; int ret; @@ -503,7 +507,7 @@ int batadv_frag_send_packet(struct sk_buff *skb, goto put_primary_if; } - skb_fragment = batadv_frag_create(skb, &frag_header, + skb_fragment = batadv_frag_create(net_dev, skb, &frag_header, max_fragment_size); if (!skb_fragment) { ret = -ENOMEM; @@ -522,13 +526,14 @@ int batadv_frag_send_packet(struct sk_buff *skb, frag_header.no++; } - /* Make room for the fragment header. 
*/ - if (batadv_skb_head_push(skb, header_size) < 0 || - pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0) { - ret = -ENOMEM; + /* make sure that there is at least enough head for the fragmentation + * and ethernet headers + */ + ret = skb_cow_head(skb, ETH_HLEN + header_size); + if (ret < 0) goto put_primary_if; - } + skb_push(skb, header_size); memcpy(skb->data, &frag_header, header_size); /* Send the last fragment */ diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index 881ef328b6cd..dbf0871f8703 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> */ diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index ef3f85b576c4..007f2827935d 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Marek Lindner */ @@ -25,7 +25,6 @@ #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -511,44 +510,6 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv) spin_unlock_bh(&bat_priv->gw.list_lock); } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - -/** - * batadv_gw_client_seq_print_text() - Print the gateway table in a seq file - * @seq: seq file to print on - * @offset: not used - * - * Return: always 0 - */ -int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) -{ - struct net_device *net_dev = (struct net_device *)seq->private; - struct batadv_priv *bat_priv = netdev_priv(net_dev); - struct batadv_hard_iface *primary_if; - - primary_if = batadv_seq_print_text_primary_if_get(seq); - if (!primary_if) - return 0; - - seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n", - BATADV_SOURCE_VERSION, primary_if->net_dev->name, - primary_if->net_dev->dev_addr, net_dev->name, - bat_priv->algo_ops->name); - - batadv_hardif_put(primary_if); - - if (!bat_priv->algo_ops->gw.print) { - seq_puts(seq, - "No printing function for this routing protocol\n"); - return 0; - } - - bat_priv->algo_ops->gw.print(bat_priv, seq); - - return 0; -} -#endif - /** * batadv_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 88b5dba84354..2ae5846ef958 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Marek Lindner */ @@ -10,7 +10,6 @@ #include "main.h" #include <linux/netlink.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> @@ -31,7 +30,6 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv); void batadv_gw_node_put(struct batadv_gw_node *gw_node); struct batadv_gw_node * batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv); -int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset); int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb); bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb); enum batadv_dhcp_recipient diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 16cd9450ceb1..fdde305a198e 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index c3a0c5a7f7e9..87c37f907261 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index dad99641df2a..4a6a25d551a8 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich */ @@ -18,6 +18,7 @@ #include <linux/kref.h> #include <linux/limits.h> #include <linux/list.h> +#include <linux/minmax.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/printk.h> @@ -31,14 +32,12 @@ #include "bat_v.h" #include "bridge_loop_avoidance.h" -#include "debugfs.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "log.h" #include "originator.h" #include "send.h" #include "soft-interface.h" -#include "sysfs.h" #include "translation-table.h" /** @@ -554,6 +553,9 @@ static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface) needed_headroom = lower_headroom + (lower_header_len - ETH_HLEN); needed_headroom += batadv_max_header_len(); + /* fragmentation headers don't strip the unicast/... header */ + needed_headroom += sizeof(struct batadv_frag_packet); + soft_iface->needed_headroom = needed_headroom; soft_iface->needed_tailroom = lower_tailroom; } @@ -843,11 +845,8 @@ static size_t batadv_hardif_cnt(const struct net_device *soft_iface) /** * batadv_hardif_disable_interface() - Remove hard interface from soft interface * @hard_iface: hard interface to be removed - * @autodel: whether to delete soft interface when it doesn't contain any other - * slave interfaces */ -void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, - enum batadv_hard_if_cleanup autodel) +void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hard_iface *primary_if = NULL; @@ -885,13 +884,9 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface); /* nobody uses this interface anymore */ - if (batadv_hardif_cnt(hard_iface->soft_iface) <= 1) { + if (batadv_hardif_cnt(hard_iface->soft_iface) <= 1) batadv_gw_check_client_stop(bat_priv); - if (autodel == BATADV_IF_CLEANUP_AUTO) - 
batadv_softif_destroy_sysfs(hard_iface->soft_iface); - } - hard_iface->soft_iface = NULL; batadv_hardif_put(hard_iface); @@ -904,7 +899,6 @@ static struct batadv_hard_iface * batadv_hardif_add_interface(struct net_device *net_dev) { struct batadv_hard_iface *hard_iface; - int ret; ASSERT_RTNL(); @@ -917,16 +911,10 @@ batadv_hardif_add_interface(struct net_device *net_dev) if (!hard_iface) goto release_dev; - ret = batadv_sysfs_add_hardif(&hard_iface->hardif_obj, net_dev); - if (ret) - goto free_if; - hard_iface->net_dev = net_dev; hard_iface->soft_iface = NULL; hard_iface->if_status = BATADV_IF_NOT_IN_USE; - batadv_debugfs_add_hardif(hard_iface); - INIT_LIST_HEAD(&hard_iface->list); INIT_HLIST_HEAD(&hard_iface->neigh_list); @@ -950,8 +938,6 @@ batadv_hardif_add_interface(struct net_device *net_dev) return hard_iface; -free_if: - kfree(hard_iface); release_dev: dev_put(net_dev); out: @@ -964,15 +950,12 @@ static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface) /* first deactivate interface */ if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) - batadv_hardif_disable_interface(hard_iface, - BATADV_IF_CLEANUP_KEEP); + batadv_hardif_disable_interface(hard_iface); if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) return; hard_iface->if_status = BATADV_IF_TO_BE_REMOVED; - batadv_debugfs_del_hardif(hard_iface); - batadv_sysfs_del_hardif(&hard_iface->hardif_obj); batadv_hardif_put(hard_iface); } @@ -990,13 +973,9 @@ static int batadv_hard_if_event_softif(unsigned long event, switch (event) { case NETDEV_REGISTER: - batadv_sysfs_add_meshif(net_dev); bat_priv = netdev_priv(net_dev); batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS); break; - case NETDEV_CHANGENAME: - batadv_debugfs_rename_meshif(net_dev); - break; } return NOTIFY_DONE; @@ -1061,9 +1040,6 @@ static int batadv_hard_if_event(struct notifier_block *this, if (batadv_is_wifi_hardif(hard_iface)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; break; - case NETDEV_CHANGENAME: - 
batadv_debugfs_rename_hardif(hard_iface); - break; default: break; } diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index b1855d9d0b06..83d11b46a9d8 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -42,12 +42,6 @@ enum batadv_hard_if_state { /** @BATADV_IF_TO_BE_ACTIVATED: interface is getting activated */ BATADV_IF_TO_BE_ACTIVATED, - - /** - * @BATADV_IF_I_WANT_YOU: interface is queued up (using sysfs) for being - * added as slave interface of a batman-adv soft interface - */ - BATADV_IF_I_WANT_YOU, }; /** @@ -73,22 +67,6 @@ enum batadv_hard_if_bcast { BATADV_HARDIF_BCAST_DUPORIG, }; -/** - * enum batadv_hard_if_cleanup - Cleanup modi for soft_iface after slave removal - */ -enum batadv_hard_if_cleanup { - /** - * @BATADV_IF_CLEANUP_KEEP: Don't automatically delete soft-interface - */ - BATADV_IF_CLEANUP_KEEP, - - /** - * @BATADV_IF_CLEANUP_AUTO: Delete soft-interface after last slave was - * removed - */ - BATADV_IF_CLEANUP_AUTO, -}; - extern struct notifier_block batadv_hard_if_notifier; struct net_device *batadv_get_real_netdev(struct net_device *net_device); @@ -98,8 +76,7 @@ struct batadv_hard_iface* batadv_hardif_get_by_netdev(const struct net_device *net_dev); int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, struct net *net, const char *iface_name); -void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, - enum batadv_hard_if_cleanup autodel); +void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface); int batadv_hardif_min_mtu(struct net_device *soft_iface); void batadv_update_min_mtu(struct net_device *soft_iface); void batadv_hardif_release(struct kref *ref); diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 
68638e0450a6..8016e619787f 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 91ae9f32b580..46696759f194 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c deleted file mode 100644 index 8bdabc03b0b2..000000000000 --- a/net/batman-adv/icmp_socket.c +++ /dev/null @@ -1,392 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: - * - * Marek Lindner - */ - -#include "icmp_socket.h" -#include "main.h" - -#include <linux/atomic.h> -#include <linux/compiler.h> -#include <linux/debugfs.h> -#include <linux/errno.h> -#include <linux/etherdevice.h> -#include <linux/eventpoll.h> -#include <linux/export.h> -#include <linux/fcntl.h> -#include <linux/fs.h> -#include <linux/gfp.h> -#include <linux/if_ether.h> -#include <linux/kernel.h> -#include <linux/list.h> -#include <linux/module.h> -#include <linux/netdevice.h> -#include <linux/pkt_sched.h> -#include <linux/poll.h> -#include <linux/printk.h> -#include <linux/sched.h> /* for linux/wait.h */ -#include <linux/skbuff.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/stddef.h> -#include <linux/string.h> -#include <linux/uaccess.h> -#include <linux/wait.h> -#include <uapi/linux/batadv_packet.h> - -#include "debugfs.h" -#include "hard-interface.h" -#include "log.h" -#include "originator.h" -#include "send.h" - -static struct batadv_socket_client *batadv_socket_client_hash[256]; - -static void batadv_socket_add_packet(struct 
batadv_socket_client *socket_client, - struct batadv_icmp_header *icmph, - size_t icmp_len); - -/** - * batadv_socket_init() - Initialize soft interface independent socket data - */ -void batadv_socket_init(void) -{ - memset(batadv_socket_client_hash, 0, sizeof(batadv_socket_client_hash)); -} - -static int batadv_socket_open(struct inode *inode, struct file *file) -{ - unsigned int i; - struct batadv_socket_client *socket_client; - - if (!try_module_get(THIS_MODULE)) - return -EBUSY; - - batadv_debugfs_deprecated(file, ""); - - stream_open(inode, file); - - socket_client = kmalloc(sizeof(*socket_client), GFP_KERNEL); - if (!socket_client) { - module_put(THIS_MODULE); - return -ENOMEM; - } - - for (i = 0; i < ARRAY_SIZE(batadv_socket_client_hash); i++) { - if (!batadv_socket_client_hash[i]) { - batadv_socket_client_hash[i] = socket_client; - break; - } - } - - if (i == ARRAY_SIZE(batadv_socket_client_hash)) { - pr_err("Error - can't add another packet client: maximum number of clients reached\n"); - kfree(socket_client); - module_put(THIS_MODULE); - return -EXFULL; - } - - INIT_LIST_HEAD(&socket_client->queue_list); - socket_client->queue_len = 0; - socket_client->index = i; - socket_client->bat_priv = inode->i_private; - spin_lock_init(&socket_client->lock); - init_waitqueue_head(&socket_client->queue_wait); - - file->private_data = socket_client; - - return 0; -} - -static int batadv_socket_release(struct inode *inode, struct file *file) -{ - struct batadv_socket_client *client = file->private_data; - struct batadv_socket_packet *packet, *tmp; - - spin_lock_bh(&client->lock); - - /* for all packets in the queue ... 
*/ - list_for_each_entry_safe(packet, tmp, &client->queue_list, list) { - list_del(&packet->list); - kfree(packet); - } - - batadv_socket_client_hash[client->index] = NULL; - spin_unlock_bh(&client->lock); - - kfree(client); - module_put(THIS_MODULE); - - return 0; -} - -static ssize_t batadv_socket_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct batadv_socket_client *socket_client = file->private_data; - struct batadv_socket_packet *socket_packet; - size_t packet_len; - int error; - - if ((file->f_flags & O_NONBLOCK) && socket_client->queue_len == 0) - return -EAGAIN; - - if (!buf || count < sizeof(struct batadv_icmp_packet)) - return -EINVAL; - - error = wait_event_interruptible(socket_client->queue_wait, - socket_client->queue_len); - - if (error) - return error; - - spin_lock_bh(&socket_client->lock); - - socket_packet = list_first_entry(&socket_client->queue_list, - struct batadv_socket_packet, list); - list_del(&socket_packet->list); - socket_client->queue_len--; - - spin_unlock_bh(&socket_client->lock); - - packet_len = min(count, socket_packet->icmp_len); - error = copy_to_user(buf, &socket_packet->icmp_packet, packet_len); - - kfree(socket_packet); - - if (error) - return -EFAULT; - - return packet_len; -} - -static ssize_t batadv_socket_write(struct file *file, const char __user *buff, - size_t len, loff_t *off) -{ - struct batadv_socket_client *socket_client = file->private_data; - struct batadv_priv *bat_priv = socket_client->bat_priv; - struct batadv_hard_iface *primary_if = NULL; - struct sk_buff *skb; - struct batadv_icmp_packet_rr *icmp_packet_rr; - struct batadv_icmp_header *icmp_header; - struct batadv_orig_node *orig_node = NULL; - struct batadv_neigh_node *neigh_node = NULL; - size_t packet_len = sizeof(struct batadv_icmp_packet); - u8 *addr; - - if (len < sizeof(struct batadv_icmp_header)) { - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Error - can't send packet from char device: invalid packet size\n"); - 
return -EINVAL; - } - - primary_if = batadv_primary_if_get_selected(bat_priv); - - if (!primary_if) { - len = -EFAULT; - goto out; - } - - if (len >= BATADV_ICMP_MAX_PACKET_SIZE) - packet_len = BATADV_ICMP_MAX_PACKET_SIZE; - else - packet_len = len; - - skb = netdev_alloc_skb_ip_align(NULL, packet_len + ETH_HLEN); - if (!skb) { - len = -ENOMEM; - goto out; - } - - skb->priority = TC_PRIO_CONTROL; - skb_reserve(skb, ETH_HLEN); - icmp_header = skb_put(skb, packet_len); - - if (copy_from_user(icmp_header, buff, packet_len)) { - len = -EFAULT; - goto free_skb; - } - - if (icmp_header->packet_type != BATADV_ICMP) { - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Error - can't send packet from char device: got bogus packet type (expected: BAT_ICMP)\n"); - len = -EINVAL; - goto free_skb; - } - - switch (icmp_header->msg_type) { - case BATADV_ECHO_REQUEST: - if (len < sizeof(struct batadv_icmp_packet)) { - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Error - can't send packet from char device: invalid packet size\n"); - len = -EINVAL; - goto free_skb; - } - - if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) - goto dst_unreach; - - orig_node = batadv_orig_hash_find(bat_priv, icmp_header->dst); - if (!orig_node) - goto dst_unreach; - - neigh_node = batadv_orig_router_get(orig_node, - BATADV_IF_DEFAULT); - if (!neigh_node) - goto dst_unreach; - - if (!neigh_node->if_incoming) - goto dst_unreach; - - if (neigh_node->if_incoming->if_status != BATADV_IF_ACTIVE) - goto dst_unreach; - - icmp_packet_rr = (struct batadv_icmp_packet_rr *)icmp_header; - if (packet_len == sizeof(*icmp_packet_rr)) { - addr = neigh_node->if_incoming->net_dev->dev_addr; - ether_addr_copy(icmp_packet_rr->rr[0], addr); - } - - break; - default: - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Error - can't send packet from char device: got unknown message type\n"); - len = -EINVAL; - goto free_skb; - } - - icmp_header->uid = socket_client->index; - - if (icmp_header->version != BATADV_COMPAT_VERSION) { 
- icmp_header->msg_type = BATADV_PARAMETER_PROBLEM; - icmp_header->version = BATADV_COMPAT_VERSION; - batadv_socket_add_packet(socket_client, icmp_header, - packet_len); - goto free_skb; - } - - ether_addr_copy(icmp_header->orig, primary_if->net_dev->dev_addr); - - batadv_send_unicast_skb(skb, neigh_node); - goto out; - -dst_unreach: - icmp_header->msg_type = BATADV_DESTINATION_UNREACHABLE; - batadv_socket_add_packet(socket_client, icmp_header, packet_len); -free_skb: - kfree_skb(skb); -out: - if (primary_if) - batadv_hardif_put(primary_if); - if (neigh_node) - batadv_neigh_node_put(neigh_node); - if (orig_node) - batadv_orig_node_put(orig_node); - return len; -} - -static __poll_t batadv_socket_poll(struct file *file, poll_table *wait) -{ - struct batadv_socket_client *socket_client = file->private_data; - - poll_wait(file, &socket_client->queue_wait, wait); - - if (socket_client->queue_len > 0) - return EPOLLIN | EPOLLRDNORM; - - return 0; -} - -static const struct file_operations batadv_fops = { - .owner = THIS_MODULE, - .open = batadv_socket_open, - .release = batadv_socket_release, - .read = batadv_socket_read, - .write = batadv_socket_write, - .poll = batadv_socket_poll, - .llseek = no_llseek, -}; - -/** - * batadv_socket_setup() - Create debugfs "socket" file - * @bat_priv: the bat priv with all the soft interface information - */ -void batadv_socket_setup(struct batadv_priv *bat_priv) -{ - debugfs_create_file(BATADV_ICMP_SOCKET, 0600, bat_priv->debug_dir, - bat_priv, &batadv_fops); -} - -/** - * batadv_socket_add_packet() - schedule an icmp packet to be sent to - * userspace on an icmp socket. 
- * @socket_client: the socket this packet belongs to - * @icmph: pointer to the header of the icmp packet - * @icmp_len: total length of the icmp packet - */ -static void batadv_socket_add_packet(struct batadv_socket_client *socket_client, - struct batadv_icmp_header *icmph, - size_t icmp_len) -{ - struct batadv_socket_packet *socket_packet; - size_t len; - - socket_packet = kmalloc(sizeof(*socket_packet), GFP_ATOMIC); - - if (!socket_packet) - return; - - len = icmp_len; - /* check the maximum length before filling the buffer */ - if (len > sizeof(socket_packet->icmp_packet)) - len = sizeof(socket_packet->icmp_packet); - - INIT_LIST_HEAD(&socket_packet->list); - memcpy(&socket_packet->icmp_packet, icmph, len); - socket_packet->icmp_len = len; - - spin_lock_bh(&socket_client->lock); - - /* while waiting for the lock the socket_client could have been - * deleted - */ - if (!batadv_socket_client_hash[icmph->uid]) { - spin_unlock_bh(&socket_client->lock); - kfree(socket_packet); - return; - } - - list_add_tail(&socket_packet->list, &socket_client->queue_list); - socket_client->queue_len++; - - if (socket_client->queue_len > 100) { - socket_packet = list_first_entry(&socket_client->queue_list, - struct batadv_socket_packet, - list); - - list_del(&socket_packet->list); - kfree(socket_packet); - socket_client->queue_len--; - } - - spin_unlock_bh(&socket_client->lock); - - wake_up(&socket_client->queue_wait); -} - -/** - * batadv_socket_receive_packet() - schedule an icmp packet to be received - * locally and sent to userspace. 
- * @icmph: pointer to the header of the icmp packet - * @icmp_len: total length of the icmp packet - */ -void batadv_socket_receive_packet(struct batadv_icmp_header *icmph, - size_t icmp_len) -{ - struct batadv_socket_client *hash; - - hash = batadv_socket_client_hash[icmph->uid]; - if (hash) - batadv_socket_add_packet(hash, icmph, icmp_len); -} diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h deleted file mode 100644 index 6abd0f4742ef..000000000000 --- a/net/batman-adv/icmp_socket.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: - * - * Marek Lindner - */ - -#ifndef _NET_BATMAN_ADV_ICMP_SOCKET_H_ -#define _NET_BATMAN_ADV_ICMP_SOCKET_H_ - -#include "main.h" - -#include <linux/types.h> -#include <uapi/linux/batadv_packet.h> - -#define BATADV_ICMP_SOCKET "socket" - -void batadv_socket_setup(struct batadv_priv *bat_priv); - -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - -void batadv_socket_init(void); -void batadv_socket_receive_packet(struct batadv_icmp_header *icmph, - size_t icmp_len); - -#else - -static inline void batadv_socket_init(void) -{ -} - -static inline void -batadv_socket_receive_packet(struct batadv_icmp_header *icmph, size_t icmp_len) -{ -} - -#endif - -#endif /* _NET_BATMAN_ADV_ICMP_SOCKET_H_ */ diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index a67b2b091447..f0e5d1429662 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Marek Lindner */ @@ -7,213 +7,10 @@ #include "log.h" #include "main.h" -#include <linux/compiler.h> -#include <linux/debugfs.h> -#include <linux/errno.h> -#include <linux/eventpoll.h> -#include <linux/export.h> -#include <linux/fcntl.h> -#include <linux/fs.h> -#include <linux/gfp.h> -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/poll.h> -#include <linux/sched.h> /* for linux/wait.h */ -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/stddef.h> -#include <linux/types.h> -#include <linux/uaccess.h> -#include <linux/wait.h> #include <stdarg.h> -#include "debugfs.h" #include "trace.h" -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - -#define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1) - -static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN; - -static char *batadv_log_char_addr(struct batadv_priv_debug_log *debug_log, - size_t idx) -{ - return &debug_log->log_buff[idx & BATADV_LOG_BUFF_MASK]; -} - -static void batadv_emit_log_char(struct batadv_priv_debug_log *debug_log, - char c) -{ - char *char_addr; - - char_addr = batadv_log_char_addr(debug_log, debug_log->log_end); - *char_addr = c; - debug_log->log_end++; - - if (debug_log->log_end - debug_log->log_start > batadv_log_buff_len) - debug_log->log_start = debug_log->log_end - batadv_log_buff_len; -} - -__printf(2, 3) -static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log, - const char *fmt, ...) 
-{ - va_list args; - static char debug_log_buf[256]; - char *p; - - if (!debug_log) - return 0; - - spin_lock_bh(&debug_log->lock); - va_start(args, fmt); - vscnprintf(debug_log_buf, sizeof(debug_log_buf), fmt, args); - va_end(args); - - for (p = debug_log_buf; *p != 0; p++) - batadv_emit_log_char(debug_log, *p); - - spin_unlock_bh(&debug_log->lock); - - wake_up(&debug_log->queue_wait); - - return 0; -} - -static int batadv_log_open(struct inode *inode, struct file *file) -{ - if (!try_module_get(THIS_MODULE)) - return -EBUSY; - - batadv_debugfs_deprecated(file, - "Use tracepoint batadv:batadv_dbg instead\n"); - - stream_open(inode, file); - file->private_data = inode->i_private; - return 0; -} - -static int batadv_log_release(struct inode *inode, struct file *file) -{ - module_put(THIS_MODULE); - return 0; -} - -static bool batadv_log_empty(struct batadv_priv_debug_log *debug_log) -{ - return !(debug_log->log_start - debug_log->log_end); -} - -static ssize_t batadv_log_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct batadv_priv *bat_priv = file->private_data; - struct batadv_priv_debug_log *debug_log = bat_priv->debug_log; - int error, i = 0; - char *char_addr; - char c; - - if ((file->f_flags & O_NONBLOCK) && batadv_log_empty(debug_log)) - return -EAGAIN; - - if (!buf) - return -EINVAL; - - if (count == 0) - return 0; - - if (!access_ok(buf, count)) - return -EFAULT; - - error = wait_event_interruptible(debug_log->queue_wait, - (!batadv_log_empty(debug_log))); - - if (error) - return error; - - spin_lock_bh(&debug_log->lock); - - while ((!error) && (i < count) && - (debug_log->log_start != debug_log->log_end)) { - char_addr = batadv_log_char_addr(debug_log, - debug_log->log_start); - c = *char_addr; - - debug_log->log_start++; - - spin_unlock_bh(&debug_log->lock); - - error = __put_user(c, buf); - - spin_lock_bh(&debug_log->lock); - - buf++; - i++; - } - - spin_unlock_bh(&debug_log->lock); - - if (!error) - return i; - - return 
error; -} - -static __poll_t batadv_log_poll(struct file *file, poll_table *wait) -{ - struct batadv_priv *bat_priv = file->private_data; - struct batadv_priv_debug_log *debug_log = bat_priv->debug_log; - - poll_wait(file, &debug_log->queue_wait, wait); - - if (!batadv_log_empty(debug_log)) - return EPOLLIN | EPOLLRDNORM; - - return 0; -} - -static const struct file_operations batadv_log_fops = { - .open = batadv_log_open, - .release = batadv_log_release, - .read = batadv_log_read, - .poll = batadv_log_poll, - .llseek = no_llseek, -}; - -/** - * batadv_debug_log_setup() - Initialize debug log - * @bat_priv: the bat priv with all the soft interface information - * - * Return: 0 on success or negative error number in case of failure - */ -int batadv_debug_log_setup(struct batadv_priv *bat_priv) -{ - bat_priv->debug_log = kzalloc(sizeof(*bat_priv->debug_log), GFP_ATOMIC); - if (!bat_priv->debug_log) - return -ENOMEM; - - spin_lock_init(&bat_priv->debug_log->lock); - init_waitqueue_head(&bat_priv->debug_log->queue_wait); - - debugfs_create_file("log", 0400, bat_priv->debug_dir, bat_priv, - &batadv_log_fops); - return 0; -} - -/** - * batadv_debug_log_cleanup() - Destroy debug log - * @bat_priv: the bat priv with all the soft interface information - */ -void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) -{ - kfree(bat_priv->debug_log); - bat_priv->debug_log = NULL; -} - -#endif /* CONFIG_BATMAN_ADV_DEBUGFS */ - /** * batadv_debug_log() - Add debug log entry * @bat_priv: the bat priv with all the soft interface information @@ -231,11 +28,6 @@ int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) 
vaf.fmt = fmt; vaf.va = &args; -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - batadv_fdebug_log(bat_priv->debug_log, "[%10u] %pV", - jiffies_to_msecs(jiffies), &vaf); -#endif - trace_batadv_dbg(bat_priv, &vaf); va_end(args); diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 979864c0fa6b..6717c965f0fa 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 70fee9b42e25..e48f7ac8a854 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -23,12 +23,12 @@ #include <linux/kobject.h> #include <linux/kref.h> #include <linux/list.h> +#include <linux/minmax.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rcupdate.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -44,12 +44,10 @@ #include "bat_iv_ogm.h" #include "bat_v.h" #include "bridge_loop_avoidance.h" -#include "debugfs.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "gateway_common.h" #include "hard-interface.h" -#include "icmp_socket.h" #include "log.h" #include "multicast.h" #include "netlink.h" @@ -113,9 +111,6 @@ static int __init batadv_init(void) if (!batadv_event_workqueue) goto err_create_wq; - batadv_socket_init(); - batadv_debugfs_init(); - register_netdevice_notifier(&batadv_hard_if_notifier); rtnl_link_register(&batadv_link_ops); batadv_netlink_register(); @@ -133,7 +128,6 @@ err_create_wq: static void __exit batadv_exit(void) { - batadv_debugfs_destroy(); 
batadv_netlink_unregister(); rtnl_link_unregister(&batadv_link_ops); unregister_netdevice_notifier(&batadv_hard_if_notifier); @@ -305,44 +299,6 @@ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) return is_my_mac; } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS -/** - * batadv_seq_print_text_primary_if_get() - called from debugfs table printing - * function that requires the primary interface - * @seq: debugfs table seq_file struct - * - * Return: primary interface if found or NULL otherwise. - */ -struct batadv_hard_iface * -batadv_seq_print_text_primary_if_get(struct seq_file *seq) -{ - struct net_device *net_dev = (struct net_device *)seq->private; - struct batadv_priv *bat_priv = netdev_priv(net_dev); - struct batadv_hard_iface *primary_if; - - primary_if = batadv_primary_if_get_selected(bat_priv); - - if (!primary_if) { - seq_printf(seq, - "BATMAN mesh %s disabled - please specify interfaces to enable it\n", - net_dev->name); - goto out; - } - - if (primary_if->if_status == BATADV_IF_ACTIVE) - goto out; - - seq_printf(seq, - "BATMAN mesh %s disabled - primary interface not active\n", - net_dev->name); - batadv_hardif_put(primary_if); - primary_if = NULL; - -out: - return primary_if; -} -#endif - /** * batadv_max_header_len() - calculate maximum encapsulation overhead for a * payload packet diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index a47dc332d796..8f0102b71656 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2020.4" +#define BATADV_SOURCE_VERSION "2021.1" #endif /* B.A.T.M.A.N. 
parameters */ @@ -212,7 +212,6 @@ enum batadv_uev_type { #include <linux/jiffies.h> #include <linux/netdevice.h> #include <linux/percpu.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> @@ -243,8 +242,6 @@ extern struct workqueue_struct *batadv_event_workqueue; int batadv_mesh_init(struct net_device *soft_iface); void batadv_mesh_free(struct net_device *soft_iface); bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr); -struct batadv_hard_iface * -batadv_seq_print_text_primary_if_get(struct seq_file *seq); int batadv_max_header_len(void); void batadv_skb_set_priority(struct sk_buff *skb, int offset); int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 9af99c39b9fd..28166402d30c 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2014-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Linus Lüssing */ @@ -33,7 +33,6 @@ #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rcupdate.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -829,7 +828,7 @@ batadv_mcast_bridge_log(struct batadv_priv *bat_priv, } /** - * batadv_mcast_flags_logs() - output debug information about mcast flag changes + * batadv_mcast_flags_log() - output debug information about mcast flag changes * @bat_priv: the bat priv with all the soft interface information * @flags: TVLV flags indicating the new multicast state * @@ -2074,116 +2073,6 @@ void batadv_mcast_init(struct batadv_priv *bat_priv) batadv_mcast_start_timer(bat_priv); } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS -/** - * batadv_mcast_flags_print_header() - print own mcast flags to debugfs table - * @bat_priv: the bat priv with all the soft interface information - * @seq: debugfs table seq_file struct - * - * Prints our own multicast flags including a more specific reason why - * they are set, that is prints the bridge and querier state too, to - * the debugfs table specified via @seq. - */ -static void batadv_mcast_flags_print_header(struct batadv_priv *bat_priv, - struct seq_file *seq) -{ - struct batadv_mcast_mla_flags *mla_flags = &bat_priv->mcast.mla_flags; - char querier4, querier6, shadowing4, shadowing6; - bool bridged = mla_flags->bridged; - u8 flags = mla_flags->tvlv_flags; - - if (bridged) { - querier4 = mla_flags->querier_ipv4.exists ? '.' : '4'; - querier6 = mla_flags->querier_ipv6.exists ? '.' : '6'; - shadowing4 = mla_flags->querier_ipv4.shadowing ? '4' : '.'; - shadowing6 = mla_flags->querier_ipv6.shadowing ? '6' : '.'; - } else { - querier4 = '?'; - querier6 = '?'; - shadowing4 = '?'; - shadowing6 = '?'; - } - - seq_printf(seq, "Multicast flags (own flags: [%c%c%c%s%s])\n", - (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', - (flags & BATADV_MCAST_WANT_ALL_IPV4) ? 
'4' : '.', - (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', - !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", - !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". "); - seq_printf(seq, "* Bridged [U]\t\t\t\t%c\n", bridged ? 'U' : '.'); - seq_printf(seq, "* No IGMP/MLD Querier [4/6]:\t\t%c/%c\n", - querier4, querier6); - seq_printf(seq, "* Shadowing IGMP/MLD Querier [4/6]:\t%c/%c\n", - shadowing4, shadowing6); - seq_puts(seq, "-------------------------------------------\n"); - seq_printf(seq, " %-10s %s\n", "Originator", "Flags"); -} - -/** - * batadv_mcast_flags_seq_print_text() - print the mcast flags of other nodes - * @seq: seq file to print on - * @offset: not used - * - * This prints a table of (primary) originators and their according - * multicast flags, including (in the header) our own. - * - * Return: always 0 - */ -int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset) -{ - struct net_device *net_dev = (struct net_device *)seq->private; - struct batadv_priv *bat_priv = netdev_priv(net_dev); - struct batadv_hard_iface *primary_if; - struct batadv_hashtable *hash = bat_priv->orig_hash; - struct batadv_orig_node *orig_node; - struct hlist_head *head; - u8 flags; - u32 i; - - primary_if = batadv_seq_print_text_primary_if_get(seq); - if (!primary_if) - return 0; - - batadv_mcast_flags_print_header(bat_priv, seq); - - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, head, hash_entry) { - if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, - &orig_node->capa_initialized)) - continue; - - if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, - &orig_node->capabilities)) { - seq_printf(seq, "%pM -\n", orig_node->orig); - continue; - } - - flags = orig_node->mcast_flags; - - seq_printf(seq, "%pM [%c%c%c%s%s]\n", orig_node->orig, - (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) - ? 'U' : '.', - (flags & BATADV_MCAST_WANT_ALL_IPV4) - ? '4' : '.', - (flags & BATADV_MCAST_WANT_ALL_IPV6) - ? 
'6' : '.', - !(flags & BATADV_MCAST_WANT_NO_RTR4) - ? "R4" : ". ", - !(flags & BATADV_MCAST_WANT_NO_RTR6) - ? "R6" : ". "); - } - rcu_read_unlock(); - } - - batadv_hardif_put(primary_if); - - return 0; -} -#endif - /** * batadv_mcast_mesh_info_put() - put multicast info into a netlink message * @msg: buffer for the message diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 3e114bc5ca3b..9fee5da08311 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2014-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Linus Lüssing */ @@ -10,7 +10,6 @@ #include "main.h" #include <linux/netlink.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> /** @@ -56,8 +55,6 @@ int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, void batadv_mcast_init(struct batadv_priv *bat_priv); -int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset); - int batadv_mcast_mesh_info_put(struct sk_buff *msg, struct batadv_priv *bat_priv); diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index c7a55647b520..f317d206b411 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2016-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Matthias Schiffer */ @@ -23,6 +23,7 @@ #include <linux/kernel.h> #include <linux/limits.h> #include <linux/list.h> +#include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/printk.h> @@ -192,7 +193,7 @@ static int batadv_netlink_mesh_fill_ap_isolation(struct sk_buff *msg, } /** - * batadv_option_set_ap_isolation() - Set ap_isolation from genl msg + * batadv_netlink_set_mesh_ap_isolation() - Set ap_isolation from genl msg * @attr: parsed BATADV_ATTR_AP_ISOLATION_ENABLED attribute * @bat_priv: the bat priv with all the soft interface information * @@ -756,7 +757,7 @@ batadv_netlink_tp_meter_start(struct sk_buff *skb, struct genl_info *info) } /** - * batadv_netlink_tp_meter_start() - Cancel a running tp_meter session + * batadv_netlink_tp_meter_cancel() - Cancel a running tp_meter session * @skb: received netlink message * @info: receiver information * diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index 7ee48f916997..48102cc7490c 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2016-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Matthias Schiffer */ diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 61ddd6d709a0..4bb76b434d07 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen */ @@ -11,7 +11,6 @@ #include <linux/bitops.h> #include <linux/byteorder/generic.h> #include <linux/compiler.h> -#include <linux/debugfs.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> @@ -30,7 +29,6 @@ #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rcupdate.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -39,7 +37,6 @@ #include <linux/workqueue.h> #include <uapi/linux/batadv_packet.h> -#include "hard-interface.h" #include "hash.h" #include "log.h" #include "originator.h" @@ -1876,87 +1873,3 @@ void batadv_nc_mesh_free(struct batadv_priv *bat_priv) batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, NULL); batadv_hash_destroy(bat_priv->nc.decoding_hash); } - -#ifdef CONFIG_BATMAN_ADV_DEBUGFS -/** - * batadv_nc_nodes_seq_print_text() - print the nc node information - * @seq: seq file to print on - * @offset: not used - * - * Return: always 0 - */ -int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset) -{ - struct net_device *net_dev = (struct net_device *)seq->private; - struct batadv_priv *bat_priv = netdev_priv(net_dev); - struct batadv_hashtable *hash = bat_priv->orig_hash; - struct batadv_hard_iface *primary_if; - struct hlist_head *head; - struct batadv_orig_node *orig_node; - struct batadv_nc_node *nc_node; - int i; - - primary_if = batadv_seq_print_text_primary_if_get(seq); - if (!primary_if) - goto out; - - /* Traverse list of originators */ - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - /* For each orig_node in this bin */ - rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, head, hash_entry) { - /* no need to print the orig node if it does not have - * network coding neighbors - */ - if (list_empty(&orig_node->in_coding_list) && - list_empty(&orig_node->out_coding_list)) - continue; - - seq_printf(seq, "Node: %pM\n", orig_node->orig); 
- - seq_puts(seq, " Ingoing: "); - /* For each in_nc_node to this orig_node */ - list_for_each_entry_rcu(nc_node, - &orig_node->in_coding_list, - list) - seq_printf(seq, "%pM ", - nc_node->addr); - seq_puts(seq, "\n Outgoing: "); - /* For out_nc_node to this orig_node */ - list_for_each_entry_rcu(nc_node, - &orig_node->out_coding_list, - list) - seq_printf(seq, "%pM ", - nc_node->addr); - seq_puts(seq, "\n\n"); - } - rcu_read_unlock(); - } - -out: - if (primary_if) - batadv_hardif_put(primary_if); - return 0; -} - -/** - * batadv_nc_init_debugfs() - create nc folder and related files in debugfs - * @bat_priv: the bat priv with all the soft interface information - */ -void batadv_nc_init_debugfs(struct batadv_priv *bat_priv) -{ - struct dentry *nc_dir; - - nc_dir = debugfs_create_dir("nc", bat_priv->debug_dir); - - debugfs_create_u8("min_tq", 0644, nc_dir, &bat_priv->nc.min_tq); - - debugfs_create_u32("max_fwd_delay", 0644, nc_dir, - &bat_priv->nc.max_fwd_delay); - - debugfs_create_u32("max_buffer_time", 0644, nc_dir, - &bat_priv->nc.max_buffer_time); -} -#endif diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 334289084127..368cc3130e4c 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen */ @@ -10,7 +10,6 @@ #include "main.h" #include <linux/netdevice.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> @@ -38,8 +37,6 @@ void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, struct sk_buff *skb); void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, struct sk_buff *skb); -int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset); -void batadv_nc_init_debugfs(struct batadv_priv *bat_priv); #else /* ifdef CONFIG_BATMAN_ADV_NC */ @@ -104,16 +101,6 @@ batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, { } -static inline int batadv_nc_nodes_seq_print_text(struct seq_file *seq, - void *offset) -{ - return 0; -} - -static inline void batadv_nc_init_debugfs(struct batadv_priv *bat_priv) -{ -} - #endif /* ifdef CONFIG_BATMAN_ADV_NC */ #endif /* _NET_BATMAN_ADV_NETWORK_CODING_H_ */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 805d8969bdfb..da7249448474 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich */ @@ -20,7 +20,6 @@ #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -733,42 +732,6 @@ batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node, return batadv_neigh_node_create(orig_node, hard_iface, neigh_addr); } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS -/** - * batadv_hardif_neigh_seq_print_text() - print the single hop neighbour list - * @seq: neighbour table seq_file struct - * @offset: not used - * - * Return: always 0 - */ -int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) -{ - struct net_device *net_dev = (struct net_device *)seq->private; - struct batadv_priv *bat_priv = netdev_priv(net_dev); - struct batadv_hard_iface *primary_if; - - primary_if = batadv_seq_print_text_primary_if_get(seq); - if (!primary_if) - return 0; - - seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n", - BATADV_SOURCE_VERSION, primary_if->net_dev->name, - primary_if->net_dev->dev_addr, net_dev->name, - bat_priv->algo_ops->name); - - batadv_hardif_put(primary_if); - - if (!bat_priv->algo_ops->neigh.print) { - seq_puts(seq, - "No printing function for this routing protocol\n"); - return 0; - } - - bat_priv->algo_ops->neigh.print(bat_priv, seq); - return 0; -} -#endif - /** * batadv_hardif_neigh_dump() - Dump to netlink the neighbor infos for a * specific outgoing interface @@ -1382,90 +1345,6 @@ static void batadv_purge_orig(struct work_struct *work) msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD)); } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - -/** - * batadv_orig_seq_print_text() - Print the originator table in a seq file - * @seq: seq file to print on - * @offset: not used - * - * Return: always 0 - */ -int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) -{ - struct net_device *net_dev = (struct net_device *)seq->private; - struct batadv_priv 
*bat_priv = netdev_priv(net_dev); - struct batadv_hard_iface *primary_if; - - primary_if = batadv_seq_print_text_primary_if_get(seq); - if (!primary_if) - return 0; - - seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n", - BATADV_SOURCE_VERSION, primary_if->net_dev->name, - primary_if->net_dev->dev_addr, net_dev->name, - bat_priv->algo_ops->name); - - batadv_hardif_put(primary_if); - - if (!bat_priv->algo_ops->orig.print) { - seq_puts(seq, - "No printing function for this routing protocol\n"); - return 0; - } - - bat_priv->algo_ops->orig.print(bat_priv, seq, BATADV_IF_DEFAULT); - - return 0; -} - -/** - * batadv_orig_hardif_seq_print_text() - writes originator infos for a specific - * outgoing interface - * @seq: debugfs table seq_file struct - * @offset: not used - * - * Return: 0 - */ -int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset) -{ - struct net_device *net_dev = (struct net_device *)seq->private; - struct batadv_hard_iface *hard_iface; - struct batadv_priv *bat_priv; - - hard_iface = batadv_hardif_get_by_netdev(net_dev); - - if (!hard_iface || !hard_iface->soft_iface) { - seq_puts(seq, "Interface not known to B.A.T.M.A.N.\n"); - goto out; - } - - bat_priv = netdev_priv(hard_iface->soft_iface); - if (!bat_priv->algo_ops->orig.print) { - seq_puts(seq, - "No printing function for this routing protocol\n"); - goto out; - } - - if (hard_iface->if_status != BATADV_IF_ACTIVE) { - seq_puts(seq, "Interface not active\n"); - goto out; - } - - seq_printf(seq, "[B.A.T.M.A.N. 
adv %s, IF/MAC: %s/%pM (%s %s)]\n", - BATADV_SOURCE_VERSION, hard_iface->net_dev->name, - hard_iface->net_dev->dev_addr, - hard_iface->soft_iface->name, bat_priv->algo_ops->name); - - bat_priv->algo_ops->orig.print(bat_priv, seq, hard_iface); - -out: - if (hard_iface) - batadv_hardif_put(hard_iface); - return 0; -} -#endif - /** * batadv_orig_dump() - Dump to netlink the originator infos for a specific * outgoing interface diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 7bc01c138b3a..805be87d55b8 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -13,7 +13,6 @@ #include <linux/if_ether.h> #include <linux/jhash.h> #include <linux/netlink.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/types.h> @@ -46,7 +45,6 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo); int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb); -int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset); struct batadv_orig_ifinfo * batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, @@ -56,9 +54,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing); void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo); -int batadv_orig_seq_print_text(struct seq_file *seq, void *offset); int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb); -int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset); struct batadv_orig_node_vlan * batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, unsigned short vid); diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 9e5c71e406ff..40f5cffde6a3 100644 --- 
a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -29,7 +29,6 @@ #include "distributed-arp-table.h" #include "fragmentation.h" #include "hard-interface.h" -#include "icmp_socket.h" #include "log.h" #include "network-coding.h" #include "originator.h" @@ -227,15 +226,6 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv, icmph = (struct batadv_icmp_header *)skb->data; switch (icmph->msg_type) { - case BATADV_ECHO_REPLY: - case BATADV_DESTINATION_UNREACHABLE: - case BATADV_TTL_EXCEEDED: - /* receive the packet */ - if (skb_linearize(skb) < 0) - break; - - batadv_socket_receive_packet(icmph, skb->len); - break; case BATADV_ECHO_REQUEST: /* answer echo request (ping) */ primary_if = batadv_primary_if_get_selected(bat_priv); diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 2ed49db6eff5..5f387786e9a7 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 87017332b567..157abe92d827 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 0d36e15589f6..2b0daf8b2bc4 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 82e7ca886605..6b8181bc3122 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -30,7 +30,6 @@ #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> -#include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/socket.h> @@ -38,12 +37,12 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> +#include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bridge_loop_avoidance.h" -#include "debugfs.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "hard-interface.h" @@ -51,7 +50,6 @@ #include "network-coding.h" #include "originator.h" #include "send.h" -#include "sysfs.h" #include "translation-table.h" /** @@ -574,7 +572,6 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_softif_vlan *vlan; - int err; spin_lock_bh(&bat_priv->softif_vlan_list_lock); @@ -601,19 +598,6 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); spin_unlock_bh(&bat_priv->softif_vlan_list_lock); - /* batadv_sysfs_add_vlan cannot be in the spinlock section due to the - * sleeping behavior of the sysfs functions and the fs_reclaim lock - */ - err = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan); - if (err) { - /* ref for the function */ - batadv_softif_vlan_put(vlan); - - /* ref for the list */ - batadv_softif_vlan_put(vlan); - return err; - } - 
/* add a new TT local entry. This one will be marked with the NOPURGE * flag */ @@ -641,7 +625,6 @@ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, batadv_tt_local_remove(bat_priv, bat_priv->soft_iface->dev_addr, vlan->vid, "vlan interface destroyed", false); - batadv_sysfs_del_vlan(bat_priv, vlan); batadv_softif_vlan_put(vlan); } @@ -661,7 +644,6 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; - int ret; /* only 802.1Q vlans are supported. * batman-adv does not know how to handle other types @@ -681,17 +663,6 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, if (!vlan) return batadv_softif_create_vlan(bat_priv, vid); - /* recreate the sysfs object if it was already destroyed (and it should - * be since we received a kill_vid() for this vlan - */ - if (!vlan->kobj) { - ret = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan); - if (ret) { - batadv_softif_vlan_put(vlan); - return ret; - } - } - /* add a new TT local entry. This one will be marked with the NOPURGE * flag. 
This must be added again, even if the vlan object already * exists, because the entry was deleted by kill_vid() @@ -845,22 +816,18 @@ static int batadv_softif_init_late(struct net_device *dev) batadv_nc_init_bat_priv(bat_priv); - ret = batadv_algo_select(bat_priv, batadv_routing_algo); - if (ret < 0) - goto free_bat_counters; - - ret = batadv_debugfs_add_meshif(dev); - if (ret < 0) - goto free_bat_counters; + if (!bat_priv->algo_ops) { + ret = batadv_algo_select(bat_priv, batadv_routing_algo); + if (ret < 0) + goto free_bat_counters; + } ret = batadv_mesh_init(dev); if (ret < 0) - goto unreg_debugfs; + goto free_bat_counters; return 0; -unreg_debugfs: - batadv_debugfs_del_meshif(dev); free_bat_counters: free_percpu(bat_priv->bat_counters); bat_priv->bat_counters = NULL; @@ -914,7 +881,7 @@ static int batadv_softif_slave_del(struct net_device *dev, if (!hard_iface || hard_iface->soft_iface != dev) goto out; - batadv_hardif_disable_interface(hard_iface, BATADV_IF_CLEANUP_KEEP); + batadv_hardif_disable_interface(hard_iface); ret = 0; out: @@ -1037,7 +1004,6 @@ static const struct ethtool_ops batadv_ethtool_ops = { */ static void batadv_softif_free(struct net_device *dev) { - batadv_debugfs_del_meshif(dev); batadv_mesh_free(dev); /* some scheduled RCU callbacks need the bat_priv struct to accomplish @@ -1074,6 +1040,59 @@ static void batadv_softif_init_early(struct net_device *dev) } /** + * batadv_softif_validate() - validate configuration of new batadv link + * @tb: IFLA_INFO_DATA netlink attributes + * @data: enum batadv_ifla_attrs attributes + * @extack: extended ACK report struct + * + * Return: 0 if successful or error otherwise. 
+ */ +static int batadv_softif_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct batadv_algo_ops *algo_ops; + + if (!data) + return 0; + + if (data[IFLA_BATADV_ALGO_NAME]) { + algo_ops = batadv_algo_get(nla_data(data[IFLA_BATADV_ALGO_NAME])); + if (!algo_ops) + return -EINVAL; + } + + return 0; +} + +/** + * batadv_softif_newlink() - pre-initialize and register new batadv link + * @src_net: the applicable net namespace + * @dev: network device to register + * @tb: IFLA_INFO_DATA netlink attributes + * @data: enum batadv_ifla_attrs attributes + * @extack: extended ACK report struct + * + * Return: 0 if successful or error otherwise. + */ +static int batadv_softif_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct batadv_priv *bat_priv = netdev_priv(dev); + const char *algo_name; + int err; + + if (data && data[IFLA_BATADV_ALGO_NAME]) { + algo_name = nla_data(data[IFLA_BATADV_ALGO_NAME]); + err = batadv_algo_select(bat_priv, algo_name); + if (err) + return -EINVAL; + } + + return register_netdevice(dev); +} + +/** * batadv_softif_create() - Create and register soft interface * @net: the applicable net namespace * @name: name of the new soft interface @@ -1106,28 +1125,6 @@ struct net_device *batadv_softif_create(struct net *net, const char *name) } /** - * batadv_softif_destroy_sysfs() - deletion of batadv_soft_interface via sysfs - * @soft_iface: the to-be-removed batman-adv interface - */ -void batadv_softif_destroy_sysfs(struct net_device *soft_iface) -{ - struct batadv_priv *bat_priv = netdev_priv(soft_iface); - struct batadv_softif_vlan *vlan; - - ASSERT_RTNL(); - - /* destroy the "untagged" VLAN */ - vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); - if (vlan) { - batadv_softif_destroy_vlan(bat_priv, vlan); - batadv_softif_vlan_put(vlan); - } - - batadv_sysfs_del_meshif(soft_iface); - 
unregister_netdevice(soft_iface); -} - -/** * batadv_softif_destroy_netlink() - deletion of batadv_soft_interface via * netlink * @soft_iface: the to-be-removed batman-adv interface @@ -1142,8 +1139,7 @@ static void batadv_softif_destroy_netlink(struct net_device *soft_iface, list_for_each_entry(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface == soft_iface) - batadv_hardif_disable_interface(hard_iface, - BATADV_IF_CLEANUP_KEEP); + batadv_hardif_disable_interface(hard_iface); } /* destroy the "untagged" VLAN */ @@ -1153,7 +1149,6 @@ static void batadv_softif_destroy_netlink(struct net_device *soft_iface, batadv_softif_vlan_put(vlan); } - batadv_sysfs_del_meshif(soft_iface); unregister_netdevice_queue(soft_iface, head); } @@ -1171,9 +1166,17 @@ bool batadv_softif_is_valid(const struct net_device *net_dev) return false; } +static const struct nla_policy batadv_ifla_policy[IFLA_BATADV_MAX + 1] = { + [IFLA_BATADV_ALGO_NAME] = { .type = NLA_NUL_STRING }, +}; + struct rtnl_link_ops batadv_link_ops __read_mostly = { .kind = "batadv", .priv_size = sizeof(struct batadv_priv), .setup = batadv_softif_init_early, + .maxtype = IFLA_BATADV_MAX, + .policy = batadv_ifla_policy, + .validate = batadv_softif_validate, + .newlink = batadv_softif_newlink, .dellink = batadv_softif_destroy_netlink, }; diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 534e08d6ad91..38b0ad182584 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Marek Lindner */ @@ -20,7 +20,6 @@ void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node); struct net_device *batadv_softif_create(struct net *net, const char *name); -void batadv_softif_destroy_sysfs(struct net_device *soft_iface); bool batadv_softif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c deleted file mode 100644 index 0f962dcd239e..000000000000 --- a/net/batman-adv/sysfs.c +++ /dev/null @@ -1,1272 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: - * - * Marek Lindner - */ - -#include "sysfs.h" -#include "main.h" - -#include <asm/current.h> -#include <linux/atomic.h> -#include <linux/compiler.h> -#include <linux/device.h> -#include <linux/errno.h> -#include <linux/gfp.h> -#include <linux/if.h> -#include <linux/if_vlan.h> -#include <linux/kernel.h> -#include <linux/kobject.h> -#include <linux/kref.h> -#include <linux/limits.h> -#include <linux/netdevice.h> -#include <linux/printk.h> -#include <linux/rculist.h> -#include <linux/rcupdate.h> -#include <linux/rtnetlink.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/stddef.h> -#include <linux/string.h> -#include <linux/stringify.h> -#include <linux/workqueue.h> -#include <uapi/linux/batadv_packet.h> -#include <uapi/linux/batman_adv.h> - -#include "bridge_loop_avoidance.h" -#include "distributed-arp-table.h" -#include "gateway_client.h" -#include "gateway_common.h" -#include "hard-interface.h" -#include "log.h" -#include "netlink.h" -#include "network-coding.h" -#include "soft-interface.h" - -/** - * batadv_sysfs_deprecated() - Log use of deprecated batadv sysfs access - * @attr: attribute which was accessed - */ -static void batadv_sysfs_deprecated(struct attribute 
*attr) -{ - pr_warn_ratelimited(DEPRECATED "%s (pid %d) Use of sysfs file \"%s\".\nUse batadv genl family instead", - current->comm, task_pid_nr(current), attr->name); -} - -static struct net_device *batadv_kobj_to_netdev(struct kobject *obj) -{ - struct device *dev = container_of(obj->parent, struct device, kobj); - - return to_net_dev(dev); -} - -static struct batadv_priv *batadv_kobj_to_batpriv(struct kobject *obj) -{ - struct net_device *net_dev = batadv_kobj_to_netdev(obj); - - return netdev_priv(net_dev); -} - -/** - * batadv_vlan_kobj_to_batpriv() - convert a vlan kobj in the associated batpriv - * @obj: kobject to covert - * - * Return: the associated batadv_priv struct. - */ -static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj) -{ - /* VLAN specific attributes are located in the root sysfs folder if they - * refer to the untagged VLAN.. - */ - if (!strcmp(BATADV_SYSFS_IF_MESH_SUBDIR, obj->name)) - return batadv_kobj_to_batpriv(obj); - - /* ..while the attributes for the tagged vlans are located in - * the in the corresponding "vlan%VID" subfolder - */ - return batadv_kobj_to_batpriv(obj->parent); -} - -/** - * batadv_kobj_to_vlan() - convert a kobj in the associated softif_vlan struct - * @bat_priv: the bat priv with all the soft interface information - * @obj: kobject to covert - * - * Return: the associated softif_vlan struct if found, NULL otherwise. 
- */ -static struct batadv_softif_vlan * -batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj) -{ - struct batadv_softif_vlan *vlan_tmp, *vlan = NULL; - - rcu_read_lock(); - hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->softif_vlan_list, list) { - if (vlan_tmp->kobj != obj) - continue; - - if (!kref_get_unless_zero(&vlan_tmp->refcount)) - continue; - - vlan = vlan_tmp; - break; - } - rcu_read_unlock(); - - return vlan; -} - -/* Use this, if you have customized show and store functions for vlan attrs */ -#define BATADV_ATTR_VLAN(_name, _mode, _show, _store) \ -struct batadv_attribute batadv_attr_vlan_##_name = { \ - .attr = {.name = __stringify(_name), \ - .mode = _mode }, \ - .show = _show, \ - .store = _store, \ -} - -/* Use this, if you have customized show and store functions */ -#define BATADV_ATTR(_name, _mode, _show, _store) \ -struct batadv_attribute batadv_attr_##_name = { \ - .attr = {.name = __stringify(_name), \ - .mode = _mode }, \ - .show = _show, \ - .store = _store, \ -} - -#define BATADV_ATTR_SIF_STORE_BOOL(_name, _post_func) \ -ssize_t batadv_store_##_name(struct kobject *kobj, \ - struct attribute *attr, char *buff, \ - size_t count) \ -{ \ - struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \ - struct batadv_priv *bat_priv = netdev_priv(net_dev); \ - ssize_t length; \ - \ - batadv_sysfs_deprecated(attr); \ - length = __batadv_store_bool_attr(buff, count, _post_func, attr,\ - &bat_priv->_name, net_dev); \ - \ - batadv_netlink_notify_mesh(bat_priv); \ - \ - return length; \ -} - -#define BATADV_ATTR_SIF_SHOW_BOOL(_name) \ -ssize_t batadv_show_##_name(struct kobject *kobj, \ - struct attribute *attr, char *buff) \ -{ \ - struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); \ - \ - batadv_sysfs_deprecated(attr); \ - return sprintf(buff, "%s\n", \ - atomic_read(&bat_priv->_name) == 0 ? 
\ - "disabled" : "enabled"); \ -} \ - -/* Use this, if you are going to turn a [name] in the soft-interface - * (bat_priv) on or off - */ -#define BATADV_ATTR_SIF_BOOL(_name, _mode, _post_func) \ - static BATADV_ATTR_SIF_STORE_BOOL(_name, _post_func) \ - static BATADV_ATTR_SIF_SHOW_BOOL(_name) \ - static BATADV_ATTR(_name, _mode, batadv_show_##_name, \ - batadv_store_##_name) - -#define BATADV_ATTR_SIF_STORE_UINT(_name, _var, _min, _max, _post_func) \ -ssize_t batadv_store_##_name(struct kobject *kobj, \ - struct attribute *attr, char *buff, \ - size_t count) \ -{ \ - struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \ - struct batadv_priv *bat_priv = netdev_priv(net_dev); \ - ssize_t length; \ - \ - batadv_sysfs_deprecated(attr); \ - length = __batadv_store_uint_attr(buff, count, _min, _max, \ - _post_func, attr, \ - &bat_priv->_var, net_dev, \ - NULL); \ - \ - batadv_netlink_notify_mesh(bat_priv); \ - \ - return length; \ -} - -#define BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \ -ssize_t batadv_show_##_name(struct kobject *kobj, \ - struct attribute *attr, char *buff) \ -{ \ - struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); \ - \ - batadv_sysfs_deprecated(attr); \ - return sprintf(buff, "%i\n", atomic_read(&bat_priv->_var)); \ -} \ - -/* Use this, if you are going to set [name] in the soft-interface - * (bat_priv) to an unsigned integer value - */ -#define BATADV_ATTR_SIF_UINT(_name, _var, _mode, _min, _max, _post_func)\ - static BATADV_ATTR_SIF_STORE_UINT(_name, _var, _min, _max, _post_func)\ - static BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \ - static BATADV_ATTR(_name, _mode, batadv_show_##_name, \ - batadv_store_##_name) - -#define BATADV_ATTR_VLAN_STORE_BOOL(_name, _post_func) \ -ssize_t batadv_store_vlan_##_name(struct kobject *kobj, \ - struct attribute *attr, char *buff, \ - size_t count) \ -{ \ - struct batadv_priv *bat_priv = batadv_vlan_kobj_to_batpriv(kobj);\ - struct batadv_softif_vlan *vlan = batadv_kobj_to_vlan(bat_priv, \ - kobj); 
\ - size_t res = __batadv_store_bool_attr(buff, count, _post_func, \ - attr, &vlan->_name, \ - bat_priv->soft_iface); \ - \ - batadv_sysfs_deprecated(attr); \ - if (vlan->vid) \ - batadv_netlink_notify_vlan(bat_priv, vlan); \ - else \ - batadv_netlink_notify_mesh(bat_priv); \ - \ - batadv_softif_vlan_put(vlan); \ - return res; \ -} - -#define BATADV_ATTR_VLAN_SHOW_BOOL(_name) \ -ssize_t batadv_show_vlan_##_name(struct kobject *kobj, \ - struct attribute *attr, char *buff) \ -{ \ - struct batadv_priv *bat_priv = batadv_vlan_kobj_to_batpriv(kobj);\ - struct batadv_softif_vlan *vlan = batadv_kobj_to_vlan(bat_priv, \ - kobj); \ - size_t res = sprintf(buff, "%s\n", \ - atomic_read(&vlan->_name) == 0 ? \ - "disabled" : "enabled"); \ - \ - batadv_sysfs_deprecated(attr); \ - batadv_softif_vlan_put(vlan); \ - return res; \ -} - -/* Use this, if you are going to turn a [name] in the vlan struct on or off */ -#define BATADV_ATTR_VLAN_BOOL(_name, _mode, _post_func) \ - static BATADV_ATTR_VLAN_STORE_BOOL(_name, _post_func) \ - static BATADV_ATTR_VLAN_SHOW_BOOL(_name) \ - static BATADV_ATTR_VLAN(_name, _mode, batadv_show_vlan_##_name, \ - batadv_store_vlan_##_name) - -#define BATADV_ATTR_HIF_STORE_UINT(_name, _var, _min, _max, _post_func) \ -ssize_t batadv_store_##_name(struct kobject *kobj, \ - struct attribute *attr, char *buff, \ - size_t count) \ -{ \ - struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \ - struct batadv_hard_iface *hard_iface; \ - struct batadv_priv *bat_priv; \ - ssize_t length; \ - \ - batadv_sysfs_deprecated(attr); \ - hard_iface = batadv_hardif_get_by_netdev(net_dev); \ - if (!hard_iface) \ - return 0; \ - \ - length = __batadv_store_uint_attr(buff, count, _min, _max, \ - _post_func, attr, \ - &hard_iface->_var, \ - hard_iface->soft_iface, \ - net_dev); \ - \ - if (hard_iface->soft_iface) { \ - bat_priv = netdev_priv(hard_iface->soft_iface); \ - batadv_netlink_notify_hardif(bat_priv, hard_iface); \ - } \ - \ - batadv_hardif_put(hard_iface); \ - 
return length; \ -} - -#define BATADV_ATTR_HIF_SHOW_UINT(_name, _var) \ -ssize_t batadv_show_##_name(struct kobject *kobj, \ - struct attribute *attr, char *buff) \ -{ \ - struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \ - struct batadv_hard_iface *hard_iface; \ - ssize_t length; \ - \ - batadv_sysfs_deprecated(attr); \ - hard_iface = batadv_hardif_get_by_netdev(net_dev); \ - if (!hard_iface) \ - return 0; \ - \ - length = sprintf(buff, "%i\n", atomic_read(&hard_iface->_var)); \ - \ - batadv_hardif_put(hard_iface); \ - return length; \ -} - -/* Use this, if you are going to set [name] in hard_iface to an - * unsigned integer value - */ -#define BATADV_ATTR_HIF_UINT(_name, _var, _mode, _min, _max, _post_func)\ - static BATADV_ATTR_HIF_STORE_UINT(_name, _var, _min, \ - _max, _post_func) \ - static BATADV_ATTR_HIF_SHOW_UINT(_name, _var) \ - static BATADV_ATTR(_name, _mode, batadv_show_##_name, \ - batadv_store_##_name) - -static int batadv_store_bool_attr(char *buff, size_t count, - struct net_device *net_dev, - const char *attr_name, atomic_t *attr, - bool *changed) -{ - int enabled = -1; - - *changed = false; - - if (buff[count - 1] == '\n') - buff[count - 1] = '\0'; - - if ((strncmp(buff, "1", 2) == 0) || - (strncmp(buff, "enable", 7) == 0) || - (strncmp(buff, "enabled", 8) == 0)) - enabled = 1; - - if ((strncmp(buff, "0", 2) == 0) || - (strncmp(buff, "disable", 8) == 0) || - (strncmp(buff, "disabled", 9) == 0)) - enabled = 0; - - if (enabled < 0) { - batadv_info(net_dev, "%s: Invalid parameter received: %s\n", - attr_name, buff); - return -EINVAL; - } - - if (atomic_read(attr) == enabled) - return count; - - batadv_info(net_dev, "%s: Changing from: %s to: %s\n", attr_name, - atomic_read(attr) == 1 ? "enabled" : "disabled", - enabled == 1 ? 
"enabled" : "disabled"); - - *changed = true; - - atomic_set(attr, (unsigned int)enabled); - return count; -} - -static inline ssize_t -__batadv_store_bool_attr(char *buff, size_t count, - void (*post_func)(struct net_device *), - struct attribute *attr, - atomic_t *attr_store, struct net_device *net_dev) -{ - bool changed; - int ret; - - ret = batadv_store_bool_attr(buff, count, net_dev, attr->name, - attr_store, &changed); - if (post_func && changed) - post_func(net_dev); - - return ret; -} - -static int batadv_store_uint_attr(const char *buff, size_t count, - struct net_device *net_dev, - struct net_device *slave_dev, - const char *attr_name, - unsigned int min, unsigned int max, - atomic_t *attr) -{ - char ifname[IFNAMSIZ + 3] = ""; - unsigned long uint_val; - int ret; - - ret = kstrtoul(buff, 10, &uint_val); - if (ret) { - batadv_info(net_dev, "%s: Invalid parameter received: %s\n", - attr_name, buff); - return -EINVAL; - } - - if (uint_val < min) { - batadv_info(net_dev, "%s: Value is too small: %lu min: %u\n", - attr_name, uint_val, min); - return -EINVAL; - } - - if (uint_val > max) { - batadv_info(net_dev, "%s: Value is too big: %lu max: %u\n", - attr_name, uint_val, max); - return -EINVAL; - } - - if (atomic_read(attr) == uint_val) - return count; - - if (slave_dev) - snprintf(ifname, sizeof(ifname), "%s: ", slave_dev->name); - - batadv_info(net_dev, "%s: %sChanging from: %i to: %lu\n", - attr_name, ifname, atomic_read(attr), uint_val); - - atomic_set(attr, uint_val); - return count; -} - -static ssize_t __batadv_store_uint_attr(const char *buff, size_t count, - int min, int max, - void (*post_func)(struct net_device *), - const struct attribute *attr, - atomic_t *attr_store, - struct net_device *net_dev, - struct net_device *slave_dev) -{ - int ret; - - ret = batadv_store_uint_attr(buff, count, net_dev, slave_dev, - attr->name, min, max, attr_store); - if (post_func && ret) - post_func(net_dev); - - return ret; -} - -static ssize_t 
batadv_show_bat_algo(struct kobject *kobj, - struct attribute *attr, char *buff) -{ - struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); - - batadv_sysfs_deprecated(attr); - return sprintf(buff, "%s\n", bat_priv->algo_ops->name); -} - -static void batadv_post_gw_reselect(struct net_device *net_dev) -{ - struct batadv_priv *bat_priv = netdev_priv(net_dev); - - batadv_gw_reselect(bat_priv); -} - -static ssize_t batadv_show_gw_mode(struct kobject *kobj, struct attribute *attr, - char *buff) -{ - struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); - int bytes_written; - - batadv_sysfs_deprecated(attr); - - /* GW mode is not available if the routing algorithm in use does not - * implement the GW API - */ - if (!bat_priv->algo_ops->gw.get_best_gw_node || - !bat_priv->algo_ops->gw.is_eligible) - return -ENOENT; - - switch (atomic_read(&bat_priv->gw.mode)) { - case BATADV_GW_MODE_CLIENT: - bytes_written = sprintf(buff, "%s\n", - BATADV_GW_MODE_CLIENT_NAME); - break; - case BATADV_GW_MODE_SERVER: - bytes_written = sprintf(buff, "%s\n", - BATADV_GW_MODE_SERVER_NAME); - break; - default: - bytes_written = sprintf(buff, "%s\n", - BATADV_GW_MODE_OFF_NAME); - break; - } - - return bytes_written; -} - -static ssize_t batadv_store_gw_mode(struct kobject *kobj, - struct attribute *attr, char *buff, - size_t count) -{ - struct net_device *net_dev = batadv_kobj_to_netdev(kobj); - struct batadv_priv *bat_priv = netdev_priv(net_dev); - char *curr_gw_mode_str; - int gw_mode_tmp = -1; - - batadv_sysfs_deprecated(attr); - - /* toggling GW mode is allowed only if the routing algorithm in use - * provides the GW API - */ - if (!bat_priv->algo_ops->gw.get_best_gw_node || - !bat_priv->algo_ops->gw.is_eligible) - return -EINVAL; - - if (buff[count - 1] == '\n') - buff[count - 1] = '\0'; - - if (strncmp(buff, BATADV_GW_MODE_OFF_NAME, - strlen(BATADV_GW_MODE_OFF_NAME)) == 0) - gw_mode_tmp = BATADV_GW_MODE_OFF; - - if (strncmp(buff, BATADV_GW_MODE_CLIENT_NAME, - 
strlen(BATADV_GW_MODE_CLIENT_NAME)) == 0) - gw_mode_tmp = BATADV_GW_MODE_CLIENT; - - if (strncmp(buff, BATADV_GW_MODE_SERVER_NAME, - strlen(BATADV_GW_MODE_SERVER_NAME)) == 0) - gw_mode_tmp = BATADV_GW_MODE_SERVER; - - if (gw_mode_tmp < 0) { - batadv_info(net_dev, - "Invalid parameter for 'gw mode' setting received: %s\n", - buff); - return -EINVAL; - } - - if (atomic_read(&bat_priv->gw.mode) == gw_mode_tmp) - return count; - - switch (atomic_read(&bat_priv->gw.mode)) { - case BATADV_GW_MODE_CLIENT: - curr_gw_mode_str = BATADV_GW_MODE_CLIENT_NAME; - break; - case BATADV_GW_MODE_SERVER: - curr_gw_mode_str = BATADV_GW_MODE_SERVER_NAME; - break; - default: - curr_gw_mode_str = BATADV_GW_MODE_OFF_NAME; - break; - } - - batadv_info(net_dev, "Changing gw mode from: %s to: %s\n", - curr_gw_mode_str, buff); - - /* Invoking batadv_gw_reselect() is not enough to really de-select the - * current GW. It will only instruct the gateway client code to perform - * a re-election the next time that this is needed. - * - * When gw client mode is being switched off the current GW must be - * de-selected explicitly otherwise no GW_ADD uevent is thrown on - * client mode re-activation. This is operation is performed in - * batadv_gw_check_client_stop(). 
- */ - batadv_gw_reselect(bat_priv); - /* always call batadv_gw_check_client_stop() before changing the gateway - * state - */ - batadv_gw_check_client_stop(bat_priv); - atomic_set(&bat_priv->gw.mode, (unsigned int)gw_mode_tmp); - batadv_gw_tvlv_container_update(bat_priv); - - batadv_netlink_notify_mesh(bat_priv); - - return count; -} - -static ssize_t batadv_show_gw_sel_class(struct kobject *kobj, - struct attribute *attr, char *buff) -{ - struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); - - batadv_sysfs_deprecated(attr); - - /* GW selection class is not available if the routing algorithm in use - * does not implement the GW API - */ - if (!bat_priv->algo_ops->gw.get_best_gw_node || - !bat_priv->algo_ops->gw.is_eligible) - return -ENOENT; - - if (bat_priv->algo_ops->gw.show_sel_class) - return bat_priv->algo_ops->gw.show_sel_class(bat_priv, buff); - - return sprintf(buff, "%i\n", atomic_read(&bat_priv->gw.sel_class)); -} - -static ssize_t batadv_store_gw_sel_class(struct kobject *kobj, - struct attribute *attr, char *buff, - size_t count) -{ - struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); - ssize_t length; - - batadv_sysfs_deprecated(attr); - - /* setting the GW selection class is allowed only if the routing - * algorithm in use implements the GW API - */ - if (!bat_priv->algo_ops->gw.get_best_gw_node || - !bat_priv->algo_ops->gw.is_eligible) - return -EINVAL; - - if (buff[count - 1] == '\n') - buff[count - 1] = '\0'; - - if (bat_priv->algo_ops->gw.store_sel_class) - return bat_priv->algo_ops->gw.store_sel_class(bat_priv, buff, - count); - - length = __batadv_store_uint_attr(buff, count, 1, BATADV_TQ_MAX_VALUE, - batadv_post_gw_reselect, attr, - &bat_priv->gw.sel_class, - bat_priv->soft_iface, NULL); - - batadv_netlink_notify_mesh(bat_priv); - - return length; -} - -static ssize_t batadv_show_gw_bwidth(struct kobject *kobj, - struct attribute *attr, char *buff) -{ - struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); - u32 down, 
up; - - batadv_sysfs_deprecated(attr); - - down = atomic_read(&bat_priv->gw.bandwidth_down); - up = atomic_read(&bat_priv->gw.bandwidth_up); - - return sprintf(buff, "%u.%u/%u.%u MBit\n", down / 10, - down % 10, up / 10, up % 10); -} - -static ssize_t batadv_store_gw_bwidth(struct kobject *kobj, - struct attribute *attr, char *buff, - size_t count) -{ - struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); - struct net_device *net_dev = batadv_kobj_to_netdev(kobj); - ssize_t length; - - batadv_sysfs_deprecated(attr); - - if (buff[count - 1] == '\n') - buff[count - 1] = '\0'; - - length = batadv_gw_bandwidth_set(net_dev, buff, count); - - batadv_netlink_notify_mesh(bat_priv); - - return length; -} - -/** - * batadv_show_isolation_mark() - print the current isolation mark/mask - * @kobj: kobject representing the private mesh sysfs directory - * @attr: the batman-adv attribute the user is interacting with - * @buff: the buffer that will contain the data to send back to the user - * - * Return: the number of bytes written into 'buff' on success or a negative - * error code in case of failure - */ -static ssize_t batadv_show_isolation_mark(struct kobject *kobj, - struct attribute *attr, char *buff) -{ - struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); - - batadv_sysfs_deprecated(attr); - return sprintf(buff, "%#.8x/%#.8x\n", bat_priv->isolation_mark, - bat_priv->isolation_mark_mask); -} - -/** - * batadv_store_isolation_mark() - parse and store the isolation mark/mask - * entered by the user - * @kobj: kobject representing the private mesh sysfs directory - * @attr: the batman-adv attribute the user is interacting with - * @buff: the buffer containing the user data - * @count: number of bytes in the buffer - * - * Return: 'count' on success or a negative error code in case of failure - */ -static ssize_t batadv_store_isolation_mark(struct kobject *kobj, - struct attribute *attr, char *buff, - size_t count) -{ - struct net_device *net_dev = 
batadv_kobj_to_netdev(kobj); - struct batadv_priv *bat_priv = netdev_priv(net_dev); - u32 mark, mask; - char *mask_ptr; - - batadv_sysfs_deprecated(attr); - - /* parse the mask if it has been specified, otherwise assume the mask is - * the biggest possible - */ - mask = 0xFFFFFFFF; - mask_ptr = strchr(buff, '/'); - if (mask_ptr) { - *mask_ptr = '\0'; - mask_ptr++; - - /* the mask must be entered in hex base as it is going to be a - * bitmask and not a prefix length - */ - if (kstrtou32(mask_ptr, 16, &mask) < 0) - return -EINVAL; - } - - /* the mark can be entered in any base */ - if (kstrtou32(buff, 0, &mark) < 0) - return -EINVAL; - - bat_priv->isolation_mark_mask = mask; - /* erase bits not covered by the mask */ - bat_priv->isolation_mark = mark & bat_priv->isolation_mark_mask; - - batadv_info(net_dev, - "New skb mark for extended isolation: %#.8x/%#.8x\n", - bat_priv->isolation_mark, bat_priv->isolation_mark_mask); - - batadv_netlink_notify_mesh(bat_priv); - - return count; -} - -BATADV_ATTR_SIF_BOOL(aggregated_ogms, 0644, NULL); -BATADV_ATTR_SIF_BOOL(bonding, 0644, NULL); -#ifdef CONFIG_BATMAN_ADV_BLA -BATADV_ATTR_SIF_BOOL(bridge_loop_avoidance, 0644, batadv_bla_status_update); -#endif -#ifdef CONFIG_BATMAN_ADV_DAT -BATADV_ATTR_SIF_BOOL(distributed_arp_table, 0644, batadv_dat_status_update); -#endif -BATADV_ATTR_SIF_BOOL(fragmentation, 0644, batadv_update_min_mtu); -static BATADV_ATTR(routing_algo, 0444, batadv_show_bat_algo, NULL); -static BATADV_ATTR(gw_mode, 0644, batadv_show_gw_mode, batadv_store_gw_mode); -BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, 0644, 2 * BATADV_JITTER, - INT_MAX, NULL); -BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, 0644, 0, BATADV_TQ_MAX_VALUE, - NULL); -static BATADV_ATTR(gw_sel_class, 0644, batadv_show_gw_sel_class, - batadv_store_gw_sel_class); -static BATADV_ATTR(gw_bandwidth, 0644, batadv_show_gw_bwidth, - batadv_store_gw_bwidth); -#ifdef CONFIG_BATMAN_ADV_MCAST -BATADV_ATTR_SIF_BOOL(multicast_mode, 0644, NULL); -#endif 
-#ifdef CONFIG_BATMAN_ADV_DEBUG -BATADV_ATTR_SIF_UINT(log_level, log_level, 0644, 0, BATADV_DBG_ALL, NULL); -#endif -#ifdef CONFIG_BATMAN_ADV_NC -BATADV_ATTR_SIF_BOOL(network_coding, 0644, batadv_nc_status_update); -#endif -static BATADV_ATTR(isolation_mark, 0644, batadv_show_isolation_mark, - batadv_store_isolation_mark); - -static struct batadv_attribute *batadv_mesh_attrs[] = { - &batadv_attr_aggregated_ogms, - &batadv_attr_bonding, -#ifdef CONFIG_BATMAN_ADV_BLA - &batadv_attr_bridge_loop_avoidance, -#endif -#ifdef CONFIG_BATMAN_ADV_DAT - &batadv_attr_distributed_arp_table, -#endif -#ifdef CONFIG_BATMAN_ADV_MCAST - &batadv_attr_multicast_mode, -#endif - &batadv_attr_fragmentation, - &batadv_attr_routing_algo, - &batadv_attr_gw_mode, - &batadv_attr_orig_interval, - &batadv_attr_hop_penalty, - &batadv_attr_gw_sel_class, - &batadv_attr_gw_bandwidth, -#ifdef CONFIG_BATMAN_ADV_DEBUG - &batadv_attr_log_level, -#endif -#ifdef CONFIG_BATMAN_ADV_NC - &batadv_attr_network_coding, -#endif - &batadv_attr_isolation_mark, - NULL, -}; - -BATADV_ATTR_VLAN_BOOL(ap_isolation, 0644, NULL); - -/* array of vlan specific sysfs attributes */ -static struct batadv_attribute *batadv_vlan_attrs[] = { - &batadv_attr_vlan_ap_isolation, - NULL, -}; - -/** - * batadv_sysfs_add_meshif() - Add soft interface specific sysfs entries - * @dev: netdev struct of the soft interface - * - * Return: 0 on success or negative error number in case of failure - */ -int batadv_sysfs_add_meshif(struct net_device *dev) -{ - struct kobject *batif_kobject = &dev->dev.kobj; - struct batadv_priv *bat_priv = netdev_priv(dev); - struct batadv_attribute **bat_attr; - int err; - - bat_priv->mesh_obj = kobject_create_and_add(BATADV_SYSFS_IF_MESH_SUBDIR, - batif_kobject); - if (!bat_priv->mesh_obj) { - batadv_err(dev, "Can't add sysfs directory: %s/%s\n", dev->name, - BATADV_SYSFS_IF_MESH_SUBDIR); - goto out; - } - - for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr) { - err = 
sysfs_create_file(bat_priv->mesh_obj, - &((*bat_attr)->attr)); - if (err) { - batadv_err(dev, "Can't add sysfs file: %s/%s/%s\n", - dev->name, BATADV_SYSFS_IF_MESH_SUBDIR, - ((*bat_attr)->attr).name); - goto rem_attr; - } - } - - return 0; - -rem_attr: - for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr) - sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr)); - - kobject_uevent(bat_priv->mesh_obj, KOBJ_REMOVE); - kobject_del(bat_priv->mesh_obj); - kobject_put(bat_priv->mesh_obj); - bat_priv->mesh_obj = NULL; -out: - return -ENOMEM; -} - -/** - * batadv_sysfs_del_meshif() - Remove soft interface specific sysfs entries - * @dev: netdev struct of the soft interface - */ -void batadv_sysfs_del_meshif(struct net_device *dev) -{ - struct batadv_priv *bat_priv = netdev_priv(dev); - struct batadv_attribute **bat_attr; - - for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr) - sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr)); - - kobject_uevent(bat_priv->mesh_obj, KOBJ_REMOVE); - kobject_del(bat_priv->mesh_obj); - kobject_put(bat_priv->mesh_obj); - bat_priv->mesh_obj = NULL; -} - -/** - * batadv_sysfs_add_vlan() - add all the needed sysfs objects for the new vlan - * @dev: netdev of the mesh interface - * @vlan: private data of the newly added VLAN interface - * - * Return: 0 on success and -ENOMEM if any of the structure allocations fails. 
- */ -int batadv_sysfs_add_vlan(struct net_device *dev, - struct batadv_softif_vlan *vlan) -{ - char vlan_subdir[sizeof(BATADV_SYSFS_VLAN_SUBDIR_PREFIX) + 5]; - struct batadv_priv *bat_priv = netdev_priv(dev); - struct batadv_attribute **bat_attr; - int err; - - if (vlan->vid & BATADV_VLAN_HAS_TAG) { - sprintf(vlan_subdir, BATADV_SYSFS_VLAN_SUBDIR_PREFIX "%hu", - vlan->vid & VLAN_VID_MASK); - - vlan->kobj = kobject_create_and_add(vlan_subdir, - bat_priv->mesh_obj); - if (!vlan->kobj) { - batadv_err(dev, "Can't add sysfs directory: %s/%s\n", - dev->name, vlan_subdir); - goto out; - } - } else { - /* the untagged LAN uses the root folder to store its "VLAN - * specific attributes" - */ - vlan->kobj = bat_priv->mesh_obj; - kobject_get(bat_priv->mesh_obj); - } - - for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr) { - err = sysfs_create_file(vlan->kobj, - &((*bat_attr)->attr)); - if (err) { - batadv_err(dev, "Can't add sysfs file: %s/%s/%s\n", - dev->name, vlan_subdir, - ((*bat_attr)->attr).name); - goto rem_attr; - } - } - - return 0; - -rem_attr: - for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr) - sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr)); - - if (vlan->kobj != bat_priv->mesh_obj) { - kobject_uevent(vlan->kobj, KOBJ_REMOVE); - kobject_del(vlan->kobj); - } - kobject_put(vlan->kobj); - vlan->kobj = NULL; -out: - return -ENOMEM; -} - -/** - * batadv_sysfs_del_vlan() - remove all the sysfs objects for a given VLAN - * @bat_priv: the bat priv with all the soft interface information - * @vlan: the private data of the VLAN to destroy - */ -void batadv_sysfs_del_vlan(struct batadv_priv *bat_priv, - struct batadv_softif_vlan *vlan) -{ - struct batadv_attribute **bat_attr; - - for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr) - sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr)); - - if (vlan->kobj != bat_priv->mesh_obj) { - kobject_uevent(vlan->kobj, KOBJ_REMOVE); - kobject_del(vlan->kobj); - } - kobject_put(vlan->kobj); - vlan->kobj = NULL; 
-} - -static ssize_t batadv_show_mesh_iface(struct kobject *kobj, - struct attribute *attr, char *buff) -{ - struct net_device *net_dev = batadv_kobj_to_netdev(kobj); - struct batadv_hard_iface *hard_iface; - ssize_t length; - const char *ifname; - - batadv_sysfs_deprecated(attr); - - hard_iface = batadv_hardif_get_by_netdev(net_dev); - if (!hard_iface) - return 0; - - if (hard_iface->if_status == BATADV_IF_NOT_IN_USE) - ifname = "none"; - else - ifname = hard_iface->soft_iface->name; - - length = sprintf(buff, "%s\n", ifname); - - batadv_hardif_put(hard_iface); - - return length; -} - -/** - * batadv_store_mesh_iface_finish() - store new hardif mesh_iface state - * @net_dev: netdevice to add/remove to/from batman-adv soft-interface - * @ifname: name of soft-interface to modify - * - * Changes the parts of the hard+soft interface which can not be modified under - * sysfs lock (to prevent deadlock situations). - * - * Return: 0 on success, 0 < on failure - */ -static int batadv_store_mesh_iface_finish(struct net_device *net_dev, - char ifname[IFNAMSIZ]) -{ - struct net *net = dev_net(net_dev); - struct batadv_hard_iface *hard_iface; - int status_tmp; - int ret = 0; - - ASSERT_RTNL(); - - hard_iface = batadv_hardif_get_by_netdev(net_dev); - if (!hard_iface) - return 0; - - if (strncmp(ifname, "none", 4) == 0) - status_tmp = BATADV_IF_NOT_IN_USE; - else - status_tmp = BATADV_IF_I_WANT_YOU; - - if (hard_iface->if_status == status_tmp) - goto out; - - if (hard_iface->soft_iface && - strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0) - goto out; - - if (status_tmp == BATADV_IF_NOT_IN_USE) { - batadv_hardif_disable_interface(hard_iface, - BATADV_IF_CLEANUP_AUTO); - goto out; - } - - /* if the interface already is in use */ - if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) - batadv_hardif_disable_interface(hard_iface, - BATADV_IF_CLEANUP_AUTO); - - ret = batadv_hardif_enable_interface(hard_iface, net, ifname); -out: - batadv_hardif_put(hard_iface); - return 
ret; -} - -/** - * batadv_store_mesh_iface_work() - store new hardif mesh_iface state - * @work: work queue item - * - * Changes the parts of the hard+soft interface which can not be modified under - * sysfs lock (to prevent deadlock situations). - */ -static void batadv_store_mesh_iface_work(struct work_struct *work) -{ - struct batadv_store_mesh_work *store_work; - int ret; - - store_work = container_of(work, struct batadv_store_mesh_work, work); - - rtnl_lock(); - ret = batadv_store_mesh_iface_finish(store_work->net_dev, - store_work->soft_iface_name); - rtnl_unlock(); - - if (ret < 0) - pr_err("Failed to store new mesh_iface state %s for %s: %d\n", - store_work->soft_iface_name, store_work->net_dev->name, - ret); - - dev_put(store_work->net_dev); - kfree(store_work); -} - -static ssize_t batadv_store_mesh_iface(struct kobject *kobj, - struct attribute *attr, char *buff, - size_t count) -{ - struct net_device *net_dev = batadv_kobj_to_netdev(kobj); - struct batadv_store_mesh_work *store_work; - - batadv_sysfs_deprecated(attr); - - if (buff[count - 1] == '\n') - buff[count - 1] = '\0'; - - if (strlen(buff) >= IFNAMSIZ) { - pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n", - buff); - return -EINVAL; - } - - store_work = kmalloc(sizeof(*store_work), GFP_KERNEL); - if (!store_work) - return -ENOMEM; - - dev_hold(net_dev); - INIT_WORK(&store_work->work, batadv_store_mesh_iface_work); - store_work->net_dev = net_dev; - strscpy(store_work->soft_iface_name, buff, - sizeof(store_work->soft_iface_name)); - - queue_work(batadv_event_workqueue, &store_work->work); - - return count; -} - -static ssize_t batadv_show_iface_status(struct kobject *kobj, - struct attribute *attr, char *buff) -{ - struct net_device *net_dev = batadv_kobj_to_netdev(kobj); - struct batadv_hard_iface *hard_iface; - ssize_t length; - - batadv_sysfs_deprecated(attr); - - hard_iface = batadv_hardif_get_by_netdev(net_dev); - if (!hard_iface) - return 0; - - 
switch (hard_iface->if_status) { - case BATADV_IF_TO_BE_REMOVED: - length = sprintf(buff, "disabling\n"); - break; - case BATADV_IF_INACTIVE: - length = sprintf(buff, "inactive\n"); - break; - case BATADV_IF_ACTIVE: - length = sprintf(buff, "active\n"); - break; - case BATADV_IF_TO_BE_ACTIVATED: - length = sprintf(buff, "enabling\n"); - break; - case BATADV_IF_NOT_IN_USE: - default: - length = sprintf(buff, "not in use\n"); - break; - } - - batadv_hardif_put(hard_iface); - - return length; -} - -#ifdef CONFIG_BATMAN_ADV_BATMAN_V - -/** - * batadv_store_throughput_override() - parse and store throughput override - * entered by the user - * @kobj: kobject representing the private mesh sysfs directory - * @attr: the batman-adv attribute the user is interacting with - * @buff: the buffer containing the user data - * @count: number of bytes in the buffer - * - * Return: 'count' on success or a negative error code in case of failure - */ -static ssize_t batadv_store_throughput_override(struct kobject *kobj, - struct attribute *attr, - char *buff, size_t count) -{ - struct net_device *net_dev = batadv_kobj_to_netdev(kobj); - struct batadv_hard_iface *hard_iface; - struct batadv_priv *bat_priv; - u32 tp_override; - u32 old_tp_override; - bool ret; - - batadv_sysfs_deprecated(attr); - - hard_iface = batadv_hardif_get_by_netdev(net_dev); - if (!hard_iface) - return -EINVAL; - - if (buff[count - 1] == '\n') - buff[count - 1] = '\0'; - - ret = batadv_parse_throughput(net_dev, buff, "throughput_override", - &tp_override); - if (!ret) - goto out; - - old_tp_override = atomic_read(&hard_iface->bat_v.throughput_override); - if (old_tp_override == tp_override) - goto out; - - batadv_info(hard_iface->soft_iface, - "%s: %s: Changing from: %u.%u MBit to: %u.%u MBit\n", - "throughput_override", net_dev->name, - old_tp_override / 10, old_tp_override % 10, - tp_override / 10, tp_override % 10); - - atomic_set(&hard_iface->bat_v.throughput_override, tp_override); - - if 
(hard_iface->soft_iface) { - bat_priv = netdev_priv(hard_iface->soft_iface); - batadv_netlink_notify_hardif(bat_priv, hard_iface); - } - -out: - batadv_hardif_put(hard_iface); - return count; -} - -static ssize_t batadv_show_throughput_override(struct kobject *kobj, - struct attribute *attr, - char *buff) -{ - struct net_device *net_dev = batadv_kobj_to_netdev(kobj); - struct batadv_hard_iface *hard_iface; - u32 tp_override; - - batadv_sysfs_deprecated(attr); - - hard_iface = batadv_hardif_get_by_netdev(net_dev); - if (!hard_iface) - return -EINVAL; - - tp_override = atomic_read(&hard_iface->bat_v.throughput_override); - - batadv_hardif_put(hard_iface); - return sprintf(buff, "%u.%u MBit\n", tp_override / 10, - tp_override % 10); -} - -#endif - -static BATADV_ATTR(mesh_iface, 0644, batadv_show_mesh_iface, - batadv_store_mesh_iface); -static BATADV_ATTR(iface_status, 0444, batadv_show_iface_status, NULL); -#ifdef CONFIG_BATMAN_ADV_BATMAN_V -BATADV_ATTR_HIF_UINT(elp_interval, bat_v.elp_interval, 0644, - 2 * BATADV_JITTER, INT_MAX, NULL); -static BATADV_ATTR(throughput_override, 0644, batadv_show_throughput_override, - batadv_store_throughput_override); -#endif - -static struct batadv_attribute *batadv_batman_attrs[] = { - &batadv_attr_mesh_iface, - &batadv_attr_iface_status, -#ifdef CONFIG_BATMAN_ADV_BATMAN_V - &batadv_attr_elp_interval, - &batadv_attr_throughput_override, -#endif - NULL, -}; - -/** - * batadv_sysfs_add_hardif() - Add hard interface specific sysfs entries - * @hardif_obj: address where to store the pointer to new sysfs folder - * @dev: netdev struct of the hard interface - * - * Return: 0 on success or negative error number in case of failure - */ -int batadv_sysfs_add_hardif(struct kobject **hardif_obj, struct net_device *dev) -{ - struct kobject *hardif_kobject = &dev->dev.kobj; - struct batadv_attribute **bat_attr; - int err; - - *hardif_obj = kobject_create_and_add(BATADV_SYSFS_IF_BAT_SUBDIR, - hardif_kobject); - - if (!*hardif_obj) { - 
batadv_err(dev, "Can't add sysfs directory: %s/%s\n", dev->name, - BATADV_SYSFS_IF_BAT_SUBDIR); - goto out; - } - - for (bat_attr = batadv_batman_attrs; *bat_attr; ++bat_attr) { - err = sysfs_create_file(*hardif_obj, &((*bat_attr)->attr)); - if (err) { - batadv_err(dev, "Can't add sysfs file: %s/%s/%s\n", - dev->name, BATADV_SYSFS_IF_BAT_SUBDIR, - ((*bat_attr)->attr).name); - goto rem_attr; - } - } - - return 0; - -rem_attr: - for (bat_attr = batadv_batman_attrs; *bat_attr; ++bat_attr) - sysfs_remove_file(*hardif_obj, &((*bat_attr)->attr)); -out: - return -ENOMEM; -} - -/** - * batadv_sysfs_del_hardif() - Remove hard interface specific sysfs entries - * @hardif_obj: address to the pointer to which stores batman-adv sysfs folder - * of the hard interface - */ -void batadv_sysfs_del_hardif(struct kobject **hardif_obj) -{ - kobject_uevent(*hardif_obj, KOBJ_REMOVE); - kobject_del(*hardif_obj); - kobject_put(*hardif_obj); - *hardif_obj = NULL; -} diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h deleted file mode 100644 index d987f8b30a98..000000000000 --- a/net/batman-adv/sysfs.h +++ /dev/null @@ -1,93 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: - * - * Marek Lindner - */ - -#ifndef _NET_BATMAN_ADV_SYSFS_H_ -#define _NET_BATMAN_ADV_SYSFS_H_ - -#include "main.h" - -#include <linux/kobject.h> -#include <linux/netdevice.h> -#include <linux/sysfs.h> -#include <linux/types.h> - -#define BATADV_SYSFS_IF_MESH_SUBDIR "mesh" -#define BATADV_SYSFS_IF_BAT_SUBDIR "batman_adv" -/** - * BATADV_SYSFS_VLAN_SUBDIR_PREFIX - prefix of the subfolder that will be - * created in the sysfs hierarchy for each VLAN interface. The subfolder will - * be named "BATADV_SYSFS_VLAN_SUBDIR_PREFIX%vid". 
- */ -#define BATADV_SYSFS_VLAN_SUBDIR_PREFIX "vlan" - -/** - * struct batadv_attribute - sysfs export helper for batman-adv attributes - */ -struct batadv_attribute { - /** @attr: sysfs attribute file */ - struct attribute attr; - - /** - * @show: function to export the current attribute's content to sysfs - */ - ssize_t (*show)(struct kobject *kobj, struct attribute *attr, - char *buf); - - /** - * @store: function to load new value from character buffer and save it - * in batman-adv attribute - */ - ssize_t (*store)(struct kobject *kobj, struct attribute *attr, - char *buf, size_t count); -}; - -#ifdef CONFIG_BATMAN_ADV_SYSFS - -int batadv_sysfs_add_meshif(struct net_device *dev); -void batadv_sysfs_del_meshif(struct net_device *dev); -int batadv_sysfs_add_hardif(struct kobject **hardif_obj, - struct net_device *dev); -void batadv_sysfs_del_hardif(struct kobject **hardif_obj); -int batadv_sysfs_add_vlan(struct net_device *dev, - struct batadv_softif_vlan *vlan); -void batadv_sysfs_del_vlan(struct batadv_priv *bat_priv, - struct batadv_softif_vlan *vlan); - -#else - -static inline int batadv_sysfs_add_meshif(struct net_device *dev) -{ - return 0; -} - -static inline void batadv_sysfs_del_meshif(struct net_device *dev) -{ -} - -static inline int batadv_sysfs_add_hardif(struct kobject **hardif_obj, - struct net_device *dev) -{ - return 0; -} - -static inline void batadv_sysfs_del_hardif(struct kobject **hardif_obj) -{ -} - -static inline int batadv_sysfs_add_vlan(struct net_device *dev, - struct batadv_softif_vlan *vlan) -{ - return 0; -} - -static inline void batadv_sysfs_del_vlan(struct batadv_priv *bat_priv, - struct batadv_softif_vlan *vlan) -{ -} - -#endif - -#endif /* _NET_BATMAN_ADV_SYSFS_H_ */ diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index db7e3774825b..789c851732b7 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2020 B.A.T.M.A.N. 
contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli */ @@ -23,6 +23,7 @@ #include <linux/kthread.h> #include <linux/limits.h> #include <linux/list.h> +#include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/param.h> #include <linux/printk.h> @@ -130,7 +131,7 @@ static u32 batadv_tp_cwnd(u32 base, u32 increment, u32 min) } /** - * batadv_tp_updated_cwnd() - update the Congestion Windows + * batadv_tp_update_cwnd() - update the Congestion Windows * @tp_vars: the private data of the current TP meter session * @mss: maximum segment size of transmission * diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index 140105215aa2..f0046d366eac 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli */ diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c index 3444d9e4e90d..ec8b9519076b 100644 --- a/net/batman-adv/trace.c +++ b/net/batman-adv/trace.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Sven Eckelmann */ diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index a87547570b4e..d673ebdd0426 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Sven Eckelmann */ diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 98a0aaaf0d50..f8761281aab0 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ @@ -30,7 +30,6 @@ #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -1062,84 +1061,6 @@ container_register: kfree(tt_data); } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - -/** - * batadv_tt_local_seq_print_text() - Print the local tt table in a seq file - * @seq: seq file to print on - * @offset: not used - * - * Return: always 0 - */ -int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) -{ - struct net_device *net_dev = (struct net_device *)seq->private; - struct batadv_priv *bat_priv = netdev_priv(net_dev); - struct batadv_hashtable *hash = bat_priv->tt.local_hash; - struct batadv_tt_common_entry *tt_common_entry; - struct batadv_tt_local_entry *tt_local; - struct batadv_hard_iface *primary_if; - struct hlist_head *head; - u32 i; - int last_seen_secs; - int last_seen_msecs; - unsigned long last_seen_jiffies; - bool no_purge; - u16 np_flag = BATADV_TT_CLIENT_NOPURGE; - - primary_if = batadv_seq_print_text_primary_if_get(seq); - if (!primary_if) - goto out; - - seq_printf(seq, - "Locally retrieved addresses (from %s) announced via TT (TTVN: %u):\n", - net_dev->name, (u8)atomic_read(&bat_priv->tt.vn)); - seq_puts(seq, - " Client VID Flags Last seen (CRC )\n"); - - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(tt_common_entry, - head, hash_entry) { - tt_local = container_of(tt_common_entry, - struct batadv_tt_local_entry, - common); - last_seen_jiffies = jiffies - tt_local->last_seen; - last_seen_msecs = jiffies_to_msecs(last_seen_jiffies); - last_seen_secs = last_seen_msecs / 1000; - last_seen_msecs = last_seen_msecs % 1000; - - no_purge = tt_common_entry->flags & np_flag; - seq_printf(seq, - " * %pM %4i [%c%c%c%c%c%c] %3u.%03u (%#.8x)\n", - tt_common_entry->addr, - 
batadv_print_vid(tt_common_entry->vid), - ((tt_common_entry->flags & - BATADV_TT_CLIENT_ROAM) ? 'R' : '.'), - no_purge ? 'P' : '.', - ((tt_common_entry->flags & - BATADV_TT_CLIENT_NEW) ? 'N' : '.'), - ((tt_common_entry->flags & - BATADV_TT_CLIENT_PENDING) ? 'X' : '.'), - ((tt_common_entry->flags & - BATADV_TT_CLIENT_WIFI) ? 'W' : '.'), - ((tt_common_entry->flags & - BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), - no_purge ? 0 : last_seen_secs, - no_purge ? 0 : last_seen_msecs, - tt_local->vlan->tt.crc); - } - rcu_read_unlock(); - } -out: - if (primary_if) - batadv_hardif_put(primary_if); - return 0; -} -#endif - /** * batadv_tt_local_dump_entry() - Dump one TT local entry into a message * @msg :Netlink message to dump into @@ -1879,139 +1800,6 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, return best_entry; } -#ifdef CONFIG_BATMAN_ADV_DEBUGFS -/** - * batadv_tt_global_print_entry() - print all orig nodes who announce the - * address for this global entry - * @bat_priv: the bat priv with all the soft interface information - * @tt_global_entry: global translation table entry to be printed - * @seq: debugfs table seq_file struct - * - * This function assumes the caller holds rcu_read_lock(). 
- */ -static void -batadv_tt_global_print_entry(struct batadv_priv *bat_priv, - struct batadv_tt_global_entry *tt_global_entry, - struct seq_file *seq) -{ - struct batadv_tt_orig_list_entry *orig_entry, *best_entry; - struct batadv_tt_common_entry *tt_common_entry; - struct batadv_orig_node_vlan *vlan; - struct hlist_head *head; - u8 last_ttvn; - u16 flags; - - tt_common_entry = &tt_global_entry->common; - flags = tt_common_entry->flags; - - best_entry = batadv_transtable_best_orig(bat_priv, tt_global_entry); - if (best_entry) { - vlan = batadv_orig_node_vlan_get(best_entry->orig_node, - tt_common_entry->vid); - if (!vlan) { - seq_printf(seq, - " * Cannot retrieve VLAN %d for originator %pM\n", - batadv_print_vid(tt_common_entry->vid), - best_entry->orig_node->orig); - goto print_list; - } - - last_ttvn = atomic_read(&best_entry->orig_node->last_ttvn); - seq_printf(seq, - " %c %pM %4i (%3u) via %pM (%3u) (%#.8x) [%c%c%c%c]\n", - '*', tt_global_entry->common.addr, - batadv_print_vid(tt_global_entry->common.vid), - best_entry->ttvn, best_entry->orig_node->orig, - last_ttvn, vlan->tt.crc, - ((flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'), - ((flags & BATADV_TT_CLIENT_WIFI) ? 'W' : '.'), - ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), - ((flags & BATADV_TT_CLIENT_TEMP) ? 
'T' : '.')); - - batadv_orig_node_vlan_put(vlan); - } - -print_list: - head = &tt_global_entry->orig_list; - - hlist_for_each_entry_rcu(orig_entry, head, list) { - if (best_entry == orig_entry) - continue; - - vlan = batadv_orig_node_vlan_get(orig_entry->orig_node, - tt_common_entry->vid); - if (!vlan) { - seq_printf(seq, - " + Cannot retrieve VLAN %d for originator %pM\n", - batadv_print_vid(tt_common_entry->vid), - orig_entry->orig_node->orig); - continue; - } - - last_ttvn = atomic_read(&orig_entry->orig_node->last_ttvn); - seq_printf(seq, - " %c %pM %4d (%3u) via %pM (%3u) (%#.8x) [%c%c%c%c]\n", - '+', tt_global_entry->common.addr, - batadv_print_vid(tt_global_entry->common.vid), - orig_entry->ttvn, orig_entry->orig_node->orig, - last_ttvn, vlan->tt.crc, - ((flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'), - ((flags & BATADV_TT_CLIENT_WIFI) ? 'W' : '.'), - ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), - ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.')); - - batadv_orig_node_vlan_put(vlan); - } -} - -/** - * batadv_tt_global_seq_print_text() - Print the global tt table in a seq file - * @seq: seq file to print on - * @offset: not used - * - * Return: always 0 - */ -int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset) -{ - struct net_device *net_dev = (struct net_device *)seq->private; - struct batadv_priv *bat_priv = netdev_priv(net_dev); - struct batadv_hashtable *hash = bat_priv->tt.global_hash; - struct batadv_tt_common_entry *tt_common_entry; - struct batadv_tt_global_entry *tt_global; - struct batadv_hard_iface *primary_if; - struct hlist_head *head; - u32 i; - - primary_if = batadv_seq_print_text_primary_if_get(seq); - if (!primary_if) - goto out; - - seq_printf(seq, - "Globally announced TT entries received via the mesh %s\n", - net_dev->name); - seq_puts(seq, - " Client VID (TTVN) Originator (Curr TTVN) (CRC ) Flags\n"); - - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - 
hlist_for_each_entry_rcu(tt_common_entry, - head, hash_entry) { - tt_global = container_of(tt_common_entry, - struct batadv_tt_global_entry, - common); - batadv_tt_global_print_entry(bat_priv, tt_global, seq); - } - rcu_read_unlock(); - } -out: - if (primary_if) - batadv_hardif_put(primary_if); - return 0; -} -#endif - /** * batadv_tt_global_dump_subentry() - Dump all TT local entries into a message * @msg: Netlink message to dump into diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index b24d35b9226a..e1285904f885 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ @@ -11,7 +11,6 @@ #include <linux/netdevice.h> #include <linux/netlink.h> -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/types.h> @@ -21,8 +20,6 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid, const char *message, bool roaming); -int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset); -int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset); int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb); void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 6a23a566cde1..253f5a33a914 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index d509d00c7a23..54f2a35653d0 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 965336a3b89d..7c0b475cc22a 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -21,7 +21,6 @@ #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/sched.h> /* for linux/wait.h */ -#include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/timer.h> @@ -187,9 +186,6 @@ struct batadv_hard_iface { /** @net_dev: pointer to the net_device */ struct net_device *net_dev; - /** @hardif_obj: kobject of the per interface sysfs "mesh" directory */ - struct kobject *hardif_obj; - /** @refcount: number of contexts the object is used */ struct kref refcount; @@ -222,13 +218,6 @@ struct batadv_hard_iface { struct batadv_hard_iface_bat_v bat_v; #endif -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - /** - * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs - */ - struct dentry *debug_dir; -#endif - /** * @neigh_list: list of unique single hop neighbors via this interface */ @@ -239,7 +228,8 @@ struct batadv_hard_iface { }; /** - * struct batadv_orig_ifinfo - B.A.T.M.A.N. IV private orig_ifinfo members + * struct batadv_orig_ifinfo_bat_iv - B.A.T.M.A.N. 
IV private orig_ifinfo + * members */ struct batadv_orig_ifinfo_bat_iv { /** @@ -1306,13 +1296,6 @@ struct batadv_priv_nc { /** @work: work queue callback item for cleanup */ struct delayed_work work; -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - /** - * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs - */ - struct dentry *debug_dir; -#endif - /** * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq */ @@ -1512,9 +1495,6 @@ struct batadv_softif_vlan { /** @vid: VLAN identifier */ unsigned short vid; - /** @kobj: kobject for sysfs vlan subdirectory */ - struct kobject *kobj; - /** @ap_isolation: AP isolation state */ atomic_t ap_isolation; /* boolean */ @@ -1667,14 +1647,6 @@ struct batadv_priv { /** @batman_queue_left: number of remaining OGM packet slots */ atomic_t batman_queue_left; - /** @mesh_obj: kobject for sysfs mesh subdirectory */ - struct kobject *mesh_obj; - -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - /** @debug_dir: dentry for debugfs batman-adv subdirectory */ - struct dentry *debug_dir; -#endif - /** @forw_bat_list: list of aggregated OGMs that will be forwarded */ struct hlist_head forw_bat_list; @@ -2234,11 +2206,6 @@ struct batadv_algo_neigh_ops { struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2); -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - /** @print: print the single hop neighbor list (optional) */ - void (*print)(struct batadv_priv *priv, struct seq_file *seq); -#endif - /** @dump: dump neighbors to a netlink socket (optional) */ void (*dump)(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *priv, @@ -2249,12 +2216,6 @@ struct batadv_algo_neigh_ops { * struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific) */ struct batadv_algo_orig_ops { -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - /** @print: print the originator table (optional) */ - void (*print)(struct batadv_priv *priv, struct seq_file *seq, - struct batadv_hard_iface *hard_iface); -#endif - /** @dump: dump originators to 
a netlink socket (optional) */ void (*dump)(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *priv, @@ -2274,10 +2235,6 @@ struct batadv_algo_gw_ops { */ ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff, size_t count); - - /** @show_sel_class: prints the current GW selection class (optional) */ - ssize_t (*show_sel_class)(struct batadv_priv *bat_priv, char *buff); - /** * @get_best_gw_node: select the best GW from the list of available * nodes (optional) @@ -2293,11 +2250,6 @@ struct batadv_algo_gw_ops { struct batadv_orig_node *curr_gw_orig, struct batadv_orig_node *orig_node); -#ifdef CONFIG_BATMAN_ADV_DEBUGFS - /** @print: print the gateway table (optional) */ - void (*print)(struct batadv_priv *bat_priv, struct seq_file *seq); -#endif - /** @dump: dump gateways to a netlink socket (optional) */ void (*dump)(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *priv); @@ -2456,21 +2408,4 @@ enum batadv_tvlv_handler_flags { BATADV_TVLV_HANDLER_OGM_CALLED = BIT(2), }; -/** - * struct batadv_store_mesh_work - Work queue item to detach add/del interface - * from sysfs locks - */ -struct batadv_store_mesh_work { - /** - * @net_dev: netdevice to add/remove to/from batman-adv soft-interface - */ - struct net_device *net_dev; - - /** @soft_iface_name: name of soft-interface to modify */ - char soft_iface_name[IFNAMSIZ]; - - /** @work: work queue item */ - struct work_struct work; -}; - #endif /* _NET_BATMAN_ADV_TYPES_H_ */ diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 64e669acd42f..400c5130dc0a 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -5,7 +5,7 @@ menuconfig BT tristate "Bluetooth subsystem support" - depends on NET && !S390 + depends on !S390 depends on RFKILL || !RFKILL select CRC16 select CRYPTO diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index da7fd7c8c2dc..463bad58478b 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -381,9 +381,9 @@ static 
int a2mp_getampassoc_req(struct amp_mgr *mgr, struct sk_buff *skb, hdev = hci_dev_get(req->id); if (!hdev || hdev->amp_type == AMP_TYPE_BREDR || tmp) { struct a2mp_amp_assoc_rsp rsp; - rsp.id = req->id; memset(&rsp, 0, sizeof(rsp)); + rsp.id = req->id; if (tmp) { rsp.status = A2MP_STATUS_COLLISION_OCCURED; @@ -512,6 +512,7 @@ static int a2mp_createphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb, assoc = kmemdup(req->amp_assoc, assoc_len, GFP_KERNEL); if (!assoc) { amp_ctrl_put(ctrl); + hci_dev_put(hdev); return -ENOMEM; } diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 4ef6a54403aa..1661979b6a6e 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -72,8 +72,8 @@ void bt_sock_reclassify_lock(struct sock *sk, int proto) BUG_ON(!sock_allow_reclassification(sk)); sock_lock_init_class_and_name(sk, - bt_slock_key_strings[proto], &bt_slock_key[proto], - bt_key_strings[proto], &bt_lock_key[proto]); + bt_slock_key_strings[proto], &bt_slock_key[proto], + bt_key_strings[proto], &bt_lock_key[proto]); } EXPORT_SYMBOL(bt_sock_reclassify_lock); @@ -451,7 +451,7 @@ static inline __poll_t bt_accept_poll(struct sock *parent) } __poll_t bt_sock_poll(struct file *file, struct socket *sock, - poll_table *wait) + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; @@ -478,8 +478,8 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock, mask |= EPOLLHUP; if (sk->sk_state == BT_CONNECT || - sk->sk_state == BT_CONNECT2 || - sk->sk_state == BT_CONFIG) + sk->sk_state == BT_CONNECT2 || + sk->sk_state == BT_CONFIG) return mask; if (!test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags) && sock_writeable(sk)) @@ -508,7 +508,7 @@ int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (amount < 0) amount = 0; - err = put_user(amount, (int __user *) arg); + err = put_user(amount, (int __user *)arg); break; case TIOCINQ: @@ -519,7 +519,7 @@ int 
bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) skb = skb_peek(&sk->sk_receive_queue); amount = skb ? skb->len : 0; release_sock(sk); - err = put_user(amount, (int __user *) arg); + err = put_user(amount, (int __user *)arg); break; default: @@ -637,7 +637,7 @@ static int bt_seq_show(struct seq_file *seq, void *v) struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); if (v == SEQ_START_TOKEN) { - seq_puts(seq ,"sk RefCnt Rmem Wmem User Inode Parent"); + seq_puts(seq, "sk RefCnt Rmem Wmem User Inode Parent"); if (l->custom_seq_show) { seq_putc(seq, ' '); @@ -657,7 +657,7 @@ static int bt_seq_show(struct seq_file *seq, void *v) sk_wmem_alloc_get(sk), from_kuid(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk), - bt->parent? sock_i_ino(bt->parent): 0LU); + bt->parent ? sock_i_ino(bt->parent) : 0LU); if (l->custom_seq_show) { seq_putc(seq, ' '); @@ -678,7 +678,7 @@ static const struct seq_operations bt_seq_ops = { int bt_procfs_init(struct net *net, const char *name, struct bt_sock_list *sk_list, - int (* seq_show)(struct seq_file *, void *)) + int (*seq_show)(struct seq_file *, void *)) { sk_list->custom_seq_show = seq_show; @@ -694,7 +694,7 @@ void bt_procfs_cleanup(struct net *net, const char *name) #else int bt_procfs_init(struct net *net, const char *name, struct bt_sock_list *sk_list, - int (* seq_show)(struct seq_file *, void *)) + int (*seq_show)(struct seq_file *, void *)) { return 0; } diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index 9c711f0dfae3..be2d469d6369 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -297,6 +297,9 @@ void amp_read_loc_assoc_final_data(struct hci_dev *hdev, struct hci_request req; int err; + if (!mgr) + return; + cp.phy_handle = hcon->handle; cp.len_so_far = cpu_to_le16(0); cp.max_len = cpu_to_le16(hdev->amp_assoc_size); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index d0c1024bf600..6ffa89e3ba0a 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c 
@@ -203,6 +203,23 @@ static void hci_acl_create_connection(struct hci_conn *conn) BT_DBG("hcon %p", conn); + /* Many controllers disallow HCI Create Connection while it is doing + * HCI Inquiry. So we cancel the Inquiry first before issuing HCI Create + * Connection. This may cause the MGMT discovering state to become false + * without user space's request but it is okay since the MGMT Discovery + * APIs do not promise that discovery should be done forever. Instead, + * the user space monitors the status of MGMT discovering and it may + * request for discovery again when this flag becomes false. + */ + if (test_bit(HCI_INQUIRY, &hdev->flags)) { + /* Put this connection to "pending" state so that it will be + * executed after the inquiry cancel command complete event. + */ + conn->state = BT_CONNECT2; + hci_send_cmd(hdev, HCI_OP_INQUIRY_CANCEL, 0, NULL); + return; + } + conn->state = BT_CONNECT; conn->out = true; conn->role = HCI_ROLE_MASTER; @@ -276,6 +293,20 @@ static void hci_add_sco(struct hci_conn *conn, __u16 handle) hci_send_cmd(hdev, HCI_OP_ADD_SCO, sizeof(cp), &cp); } +static bool find_next_esco_param(struct hci_conn *conn, + const struct sco_param *esco_param, int size) +{ + for (; conn->attempt <= size; conn->attempt++) { + if (lmp_esco_2m_capable(conn->link) || + (esco_param[conn->attempt - 1].pkt_type & ESCO_2EV3)) + break; + BT_DBG("hcon %p skipped attempt %d, eSCO 2M not supported", + conn, conn->attempt); + } + + return conn->attempt <= size; +} + bool hci_setup_sync(struct hci_conn *conn, __u16 handle) { struct hci_dev *hdev = conn->hdev; @@ -297,13 +328,15 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle) switch (conn->setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_TRANSP: - if (conn->attempt > ARRAY_SIZE(esco_param_msbc)) + if (!find_next_esco_param(conn, esco_param_msbc, + ARRAY_SIZE(esco_param_msbc))) return false; param = &esco_param_msbc[conn->attempt - 1]; break; case SCO_AIRMODE_CVSD: if (lmp_esco_capable(conn->link)) { - if 
(conn->attempt > ARRAY_SIZE(esco_param_cvsd)) + if (!find_next_esco_param(conn, esco_param_cvsd, + ARRAY_SIZE(esco_param_cvsd))) return false; param = &esco_param_cvsd[conn->attempt - 1]; } else { @@ -758,6 +791,9 @@ static void create_le_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode) conn = hci_lookup_le_connect(hdev); + if (hdev->adv_instance_cnt) + hci_req_resume_adv_instances(hdev); + if (!status) { hci_connect_le_scan_cleanup(conn); goto done; @@ -1067,10 +1103,11 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, * connections most controllers will refuse to connect if * advertising is enabled, and for slave role connections we * anyway have to disable it in order to start directed - * advertising. + * advertising. Any registered advertisements will be + * re-enabled after the connection attempt is finished. */ if (hci_dev_test_flag(hdev, HCI_LE_ADV)) - __hci_req_disable_advertising(&req); + __hci_req_pause_adv_instances(&req); /* If requested to connect as slave use directed advertising */ if (conn->role == HCI_ROLE_SLAVE) { @@ -1118,6 +1155,10 @@ create_conn: err = hci_req_run(&req, create_le_conn_complete); if (err) { hci_conn_del(conn); + + if (hdev->adv_instance_cnt) + hci_req_resume_adv_instances(hdev); + return ERR_PTR(err); } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 502552d6e9af..b0d9c36acc03 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -741,6 +741,12 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL); } + if (hdev->commands[38] & 0x80) { + /* Read LE Min/Max Tx Power*/ + hci_req_add(req, HCI_OP_LE_READ_TRANSMIT_POWER, + 0, NULL); + } + if (hdev->commands[26] & 0x40) { /* Read LE White List Size */ hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE, @@ -763,7 +769,7 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_LE_CLEAR_RESOLV_LIST, 0, NULL); 
} - if (hdev->commands[35] & 0x40) { + if (hdev->commands[35] & 0x04) { __le16 rpa_timeout = cpu_to_le16(hdev->rpa_timeout); /* Set RPA timeout */ @@ -1356,8 +1362,10 @@ int hci_inquiry(void __user *arg) * cleared). If it is interrupted by a signal, return -EINTR. */ if (wait_on_bit(&hdev->flags, HCI_INQUIRY, - TASK_INTERRUPTIBLE)) - return -EINTR; + TASK_INTERRUPTIBLE)) { + err = -EINTR; + goto done; + } } /* for unlimited number of responses we will use buffer with @@ -2951,7 +2959,8 @@ static void adv_instance_rpa_expired(struct work_struct *work) int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 adv_data_len, u8 *adv_data, u16 scan_rsp_len, u8 *scan_rsp_data, - u16 timeout, u16 duration) + u16 timeout, u16 duration, s8 tx_power, + u32 min_interval, u32 max_interval) { struct adv_info *adv_instance; @@ -2979,6 +2988,9 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, adv_instance->flags = flags; adv_instance->adv_data_len = adv_data_len; adv_instance->scan_rsp_len = scan_rsp_len; + adv_instance->min_interval = min_interval; + adv_instance->max_interval = max_interval; + adv_instance->tx_power = tx_power; if (adv_data_len) memcpy(adv_instance->adv_data, adv_data, adv_data_len); @@ -2995,8 +3007,6 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, else adv_instance->duration = duration; - adv_instance->tx_power = HCI_TX_POWER_INVALID; - INIT_DELAYED_WORK(&adv_instance->rpa_expired_cb, adv_instance_rpa_expired); @@ -3006,18 +3016,52 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, } /* This function requires the caller holds hdev->lock */ +int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance, + u16 adv_data_len, u8 *adv_data, + u16 scan_rsp_len, u8 *scan_rsp_data) +{ + struct adv_info *adv_instance; + + adv_instance = hci_find_adv_instance(hdev, instance); + + /* If advertisement doesn't exist, we can't modify its data */ + if (!adv_instance) + return 
-ENOENT; + + if (adv_data_len) { + memset(adv_instance->adv_data, 0, + sizeof(adv_instance->adv_data)); + memcpy(adv_instance->adv_data, adv_data, adv_data_len); + adv_instance->adv_data_len = adv_data_len; + } + + if (scan_rsp_len) { + memset(adv_instance->scan_rsp_data, 0, + sizeof(adv_instance->scan_rsp_data)); + memcpy(adv_instance->scan_rsp_data, + scan_rsp_data, scan_rsp_len); + adv_instance->scan_rsp_len = scan_rsp_len; + } + + return 0; +} + +/* This function requires the caller holds hdev->lock */ void hci_adv_monitors_clear(struct hci_dev *hdev) { struct adv_monitor *monitor; int handle; idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle) - hci_free_adv_monitor(monitor); + hci_free_adv_monitor(hdev, monitor); idr_destroy(&hdev->adv_monitors_idr); } -void hci_free_adv_monitor(struct adv_monitor *monitor) +/* Frees the monitor structure and do some bookkeepings. + * This function requires the caller holds hdev->lock. + */ +void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { struct adv_pattern *pattern; struct adv_pattern *tmp; @@ -3025,68 +3069,167 @@ void hci_free_adv_monitor(struct adv_monitor *monitor) if (!monitor) return; - list_for_each_entry_safe(pattern, tmp, &monitor->patterns, list) + list_for_each_entry_safe(pattern, tmp, &monitor->patterns, list) { + list_del(&pattern->list); kfree(pattern); + } + + if (monitor->handle) + idr_remove(&hdev->adv_monitors_idr, monitor->handle); + + if (monitor->state != ADV_MONITOR_STATE_NOT_REGISTERED) { + hdev->adv_monitors_cnt--; + mgmt_adv_monitor_removed(hdev, monitor->handle); + } kfree(monitor); } -/* This function requires the caller holds hdev->lock */ -int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) +int hci_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status) +{ + return mgmt_add_adv_patterns_monitor_complete(hdev, status); +} + +int hci_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status) +{ + return 
mgmt_remove_adv_monitor_complete(hdev, status); +} + +/* Assigns handle to a monitor, and if offloading is supported and power is on, + * also attempts to forward the request to the controller. + * Returns true if request is forwarded (result is pending), false otherwise. + * This function requires the caller holds hdev->lock. + */ +bool hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, + int *err) { int min, max, handle; - if (!monitor) - return -EINVAL; + *err = 0; + + if (!monitor) { + *err = -EINVAL; + return false; + } min = HCI_MIN_ADV_MONITOR_HANDLE; max = HCI_MIN_ADV_MONITOR_HANDLE + HCI_MAX_ADV_MONITOR_NUM_HANDLES; handle = idr_alloc(&hdev->adv_monitors_idr, monitor, min, max, GFP_KERNEL); - if (handle < 0) - return handle; + if (handle < 0) { + *err = handle; + return false; + } - hdev->adv_monitors_cnt++; monitor->handle = handle; - hci_update_background_scan(hdev); + if (!hdev_is_powered(hdev)) + return false; - return 0; + switch (hci_get_adv_monitor_offload_ext(hdev)) { + case HCI_ADV_MONITOR_EXT_NONE: + hci_update_background_scan(hdev); + bt_dev_dbg(hdev, "%s add monitor status %d", hdev->name, *err); + /* Message was not forwarded to controller - not an error */ + return false; + case HCI_ADV_MONITOR_EXT_MSFT: + *err = msft_add_monitor_pattern(hdev, monitor); + bt_dev_dbg(hdev, "%s add monitor msft status %d", hdev->name, + *err); + break; + } + + return (*err == 0); } -static int free_adv_monitor(int id, void *ptr, void *data) +/* Attempts to tell the controller and free the monitor. If somehow the + * controller doesn't have a corresponding handle, remove anyway. + * Returns true if request is forwarded (result is pending), false otherwise. + * This function requires the caller holds hdev->lock. 
+ */ +static bool hci_remove_adv_monitor(struct hci_dev *hdev, + struct adv_monitor *monitor, + u16 handle, int *err) { - struct hci_dev *hdev = data; - struct adv_monitor *monitor = ptr; + *err = 0; - idr_remove(&hdev->adv_monitors_idr, monitor->handle); - hci_free_adv_monitor(monitor); - hdev->adv_monitors_cnt--; + switch (hci_get_adv_monitor_offload_ext(hdev)) { + case HCI_ADV_MONITOR_EXT_NONE: /* also goes here when powered off */ + goto free_monitor; + case HCI_ADV_MONITOR_EXT_MSFT: + *err = msft_remove_monitor(hdev, monitor, handle); + break; + } - return 0; + /* In case no matching handle registered, just free the monitor */ + if (*err == -ENOENT) + goto free_monitor; + + return (*err == 0); + +free_monitor: + if (*err == -ENOENT) + bt_dev_warn(hdev, "Removing monitor with no matching handle %d", + monitor->handle); + hci_free_adv_monitor(hdev, monitor); + + *err = 0; + return false; } -/* This function requires the caller holds hdev->lock */ -int hci_remove_adv_monitor(struct hci_dev *hdev, u16 handle) +/* Returns true if request is forwarded (result is pending), false otherwise. + * This function requires the caller holds hdev->lock. + */ +bool hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle, int *err) +{ + struct adv_monitor *monitor = idr_find(&hdev->adv_monitors_idr, handle); + bool pending; + + if (!monitor) { + *err = -EINVAL; + return false; + } + + pending = hci_remove_adv_monitor(hdev, monitor, handle, err); + if (!*err && !pending) + hci_update_background_scan(hdev); + + bt_dev_dbg(hdev, "%s remove monitor handle %d, status %d, %spending", + hdev->name, handle, *err, pending ? "" : "not "); + + return pending; +} + +/* Returns true if request is forwarded (result is pending), false otherwise. + * This function requires the caller holds hdev->lock. 
+ */ +bool hci_remove_all_adv_monitor(struct hci_dev *hdev, int *err) { struct adv_monitor *monitor; + int idr_next_id = 0; + bool pending = false; + bool update = false; + + *err = 0; - if (handle) { - monitor = idr_find(&hdev->adv_monitors_idr, handle); + while (!*err && !pending) { + monitor = idr_get_next(&hdev->adv_monitors_idr, &idr_next_id); if (!monitor) - return -ENOENT; + break; - idr_remove(&hdev->adv_monitors_idr, monitor->handle); - hci_free_adv_monitor(monitor); - hdev->adv_monitors_cnt--; - } else { - /* Remove all monitors if handle is 0. */ - idr_for_each(&hdev->adv_monitors_idr, &free_adv_monitor, hdev); + pending = hci_remove_adv_monitor(hdev, monitor, 0, err); + + if (!*err && !pending) + update = true; } - hci_update_background_scan(hdev); + if (update) + hci_update_background_scan(hdev); - return 0; + bt_dev_dbg(hdev, "%s remove all monitors status %d, %spending", + hdev->name, *err, pending ? "" : "not "); + + return pending; } /* This function requires the caller holds hdev->lock */ @@ -3095,6 +3238,14 @@ bool hci_is_adv_monitoring(struct hci_dev *hdev) return !idr_is_empty(&hdev->adv_monitors_idr); } +int hci_get_adv_monitor_offload_ext(struct hci_dev *hdev) +{ + if (msft_monitor_supported(hdev)) + return HCI_ADV_MONITOR_EXT_MSFT; + + return HCI_ADV_MONITOR_EXT_NONE; +} + struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { @@ -3527,7 +3678,8 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, } /* Suspend notifier should only act on events when powered. 
*/ - if (!hdev_is_powered(hdev)) + if (!hdev_is_powered(hdev) || + hci_dev_test_flag(hdev, HCI_UNREGISTER)) goto done; if (action == PM_SUSPEND_PREPARE) { @@ -3592,6 +3744,10 @@ struct hci_dev *hci_alloc_dev(void) hdev->cur_adv_instance = 0x00; hdev->adv_instance_timeout = 0; + hdev->advmon_allowlist_duration = 300; + hdev->advmon_no_filter_duration = 500; + hdev->enable_advmon_interleave_scan = 0x00; /* Default to disable */ + hdev->sniff_max_interval = 800; hdev->sniff_min_interval = 80; @@ -3623,6 +3779,8 @@ struct hci_dev *hci_alloc_dev(void) hdev->le_num_of_adv_sets = HCI_MAX_ADV_INSTANCES; hdev->def_multi_adv_rotation_duration = HCI_DEFAULT_ADV_DURATION; hdev->def_le_autoconnect_timeout = HCI_LE_AUTOCONN_TIMEOUT; + hdev->min_le_tx_power = HCI_TX_POWER_INVALID; + hdev->max_le_tx_power = HCI_TX_POWER_INVALID; hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT; hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT; @@ -3782,10 +3940,12 @@ int hci_register_dev(struct hci_dev *hdev) hci_sock_dev_event(hdev, HCI_DEV_REG); hci_dev_hold(hdev); - hdev->suspend_notifier.notifier_call = hci_suspend_notifier; - error = register_pm_notifier(&hdev->suspend_notifier); - if (error) - goto err_wqueue; + if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) { + hdev->suspend_notifier.notifier_call = hci_suspend_notifier; + error = register_pm_notifier(&hdev->suspend_notifier); + if (error) + goto err_wqueue; + } queue_work(hdev->req_workqueue, &hdev->power_on); @@ -3820,9 +3980,11 @@ void hci_unregister_dev(struct hci_dev *hdev) cancel_work_sync(&hdev->power_on); - hci_suspend_clear_tasks(hdev); - unregister_pm_notifier(&hdev->suspend_notifier); - cancel_work_sync(&hdev->suspend_prepare); + if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) { + hci_suspend_clear_tasks(hdev); + unregister_pm_notifier(&hdev->suspend_notifier); + cancel_work_sync(&hdev->suspend_prepare); + } hci_dev_do_close(hdev); diff --git a/net/bluetooth/hci_debugfs.c 
b/net/bluetooth/hci_debugfs.c index 5e8af2658e44..1a0ab58bfad0 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -237,8 +237,8 @@ static int conn_info_min_age_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(conn_info_min_age_fops, conn_info_min_age_get, - conn_info_min_age_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(conn_info_min_age_fops, conn_info_min_age_get, + conn_info_min_age_set, "%llu\n"); static int conn_info_max_age_set(void *data, u64 val) { @@ -265,8 +265,8 @@ static int conn_info_max_age_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(conn_info_max_age_fops, conn_info_max_age_get, - conn_info_max_age_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(conn_info_max_age_fops, conn_info_max_age_get, + conn_info_max_age_set, "%llu\n"); static ssize_t use_debug_keys_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) @@ -419,8 +419,8 @@ static int voice_setting_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(voice_setting_fops, voice_setting_get, - NULL, "0x%4.4llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(voice_setting_fops, voice_setting_get, + NULL, "0x%4.4llx\n"); static ssize_t ssp_debug_mode_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) @@ -476,9 +476,9 @@ static int min_encrypt_key_size_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(min_encrypt_key_size_fops, - min_encrypt_key_size_get, - min_encrypt_key_size_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(min_encrypt_key_size_fops, + min_encrypt_key_size_get, + min_encrypt_key_size_set, "%llu\n"); static int auto_accept_delay_get(void *data, u64 *val) { @@ -491,8 +491,47 @@ static int auto_accept_delay_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(auto_accept_delay_fops, auto_accept_delay_get, - auto_accept_delay_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(auto_accept_delay_fops, auto_accept_delay_get, + auto_accept_delay_set, "%llu\n"); + +static ssize_t 
force_bredr_smp_read(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[3]; + + buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP) ? 'Y' : 'N'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t force_bredr_smp_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + bool enable; + int err; + + err = kstrtobool_from_user(user_buf, count, &enable); + if (err) + return err; + + err = smp_force_bredr(hdev, enable); + if (err) + return err; + + return count; +} + +static const struct file_operations force_bredr_smp_fops = { + .open = simple_open, + .read = force_bredr_smp_read, + .write = force_bredr_smp_write, + .llseek = default_llseek, +}; static int idle_timeout_set(void *data, u64 val) { @@ -519,8 +558,8 @@ static int idle_timeout_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(idle_timeout_fops, idle_timeout_get, - idle_timeout_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(idle_timeout_fops, idle_timeout_get, + idle_timeout_set, "%llu\n"); static int sniff_min_interval_set(void *data, u64 val) { @@ -547,8 +586,8 @@ static int sniff_min_interval_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(sniff_min_interval_fops, sniff_min_interval_get, - sniff_min_interval_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(sniff_min_interval_fops, sniff_min_interval_get, + sniff_min_interval_set, "%llu\n"); static int sniff_max_interval_set(void *data, u64 val) { @@ -575,8 +614,8 @@ static int sniff_max_interval_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(sniff_max_interval_fops, sniff_max_interval_get, - sniff_max_interval_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(sniff_max_interval_fops, sniff_max_interval_get, + sniff_max_interval_set, "%llu\n"); void hci_debugfs_create_bredr(struct hci_dev *hdev) { @@ -589,6 
+628,17 @@ void hci_debugfs_create_bredr(struct hci_dev *hdev) debugfs_create_file("voice_setting", 0444, hdev->debugfs, hdev, &voice_setting_fops); + /* If the controller does not support BR/EDR Secure Connections + * feature, then the BR/EDR SMP channel shall not be present. + * + * To test this with Bluetooth 4.0 controllers, create a debugfs + * switch that allows forcing BR/EDR SMP support and accepting + * cross-transport pairing on non-AES encrypted connections. + */ + if (!lmp_sc_capable(hdev)) + debugfs_create_file("force_bredr_smp", 0644, hdev->debugfs, + hdev, &force_bredr_smp_fops); + if (lmp_ssp_capable(hdev)) { debugfs_create_file("ssp_debug_mode", 0444, hdev->debugfs, hdev, &ssp_debug_mode_fops); @@ -656,8 +706,8 @@ static int rpa_timeout_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(rpa_timeout_fops, rpa_timeout_get, - rpa_timeout_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(rpa_timeout_fops, rpa_timeout_get, + rpa_timeout_set, "%llu\n"); static int random_address_show(struct seq_file *f, void *p) { @@ -819,8 +869,8 @@ static int conn_min_interval_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(conn_min_interval_fops, conn_min_interval_get, - conn_min_interval_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(conn_min_interval_fops, conn_min_interval_get, + conn_min_interval_set, "%llu\n"); static int conn_max_interval_set(void *data, u64 val) { @@ -847,8 +897,8 @@ static int conn_max_interval_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(conn_max_interval_fops, conn_max_interval_get, - conn_max_interval_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(conn_max_interval_fops, conn_max_interval_get, + conn_max_interval_set, "%llu\n"); static int conn_latency_set(void *data, u64 val) { @@ -875,8 +925,8 @@ static int conn_latency_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(conn_latency_fops, conn_latency_get, - conn_latency_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(conn_latency_fops, conn_latency_get, + 
conn_latency_set, "%llu\n"); static int supervision_timeout_set(void *data, u64 val) { @@ -903,8 +953,8 @@ static int supervision_timeout_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(supervision_timeout_fops, supervision_timeout_get, - supervision_timeout_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(supervision_timeout_fops, supervision_timeout_get, + supervision_timeout_set, "%llu\n"); static int adv_channel_map_set(void *data, u64 val) { @@ -931,8 +981,8 @@ static int adv_channel_map_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(adv_channel_map_fops, adv_channel_map_get, - adv_channel_map_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(adv_channel_map_fops, adv_channel_map_get, + adv_channel_map_set, "%llu\n"); static int adv_min_interval_set(void *data, u64 val) { @@ -959,8 +1009,8 @@ static int adv_min_interval_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(adv_min_interval_fops, adv_min_interval_get, - adv_min_interval_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(adv_min_interval_fops, adv_min_interval_get, + adv_min_interval_set, "%llu\n"); static int adv_max_interval_set(void *data, u64 val) { @@ -987,8 +1037,8 @@ static int adv_max_interval_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(adv_max_interval_fops, adv_max_interval_get, - adv_max_interval_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(adv_max_interval_fops, adv_max_interval_get, + adv_max_interval_set, "%llu\n"); static int min_key_size_set(void *data, u64 val) { @@ -1015,8 +1065,8 @@ static int min_key_size_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(min_key_size_fops, min_key_size_get, - min_key_size_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(min_key_size_fops, min_key_size_get, + min_key_size_set, "%llu\n"); static int max_key_size_set(void *data, u64 val) { @@ -1043,8 +1093,8 @@ static int max_key_size_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(max_key_size_fops, max_key_size_get, - max_key_size_set, 
"%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(max_key_size_fops, max_key_size_get, + max_key_size_set, "%llu\n"); static int auth_payload_timeout_set(void *data, u64 val) { @@ -1071,9 +1121,9 @@ static int auth_payload_timeout_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(auth_payload_timeout_fops, - auth_payload_timeout_get, - auth_payload_timeout_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(auth_payload_timeout_fops, + auth_payload_timeout_get, + auth_payload_timeout_set, "%llu\n"); static ssize_t force_no_mitm_read(struct file *file, char __user *user_buf, diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index f04963914366..67668be3461e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1202,6 +1202,20 @@ static void hci_cc_le_set_adv_set_random_addr(struct hci_dev *hdev, hci_dev_unlock(hdev); } +static void hci_cc_le_read_transmit_power(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_le_read_transmit_power *rp = (void *)skb->data; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (rp->status) + return; + + hdev->min_le_tx_power = rp->min_le_tx_power; + hdev->max_le_tx_power = rp->max_le_tx_power; +} + static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) { __u8 *sent, status = *((__u8 *) skb->data); @@ -1752,6 +1766,7 @@ static void hci_cc_set_ext_adv_param(struct hci_dev *hdev, struct sk_buff *skb) } /* Update adv data as tx power is known now */ hci_req_update_adv_data(hdev, hdev->cur_adv_instance); + hci_dev_unlock(hdev); } @@ -3581,6 +3596,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_set_adv_set_random_addr(hdev, skb); break; + case HCI_OP_LE_READ_TRANSMIT_POWER: + hci_cc_le_read_transmit_power(hdev, skb); + break; + default: BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode); break; @@ -4936,15 +4955,15 @@ static void hci_phy_link_complete_evt(struct hci_dev *hdev, hci_dev_lock(hdev); hcon = 
hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if (!hcon) { - hci_dev_unlock(hdev); - return; - } + if (!hcon) + goto unlock; + + if (!hcon->amp_mgr) + goto unlock; if (ev->status) { hci_conn_del(hcon); - hci_dev_unlock(hdev); - return; + goto unlock; } bredr_hcon = hcon->amp_mgr->l2cap_conn->hcon; @@ -4961,6 +4980,7 @@ static void hci_phy_link_complete_evt(struct hci_dev *hdev, amp_physical_cfm(bredr_hcon, hcon); +unlock: hci_dev_unlock(hdev); } @@ -5868,21 +5888,19 @@ static void hci_le_direct_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) { u8 num_reports = skb->data[0]; - void *ptr = &skb->data[1]; + struct hci_ev_le_direct_adv_info *ev = (void *)&skb->data[1]; - hci_dev_lock(hdev); + if (!num_reports || skb->len < num_reports * sizeof(*ev) + 1) + return; - while (num_reports--) { - struct hci_ev_le_direct_adv_info *ev = ptr; + hci_dev_lock(hdev); + for (; num_reports; num_reports--, ev++) process_adv_report(hdev, ev->evt_type, &ev->bdaddr, ev->bdaddr_type, &ev->direct_addr, ev->direct_addr_type, ev->rssi, NULL, 0, false); - ptr += sizeof(*ev); - } - hci_dev_unlock(hdev); } diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 6f12bab4d2fa..e55976db4403 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -29,6 +29,7 @@ #include "smp.h" #include "hci_request.h" +#include "msft.h" #define HCI_REQ_DONE 0 #define HCI_REQ_PEND 1 @@ -58,7 +59,7 @@ static int req_run(struct hci_request *req, hci_req_complete_t complete, struct sk_buff *skb; unsigned long flags; - BT_DBG("length %u", skb_queue_len(&req->cmd_q)); + bt_dev_dbg(hdev, "length %u", skb_queue_len(&req->cmd_q)); /* If an error occurred during request building, remove all HCI * commands queued on the HCI request queue. 
@@ -102,7 +103,7 @@ int hci_req_run_skb(struct hci_request *req, hci_req_complete_skb_t complete) static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, struct sk_buff *skb) { - BT_DBG("%s result 0x%2.2x", hdev->name, result); + bt_dev_dbg(hdev, "result 0x%2.2x", result); if (hdev->req_status == HCI_REQ_PEND) { hdev->req_result = result; @@ -115,7 +116,7 @@ static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, void hci_req_sync_cancel(struct hci_dev *hdev, int err) { - BT_DBG("%s err 0x%2.2x", hdev->name, err); + bt_dev_dbg(hdev, "err 0x%2.2x", err); if (hdev->req_status == HCI_REQ_PEND) { hdev->req_result = err; @@ -131,7 +132,7 @@ struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, struct sk_buff *skb; int err = 0; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, ""); hci_req_init(&req, hdev); @@ -167,7 +168,7 @@ struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, skb = hdev->req_skb; hdev->req_skb = NULL; - BT_DBG("%s end: err %d", hdev->name, err); + bt_dev_dbg(hdev, "end: err %d", err); if (err < 0) { kfree_skb(skb); @@ -196,7 +197,7 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req, struct hci_request req; int err = 0; - BT_DBG("%s start", hdev->name); + bt_dev_dbg(hdev, "start"); hci_req_init(&req, hdev); @@ -260,7 +261,7 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req, hdev->req_skb = NULL; hdev->req_status = hdev->req_result = 0; - BT_DBG("%s end: err %d", hdev->name, err); + bt_dev_dbg(hdev, "end: err %d", err); return err; } @@ -300,7 +301,7 @@ struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen, if (plen) skb_put_data(skb, param, plen); - BT_DBG("skb len %d", skb->len); + bt_dev_dbg(hdev, "skb len %d", skb->len); hci_skb_pkt_type(skb) = HCI_COMMAND_PKT; hci_skb_opcode(skb) = opcode; @@ -315,7 +316,7 @@ void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen, 
struct hci_dev *hdev = req->hdev; struct sk_buff *skb; - BT_DBG("%s opcode 0x%4.4x plen %d", hdev->name, opcode, plen); + bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen); /* If an error occurred during request building, there is no point in * queueing the HCI command. We can simply return. @@ -378,6 +379,58 @@ void __hci_req_write_fast_connectable(struct hci_request *req, bool enable) hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_TYPE, 1, &type); } +static void start_interleave_scan(struct hci_dev *hdev) +{ + hdev->interleave_scan_state = INTERLEAVE_SCAN_NO_FILTER; + queue_delayed_work(hdev->req_workqueue, + &hdev->interleave_scan, 0); +} + +static bool is_interleave_scanning(struct hci_dev *hdev) +{ + return hdev->interleave_scan_state != INTERLEAVE_SCAN_NONE; +} + +static void cancel_interleave_scan(struct hci_dev *hdev) +{ + bt_dev_dbg(hdev, "cancelling interleave scan"); + + cancel_delayed_work_sync(&hdev->interleave_scan); + + hdev->interleave_scan_state = INTERLEAVE_SCAN_NONE; +} + +/* Return true if interleave_scan wasn't started until exiting this function, + * otherwise, return false + */ +static bool __hci_update_interleaved_scan(struct hci_dev *hdev) +{ + /* Do interleaved scan only if all of the following are true: + * - There is at least one ADV monitor + * - At least one pending LE connection or one device to be scanned for + * - Monitor offloading is not supported + * If so, we should alternate between allowlist scan and one without + * any filters to save power. 
+ */ + bool use_interleaving = hci_is_adv_monitoring(hdev) && + !(list_empty(&hdev->pend_le_conns) && + list_empty(&hdev->pend_le_reports)) && + hci_get_adv_monitor_offload_ext(hdev) == + HCI_ADV_MONITOR_EXT_NONE; + bool is_interleaving = is_interleave_scanning(hdev); + + if (use_interleaving && !is_interleaving) { + start_interleave_scan(hdev); + bt_dev_dbg(hdev, "starting interleave scan"); + return true; + } + + if (!use_interleaving && is_interleaving) + cancel_interleave_scan(hdev); + + return false; +} + /* This function controls the background scanning based on hdev->pend_le_conns * list. If there are pending LE connection we start the background scanning, * otherwise we stop it. @@ -413,8 +466,8 @@ static void __hci_update_background_scan(struct hci_request *req) */ hci_discovery_filter_clear(hdev); - BT_DBG("%s ADV monitoring is %s", hdev->name, - hci_is_adv_monitoring(hdev) ? "on" : "off"); + bt_dev_dbg(hdev, "ADV monitoring is %s", + hci_is_adv_monitoring(hdev) ? "on" : "off"); if (list_empty(&hdev->pend_le_conns) && list_empty(&hdev->pend_le_reports) && @@ -430,7 +483,7 @@ static void __hci_update_background_scan(struct hci_request *req) hci_req_add_le_scan_disable(req, false); - BT_DBG("%s stopping background scanning", hdev->name); + bt_dev_dbg(hdev, "stopping background scanning"); } else { /* If there is at least one pending LE connection, we should * keep the background scan running. 
@@ -450,8 +503,7 @@ static void __hci_update_background_scan(struct hci_request *req) hci_req_add_le_scan_disable(req, false); hci_req_add_le_passive_scan(req); - - BT_DBG("%s starting background scanning", hdev->name); + bt_dev_dbg(hdev, "starting background scanning"); } } @@ -661,6 +713,9 @@ void hci_req_add_le_scan_disable(struct hci_request *req, bool rpa_le_conn) return; } + if (hdev->suspended) + set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks); + if (use_ext_scan(hdev)) { struct hci_cp_le_set_ext_scan_enable cp; @@ -698,7 +753,8 @@ static void del_from_white_list(struct hci_request *req, bdaddr_t *bdaddr, cp.bdaddr_type); hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST, sizeof(cp), &cp); - if (use_ll_privacy(req->hdev)) { + if (use_ll_privacy(req->hdev) && + hci_dev_test_flag(req->hdev, HCI_ENABLE_LL_PRIVACY)) { struct smp_irk *irk; irk = hci_find_irk_by_addr(req->hdev, bdaddr, bdaddr_type); @@ -732,7 +788,8 @@ static int add_to_white_list(struct hci_request *req, return -1; /* White list can not be used with RPAs */ - if (!allow_rpa && !use_ll_privacy(hdev) && + if (!allow_rpa && + !hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) && hci_find_irk_by_addr(hdev, &params->addr, params->addr_type)) { return -1; } @@ -750,7 +807,8 @@ static int add_to_white_list(struct hci_request *req, cp.bdaddr_type); hci_req_add(req, HCI_OP_LE_ADD_TO_WHITE_LIST, sizeof(cp), &cp); - if (use_ll_privacy(hdev)) { + if (use_ll_privacy(hdev) && + hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) { struct smp_irk *irk; irk = hci_find_irk_by_addr(hdev, &params->addr, @@ -812,7 +870,8 @@ static u8 update_white_list(struct hci_request *req) } /* White list can not be used with RPAs */ - if (!allow_rpa && !use_ll_privacy(hdev) && + if (!allow_rpa && + !hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) && hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) { return 0x00; } @@ -844,12 +903,14 @@ static u8 update_white_list(struct hci_request *req) return 0x00; } - /* Once the controller offloading
of advertisement monitor is in place, - * the if condition should include the support of MSFT extension - * support. If suspend is ongoing, whitelist should be the default to - * prevent waking by random advertisements. + /* Use the allowlist unless the following conditions are all true: + * - We are not currently suspending + * - There are 1 or more ADV monitors registered and it's not offloaded + * - Interleaved scanning is not currently using the allowlist */ - if (!idr_is_empty(&hdev->adv_monitors_idr) && !hdev->suspended) + if (!idr_is_empty(&hdev->adv_monitors_idr) && !hdev->suspended && + hci_get_adv_monitor_offload_ext(hdev) == HCI_ADV_MONITOR_EXT_NONE && + hdev->interleave_scan_state != INTERLEAVE_SCAN_ALLOWLIST) return 0x00; /* Select filter policy to use white list */ @@ -1002,6 +1063,11 @@ void hci_req_add_le_passive_scan(struct hci_request *req) &own_addr_type)) return; + if (hdev->enable_advmon_interleave_scan && + __hci_update_interleaved_scan(hdev)) + return; + + bt_dev_dbg(hdev, "interleave state %d", hdev->interleave_scan_state); /* Adding or removing entries from the white list must * happen before enabling scanning. The controller does * not allow white list modification while scanning. 
@@ -1024,6 +1090,8 @@ void hci_req_add_le_passive_scan(struct hci_request *req) if (hdev->suspended) { window = hdev->le_scan_window_suspend; interval = hdev->le_scan_int_suspend; + + set_bit(SUSPEND_SCAN_ENABLE, hdev->suspend_tasks); } else if (hci_is_le_conn_scanning(hdev)) { window = hdev->le_scan_window_connect; interval = hdev->le_scan_int_connect; @@ -1040,22 +1108,23 @@ void hci_req_add_le_passive_scan(struct hci_request *req) own_addr_type, filter_policy, addr_resolv); } -static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance) +static bool adv_instance_is_scannable(struct hci_dev *hdev, u8 instance) { struct adv_info *adv_instance; /* Instance 0x00 always set local name */ if (instance == 0x00) - return 1; + return true; adv_instance = hci_find_adv_instance(hdev, instance); if (!adv_instance) - return 0; + return false; - /* TODO: Take into account the "appearance" and "local-name" flags here. - * These are currently being ignored as they are not supported. - */ - return adv_instance->scan_rsp_len; + if (adv_instance->flags & MGMT_ADV_FLAG_APPEARANCE || + adv_instance->flags & MGMT_ADV_FLAG_LOCAL_NAME) + return true; + + return adv_instance->scan_rsp_len ? 
true : false; } static void hci_req_clear_event_filter(struct hci_request *req) @@ -1098,20 +1167,12 @@ static void hci_req_set_event_filter(struct hci_request *req) scan = SCAN_PAGE; } - hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); -} - -static void hci_req_config_le_suspend_scan(struct hci_request *req) -{ - /* Before changing params disable scan if enabled */ - if (hci_dev_test_flag(req->hdev, HCI_LE_SCAN)) - hci_req_add_le_scan_disable(req, false); - - /* Configure params and enable scanning */ - hci_req_add_le_passive_scan(req); + if (scan) + set_bit(SUSPEND_SCAN_ENABLE, hdev->suspend_tasks); + else + set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks); - /* Block suspend notifier on response */ - set_bit(SUSPEND_SCAN_ENABLE, req->hdev->suspend_tasks); + hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); } static void cancel_adv_timeout(struct hci_dev *hdev) @@ -1123,9 +1184,9 @@ static void cancel_adv_timeout(struct hci_dev *hdev) } /* This function requires the caller holds hdev->lock */ -static void hci_suspend_adv_instances(struct hci_request *req) +void __hci_req_pause_adv_instances(struct hci_request *req) { - bt_dev_dbg(req->hdev, "Suspending advertising instances"); + bt_dev_dbg(req->hdev, "Pausing advertising instances"); /* Call to disable any advertisements active on the controller. * This will succeed even if no advertisements are configured. 
@@ -1138,7 +1199,7 @@ static void hci_suspend_adv_instances(struct hci_request *req) } /* This function requires the caller holds hdev->lock */ -static void hci_resume_adv_instances(struct hci_request *req) +static void __hci_req_resume_adv_instances(struct hci_request *req) { struct adv_info *adv; @@ -1161,16 +1222,52 @@ static void hci_resume_adv_instances(struct hci_request *req) } } +/* This function requires the caller holds hdev->lock */ +int hci_req_resume_adv_instances(struct hci_dev *hdev) +{ + struct hci_request req; + + hci_req_init(&req, hdev); + __hci_req_resume_adv_instances(&req); + + return hci_req_run(&req, NULL); +} + static void suspend_req_complete(struct hci_dev *hdev, u8 status, u16 opcode) { bt_dev_dbg(hdev, "Request complete opcode=0x%x, status=0x%x", opcode, status); - if (test_and_clear_bit(SUSPEND_SCAN_ENABLE, hdev->suspend_tasks) || - test_and_clear_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks)) { + if (test_bit(SUSPEND_SCAN_ENABLE, hdev->suspend_tasks) || + test_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks)) { + clear_bit(SUSPEND_SCAN_ENABLE, hdev->suspend_tasks); + clear_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks); + wake_up(&hdev->suspend_wait_q); + } + + if (test_bit(SUSPEND_SET_ADV_FILTER, hdev->suspend_tasks)) { + clear_bit(SUSPEND_SET_ADV_FILTER, hdev->suspend_tasks); wake_up(&hdev->suspend_wait_q); } } +static void hci_req_add_set_adv_filter_enable(struct hci_request *req, + bool enable) +{ + struct hci_dev *hdev = req->hdev; + + switch (hci_get_adv_monitor_offload_ext(hdev)) { + case HCI_ADV_MONITOR_EXT_MSFT: + msft_req_add_set_filter_enable(req, enable); + break; + default: + return; + } + + /* No need to block when enabling since it's on resume path */ + if (hdev->suspended && !enable) + set_bit(SUSPEND_SET_ADV_FILTER, hdev->suspend_tasks); +} + /* Call with hci_dev_lock */ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) { @@ -1214,7 +1311,7 @@ void hci_req_prepare_suspend(struct hci_dev 
*hdev, enum suspended_state next) /* Pause other advertisements */ if (hdev->adv_instance_cnt) - hci_suspend_adv_instances(&req); + __hci_req_pause_adv_instances(&req); hdev->advertising_paused = true; hdev->advertising_old_state = old_state; @@ -1223,8 +1320,13 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &page_scan); /* Disable LE passive scan if enabled */ - if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) + if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { + cancel_interleave_scan(hdev); hci_req_add_le_scan_disable(&req, false); + } + + /* Disable advertisement filters */ + hci_req_add_set_adv_filter_enable(&req, false); /* Mark task needing completion */ set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks); @@ -1254,7 +1356,7 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) /* Enable event filter for paired devices */ hci_req_set_event_filter(&req); /* Enable passive scan at lower duty cycle */ - hci_req_config_le_suspend_scan(&req); + __hci_update_background_scan(&req); /* Pause scan changes again. 
*/ hdev->scanning_paused = true; hci_req_run(&req, suspend_req_complete); @@ -1264,7 +1366,9 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) hci_req_clear_event_filter(&req); /* Reset passive/background scanning to normal */ - hci_req_config_le_suspend_scan(&req); + __hci_update_background_scan(&req); + /* Enable all of the advertisement filters */ + hci_req_add_set_adv_filter_enable(&req, true); /* Unpause directed advertising */ hdev->advertising_paused = false; @@ -1279,7 +1383,7 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) /* Resume other advertisements */ if (hdev->adv_instance_cnt) - hci_resume_adv_instances(&req); + __hci_req_resume_adv_instances(&req); /* Unpause discovery */ hdev->discovery_paused = false; @@ -1300,23 +1404,9 @@ done: wake_up(&hdev->suspend_wait_q); } -static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) +static bool adv_cur_instance_is_scannable(struct hci_dev *hdev) { - u8 instance = hdev->cur_adv_instance; - struct adv_info *adv_instance; - - /* Instance 0x00 always set local name */ - if (instance == 0x00) - return 1; - - adv_instance = hci_find_adv_instance(hdev, instance); - if (!adv_instance) - return 0; - - /* TODO: Take into account the "appearance" and "local-name" flags here. - * These are currently being ignored as they are not supported. 
- */ - return adv_instance->scan_rsp_len; + return adv_instance_is_scannable(hdev, hdev->cur_adv_instance); } void __hci_req_disable_advertising(struct hci_request *req) @@ -1428,6 +1518,7 @@ static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable) void __hci_req_enable_advertising(struct hci_request *req) { struct hci_dev *hdev = req->hdev; + struct adv_info *adv_instance; struct hci_cp_le_set_adv_param cp; u8 own_addr_type, enable = 0x01; bool connectable; @@ -1435,6 +1526,7 @@ void __hci_req_enable_advertising(struct hci_request *req) u32 flags; flags = get_adv_instance_flags(hdev, hdev->cur_adv_instance); + adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance); /* If the "connectable" instance flag was not set, then choose between * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. @@ -1466,13 +1558,18 @@ void __hci_req_enable_advertising(struct hci_request *req) memset(&cp, 0, sizeof(cp)); - if (connectable) { - cp.type = LE_ADV_IND; - + if (adv_instance) { + adv_min_interval = adv_instance->min_interval; + adv_max_interval = adv_instance->max_interval; + } else { adv_min_interval = hdev->le_adv_min_interval; adv_max_interval = hdev->le_adv_max_interval; + } + + if (connectable) { + cp.type = LE_ADV_IND; } else { - if (get_cur_adv_instance_scan_rsp_len(hdev)) + if (adv_cur_instance_is_scannable(hdev)) cp.type = LE_ADV_SCAN_IND; else cp.type = LE_ADV_NONCONN_IND; @@ -1481,9 +1578,6 @@ void __hci_req_enable_advertising(struct hci_request *req) hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) { adv_min_interval = DISCOV_LE_FAST_ADV_INT_MIN; adv_max_interval = DISCOV_LE_FAST_ADV_INT_MAX; - } else { - adv_min_interval = hdev->le_adv_min_interval; - adv_max_interval = hdev->le_adv_max_interval; } } @@ -1591,14 +1685,11 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) memset(&cp, 0, sizeof(cp)); - /* Extended scan response data doesn't allow a response to be - * set if the instance isn't 
scannable. - */ - if (get_adv_instance_scan_rsp_len(hdev, instance)) + if (instance) len = create_instance_scan_rsp_data(hdev, instance, cp.data); else - len = 0; + len = create_default_scan_rsp_data(hdev, cp.data); if (hdev->scan_rsp_data_len == len && !memcmp(cp.data, hdev->scan_rsp_data, len)) @@ -1811,7 +1902,7 @@ void hci_req_disable_address_resolution(struct hci_dev *hdev) static void adv_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - BT_DBG("%s status %u", hdev->name, status); + bt_dev_dbg(hdev, "status %u", status); } void hci_req_reenable_advertising(struct hci_dev *hdev) @@ -1848,7 +1939,7 @@ static void adv_timeout_expire(struct work_struct *work) struct hci_request req; u8 instance; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, ""); hci_dev_lock(hdev); @@ -1871,6 +1962,62 @@ unlock: hci_dev_unlock(hdev); } +static int hci_req_add_le_interleaved_scan(struct hci_request *req, + unsigned long opt) +{ + struct hci_dev *hdev = req->hdev; + int ret = 0; + + hci_dev_lock(hdev); + + if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) + hci_req_add_le_scan_disable(req, false); + hci_req_add_le_passive_scan(req); + + switch (hdev->interleave_scan_state) { + case INTERLEAVE_SCAN_ALLOWLIST: + bt_dev_dbg(hdev, "next state: allowlist"); + hdev->interleave_scan_state = INTERLEAVE_SCAN_NO_FILTER; + break; + case INTERLEAVE_SCAN_NO_FILTER: + bt_dev_dbg(hdev, "next state: no filter"); + hdev->interleave_scan_state = INTERLEAVE_SCAN_ALLOWLIST; + break; + case INTERLEAVE_SCAN_NONE: + BT_ERR("unexpected error"); + ret = -1; + } + + hci_dev_unlock(hdev); + + return ret; +} + +static void interleave_scan_work(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, + interleave_scan.work); + u8 status; + unsigned long timeout; + + if (hdev->interleave_scan_state == INTERLEAVE_SCAN_ALLOWLIST) { + timeout = msecs_to_jiffies(hdev->advmon_allowlist_duration); + } else if (hdev->interleave_scan_state == INTERLEAVE_SCAN_NO_FILTER) { + 
timeout = msecs_to_jiffies(hdev->advmon_no_filter_duration); + } else { + bt_dev_err(hdev, "unexpected error"); + return; + } + + hci_req_sync(hdev, hci_req_add_le_interleaved_scan, 0, + HCI_CMD_TIMEOUT, &status); + + /* Don't continue interleaving if it was canceled */ + if (is_interleave_scanning(hdev)) + queue_delayed_work(hdev->req_workqueue, + &hdev->interleave_scan, timeout); +} + int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, bool use_rpa, struct adv_info *adv_instance, u8 *own_addr_type, bdaddr_t *rand_addr) @@ -2006,9 +2153,15 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) memset(&cp, 0, sizeof(cp)); - /* In ext adv set param interval is 3 octets */ - hci_cpu_to_le24(hdev->le_adv_min_interval, cp.min_interval); - hci_cpu_to_le24(hdev->le_adv_max_interval, cp.max_interval); + if (adv_instance) { + hci_cpu_to_le24(adv_instance->min_interval, cp.min_interval); + hci_cpu_to_le24(adv_instance->max_interval, cp.max_interval); + cp.tx_power = adv_instance->tx_power; + } else { + hci_cpu_to_le24(hdev->le_adv_min_interval, cp.min_interval); + hci_cpu_to_le24(hdev->le_adv_max_interval, cp.max_interval); + cp.tx_power = HCI_ADV_TX_POWER_NO_PREFERENCE; + } secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK); @@ -2017,7 +2170,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND); else cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND); - } else if (get_adv_instance_scan_rsp_len(hdev, instance)) { + } else if (adv_instance_is_scannable(hdev, instance)) { if (secondary_adv) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND); else @@ -2031,7 +2184,6 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) cp.own_addr_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; - cp.tx_power = 127; cp.handle = instance; if (flags & MGMT_ADV_FLAG_SEC_2M) { @@ -2332,7 +2484,7 @@ static void set_random_addr(struct 
hci_request *req, bdaddr_t *rpa) */ if (hci_dev_test_flag(hdev, HCI_LE_ADV) || hci_lookup_le_connect(hdev)) { - BT_DBG("Deferring random address update"); + bt_dev_dbg(hdev, "Deferring random address update"); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); return; } @@ -2557,7 +2709,7 @@ void __hci_req_update_class(struct hci_request *req) struct hci_dev *hdev = req->hdev; u8 cod[3]; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, ""); if (!hdev_is_powered(hdev)) return; @@ -2726,7 +2878,7 @@ void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn, static void abort_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode) { if (status) - BT_DBG("Failed to abort connection: status 0x%2.2x", status); + bt_dev_dbg(hdev, "Failed to abort connection: status 0x%2.2x", status); } int hci_abort_conn(struct hci_conn *conn, u8 reason) @@ -2789,7 +2941,7 @@ static int bredr_inquiry(struct hci_request *req, unsigned long opt) const u8 liac[3] = { 0x00, 0x8b, 0x9e }; struct hci_cp_inquiry cp; - BT_DBG("%s", req->hdev->name); + bt_dev_dbg(req->hdev, ""); hci_dev_lock(req->hdev); hci_inquiry_cache_flush(req->hdev); @@ -2815,7 +2967,7 @@ static void le_scan_disable_work(struct work_struct *work) le_scan_disable.work); u8 status; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, ""); if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) return; @@ -2911,7 +3063,7 @@ static void le_scan_restart_work(struct work_struct *work) unsigned long timeout, duration, scan_start, now; u8 status; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, ""); hci_req_sync(hdev, le_scan_restart, 0, HCI_CMD_TIMEOUT, &status); if (status) { @@ -2965,14 +3117,16 @@ static int active_scan(struct hci_request *req, unsigned long opt) bool addr_resolv = false; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, ""); /* If controller is scanning, it means the background scanning is * running. Thus, we should temporarily stop it in order to set the * discovery scanning parameters. 
*/ - if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) + if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { hci_req_add_le_scan_disable(req, false); + cancel_interleave_scan(hdev); + } /* All active scans will be done with either a resolvable private * address (when privacy feature has been enabled) or non-resolvable @@ -2993,7 +3147,7 @@ static int interleaved_discov(struct hci_request *req, unsigned long opt) { int err; - BT_DBG("%s", req->hdev->name); + bt_dev_dbg(req->hdev, ""); err = active_scan(req, opt); if (err) @@ -3006,7 +3160,7 @@ static void start_discovery(struct hci_dev *hdev, u8 *status) { unsigned long timeout; - BT_DBG("%s type %u", hdev->name, hdev->discovery.type); + bt_dev_dbg(hdev, "type %u", hdev->discovery.type); switch (hdev->discovery.type) { case DISCOV_TYPE_BREDR: @@ -3054,7 +3208,7 @@ static void start_discovery(struct hci_dev *hdev, u8 *status) if (*status) return; - BT_DBG("%s timeout %u ms", hdev->name, jiffies_to_msecs(timeout)); + bt_dev_dbg(hdev, "timeout %u ms", jiffies_to_msecs(timeout)); /* When service discovery is used and the controller has a * strict duplicate filter, it is important to remember the @@ -3079,7 +3233,7 @@ bool hci_req_stop_discovery(struct hci_request *req) struct inquiry_entry *e; bool ret = false; - BT_DBG("%s state %u", hdev->name, hdev->discovery.state); + bt_dev_dbg(hdev, "state %u", hdev->discovery.state); if (d->state == DISCOVERY_FINDING || d->state == DISCOVERY_STOPPING) { if (test_bit(HCI_INQUIRY, &hdev->flags)) @@ -3159,7 +3313,7 @@ static void discov_off(struct work_struct *work) struct hci_dev *hdev = container_of(work, struct hci_dev, discov_off.work); - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, ""); hci_dev_lock(hdev); @@ -3298,6 +3452,7 @@ void hci_request_setup(struct hci_dev *hdev) INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work); INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart_work); INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire); + 
INIT_DELAYED_WORK(&hdev->interleave_scan, interleave_scan_work); } void hci_request_cancel_all(struct hci_dev *hdev) @@ -3317,4 +3472,6 @@ void hci_request_cancel_all(struct hci_dev *hdev) cancel_delayed_work_sync(&hdev->adv_instance_expire); hdev->adv_instance_timeout = 0; } + + cancel_interleave_scan(hdev); } diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 6a12e84c66c4..39ee8a18087a 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -71,6 +71,8 @@ void hci_req_add_le_passive_scan(struct hci_request *req); void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next); void hci_req_disable_address_resolution(struct hci_dev *hdev); +void __hci_req_pause_adv_instances(struct hci_request *req); +int hci_req_resume_adv_instances(struct hci_dev *hdev); void hci_req_reenable_advertising(struct hci_dev *hdev); void __hci_req_enable_advertising(struct hci_request *req); void __hci_req_disable_advertising(struct hci_request *req); diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 3b4fa27a44e6..0db48c812662 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -1290,7 +1290,7 @@ static int hidp_session_thread(void *arg) /* cleanup runtime environment */ remove_wait_queue(sk_sleep(session->intr_sock->sk), &intr_wait); - remove_wait_queue(sk_sleep(session->intr_sock->sk), &ctrl_wait); + remove_wait_queue(sk_sleep(session->ctrl_sock->sk), &ctrl_wait); wake_up_interruptible(&session->report_queue); hidp_del_timer(session); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 1ab27b90ddcb..72c2f5226d67 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1515,8 +1515,14 @@ static bool l2cap_check_enc_key_size(struct hci_conn *hcon) * that have no key size requirements. Ensure that the link is * actually encrypted before enforcing a key size. 
*/ + int min_key_size = hcon->hdev->min_enc_key_size; + + /* On FIPS security level, key size must be 16 bytes */ + if (hcon->sec_level == BT_SECURITY_FIPS) + min_key_size = 16; + return (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags) || - hcon->enc_key_size >= hcon->hdev->min_enc_key_size); + hcon->enc_key_size >= min_key_size); } static void l2cap_do_start(struct l2cap_chan *chan) @@ -3627,7 +3633,7 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data if (hint) break; result = L2CAP_CONF_UNKNOWN; - *((u8 *) ptr++) = type; + l2cap_add_conf_opt(&ptr, (u8)type, sizeof(u8), type, endptr - ptr); break; } } @@ -4513,6 +4519,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, } goto done; + case L2CAP_CONF_UNKNOWN: case L2CAP_CONF_UNACCEPT: if (chan->num_conf_rsp <= L2CAP_CONF_MAX_CONF_RSP) { char req[64]; @@ -8270,10 +8277,73 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) mutex_unlock(&conn->chan_lock); } +/* Append fragment into frame respecting the maximum len of rx_skb */ +static int l2cap_recv_frag(struct l2cap_conn *conn, struct sk_buff *skb, + u16 len) +{ + if (!conn->rx_skb) { + /* Allocate skb for the complete frame (with header) */ + conn->rx_skb = bt_skb_alloc(len, GFP_KERNEL); + if (!conn->rx_skb) + return -ENOMEM; + /* Init rx_len */ + conn->rx_len = len; + } + + /* Copy as much as the rx_skb can hold */ + len = min_t(u16, len, skb->len); + skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, len), len); + skb_pull(skb, len); + conn->rx_len -= len; + + return len; +} + +static int l2cap_recv_len(struct l2cap_conn *conn, struct sk_buff *skb) +{ + struct sk_buff *rx_skb; + int len; + + /* Append just enough to complete the header */ + len = l2cap_recv_frag(conn, skb, L2CAP_LEN_SIZE - conn->rx_skb->len); + + /* If header could not be read just continue */ + if (len < 0 || conn->rx_skb->len < L2CAP_LEN_SIZE) + return len; + + rx_skb = conn->rx_skb; + len = get_unaligned_le16(rx_skb->data); 
+ + /* Check if rx_skb has enough space to received all fragments */ + if (len + (L2CAP_HDR_SIZE - L2CAP_LEN_SIZE) <= skb_tailroom(rx_skb)) { + /* Update expected len */ + conn->rx_len = len + (L2CAP_HDR_SIZE - L2CAP_LEN_SIZE); + return L2CAP_LEN_SIZE; + } + + /* Reset conn->rx_skb since it will need to be reallocated in order to + * fit all fragments. + */ + conn->rx_skb = NULL; + + /* Reallocates rx_skb using the exact expected length */ + len = l2cap_recv_frag(conn, rx_skb, + len + (L2CAP_HDR_SIZE - L2CAP_LEN_SIZE)); + kfree_skb(rx_skb); + + return len; +} + +static void l2cap_recv_reset(struct l2cap_conn *conn) +{ + kfree_skb(conn->rx_skb); + conn->rx_skb = NULL; + conn->rx_len = 0; +} + void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) { struct l2cap_conn *conn = hcon->l2cap_data; - struct l2cap_hdr *hdr; int len; /* For AMP controller do not create l2cap conn */ @@ -8292,23 +8362,23 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) case ACL_START: case ACL_START_NO_FLUSH: case ACL_COMPLETE: - if (conn->rx_len) { + if (conn->rx_skb) { BT_ERR("Unexpected start frame (len %d)", skb->len); - kfree_skb(conn->rx_skb); - conn->rx_skb = NULL; - conn->rx_len = 0; + l2cap_recv_reset(conn); l2cap_conn_unreliable(conn, ECOMM); } - /* Start fragment always begin with Basic L2CAP header */ - if (skb->len < L2CAP_HDR_SIZE) { - BT_ERR("Frame is too short (len %d)", skb->len); - l2cap_conn_unreliable(conn, ECOMM); - goto drop; + /* Start fragment may not contain the L2CAP length so just + * copy the initial byte when that happens and use conn->mtu as + * expected length. 
+ */ + if (skb->len < L2CAP_LEN_SIZE) { + if (l2cap_recv_frag(conn, skb, conn->mtu) < 0) + goto drop; + return; } - hdr = (struct l2cap_hdr *) skb->data; - len = __le16_to_cpu(hdr->len) + L2CAP_HDR_SIZE; + len = get_unaligned_le16(skb->data) + L2CAP_HDR_SIZE; if (len == skb->len) { /* Complete frame received */ @@ -8325,38 +8395,43 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) goto drop; } - /* Allocate skb for the complete frame (with header) */ - conn->rx_skb = bt_skb_alloc(len, GFP_KERNEL); - if (!conn->rx_skb) + /* Append fragment into frame (with header) */ + if (l2cap_recv_frag(conn, skb, len) < 0) goto drop; - skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), - skb->len); - conn->rx_len = len - skb->len; break; case ACL_CONT: BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len); - if (!conn->rx_len) { + if (!conn->rx_skb) { BT_ERR("Unexpected continuation frame (len %d)", skb->len); l2cap_conn_unreliable(conn, ECOMM); goto drop; } + /* Complete the L2CAP length if it has not been read */ + if (conn->rx_skb->len < L2CAP_LEN_SIZE) { + if (l2cap_recv_len(conn, skb) < 0) { + l2cap_conn_unreliable(conn, ECOMM); + goto drop; + } + + /* Header still could not be read just continue */ + if (conn->rx_skb->len < L2CAP_LEN_SIZE) + return; + } + if (skb->len > conn->rx_len) { BT_ERR("Fragment is too long (len %d, expected %d)", skb->len, conn->rx_len); - kfree_skb(conn->rx_skb); - conn->rx_skb = NULL; - conn->rx_len = 0; + l2cap_recv_reset(conn); l2cap_conn_unreliable(conn, ECOMM); goto drop; } - skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), - skb->len); - conn->rx_len -= skb->len; + /* Append fragment into frame (with header) */ + l2cap_recv_frag(conn, skb, skb->len); if (!conn->rx_len) { /* Complete frame received. 
l2cap_recv_frame diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 12d7b368b428..74971b4bd457 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -40,7 +40,7 @@ #include "msft.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 18 +#define MGMT_REVISION 19 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -110,7 +110,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_SET_APPEARANCE, MGMT_OP_SET_BLOCKED_KEYS, MGMT_OP_SET_WIDEBAND_SPEECH, - MGMT_OP_READ_SECURITY_INFO, + MGMT_OP_READ_CONTROLLER_CAP, MGMT_OP_READ_EXP_FEATURES_INFO, MGMT_OP_SET_EXP_FEATURE, MGMT_OP_READ_DEF_SYSTEM_CONFIG, @@ -122,6 +122,9 @@ static const u16 mgmt_commands[] = { MGMT_OP_READ_ADV_MONITOR_FEATURES, MGMT_OP_ADD_ADV_PATTERNS_MONITOR, MGMT_OP_REMOVE_ADV_MONITOR, + MGMT_OP_ADD_EXT_ADV_PARAMS, + MGMT_OP_ADD_EXT_ADV_DATA, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, }; static const u16 mgmt_events[] = { @@ -174,7 +177,7 @@ static const u16 mgmt_untrusted_commands[] = { MGMT_OP_READ_CONFIG_INFO, MGMT_OP_READ_EXT_INDEX_LIST, MGMT_OP_READ_EXT_INFO, - MGMT_OP_READ_SECURITY_INFO, + MGMT_OP_READ_CONTROLLER_CAP, MGMT_OP_READ_EXP_FEATURES_INFO, MGMT_OP_READ_DEF_SYSTEM_CONFIG, MGMT_OP_READ_DEF_RUNTIME_CONFIG, @@ -3387,7 +3390,7 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { - struct mgmt_rp_get_phy_confguration rp; + struct mgmt_rp_get_phy_configuration rp; bt_dev_dbg(hdev, "sock %p", sk); @@ -3451,7 +3454,7 @@ unlock: static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { - struct mgmt_cp_set_phy_confguration *cp = data; + struct mgmt_cp_set_phy_configuration *cp = data; struct hci_cp_le_set_default_phy cp_phy; struct mgmt_pending_cmd *cmd; struct hci_request req; @@ -3708,13 +3711,14 @@ unlock: return err; } -static int read_security_info(struct sock *sk, struct hci_dev *hdev, - void *data, u16 data_len) 
+static int read_controller_cap(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) { - char buf[16]; - struct mgmt_rp_read_security_info *rp = (void *)buf; - u16 sec_len = 0; + char buf[20]; + struct mgmt_rp_read_controller_cap *rp = (void *)buf; + u16 cap_len = 0; u8 flags = 0; + u8 tx_power_range[2]; bt_dev_dbg(hdev, "sock %p", sk); @@ -3738,23 +3742,37 @@ static int read_security_info(struct sock *sk, struct hci_dev *hdev, flags |= 0x08; /* Encryption key size enforcement (LE) */ - sec_len = eir_append_data(rp->sec, sec_len, 0x01, &flags, 1); + cap_len = eir_append_data(rp->cap, cap_len, MGMT_CAP_SEC_FLAGS, + &flags, 1); /* When the Read Simple Pairing Options command is supported, then * also max encryption key size information is provided. */ if (hdev->commands[41] & 0x08) - sec_len = eir_append_le16(rp->sec, sec_len, 0x02, + cap_len = eir_append_le16(rp->cap, cap_len, + MGMT_CAP_MAX_ENC_KEY_SIZE, hdev->max_enc_key_size); - sec_len = eir_append_le16(rp->sec, sec_len, 0x03, SMP_MAX_ENC_KEY_SIZE); + cap_len = eir_append_le16(rp->cap, cap_len, + MGMT_CAP_SMP_MAX_ENC_KEY_SIZE, + SMP_MAX_ENC_KEY_SIZE); - rp->sec_len = cpu_to_le16(sec_len); + /* Append the min/max LE tx power parameters if we were able to fetch + * it from the controller + */ + if (hdev->commands[38] & 0x80) { + memcpy(&tx_power_range[0], &hdev->min_le_tx_power, 1); + memcpy(&tx_power_range[1], &hdev->max_le_tx_power, 1); + cap_len = eir_append_data(rp->cap, cap_len, MGMT_CAP_LE_TX_PWR, + tx_power_range, 2); + } + + rp->cap_len = cpu_to_le16(cap_len); hci_dev_unlock(hdev); - return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_SECURITY_INFO, 0, - rp, sizeof(*rp) + sec_len); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_CONTROLLER_CAP, 0, + rp, sizeof(*rp) + cap_len); } #ifdef CONFIG_BT_FEATURE_DEBUG @@ -4149,14 +4167,24 @@ static void mgmt_adv_monitor_added(struct sock *sk, struct hci_dev *hdev, mgmt_event(MGMT_EV_ADV_MONITOR_ADDED, hdev, &ev, sizeof(ev), sk); } -static void 
mgmt_adv_monitor_removed(struct sock *sk, struct hci_dev *hdev, - u16 handle) +void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle) { - struct mgmt_ev_adv_monitor_added ev; + struct mgmt_ev_adv_monitor_removed ev; + struct mgmt_pending_cmd *cmd; + struct sock *sk_skip = NULL; + struct mgmt_cp_remove_adv_monitor *cp; + + cmd = pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev); + if (cmd) { + cp = cmd->param; + + if (cp->monitor_handle) + sk_skip = cmd->sk; + } ev.monitor_handle = cpu_to_le16(handle); - mgmt_event(MGMT_EV_ADV_MONITOR_REMOVED, hdev, &ev, sizeof(ev), sk); + mgmt_event(MGMT_EV_ADV_MONITOR_REMOVED, hdev, &ev, sizeof(ev), sk_skip); } static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev, @@ -4167,6 +4195,7 @@ static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev, int handle, err; size_t rp_size = 0; __u32 supported = 0; + __u32 enabled = 0; __u16 num_handles = 0; __u16 handles[HCI_MAX_ADV_MONITOR_NUM_HANDLES]; @@ -4174,12 +4203,11 @@ static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); - if (msft_get_features(hdev) & MSFT_FEATURE_MASK_LE_ADV_MONITOR) + if (msft_monitor_supported(hdev)) supported |= MGMT_ADV_MONITOR_FEATURE_MASK_OR_PATTERNS; - idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle) { + idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle) handles[num_handles++] = monitor->handle; - } hci_dev_unlock(hdev); @@ -4188,11 +4216,11 @@ static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev, if (!rp) return -ENOMEM; - /* Once controller-based monitoring is in place, the enabled_features - * should reflect the use. 
- */ + /* All supported features are currently enabled */ + enabled = supported; + rp->supported_features = cpu_to_le32(supported); - rp->enabled_features = 0; + rp->enabled_features = cpu_to_le32(enabled); rp->max_num_handles = cpu_to_le16(HCI_MAX_ADV_MONITOR_NUM_HANDLES); rp->max_num_patterns = HCI_MAX_ADV_MONITOR_NUM_PATTERNS; rp->num_handles = cpu_to_le16(num_handles); @@ -4208,105 +4236,267 @@ static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev, return err; } +int mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status) +{ + struct mgmt_rp_add_adv_patterns_monitor rp; + struct mgmt_pending_cmd *cmd; + struct adv_monitor *monitor; + int err = 0; + + hci_dev_lock(hdev); + + cmd = pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev); + if (!cmd) { + cmd = pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev); + if (!cmd) + goto done; + } + + monitor = cmd->user_data; + rp.monitor_handle = cpu_to_le16(monitor->handle); + + if (!status) { + mgmt_adv_monitor_added(cmd->sk, hdev, monitor->handle); + hdev->adv_monitors_cnt++; + if (monitor->state == ADV_MONITOR_STATE_NOT_REGISTERED) + monitor->state = ADV_MONITOR_STATE_REGISTERED; + hci_update_background_scan(hdev); + } + + err = mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_status(status), &rp, sizeof(rp)); + mgmt_pending_remove(cmd); + +done: + hci_dev_unlock(hdev); + bt_dev_dbg(hdev, "add monitor %d complete, status %d", + rp.monitor_handle, status); + + return err; +} + +static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, + struct adv_monitor *m, u8 status, + void *data, u16 len, u16 op) +{ + struct mgmt_rp_add_adv_patterns_monitor rp; + struct mgmt_pending_cmd *cmd; + int err; + bool pending; + + hci_dev_lock(hdev); + + if (status) + goto unlock; + + if (pending_find(MGMT_OP_SET_LE, hdev) || + pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev) || + pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev) || + 
pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev)) { + status = MGMT_STATUS_BUSY; + goto unlock; + } + + cmd = mgmt_pending_add(sk, op, hdev, data, len); + if (!cmd) { + status = MGMT_STATUS_NO_RESOURCES; + goto unlock; + } + + cmd->user_data = m; + pending = hci_add_adv_monitor(hdev, m, &err); + if (err) { + if (err == -ENOSPC || err == -ENOMEM) + status = MGMT_STATUS_NO_RESOURCES; + else if (err == -EINVAL) + status = MGMT_STATUS_INVALID_PARAMS; + else + status = MGMT_STATUS_FAILED; + + mgmt_pending_remove(cmd); + goto unlock; + } + + if (!pending) { + mgmt_pending_remove(cmd); + rp.monitor_handle = cpu_to_le16(m->handle); + mgmt_adv_monitor_added(sk, hdev, m->handle); + m->state = ADV_MONITOR_STATE_REGISTERED; + hdev->adv_monitors_cnt++; + + hci_dev_unlock(hdev); + return mgmt_cmd_complete(sk, hdev->id, op, MGMT_STATUS_SUCCESS, + &rp, sizeof(rp)); + } + + hci_dev_unlock(hdev); + + return 0; + +unlock: + hci_free_adv_monitor(hdev, m); + hci_dev_unlock(hdev); + return mgmt_cmd_status(sk, hdev->id, op, status); +} + +static void parse_adv_monitor_rssi(struct adv_monitor *m, + struct mgmt_adv_rssi_thresholds *rssi) +{ + if (rssi) { + m->rssi.low_threshold = rssi->low_threshold; + m->rssi.low_threshold_timeout = + __le16_to_cpu(rssi->low_threshold_timeout); + m->rssi.high_threshold = rssi->high_threshold; + m->rssi.high_threshold_timeout = + __le16_to_cpu(rssi->high_threshold_timeout); + m->rssi.sampling_period = rssi->sampling_period; + } else { + /* Default values. These numbers are the least constricting + * parameters for MSFT API to work, so it behaves as if there + * are no rssi parameter to consider. May need to be changed + * if other API are to be supported. 
+ */ + m->rssi.low_threshold = -127; + m->rssi.low_threshold_timeout = 60; + m->rssi.high_threshold = -127; + m->rssi.high_threshold_timeout = 0; + m->rssi.sampling_period = 0; + } +} + +static u8 parse_adv_monitor_pattern(struct adv_monitor *m, u8 pattern_count, + struct mgmt_adv_pattern *patterns) +{ + u8 offset = 0, length = 0; + struct adv_pattern *p = NULL; + int i; + + for (i = 0; i < pattern_count; i++) { + offset = patterns[i].offset; + length = patterns[i].length; + if (offset >= HCI_MAX_AD_LENGTH || + length > HCI_MAX_AD_LENGTH || + (offset + length) > HCI_MAX_AD_LENGTH) + return MGMT_STATUS_INVALID_PARAMS; + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return MGMT_STATUS_NO_RESOURCES; + + p->ad_type = patterns[i].ad_type; + p->offset = patterns[i].offset; + p->length = patterns[i].length; + memcpy(p->value, patterns[i].value, p->length); + + INIT_LIST_HEAD(&p->list); + list_add(&p->list, &m->patterns); + } + + return MGMT_STATUS_SUCCESS; +} + static int add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_add_adv_patterns_monitor *cp = data; - struct mgmt_rp_add_adv_patterns_monitor rp; struct adv_monitor *m = NULL; - struct adv_pattern *p = NULL; - unsigned int mp_cnt = 0, prev_adv_monitors_cnt; - __u8 cp_ofst = 0, cp_len = 0; - int err, i; + u8 status = MGMT_STATUS_SUCCESS; + size_t expected_size = sizeof(*cp); BT_DBG("request for %s", hdev->name); - if (len <= sizeof(*cp) || cp->pattern_count == 0) { - err = mgmt_cmd_status(sk, hdev->id, - MGMT_OP_ADD_ADV_PATTERNS_MONITOR, - MGMT_STATUS_INVALID_PARAMS); - goto failed; + if (len <= sizeof(*cp)) { + status = MGMT_STATUS_INVALID_PARAMS; + goto done; } - m = kmalloc(sizeof(*m), GFP_KERNEL); + expected_size += cp->pattern_count * sizeof(struct mgmt_adv_pattern); + if (len != expected_size) { + status = MGMT_STATUS_INVALID_PARAMS; + goto done; + } + + m = kzalloc(sizeof(*m), GFP_KERNEL); if (!m) { - err = -ENOMEM; - goto failed; + status = 
MGMT_STATUS_NO_RESOURCES; + goto done; } INIT_LIST_HEAD(&m->patterns); - m->active = false; - for (i = 0; i < cp->pattern_count; i++) { - if (++mp_cnt > HCI_MAX_ADV_MONITOR_NUM_PATTERNS) { - err = mgmt_cmd_status(sk, hdev->id, - MGMT_OP_ADD_ADV_PATTERNS_MONITOR, - MGMT_STATUS_INVALID_PARAMS); - goto failed; - } + parse_adv_monitor_rssi(m, NULL); + status = parse_adv_monitor_pattern(m, cp->pattern_count, cp->patterns); - cp_ofst = cp->patterns[i].offset; - cp_len = cp->patterns[i].length; - if (cp_ofst >= HCI_MAX_AD_LENGTH || - cp_len > HCI_MAX_AD_LENGTH || - (cp_ofst + cp_len) > HCI_MAX_AD_LENGTH) { - err = mgmt_cmd_status(sk, hdev->id, - MGMT_OP_ADD_ADV_PATTERNS_MONITOR, - MGMT_STATUS_INVALID_PARAMS); - goto failed; - } +done: + return __add_adv_patterns_monitor(sk, hdev, m, status, data, len, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR); +} - p = kmalloc(sizeof(*p), GFP_KERNEL); - if (!p) { - err = -ENOMEM; - goto failed; - } +static int add_adv_patterns_monitor_rssi(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_cp_add_adv_patterns_monitor_rssi *cp = data; + struct adv_monitor *m = NULL; + u8 status = MGMT_STATUS_SUCCESS; + size_t expected_size = sizeof(*cp); - p->ad_type = cp->patterns[i].ad_type; - p->offset = cp->patterns[i].offset; - p->length = cp->patterns[i].length; - memcpy(p->value, cp->patterns[i].value, p->length); + BT_DBG("request for %s", hdev->name); - INIT_LIST_HEAD(&p->list); - list_add(&p->list, &m->patterns); + if (len <= sizeof(*cp)) { + status = MGMT_STATUS_INVALID_PARAMS; + goto done; } - if (mp_cnt != cp->pattern_count) { - err = mgmt_cmd_status(sk, hdev->id, - MGMT_OP_ADD_ADV_PATTERNS_MONITOR, - MGMT_STATUS_INVALID_PARAMS); - goto failed; + expected_size += cp->pattern_count * sizeof(struct mgmt_adv_pattern); + if (len != expected_size) { + status = MGMT_STATUS_INVALID_PARAMS; + goto done; } - hci_dev_lock(hdev); + m = kzalloc(sizeof(*m), GFP_KERNEL); + if (!m) { + status = MGMT_STATUS_NO_RESOURCES; + goto done; + } 
- prev_adv_monitors_cnt = hdev->adv_monitors_cnt; + INIT_LIST_HEAD(&m->patterns); - err = hci_add_adv_monitor(hdev, m); - if (err) { - if (err == -ENOSPC) { - mgmt_cmd_status(sk, hdev->id, - MGMT_OP_ADD_ADV_PATTERNS_MONITOR, - MGMT_STATUS_NO_RESOURCES); - } - goto unlock; - } + parse_adv_monitor_rssi(m, &cp->rssi); + status = parse_adv_monitor_pattern(m, cp->pattern_count, cp->patterns); - if (hdev->adv_monitors_cnt > prev_adv_monitors_cnt) - mgmt_adv_monitor_added(sk, hdev, m->handle); +done: + return __add_adv_patterns_monitor(sk, hdev, m, status, data, len, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI); +} - hci_dev_unlock(hdev); +int mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status) +{ + struct mgmt_rp_remove_adv_monitor rp; + struct mgmt_cp_remove_adv_monitor *cp; + struct mgmt_pending_cmd *cmd; + int err = 0; - rp.monitor_handle = cpu_to_le16(m->handle); + hci_dev_lock(hdev); - return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_ADV_PATTERNS_MONITOR, - MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); + cmd = pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev); + if (!cmd) + goto done; -unlock: + cp = cmd->param; + rp.monitor_handle = cp->monitor_handle; + + if (!status) + hci_update_background_scan(hdev); + + err = mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_status(status), &rp, sizeof(rp)); + mgmt_pending_remove(cmd); + +done: hci_dev_unlock(hdev); + bt_dev_dbg(hdev, "remove monitor %d complete, status %d", + rp.monitor_handle, status); -failed: - hci_free_adv_monitor(m); return err; } @@ -4315,37 +4505,64 @@ static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev, { struct mgmt_cp_remove_adv_monitor *cp = data; struct mgmt_rp_remove_adv_monitor rp; - unsigned int prev_adv_monitors_cnt; - u16 handle; - int err; + struct mgmt_pending_cmd *cmd; + u16 handle = __le16_to_cpu(cp->monitor_handle); + int err, status; + bool pending; BT_DBG("request for %s", hdev->name); + rp.monitor_handle = cp->monitor_handle; hci_dev_lock(hdev); - handle = 
__le16_to_cpu(cp->monitor_handle); - prev_adv_monitors_cnt = hdev->adv_monitors_cnt; + if (pending_find(MGMT_OP_SET_LE, hdev) || + pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev) || + pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev) || + pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev)) { + status = MGMT_STATUS_BUSY; + goto unlock; + } - err = hci_remove_adv_monitor(hdev, handle); - if (err == -ENOENT) { - err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADV_MONITOR, - MGMT_STATUS_INVALID_INDEX); + cmd = mgmt_pending_add(sk, MGMT_OP_REMOVE_ADV_MONITOR, hdev, data, len); + if (!cmd) { + status = MGMT_STATUS_NO_RESOURCES; goto unlock; } - if (hdev->adv_monitors_cnt < prev_adv_monitors_cnt) - mgmt_adv_monitor_removed(sk, hdev, handle); + if (handle) + pending = hci_remove_single_adv_monitor(hdev, handle, &err); + else + pending = hci_remove_all_adv_monitor(hdev, &err); - hci_dev_unlock(hdev); + if (err) { + mgmt_pending_remove(cmd); - rp.monitor_handle = cp->monitor_handle; + if (err == -ENOENT) + status = MGMT_STATUS_INVALID_INDEX; + else + status = MGMT_STATUS_FAILED; - return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_ADV_MONITOR, - MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); + goto unlock; + } + + /* monitor can be removed without forwarding request to controller */ + if (!pending) { + mgmt_pending_remove(cmd); + hci_dev_unlock(hdev); + + return mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_REMOVE_ADV_MONITOR, + MGMT_STATUS_SUCCESS, + &rp, sizeof(rp)); + } + + hci_dev_unlock(hdev); + return 0; unlock: hci_dev_unlock(hdev); - return err; + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADV_MONITOR, + status); } static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, @@ -4781,6 +4998,14 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, goto failed; } + if (hdev->discovery_paused) { + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_START_SERVICE_DISCOVERY, + MGMT_STATUS_BUSY, &cp->type, + sizeof(cp->type)); + 
goto failed; + } + uuid_count = __le16_to_cpu(cp->uuid_count); if (uuid_count > max_uuid_count) { bt_dev_err(hdev, "service_discovery: too big uuid_count value %u", @@ -7203,6 +7428,10 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev) flags |= MGMT_ADV_FLAG_MANAGED_FLAGS; flags |= MGMT_ADV_FLAG_APPEARANCE; flags |= MGMT_ADV_FLAG_LOCAL_NAME; + flags |= MGMT_ADV_PARAM_DURATION; + flags |= MGMT_ADV_PARAM_TIMEOUT; + flags |= MGMT_ADV_PARAM_INTERVALS; + flags |= MGMT_ADV_PARAM_TX_POWER; /* In extended adv TX_POWER returned from Set Adv Param * will be always valid. @@ -7377,6 +7606,31 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, return true; } +static bool requested_adv_flags_are_valid(struct hci_dev *hdev, u32 adv_flags) +{ + u32 supported_flags, phy_flags; + + /* The current implementation only supports a subset of the specified + * flags. Also need to check mutual exclusiveness of sec flags. + */ + supported_flags = get_supported_adv_flags(hdev); + phy_flags = adv_flags & MGMT_ADV_FLAG_SEC_MASK; + if (adv_flags & ~supported_flags || + ((phy_flags && (phy_flags ^ (phy_flags & -phy_flags))))) + return false; + + return true; +} + +static bool adv_busy(struct hci_dev *hdev) +{ + return (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) || + pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) || + pending_find(MGMT_OP_SET_LE, hdev) || + pending_find(MGMT_OP_ADD_EXT_ADV_PARAMS, hdev) || + pending_find(MGMT_OP_ADD_EXT_ADV_DATA, hdev)); +} + static void add_advertising_complete(struct hci_dev *hdev, u8 status, u16 opcode) { @@ -7391,6 +7645,8 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status, hci_dev_lock(hdev); cmd = pending_find(MGMT_OP_ADD_ADVERTISING, hdev); + if (!cmd) + cmd = pending_find(MGMT_OP_ADD_EXT_ADV_DATA, hdev); list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { if (!adv_instance->pending) @@ -7435,7 +7691,6 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, struct 
mgmt_cp_add_advertising *cp = data; struct mgmt_rp_add_advertising rp; u32 flags; - u32 supported_flags, phy_flags; u8 status; u16 timeout, duration; unsigned int prev_instance_cnt = hdev->adv_instance_cnt; @@ -7471,13 +7726,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, timeout = __le16_to_cpu(cp->timeout); duration = __le16_to_cpu(cp->duration); - /* The current implementation only supports a subset of the specified - * flags. Also need to check mutual exclusiveness of sec flags. - */ - supported_flags = get_supported_adv_flags(hdev); - phy_flags = flags & MGMT_ADV_FLAG_SEC_MASK; - if (flags & ~supported_flags || - ((phy_flags && (phy_flags ^ (phy_flags & -phy_flags))))) + if (!requested_adv_flags_are_valid(hdev, flags)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); @@ -7489,9 +7738,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) || - pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) || - pending_find(MGMT_OP_SET_LE, hdev)) { + if (adv_busy(hdev)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_BUSY); goto unlock; @@ -7509,7 +7756,10 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, cp->adv_data_len, cp->data, cp->scan_rsp_len, cp->data + cp->adv_data_len, - timeout, duration); + timeout, duration, + HCI_ADV_TX_POWER_NO_PREFERENCE, + hdev->le_adv_min_interval, + hdev->le_adv_max_interval); if (err < 0) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_FAILED); @@ -7582,6 +7832,338 @@ unlock: return err; } +static void add_ext_adv_params_complete(struct hci_dev *hdev, u8 status, + u16 opcode) +{ + struct mgmt_pending_cmd *cmd; + struct mgmt_cp_add_ext_adv_params *cp; + struct mgmt_rp_add_ext_adv_params rp; + struct adv_info *adv_instance; + u32 flags; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + cmd = 
pending_find(MGMT_OP_ADD_EXT_ADV_PARAMS, hdev); + if (!cmd) + goto unlock; + + cp = cmd->param; + adv_instance = hci_find_adv_instance(hdev, cp->instance); + if (!adv_instance) + goto unlock; + + rp.instance = cp->instance; + rp.tx_power = adv_instance->tx_power; + + /* While we're at it, inform userspace of the available space for this + * advertisement, given the flags that will be used. + */ + flags = __le32_to_cpu(cp->flags); + rp.max_adv_data_len = tlv_data_max_len(hdev, flags, true); + rp.max_scan_rsp_len = tlv_data_max_len(hdev, flags, false); + + if (status) { + /* If this advertisement was previously advertising and we + * failed to update it, we signal that it has been removed and + * delete its structure + */ + if (!adv_instance->pending) + mgmt_advertising_removed(cmd->sk, hdev, cp->instance); + + hci_remove_adv_instance(hdev, cp->instance); + + mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, + mgmt_status(status)); + + } else { + mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_status(status), &rp, sizeof(rp)); + } + +unlock: + if (cmd) + mgmt_pending_remove(cmd); + + hci_dev_unlock(hdev); +} + +static int add_ext_adv_params(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_cp_add_ext_adv_params *cp = data; + struct mgmt_rp_add_ext_adv_params rp; + struct mgmt_pending_cmd *cmd = NULL; + struct adv_info *adv_instance; + struct hci_request req; + u32 flags, min_interval, max_interval; + u16 timeout, duration; + u8 status; + s8 tx_power; + int err; + + BT_DBG("%s", hdev->name); + + status = mgmt_le_support(hdev); + if (status) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS, + status); + + if (cp->instance < 1 || cp->instance > hdev->le_num_of_adv_sets) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS, + MGMT_STATUS_INVALID_PARAMS); + + /* The purpose of breaking add_advertising into two separate MGMT calls + * for params and data is to allow more parameters to be added to 
this + * structure in the future. For this reason, we verify that we have the + * bare minimum structure we know of when the interface was defined. Any + * extra parameters we don't know about will be ignored in this request. + */ + if (data_len < MGMT_ADD_EXT_ADV_PARAMS_MIN_SIZE) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, + MGMT_STATUS_INVALID_PARAMS); + + flags = __le32_to_cpu(cp->flags); + + if (!requested_adv_flags_are_valid(hdev, flags)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS, + MGMT_STATUS_INVALID_PARAMS); + + hci_dev_lock(hdev); + + /* In new interface, we require that we are powered to register */ + if (!hdev_is_powered(hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS, + MGMT_STATUS_REJECTED); + goto unlock; + } + + if (adv_busy(hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS, + MGMT_STATUS_BUSY); + goto unlock; + } + + /* Parse defined parameters from request, use defaults otherwise */ + timeout = (flags & MGMT_ADV_PARAM_TIMEOUT) ? + __le16_to_cpu(cp->timeout) : 0; + + duration = (flags & MGMT_ADV_PARAM_DURATION) ? + __le16_to_cpu(cp->duration) : + hdev->def_multi_adv_rotation_duration; + + min_interval = (flags & MGMT_ADV_PARAM_INTERVALS) ? + __le32_to_cpu(cp->min_interval) : + hdev->le_adv_min_interval; + + max_interval = (flags & MGMT_ADV_PARAM_INTERVALS) ? + __le32_to_cpu(cp->max_interval) : + hdev->le_adv_max_interval; + + tx_power = (flags & MGMT_ADV_PARAM_TX_POWER) ? 
+ cp->tx_power : + HCI_ADV_TX_POWER_NO_PREFERENCE; + + /* Create advertising instance with no advertising or response data */ + err = hci_add_adv_instance(hdev, cp->instance, flags, + 0, NULL, 0, NULL, timeout, duration, + tx_power, min_interval, max_interval); + + if (err < 0) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS, + MGMT_STATUS_FAILED); + goto unlock; + } + + hdev->cur_adv_instance = cp->instance; + /* Submit request for advertising params if ext adv available */ + if (ext_adv_capable(hdev)) { + hci_req_init(&req, hdev); + adv_instance = hci_find_adv_instance(hdev, cp->instance); + + /* Updating parameters of an active instance will return a + * Command Disallowed error, so we must first disable the + * instance if it is active. + */ + if (!adv_instance->pending) + __hci_req_disable_ext_adv_instance(&req, cp->instance); + + __hci_req_setup_ext_adv_instance(&req, cp->instance); + + err = hci_req_run(&req, add_ext_adv_params_complete); + + if (!err) + cmd = mgmt_pending_add(sk, MGMT_OP_ADD_EXT_ADV_PARAMS, + hdev, data, data_len); + if (!cmd) { + err = -ENOMEM; + hci_remove_adv_instance(hdev, cp->instance); + goto unlock; + } + + } else { + rp.instance = cp->instance; + rp.tx_power = HCI_ADV_TX_POWER_NO_PREFERENCE; + rp.max_adv_data_len = tlv_data_max_len(hdev, flags, true); + rp.max_scan_rsp_len = tlv_data_max_len(hdev, flags, false); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_ADD_EXT_ADV_PARAMS, + MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); + } + +unlock: + hci_dev_unlock(hdev); + + return err; +} + +static int add_ext_adv_data(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len) +{ + struct mgmt_cp_add_ext_adv_data *cp = data; + struct mgmt_rp_add_ext_adv_data rp; + u8 schedule_instance = 0; + struct adv_info *next_instance; + struct adv_info *adv_instance; + int err = 0; + struct mgmt_pending_cmd *cmd; + struct hci_request req; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + adv_instance = 
hci_find_adv_instance(hdev, cp->instance); + + if (!adv_instance) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA, + MGMT_STATUS_INVALID_PARAMS); + goto unlock; + } + + /* In new interface, we require that we are powered to register */ + if (!hdev_is_powered(hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA, + MGMT_STATUS_REJECTED); + goto clear_new_instance; + } + + if (adv_busy(hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA, + MGMT_STATUS_BUSY); + goto clear_new_instance; + } + + /* Validate new data */ + if (!tlv_data_is_valid(hdev, adv_instance->flags, cp->data, + cp->adv_data_len, true) || + !tlv_data_is_valid(hdev, adv_instance->flags, cp->data + + cp->adv_data_len, cp->scan_rsp_len, false)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA, + MGMT_STATUS_INVALID_PARAMS); + goto clear_new_instance; + } + + /* Set the data in the advertising instance */ + hci_set_adv_instance_data(hdev, cp->instance, cp->adv_data_len, + cp->data, cp->scan_rsp_len, + cp->data + cp->adv_data_len); + + /* We're good to go, update advertising data, parameters, and start + * advertising. + */ + + hci_req_init(&req, hdev); + + hci_req_add(&req, HCI_OP_READ_LOCAL_NAME, 0, NULL); + + if (ext_adv_capable(hdev)) { + __hci_req_update_adv_data(&req, cp->instance); + __hci_req_update_scan_rsp_data(&req, cp->instance); + __hci_req_enable_ext_advertising(&req, cp->instance); + + } else { + /* If using software rotation, determine next instance to use */ + + if (hdev->cur_adv_instance == cp->instance) { + /* If the currently advertised instance is being changed + * then cancel the current advertising and schedule the + * next instance. 
If there is only one instance then the + * overridden advertising data will be visible right + * away + */ + cancel_adv_timeout(hdev); + + next_instance = hci_get_next_instance(hdev, + cp->instance); + if (next_instance) + schedule_instance = next_instance->instance; + } else if (!hdev->adv_instance_timeout) { + /* Immediately advertise the new instance if no other + * instance is currently being advertised. + */ + schedule_instance = cp->instance; + } + + /* If the HCI_ADVERTISING flag is set or there is no instance to + * be advertised then we have no HCI communication to make. + * Simply return. + */ + if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || + !schedule_instance) { + if (adv_instance->pending) { + mgmt_advertising_added(sk, hdev, cp->instance); + adv_instance->pending = false; + } + rp.instance = cp->instance; + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_ADD_EXT_ADV_DATA, + MGMT_STATUS_SUCCESS, &rp, + sizeof(rp)); + goto unlock; + } + + err = __hci_req_schedule_adv_instance(&req, schedule_instance, + true); + } + + cmd = mgmt_pending_add(sk, MGMT_OP_ADD_EXT_ADV_DATA, hdev, data, + data_len); + if (!cmd) { + err = -ENOMEM; + goto clear_new_instance; + } + + if (!err) + err = hci_req_run(&req, add_advertising_complete); + + if (err < 0) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA, + MGMT_STATUS_FAILED); + mgmt_pending_remove(cmd); + goto clear_new_instance; + } + + /* We were successful in updating data, so trigger advertising_added + * event if this is an instance that wasn't previously advertising. 
If + * a failure occurs in the requests we initiated, we will remove the + * instance again in add_advertising_complete + */ + if (adv_instance->pending) + mgmt_advertising_added(sk, hdev, cp->instance); + + goto unlock; + +clear_new_instance: + hci_remove_adv_instance(hdev, cp->instance); + +unlock: + hci_dev_unlock(hdev); + + return err; +} + static void remove_advertising_complete(struct hci_dev *hdev, u8 status, u16 opcode) { @@ -7834,7 +8416,7 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { set_blocked_keys, MGMT_OP_SET_BLOCKED_KEYS_SIZE, HCI_MGMT_VAR_LEN }, { set_wideband_speech, MGMT_SETTING_SIZE }, - { read_security_info, MGMT_READ_SECURITY_INFO_SIZE, + { read_controller_cap, MGMT_READ_CONTROLLER_CAP_SIZE, HCI_MGMT_UNTRUSTED }, { read_exp_features_info, MGMT_READ_EXP_FEATURES_INFO_SIZE, HCI_MGMT_UNTRUSTED | @@ -7856,6 +8438,13 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { add_adv_patterns_monitor,MGMT_ADD_ADV_PATTERNS_MONITOR_SIZE, HCI_MGMT_VAR_LEN }, { remove_adv_monitor, MGMT_REMOVE_ADV_MONITOR_SIZE }, + { add_ext_adv_params, MGMT_ADD_EXT_ADV_PARAMS_MIN_SIZE, + HCI_MGMT_VAR_LEN }, + { add_ext_adv_data, MGMT_ADD_EXT_ADV_DATA_SIZE, + HCI_MGMT_VAR_LEN }, + { add_adv_patterns_monitor_rssi, + MGMT_ADD_ADV_PATTERNS_MONITOR_RSSI_SIZE, + HCI_MGMT_VAR_LEN }, }; void mgmt_index_added(struct hci_dev *hdev) diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c index b30b571f8caf..1deb0ca7a929 100644 --- a/net/bluetooth/mgmt_config.c +++ b/net/bluetooth/mgmt_config.c @@ -11,74 +11,119 @@ #include "mgmt_util.h" #include "mgmt_config.h" -#define HDEV_PARAM_U16(_param_code_, _param_name_) \ -{ \ - { cpu_to_le16(_param_code_), sizeof(__u16) }, \ - { cpu_to_le16(hdev->_param_name_) } \ -} +#define HDEV_PARAM_U16(_param_name_) \ + struct {\ + struct mgmt_tlv entry; \ + __le16 value; \ + } __packed _param_name_ -#define HDEV_PARAM_U16_JIFFIES_TO_MSECS(_param_code_, _param_name_) \ -{ \ - { cpu_to_le16(_param_code_), 
sizeof(__u16) }, \ - { cpu_to_le16(jiffies_to_msecs(hdev->_param_name_)) } \ -} +#define HDEV_PARAM_U8(_param_name_) \ + struct {\ + struct mgmt_tlv entry; \ + __u8 value; \ + } __packed _param_name_ + +#define TLV_SET_U16(_param_code_, _param_name_) \ + { \ + { cpu_to_le16(_param_code_), sizeof(__u16) }, \ + cpu_to_le16(hdev->_param_name_) \ + } + +#define TLV_SET_U8(_param_code_, _param_name_) \ + { \ + { cpu_to_le16(_param_code_), sizeof(__u8) }, \ + hdev->_param_name_ \ + } + +#define TLV_SET_U16_JIFFIES_TO_MSECS(_param_code_, _param_name_) \ + { \ + { cpu_to_le16(_param_code_), sizeof(__u16) }, \ + cpu_to_le16(jiffies_to_msecs(hdev->_param_name_)) \ + } int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { - struct { - struct mgmt_tlv entry; - union { - /* This is a simplification for now since all values - * are 16 bits. In the future, this code may need - * refactoring to account for variable length values - * and properly calculate the required buffer size. 
- */ - __le16 value; - }; - } __packed params[] = { + int ret; + struct mgmt_rp_read_def_system_config { /* Please see mgmt-api.txt for documentation of these values */ - HDEV_PARAM_U16(0x0000, def_page_scan_type), - HDEV_PARAM_U16(0x0001, def_page_scan_int), - HDEV_PARAM_U16(0x0002, def_page_scan_window), - HDEV_PARAM_U16(0x0003, def_inq_scan_type), - HDEV_PARAM_U16(0x0004, def_inq_scan_int), - HDEV_PARAM_U16(0x0005, def_inq_scan_window), - HDEV_PARAM_U16(0x0006, def_br_lsto), - HDEV_PARAM_U16(0x0007, def_page_timeout), - HDEV_PARAM_U16(0x0008, sniff_min_interval), - HDEV_PARAM_U16(0x0009, sniff_max_interval), - HDEV_PARAM_U16(0x000a, le_adv_min_interval), - HDEV_PARAM_U16(0x000b, le_adv_max_interval), - HDEV_PARAM_U16(0x000c, def_multi_adv_rotation_duration), - HDEV_PARAM_U16(0x000d, le_scan_interval), - HDEV_PARAM_U16(0x000e, le_scan_window), - HDEV_PARAM_U16(0x000f, le_scan_int_suspend), - HDEV_PARAM_U16(0x0010, le_scan_window_suspend), - HDEV_PARAM_U16(0x0011, le_scan_int_discovery), - HDEV_PARAM_U16(0x0012, le_scan_window_discovery), - HDEV_PARAM_U16(0x0013, le_scan_int_adv_monitor), - HDEV_PARAM_U16(0x0014, le_scan_window_adv_monitor), - HDEV_PARAM_U16(0x0015, le_scan_int_connect), - HDEV_PARAM_U16(0x0016, le_scan_window_connect), - HDEV_PARAM_U16(0x0017, le_conn_min_interval), - HDEV_PARAM_U16(0x0018, le_conn_max_interval), - HDEV_PARAM_U16(0x0019, le_conn_latency), - HDEV_PARAM_U16(0x001a, le_supv_timeout), - HDEV_PARAM_U16_JIFFIES_TO_MSECS(0x001b, - def_le_autoconnect_timeout), + HDEV_PARAM_U16(def_page_scan_type); + HDEV_PARAM_U16(def_page_scan_int); + HDEV_PARAM_U16(def_page_scan_window); + HDEV_PARAM_U16(def_inq_scan_type); + HDEV_PARAM_U16(def_inq_scan_int); + HDEV_PARAM_U16(def_inq_scan_window); + HDEV_PARAM_U16(def_br_lsto); + HDEV_PARAM_U16(def_page_timeout); + HDEV_PARAM_U16(sniff_min_interval); + HDEV_PARAM_U16(sniff_max_interval); + HDEV_PARAM_U16(le_adv_min_interval); + HDEV_PARAM_U16(le_adv_max_interval); + 
HDEV_PARAM_U16(def_multi_adv_rotation_duration); + HDEV_PARAM_U16(le_scan_interval); + HDEV_PARAM_U16(le_scan_window); + HDEV_PARAM_U16(le_scan_int_suspend); + HDEV_PARAM_U16(le_scan_window_suspend); + HDEV_PARAM_U16(le_scan_int_discovery); + HDEV_PARAM_U16(le_scan_window_discovery); + HDEV_PARAM_U16(le_scan_int_adv_monitor); + HDEV_PARAM_U16(le_scan_window_adv_monitor); + HDEV_PARAM_U16(le_scan_int_connect); + HDEV_PARAM_U16(le_scan_window_connect); + HDEV_PARAM_U16(le_conn_min_interval); + HDEV_PARAM_U16(le_conn_max_interval); + HDEV_PARAM_U16(le_conn_latency); + HDEV_PARAM_U16(le_supv_timeout); + HDEV_PARAM_U16(def_le_autoconnect_timeout); + HDEV_PARAM_U16(advmon_allowlist_duration); + HDEV_PARAM_U16(advmon_no_filter_duration); + HDEV_PARAM_U8(enable_advmon_interleave_scan); + } __packed rp = { + TLV_SET_U16(0x0000, def_page_scan_type), + TLV_SET_U16(0x0001, def_page_scan_int), + TLV_SET_U16(0x0002, def_page_scan_window), + TLV_SET_U16(0x0003, def_inq_scan_type), + TLV_SET_U16(0x0004, def_inq_scan_int), + TLV_SET_U16(0x0005, def_inq_scan_window), + TLV_SET_U16(0x0006, def_br_lsto), + TLV_SET_U16(0x0007, def_page_timeout), + TLV_SET_U16(0x0008, sniff_min_interval), + TLV_SET_U16(0x0009, sniff_max_interval), + TLV_SET_U16(0x000a, le_adv_min_interval), + TLV_SET_U16(0x000b, le_adv_max_interval), + TLV_SET_U16(0x000c, def_multi_adv_rotation_duration), + TLV_SET_U16(0x000d, le_scan_interval), + TLV_SET_U16(0x000e, le_scan_window), + TLV_SET_U16(0x000f, le_scan_int_suspend), + TLV_SET_U16(0x0010, le_scan_window_suspend), + TLV_SET_U16(0x0011, le_scan_int_discovery), + TLV_SET_U16(0x0012, le_scan_window_discovery), + TLV_SET_U16(0x0013, le_scan_int_adv_monitor), + TLV_SET_U16(0x0014, le_scan_window_adv_monitor), + TLV_SET_U16(0x0015, le_scan_int_connect), + TLV_SET_U16(0x0016, le_scan_window_connect), + TLV_SET_U16(0x0017, le_conn_min_interval), + TLV_SET_U16(0x0018, le_conn_max_interval), + TLV_SET_U16(0x0019, le_conn_latency), + TLV_SET_U16(0x001a, le_supv_timeout), 
+ TLV_SET_U16_JIFFIES_TO_MSECS(0x001b, + def_le_autoconnect_timeout), + TLV_SET_U16(0x001d, advmon_allowlist_duration), + TLV_SET_U16(0x001e, advmon_no_filter_duration), + TLV_SET_U8(0x001f, enable_advmon_interleave_scan), }; - struct mgmt_rp_read_def_system_config *rp = (void *)params; bt_dev_dbg(hdev, "sock %p", sk); - return mgmt_cmd_complete(sk, hdev->id, - MGMT_OP_READ_DEF_SYSTEM_CONFIG, - 0, rp, sizeof(params)); + ret = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_READ_DEF_SYSTEM_CONFIG, + 0, &rp, sizeof(rp)); + return ret; } #define TO_TLV(x) ((struct mgmt_tlv *)(x)) #define TLV_GET_LE16(tlv) le16_to_cpu(*((__le16 *)(TO_TLV(tlv)->value))) +#define TLV_GET_U8(tlv) (*((__u8 *)(TO_TLV(tlv)->value))) int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) @@ -95,6 +140,7 @@ int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, /* First pass to validate the tlv */ while (buffer_left >= sizeof(struct mgmt_tlv)) { const u8 len = TO_TLV(buffer)->length; + size_t exp_type_len; const u16 exp_len = sizeof(struct mgmt_tlv) + len; const u16 type = le16_to_cpu(TO_TLV(buffer)->type); @@ -138,20 +184,28 @@ int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, case 0x0019: case 0x001a: case 0x001b: - if (len != sizeof(u16)) { - bt_dev_warn(hdev, "invalid length %d, exp %zu for type %d", - len, sizeof(u16), type); - - return mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_DEF_SYSTEM_CONFIG, - MGMT_STATUS_INVALID_PARAMS); - } + case 0x001d: + case 0x001e: + exp_type_len = sizeof(u16); + break; + case 0x001f: + exp_type_len = sizeof(u8); break; default: + exp_type_len = 0; bt_dev_warn(hdev, "unsupported parameter %u", type); break; } + if (exp_type_len && len != exp_type_len) { + bt_dev_warn(hdev, "invalid length %d, exp %zu for type %d", + len, exp_type_len, type); + + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_DEF_SYSTEM_CONFIG, + MGMT_STATUS_INVALID_PARAMS); + } + buffer_left -= exp_len; buffer 
+= exp_len; } @@ -251,6 +305,15 @@ int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, hdev->def_le_autoconnect_timeout = msecs_to_jiffies(TLV_GET_LE16(buffer)); break; + case 0x0001d: + hdev->advmon_allowlist_duration = TLV_GET_LE16(buffer); + break; + case 0x0001e: + hdev->advmon_no_filter_duration = TLV_GET_LE16(buffer); + break; + case 0x0001f: + hdev->enable_advmon_interleave_scan = TLV_GET_U8(buffer); + break; default: bt_dev_warn(hdev, "unsupported parameter %u", type); break; diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index 8579bfeb2836..47b104f318e9 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -5,27 +5,106 @@ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> +#include <net/bluetooth/mgmt.h> +#include "hci_request.h" +#include "mgmt_util.h" #include "msft.h" +#define MSFT_RSSI_THRESHOLD_VALUE_MIN -127 +#define MSFT_RSSI_THRESHOLD_VALUE_MAX 20 +#define MSFT_RSSI_LOW_TIMEOUT_MAX 0x3C + #define MSFT_OP_READ_SUPPORTED_FEATURES 0x00 struct msft_cp_read_supported_features { __u8 sub_opcode; } __packed; + struct msft_rp_read_supported_features { __u8 status; __u8 sub_opcode; __le64 features; __u8 evt_prefix_len; - __u8 evt_prefix[0]; + __u8 evt_prefix[]; } __packed; +#define MSFT_OP_LE_MONITOR_ADVERTISEMENT 0x03 +#define MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN 0x01 +struct msft_le_monitor_advertisement_pattern { + __u8 length; + __u8 data_type; + __u8 start_byte; + __u8 pattern[0]; +}; + +struct msft_le_monitor_advertisement_pattern_data { + __u8 count; + __u8 data[0]; +}; + +struct msft_cp_le_monitor_advertisement { + __u8 sub_opcode; + __s8 rssi_high; + __s8 rssi_low; + __u8 rssi_low_interval; + __u8 rssi_sampling_period; + __u8 cond_type; + __u8 data[0]; +} __packed; + +struct msft_rp_le_monitor_advertisement { + __u8 status; + __u8 sub_opcode; + __u8 handle; +} __packed; + +#define MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT 0x04 +struct msft_cp_le_cancel_monitor_advertisement { + 
__u8 sub_opcode; + __u8 handle; +} __packed; + +struct msft_rp_le_cancel_monitor_advertisement { + __u8 status; + __u8 sub_opcode; +} __packed; + +#define MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE 0x05 +struct msft_cp_le_set_advertisement_filter_enable { + __u8 sub_opcode; + __u8 enable; +} __packed; + +struct msft_rp_le_set_advertisement_filter_enable { + __u8 status; + __u8 sub_opcode; +} __packed; + +struct msft_monitor_advertisement_handle_data { + __u8 msft_handle; + __u16 mgmt_handle; + struct list_head list; +}; + struct msft_data { __u64 features; __u8 evt_prefix_len; __u8 *evt_prefix; + struct list_head handle_map; + __u16 pending_add_handle; + __u16 pending_remove_handle; + __u8 reregistering; + __u8 filter_enabled; }; +static int __msft_add_monitor_pattern(struct hci_dev *hdev, + struct adv_monitor *monitor); + +bool msft_monitor_supported(struct hci_dev *hdev) +{ + return !!(msft_get_features(hdev) & MSFT_FEATURE_MASK_LE_ADV_MONITOR); +} + static bool read_supported_features(struct hci_dev *hdev, struct msft_data *msft) { @@ -71,6 +150,35 @@ failed: return false; } +/* This function requires the caller holds hdev->lock */ +static void reregister_monitor_on_restart(struct hci_dev *hdev, int handle) +{ + struct adv_monitor *monitor; + struct msft_data *msft = hdev->msft_data; + int err; + + while (1) { + monitor = idr_get_next(&hdev->adv_monitors_idr, &handle); + if (!monitor) { + /* All monitors have been reregistered */ + msft->reregistering = false; + hci_update_background_scan(hdev); + return; + } + + msft->pending_add_handle = (u16)handle; + err = __msft_add_monitor_pattern(hdev, monitor); + + /* If success, we return and wait for monitor added callback */ + if (!err) + return; + + /* Otherwise remove the monitor and keep registering */ + hci_free_adv_monitor(hdev, monitor); + handle++; + } +} + void msft_do_open(struct hci_dev *hdev) { struct msft_data *msft; @@ -89,12 +197,21 @@ void msft_do_open(struct hci_dev *hdev) return; } + 
INIT_LIST_HEAD(&msft->handle_map); hdev->msft_data = msft; + + if (msft_monitor_supported(hdev)) { + msft->reregistering = true; + msft_set_filter_enable(hdev, true); + reregister_monitor_on_restart(hdev, 0); + } } void msft_do_close(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; + struct msft_monitor_advertisement_handle_data *handle_data, *tmp; + struct adv_monitor *monitor; if (!msft) return; @@ -103,6 +220,17 @@ void msft_do_close(struct hci_dev *hdev) hdev->msft_data = NULL; + list_for_each_entry_safe(handle_data, tmp, &msft->handle_map, list) { + monitor = idr_find(&hdev->adv_monitors_idr, + handle_data->mgmt_handle); + + if (monitor && monitor->state == ADV_MONITOR_STATE_OFFLOADED) + monitor->state = ADV_MONITOR_STATE_REGISTERED; + + list_del(&handle_data->list); + kfree(handle_data); + } + kfree(msft->evt_prefix); kfree(msft); } @@ -144,5 +272,336 @@ __u64 msft_get_features(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; - return msft ? msft->features : 0; + return msft ? msft->features : 0; +} + +/* is_mgmt = true matches the handle exposed to userspace via mgmt. + * is_mgmt = false matches the handle used by the msft controller. 
+ * This function requires the caller holds hdev->lock + */ +static struct msft_monitor_advertisement_handle_data *msft_find_handle_data + (struct hci_dev *hdev, u16 handle, bool is_mgmt) +{ + struct msft_monitor_advertisement_handle_data *entry; + struct msft_data *msft = hdev->msft_data; + + list_for_each_entry(entry, &msft->handle_map, list) { + if (is_mgmt && entry->mgmt_handle == handle) + return entry; + if (!is_mgmt && entry->msft_handle == handle) + return entry; + } + + return NULL; +} + +static void msft_le_monitor_advertisement_cb(struct hci_dev *hdev, + u8 status, u16 opcode, + struct sk_buff *skb) +{ + struct msft_rp_le_monitor_advertisement *rp; + struct adv_monitor *monitor; + struct msft_monitor_advertisement_handle_data *handle_data; + struct msft_data *msft = hdev->msft_data; + + hci_dev_lock(hdev); + + monitor = idr_find(&hdev->adv_monitors_idr, msft->pending_add_handle); + if (!monitor) { + bt_dev_err(hdev, "msft add advmon: monitor %d is not found!", + msft->pending_add_handle); + status = HCI_ERROR_UNSPECIFIED; + goto unlock; + } + + if (status) + goto unlock; + + rp = (struct msft_rp_le_monitor_advertisement *)skb->data; + if (skb->len < sizeof(*rp)) { + status = HCI_ERROR_UNSPECIFIED; + goto unlock; + } + + handle_data = kmalloc(sizeof(*handle_data), GFP_KERNEL); + if (!handle_data) { + status = HCI_ERROR_UNSPECIFIED; + goto unlock; + } + + handle_data->mgmt_handle = monitor->handle; + handle_data->msft_handle = rp->handle; + INIT_LIST_HEAD(&handle_data->list); + list_add(&handle_data->list, &msft->handle_map); + + monitor->state = ADV_MONITOR_STATE_OFFLOADED; + +unlock: + if (status && monitor) + hci_free_adv_monitor(hdev, monitor); + + /* If in restart/reregister sequence, keep registering. 
*/ + if (msft->reregistering) + reregister_monitor_on_restart(hdev, + msft->pending_add_handle + 1); + + hci_dev_unlock(hdev); + + if (!msft->reregistering) + hci_add_adv_patterns_monitor_complete(hdev, status); +} + +static void msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, + u8 status, u16 opcode, + struct sk_buff *skb) +{ + struct msft_cp_le_cancel_monitor_advertisement *cp; + struct msft_rp_le_cancel_monitor_advertisement *rp; + struct adv_monitor *monitor; + struct msft_monitor_advertisement_handle_data *handle_data; + struct msft_data *msft = hdev->msft_data; + int err; + bool pending; + + if (status) + goto done; + + rp = (struct msft_rp_le_cancel_monitor_advertisement *)skb->data; + if (skb->len < sizeof(*rp)) { + status = HCI_ERROR_UNSPECIFIED; + goto done; + } + + hci_dev_lock(hdev); + + cp = hci_sent_cmd_data(hdev, hdev->msft_opcode); + handle_data = msft_find_handle_data(hdev, cp->handle, false); + + if (handle_data) { + monitor = idr_find(&hdev->adv_monitors_idr, + handle_data->mgmt_handle); + if (monitor) + hci_free_adv_monitor(hdev, monitor); + + list_del(&handle_data->list); + kfree(handle_data); + } + + /* If remove all monitors is required, we need to continue the process + * here because the earlier it was paused when waiting for the + * response from controller. 
+ */ + if (msft->pending_remove_handle == 0) { + pending = hci_remove_all_adv_monitor(hdev, &err); + if (pending) { + hci_dev_unlock(hdev); + return; + } + + if (err) + status = HCI_ERROR_UNSPECIFIED; + } + + hci_dev_unlock(hdev); + +done: + hci_remove_adv_monitor_complete(hdev, status); +} + +static void msft_le_set_advertisement_filter_enable_cb(struct hci_dev *hdev, + u8 status, u16 opcode, + struct sk_buff *skb) +{ + struct msft_cp_le_set_advertisement_filter_enable *cp; + struct msft_rp_le_set_advertisement_filter_enable *rp; + struct msft_data *msft = hdev->msft_data; + + rp = (struct msft_rp_le_set_advertisement_filter_enable *)skb->data; + if (skb->len < sizeof(*rp)) + return; + + /* Error 0x0C would be returned if the filter enabled status is + * already set to whatever we were trying to set. + * Although the default state should be disabled, some controller set + * the initial value to enabled. Because there is no way to know the + * actual initial value before sending this command, here we also treat + * error 0x0C as success. + */ + if (status != 0x00 && status != 0x0C) + return; + + hci_dev_lock(hdev); + + cp = hci_sent_cmd_data(hdev, hdev->msft_opcode); + msft->filter_enabled = cp->enable; + + if (status == 0x0C) + bt_dev_warn(hdev, "MSFT filter_enable is already %s", + cp->enable ? "on" : "off"); + + hci_dev_unlock(hdev); +} + +static bool msft_monitor_rssi_valid(struct adv_monitor *monitor) +{ + struct adv_rssi_thresholds *r = &monitor->rssi; + + if (r->high_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN || + r->high_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX || + r->low_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN || + r->low_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX) + return false; + + /* High_threshold_timeout is not supported, + * once high_threshold is reached, events are immediately reported. 
+ */ + if (r->high_threshold_timeout != 0) + return false; + + if (r->low_threshold_timeout > MSFT_RSSI_LOW_TIMEOUT_MAX) + return false; + + /* Sampling period from 0x00 to 0xFF are all allowed */ + return true; +} + +static bool msft_monitor_pattern_valid(struct adv_monitor *monitor) +{ + return msft_monitor_rssi_valid(monitor); + /* No additional check needed for pattern-based monitor */ +} + +/* This function requires the caller holds hdev->lock */ +static int __msft_add_monitor_pattern(struct hci_dev *hdev, + struct adv_monitor *monitor) +{ + struct msft_cp_le_monitor_advertisement *cp; + struct msft_le_monitor_advertisement_pattern_data *pattern_data; + struct msft_le_monitor_advertisement_pattern *pattern; + struct adv_pattern *entry; + struct hci_request req; + struct msft_data *msft = hdev->msft_data; + size_t total_size = sizeof(*cp) + sizeof(*pattern_data); + ptrdiff_t offset = 0; + u8 pattern_count = 0; + int err = 0; + + if (!msft_monitor_pattern_valid(monitor)) + return -EINVAL; + + list_for_each_entry(entry, &monitor->patterns, list) { + pattern_count++; + total_size += sizeof(*pattern) + entry->length; + } + + cp = kmalloc(total_size, GFP_KERNEL); + if (!cp) + return -ENOMEM; + + cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT; + cp->rssi_high = monitor->rssi.high_threshold; + cp->rssi_low = monitor->rssi.low_threshold; + cp->rssi_low_interval = (u8)monitor->rssi.low_threshold_timeout; + cp->rssi_sampling_period = monitor->rssi.sampling_period; + + cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN; + + pattern_data = (void *)cp->data; + pattern_data->count = pattern_count; + + list_for_each_entry(entry, &monitor->patterns, list) { + pattern = (void *)(pattern_data->data + offset); + /* the length also includes data_type and offset */ + pattern->length = entry->length + 2; + pattern->data_type = entry->ad_type; + pattern->start_byte = entry->offset; + memcpy(pattern->pattern, entry->value, entry->length); + offset += sizeof(*pattern) + 
entry->length; + } + + hci_req_init(&req, hdev); + hci_req_add(&req, hdev->msft_opcode, total_size, cp); + err = hci_req_run_skb(&req, msft_le_monitor_advertisement_cb); + kfree(cp); + + if (!err) + msft->pending_add_handle = monitor->handle; + + return err; +} + +/* This function requires the caller holds hdev->lock */ +int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor) +{ + struct msft_data *msft = hdev->msft_data; + + if (!msft) + return -EOPNOTSUPP; + + if (msft->reregistering) + return -EBUSY; + + return __msft_add_monitor_pattern(hdev, monitor); +} + +/* This function requires the caller holds hdev->lock */ +int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, + u16 handle) +{ + struct msft_cp_le_cancel_monitor_advertisement cp; + struct msft_monitor_advertisement_handle_data *handle_data; + struct hci_request req; + struct msft_data *msft = hdev->msft_data; + int err = 0; + + if (!msft) + return -EOPNOTSUPP; + + if (msft->reregistering) + return -EBUSY; + + handle_data = msft_find_handle_data(hdev, monitor->handle, true); + + /* If no matched handle, just remove without telling controller */ + if (!handle_data) + return -ENOENT; + + cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; + cp.handle = handle_data->msft_handle; + + hci_req_init(&req, hdev); + hci_req_add(&req, hdev->msft_opcode, sizeof(cp), &cp); + err = hci_req_run_skb(&req, msft_le_cancel_monitor_advertisement_cb); + + if (!err) + msft->pending_remove_handle = handle; + + return err; +} + +void msft_req_add_set_filter_enable(struct hci_request *req, bool enable) +{ + struct hci_dev *hdev = req->hdev; + struct msft_cp_le_set_advertisement_filter_enable cp; + + cp.sub_opcode = MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE; + cp.enable = enable; + + hci_req_add(req, hdev->msft_opcode, sizeof(cp), &cp); +} + +int msft_set_filter_enable(struct hci_dev *hdev, bool enable) +{ + struct hci_request req; + struct msft_data *msft = hdev->msft_data; + 
int err; + + if (!msft) + return -EOPNOTSUPP; + + hci_req_init(&req, hdev); + msft_req_add_set_filter_enable(&req, enable); + err = hci_req_run_skb(&req, msft_le_set_advertisement_filter_enable_cb); + + return err; } diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h index e9c478e890b8..88ed613dfa08 100644 --- a/net/bluetooth/msft.h +++ b/net/bluetooth/msft.h @@ -12,16 +12,46 @@ #if IS_ENABLED(CONFIG_BT_MSFTEXT) +bool msft_monitor_supported(struct hci_dev *hdev); void msft_do_open(struct hci_dev *hdev); void msft_do_close(struct hci_dev *hdev); void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb); __u64 msft_get_features(struct hci_dev *hdev); +int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor); +int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, + u16 handle); +void msft_req_add_set_filter_enable(struct hci_request *req, bool enable); +int msft_set_filter_enable(struct hci_dev *hdev, bool enable); #else +static inline bool msft_monitor_supported(struct hci_dev *hdev) +{ + return false; +} + static inline void msft_do_open(struct hci_dev *hdev) {} static inline void msft_do_close(struct hci_dev *hdev) {} static inline void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb) {} static inline __u64 msft_get_features(struct hci_dev *hdev) { return 0; } +static inline int msft_add_monitor_pattern(struct hci_dev *hdev, + struct adv_monitor *monitor) +{ + return -EOPNOTSUPP; +} + +static inline int msft_remove_monitor(struct hci_dev *hdev, + struct adv_monitor *monitor, + u16 handle) +{ + return -EOPNOTSUPP; +} + +static inline void msft_req_add_set_filter_enable(struct hci_request *req, + bool enable) {} +static inline int msft_set_filter_enable(struct hci_dev *hdev, bool enable) +{ + return -EOPNOTSUPP; +} #endif diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 79ffcdef0b7a..22a110f37abc 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1003,6 +1003,11 @@ static int 
sco_sock_getsockopt(struct socket *sock, int level, int optname, case BT_SNDMTU: case BT_RCVMTU: + if (sk->sk_state != BT_CONNECTED) { + err = -ENOTCONN; + break; + } + if (put_user(sco_pi(sk)->conn->mtu, (u32 __user *)optval)) err = -EFAULT; break; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index bf4bef13d935..b0c1ee110eff 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -25,7 +25,6 @@ #include <linux/crypto.h> #include <crypto/aes.h> #include <crypto/algapi.h> -#include <crypto/b128ops.h> #include <crypto/hash.h> #include <crypto/kpp.h> @@ -425,7 +424,7 @@ static int smp_c1(const u8 k[16], SMP_DBG("p1 %16phN", p1); /* res = r XOR p1 */ - u128_xor((u128 *) res, (u128 *) r, (u128 *) p1); + crypto_xor_cpy(res, r, p1, sizeof(p1)); /* res = e(k, res) */ err = smp_e(k, res); @@ -442,7 +441,7 @@ static int smp_c1(const u8 k[16], SMP_DBG("p2 %16phN", p2); /* res = res XOR p2 */ - u128_xor((u128 *) res, (u128 *) res, (u128 *) p2); + crypto_xor(res, p2, sizeof(p2)); /* res = e(k, res) */ err = smp_e(k, res); @@ -3353,31 +3352,8 @@ static void smp_del_chan(struct l2cap_chan *chan) l2cap_chan_put(chan); } -static ssize_t force_bredr_smp_read(struct file *file, - char __user *user_buf, - size_t count, loff_t *ppos) +int smp_force_bredr(struct hci_dev *hdev, bool enable) { - struct hci_dev *hdev = file->private_data; - char buf[3]; - - buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP) ? 
'Y': 'N'; - buf[1] = '\n'; - buf[2] = '\0'; - return simple_read_from_buffer(user_buf, count, ppos, buf, 2); -} - -static ssize_t force_bredr_smp_write(struct file *file, - const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - bool enable; - int err; - - err = kstrtobool_from_user(user_buf, count, &enable); - if (err) - return err; - if (enable == hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return -EALREADY; @@ -3399,16 +3375,9 @@ static ssize_t force_bredr_smp_write(struct file *file, hci_dev_change_flag(hdev, HCI_FORCE_BREDR_SMP); - return count; + return 0; } -static const struct file_operations force_bredr_smp_fops = { - .open = simple_open, - .read = force_bredr_smp_read, - .write = force_bredr_smp_write, - .llseek = default_llseek, -}; - int smp_register(struct hci_dev *hdev) { struct l2cap_chan *chan; @@ -3433,17 +3402,7 @@ int smp_register(struct hci_dev *hdev) hdev->smp_data = chan; - /* If the controller does not support BR/EDR Secure Connections - * feature, then the BR/EDR SMP channel shall not be present. - * - * To test this with Bluetooth 4.0 controllers, create a debugfs - * switch that allows forcing BR/EDR SMP support and accepting - * cross-transport pairing on non-AES encrypted connections. 
- */ if (!lmp_sc_capable(hdev)) { - debugfs_create_file("force_bredr_smp", 0644, hdev->debugfs, - hdev, &force_bredr_smp_fops); - /* Flag can be already set here (due to power toggle) */ if (!hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return 0; diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index 121edadd5f8d..fc35a8bf358e 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -193,6 +193,8 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa); int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]); +int smp_force_bredr(struct hci_dev *hdev, bool enable); + int smp_register(struct hci_dev *hdev); void smp_unregister(struct hci_dev *hdev); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c1c30a9f76f3..58bcb8c849d5 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -272,7 +272,8 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, kattr->test.repeat) return -EINVAL; - if (ctx_size_in < prog->aux->max_ctx_offset) + if (ctx_size_in < prog->aux->max_ctx_offset || + ctx_size_in > MAX_BPF_FUNC_ARGS * sizeof(u64)) return -EINVAL; if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0) @@ -636,14 +637,11 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (IS_ERR(data)) return PTR_ERR(data); - xdp.data_hard_start = data; - xdp.data = data + headroom; - xdp.data_meta = xdp.data; - xdp.data_end = xdp.data + size; - xdp.frame_sz = headroom + max_data_sz + tailroom; - rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); - xdp.rxq = &rxqueue->xdp_rxq; + xdp_init_buff(&xdp, headroom + max_data_sz + tailroom, + &rxqueue->xdp_rxq); + xdp_prepare_buff(&xdp, data, headroom, size, true); + bpf_prog_change_xdp(NULL, prog); ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig index 8ad0233ce497..3d4a21462458 
100644 --- a/net/bpfilter/Kconfig +++ b/net/bpfilter/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only menuconfig BPFILTER bool "BPF based packet filtering framework (BPFILTER)" - depends on NET && BPF && INET + depends on BPF && INET select USERMODE_DRIVER help This builds experimental bpfilter framework that is aiming to diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig index 80879196560c..3c8ded7d3e84 100644 --- a/net/bridge/Kconfig +++ b/net/bridge/Kconfig @@ -73,3 +73,14 @@ config BRIDGE_MRP Say N to exclude this support and reduce the binary size. If unsure, say N. + +config BRIDGE_CFM + bool "CFM protocol" + depends on BRIDGE + help + If you say Y here, then the Ethernet bridge will be able to run CFM + protocol according to 802.1Q section 12.14 + + Say N to exclude this support and reduce the binary size. + + If unsure, say N. diff --git a/net/bridge/Makefile b/net/bridge/Makefile index ccb394236fbd..7fb9a021873b 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -18,7 +18,7 @@ br_netfilter-y := br_netfilter_hooks.o br_netfilter-$(subst m,y,$(CONFIG_IPV6)) += br_netfilter_ipv6.o obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o -bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o +bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o br_multicast_eht.o bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o br_vlan_options.o @@ -27,3 +27,5 @@ bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o obj-$(CONFIG_NETFILTER) += netfilter/ bridge-$(CONFIG_BRIDGE_MRP) += br_mrp_switchdev.o br_mrp.o br_mrp_netlink.o + +bridge-$(CONFIG_BRIDGE_CFM) += br_cfm.o br_cfm_netlink.o diff --git a/net/bridge/br.c b/net/bridge/br.c index 401eeb9142eb..ef743f94254d 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -43,7 +43,10 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v if (event == NETDEV_REGISTER) { /* register of bridge completed, add sysfs entries */ - br_sysfs_addbr(dev); 
+ err = br_sysfs_addbr(dev); + if (err) + return notifier_from_errno(err); + return NOTIFY_DONE; } } @@ -119,7 +122,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v break; case NETDEV_PRE_TYPE_CHANGE: - /* Forbid underlaying device to change its type. */ + /* Forbid underlying device to change its type. */ return NOTIFY_BAD; case NETDEV_RESEND_IGMP: diff --git a/net/bridge/br_cfm.c b/net/bridge/br_cfm.c new file mode 100644 index 000000000000..001064f7583d --- /dev/null +++ b/net/bridge/br_cfm.c @@ -0,0 +1,867 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/cfm_bridge.h> +#include <uapi/linux/cfm_bridge.h> +#include "br_private_cfm.h" + +static struct br_cfm_mep *br_mep_find(struct net_bridge *br, u32 instance) +{ + struct br_cfm_mep *mep; + + hlist_for_each_entry(mep, &br->mep_list, head) + if (mep->instance == instance) + return mep; + + return NULL; +} + +static struct br_cfm_mep *br_mep_find_ifindex(struct net_bridge *br, + u32 ifindex) +{ + struct br_cfm_mep *mep; + + hlist_for_each_entry_rcu(mep, &br->mep_list, head, + lockdep_rtnl_is_held()) + if (mep->create.ifindex == ifindex) + return mep; + + return NULL; +} + +static struct br_cfm_peer_mep *br_peer_mep_find(struct br_cfm_mep *mep, + u32 mepid) +{ + struct br_cfm_peer_mep *peer_mep; + + hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head, + lockdep_rtnl_is_held()) + if (peer_mep->mepid == mepid) + return peer_mep; + + return NULL; +} + +static struct net_bridge_port *br_mep_get_port(struct net_bridge *br, + u32 ifindex) +{ + struct net_bridge_port *port; + + list_for_each_entry(port, &br->port_list, list) + if (port->dev->ifindex == ifindex) + return port; + + return NULL; +} + +/* Calculate the CCM interval in us. 
*/ +static u32 interval_to_us(enum br_cfm_ccm_interval interval) +{ + switch (interval) { + case BR_CFM_CCM_INTERVAL_NONE: + return 0; + case BR_CFM_CCM_INTERVAL_3_3_MS: + return 3300; + case BR_CFM_CCM_INTERVAL_10_MS: + return 10 * 1000; + case BR_CFM_CCM_INTERVAL_100_MS: + return 100 * 1000; + case BR_CFM_CCM_INTERVAL_1_SEC: + return 1000 * 1000; + case BR_CFM_CCM_INTERVAL_10_SEC: + return 10 * 1000 * 1000; + case BR_CFM_CCM_INTERVAL_1_MIN: + return 60 * 1000 * 1000; + case BR_CFM_CCM_INTERVAL_10_MIN: + return 10 * 60 * 1000 * 1000; + } + return 0; +} + +/* Convert the interface interval to CCM PDU value. */ +static u32 interval_to_pdu(enum br_cfm_ccm_interval interval) +{ + switch (interval) { + case BR_CFM_CCM_INTERVAL_NONE: + return 0; + case BR_CFM_CCM_INTERVAL_3_3_MS: + return 1; + case BR_CFM_CCM_INTERVAL_10_MS: + return 2; + case BR_CFM_CCM_INTERVAL_100_MS: + return 3; + case BR_CFM_CCM_INTERVAL_1_SEC: + return 4; + case BR_CFM_CCM_INTERVAL_10_SEC: + return 5; + case BR_CFM_CCM_INTERVAL_1_MIN: + return 6; + case BR_CFM_CCM_INTERVAL_10_MIN: + return 7; + } + return 0; +} + +/* Convert the CCM PDU value to interval on interface. */ +static u32 pdu_to_interval(u32 value) +{ + switch (value) { + case 0: + return BR_CFM_CCM_INTERVAL_NONE; + case 1: + return BR_CFM_CCM_INTERVAL_3_3_MS; + case 2: + return BR_CFM_CCM_INTERVAL_10_MS; + case 3: + return BR_CFM_CCM_INTERVAL_100_MS; + case 4: + return BR_CFM_CCM_INTERVAL_1_SEC; + case 5: + return BR_CFM_CCM_INTERVAL_10_SEC; + case 6: + return BR_CFM_CCM_INTERVAL_1_MIN; + case 7: + return BR_CFM_CCM_INTERVAL_10_MIN; + } + return BR_CFM_CCM_INTERVAL_NONE; +} + +static void ccm_rx_timer_start(struct br_cfm_peer_mep *peer_mep) +{ + u32 interval_us; + + interval_us = interval_to_us(peer_mep->mep->cc_config.exp_interval); + /* Function ccm_rx_dwork must be called with 1/4 + * of the configured CC 'expected_interval' + * in order to detect CCM defect after 3.25 interval. 
+ */ + queue_delayed_work(system_wq, &peer_mep->ccm_rx_dwork, + usecs_to_jiffies(interval_us / 4)); +} + +static void br_cfm_notify(int event, const struct net_bridge_port *port) +{ + u32 filter = RTEXT_FILTER_CFM_STATUS; + + return br_info_notify(event, port->br, NULL, filter); +} + +static void cc_peer_enable(struct br_cfm_peer_mep *peer_mep) +{ + memset(&peer_mep->cc_status, 0, sizeof(peer_mep->cc_status)); + peer_mep->ccm_rx_count_miss = 0; + + ccm_rx_timer_start(peer_mep); +} + +static void cc_peer_disable(struct br_cfm_peer_mep *peer_mep) +{ + cancel_delayed_work_sync(&peer_mep->ccm_rx_dwork); +} + +static struct sk_buff *ccm_frame_build(struct br_cfm_mep *mep, + const struct br_cfm_cc_ccm_tx_info *const tx_info) + +{ + struct br_cfm_common_hdr *common_hdr; + struct net_bridge_port *b_port; + struct br_cfm_maid *maid; + u8 *itu_reserved, *e_tlv; + struct ethhdr *eth_hdr; + struct sk_buff *skb; + __be32 *status_tlv; + __be32 *snumber; + __be16 *mepid; + + skb = dev_alloc_skb(CFM_CCM_MAX_FRAME_LENGTH); + if (!skb) + return NULL; + + rcu_read_lock(); + b_port = rcu_dereference(mep->b_port); + if (!b_port) { + kfree_skb(skb); + rcu_read_unlock(); + return NULL; + } + skb->dev = b_port->dev; + rcu_read_unlock(); + /* The device cannot be deleted until the work_queue functions has + * completed. This function is called from ccm_tx_work_expired() + * that is a work_queue functions. 
+ */ + + skb->protocol = htons(ETH_P_CFM); + skb->priority = CFM_FRAME_PRIO; + + /* Ethernet header */ + eth_hdr = skb_put(skb, sizeof(*eth_hdr)); + ether_addr_copy(eth_hdr->h_dest, tx_info->dmac.addr); + ether_addr_copy(eth_hdr->h_source, mep->config.unicast_mac.addr); + eth_hdr->h_proto = htons(ETH_P_CFM); + + /* Common CFM Header */ + common_hdr = skb_put(skb, sizeof(*common_hdr)); + common_hdr->mdlevel_version = mep->config.mdlevel << 5; + common_hdr->opcode = BR_CFM_OPCODE_CCM; + common_hdr->flags = (mep->rdi << 7) | + interval_to_pdu(mep->cc_config.exp_interval); + common_hdr->tlv_offset = CFM_CCM_TLV_OFFSET; + + /* Sequence number */ + snumber = skb_put(skb, sizeof(*snumber)); + if (tx_info->seq_no_update) { + *snumber = cpu_to_be32(mep->ccm_tx_snumber); + mep->ccm_tx_snumber += 1; + } else { + *snumber = 0; + } + + mepid = skb_put(skb, sizeof(*mepid)); + *mepid = cpu_to_be16((u16)mep->config.mepid); + + maid = skb_put(skb, sizeof(*maid)); + memcpy(maid->data, mep->cc_config.exp_maid.data, sizeof(maid->data)); + + /* ITU reserved (CFM_CCM_ITU_RESERVED_SIZE octets) */ + itu_reserved = skb_put(skb, CFM_CCM_ITU_RESERVED_SIZE); + memset(itu_reserved, 0, CFM_CCM_ITU_RESERVED_SIZE); + + /* Generel CFM TLV format: + * TLV type: one byte + * TLV value length: two bytes + * TLV value: 'TLV value length' bytes + */ + + /* Port status TLV. The value length is 1. Total of 4 bytes. */ + if (tx_info->port_tlv) { + status_tlv = skb_put(skb, sizeof(*status_tlv)); + *status_tlv = cpu_to_be32((CFM_PORT_STATUS_TLV_TYPE << 24) | + (1 << 8) | /* Value length */ + (tx_info->port_tlv_value & 0xFF)); + } + + /* Interface status TLV. The value length is 1. Total of 4 bytes. 
*/ + if (tx_info->if_tlv) { + status_tlv = skb_put(skb, sizeof(*status_tlv)); + *status_tlv = cpu_to_be32((CFM_IF_STATUS_TLV_TYPE << 24) | + (1 << 8) | /* Value length */ + (tx_info->if_tlv_value & 0xFF)); + } + + /* End TLV */ + e_tlv = skb_put(skb, sizeof(*e_tlv)); + *e_tlv = CFM_ENDE_TLV_TYPE; + + return skb; +} + +static void ccm_frame_tx(struct sk_buff *skb) +{ + skb_reset_network_header(skb); + dev_queue_xmit(skb); +} + +/* This function is called with the configured CC 'expected_interval' + * in order to drive CCM transmission when enabled. + */ +static void ccm_tx_work_expired(struct work_struct *work) +{ + struct delayed_work *del_work; + struct br_cfm_mep *mep; + struct sk_buff *skb; + u32 interval_us; + + del_work = to_delayed_work(work); + mep = container_of(del_work, struct br_cfm_mep, ccm_tx_dwork); + + if (time_before_eq(mep->ccm_tx_end, jiffies)) { + /* Transmission period has ended */ + mep->cc_ccm_tx_info.period = 0; + return; + } + + skb = ccm_frame_build(mep, &mep->cc_ccm_tx_info); + if (skb) + ccm_frame_tx(skb); + + interval_us = interval_to_us(mep->cc_config.exp_interval); + queue_delayed_work(system_wq, &mep->ccm_tx_dwork, + usecs_to_jiffies(interval_us)); +} + +/* This function is called with 1/4 of the configured CC 'expected_interval' + * in order to detect CCM defect after 3.25 interval. + */ +static void ccm_rx_work_expired(struct work_struct *work) +{ + struct br_cfm_peer_mep *peer_mep; + struct net_bridge_port *b_port; + struct delayed_work *del_work; + + del_work = to_delayed_work(work); + peer_mep = container_of(del_work, struct br_cfm_peer_mep, ccm_rx_dwork); + + /* After 13 counts (4 * 3,25) then 3.25 intervals are expired */ + if (peer_mep->ccm_rx_count_miss < 13) { + /* 3.25 intervals are NOT expired without CCM reception */ + peer_mep->ccm_rx_count_miss++; + + /* Start timer again */ + ccm_rx_timer_start(peer_mep); + } else { + /* 3.25 intervals are expired without CCM reception. 
+ * CCM defect detected + */ + peer_mep->cc_status.ccm_defect = true; + + /* Change in CCM defect status - notify */ + rcu_read_lock(); + b_port = rcu_dereference(peer_mep->mep->b_port); + if (b_port) + br_cfm_notify(RTM_NEWLINK, b_port); + rcu_read_unlock(); + } +} + +static u32 ccm_tlv_extract(struct sk_buff *skb, u32 index, + struct br_cfm_peer_mep *peer_mep) +{ + __be32 *s_tlv; + __be32 _s_tlv; + u32 h_s_tlv; + u8 *e_tlv; + u8 _e_tlv; + + e_tlv = skb_header_pointer(skb, index, sizeof(_e_tlv), &_e_tlv); + if (!e_tlv) + return 0; + + /* TLV is present - get the status TLV */ + s_tlv = skb_header_pointer(skb, + index, + sizeof(_s_tlv), &_s_tlv); + if (!s_tlv) + return 0; + + h_s_tlv = ntohl(*s_tlv); + if ((h_s_tlv >> 24) == CFM_IF_STATUS_TLV_TYPE) { + /* Interface status TLV */ + peer_mep->cc_status.tlv_seen = true; + peer_mep->cc_status.if_tlv_value = (h_s_tlv & 0xFF); + } + + if ((h_s_tlv >> 24) == CFM_PORT_STATUS_TLV_TYPE) { + /* Port status TLV */ + peer_mep->cc_status.tlv_seen = true; + peer_mep->cc_status.port_tlv_value = (h_s_tlv & 0xFF); + } + + /* The Sender ID TLV is not handled */ + /* The Organization-Specific TLV is not handled */ + + /* Return the length of this tlv. 
+ * This is the length of the value field plus 3 bytes for size of type + * field and length field + */ + return ((h_s_tlv >> 8) & 0xFFFF) + 3; +} + +/* note: already called with rcu_read_lock */ +static int br_cfm_frame_rx(struct net_bridge_port *port, struct sk_buff *skb) +{ + u32 mdlevel, interval, size, index, max; + const struct br_cfm_common_hdr *hdr; + struct br_cfm_peer_mep *peer_mep; + const struct br_cfm_maid *maid; + struct br_cfm_common_hdr _hdr; + struct br_cfm_maid _maid; + struct br_cfm_mep *mep; + struct net_bridge *br; + __be32 *snumber; + __be32 _snumber; + __be16 *mepid; + __be16 _mepid; + + if (port->state == BR_STATE_DISABLED) + return 0; + + hdr = skb_header_pointer(skb, 0, sizeof(_hdr), &_hdr); + if (!hdr) + return 1; + + br = port->br; + mep = br_mep_find_ifindex(br, port->dev->ifindex); + if (unlikely(!mep)) + /* No MEP on this port - must be forwarded */ + return 0; + + mdlevel = hdr->mdlevel_version >> 5; + if (mdlevel > mep->config.mdlevel) + /* The level is above this MEP level - must be forwarded */ + return 0; + + if ((hdr->mdlevel_version & 0x1F) != 0) { + /* Invalid version */ + mep->status.version_unexp_seen = true; + return 1; + } + + if (mdlevel < mep->config.mdlevel) { + /* The level is below this MEP level */ + mep->status.rx_level_low_seen = true; + return 1; + } + + if (hdr->opcode == BR_CFM_OPCODE_CCM) { + /* CCM PDU received. 
*/ + /* MA ID is after common header + sequence number + MEP ID */ + maid = skb_header_pointer(skb, + CFM_CCM_PDU_MAID_OFFSET, + sizeof(_maid), &_maid); + if (!maid) + return 1; + if (memcmp(maid->data, mep->cc_config.exp_maid.data, + sizeof(maid->data))) + /* MA ID not as expected */ + return 1; + + /* MEP ID is after common header + sequence number */ + mepid = skb_header_pointer(skb, + CFM_CCM_PDU_MEPID_OFFSET, + sizeof(_mepid), &_mepid); + if (!mepid) + return 1; + peer_mep = br_peer_mep_find(mep, (u32)ntohs(*mepid)); + if (!peer_mep) + return 1; + + /* Interval is in common header flags */ + interval = hdr->flags & 0x07; + if (mep->cc_config.exp_interval != pdu_to_interval(interval)) + /* Interval not as expected */ + return 1; + + /* A valid CCM frame is received */ + if (peer_mep->cc_status.ccm_defect) { + peer_mep->cc_status.ccm_defect = false; + + /* Change in CCM defect status - notify */ + br_cfm_notify(RTM_NEWLINK, port); + + /* Start CCM RX timer */ + ccm_rx_timer_start(peer_mep); + } + + peer_mep->cc_status.seen = true; + peer_mep->ccm_rx_count_miss = 0; + + /* RDI is in common header flags */ + peer_mep->cc_status.rdi = (hdr->flags & 0x80) ? 
true : false; + + /* Sequence number is after common header */ + snumber = skb_header_pointer(skb, + CFM_CCM_PDU_SEQNR_OFFSET, + sizeof(_snumber), &_snumber); + if (!snumber) + return 1; + if (ntohl(*snumber) != (mep->ccm_rx_snumber + 1)) + /* Unexpected sequence number */ + peer_mep->cc_status.seq_unexp_seen = true; + + mep->ccm_rx_snumber = ntohl(*snumber); + + /* TLV end is after common header + sequence number + MEP ID + + * MA ID + ITU reserved + */ + index = CFM_CCM_PDU_TLV_OFFSET; + max = 0; + do { /* Handle all TLVs */ + size = ccm_tlv_extract(skb, index, peer_mep); + index += size; + max += 1; + } while (size != 0 && max < 4); /* Max four TLVs possible */ + + return 1; + } + + mep->status.opcode_unexp_seen = true; + + return 1; +} + +static struct br_frame_type cfm_frame_type __read_mostly = { + .type = cpu_to_be16(ETH_P_CFM), + .frame_handler = br_cfm_frame_rx, +}; + +int br_cfm_mep_create(struct net_bridge *br, + const u32 instance, + struct br_cfm_mep_create *const create, + struct netlink_ext_ack *extack) +{ + struct net_bridge_port *p; + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + if (create->domain == BR_CFM_VLAN) { + NL_SET_ERR_MSG_MOD(extack, + "VLAN domain not supported"); + return -EINVAL; + } + if (create->domain != BR_CFM_PORT) { + NL_SET_ERR_MSG_MOD(extack, + "Invalid domain value"); + return -EINVAL; + } + if (create->direction == BR_CFM_MEP_DIRECTION_UP) { + NL_SET_ERR_MSG_MOD(extack, + "Up-MEP not supported"); + return -EINVAL; + } + if (create->direction != BR_CFM_MEP_DIRECTION_DOWN) { + NL_SET_ERR_MSG_MOD(extack, + "Invalid direction value"); + return -EINVAL; + } + p = br_mep_get_port(br, create->ifindex); + if (!p) { + NL_SET_ERR_MSG_MOD(extack, + "Port is not related to bridge"); + return -EINVAL; + } + mep = br_mep_find(br, instance); + if (mep) { + NL_SET_ERR_MSG_MOD(extack, + "MEP instance already exists"); + return -EEXIST; + } + + /* In PORT domain only one instance can be created per port */ + if (create->domain == 
BR_CFM_PORT) { + mep = br_mep_find_ifindex(br, create->ifindex); + if (mep) { + NL_SET_ERR_MSG_MOD(extack, + "Only one Port MEP on a port allowed"); + return -EINVAL; + } + } + + mep = kzalloc(sizeof(*mep), GFP_KERNEL); + if (!mep) + return -ENOMEM; + + mep->create = *create; + mep->instance = instance; + rcu_assign_pointer(mep->b_port, p); + + INIT_HLIST_HEAD(&mep->peer_mep_list); + INIT_DELAYED_WORK(&mep->ccm_tx_dwork, ccm_tx_work_expired); + + if (hlist_empty(&br->mep_list)) + br_add_frame(br, &cfm_frame_type); + + hlist_add_tail_rcu(&mep->head, &br->mep_list); + + return 0; +} + +static void mep_delete_implementation(struct net_bridge *br, + struct br_cfm_mep *mep) +{ + struct br_cfm_peer_mep *peer_mep; + struct hlist_node *n_store; + + ASSERT_RTNL(); + + /* Empty and free peer MEP list */ + hlist_for_each_entry_safe(peer_mep, n_store, &mep->peer_mep_list, head) { + cancel_delayed_work_sync(&peer_mep->ccm_rx_dwork); + hlist_del_rcu(&peer_mep->head); + kfree_rcu(peer_mep, rcu); + } + + cancel_delayed_work_sync(&mep->ccm_tx_dwork); + + RCU_INIT_POINTER(mep->b_port, NULL); + hlist_del_rcu(&mep->head); + kfree_rcu(mep, rcu); + + if (hlist_empty(&br->mep_list)) + br_del_frame(br, &cfm_frame_type); +} + +int br_cfm_mep_delete(struct net_bridge *br, + const u32 instance, + struct netlink_ext_ack *extack) +{ + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + mep = br_mep_find(br, instance); + if (!mep) { + NL_SET_ERR_MSG_MOD(extack, + "MEP instance does not exists"); + return -ENOENT; + } + + mep_delete_implementation(br, mep); + + return 0; +} + +int br_cfm_mep_config_set(struct net_bridge *br, + const u32 instance, + const struct br_cfm_mep_config *const config, + struct netlink_ext_ack *extack) +{ + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + mep = br_mep_find(br, instance); + if (!mep) { + NL_SET_ERR_MSG_MOD(extack, + "MEP instance does not exists"); + return -ENOENT; + } + + mep->config = *config; + + return 0; +} + +int br_cfm_cc_config_set(struct net_bridge 
*br, + const u32 instance, + const struct br_cfm_cc_config *const config, + struct netlink_ext_ack *extack) +{ + struct br_cfm_peer_mep *peer_mep; + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + mep = br_mep_find(br, instance); + if (!mep) { + NL_SET_ERR_MSG_MOD(extack, + "MEP instance does not exists"); + return -ENOENT; + } + + /* Check for no change in configuration */ + if (memcmp(config, &mep->cc_config, sizeof(*config)) == 0) + return 0; + + if (config->enable && !mep->cc_config.enable) + /* CC is enabled */ + hlist_for_each_entry(peer_mep, &mep->peer_mep_list, head) + cc_peer_enable(peer_mep); + + if (!config->enable && mep->cc_config.enable) + /* CC is disabled */ + hlist_for_each_entry(peer_mep, &mep->peer_mep_list, head) + cc_peer_disable(peer_mep); + + mep->cc_config = *config; + mep->ccm_rx_snumber = 0; + mep->ccm_tx_snumber = 1; + + return 0; +} + +int br_cfm_cc_peer_mep_add(struct net_bridge *br, const u32 instance, + u32 mepid, + struct netlink_ext_ack *extack) +{ + struct br_cfm_peer_mep *peer_mep; + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + mep = br_mep_find(br, instance); + if (!mep) { + NL_SET_ERR_MSG_MOD(extack, + "MEP instance does not exists"); + return -ENOENT; + } + + peer_mep = br_peer_mep_find(mep, mepid); + if (peer_mep) { + NL_SET_ERR_MSG_MOD(extack, + "Peer MEP-ID already exists"); + return -EEXIST; + } + + peer_mep = kzalloc(sizeof(*peer_mep), GFP_KERNEL); + if (!peer_mep) + return -ENOMEM; + + peer_mep->mepid = mepid; + peer_mep->mep = mep; + INIT_DELAYED_WORK(&peer_mep->ccm_rx_dwork, ccm_rx_work_expired); + + if (mep->cc_config.enable) + cc_peer_enable(peer_mep); + + hlist_add_tail_rcu(&peer_mep->head, &mep->peer_mep_list); + + return 0; +} + +int br_cfm_cc_peer_mep_remove(struct net_bridge *br, const u32 instance, + u32 mepid, + struct netlink_ext_ack *extack) +{ + struct br_cfm_peer_mep *peer_mep; + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + mep = br_mep_find(br, instance); + if (!mep) { + NL_SET_ERR_MSG_MOD(extack, + 
"MEP instance does not exists"); + return -ENOENT; + } + + peer_mep = br_peer_mep_find(mep, mepid); + if (!peer_mep) { + NL_SET_ERR_MSG_MOD(extack, + "Peer MEP-ID does not exists"); + return -ENOENT; + } + + cc_peer_disable(peer_mep); + + hlist_del_rcu(&peer_mep->head); + kfree_rcu(peer_mep, rcu); + + return 0; +} + +int br_cfm_cc_rdi_set(struct net_bridge *br, const u32 instance, + const bool rdi, struct netlink_ext_ack *extack) +{ + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + mep = br_mep_find(br, instance); + if (!mep) { + NL_SET_ERR_MSG_MOD(extack, + "MEP instance does not exists"); + return -ENOENT; + } + + mep->rdi = rdi; + + return 0; +} + +int br_cfm_cc_ccm_tx(struct net_bridge *br, const u32 instance, + const struct br_cfm_cc_ccm_tx_info *const tx_info, + struct netlink_ext_ack *extack) +{ + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + mep = br_mep_find(br, instance); + if (!mep) { + NL_SET_ERR_MSG_MOD(extack, + "MEP instance does not exists"); + return -ENOENT; + } + + if (memcmp(tx_info, &mep->cc_ccm_tx_info, sizeof(*tx_info)) == 0) { + /* No change in tx_info. */ + if (mep->cc_ccm_tx_info.period == 0) + /* Transmission is not enabled - just return */ + return 0; + + /* Transmission is ongoing, the end time is recalculated */ + mep->ccm_tx_end = jiffies + + usecs_to_jiffies(tx_info->period * 1000000); + return 0; + } + + if (tx_info->period == 0 && mep->cc_ccm_tx_info.period == 0) + /* Some change in info and transmission is not ongoing */ + goto save; + + if (tx_info->period != 0 && mep->cc_ccm_tx_info.period != 0) { + /* Some change in info and transmission is ongoing + * The end time is recalculated + */ + mep->ccm_tx_end = jiffies + + usecs_to_jiffies(tx_info->period * 1000000); + + goto save; + } + + if (tx_info->period == 0 && mep->cc_ccm_tx_info.period != 0) { + cancel_delayed_work_sync(&mep->ccm_tx_dwork); + goto save; + } + + /* Start delayed work to transmit CCM frames. 
It is done with zero delay + * to send first frame immediately + */ + mep->ccm_tx_end = jiffies + usecs_to_jiffies(tx_info->period * 1000000); + queue_delayed_work(system_wq, &mep->ccm_tx_dwork, 0); + +save: + mep->cc_ccm_tx_info = *tx_info; + + return 0; +} + +int br_cfm_mep_count(struct net_bridge *br, u32 *count) +{ + struct br_cfm_mep *mep; + + *count = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mep, &br->mep_list, head) + *count += 1; + rcu_read_unlock(); + + return 0; +} + +int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count) +{ + struct br_cfm_peer_mep *peer_mep; + struct br_cfm_mep *mep; + + *count = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mep, &br->mep_list, head) + hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) + *count += 1; + rcu_read_unlock(); + + return 0; +} + +bool br_cfm_created(struct net_bridge *br) +{ + return !hlist_empty(&br->mep_list); +} + +/* Deletes the CFM instances on a specific bridge port + */ +void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *port) +{ + struct hlist_node *n_store; + struct br_cfm_mep *mep; + + ASSERT_RTNL(); + + hlist_for_each_entry_safe(mep, n_store, &br->mep_list, head) + if (mep->create.ifindex == port->dev->ifindex) + mep_delete_implementation(br, mep); +} diff --git a/net/bridge/br_cfm_netlink.c b/net/bridge/br_cfm_netlink.c new file mode 100644 index 000000000000..5c4c369f8536 --- /dev/null +++ b/net/bridge/br_cfm_netlink.c @@ -0,0 +1,726 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <net/genetlink.h> + +#include "br_private.h" +#include "br_private_cfm.h" + +static const struct nla_policy +br_cfm_mep_create_policy[IFLA_BRIDGE_CFM_MEP_CREATE_MAX + 1] = { + [IFLA_BRIDGE_CFM_MEP_CREATE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION] = { .type = NLA_U32 }, + 
[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX] = { .type = NLA_U32 }, +}; + +static const struct nla_policy +br_cfm_mep_delete_policy[IFLA_BRIDGE_CFM_MEP_DELETE_MAX + 1] = { + [IFLA_BRIDGE_CFM_MEP_DELETE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE] = { .type = NLA_U32 }, +}; + +static const struct nla_policy +br_cfm_mep_config_policy[IFLA_BRIDGE_CFM_MEP_CONFIG_MAX + 1] = { + [IFLA_BRIDGE_CFM_MEP_CONFIG_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC] = NLA_POLICY_ETH_ADDR, + [IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL] = NLA_POLICY_MAX(NLA_U32, 7), + [IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID] = NLA_POLICY_MAX(NLA_U32, 0x1FFF), +}; + +static const struct nla_policy +br_cfm_cc_config_policy[IFLA_BRIDGE_CFM_CC_CONFIG_MAX + 1] = { + [IFLA_BRIDGE_CFM_CC_CONFIG_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID] = { + .type = NLA_BINARY, .len = CFM_MAID_LENGTH }, +}; + +static const struct nla_policy +br_cfm_cc_peer_mep_policy[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1] = { + [IFLA_BRIDGE_CFM_CC_PEER_MEP_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_PEER_MEPID] = NLA_POLICY_MAX(NLA_U32, 0x1FFF), +}; + +static const struct nla_policy +br_cfm_cc_rdi_policy[IFLA_BRIDGE_CFM_CC_RDI_MAX + 1] = { + [IFLA_BRIDGE_CFM_CC_RDI_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_CFM_CC_RDI_INSTANCE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_RDI_RDI] = { .type = NLA_U32 }, +}; + +static const struct nla_policy +br_cfm_cc_ccm_tx_policy[IFLA_BRIDGE_CFM_CC_CCM_TX_MAX + 1] = { + [IFLA_BRIDGE_CFM_CC_CCM_TX_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE] = { .type = NLA_U32 }, + 
[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC] = NLA_POLICY_ETH_ADDR, + [IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE] = { .type = NLA_U8 }, + [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV] = { .type = NLA_U32 }, + [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE] = { .type = NLA_U8 }, +}; + +static const struct nla_policy +br_cfm_policy[IFLA_BRIDGE_CFM_MAX + 1] = { + [IFLA_BRIDGE_CFM_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_CFM_MEP_CREATE] = + NLA_POLICY_NESTED(br_cfm_mep_create_policy), + [IFLA_BRIDGE_CFM_MEP_DELETE] = + NLA_POLICY_NESTED(br_cfm_mep_delete_policy), + [IFLA_BRIDGE_CFM_MEP_CONFIG] = + NLA_POLICY_NESTED(br_cfm_mep_config_policy), + [IFLA_BRIDGE_CFM_CC_CONFIG] = + NLA_POLICY_NESTED(br_cfm_cc_config_policy), + [IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD] = + NLA_POLICY_NESTED(br_cfm_cc_peer_mep_policy), + [IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE] = + NLA_POLICY_NESTED(br_cfm_cc_peer_mep_policy), + [IFLA_BRIDGE_CFM_CC_RDI] = + NLA_POLICY_NESTED(br_cfm_cc_rdi_policy), + [IFLA_BRIDGE_CFM_CC_CCM_TX] = + NLA_POLICY_NESTED(br_cfm_cc_ccm_tx_policy), +}; + +static int br_mep_create_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_CREATE_MAX + 1]; + struct br_cfm_mep_create create; + u32 instance; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_CREATE_MAX, attr, + br_cfm_mep_create_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN]) { + NL_SET_ERR_MSG_MOD(extack, "Missing DOMAIN attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION]) { + NL_SET_ERR_MSG_MOD(extack, "Missing DIRECTION attribute"); + return -EINVAL; + } + if 
(!tb[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX]) { + NL_SET_ERR_MSG_MOD(extack, "Missing IFINDEX attribute"); + return -EINVAL; + } + + memset(&create, 0, sizeof(create)); + + instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE]); + create.domain = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN]); + create.direction = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION]); + create.ifindex = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX]); + + return br_cfm_mep_create(br, instance, &create, extack); +} + +static int br_mep_delete_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_DELETE_MAX + 1]; + u32 instance; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_DELETE_MAX, attr, + br_cfm_mep_delete_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing INSTANCE attribute"); + return -EINVAL; + } + + instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE]); + + return br_cfm_mep_delete(br, instance, extack); +} + +static int br_mep_config_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MAX + 1]; + struct br_cfm_mep_config config; + u32 instance; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_CONFIG_MAX, attr, + br_cfm_mep_config_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC]) { + NL_SET_ERR_MSG_MOD(extack, "Missing UNICAST_MAC attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL]) { + NL_SET_ERR_MSG_MOD(extack, "Missing MDLEVEL attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID]) { + NL_SET_ERR_MSG_MOD(extack, "Missing MEPID attribute"); + return -EINVAL; + } 
+ + memset(&config, 0, sizeof(config)); + + instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE]); + nla_memcpy(&config.unicast_mac.addr, + tb[IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC], + sizeof(config.unicast_mac.addr)); + config.mdlevel = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL]); + config.mepid = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID]); + + return br_cfm_mep_config_set(br, instance, &config, extack); +} + +static int br_cc_config_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_CC_CONFIG_MAX + 1]; + struct br_cfm_cc_config config; + u32 instance; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_CONFIG_MAX, attr, + br_cfm_cc_config_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing ENABLE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL]) { + NL_SET_ERR_MSG_MOD(extack, "Missing INTERVAL attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID]) { + NL_SET_ERR_MSG_MOD(extack, "Missing MAID attribute"); + return -EINVAL; + } + + memset(&config, 0, sizeof(config)); + + instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE]); + config.enable = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE]); + config.exp_interval = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL]); + nla_memcpy(&config.exp_maid.data, tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID], + sizeof(config.exp_maid.data)); + + return br_cfm_cc_config_set(br, instance, &config, extack); +} + +static int br_cc_peer_mep_add_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1]; + u32 instance, peer_mep_id; + int err; + + err = 
nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, attr, + br_cfm_cc_peer_mep_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]) { + NL_SET_ERR_MSG_MOD(extack, "Missing PEER_MEP_ID attribute"); + return -EINVAL; + } + + instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]); + peer_mep_id = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]); + + return br_cfm_cc_peer_mep_add(br, instance, peer_mep_id, extack); +} + +static int br_cc_peer_mep_remove_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1]; + u32 instance, peer_mep_id; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, attr, + br_cfm_cc_peer_mep_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]) { + NL_SET_ERR_MSG_MOD(extack, "Missing PEER_MEP_ID attribute"); + return -EINVAL; + } + + instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]); + peer_mep_id = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]); + + return br_cfm_cc_peer_mep_remove(br, instance, peer_mep_id, extack); +} + +static int br_cc_rdi_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_CC_RDI_MAX + 1]; + u32 instance, rdi; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_RDI_MAX, attr, + br_cfm_cc_rdi_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_RDI_RDI]) { + NL_SET_ERR_MSG_MOD(extack, "Missing RDI attribute"); + return -EINVAL; + } + + 
instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]); + rdi = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_RDI_RDI]); + + return br_cfm_cc_rdi_set(br, instance, rdi, extack); +} + +/* Parse a nested IFLA_BRIDGE_CFM_CC_CCM_TX attribute. Every CCM TX attribute + * is mandatory: a missing one is reported through extack and -EINVAL is + * returned. The parsed values are handed to br_cfm_cc_ccm_tx(). + */ +static int br_cc_ccm_tx_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_CC_CCM_TX_MAX + 1]; + struct br_cfm_cc_ccm_tx_info tx_info; + u32 instance; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_CCM_TX_MAX, attr, + br_cfm_cc_ccm_tx_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC]) { + NL_SET_ERR_MSG_MOD(extack, "Missing DMAC attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing SEQ_NO_UPDATE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD]) { + NL_SET_ERR_MSG_MOD(extack, "Missing PERIOD attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV]) { + NL_SET_ERR_MSG_MOD(extack, "Missing IF_TLV attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing IF_TLV_VALUE attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV]) { + NL_SET_ERR_MSG_MOD(extack, "Missing PORT_TLV attribute"); + return -EINVAL; + } + if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing PORT_TLV_VALUE attribute"); + return -EINVAL; + } + + memset(&tx_info, 0, sizeof(tx_info)); + + /* tb[] was parsed with the CCM_TX policy, so index it with the CCM_TX enum */ + instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE]); + nla_memcpy(&tx_info.dmac.addr, + tb[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC], + sizeof(tx_info.dmac.addr)); + tx_info.seq_no_update = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE]); + tx_info.period = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD]); + tx_info.if_tlv = 
nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV]); + tx_info.if_tlv_value = nla_get_u8(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE]); + tx_info.port_tlv = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV]); + tx_info.port_tlv_value = nla_get_u8(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE]); + + return br_cfm_cc_ccm_tx(br, instance, &tx_info, extack); +} + +int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_CFM_MAX + 1]; + int err; + + /* When this function is called for a port then the br pointer is + * invalid, therefore set the br to point correctly + */ + if (p) + br = p->br; + + err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MAX, attr, + br_cfm_policy, extack); + if (err) + return err; + + if (tb[IFLA_BRIDGE_CFM_MEP_CREATE]) { + err = br_mep_create_parse(br, tb[IFLA_BRIDGE_CFM_MEP_CREATE], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_CFM_MEP_DELETE]) { + err = br_mep_delete_parse(br, tb[IFLA_BRIDGE_CFM_MEP_DELETE], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_CFM_MEP_CONFIG]) { + err = br_mep_config_parse(br, tb[IFLA_BRIDGE_CFM_MEP_CONFIG], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_CFM_CC_CONFIG]) { + err = br_cc_config_parse(br, tb[IFLA_BRIDGE_CFM_CC_CONFIG], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD]) { + err = br_cc_peer_mep_add_parse(br, tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE]) { + err = br_cc_peer_mep_remove_parse(br, tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_CFM_CC_RDI]) { + err = br_cc_rdi_parse(br, tb[IFLA_BRIDGE_CFM_CC_RDI], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_CFM_CC_CCM_TX]) { + err = br_cc_ccm_tx_parse(br, tb[IFLA_BRIDGE_CFM_CC_CCM_TX], + extack); + if (err) + return err; + } + 
+ return 0; +} + +int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br) +{ + struct br_cfm_peer_mep *peer_mep; + struct br_cfm_mep *mep; + struct nlattr *tb; + + hlist_for_each_entry_rcu(mep, &br->mep_list, head) { + tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_CREATE_INFO); + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE, + mep->instance)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN, + mep->create.domain)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION, + mep->create.direction)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX, + mep->create.ifindex)) + goto nla_put_failure; + + nla_nest_end(skb, tb); + + tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_INFO); + + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE, + mep->instance)) + goto nla_put_failure; + + if (nla_put(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC, + sizeof(mep->config.unicast_mac.addr), + mep->config.unicast_mac.addr)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL, + mep->config.mdlevel)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID, + mep->config.mepid)) + goto nla_put_failure; + + nla_nest_end(skb, tb); + + tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_CONFIG_INFO); + + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE, + mep->instance)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE, + mep->cc_config.enable)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL, + mep->cc_config.exp_interval)) + goto nla_put_failure; + + if (nla_put(skb, IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID, + sizeof(mep->cc_config.exp_maid.data), + mep->cc_config.exp_maid.data)) + goto nla_put_failure; + + 
nla_nest_end(skb, tb); + + tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_RDI_INFO); + + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_RDI_INSTANCE, + mep->instance)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_RDI_RDI, + mep->rdi)) + goto nla_put_failure; + + nla_nest_end(skb, tb); + + tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_INFO); + + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE, + mep->instance)) + goto nla_put_failure; + + if (nla_put(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC, + sizeof(mep->cc_ccm_tx_info.dmac), + mep->cc_ccm_tx_info.dmac.addr)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE, + mep->cc_ccm_tx_info.seq_no_update)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD, + mep->cc_ccm_tx_info.period)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV, + mep->cc_ccm_tx_info.if_tlv)) + goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE, + mep->cc_ccm_tx_info.if_tlv_value)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV, + mep->cc_ccm_tx_info.port_tlv)) + goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE, + mep->cc_ccm_tx_info.port_tlv_value)) + goto nla_put_failure; + + nla_nest_end(skb, tb); + + hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) { + tb = nla_nest_start(skb, + IFLA_BRIDGE_CFM_CC_PEER_MEP_INFO); + + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE, + mep->instance)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_MEPID, + peer_mep->mepid)) + goto nla_put_failure; + + nla_nest_end(skb, tb); + } + } + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, tb); + +nla_info_failure: + return -EMSGSIZE; +} + +int br_cfm_status_fill_info(struct 
sk_buff *skb, + struct net_bridge *br, + bool getlink) +{ + struct br_cfm_peer_mep *peer_mep; + struct br_cfm_mep *mep; + struct nlattr *tb; + + hlist_for_each_entry_rcu(mep, &br->mep_list, head) { + tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_STATUS_INFO); + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE, + mep->instance)) + goto nla_put_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN, + mep->status.opcode_unexp_seen)) + goto nla_put_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN, + mep->status.version_unexp_seen)) + goto nla_put_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN, + mep->status.rx_level_low_seen)) + goto nla_put_failure; + + /* Only clear if this is a GETLINK */ + if (getlink) { + /* Clear all 'seen' indications */ + mep->status.opcode_unexp_seen = false; + mep->status.version_unexp_seen = false; + mep->status.rx_level_low_seen = false; + } + + nla_nest_end(skb, tb); + + hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) { + tb = nla_nest_start(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_INFO); + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE, + mep->instance)) + goto nla_put_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID, + peer_mep->mepid)) + goto nla_put_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT, + peer_mep->cc_status.ccm_defect)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI, + peer_mep->cc_status.rdi)) + goto nla_put_failure; + + if (nla_put_u8(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE, + peer_mep->cc_status.port_tlv_value)) + goto nla_put_failure; + + if (nla_put_u8(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE, + peer_mep->cc_status.if_tlv_value)) + goto nla_put_failure; + + if (nla_put_u32(skb, + 
IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN, + peer_mep->cc_status.seen)) + goto nla_put_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN, + peer_mep->cc_status.tlv_seen)) + goto nla_put_failure; + + if (nla_put_u32(skb, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN, + peer_mep->cc_status.seq_unexp_seen)) + goto nla_put_failure; + + if (getlink) { /* Only clear if this is a GETLINK */ + /* Clear all 'seen' indications */ + peer_mep->cc_status.seen = false; + peer_mep->cc_status.tlv_seen = false; + peer_mep->cc_status.seq_unexp_seen = false; + } + + nla_nest_end(skb, tb); + } + } + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, tb); + +nla_info_failure: + return -EMSGSIZE; +} diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 6f742fee874a..3f2f06b4dd27 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -30,7 +30,6 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct net_bridge *br = netdev_priv(dev); struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; - struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); const struct nf_br_ops *nf_ops; u8 state = BR_STATE_FORWARDING; const unsigned char *dest; @@ -45,10 +44,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } - u64_stats_update_begin(&brstats->syncp); - brstats->tx_packets++; - brstats->tx_bytes += skb->len; - u64_stats_update_end(&brstats->syncp); + dev_sw_netstats_tx_add(dev, 1, skb->len); br_switchdev_frame_unmark(skb); BR_INPUT_SKB_CB(skb)->brdev = dev; @@ -93,7 +89,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) mdst = br_mdb_get(br, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && - br_multicast_querier_exists(br, eth_hdr(skb))) + br_multicast_querier_exists(br, eth_hdr(skb), mdst)) br_multicast_flood(mdst, skb, false, true); else br_flood(br, skb, BR_PKT_MULTICAST, false, true); @@ -119,26 +115,26 @@ static int 
br_dev_init(struct net_device *dev) struct net_bridge *br = netdev_priv(dev); int err; - br->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!br->stats) + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) return -ENOMEM; err = br_fdb_hash_init(br); if (err) { - free_percpu(br->stats); + free_percpu(dev->tstats); return err; } err = br_mdb_hash_init(br); if (err) { - free_percpu(br->stats); + free_percpu(dev->tstats); br_fdb_hash_fini(br); return err; } err = br_vlan_init(br); if (err) { - free_percpu(br->stats); + free_percpu(dev->tstats); br_mdb_hash_fini(br); br_fdb_hash_fini(br); return err; @@ -146,7 +142,7 @@ static int br_dev_init(struct net_device *dev) err = br_multicast_init_stats(br); if (err) { - free_percpu(br->stats); + free_percpu(dev->tstats); br_vlan_flush(br); br_mdb_hash_fini(br); br_fdb_hash_fini(br); @@ -165,7 +161,7 @@ static void br_dev_uninit(struct net_device *dev) br_vlan_flush(br); br_mdb_hash_fini(br); br_fdb_hash_fini(br); - free_percpu(br->stats); + free_percpu(dev->tstats); } static int br_dev_open(struct net_device *dev) @@ -177,6 +173,9 @@ static int br_dev_open(struct net_device *dev) br_stp_enable_bridge(br); br_multicast_open(br); + if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) + br_multicast_join_snoopers(br); + return 0; } @@ -197,19 +196,14 @@ static int br_dev_stop(struct net_device *dev) br_stp_disable_bridge(br); br_multicast_stop(br); + if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) + br_multicast_leave_snoopers(br); + netif_stop_queue(dev); return 0; } -static void br_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *stats) -{ - struct net_bridge *br = netdev_priv(dev); - - dev_fetch_sw_netstats(stats, br->stats); -} - static int br_change_mtu(struct net_device *dev, int new_mtu) { struct net_bridge *br = netdev_priv(dev); @@ -403,7 +397,7 @@ static const struct net_device_ops br_netdev_ops = { .ndo_init = br_dev_init, .ndo_uninit = br_dev_uninit, .ndo_start_xmit 
= br_dev_xmit, - .ndo_get_stats64 = br_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = br_set_mac_address, .ndo_set_rx_mode = br_dev_set_multicast_list, .ndo_change_rx_flags = br_dev_change_rx_flags, @@ -454,8 +448,12 @@ void br_dev_setup(struct net_device *dev) spin_lock_init(&br->lock); INIT_LIST_HEAD(&br->port_list); INIT_HLIST_HEAD(&br->fdb_list); + INIT_HLIST_HEAD(&br->frame_type_list); #if IS_ENABLED(CONFIG_BRIDGE_MRP) - INIT_LIST_HEAD(&br->mrp_list); + INIT_HLIST_HEAD(&br->mrp_list); +#endif +#if IS_ENABLED(CONFIG_BRIDGE_CFM) + INIT_HLIST_HEAD(&br->mep_list); #endif spin_lock_init(&br->hash_lock); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 32ac8343b0ba..b7490237f3fc 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -602,6 +602,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, /* fastpath: update of existing entry */ if (unlikely(source != fdb->dst && !test_bit(BR_FDB_STICKY, &fdb->flags))) { + br_switchdev_fdb_notify(fdb, RTM_DELNEIGH); fdb->dst = source; fdb_modified = true; /* Take over HW learned entry */ diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index e28ffadd1371..6e9b049ae521 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -39,8 +39,7 @@ int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb br_drop_fake_rtable(skb); if (skb->ip_summed == CHECKSUM_PARTIAL && - (skb->protocol == htons(ETH_P_8021Q) || - skb->protocol == htons(ETH_P_8021AD))) { + eth_type_vlan(skb->protocol)) { int depth; if (!__vlan_get_protocol(skb, skb->protocol, &depth)) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index a0e9a7937412..f7d2f472ae24 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -334,6 +334,7 @@ static void del_nbp(struct net_bridge_port *p) spin_unlock_bh(&br->lock); br_mrp_port_del(br, p); + br_cfm_port_del(br, p); br_ifinfo_notify(RTM_DELLINK, NULL, p); diff --git a/net/bridge/br_input.c 
b/net/bridge/br_input.c index 59a318b9f646..222285d9dae2 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -35,16 +35,12 @@ static int br_pass_frame_up(struct sk_buff *skb) struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; struct net_bridge *br = netdev_priv(brdev); struct net_bridge_vlan_group *vg; - struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); - u64_stats_update_begin(&brstats->syncp); - brstats->rx_packets++; - brstats->rx_bytes += skb->len; - u64_stats_update_end(&brstats->syncp); + dev_sw_netstats_rx_add(brdev, skb->len); vg = br_vlan_group_rcu(br); /* Bridge is just like any other port. Make sure the - * packet is allowed except in promisc modue when someone + * packet is allowed except in promisc mode when someone * may be running packet capture. */ if (!(brdev->flags & IFF_PROMISC) && @@ -134,7 +130,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb case BR_PKT_MULTICAST: mdst = br_mdb_get(br, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && - br_multicast_querier_exists(br, eth_hdr(skb))) { + br_multicast_querier_exists(br, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || br_multicast_is_router(br)) { local_rcv = true; @@ -254,6 +250,21 @@ frame_finish: return RX_HANDLER_CONSUMED; } +/* Return 0 if the frame was not processed otherwise 1 + * note: already called with rcu_read_lock + */ +static int br_process_frame_type(struct net_bridge_port *p, + struct sk_buff *skb) +{ + struct br_frame_type *tmp; + + hlist_for_each_entry_rcu(tmp, &p->br->frame_type_list, list) + if (unlikely(tmp->type == skb->protocol)) + return tmp->frame_handler(p, skb); + + return 0; +} + /* * Return NULL if skb is handled * note: already called with rcu_read_lock @@ -343,7 +354,7 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) } } - if (unlikely(br_mrp_process(p, skb))) + if (unlikely(br_process_frame_type(p, skb))) return RX_HANDLER_PASS; forward: @@ -380,3 
+391,19 @@ rx_handler_func_t *br_get_rx_handler(const struct net_device *dev) return br_handle_frame; } + +void br_add_frame(struct net_bridge *br, struct br_frame_type *ft) +{ + hlist_add_head_rcu(&ft->list, &br->frame_type_list); +} + +void br_del_frame(struct net_bridge *br, struct br_frame_type *ft) +{ + struct br_frame_type *tmp; + + hlist_for_each_entry(tmp, &br->frame_type_list, list) + if (ft == tmp) { + hlist_del_rcu(&ft->list); + return; + } +} diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index e15bab19a012..8846c5bcd075 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -87,6 +87,8 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, ip->src.ip6 = nla_get_in6_addr(mdb_attrs[MDBE_ATTR_SOURCE]); break; #endif + default: + ether_addr_copy(ip->dst.mac_addr, entry->addr.u.mac_addr); } } @@ -174,9 +176,11 @@ static int __mdb_fill_info(struct sk_buff *skb, if (mp->addr.proto == htons(ETH_P_IP)) e.addr.u.ip4 = mp->addr.dst.ip4; #if IS_ENABLED(CONFIG_IPV6) - if (mp->addr.proto == htons(ETH_P_IPV6)) + else if (mp->addr.proto == htons(ETH_P_IPV6)) e.addr.u.ip6 = mp->addr.dst.ip6; #endif + else + ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); e.addr.proto = mp->addr.proto; nest_ent = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY_INFO); @@ -210,6 +214,8 @@ static int __mdb_fill_info(struct sk_buff *skb, } break; #endif + default: + ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); } if (p) { if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, p->rt_protocol)) @@ -562,9 +568,12 @@ void br_mdb_notify(struct net_device *dev, if (mp->addr.proto == htons(ETH_P_IP)) ip_eth_mc_map(mp->addr.dst.ip4, mdb.addr); #if IS_ENABLED(CONFIG_IPV6) - else + else if (mp->addr.proto == htons(ETH_P_IPV6)) ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb.addr); #endif + else + ether_addr_copy(mdb.addr, mp->addr.dst.mac_addr); + mdb.obj.orig_dev = pg->key.port->dev; switch (type) { case RTM_NEWMDB: @@ -693,6 +702,12 @@ static bool 
is_valid_mdb_entry(struct br_mdb_entry *entry, return false; } #endif + } else if (entry->addr.proto == 0) { + /* L2 mdb */ + if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) { + NL_SET_ERR_MSG_MOD(extack, "L2 entry group is not multicast"); + return false; + } } else { NL_SET_ERR_MSG_MOD(extack, "Unknown entry protocol"); return false; @@ -831,6 +846,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, struct net_bridge_port_group __rcu **pp; struct br_ip group, star_group; unsigned long now = jiffies; + unsigned char flags = 0; u8 filter_mode; int err; @@ -849,6 +865,11 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, } } + if (br_group_is_l2(&group) && entry->state != MDB_PERMANENT) { + NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed"); + return -EINVAL; + } + mp = br_mdb_ip_get(br, &group); if (!mp) { mp = br_multicast_new_group(br, &group); @@ -884,7 +905,10 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, filter_mode = br_multicast_is_star_g(&group) ? 
MCAST_EXCLUDE : MCAST_INCLUDE; - p = br_multicast_new_port_group(port, &group, *pp, entry->state, NULL, + if (entry->state == MDB_PERMANENT) + flags |= MDB_PG_FLAGS_PERMANENT; + + p = br_multicast_new_port_group(port, &group, *pp, flags, NULL, filter_mode, RTPROT_STATIC); if (unlikely(!p)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group"); diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index b36689e6e7cb..12487f6fe9b4 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -6,6 +6,13 @@ static const u8 mrp_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x1 }; static const u8 mrp_in_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x3 }; +static int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb); + +static struct br_frame_type mrp_frame_type __read_mostly = { + .type = cpu_to_be16(ETH_P_MRP), + .frame_handler = br_mrp_process, +}; + static bool br_mrp_is_ring_port(struct net_bridge_port *p_port, struct net_bridge_port *s_port, struct net_bridge_port *port) @@ -47,8 +54,8 @@ static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id) struct br_mrp *res = NULL; struct br_mrp *mrp; - list_for_each_entry_rcu(mrp, &br->mrp_list, list, - lockdep_rtnl_is_held()) { + hlist_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { if (mrp->ring_id == ring_id) { res = mrp; break; @@ -63,8 +70,8 @@ static struct br_mrp *br_mrp_find_in_id(struct net_bridge *br, u32 in_id) struct br_mrp *res = NULL; struct br_mrp *mrp; - list_for_each_entry_rcu(mrp, &br->mrp_list, list, - lockdep_rtnl_is_held()) { + hlist_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { if (mrp->in_id == in_id) { res = mrp; break; @@ -78,8 +85,8 @@ static bool br_mrp_unique_ifindex(struct net_bridge *br, u32 ifindex) { struct br_mrp *mrp; - list_for_each_entry_rcu(mrp, &br->mrp_list, list, - lockdep_rtnl_is_held()) { + hlist_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { struct 
net_bridge_port *p; p = rtnl_dereference(mrp->p_port); @@ -104,8 +111,8 @@ static struct br_mrp *br_mrp_find_port(struct net_bridge *br, struct br_mrp *res = NULL; struct br_mrp *mrp; - list_for_each_entry_rcu(mrp, &br->mrp_list, list, - lockdep_rtnl_is_held()) { + hlist_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { if (rcu_access_pointer(mrp->p_port) == p || rcu_access_pointer(mrp->s_port) == p || rcu_access_pointer(mrp->i_port) == p) { @@ -443,8 +450,11 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) rcu_assign_pointer(mrp->i_port, NULL); } - list_del_rcu(&mrp->list); + hlist_del_rcu(&mrp->list); kfree_rcu(mrp, rcu); + + if (hlist_empty(&br->mrp_list)) + br_del_frame(br, &mrp_frame_type); } /* Adds a new MRP instance. @@ -493,9 +503,12 @@ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) spin_unlock_bh(&br->lock); rcu_assign_pointer(mrp->s_port, p); + if (hlist_empty(&br->mrp_list)) + br_add_frame(br, &mrp_frame_type); + INIT_DELAYED_WORK(&mrp->test_work, br_mrp_test_work_expired); INIT_DELAYED_WORK(&mrp->in_test_work, br_mrp_in_test_work_expired); - list_add_tail_rcu(&mrp->list, &br->mrp_list); + hlist_add_tail_rcu(&mrp->list, &br->mrp_list); err = br_mrp_switchdev_add(br, mrp); if (err) @@ -544,19 +557,22 @@ int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance) int br_mrp_set_port_state(struct net_bridge_port *p, enum br_mrp_port_state_type state) { + u32 port_state; + if (!p || !(p->flags & BR_MRP_AWARE)) return -EINVAL; spin_lock_bh(&p->br->lock); if (state == BR_MRP_PORT_STATE_FORWARDING) - p->state = BR_STATE_FORWARDING; + port_state = BR_STATE_FORWARDING; else - p->state = BR_STATE_BLOCKING; + port_state = BR_STATE_BLOCKING; + p->state = port_state; spin_unlock_bh(&p->br->lock); - br_mrp_port_switchdev_set_state(p, state); + br_mrp_port_switchdev_set_state(p, port_state); return 0; } @@ -623,7 +639,7 @@ int br_mrp_set_ring_role(struct net_bridge *br, struct 
br_mrp_ring_role *role) { struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id); - int err; + enum br_mrp_hw_support support; if (!mrp) return -EINVAL; @@ -631,9 +647,9 @@ int br_mrp_set_ring_role(struct net_bridge *br, mrp->ring_role = role->ring_role; /* If there is an error just bailed out */ - err = br_mrp_switchdev_set_ring_role(br, mrp, role->ring_role); - if (err && err != -EOPNOTSUPP) - return err; + support = br_mrp_switchdev_set_ring_role(br, mrp, role->ring_role); + if (support == BR_MRP_NONE) + return -EOPNOTSUPP; /* Now detect if the HW actually applied the role or not. If the HW * applied the role it means that the SW will not to do those operations @@ -641,7 +657,7 @@ int br_mrp_set_ring_role(struct net_bridge *br, * SW when ring is open, but if the is not pushed to the HW the SW will * need to detect when the ring is open */ - mrp->ring_role_offloaded = err == -EOPNOTSUPP ? 0 : 1; + mrp->ring_role_offloaded = support == BR_MRP_SW ? 0 : 1; return 0; } @@ -654,6 +670,7 @@ int br_mrp_start_test(struct net_bridge *br, struct br_mrp_start_test *test) { struct br_mrp *mrp = br_mrp_find_id(br, test->ring_id); + enum br_mrp_hw_support support; if (!mrp) return -EINVAL; @@ -661,9 +678,13 @@ int br_mrp_start_test(struct net_bridge *br, /* Try to push it to the HW and if it fails then continue with SW * implementation and if that also fails then return error. 
*/ - if (!br_mrp_switchdev_send_ring_test(br, mrp, test->interval, - test->max_miss, test->period, - test->monitor)) + support = br_mrp_switchdev_send_ring_test(br, mrp, test->interval, + test->max_miss, test->period, + test->monitor); + if (support == BR_MRP_NONE) + return -EOPNOTSUPP; + + if (support == BR_MRP_HW) return 0; mrp->test_interval = test->interval; @@ -705,8 +726,8 @@ int br_mrp_set_in_state(struct net_bridge *br, struct br_mrp_in_state *state) int br_mrp_set_in_role(struct net_bridge *br, struct br_mrp_in_role *role) { struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id); + enum br_mrp_hw_support support; struct net_bridge_port *p; - int err; if (!mrp) return -EINVAL; @@ -764,10 +785,10 @@ int br_mrp_set_in_role(struct net_bridge *br, struct br_mrp_in_role *role) mrp->in_id = role->in_id; /* If there is an error just bailed out */ - err = br_mrp_switchdev_set_in_role(br, mrp, role->in_id, - role->ring_id, role->in_role); - if (err && err != -EOPNOTSUPP) - return err; + support = br_mrp_switchdev_set_in_role(br, mrp, role->in_id, + role->ring_id, role->in_role); + if (support == BR_MRP_NONE) + return -EOPNOTSUPP; /* Now detect if the HW actually applied the role or not. If the HW * applied the role it means that the SW will not to do those operations @@ -775,7 +796,7 @@ int br_mrp_set_in_role(struct net_bridge *br, struct br_mrp_in_role *role) * SW when interconnect ring is open, but if the is not pushed to the HW * the SW will need to detect when the interconnect ring is open. */ - mrp->in_role_offloaded = err == -EOPNOTSUPP ? 0 : 1; + mrp->in_role_offloaded = support == BR_MRP_SW ? 
0 : 1; return 0; } @@ -788,6 +809,7 @@ int br_mrp_start_in_test(struct net_bridge *br, struct br_mrp_start_in_test *in_test) { struct br_mrp *mrp = br_mrp_find_in_id(br, in_test->in_id); + enum br_mrp_hw_support support; if (!mrp) return -EINVAL; @@ -798,8 +820,13 @@ int br_mrp_start_in_test(struct net_bridge *br, /* Try to push it to the HW and if it fails then continue with SW * implementation and if that also fails then return error. */ - if (!br_mrp_switchdev_send_in_test(br, mrp, in_test->interval, - in_test->max_miss, in_test->period)) + support = br_mrp_switchdev_send_in_test(br, mrp, in_test->interval, + in_test->max_miss, + in_test->period); + if (support == BR_MRP_NONE) + return -EOPNOTSUPP; + + if (support == BR_MRP_HW) return 0; mrp->in_test_interval = in_test->interval; @@ -812,7 +839,7 @@ int br_mrp_start_in_test(struct net_bridge *br, return 0; } -/* Determin if the frame type is a ring frame */ +/* Determine if the frame type is a ring frame */ static bool br_mrp_ring_frame(struct sk_buff *skb) { const struct br_mrp_tlv_hdr *hdr; @@ -832,7 +859,7 @@ static bool br_mrp_ring_frame(struct sk_buff *skb) return false; } -/* Determin if the frame type is an interconnect frame */ +/* Determine if the frame type is an interconnect frame */ static bool br_mrp_in_frame(struct sk_buff *skb) { const struct br_mrp_tlv_hdr *hdr; @@ -845,7 +872,8 @@ static bool br_mrp_in_frame(struct sk_buff *skb) if (hdr->type == BR_MRP_TLV_HEADER_IN_TEST || hdr->type == BR_MRP_TLV_HEADER_IN_TOPO || hdr->type == BR_MRP_TLV_HEADER_IN_LINK_DOWN || - hdr->type == BR_MRP_TLV_HEADER_IN_LINK_UP) + hdr->type == BR_MRP_TLV_HEADER_IN_LINK_UP || + hdr->type == BR_MRP_TLV_HEADER_IN_LINK_STATUS) return true; return false; @@ -880,7 +908,7 @@ static void br_mrp_mrm_process(struct br_mrp *mrp, struct net_bridge_port *port, br_mrp_ring_port_open(port->dev, false); } -/* Determin if the test hdr has a better priority than the node */ +/* Determine if the test hdr has a better priority than the 
node */ static bool br_mrp_test_better_than_own(struct br_mrp *mrp, struct net_bridge *br, const struct br_mrp_ring_test_hdr *hdr) @@ -1113,9 +1141,9 @@ static int br_mrp_rcv(struct net_bridge_port *p, goto no_forward; } } else { - /* MIM should forward IntLinkChange and + /* MIM should forward IntLinkChange/Status and * IntTopoChange between ring ports but MIM - * should not forward IntLinkChange and + * should not forward IntLinkChange/Status and * IntTopoChange if the frame was received at * the interconnect port */ @@ -1142,6 +1170,17 @@ static int br_mrp_rcv(struct net_bridge_port *p, in_type == BR_MRP_TLV_HEADER_IN_LINK_DOWN)) goto forward; + /* MIC should forward IntLinkStatus frames only to + * interconnect port if it was received on a ring port. + * If it is received on interconnect port then, it + * should be forward on both ring ports + */ + if (br_mrp_is_ring_port(p_port, s_port, p) && + in_type == BR_MRP_TLV_HEADER_IN_LINK_STATUS) { + p_dst = NULL; + s_dst = NULL; + } + /* Should forward the InTopo frames only between the * ring ports */ @@ -1172,20 +1211,18 @@ no_forward: * normal forwarding. 
* note: already called with rcu_read_lock */ -int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) +static int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) { /* If there is no MRP instance do normal forwarding */ if (likely(!(p->flags & BR_MRP_AWARE))) goto out; - if (unlikely(skb->protocol == htons(ETH_P_MRP))) - return br_mrp_rcv(p, skb, p->dev); - + return br_mrp_rcv(p, skb, p->dev); out: return 0; } bool br_mrp_enabled(struct net_bridge *br) { - return !list_empty(&br->mrp_list); + return !hlist_empty(&br->mrp_list); } diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index 2a2fdf3500c5..ce6f63c77cc0 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -453,7 +453,7 @@ int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br) if (!mrp_tb) return -EMSGSIZE; - list_for_each_entry_rcu(mrp, &br->mrp_list, list) { + hlist_for_each_entry_rcu(mrp, &br->mrp_list, list) { struct net_bridge_port *p; tb = nla_nest_start_noflag(skb, IFLA_BRIDGE_MRP_INFO); diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c index ed547e03ace1..cb54b324fa8c 100644 --- a/net/bridge/br_mrp_switchdev.c +++ b/net/bridge/br_mrp_switchdev.c @@ -4,6 +4,30 @@ #include "br_private_mrp.h" +static enum br_mrp_hw_support +br_mrp_switchdev_port_obj(struct net_bridge *br, + const struct switchdev_obj *obj, bool add) +{ + int err; + + if (add) + err = switchdev_port_obj_add(br->dev, obj, NULL); + else + err = switchdev_port_obj_del(br->dev, obj); + + /* In case of success just return and notify the SW that doesn't need + * to do anything + */ + if (!err) + return BR_MRP_HW; + + if (err != -EOPNOTSUPP) + return BR_MRP_NONE; + + /* Continue with SW backup */ + return BR_MRP_SW; +} + int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp) { struct switchdev_obj_mrp mrp_obj = { @@ -14,14 +38,11 @@ int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp) .ring_id = mrp->ring_id, 
.prio = mrp->prio, }; - int err; - err = switchdev_port_obj_add(br->dev, &mrp_obj.obj, NULL); + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return 0; - if (err && err != -EOPNOTSUPP) - return err; - - return 0; + return switchdev_port_obj_add(br->dev, &mrp_obj.obj, NULL); } int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp) @@ -33,40 +54,54 @@ int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp) .s_port = NULL, .ring_id = mrp->ring_id, }; - int err; - - err = switchdev_port_obj_del(br->dev, &mrp_obj.obj); - if (err && err != -EOPNOTSUPP) - return err; + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return 0; - return 0; + return switchdev_port_obj_del(br->dev, &mrp_obj.obj); } -int br_mrp_switchdev_set_ring_role(struct net_bridge *br, - struct br_mrp *mrp, - enum br_mrp_ring_role_type role) +enum br_mrp_hw_support +br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_ring_role_type role) { struct switchdev_obj_ring_role_mrp mrp_role = { .obj.orig_dev = br->dev, .obj.id = SWITCHDEV_OBJ_ID_RING_ROLE_MRP, .ring_role = role, .ring_id = mrp->ring_id, + .sw_backup = false, }; + enum br_mrp_hw_support support; int err; - if (role == BR_MRP_RING_ROLE_DISABLED) - err = switchdev_port_obj_del(br->dev, &mrp_role.obj); - else + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return BR_MRP_SW; + + support = br_mrp_switchdev_port_obj(br, &mrp_role.obj, + role != BR_MRP_RING_ROLE_DISABLED); + if (support != BR_MRP_SW) + return support; + + /* If the driver can't configure to run completely the protocol in HW, + * then try again to configure the HW so the SW can run the protocol. 
+ */ + mrp_role.sw_backup = true; + if (role != BR_MRP_RING_ROLE_DISABLED) err = switchdev_port_obj_add(br->dev, &mrp_role.obj, NULL); + else + err = switchdev_port_obj_del(br->dev, &mrp_role.obj); - return err; + if (!err) + return BR_MRP_SW; + + return BR_MRP_NONE; } -int br_mrp_switchdev_send_ring_test(struct net_bridge *br, - struct br_mrp *mrp, u32 interval, - u8 max_miss, u32 period, - bool monitor) +enum br_mrp_hw_support +br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period, + bool monitor) { struct switchdev_obj_ring_test_mrp test = { .obj.orig_dev = br->dev, @@ -77,14 +112,11 @@ int br_mrp_switchdev_send_ring_test(struct net_bridge *br, .period = period, .monitor = monitor, }; - int err; - if (interval == 0) - err = switchdev_port_obj_del(br->dev, &test.obj); - else - err = switchdev_port_obj_add(br->dev, &test.obj, NULL); + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return BR_MRP_SW; - return err; + return br_mrp_switchdev_port_obj(br, &test.obj, interval != 0); } int br_mrp_switchdev_set_ring_state(struct net_bridge *br, @@ -97,19 +129,17 @@ int br_mrp_switchdev_set_ring_state(struct net_bridge *br, .ring_state = state, .ring_id = mrp->ring_id, }; - int err; - - err = switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL); - if (err && err != -EOPNOTSUPP) - return err; + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return 0; - return 0; + return switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL); } -int br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp, - u16 in_id, u32 ring_id, - enum br_mrp_in_role_type role) +enum br_mrp_hw_support +br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp, + u16 in_id, u32 ring_id, + enum br_mrp_in_role_type role) { struct switchdev_obj_in_role_mrp mrp_role = { .obj.orig_dev = br->dev, @@ -118,15 +148,32 @@ int br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp, .in_id = mrp->in_id, .ring_id = 
mrp->ring_id, .i_port = rtnl_dereference(mrp->i_port)->dev, + .sw_backup = false, }; + enum br_mrp_hw_support support; int err; - if (role == BR_MRP_IN_ROLE_DISABLED) - err = switchdev_port_obj_del(br->dev, &mrp_role.obj); - else + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return BR_MRP_SW; + + support = br_mrp_switchdev_port_obj(br, &mrp_role.obj, + role != BR_MRP_IN_ROLE_DISABLED); + if (support != BR_MRP_NONE) + return support; + + /* If the driver can't configure to run completely the protocol in HW, + * then try again to configure the HW so the SW can run the protocol. + */ + mrp_role.sw_backup = true; + if (role != BR_MRP_IN_ROLE_DISABLED) err = switchdev_port_obj_add(br->dev, &mrp_role.obj, NULL); + else + err = switchdev_port_obj_del(br->dev, &mrp_role.obj); + + if (!err) + return BR_MRP_SW; - return err; + return BR_MRP_NONE; } int br_mrp_switchdev_set_in_state(struct net_bridge *br, struct br_mrp *mrp, @@ -138,18 +185,16 @@ int br_mrp_switchdev_set_in_state(struct net_bridge *br, struct br_mrp *mrp, .in_state = state, .in_id = mrp->in_id, }; - int err; - - err = switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL); - if (err && err != -EOPNOTSUPP) - return err; + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return 0; - return 0; + return switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL); } -int br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp, - u32 interval, u8 max_miss, u32 period) +enum br_mrp_hw_support +br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period) { struct switchdev_obj_in_test_mrp test = { .obj.orig_dev = br->dev, @@ -159,32 +204,25 @@ int br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp, .in_id = mrp->in_id, .period = period, }; - int err; - if (interval == 0) - err = switchdev_port_obj_del(br->dev, &test.obj); - else - err = switchdev_port_obj_add(br->dev, &test.obj, NULL); + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return BR_MRP_SW; 
- return err; + return br_mrp_switchdev_port_obj(br, &test.obj, interval != 0); } -int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, - enum br_mrp_port_state_type state) +int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state) { struct switchdev_attr attr = { .orig_dev = p->dev, - .id = SWITCHDEV_ATTR_ID_MRP_PORT_STATE, - .u.mrp_port_state = state, + .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE, + .u.stp_state = state, }; - int err; - err = switchdev_port_attr_set(p->dev, &attr); - if (err && err != -EOPNOTSUPP) - br_warn(p->br, "error setting offload MRP state on port %u(%s)\n", - (unsigned int)p->port_no, p->dev->name); + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return 0; - return err; + return switchdev_port_attr_set(p->dev, &attr, NULL); } int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, @@ -195,11 +233,9 @@ int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, .id = SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, .u.mrp_port_role = role, }; - int err; - err = switchdev_port_attr_set(p->dev, &attr); - if (err && err != -EOPNOTSUPP) - return err; + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return 0; - return 0; + return switchdev_port_attr_set(p->dev, &attr, NULL); } diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index eae898c3cff7..9d265447d654 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -33,6 +33,7 @@ #endif #include "br_private.h" +#include "br_private_mcast_eht.h" static const struct rhashtable_params br_mdb_rht_params = { .head_offset = offsetof(struct net_bridge_mdb_entry, rhnode), @@ -179,7 +180,8 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, break; #endif default: - return NULL; + ip.proto = 0; + ether_addr_copy(ip.dst.mac_addr, eth_hdr(skb)->h_dest); } return br_mdb_ip_get_rcu(br, &ip); @@ -440,7 +442,8 @@ static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) br_multicast_sg_add_exclude_ports(star_mp, sg); } -static void 
br_multicast_fwd_src_remove(struct net_bridge_group_src *src) +static void br_multicast_fwd_src_remove(struct net_bridge_group_src *src, + bool fastleave) { struct net_bridge_port_group *p, *pg = src->pg; struct net_bridge_port_group __rcu **pp; @@ -465,6 +468,8 @@ static void br_multicast_fwd_src_remove(struct net_bridge_group_src *src) (p->flags & MDB_PG_FLAGS_PERMANENT)) break; + if (fastleave) + p->flags |= MDB_PG_FLAGS_FAST_LEAVE; br_multicast_del_pg(mp, p, pp); break; } @@ -558,11 +563,12 @@ static void br_multicast_destroy_group_src(struct net_bridge_mcast_gc *gc) kfree_rcu(src, rcu); } -static void br_multicast_del_group_src(struct net_bridge_group_src *src) +void br_multicast_del_group_src(struct net_bridge_group_src *src, + bool fastleave) { struct net_bridge *br = src->pg->key.port->br; - br_multicast_fwd_src_remove(src); + br_multicast_fwd_src_remove(src, fastleave); hlist_del_init_rcu(&src->node); src->pg->src_ents--; hlist_add_head(&src->mcast_gc.gc_node, &br->mcast_gc_list); @@ -592,8 +598,9 @@ void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, rcu_assign_pointer(*pp, pg->next); hlist_del_init(&pg->mglist); + br_multicast_eht_clean_sets(pg); hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) - br_multicast_del_group_src(ent); + br_multicast_del_group_src(ent, false); br_mdb_notify(br->dev, mp, pg, RTM_DELMDB); if (!br_multicast_is_star_g(&mp->addr)) { rhashtable_remove_fast(&br->sg_port_tbl, &pg->rhnode, @@ -650,7 +657,7 @@ static void br_multicast_port_group_expired(struct timer_list *t) pg->filter_mode = MCAST_INCLUDE; hlist_for_each_entry_safe(src_ent, tmp, &pg->src_list, node) { if (!timer_pending(&src_ent->timer)) { - br_multicast_del_group_src(src_ent); + br_multicast_del_group_src(src_ent, false); changed = true; } } @@ -1077,7 +1084,7 @@ static void br_multicast_group_src_expired(struct timer_list *t) pg = src->pg; if (pg->filter_mode == MCAST_INCLUDE) { - br_multicast_del_group_src(src); + br_multicast_del_group_src(src, false); 
if (!hlist_empty(&pg->src_list)) goto out; br_multicast_find_del_pg(br, pg); @@ -1089,7 +1096,7 @@ out: spin_unlock(&br->multicast_lock); } -static struct net_bridge_group_src * +struct net_bridge_group_src * br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip) { struct net_bridge_group_src *ent; @@ -1171,6 +1178,8 @@ struct net_bridge_port_group *br_multicast_new_port_group( p->flags = flags; p->filter_mode = filter_mode; p->rt_protocol = rt_protocol; + p->eht_host_tree = RB_ROOT; + p->eht_set_tree = RB_ROOT; p->mcast_gc.destroy = br_multicast_destroy_port_group; INIT_HLIST_HEAD(&p->src_list); @@ -1203,6 +1212,10 @@ void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify) if (notify) br_mdb_notify(mp->br->dev, mp, NULL, RTM_NEWMDB); } + + if (br_group_is_l2(&mp->addr)) + return; + mod_timer(&mp->timer, jiffies + mp->br->multicast_membership_interval); } @@ -1238,7 +1251,7 @@ __br_multicast_add_group(struct net_bridge *br, mp = br_multicast_new_group(br, group); if (IS_ERR(mp)) - return ERR_PTR(PTR_ERR(mp)); + return ERR_CAST(mp); if (!port) { br_multicast_host_join(mp, true); @@ -1254,8 +1267,8 @@ __br_multicast_add_group(struct net_bridge *br, break; } - p = br_multicast_new_port_group(port, group, *pp, 0, src, filter_mode, - RTPROT_KERNEL); + p = br_multicast_new_port_group(port, group, *pp, 0, src, + filter_mode, RTPROT_KERNEL); if (unlikely(!p)) { p = ERR_PTR(-ENOMEM); goto out; @@ -1287,7 +1300,7 @@ static int br_multicast_add_group(struct net_bridge *br, pg = __br_multicast_add_group(br, port, group, src, filter_mode, igmpv2_mldv1, false); /* NULL is considered valid for host joined groups */ - err = IS_ERR(pg) ? 
PTR_ERR(pg) : 0; + err = PTR_ERR_OR_ZERO(pg); spin_unlock(&br->multicast_lock); return err; @@ -1368,7 +1381,7 @@ static void br_mc_router_state_change(struct net_bridge *p, .u.mrouter = is_mc_router, }; - switchdev_port_attr_set(p->dev, &attr); + switchdev_port_attr_set(p->dev, &attr, NULL); } static void br_multicast_local_router_expired(struct timer_list *t) @@ -1589,12 +1602,13 @@ static void br_mc_disabled_update(struct net_device *dev, bool value) .u.mc_disabled = !value, }; - switchdev_port_attr_set(dev, &attr); + switchdev_port_attr_set(dev, &attr, NULL); } int br_multicast_add_port(struct net_bridge_port *port) { port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; + port->multicast_eht_hosts_limit = BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT; timer_setup(&port->multicast_router_timer, br_multicast_router_expired, 0); @@ -1695,7 +1709,7 @@ static int __grp_src_delete_marked(struct net_bridge_port_group *pg) hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) if (ent->flags & BR_SGRP_F_DELETE) { - br_multicast_del_group_src(ent); + br_multicast_del_group_src(ent, false); deleted++; } @@ -1794,8 +1808,9 @@ static void __grp_send_query_and_rexmit(struct net_bridge_port_group *pg) * INCLUDE (A) ALLOW (B) INCLUDE (A+B) (B)=GMI * EXCLUDE (X,Y) ALLOW (A) EXCLUDE (X+A,Y-A) (A)=GMI */ -static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, - void *srcs, u32 nsrcs, size_t src_size) +static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, void *h_addr, + void *srcs, u32 nsrcs, size_t addr_size, + int grec_type) { struct net_bridge *br = pg->key.port->br; struct net_bridge_group_src *ent; @@ -1807,7 +1822,7 @@ static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.src, srcs, src_size); + memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); 
if (!ent) { ent = br_multicast_new_group_src(pg, &src_ip); @@ -1817,9 +1832,11 @@ static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, if (ent) __grp_src_mod_timer(ent, now + br_multicast_gmi(br)); - srcs += src_size; } + if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + changed = true; + return changed; } @@ -1828,8 +1845,9 @@ static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, * Delete (A-B) * Group Timer=GMI */ -static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, - void *srcs, u32 nsrcs, size_t src_size) +static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, void *h_addr, + void *srcs, u32 nsrcs, size_t addr_size, + int grec_type) { struct net_bridge_group_src *ent; struct br_ip src_ip; @@ -1841,7 +1859,7 @@ static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.src, srcs, src_size); + memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) ent->flags &= ~BR_SGRP_F_DELETE; @@ -1849,9 +1867,10 @@ static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, ent = br_multicast_new_group_src(pg, &src_ip); if (ent) br_multicast_fwd_src_handle(ent); - srcs += src_size; } + br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type); + __grp_src_delete_marked(pg); } @@ -1861,8 +1880,9 @@ static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, * Delete (Y-A) * Group Timer=GMI */ -static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg, - void *srcs, u32 nsrcs, size_t src_size) +static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg, void *h_addr, + void *srcs, u32 nsrcs, size_t addr_size, + int grec_type) { struct net_bridge *br = pg->key.port->br; struct net_bridge_group_src *ent; @@ -1877,7 +1897,7 @@ static bool 
__grp_src_isexc_excl(struct net_bridge_port_group *pg, memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.src, srcs, src_size); + memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags &= ~BR_SGRP_F_DELETE; @@ -1889,29 +1909,34 @@ static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg, changed = true; } } - srcs += src_size; } + if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + changed = true; + if (__grp_src_delete_marked(pg)) changed = true; return changed; } -static bool br_multicast_isexc(struct net_bridge_port_group *pg, - void *srcs, u32 nsrcs, size_t src_size) +static bool br_multicast_isexc(struct net_bridge_port_group *pg, void *h_addr, + void *srcs, u32 nsrcs, size_t addr_size, + int grec_type) { struct net_bridge *br = pg->key.port->br; bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: - __grp_src_isexc_incl(pg, srcs, nsrcs, src_size); + __grp_src_isexc_incl(pg, h_addr, srcs, nsrcs, addr_size, + grec_type); br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: - changed = __grp_src_isexc_excl(pg, srcs, nsrcs, src_size); + changed = __grp_src_isexc_excl(pg, h_addr, srcs, nsrcs, addr_size, + grec_type); break; } @@ -1925,8 +1950,9 @@ static bool br_multicast_isexc(struct net_bridge_port_group *pg, * INCLUDE (A) TO_IN (B) INCLUDE (A+B) (B)=GMI * Send Q(G,A-B) */ -static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, - void *srcs, u32 nsrcs, size_t src_size) +static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, void *h_addr, + void *srcs, u32 nsrcs, size_t addr_size, + int grec_type) { struct net_bridge *br = pg->key.port->br; u32 src_idx, to_send = pg->src_ents; @@ -1941,7 +1967,7 @@ static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, memset(&src_ip, 0, 
sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.src, srcs, src_size); + memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags &= ~BR_SGRP_F_SEND; @@ -1953,9 +1979,11 @@ static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, } if (ent) __grp_src_mod_timer(ent, now + br_multicast_gmi(br)); - srcs += src_size; } + if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + changed = true; + if (to_send) __grp_src_query_marked_and_rexmit(pg); @@ -1967,8 +1995,9 @@ static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, * Send Q(G,X-A) * Send Q(G) */ -static bool __grp_src_toin_excl(struct net_bridge_port_group *pg, - void *srcs, u32 nsrcs, size_t src_size) +static bool __grp_src_toin_excl(struct net_bridge_port_group *pg, void *h_addr, + void *srcs, u32 nsrcs, size_t addr_size, + int grec_type) { struct net_bridge *br = pg->key.port->br; u32 src_idx, to_send = pg->src_ents; @@ -1984,7 +2013,7 @@ static bool __grp_src_toin_excl(struct net_bridge_port_group *pg, memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.src, srcs, src_size); + memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { if (timer_pending(&ent->timer)) { @@ -1998,9 +2027,11 @@ static bool __grp_src_toin_excl(struct net_bridge_port_group *pg, } if (ent) __grp_src_mod_timer(ent, now + br_multicast_gmi(br)); - srcs += src_size; } + if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + changed = true; + if (to_send) __grp_src_query_marked_and_rexmit(pg); @@ -2009,20 +2040,32 @@ static bool __grp_src_toin_excl(struct net_bridge_port_group *pg, return changed; } -static bool br_multicast_toin(struct net_bridge_port_group *pg, - void *srcs, u32 nsrcs, 
size_t src_size) +static bool br_multicast_toin(struct net_bridge_port_group *pg, void *h_addr, + void *srcs, u32 nsrcs, size_t addr_size, + int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: - changed = __grp_src_toin_incl(pg, srcs, nsrcs, src_size); + changed = __grp_src_toin_incl(pg, h_addr, srcs, nsrcs, addr_size, + grec_type); break; case MCAST_EXCLUDE: - changed = __grp_src_toin_excl(pg, srcs, nsrcs, src_size); + changed = __grp_src_toin_excl(pg, h_addr, srcs, nsrcs, addr_size, + grec_type); break; } + if (br_multicast_eht_should_del_pg(pg)) { + pg->flags |= MDB_PG_FLAGS_FAST_LEAVE; + br_multicast_find_del_pg(pg->key.port->br, pg); + /* a notification has already been sent and we shouldn't + * access pg after the delete so we have to return false + */ + changed = false; + } + return changed; } @@ -2032,8 +2075,9 @@ static bool br_multicast_toin(struct net_bridge_port_group *pg, * Send Q(G,A*B) * Group Timer=GMI */ -static void __grp_src_toex_incl(struct net_bridge_port_group *pg, - void *srcs, u32 nsrcs, size_t src_size) +static void __grp_src_toex_incl(struct net_bridge_port_group *pg, void *h_addr, + void *srcs, u32 nsrcs, size_t addr_size, + int grec_type) { struct net_bridge_group_src *ent; u32 src_idx, to_send = 0; @@ -2045,7 +2089,7 @@ static void __grp_src_toex_incl(struct net_bridge_port_group *pg, memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.src, srcs, src_size); + memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags = (ent->flags & ~BR_SGRP_F_DELETE) | @@ -2056,9 +2100,10 @@ static void __grp_src_toex_incl(struct net_bridge_port_group *pg, } if (ent) br_multicast_fwd_src_handle(ent); - srcs += src_size; } + br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type); + __grp_src_delete_marked(pg); if (to_send) 
__grp_src_query_marked_and_rexmit(pg); @@ -2071,8 +2116,9 @@ static void __grp_src_toex_incl(struct net_bridge_port_group *pg, * Send Q(G,A-Y) * Group Timer=GMI */ -static bool __grp_src_toex_excl(struct net_bridge_port_group *pg, - void *srcs, u32 nsrcs, size_t src_size) +static bool __grp_src_toex_excl(struct net_bridge_port_group *pg, void *h_addr, + void *srcs, u32 nsrcs, size_t addr_size, + int grec_type) { struct net_bridge_group_src *ent; u32 src_idx, to_send = 0; @@ -2085,7 +2131,7 @@ static bool __grp_src_toex_excl(struct net_bridge_port_group *pg, memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.src, srcs, src_size); + memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags &= ~BR_SGRP_F_DELETE; @@ -2100,9 +2146,11 @@ static bool __grp_src_toex_excl(struct net_bridge_port_group *pg, ent->flags |= BR_SGRP_F_SEND; to_send++; } - srcs += src_size; } + if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + changed = true; + if (__grp_src_delete_marked(pg)) changed = true; if (to_send) @@ -2111,20 +2159,23 @@ static bool __grp_src_toex_excl(struct net_bridge_port_group *pg, return changed; } -static bool br_multicast_toex(struct net_bridge_port_group *pg, - void *srcs, u32 nsrcs, size_t src_size) +static bool br_multicast_toex(struct net_bridge_port_group *pg, void *h_addr, + void *srcs, u32 nsrcs, size_t addr_size, + int grec_type) { struct net_bridge *br = pg->key.port->br; bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: - __grp_src_toex_incl(pg, srcs, nsrcs, src_size); + __grp_src_toex_incl(pg, h_addr, srcs, nsrcs, addr_size, + grec_type); br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: - changed = __grp_src_toex_excl(pg, srcs, nsrcs, src_size); + changed = __grp_src_toex_excl(pg, h_addr, srcs, nsrcs, 
addr_size, + grec_type); break; } @@ -2137,11 +2188,12 @@ static bool br_multicast_toex(struct net_bridge_port_group *pg, /* State Msg type New state Actions * INCLUDE (A) BLOCK (B) INCLUDE (A) Send Q(G,A*B) */ -static void __grp_src_block_incl(struct net_bridge_port_group *pg, - void *srcs, u32 nsrcs, size_t src_size) +static bool __grp_src_block_incl(struct net_bridge_port_group *pg, void *h_addr, + void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; u32 src_idx, to_send = 0; + bool changed = false; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) @@ -2150,28 +2202,29 @@ static void __grp_src_block_incl(struct net_bridge_port_group *pg, memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.src, srcs, src_size); + memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags |= BR_SGRP_F_SEND; to_send++; } - srcs += src_size; } + if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + changed = true; + if (to_send) __grp_src_query_marked_and_rexmit(pg); - if (pg->filter_mode == MCAST_INCLUDE && hlist_empty(&pg->src_list)) - br_multicast_find_del_pg(pg->key.port->br, pg); + return changed; } /* State Msg type New state Actions * EXCLUDE (X,Y) BLOCK (A) EXCLUDE (X+(A-Y),Y) (A-X-Y)=Group Timer * Send Q(G,A-Y) */ -static bool __grp_src_block_excl(struct net_bridge_port_group *pg, - void *srcs, u32 nsrcs, size_t src_size) +static bool __grp_src_block_excl(struct net_bridge_port_group *pg, void *h_addr, + void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; u32 src_idx, to_send = 0; @@ -2184,7 +2237,7 @@ static bool __grp_src_block_excl(struct net_bridge_port_group *pg, memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - 
memcpy(&src_ip.src, srcs, src_size); + memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (!ent) { ent = br_multicast_new_group_src(pg, &src_ip); @@ -2197,29 +2250,44 @@ static bool __grp_src_block_excl(struct net_bridge_port_group *pg, ent->flags |= BR_SGRP_F_SEND; to_send++; } - srcs += src_size; } + if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + changed = true; + if (to_send) __grp_src_query_marked_and_rexmit(pg); return changed; } -static bool br_multicast_block(struct net_bridge_port_group *pg, - void *srcs, u32 nsrcs, size_t src_size) +static bool br_multicast_block(struct net_bridge_port_group *pg, void *h_addr, + void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: - __grp_src_block_incl(pg, srcs, nsrcs, src_size); + changed = __grp_src_block_incl(pg, h_addr, srcs, nsrcs, addr_size, + grec_type); break; case MCAST_EXCLUDE: - changed = __grp_src_block_excl(pg, srcs, nsrcs, src_size); + changed = __grp_src_block_excl(pg, h_addr, srcs, nsrcs, addr_size, + grec_type); break; } + if ((pg->filter_mode == MCAST_INCLUDE && hlist_empty(&pg->src_list)) || + br_multicast_eht_should_del_pg(pg)) { + if (br_multicast_eht_should_del_pg(pg)) + pg->flags |= MDB_PG_FLAGS_FAST_LEAVE; + br_multicast_find_del_pg(pg->key.port->br, pg); + /* a notification has already been sent and we shouldn't + * access pg after the delete so we have to return false + */ + changed = false; + } + return changed; } @@ -2252,8 +2320,8 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, struct igmpv3_report *ih; struct igmpv3_grec *grec; int i, len, num, type; + __be32 group, *h_addr; bool changed = false; - __be32 group; int err = 0; u16 nsrcs; @@ -2313,32 +2381,33 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, pg = br_multicast_find_port(mdst, port, src); if (!pg || (pg->flags & 
MDB_PG_FLAGS_PERMANENT)) goto unlock_continue; - /* reload grec */ + /* reload grec and host addr */ grec = (void *)(skb->data + len - sizeof(*grec) - (nsrcs * 4)); + h_addr = &ip_hdr(skb)->saddr; switch (type) { case IGMPV3_ALLOW_NEW_SOURCES: - changed = br_multicast_isinc_allow(pg, grec->grec_src, - nsrcs, sizeof(__be32)); + changed = br_multicast_isinc_allow(pg, h_addr, grec->grec_src, + nsrcs, sizeof(__be32), type); break; case IGMPV3_MODE_IS_INCLUDE: - changed = br_multicast_isinc_allow(pg, grec->grec_src, nsrcs, - sizeof(__be32)); + changed = br_multicast_isinc_allow(pg, h_addr, grec->grec_src, + nsrcs, sizeof(__be32), type); break; case IGMPV3_MODE_IS_EXCLUDE: - changed = br_multicast_isexc(pg, grec->grec_src, nsrcs, - sizeof(__be32)); + changed = br_multicast_isexc(pg, h_addr, grec->grec_src, + nsrcs, sizeof(__be32), type); break; case IGMPV3_CHANGE_TO_INCLUDE: - changed = br_multicast_toin(pg, grec->grec_src, nsrcs, - sizeof(__be32)); + changed = br_multicast_toin(pg, h_addr, grec->grec_src, + nsrcs, sizeof(__be32), type); break; case IGMPV3_CHANGE_TO_EXCLUDE: - changed = br_multicast_toex(pg, grec->grec_src, nsrcs, - sizeof(__be32)); + changed = br_multicast_toex(pg, h_addr, grec->grec_src, + nsrcs, sizeof(__be32), type); break; case IGMPV3_BLOCK_OLD_SOURCES: - changed = br_multicast_block(pg, grec->grec_src, nsrcs, - sizeof(__be32)); + changed = br_multicast_block(pg, h_addr, grec->grec_src, + nsrcs, sizeof(__be32), type); break; } if (changed) @@ -2362,6 +2431,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, unsigned int nsrcs_offset; const unsigned char *src; struct icmp6hdr *icmp6h; + struct in6_addr *h_addr; struct mld2_grec *grec; unsigned int grec_len; bool changed = false; @@ -2440,31 +2510,43 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, pg = br_multicast_find_port(mdst, port, src); if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT)) goto unlock_continue; + h_addr = &ipv6_hdr(skb)->saddr; switch 
(grec->grec_type) { case MLD2_ALLOW_NEW_SOURCES: - changed = br_multicast_isinc_allow(pg, grec->grec_src, - nsrcs, - sizeof(struct in6_addr)); + changed = br_multicast_isinc_allow(pg, h_addr, + grec->grec_src, nsrcs, + sizeof(struct in6_addr), + grec->grec_type); break; case MLD2_MODE_IS_INCLUDE: - changed = br_multicast_isinc_allow(pg, grec->grec_src, nsrcs, - sizeof(struct in6_addr)); + changed = br_multicast_isinc_allow(pg, h_addr, + grec->grec_src, nsrcs, + sizeof(struct in6_addr), + grec->grec_type); break; case MLD2_MODE_IS_EXCLUDE: - changed = br_multicast_isexc(pg, grec->grec_src, nsrcs, - sizeof(struct in6_addr)); + changed = br_multicast_isexc(pg, h_addr, + grec->grec_src, nsrcs, + sizeof(struct in6_addr), + grec->grec_type); break; case MLD2_CHANGE_TO_INCLUDE: - changed = br_multicast_toin(pg, grec->grec_src, nsrcs, - sizeof(struct in6_addr)); + changed = br_multicast_toin(pg, h_addr, + grec->grec_src, nsrcs, + sizeof(struct in6_addr), + grec->grec_type); break; case MLD2_CHANGE_TO_EXCLUDE: - changed = br_multicast_toex(pg, grec->grec_src, nsrcs, - sizeof(struct in6_addr)); + changed = br_multicast_toex(pg, h_addr, + grec->grec_src, nsrcs, + sizeof(struct in6_addr), + grec->grec_type); break; case MLD2_BLOCK_OLD_SOURCES: - changed = br_multicast_block(pg, grec->grec_src, nsrcs, - sizeof(struct in6_addr)); + changed = br_multicast_block(pg, h_addr, + grec->grec_src, nsrcs, + sizeof(struct in6_addr), + grec->grec_type); break; } if (changed) @@ -2563,7 +2645,7 @@ static void br_port_mc_router_state_change(struct net_bridge_port *p, .u.mrouter = is_mc_router, }; - switchdev_port_attr_set(p->dev, &attr); + switchdev_port_attr_set(p->dev, &attr, NULL); } /* @@ -3286,7 +3368,7 @@ static inline void br_ip6_multicast_join_snoopers(struct net_bridge *br) } #endif -static void br_multicast_join_snoopers(struct net_bridge *br) +void br_multicast_join_snoopers(struct net_bridge *br) { br_ip4_multicast_join_snoopers(br); br_ip6_multicast_join_snoopers(br); @@ -3317,7 
+3399,7 @@ static inline void br_ip6_multicast_leave_snoopers(struct net_bridge *br) } #endif -static void br_multicast_leave_snoopers(struct net_bridge *br) +void br_multicast_leave_snoopers(struct net_bridge *br) { br_ip4_multicast_leave_snoopers(br); br_ip6_multicast_leave_snoopers(br); @@ -3336,9 +3418,6 @@ static void __br_multicast_open(struct net_bridge *br, void br_multicast_open(struct net_bridge *br) { - if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) - br_multicast_join_snoopers(br); - __br_multicast_open(br, &br->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) __br_multicast_open(br, &br->ip6_own_query); @@ -3354,9 +3433,6 @@ void br_multicast_stop(struct net_bridge *br) del_timer_sync(&br->ip6_other_query.timer); del_timer_sync(&br->ip6_own_query.timer); #endif - - if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) - br_multicast_leave_snoopers(br); } void br_multicast_dev_del(struct net_bridge *br) @@ -3487,6 +3563,7 @@ static void br_multicast_start_querier(struct net_bridge *br, int br_multicast_toggle(struct net_bridge *br, unsigned long val) { struct net_bridge_port *port; + bool change_snoopers = false; spin_lock_bh(&br->multicast_lock); if (!!br_opt_get(br, BROPT_MULTICAST_ENABLED) == !!val) @@ -3495,7 +3572,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val) br_mc_disabled_update(br->dev, val); br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) { - br_multicast_leave_snoopers(br); + change_snoopers = true; goto unlock; } @@ -3506,9 +3583,30 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val) list_for_each_entry(port, &br->port_list, list) __br_multicast_enable_port(port); + change_snoopers = true; + unlock: spin_unlock_bh(&br->multicast_lock); + /* br_multicast_join_snoopers has the potential to cause + * an MLD Report/Leave to be delivered to br_multicast_rcv, + * which would in turn call br_multicast_add_group, which would + * attempt to acquire multicast_lock. 
This function should be + * called after the lock has been released to avoid deadlocks on + * multicast_lock. + * + * br_multicast_leave_snoopers does not have the problem since + * br_multicast_rcv first checks BROPT_MULTICAST_ENABLED, and + * returns without calling br_multicast_ipv4/6_rcv if it's not + * enabled. Moved both functions out just for symmetry. + */ + if (change_snoopers) { + if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) + br_multicast_join_snoopers(br); + else + br_multicast_leave_snoopers(br); + } + return 0; } @@ -3690,7 +3788,7 @@ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto) memset(&eth, 0, sizeof(eth)); eth.h_proto = htons(proto); - ret = br_multicast_querier_exists(br, &eth); + ret = br_multicast_querier_exists(br, &eth, NULL); unlock: rcu_read_unlock(); diff --git a/net/bridge/br_multicast_eht.c b/net/bridge/br_multicast_eht.c new file mode 100644 index 000000000000..fea38b9a7268 --- /dev/null +++ b/net/bridge/br_multicast_eht.c @@ -0,0 +1,878 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2020, Nikolay Aleksandrov <nikolay@nvidia.com> +#include <linux/err.h> +#include <linux/export.h> +#include <linux/if_ether.h> +#include <linux/igmp.h> +#include <linux/in.h> +#include <linux/jhash.h> +#include <linux/kernel.h> +#include <linux/log2.h> +#include <linux/netdevice.h> +#include <linux/netfilter_bridge.h> +#include <linux/random.h> +#include <linux/rculist.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/timer.h> +#include <linux/inetdevice.h> +#include <linux/mroute.h> +#include <net/ip.h> +#include <net/switchdev.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <linux/icmpv6.h> +#include <net/ipv6.h> +#include <net/mld.h> +#include <net/ip6_checksum.h> +#include <net/addrconf.h> +#endif + +#include "br_private.h" +#include "br_private_mcast_eht.h" + +static bool br_multicast_del_eht_set_entry(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *src_addr, + union
net_bridge_eht_addr *h_addr); +static void br_multicast_create_eht_set_entry(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *src_addr, + union net_bridge_eht_addr *h_addr, + int filter_mode, + bool allow_zero_src); + +static struct net_bridge_group_eht_host * +br_multicast_eht_host_lookup(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr) +{ + struct rb_node *node = pg->eht_host_tree.rb_node; + + while (node) { + struct net_bridge_group_eht_host *this; + int result; + + this = rb_entry(node, struct net_bridge_group_eht_host, + rb_node); + result = memcmp(h_addr, &this->h_addr, sizeof(*h_addr)); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return this; + } + + return NULL; +} + +static int br_multicast_eht_host_filter_mode(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr) +{ + struct net_bridge_group_eht_host *eht_host; + + eht_host = br_multicast_eht_host_lookup(pg, h_addr); + if (!eht_host) + return MCAST_INCLUDE; + + return eht_host->filter_mode; +} + +static struct net_bridge_group_eht_set_entry * +br_multicast_eht_set_entry_lookup(struct net_bridge_group_eht_set *eht_set, + union net_bridge_eht_addr *h_addr) +{ + struct rb_node *node = eht_set->entry_tree.rb_node; + + while (node) { + struct net_bridge_group_eht_set_entry *this; + int result; + + this = rb_entry(node, struct net_bridge_group_eht_set_entry, + rb_node); + result = memcmp(h_addr, &this->h_addr, sizeof(*h_addr)); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return this; + } + + return NULL; +} + +static struct net_bridge_group_eht_set * +br_multicast_eht_set_lookup(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *src_addr) +{ + struct rb_node *node = pg->eht_set_tree.rb_node; + + while (node) { + struct net_bridge_group_eht_set *this; + int result; + + this = rb_entry(node, struct net_bridge_group_eht_set, + rb_node); + 
result = memcmp(src_addr, &this->src_addr, sizeof(*src_addr)); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return this; + } + + return NULL; +} + +static void __eht_destroy_host(struct net_bridge_group_eht_host *eht_host) +{ + WARN_ON(!hlist_empty(&eht_host->set_entries)); + + br_multicast_eht_hosts_dec(eht_host->pg); + + rb_erase(&eht_host->rb_node, &eht_host->pg->eht_host_tree); + RB_CLEAR_NODE(&eht_host->rb_node); + kfree(eht_host); +} + +static void br_multicast_destroy_eht_set_entry(struct net_bridge_mcast_gc *gc) +{ + struct net_bridge_group_eht_set_entry *set_h; + + set_h = container_of(gc, struct net_bridge_group_eht_set_entry, mcast_gc); + WARN_ON(!RB_EMPTY_NODE(&set_h->rb_node)); + + del_timer_sync(&set_h->timer); + kfree(set_h); +} + +static void br_multicast_destroy_eht_set(struct net_bridge_mcast_gc *gc) +{ + struct net_bridge_group_eht_set *eht_set; + + eht_set = container_of(gc, struct net_bridge_group_eht_set, mcast_gc); + WARN_ON(!RB_EMPTY_NODE(&eht_set->rb_node)); + WARN_ON(!RB_EMPTY_ROOT(&eht_set->entry_tree)); + + del_timer_sync(&eht_set->timer); + kfree(eht_set); +} + +static void __eht_del_set_entry(struct net_bridge_group_eht_set_entry *set_h) +{ + struct net_bridge_group_eht_host *eht_host = set_h->h_parent; + union net_bridge_eht_addr zero_addr; + + rb_erase(&set_h->rb_node, &set_h->eht_set->entry_tree); + RB_CLEAR_NODE(&set_h->rb_node); + hlist_del_init(&set_h->host_list); + memset(&zero_addr, 0, sizeof(zero_addr)); + if (memcmp(&set_h->h_addr, &zero_addr, sizeof(zero_addr))) + eht_host->num_entries--; + hlist_add_head(&set_h->mcast_gc.gc_node, &set_h->br->mcast_gc_list); + queue_work(system_long_wq, &set_h->br->mcast_gc_work); + + if (hlist_empty(&eht_host->set_entries)) + __eht_destroy_host(eht_host); +} + +static void br_multicast_del_eht_set(struct net_bridge_group_eht_set *eht_set) +{ + struct net_bridge_group_eht_set_entry *set_h; + struct rb_node *node; + + while ((node = 
rb_first(&eht_set->entry_tree))) { + set_h = rb_entry(node, struct net_bridge_group_eht_set_entry, + rb_node); + __eht_del_set_entry(set_h); + } + + rb_erase(&eht_set->rb_node, &eht_set->pg->eht_set_tree); + RB_CLEAR_NODE(&eht_set->rb_node); + hlist_add_head(&eht_set->mcast_gc.gc_node, &eht_set->br->mcast_gc_list); + queue_work(system_long_wq, &eht_set->br->mcast_gc_work); +} + +void br_multicast_eht_clean_sets(struct net_bridge_port_group *pg) +{ + struct net_bridge_group_eht_set *eht_set; + struct rb_node *node; + + while ((node = rb_first(&pg->eht_set_tree))) { + eht_set = rb_entry(node, struct net_bridge_group_eht_set, + rb_node); + br_multicast_del_eht_set(eht_set); + } +} + +static void br_multicast_eht_set_entry_expired(struct timer_list *t) +{ + struct net_bridge_group_eht_set_entry *set_h = from_timer(set_h, t, timer); + struct net_bridge *br = set_h->br; + + spin_lock(&br->multicast_lock); + if (RB_EMPTY_NODE(&set_h->rb_node) || timer_pending(&set_h->timer)) + goto out; + + br_multicast_del_eht_set_entry(set_h->eht_set->pg, + &set_h->eht_set->src_addr, + &set_h->h_addr); +out: + spin_unlock(&br->multicast_lock); +} + +static void br_multicast_eht_set_expired(struct timer_list *t) +{ + struct net_bridge_group_eht_set *eht_set = from_timer(eht_set, t, + timer); + struct net_bridge *br = eht_set->br; + + spin_lock(&br->multicast_lock); + if (RB_EMPTY_NODE(&eht_set->rb_node) || timer_pending(&eht_set->timer)) + goto out; + + br_multicast_del_eht_set(eht_set); +out: + spin_unlock(&br->multicast_lock); +} + +static struct net_bridge_group_eht_host * +__eht_lookup_create_host(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr, + unsigned char filter_mode) +{ + struct rb_node **link = &pg->eht_host_tree.rb_node, *parent = NULL; + struct net_bridge_group_eht_host *eht_host; + + while (*link) { + struct net_bridge_group_eht_host *this; + int result; + + this = rb_entry(*link, struct net_bridge_group_eht_host, + rb_node); + result = memcmp(h_addr, 
&this->h_addr, sizeof(*h_addr)); + parent = *link; + if (result < 0) + link = &((*link)->rb_left); + else if (result > 0) + link = &((*link)->rb_right); + else + return this; + } + + if (br_multicast_eht_hosts_over_limit(pg)) + return NULL; + + eht_host = kzalloc(sizeof(*eht_host), GFP_ATOMIC); + if (!eht_host) + return NULL; + + memcpy(&eht_host->h_addr, h_addr, sizeof(*h_addr)); + INIT_HLIST_HEAD(&eht_host->set_entries); + eht_host->pg = pg; + eht_host->filter_mode = filter_mode; + + rb_link_node(&eht_host->rb_node, parent, link); + rb_insert_color(&eht_host->rb_node, &pg->eht_host_tree); + + br_multicast_eht_hosts_inc(pg); + + return eht_host; +} + +static struct net_bridge_group_eht_set_entry * +__eht_lookup_create_set_entry(struct net_bridge *br, + struct net_bridge_group_eht_set *eht_set, + struct net_bridge_group_eht_host *eht_host, + bool allow_zero_src) +{ + struct rb_node **link = &eht_set->entry_tree.rb_node, *parent = NULL; + struct net_bridge_group_eht_set_entry *set_h; + + while (*link) { + struct net_bridge_group_eht_set_entry *this; + int result; + + this = rb_entry(*link, struct net_bridge_group_eht_set_entry, + rb_node); + result = memcmp(&eht_host->h_addr, &this->h_addr, + sizeof(union net_bridge_eht_addr)); + parent = *link; + if (result < 0) + link = &((*link)->rb_left); + else if (result > 0) + link = &((*link)->rb_right); + else + return this; + } + + /* always allow auto-created zero entry */ + if (!allow_zero_src && eht_host->num_entries >= PG_SRC_ENT_LIMIT) + return NULL; + + set_h = kzalloc(sizeof(*set_h), GFP_ATOMIC); + if (!set_h) + return NULL; + + memcpy(&set_h->h_addr, &eht_host->h_addr, + sizeof(union net_bridge_eht_addr)); + set_h->mcast_gc.destroy = br_multicast_destroy_eht_set_entry; + set_h->eht_set = eht_set; + set_h->h_parent = eht_host; + set_h->br = br; + timer_setup(&set_h->timer, br_multicast_eht_set_entry_expired, 0); + + hlist_add_head(&set_h->host_list, &eht_host->set_entries); + rb_link_node(&set_h->rb_node, parent, 
link); + rb_insert_color(&set_h->rb_node, &eht_set->entry_tree); + /* we must not count the auto-created zero entry otherwise we won't be + * able to track the full list of PG_SRC_ENT_LIMIT entries + */ + if (!allow_zero_src) + eht_host->num_entries++; + + return set_h; +} + +static struct net_bridge_group_eht_set * +__eht_lookup_create_set(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *src_addr) +{ + struct rb_node **link = &pg->eht_set_tree.rb_node, *parent = NULL; + struct net_bridge_group_eht_set *eht_set; + + while (*link) { + struct net_bridge_group_eht_set *this; + int result; + + this = rb_entry(*link, struct net_bridge_group_eht_set, + rb_node); + result = memcmp(src_addr, &this->src_addr, sizeof(*src_addr)); + parent = *link; + if (result < 0) + link = &((*link)->rb_left); + else if (result > 0) + link = &((*link)->rb_right); + else + return this; + } + + eht_set = kzalloc(sizeof(*eht_set), GFP_ATOMIC); + if (!eht_set) + return NULL; + + memcpy(&eht_set->src_addr, src_addr, sizeof(*src_addr)); + eht_set->mcast_gc.destroy = br_multicast_destroy_eht_set; + eht_set->pg = pg; + eht_set->br = pg->key.port->br; + eht_set->entry_tree = RB_ROOT; + timer_setup(&eht_set->timer, br_multicast_eht_set_expired, 0); + + rb_link_node(&eht_set->rb_node, parent, link); + rb_insert_color(&eht_set->rb_node, &pg->eht_set_tree); + + return eht_set; +} + +static void br_multicast_ip_src_to_eht_addr(const struct br_ip *src, + union net_bridge_eht_addr *dest) +{ + switch (src->proto) { + case htons(ETH_P_IP): + dest->ip4 = src->src.ip4; + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + memcpy(&dest->ip6, &src->src.ip6, sizeof(struct in6_addr)); + break; +#endif + } +} + +static void br_eht_convert_host_filter_mode(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr, + int filter_mode) +{ + struct net_bridge_group_eht_host *eht_host; + union net_bridge_eht_addr zero_addr; + + eht_host = br_multicast_eht_host_lookup(pg, h_addr); + 
if (eht_host) + eht_host->filter_mode = filter_mode; + + memset(&zero_addr, 0, sizeof(zero_addr)); + switch (filter_mode) { + case MCAST_INCLUDE: + br_multicast_del_eht_set_entry(pg, &zero_addr, h_addr); + break; + case MCAST_EXCLUDE: + br_multicast_create_eht_set_entry(pg, &zero_addr, h_addr, + MCAST_EXCLUDE, + true); + break; + } +} + +static void br_multicast_create_eht_set_entry(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *src_addr, + union net_bridge_eht_addr *h_addr, + int filter_mode, + bool allow_zero_src) +{ + struct net_bridge_group_eht_set_entry *set_h; + struct net_bridge_group_eht_host *eht_host; + struct net_bridge *br = pg->key.port->br; + struct net_bridge_group_eht_set *eht_set; + union net_bridge_eht_addr zero_addr; + + memset(&zero_addr, 0, sizeof(zero_addr)); + if (!allow_zero_src && !memcmp(src_addr, &zero_addr, sizeof(zero_addr))) + return; + + eht_set = __eht_lookup_create_set(pg, src_addr); + if (!eht_set) + return; + + eht_host = __eht_lookup_create_host(pg, h_addr, filter_mode); + if (!eht_host) + goto fail_host; + + set_h = __eht_lookup_create_set_entry(br, eht_set, eht_host, + allow_zero_src); + if (!set_h) + goto fail_set_entry; + + mod_timer(&set_h->timer, jiffies + br_multicast_gmi(br)); + mod_timer(&eht_set->timer, jiffies + br_multicast_gmi(br)); + + return; + +fail_set_entry: + if (hlist_empty(&eht_host->set_entries)) + __eht_destroy_host(eht_host); +fail_host: + if (RB_EMPTY_ROOT(&eht_set->entry_tree)) + br_multicast_del_eht_set(eht_set); +} + +static bool br_multicast_del_eht_set_entry(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *src_addr, + union net_bridge_eht_addr *h_addr) +{ + struct net_bridge_group_eht_set_entry *set_h; + struct net_bridge_group_eht_set *eht_set; + bool set_deleted = false; + + eht_set = br_multicast_eht_set_lookup(pg, src_addr); + if (!eht_set) + goto out; + + set_h = br_multicast_eht_set_entry_lookup(eht_set, h_addr); + if (!set_h) + goto out; + + 
__eht_del_set_entry(set_h); + + if (RB_EMPTY_ROOT(&eht_set->entry_tree)) { + br_multicast_del_eht_set(eht_set); + set_deleted = true; + } + +out: + return set_deleted; +} + +static void br_multicast_del_eht_host(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr) +{ + struct net_bridge_group_eht_set_entry *set_h; + struct net_bridge_group_eht_host *eht_host; + struct hlist_node *tmp; + + eht_host = br_multicast_eht_host_lookup(pg, h_addr); + if (!eht_host) + return; + + hlist_for_each_entry_safe(set_h, tmp, &eht_host->set_entries, host_list) + br_multicast_del_eht_set_entry(set_h->eht_set->pg, + &set_h->eht_set->src_addr, + &set_h->h_addr); +} + +static void __eht_allow_incl(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr, + void *srcs, + u32 nsrcs, + size_t addr_size) +{ + union net_bridge_eht_addr eht_src_addr; + u32 src_idx; + + memset(&eht_src_addr, 0, sizeof(eht_src_addr)); + for (src_idx = 0; src_idx < nsrcs; src_idx++) { + memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size); + br_multicast_create_eht_set_entry(pg, &eht_src_addr, h_addr, + MCAST_INCLUDE, + false); + } +} + +static bool __eht_allow_excl(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr, + void *srcs, + u32 nsrcs, + size_t addr_size) +{ + bool changed = false, host_excl = false; + union net_bridge_eht_addr eht_src_addr; + struct net_bridge_group_src *src_ent; + struct br_ip src_ip; + u32 src_idx; + + host_excl = !!(br_multicast_eht_host_filter_mode(pg, h_addr) == MCAST_EXCLUDE); + memset(&eht_src_addr, 0, sizeof(eht_src_addr)); + for (src_idx = 0; src_idx < nsrcs; src_idx++) { + memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size); + if (!host_excl) { + br_multicast_create_eht_set_entry(pg, &eht_src_addr, h_addr, + MCAST_INCLUDE, + false); + } else { + if (!br_multicast_del_eht_set_entry(pg, &eht_src_addr, + h_addr)) + continue; + memcpy(&src_ip, srcs + (src_idx * addr_size), addr_size); + src_ent = 
br_multicast_find_group_src(pg, &src_ip); + if (!src_ent) + continue; + br_multicast_del_group_src(src_ent, true); + changed = true; + } + } + + return changed; +} + +static bool br_multicast_eht_allow(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr, + void *srcs, + u32 nsrcs, + size_t addr_size) +{ + bool changed = false; + + switch (br_multicast_eht_host_filter_mode(pg, h_addr)) { + case MCAST_INCLUDE: + __eht_allow_incl(pg, h_addr, srcs, nsrcs, addr_size); + break; + case MCAST_EXCLUDE: + changed = __eht_allow_excl(pg, h_addr, srcs, nsrcs, addr_size); + break; + } + + return changed; +} + +static bool __eht_block_incl(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr, + void *srcs, + u32 nsrcs, + size_t addr_size) +{ + union net_bridge_eht_addr eht_src_addr; + struct net_bridge_group_src *src_ent; + bool changed = false; + struct br_ip src_ip; + u32 src_idx; + + memset(&eht_src_addr, 0, sizeof(eht_src_addr)); + memset(&src_ip, 0, sizeof(src_ip)); + src_ip.proto = pg->key.addr.proto; + for (src_idx = 0; src_idx < nsrcs; src_idx++) { + memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size); + if (!br_multicast_del_eht_set_entry(pg, &eht_src_addr, h_addr)) + continue; + memcpy(&src_ip, srcs + (src_idx * addr_size), addr_size); + src_ent = br_multicast_find_group_src(pg, &src_ip); + if (!src_ent) + continue; + br_multicast_del_group_src(src_ent, true); + changed = true; + } + + return changed; +} + +static bool __eht_block_excl(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr, + void *srcs, + u32 nsrcs, + size_t addr_size) +{ + bool changed = false, host_excl = false; + union net_bridge_eht_addr eht_src_addr; + struct net_bridge_group_src *src_ent; + struct br_ip src_ip; + u32 src_idx; + + host_excl = !!(br_multicast_eht_host_filter_mode(pg, h_addr) == MCAST_EXCLUDE); + memset(&eht_src_addr, 0, sizeof(eht_src_addr)); + memset(&src_ip, 0, sizeof(src_ip)); + src_ip.proto = pg->key.addr.proto; + for 
(src_idx = 0; src_idx < nsrcs; src_idx++) { + memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size); + if (host_excl) { + br_multicast_create_eht_set_entry(pg, &eht_src_addr, h_addr, + MCAST_EXCLUDE, + false); + } else { + if (!br_multicast_del_eht_set_entry(pg, &eht_src_addr, + h_addr)) + continue; + memcpy(&src_ip, srcs + (src_idx * addr_size), addr_size); + src_ent = br_multicast_find_group_src(pg, &src_ip); + if (!src_ent) + continue; + br_multicast_del_group_src(src_ent, true); + changed = true; + } + } + + return changed; +} + +static bool br_multicast_eht_block(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr, + void *srcs, + u32 nsrcs, + size_t addr_size) +{ + bool changed = false; + + switch (br_multicast_eht_host_filter_mode(pg, h_addr)) { + case MCAST_INCLUDE: + changed = __eht_block_incl(pg, h_addr, srcs, nsrcs, addr_size); + break; + case MCAST_EXCLUDE: + changed = __eht_block_excl(pg, h_addr, srcs, nsrcs, addr_size); + break; + } + + return changed; +} + +/* flush_entries is true when changing mode */ +static bool __eht_inc_exc(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr, + void *srcs, + u32 nsrcs, + size_t addr_size, + unsigned char filter_mode, + bool to_report) +{ + bool changed = false, flush_entries = to_report; + union net_bridge_eht_addr eht_src_addr; + u32 src_idx; + + if (br_multicast_eht_host_filter_mode(pg, h_addr) != filter_mode) + flush_entries = true; + + memset(&eht_src_addr, 0, sizeof(eht_src_addr)); + /* if we're changing mode del host and its entries */ + if (flush_entries) + br_multicast_del_eht_host(pg, h_addr); + for (src_idx = 0; src_idx < nsrcs; src_idx++) { + memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size); + br_multicast_create_eht_set_entry(pg, &eht_src_addr, h_addr, + filter_mode, false); + } + /* we can be missing sets only if we've deleted some entries */ + if (flush_entries) { + struct net_bridge *br = pg->key.port->br; + struct 
net_bridge_group_eht_set *eht_set; + struct net_bridge_group_src *src_ent; + struct hlist_node *tmp; + + hlist_for_each_entry_safe(src_ent, tmp, &pg->src_list, node) { + br_multicast_ip_src_to_eht_addr(&src_ent->addr, + &eht_src_addr); + if (!br_multicast_eht_set_lookup(pg, &eht_src_addr)) { + br_multicast_del_group_src(src_ent, true); + changed = true; + continue; + } + /* this is an optimization for TO_INCLUDE where we lower + * the set's timeout to LMQT to catch timeout hosts: + * - host A (timing out): set entries X, Y + * - host B: set entry Z (new from current TO_INCLUDE) + * sends BLOCK Z after LMQT but host A's EHT + * entries still exist (unless lowered to LMQT + * so they can timeout with the S,Gs) + * => we wait another LMQT, when we can just delete the + * group immediately + */ + if (!(src_ent->flags & BR_SGRP_F_SEND) || + filter_mode != MCAST_INCLUDE || + !to_report) + continue; + eht_set = br_multicast_eht_set_lookup(pg, + &eht_src_addr); + if (!eht_set) + continue; + mod_timer(&eht_set->timer, jiffies + br_multicast_lmqt(br)); + } + } + + return changed; +} + +static bool br_multicast_eht_inc(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr, + void *srcs, + u32 nsrcs, + size_t addr_size, + bool to_report) +{ + bool changed; + + changed = __eht_inc_exc(pg, h_addr, srcs, nsrcs, addr_size, + MCAST_INCLUDE, to_report); + br_eht_convert_host_filter_mode(pg, h_addr, MCAST_INCLUDE); + + return changed; +} + +static bool br_multicast_eht_exc(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr, + void *srcs, + u32 nsrcs, + size_t addr_size, + bool to_report) +{ + bool changed; + + changed = __eht_inc_exc(pg, h_addr, srcs, nsrcs, addr_size, + MCAST_EXCLUDE, to_report); + br_eht_convert_host_filter_mode(pg, h_addr, MCAST_EXCLUDE); + + return changed; +} + +static bool __eht_ip4_handle(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr, + void *srcs, + u32 nsrcs, + int grec_type) +{ + bool changed = 
false, to_report = false; + + switch (grec_type) { + case IGMPV3_ALLOW_NEW_SOURCES: + br_multicast_eht_allow(pg, h_addr, srcs, nsrcs, sizeof(__be32)); + break; + case IGMPV3_BLOCK_OLD_SOURCES: + changed = br_multicast_eht_block(pg, h_addr, srcs, nsrcs, + sizeof(__be32)); + break; + case IGMPV3_CHANGE_TO_INCLUDE: + to_report = true; + fallthrough; + case IGMPV3_MODE_IS_INCLUDE: + changed = br_multicast_eht_inc(pg, h_addr, srcs, nsrcs, + sizeof(__be32), to_report); + break; + case IGMPV3_CHANGE_TO_EXCLUDE: + to_report = true; + fallthrough; + case IGMPV3_MODE_IS_EXCLUDE: + changed = br_multicast_eht_exc(pg, h_addr, srcs, nsrcs, + sizeof(__be32), to_report); + break; + } + + return changed; +} + +#if IS_ENABLED(CONFIG_IPV6) +static bool __eht_ip6_handle(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr, + void *srcs, + u32 nsrcs, + int grec_type) +{ + bool changed = false, to_report = false; + + switch (grec_type) { + case MLD2_ALLOW_NEW_SOURCES: + br_multicast_eht_allow(pg, h_addr, srcs, nsrcs, + sizeof(struct in6_addr)); + break; + case MLD2_BLOCK_OLD_SOURCES: + changed = br_multicast_eht_block(pg, h_addr, srcs, nsrcs, + sizeof(struct in6_addr)); + break; + case MLD2_CHANGE_TO_INCLUDE: + to_report = true; + fallthrough; + case MLD2_MODE_IS_INCLUDE: + changed = br_multicast_eht_inc(pg, h_addr, srcs, nsrcs, + sizeof(struct in6_addr), + to_report); + break; + case MLD2_CHANGE_TO_EXCLUDE: + to_report = true; + fallthrough; + case MLD2_MODE_IS_EXCLUDE: + changed = br_multicast_eht_exc(pg, h_addr, srcs, nsrcs, + sizeof(struct in6_addr), + to_report); + break; + } + + return changed; +} +#endif + +/* true means an entry was deleted */ +bool br_multicast_eht_handle(struct net_bridge_port_group *pg, + void *h_addr, + void *srcs, + u32 nsrcs, + size_t addr_size, + int grec_type) +{ + bool eht_enabled = !!(pg->key.port->flags & BR_MULTICAST_FAST_LEAVE); + union net_bridge_eht_addr eht_host_addr; + bool changed = false; + + if (!eht_enabled) + goto out; + + 
memset(&eht_host_addr, 0, sizeof(eht_host_addr)); + memcpy(&eht_host_addr, h_addr, addr_size); + if (addr_size == sizeof(__be32)) + changed = __eht_ip4_handle(pg, &eht_host_addr, srcs, nsrcs, + grec_type); +#if IS_ENABLED(CONFIG_IPV6) + else + changed = __eht_ip6_handle(pg, &eht_host_addr, srcs, nsrcs, + grec_type); +#endif + +out: + return changed; +} + +int br_multicast_eht_set_hosts_limit(struct net_bridge_port *p, + u32 eht_hosts_limit) +{ + struct net_bridge *br = p->br; + + if (!eht_hosts_limit) + return -EINVAL; + + spin_lock_bh(&br->multicast_lock); + p->multicast_eht_hosts_limit = eht_hosts_limit; + spin_unlock_bh(&br->multicast_lock); + + return 0; +} diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 04c3f9a82650..8edfb98ae1d5 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -735,6 +735,11 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff mtu_reserved = nf_bridge_mtu_reduction(skb); mtu = skb->dev->mtu; + if (nf_bridge->pkt_otherhost) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->pkt_otherhost = false; + } + if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu) mtu = nf_bridge->frag_max_size; @@ -835,8 +840,6 @@ static unsigned int br_nf_post_routing(void *priv, else return NF_ACCEPT; - /* We assume any code from br_dev_queue_push_xmit onwards doesn't care - * about the value of skb->pkt_type. 
*/ if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; nf_bridge->pkt_otherhost = true; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 92d64abffa87..f2b1343f8332 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -16,7 +16,9 @@ #include "br_private.h" #include "br_private_stp.h" +#include "br_private_cfm.h" #include "br_private_tunnel.h" +#include "br_private_mcast_eht.h" static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg, u32 filter_mask) @@ -93,9 +95,11 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev, { struct net_bridge_vlan_group *vg = NULL; struct net_bridge_port *p = NULL; - struct net_bridge *br; - int num_vlan_infos; + struct net_bridge *br = NULL; + u32 num_cfm_peer_mep_infos; + u32 num_cfm_mep_infos; size_t vinfo_sz = 0; + int num_vlan_infos; rcu_read_lock(); if (netif_is_bridge_port(dev)) { @@ -114,6 +118,49 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev, /* Each VLAN is returned in bridge_vlan_info along with flags */ vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info)); + if (!(filter_mask & RTEXT_FILTER_CFM_STATUS)) + return vinfo_sz; + + if (!br) + return vinfo_sz; + + /* CFM status info must be added */ + br_cfm_mep_count(br, &num_cfm_mep_infos); + br_cfm_peer_mep_count(br, &num_cfm_peer_mep_infos); + + vinfo_sz += nla_total_size(0); /* IFLA_BRIDGE_CFM */ + /* For each status struct the MEP instance (u32) is added */ + /* MEP instance (u32) + br_cfm_mep_status */ + vinfo_sz += num_cfm_mep_infos * + /*IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE */ + (nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN */ + + nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN */ + + nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN */ + + nla_total_size(sizeof(u32))); + /* MEP instance (u32) + br_cfm_cc_peer_status */ + vinfo_sz += 
num_cfm_peer_mep_infos * + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE */ + (nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID */ + + nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT */ + + nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI */ + + nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE */ + + nla_total_size(sizeof(u8)) + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE */ + + nla_total_size(sizeof(u8)) + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN */ + + nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN */ + + nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN */ + + nla_total_size(sizeof(u32))); + return vinfo_sz; } @@ -153,6 +200,8 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */ + + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT */ + + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_CNT */ + 0; } @@ -237,7 +286,11 @@ static int br_port_fill_attrs(struct sk_buff *skb, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, IFLA_BRPORT_MULTICAST_ROUTER, - p->multicast_router)) + p->multicast_router) || + nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, + p->multicast_eht_hosts_limit) || + nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, + p->multicast_eht_hosts_cnt)) return -EMSGSIZE; #endif @@ -377,7 +430,8 @@ nla_put_failure: static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *port, u32 pid, u32 seq, int event, unsigned int flags, - u32 filter_mask, const struct net_device *dev) + u32 filter_mask, const struct net_device *dev, + bool getlink) { u8 operstate = netif_running(dev) ? 
dev->operstate : IF_OPER_DOWN; struct nlattr *af = NULL; @@ -426,7 +480,9 @@ static int br_fill_ifinfo(struct sk_buff *skb, if (filter_mask & (RTEXT_FILTER_BRVLAN | RTEXT_FILTER_BRVLAN_COMPRESSED | - RTEXT_FILTER_MRP)) { + RTEXT_FILTER_MRP | + RTEXT_FILTER_CFM_CONFIG | + RTEXT_FILTER_CFM_STATUS)) { af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af) goto nla_put_failure; @@ -475,6 +531,36 @@ static int br_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; } + if (filter_mask & (RTEXT_FILTER_CFM_CONFIG | RTEXT_FILTER_CFM_STATUS)) { + struct nlattr *cfm_nest = NULL; + int err; + + if (!br_cfm_created(br) || port) + goto done; + + cfm_nest = nla_nest_start(skb, IFLA_BRIDGE_CFM); + if (!cfm_nest) + goto nla_put_failure; + + if (filter_mask & RTEXT_FILTER_CFM_CONFIG) { + rcu_read_lock(); + err = br_cfm_config_fill_info(skb, br); + rcu_read_unlock(); + if (err) + goto nla_put_failure; + } + + if (filter_mask & RTEXT_FILTER_CFM_STATUS) { + rcu_read_lock(); + err = br_cfm_status_fill_info(skb, br, getlink); + rcu_read_unlock(); + if (err) + goto nla_put_failure; + } + + nla_nest_end(skb, cfm_nest); + } + done: if (af) nla_nest_end(skb, af); @@ -486,11 +572,9 @@ nla_put_failure: return -EMSGSIZE; } -/* Notify listeners of a change in bridge or port information */ -void br_ifinfo_notify(int event, const struct net_bridge *br, - const struct net_bridge_port *port) +void br_info_notify(int event, const struct net_bridge *br, + const struct net_bridge_port *port, u32 filter) { - u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED; struct net_device *dev; struct sk_buff *skb; int err = -ENOBUFS; @@ -515,7 +599,7 @@ void br_ifinfo_notify(int event, const struct net_bridge *br, if (skb == NULL) goto errout; - err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, dev); + err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, dev, false); if (err < 0) { /* -EMSGSIZE implies BUG in br_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -528,6 +612,15 @@ errout: rtnl_set_sk_err(net, 
RTNLGRP_LINK, err); } +/* Notify listeners of a change in bridge or port information */ +void br_ifinfo_notify(int event, const struct net_bridge *br, + const struct net_bridge_port *port) +{ + u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED; + + return br_info_notify(event, br, port, filter); +} + /* * Dump information about all ports, in response to GETLINK */ @@ -538,11 +631,13 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN) && !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) && - !(filter_mask & RTEXT_FILTER_MRP)) + !(filter_mask & RTEXT_FILTER_MRP) && + !(filter_mask & RTEXT_FILTER_CFM_CONFIG) && + !(filter_mask & RTEXT_FILTER_CFM_STATUS)) return 0; return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags, - filter_mask, dev); + filter_mask, dev, true); } static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, @@ -700,6 +795,11 @@ static int br_afspec(struct net_bridge *br, if (err) return err; break; + case IFLA_BRIDGE_CFM: + err = br_cfm_parse(br, p, attr, cmd, extack); + if (err) + return err; + break; } } @@ -727,6 +827,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 }, + [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 }, }; /* Change the state of the port and notify spanning tree */ @@ -752,87 +853,59 @@ static int br_set_port_state(struct net_bridge_port *p, u8 state) } /* Set/clear or port flags based on attribute */ -static int br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[], - int attrtype, unsigned long mask) +static void br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[], + int attrtype, unsigned long mask) { - unsigned long flags; - int err; - if (!tb[attrtype]) - return 0; + return; if (nla_get_u8(tb[attrtype])) - flags = p->flags | mask; + p->flags |= mask; else - 
flags = p->flags & ~mask; - - err = br_switchdev_set_port_flag(p, flags, mask); - if (err) - return err; - - p->flags = flags; - return 0; + p->flags &= ~mask; } /* Process bridge protocol info on port */ -static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) +static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], + struct netlink_ext_ack *extack) { - unsigned long old_flags = p->flags; - bool br_vlan_tunnel_old = false; + unsigned long old_flags, changed_mask; + bool br_vlan_tunnel_old; int err; - err = br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE); - if (err) - return err; - - err = br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD); - if (err) - return err; - - err = br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE); - if (err) - return err; - - err = br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK); - if (err) - return err; - - err = br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); - if (err) - return err; - - err = br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); - if (err) - return err; - - err = br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD); - if (err) - return err; - - err = br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, BR_MULTICAST_TO_UNICAST); - if (err) - return err; - - err = br_set_port_flag(p, tb, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD); - if (err) - return err; - - err = br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); - if (err) - return err; - - err = br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); - if (err) - return err; - - br_vlan_tunnel_old = (p->flags & BR_VLAN_TUNNEL) ? true : false; - err = br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL); - if (err) + old_flags = p->flags; + br_vlan_tunnel_old = (old_flags & BR_VLAN_TUNNEL) ? 
true : false; + + br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE); + br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD); + br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE, + BR_MULTICAST_FAST_LEAVE); + br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK); + br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); + br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); + br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD); + br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, + BR_MULTICAST_TO_UNICAST); + br_set_port_flag(p, tb, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD); + br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); + br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); + br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL); + br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS); + br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED); + + changed_mask = old_flags ^ p->flags; + + err = br_switchdev_set_port_flag(p, p->flags, changed_mask, extack); + if (err) { + p->flags = old_flags; return err; + } if (br_vlan_tunnel_old && !(p->flags & BR_VLAN_TUNNEL)) nbp_vlan_tunnel_info_flush(p); + br_port_flags_change(p, changed_mask); + if (tb[IFLA_BRPORT_COST]) { err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); if (err) @@ -862,6 +935,15 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) if (err) return err; } + + if (tb[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT]) { + u32 hlimit; + + hlimit = nla_get_u32(tb[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT]); + err = br_multicast_eht_set_hosts_limit(p, hlimit); + if (err) + return err; + } #endif if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) { @@ -872,15 +954,6 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) p->group_fwd_mask = fwd_mask; } - err = br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, - BR_NEIGH_SUPPRESS); - if (err) - return err; - - err = br_set_port_flag(p, tb, 
IFLA_BRPORT_ISOLATED, BR_ISOLATED); - if (err) - return err; - if (tb[IFLA_BRPORT_BACKUP_PORT]) { struct net_device *backup_dev = NULL; u32 backup_ifindex; @@ -898,7 +971,6 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) return err; } - br_port_flags_change(p, old_flags ^ p->flags); return 0; } @@ -936,7 +1008,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, return err; spin_lock_bh(&p->br->lock); - err = br_setport(p, tb); + err = br_setport(p, tb, extack); spin_unlock_bh(&p->br->lock); } else { /* Binary compatibility with old RSTP */ @@ -1003,15 +1075,9 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[], return 0; #ifdef CONFIG_BRIDGE_VLAN_FILTERING - if (data[IFLA_BR_VLAN_PROTOCOL]) { - switch (nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL])) { - case htons(ETH_P_8021Q): - case htons(ETH_P_8021AD): - break; - default: - return -EPROTONOSUPPORT; - } - } + if (data[IFLA_BR_VLAN_PROTOCOL] && + !eth_type_vlan(nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]))) + return -EPROTONOSUPPORT; if (data[IFLA_BR_VLAN_DEFAULT_PVID]) { __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]); @@ -1037,7 +1103,7 @@ static int br_port_slave_changelink(struct net_device *brdev, return 0; spin_lock_bh(&br->lock); - ret = br_setport(br_port_get_rtnl(dev), data); + ret = br_setport(br_port_get_rtnl(dev), data, extack); spin_unlock_bh(&br->lock); return ret; @@ -1146,7 +1212,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_VLAN_FILTERING]) { u8 vlan_filter = nla_get_u8(data[IFLA_BR_VLAN_FILTERING]); - err = __br_vlan_filter_toggle(br, vlan_filter); + err = br_vlan_filter_toggle(br, vlan_filter, extack); if (err) return err; } @@ -1155,7 +1221,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_VLAN_PROTOCOL]) { __be16 vlan_proto = nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]); - err = __br_vlan_set_proto(br, vlan_proto); + err = 
__br_vlan_set_proto(br, vlan_proto, extack); if (err) return err; } @@ -1631,7 +1697,7 @@ static int br_fill_linkxstats(struct sk_buff *skb, pvid = br_get_pvid(vg); list_for_each_entry(v, &vg->vlan_list, vlist) { struct bridge_vlan_xstats vxi; - struct br_vlan_stats stats; + struct pcpu_sw_netstats stats; if (++vl_idx < *prividx) continue; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 345118e35c42..d7d167e10b70 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -89,14 +89,6 @@ struct bridge_mcast_stats { }; #endif -struct br_vlan_stats { - u64 rx_bytes; - u64 rx_packets; - u64 tx_bytes; - u64 tx_packets; - struct u64_stats_sync syncp; -}; - struct br_tunnel_info { __be64 tunnel_id; struct metadata_dst *tunnel_dst; @@ -137,7 +129,7 @@ struct net_bridge_vlan { u16 flags; u16 priv_flags; u8 state; - struct br_vlan_stats __percpu *stats; + struct pcpu_sw_netstats __percpu *stats; union { struct net_bridge *br; struct net_bridge_port *port; @@ -260,6 +252,8 @@ struct net_bridge_port_group { struct timer_list timer; struct timer_list rexmit_timer; struct hlist_node mglist; + struct rb_root eht_set_tree; + struct rb_root eht_host_tree; struct rhash_head rhnode; struct net_bridge_mcast_gc mcast_gc; @@ -316,6 +310,8 @@ struct net_bridge_port { #if IS_ENABLED(CONFIG_IPV6) struct bridge_mcast_own_query ip6_own_query; #endif /* IS_ENABLED(CONFIG_IPV6) */ + u32 multicast_eht_hosts_limit; + u32 multicast_eht_hosts_cnt; unsigned char multicast_router; struct bridge_mcast_stats __percpu *mcast_stats; struct timer_list multicast_router_timer; @@ -383,9 +379,8 @@ enum net_bridge_opts { struct net_bridge { spinlock_t lock; spinlock_t hash_lock; - struct list_head port_list; + struct hlist_head frame_type_list; struct net_device *dev; - struct pcpu_sw_netstats __percpu *stats; unsigned long options; /* These fields are accessed on each packet */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING @@ -395,6 +390,7 @@ struct net_bridge { #endif struct 
rhashtable fdb_hash_tbl; + struct list_head port_list; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) union { struct rtable fake_rtable; @@ -481,7 +477,10 @@ struct net_bridge { struct hlist_head fdb_list; #if IS_ENABLED(CONFIG_BRIDGE_MRP) - struct list_head mrp_list; + struct hlist_head mrp_list; +#endif +#if IS_ENABLED(CONFIG_BRIDGE_CFM) + struct hlist_head mep_list; #endif }; @@ -755,6 +754,16 @@ int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev); int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); rx_handler_func_t *br_get_rx_handler(const struct net_device *dev); +struct br_frame_type { + __be16 type; + int (*frame_handler)(struct net_bridge_port *port, + struct sk_buff *skb); + struct hlist_node list; +}; + +void br_add_frame(struct net_bridge *br, struct br_frame_type *ft); +void br_del_frame(struct net_bridge *br, struct br_frame_type *ft); + static inline bool br_rx_handler_check_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler) == br_get_rx_handler(dev); @@ -792,6 +801,8 @@ void br_multicast_del_port(struct net_bridge_port *port); void br_multicast_enable_port(struct net_bridge_port *port); void br_multicast_disable_port(struct net_bridge_port *port); void br_multicast_init(struct net_bridge *br); +void br_multicast_join_snoopers(struct net_bridge *br); +void br_multicast_leave_snoopers(struct net_bridge *br); void br_multicast_open(struct net_bridge *br); void br_multicast_stop(struct net_bridge *br); void br_multicast_dev_del(struct net_bridge *br); @@ -839,6 +850,15 @@ void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, u8 filter_mode); void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, struct net_bridge_port_group *sg); +struct net_bridge_group_src * +br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip); +void br_multicast_del_group_src(struct net_bridge_group_src *src, + bool fastleave); + +static 
inline bool br_group_is_l2(const struct br_ip *group) +{ + return group->proto == 0; +} #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) @@ -871,7 +891,8 @@ __br_multicast_querier_exists(struct net_bridge *br, } static inline bool br_multicast_querier_exists(struct net_bridge *br, - struct ethhdr *eth) + struct ethhdr *eth, + const struct net_bridge_mdb_entry *mdb) { switch (eth->h_proto) { case (htons(ETH_P_IP)): @@ -883,7 +904,7 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br, &br->ip6_other_query, true); #endif default: - return false; + return !!mdb && br_group_is_l2(&mdb->addr); } } @@ -969,6 +990,14 @@ static inline void br_multicast_init(struct net_bridge *br) { } +static inline void br_multicast_join_snoopers(struct net_bridge *br) +{ +} + +static inline void br_multicast_leave_snoopers(struct net_bridge *br) +{ +} + static inline void br_multicast_open(struct net_bridge *br) { } @@ -993,7 +1022,8 @@ static inline bool br_multicast_is_router(struct net_bridge *br) } static inline bool br_multicast_querier_exists(struct net_bridge *br, - struct ethhdr *eth) + struct ethhdr *eth, + const struct net_bridge_mdb_entry *mdb) { return false; } @@ -1055,14 +1085,17 @@ int br_vlan_delete(struct net_bridge *br, u16 vid); void br_vlan_flush(struct net_bridge *br); struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid); void br_recalculate_fwd_mask(struct net_bridge *br); -int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); -int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); -int __br_vlan_set_proto(struct net_bridge *br, __be16 proto); -int br_vlan_set_proto(struct net_bridge *br, unsigned long val); +int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack); +int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, + struct netlink_ext_ack *extack); +int br_vlan_set_proto(struct 
net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack); int br_vlan_set_stats(struct net_bridge *br, unsigned long val); int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); -int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); +int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack); int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, struct netlink_ext_ack *extack); int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, @@ -1072,7 +1105,7 @@ void nbp_vlan_flush(struct net_bridge_port *port); int nbp_vlan_init(struct net_bridge_port *port, struct netlink_ext_ack *extack); int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask); void br_vlan_get_stats(const struct net_bridge_vlan *v, - struct br_vlan_stats *stats); + struct pcpu_sw_netstats *stats); void br_vlan_port_event(struct net_bridge_port *p, unsigned long event); int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr); @@ -1231,8 +1264,9 @@ static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg) return 0; } -static inline int __br_vlan_filter_toggle(struct net_bridge *br, - unsigned long val) +static inline int br_vlan_filter_toggle(struct net_bridge *br, + unsigned long val, + struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } @@ -1268,7 +1302,7 @@ static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu( } static inline void br_vlan_get_stats(const struct net_bridge_vlan *v, - struct br_vlan_stats *stats) + struct pcpu_sw_netstats *stats) { } @@ -1417,7 +1451,6 @@ extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) #if IS_ENABLED(CONFIG_BRIDGE_MRP) int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack); -int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb); bool 
br_mrp_enabled(struct net_bridge *br); void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p); int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br); @@ -1429,11 +1462,6 @@ static inline int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, return -EOPNOTSUPP; } -static inline int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) -{ - return 0; -} - static inline bool br_mrp_enabled(struct net_bridge *br) { return false; @@ -1451,12 +1479,67 @@ static inline int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br) #endif +/* br_cfm.c */ +#if IS_ENABLED(CONFIG_BRIDGE_CFM) +int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, struct netlink_ext_ack *extack); +bool br_cfm_created(struct net_bridge *br); +void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *p); +int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br); +int br_cfm_status_fill_info(struct sk_buff *skb, + struct net_bridge *br, + bool getlink); +int br_cfm_mep_count(struct net_bridge *br, u32 *count); +int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count); +#else +static inline int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static inline bool br_cfm_created(struct net_bridge *br) +{ + return false; +} + +static inline void br_cfm_port_del(struct net_bridge *br, + struct net_bridge_port *p) +{ +} + +static inline int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br) +{ + return -EOPNOTSUPP; +} + +static inline int br_cfm_status_fill_info(struct sk_buff *skb, + struct net_bridge *br, + bool getlink) +{ + return -EOPNOTSUPP; +} + +static inline int br_cfm_mep_count(struct net_bridge *br, u32 *count) +{ + return -EOPNOTSUPP; +} + +static inline int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count) +{ + return -EOPNOTSUPP; +} 
+#endif + /* br_netlink.c */ extern struct rtnl_link_ops br_link_ops; int br_netlink_init(void); void br_netlink_fini(void); void br_ifinfo_notify(int event, const struct net_bridge *br, const struct net_bridge_port *port); +void br_info_notify(int event, const struct net_bridge *br, + const struct net_bridge_port *port, u32 filter); int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags, struct netlink_ext_ack *extack); int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); @@ -1496,7 +1579,8 @@ bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, const struct sk_buff *skb); int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long flags, - unsigned long mask); + unsigned long mask, + struct netlink_ext_ack *extack); void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type); int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, @@ -1526,7 +1610,8 @@ static inline bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, static inline int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long flags, - unsigned long mask) + unsigned long mask, + struct netlink_ext_ack *extack) { return 0; } diff --git a/net/bridge/br_private_cfm.h b/net/bridge/br_private_cfm.h new file mode 100644 index 000000000000..a43a5e7fa2c3 --- /dev/null +++ b/net/bridge/br_private_cfm.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _BR_PRIVATE_CFM_H_ +#define _BR_PRIVATE_CFM_H_ + +#include "br_private.h" +#include <uapi/linux/cfm_bridge.h> + +struct br_cfm_mep_create { + enum br_cfm_domain domain; /* Domain for this MEP */ + enum br_cfm_mep_direction direction; /* Up or Down MEP direction */ + u32 ifindex; /* Residence port */ +}; + +int br_cfm_mep_create(struct net_bridge *br, + const u32 instance, + struct br_cfm_mep_create *const create, + struct netlink_ext_ack *extack); + +int br_cfm_mep_delete(struct net_bridge *br, + const u32 
instance, + struct netlink_ext_ack *extack); + +struct br_cfm_mep_config { + u32 mdlevel; + u32 mepid; /* MEPID for this MEP */ + struct mac_addr unicast_mac; /* The MEP unicast MAC */ +}; + +int br_cfm_mep_config_set(struct net_bridge *br, + const u32 instance, + const struct br_cfm_mep_config *const config, + struct netlink_ext_ack *extack); + +struct br_cfm_maid { + u8 data[CFM_MAID_LENGTH]; +}; + +struct br_cfm_cc_config { + /* Expected received CCM PDU MAID. */ + struct br_cfm_maid exp_maid; + + /* Expected received CCM PDU interval. */ + /* Transmitting CCM PDU interval when CCM tx is enabled. */ + enum br_cfm_ccm_interval exp_interval; + + bool enable; /* Enable/disable CCM PDU handling */ +}; + +int br_cfm_cc_config_set(struct net_bridge *br, + const u32 instance, + const struct br_cfm_cc_config *const config, + struct netlink_ext_ack *extack); + +int br_cfm_cc_peer_mep_add(struct net_bridge *br, const u32 instance, + u32 peer_mep_id, + struct netlink_ext_ack *extack); +int br_cfm_cc_peer_mep_remove(struct net_bridge *br, const u32 instance, + u32 peer_mep_id, + struct netlink_ext_ack *extack); + +/* Transmitted CCM Remote Defect Indication status set. + * This RDI is inserted in transmitted CCM PDUs if CCM transmission is enabled. + * See br_cfm_cc_ccm_tx() with interval != BR_CFM_CCM_INTERVAL_NONE + */ +int br_cfm_cc_rdi_set(struct net_bridge *br, const u32 instance, + const bool rdi, struct netlink_ext_ack *extack); + +/* OAM PDU Tx information */ +struct br_cfm_cc_ccm_tx_info { + struct mac_addr dmac; + /* The CCM will be transmitted for this period in seconds. + * Call br_cfm_cc_ccm_tx before timeout to keep transmission alive. + * When period is zero any ongoing transmission will be stopped. 
+ */ + u32 period; + + bool seq_no_update; /* Update Tx CCM sequence number */ + bool if_tlv; /* Insert Interface Status TLV */ + u8 if_tlv_value; /* Interface Status TLV value */ + bool port_tlv; /* Insert Port Status TLV */ + u8 port_tlv_value; /* Port Status TLV value */ + /* Sender ID TLV ?? + * Organization-Specific TLV ?? + */ +}; + +int br_cfm_cc_ccm_tx(struct net_bridge *br, const u32 instance, + const struct br_cfm_cc_ccm_tx_info *const tx_info, + struct netlink_ext_ack *extack); + +struct br_cfm_mep_status { + /* Indications that an OAM PDU has been seen. */ + bool opcode_unexp_seen; /* RX of OAM PDU with unexpected opcode */ + bool version_unexp_seen; /* RX of OAM PDU with unexpected version */ + bool rx_level_low_seen; /* Rx of OAM PDU with level low */ +}; + +struct br_cfm_cc_peer_status { + /* This CCM related status is based on the latest received CCM PDU. */ + u8 port_tlv_value; /* Port Status TLV value */ + u8 if_tlv_value; /* Interface Status TLV value */ + + /* CCM has not been received for 3.25 intervals */ + u8 ccm_defect:1; + + /* (RDI == 1) for last received CCM PDU */ + u8 rdi:1; + + /* Indications that a CCM PDU has been seen. 
*/ + u8 seen:1; /* CCM PDU received */ + u8 tlv_seen:1; /* CCM PDU with TLV received */ + /* CCM PDU with unexpected sequence number received */ + u8 seq_unexp_seen:1; +}; + +struct br_cfm_mep { + /* list header of MEP instances */ + struct hlist_node head; + u32 instance; + struct br_cfm_mep_create create; + struct br_cfm_mep_config config; + struct br_cfm_cc_config cc_config; + struct br_cfm_cc_ccm_tx_info cc_ccm_tx_info; + /* List of multiple peer MEPs */ + struct hlist_head peer_mep_list; + struct net_bridge_port __rcu *b_port; + unsigned long ccm_tx_end; + struct delayed_work ccm_tx_dwork; + u32 ccm_tx_snumber; + u32 ccm_rx_snumber; + struct br_cfm_mep_status status; + bool rdi; + struct rcu_head rcu; +}; + +struct br_cfm_peer_mep { + struct hlist_node head; + struct br_cfm_mep *mep; + struct delayed_work ccm_rx_dwork; + u32 mepid; + struct br_cfm_cc_peer_status cc_status; + u32 ccm_rx_count_miss; + struct rcu_head rcu; +}; + +#endif /* _BR_PRIVATE_CFM_H_ */ diff --git a/net/bridge/br_private_mcast_eht.h b/net/bridge/br_private_mcast_eht.h new file mode 100644 index 000000000000..f89049f4892c --- /dev/null +++ b/net/bridge/br_private_mcast_eht.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2020, Nikolay Aleksandrov <nikolay@nvidia.com> + */ +#ifndef _BR_PRIVATE_MCAST_EHT_H_ +#define _BR_PRIVATE_MCAST_EHT_H_ + +#define BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT 512 + +union net_bridge_eht_addr { + __be32 ip4; +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr ip6; +#endif +}; + +/* single host's list of set entries and filter_mode */ +struct net_bridge_group_eht_host { + struct rb_node rb_node; + + union net_bridge_eht_addr h_addr; + struct hlist_head set_entries; + unsigned int num_entries; + unsigned char filter_mode; + struct net_bridge_port_group *pg; +}; + +/* (host, src entry) added to a per-src set and host's list */ +struct net_bridge_group_eht_set_entry { + struct rb_node rb_node; + struct hlist_node host_list; + + union 
net_bridge_eht_addr h_addr; + struct timer_list timer; + struct net_bridge *br; + struct net_bridge_group_eht_set *eht_set; + struct net_bridge_group_eht_host *h_parent; + struct net_bridge_mcast_gc mcast_gc; +}; + +/* per-src set */ +struct net_bridge_group_eht_set { + struct rb_node rb_node; + + union net_bridge_eht_addr src_addr; + struct rb_root entry_tree; + struct timer_list timer; + struct net_bridge_port_group *pg; + struct net_bridge *br; + struct net_bridge_mcast_gc mcast_gc; +}; + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING +void br_multicast_eht_clean_sets(struct net_bridge_port_group *pg); +bool br_multicast_eht_handle(struct net_bridge_port_group *pg, + void *h_addr, + void *srcs, + u32 nsrcs, + size_t addr_size, + int grec_type); +int br_multicast_eht_set_hosts_limit(struct net_bridge_port *p, + u32 eht_hosts_limit); + +static inline bool +br_multicast_eht_should_del_pg(const struct net_bridge_port_group *pg) +{ + return !!((pg->key.port->flags & BR_MULTICAST_FAST_LEAVE) && + RB_EMPTY_ROOT(&pg->eht_host_tree)); +} + +static inline bool +br_multicast_eht_hosts_over_limit(const struct net_bridge_port_group *pg) +{ + const struct net_bridge_port *p = pg->key.port; + + return !!(p->multicast_eht_hosts_cnt >= p->multicast_eht_hosts_limit); +} + +static inline void br_multicast_eht_hosts_inc(struct net_bridge_port_group *pg) +{ + struct net_bridge_port *p = pg->key.port; + + p->multicast_eht_hosts_cnt++; +} + +static inline void br_multicast_eht_hosts_dec(struct net_bridge_port_group *pg) +{ + struct net_bridge_port *p = pg->key.port; + + p->multicast_eht_hosts_cnt--; +} +#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */ + +#endif /* _BR_PRIVATE_MCAST_EHT_H_ */ diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index af0e9eff6549..9559aa2750fb 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -8,7 +8,7 @@ struct br_mrp { /* list of mrp instances */ - struct list_head list; + struct hlist_node list; struct net_bridge_port 
__rcu *p_port; struct net_bridge_port __rcu *s_port; @@ -46,6 +46,20 @@ struct br_mrp { struct rcu_head rcu; }; +/* This type is returned by br_mrp_switchdev functions that allow to have a SW + * backup in case the HW can't implement completely the protocol. + * BR_MRP_NONE - means the HW can't run at all the protocol, so the SW stops + * configuring the node anymore. + * BR_MRP_SW - the HW can help the SW to run the protocol, by redirecting MRP + * frames to CPU. + * BR_MRP_HW - the HW can implement completely the protocol. + */ +enum br_mrp_hw_support { + BR_MRP_NONE, + BR_MRP_SW, + BR_MRP_HW, +}; + /* br_mrp.c */ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance); int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance); @@ -65,27 +79,59 @@ int br_mrp_start_in_test(struct net_bridge *br, /* br_mrp_switchdev.c */ int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp); int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp); -int br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp, - enum br_mrp_ring_role_type role); +enum br_mrp_hw_support +br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_ring_role_type role); int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp, enum br_mrp_ring_state_type state); -int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, - u32 interval, u8 max_miss, u32 period, - bool monitor); -int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, - enum br_mrp_port_state_type state); +enum br_mrp_hw_support +br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period, + bool monitor); +int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state); int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, enum br_mrp_port_role_type role); -int br_mrp_switchdev_set_in_role(struct net_bridge *br, struct 
br_mrp *mrp, - u16 in_id, u32 ring_id, - enum br_mrp_in_role_type role); +enum br_mrp_hw_support +br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp, + u16 in_id, u32 ring_id, + enum br_mrp_in_role_type role); int br_mrp_switchdev_set_in_state(struct net_bridge *br, struct br_mrp *mrp, enum br_mrp_in_state_type state); -int br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp, - u32 interval, u8 max_miss, u32 period); +enum br_mrp_hw_support +br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period); /* br_mrp_netlink.c */ int br_mrp_ring_port_open(struct net_device *dev, u8 loc); int br_mrp_in_port_open(struct net_device *dev, u8 loc); +/* MRP protocol data units */ +struct br_mrp_tlv_hdr { + __u8 type; + __u8 length; +}; + +struct br_mrp_common_hdr { + __be16 seq_id; + __u8 domain[MRP_DOMAIN_UUID_LENGTH]; +}; + +struct br_mrp_ring_test_hdr { + __be16 prio; + __u8 sa[ETH_ALEN]; + __be16 port_role; + __be16 state; + __be16 transitions; + __be32 timestamp; +} __attribute__((__packed__)); + +struct br_mrp_in_test_hdr { + __be16 id; + __u8 sa[ETH_ALEN]; + __be16 port_role; + __be16 state; + __be16 transitions; + __be32 timestamp; +} __attribute__((__packed__)); + #endif /* _BR_PRIVATE_MRP_H */ diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 3e88be7aa269..21c6781906aa 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -43,7 +43,7 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) return; p->state = state; - err = switchdev_port_attr_set(p->dev, &attr); + err = switchdev_port_attr_set(p->dev, &attr, NULL); if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); @@ -591,7 +591,7 @@ int __set_ageing_time(struct net_device *dev, unsigned long t) }; int err; - err = switchdev_port_attr_set(dev, &attr); + err = switchdev_port_attr_set(dev, &attr, NULL); if 
(err && err != -EOPNOTSUPP) return err; @@ -601,8 +601,8 @@ int __set_ageing_time(struct net_device *dev, unsigned long t) /* Set time interval that dynamic forwarding entries live * For pure software bridge, allow values outside the 802.1 * standard specification for special cases: - * 0 - entry never ages (all permanant) - * 1 - entry disappears (no persistance) + * 0 - entry never ages (all permanent) + * 1 - entry disappears (no persistence) * * Offloaded switch entries maybe more restrictive */ diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 015209bf44aa..b89503832fcc 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -60,42 +60,47 @@ bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long flags, - unsigned long mask) + unsigned long mask, + struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = p->dev, - .id = SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS, - .u.brport_flags = mask, }; struct switchdev_notifier_port_attr_info info = { .attr = &attr, }; int err; - if (mask & ~BR_PORT_FLAGS_HW_OFFLOAD) + mask &= BR_PORT_FLAGS_HW_OFFLOAD; + if (!mask) return 0; + attr.id = SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS; + attr.u.brport_flags.val = flags; + attr.u.brport_flags.mask = mask; + /* We run from atomic context here */ err = call_switchdev_notifiers(SWITCHDEV_PORT_ATTR_SET, p->dev, - &info.info, NULL); + &info.info, extack); err = notifier_to_errno(err); if (err == -EOPNOTSUPP) return 0; if (err) { - br_warn(p->br, "bridge flag offload is not supported %u(%s)\n", - (unsigned int)p->port_no, p->dev->name); + if (extack && !extack->_msg) + NL_SET_ERR_MSG_MOD(extack, + "bridge flag offload is not supported"); return -EOPNOTSUPP; } attr.id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS; attr.flags = SWITCHDEV_F_DEFER; - attr.u.brport_flags = flags; - err = switchdev_port_attr_set(p->dev, &attr); + err = 
switchdev_port_attr_set(p->dev, &attr, extack); if (err) { - br_warn(p->br, "error setting offload flag on port %u(%s)\n", - (unsigned int)p->port_no, p->dev->name); + if (extack && !extack->_msg) + NL_SET_ERR_MSG_MOD(extack, + "error setting offload flag on port"); return err; } @@ -153,8 +158,7 @@ int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .flags = flags, - .vid_begin = vid, - .vid_end = vid, + .vid = vid, }; return switchdev_port_obj_add(dev, &v.obj, extack); @@ -165,8 +169,7 @@ int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid) struct switchdev_obj_port_vlan v = { .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, - .vid_begin = vid, - .vid_end = vid, + .vid = vid, }; return switchdev_port_obj_del(dev, &v.obj); diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 7db06e3f642a..072e29840082 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -19,6 +19,10 @@ #include "br_private.h" +/* IMPORTANT: new bridge options must be added with netlink support only + * please do not add new sysfs entries + */ + #define to_bridge(cd) ((struct net_bridge *)netdev_priv(to_net_dev(cd))) /* @@ -26,11 +30,13 @@ */ static ssize_t store_bridge_parm(struct device *d, const char *buf, size_t len, - int (*set)(struct net_bridge *, unsigned long)) + int (*set)(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack)) { struct net_bridge *br = to_bridge(d); - char *endp; + struct netlink_ext_ack extack = {0}; unsigned long val; + char *endp; int err; if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN)) @@ -43,9 +49,15 @@ static ssize_t store_bridge_parm(struct device *d, if (!rtnl_trylock()) return restart_syscall(); - err = (*set)(br, val); + err = (*set)(br, val, &extack); if (!err) netdev_state_change(br->dev); + if (extack._msg) { + if (err) + br_err(br, "%s\n", extack._msg); + else + br_warn(br, "%s\n", 
extack._msg); + } rtnl_unlock(); return err ? err : len; @@ -59,11 +71,17 @@ static ssize_t forward_delay_show(struct device *d, return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); } +static int set_forward_delay(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_set_forward_delay(br, val); +} + static ssize_t forward_delay_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_set_forward_delay); + return store_bridge_parm(d, buf, len, set_forward_delay); } static DEVICE_ATTR_RW(forward_delay); @@ -74,11 +92,17 @@ static ssize_t hello_time_show(struct device *d, struct device_attribute *attr, jiffies_to_clock_t(to_bridge(d)->hello_time)); } +static int set_hello_time(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_set_hello_time(br, val); +} + static ssize_t hello_time_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_set_hello_time); + return store_bridge_parm(d, buf, len, set_hello_time); } static DEVICE_ATTR_RW(hello_time); @@ -89,10 +113,16 @@ static ssize_t max_age_show(struct device *d, struct device_attribute *attr, jiffies_to_clock_t(to_bridge(d)->max_age)); } +static int set_max_age(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_set_max_age(br, val); +} + static ssize_t max_age_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_set_max_age); + return store_bridge_parm(d, buf, len, set_max_age); } static DEVICE_ATTR_RW(max_age); @@ -103,7 +133,8 @@ static ssize_t ageing_time_show(struct device *d, return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time)); } -static int set_ageing_time(struct net_bridge *br, unsigned long val) +static int set_ageing_time(struct net_bridge *br, 
unsigned long val, + struct netlink_ext_ack *extack) { return br_set_ageing_time(br, val); } @@ -124,9 +155,10 @@ static ssize_t stp_state_show(struct device *d, } -static int set_stp_state(struct net_bridge *br, unsigned long val) +static int set_stp_state(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { - return br_stp_set_enabled(br, val, NULL); + return br_stp_set_enabled(br, val, extack); } static ssize_t stp_state_store(struct device *d, @@ -145,7 +177,8 @@ static ssize_t group_fwd_mask_show(struct device *d, return sprintf(buf, "%#x\n", br->group_fwd_mask); } -static int set_group_fwd_mask(struct net_bridge *br, unsigned long val) +static int set_group_fwd_mask(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { if (val & BR_GROUPFWD_RESTRICTED) return -EINVAL; @@ -172,7 +205,8 @@ static ssize_t priority_show(struct device *d, struct device_attribute *attr, (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]); } -static int set_priority(struct net_bridge *br, unsigned long val) +static int set_priority(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br_stp_set_bridge_priority(br, (u16) val); return 0; @@ -308,7 +342,8 @@ static ssize_t group_addr_store(struct device *d, static DEVICE_ATTR_RW(group_addr); -static int set_flush(struct net_bridge *br, unsigned long val) +static int set_flush(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br_fdb_flush(br); return 0; @@ -330,9 +365,10 @@ static ssize_t no_linklocal_learn_show(struct device *d, return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN)); } -static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val) +static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { - return br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, NULL); + return br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, extack); } 
static ssize_t no_linklocal_learn_store(struct device *d, @@ -351,11 +387,17 @@ static ssize_t multicast_router_show(struct device *d, return sprintf(buf, "%d\n", br->multicast_router); } +static int set_multicast_router(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_multicast_set_router(br, val); +} + static ssize_t multicast_router_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_multicast_set_router); + return store_bridge_parm(d, buf, len, set_multicast_router); } static DEVICE_ATTR_RW(multicast_router); @@ -367,11 +409,17 @@ static ssize_t multicast_snooping_show(struct device *d, return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED)); } +static int toggle_multicast(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_multicast_toggle(br, val); +} + static ssize_t multicast_snooping_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_multicast_toggle); + return store_bridge_parm(d, buf, len, toggle_multicast); } static DEVICE_ATTR_RW(multicast_snooping); @@ -384,7 +432,8 @@ static ssize_t multicast_query_use_ifaddr_show(struct device *d, br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)); } -static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val) +static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val); return 0; @@ -407,11 +456,17 @@ static ssize_t multicast_querier_show(struct device *d, return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_QUERIER)); } +static int set_multicast_querier(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_multicast_set_querier(br, val); +} + static ssize_t multicast_querier_store(struct device *d, struct 
device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_multicast_set_querier); + return store_bridge_parm(d, buf, len, set_multicast_querier); } static DEVICE_ATTR_RW(multicast_querier); @@ -421,10 +476,12 @@ static ssize_t hash_elasticity_show(struct device *d, return sprintf(buf, "%u\n", RHT_ELASTICITY); } -static int set_elasticity(struct net_bridge *br, unsigned long val) +static int set_elasticity(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { - br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n", - RHT_ELASTICITY); + /* 16 is RHT_ELASTICITY */ + NL_SET_ERR_MSG_MOD(extack, + "the hash_elasticity option has been deprecated and is always 16"); return 0; } @@ -443,7 +500,8 @@ static ssize_t hash_max_show(struct device *d, struct device_attribute *attr, return sprintf(buf, "%u\n", br->hash_max); } -static int set_hash_max(struct net_bridge *br, unsigned long val) +static int set_hash_max(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->hash_max = val; return 0; @@ -465,11 +523,17 @@ static ssize_t multicast_igmp_version_show(struct device *d, return sprintf(buf, "%u\n", br->multicast_igmp_version); } +static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_multicast_set_igmp_version(br, val); +} + static ssize_t multicast_igmp_version_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_multicast_set_igmp_version); + return store_bridge_parm(d, buf, len, set_multicast_igmp_version); } static DEVICE_ATTR_RW(multicast_igmp_version); @@ -481,7 +545,8 @@ static ssize_t multicast_last_member_count_show(struct device *d, return sprintf(buf, "%u\n", br->multicast_last_member_count); } -static int set_last_member_count(struct net_bridge *br, unsigned long val) +static int 
set_last_member_count(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->multicast_last_member_count = val; return 0; @@ -502,7 +567,8 @@ static ssize_t multicast_startup_query_count_show( return sprintf(buf, "%u\n", br->multicast_startup_query_count); } -static int set_startup_query_count(struct net_bridge *br, unsigned long val) +static int set_startup_query_count(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->multicast_startup_query_count = val; return 0; @@ -524,7 +590,8 @@ static ssize_t multicast_last_member_interval_show( jiffies_to_clock_t(br->multicast_last_member_interval)); } -static int set_last_member_interval(struct net_bridge *br, unsigned long val) +static int set_last_member_interval(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->multicast_last_member_interval = clock_t_to_jiffies(val); return 0; @@ -546,7 +613,8 @@ static ssize_t multicast_membership_interval_show( jiffies_to_clock_t(br->multicast_membership_interval)); } -static int set_membership_interval(struct net_bridge *br, unsigned long val) +static int set_membership_interval(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->multicast_membership_interval = clock_t_to_jiffies(val); return 0; @@ -569,7 +637,8 @@ static ssize_t multicast_querier_interval_show(struct device *d, jiffies_to_clock_t(br->multicast_querier_interval)); } -static int set_querier_interval(struct net_bridge *br, unsigned long val) +static int set_querier_interval(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->multicast_querier_interval = clock_t_to_jiffies(val); return 0; @@ -592,7 +661,8 @@ static ssize_t multicast_query_interval_show(struct device *d, jiffies_to_clock_t(br->multicast_query_interval)); } -static int set_query_interval(struct net_bridge *br, unsigned long val) +static int set_query_interval(struct net_bridge *br, unsigned long val, + 
struct netlink_ext_ack *extack) { br->multicast_query_interval = clock_t_to_jiffies(val); return 0; @@ -615,7 +685,8 @@ static ssize_t multicast_query_response_interval_show( jiffies_to_clock_t(br->multicast_query_response_interval)); } -static int set_query_response_interval(struct net_bridge *br, unsigned long val) +static int set_query_response_interval(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->multicast_query_response_interval = clock_t_to_jiffies(val); return 0; @@ -638,7 +709,8 @@ static ssize_t multicast_startup_query_interval_show( jiffies_to_clock_t(br->multicast_startup_query_interval)); } -static int set_startup_query_interval(struct net_bridge *br, unsigned long val) +static int set_startup_query_interval(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->multicast_startup_query_interval = clock_t_to_jiffies(val); return 0; @@ -662,7 +734,8 @@ static ssize_t multicast_stats_enabled_show(struct device *d, br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)); } -static int set_stats_enabled(struct net_bridge *br, unsigned long val) +static int set_stats_enabled(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!val); return 0; @@ -687,11 +760,17 @@ static ssize_t multicast_mld_version_show(struct device *d, return sprintf(buf, "%u\n", br->multicast_mld_version); } +static int set_multicast_mld_version(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_multicast_set_mld_version(br, val); +} + static ssize_t multicast_mld_version_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_multicast_set_mld_version); + return store_bridge_parm(d, buf, len, set_multicast_mld_version); } static DEVICE_ATTR_RW(multicast_mld_version); #endif @@ -704,7 +783,8 @@ static ssize_t nf_call_iptables_show( return 
sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES)); } -static int set_nf_call_iptables(struct net_bridge *br, unsigned long val) +static int set_nf_call_iptables(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val); return 0; @@ -725,7 +805,8 @@ static ssize_t nf_call_ip6tables_show( return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES)); } -static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val) +static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val); return 0; @@ -746,7 +827,8 @@ static ssize_t nf_call_arptables_show( return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES)); } -static int set_nf_call_arptables(struct net_bridge *br, unsigned long val) +static int set_nf_call_arptables(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val); return 0; @@ -817,11 +899,17 @@ static ssize_t vlan_stats_enabled_show(struct device *d, return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED)); } +static int set_vlan_stats_enabled(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_vlan_set_stats(br, val); +} + static ssize_t vlan_stats_enabled_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_vlan_set_stats); + return store_bridge_parm(d, buf, len, set_vlan_stats_enabled); } static DEVICE_ATTR_RW(vlan_stats_enabled); @@ -833,11 +921,17 @@ static ssize_t vlan_stats_per_port_show(struct device *d, return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)); } +static int set_vlan_stats_per_port(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_vlan_set_stats_per_port(br, val); +} + static 
ssize_t vlan_stats_per_port_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_vlan_set_stats_per_port); + return store_bridge_parm(d, buf, len, set_vlan_stats_per_port); } static DEVICE_ATTR_RW(vlan_stats_per_port); #endif diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 7a59cdddd3ce..72e92376eef1 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -17,6 +17,10 @@ #include "br_private.h" +/* IMPORTANT: new bridge port options must be added with netlink support only + * please do not add new sysfs entries + */ + struct brport_attribute { struct attribute attr; ssize_t (*show)(struct net_bridge_port *, char *); @@ -55,9 +59,9 @@ static BRPORT_ATTR(_name, 0644, \ static int store_flag(struct net_bridge_port *p, unsigned long v, unsigned long mask) { - unsigned long flags; - - flags = p->flags; + struct netlink_ext_ack extack = {0}; + unsigned long flags = p->flags; + int err; if (v) flags |= mask; @@ -65,6 +69,12 @@ static int store_flag(struct net_bridge_port *p, unsigned long v, flags &= ~mask; if (flags != p->flags) { + err = br_switchdev_set_port_flag(p, flags, mask, &extack); + if (err) { + netdev_err(p->dev, "%s\n", extack._msg); + return err; + } + p->flags = flags; br_port_flags_change(p, mask); } diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 3e493eb85bb2..8829f621b8ec 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -266,11 +266,14 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, } masterv = br_vlan_get_master(br, v->vid, extack); - if (!masterv) + if (!masterv) { + err = -ENOMEM; goto out_filt; + } v->brvlan = masterv; if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) { - v->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats); + v->stats = + netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!v->stats) { err = -ENOMEM; goto out_filt; @@ -421,7 +424,7 @@ struct sk_buff *br_handle_vlan(struct 
net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb) { - struct br_vlan_stats *stats; + struct pcpu_sw_netstats *stats; struct net_bridge_vlan *v; u16 vid; @@ -474,7 +477,7 @@ static bool __allowed_ingress(const struct net_bridge *br, struct sk_buff *skb, u16 *vid, u8 *state) { - struct br_vlan_stats *stats; + struct pcpu_sw_netstats *stats; struct net_bridge_vlan *v; bool tagged; @@ -708,7 +711,7 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed, if (!vlan) return -ENOMEM; - vlan->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats); + vlan->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!vlan->stats) { kfree(vlan); return -ENOMEM; @@ -803,7 +806,8 @@ void br_recalculate_fwd_mask(struct net_bridge *br) ~(1u << br->group_addr[5]); } -int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) +int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = br->dev, @@ -816,7 +820,7 @@ int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val) return 0; - err = switchdev_port_attr_set(br->dev, &attr); + err = switchdev_port_attr_set(br->dev, &attr, extack); if (err && err != -EOPNOTSUPP) return err; @@ -828,11 +832,6 @@ int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) return 0; } -int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) -{ - return __br_vlan_filter_toggle(br, val); -} - bool br_vlan_enabled(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); @@ -851,17 +850,28 @@ int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto) } EXPORT_SYMBOL_GPL(br_vlan_get_proto); -int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) +int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, + struct netlink_ext_ack *extack) { + struct switchdev_attr attr = { + .orig_dev = br->dev, + .id = 
SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL, + .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, + .u.vlan_protocol = ntohs(proto), + }; int err = 0; struct net_bridge_port *p; struct net_bridge_vlan *vlan; struct net_bridge_vlan_group *vg; - __be16 oldproto; + __be16 oldproto = br->vlan_proto; if (br->vlan_proto == proto) return 0; + err = switchdev_port_attr_set(br->dev, &attr, extack); + if (err && err != -EOPNOTSUPP) + return err; + /* Add VLANs for the new proto to the device filter. */ list_for_each_entry(p, &br->port_list, list) { vg = nbp_vlan_group(p); @@ -872,7 +882,6 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) } } - oldproto = br->vlan_proto; br->vlan_proto = proto; recalculate_group_addr(br); @@ -888,6 +897,9 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) return 0; err_filt: + attr.u.vlan_protocol = ntohs(oldproto); + switchdev_port_attr_set(br->dev, &attr, NULL); + list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) vlan_vid_del(p->dev, proto, vlan->vid); @@ -900,12 +912,13 @@ err_filt: return err; } -int br_vlan_set_proto(struct net_bridge *br, unsigned long val) +int br_vlan_set_proto(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { - if (val != ETH_P_8021Q && val != ETH_P_8021AD) + if (!eth_type_vlan(htons(val))) return -EPROTONOSUPPORT; - return __br_vlan_set_proto(br, htons(val)); + return __br_vlan_set_proto(br, htons(val), extack); } int br_vlan_set_stats(struct net_bridge *br, unsigned long val) @@ -1085,7 +1098,8 @@ err_port: goto out; } -int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val) +int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { u16 pvid = val; int err = 0; @@ -1102,7 +1116,7 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val) err = -EPERM; goto out; } - err = __br_vlan_set_default_pvid(br, pvid, NULL); + err = __br_vlan_set_default_pvid(br, pvid, extack); out: return err; } 
@@ -1152,7 +1166,7 @@ int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) if (!vg) goto out; - ret = switchdev_port_attr_set(p->dev, &attr); + ret = switchdev_port_attr_set(p->dev, &attr, extack); if (ret && ret != -EOPNOTSUPP) goto err_vlan_enabled; @@ -1262,14 +1276,14 @@ void nbp_vlan_flush(struct net_bridge_port *port) } void br_vlan_get_stats(const struct net_bridge_vlan *v, - struct br_vlan_stats *stats) + struct pcpu_sw_netstats *stats) { int i; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(i) { u64 rxpackets, rxbytes, txpackets, txbytes; - struct br_vlan_stats *cpu_stats; + struct pcpu_sw_netstats *cpu_stats; unsigned int start; cpu_stats = per_cpu_ptr(v->stats, i); @@ -1585,7 +1599,7 @@ void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) static bool br_vlan_stats_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) { - struct br_vlan_stats stats; + struct pcpu_sw_netstats stats; struct nlattr *nest; nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_STATS); diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 5040fe43f4b4..ac5372121e60 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -17,7 +17,9 @@ config NFT_BRIDGE_META config NFT_BRIDGE_REJECT tristate "Netfilter nf_tables bridge reject support" - depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6 + depends on NFT_REJECT + depends on NF_REJECT_IPV4 + depends on NF_REJECT_IPV6 help Add support to reject packets. 
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 8e8ffac037cd..97805ec424c1 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -87,9 +87,8 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, return nft_meta_get_init(ctx, expr, tb); } - priv->dreg = nft_parse_register(tb[NFTA_META_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } static struct nft_expr_type nft_meta_bridge_type; diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index deae2c9a0f69..eba0efe64d05 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -39,30 +39,6 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb, } } -static int nft_bridge_iphdr_validate(struct sk_buff *skb) -{ - struct iphdr *iph; - u32 len; - - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - return 0; - - iph = ip_hdr(skb); - if (iph->ihl < 5 || iph->version != 4) - return 0; - - len = ntohs(iph->tot_len); - if (skb->len < len) - return 0; - else if (len < (iph->ihl*4)) - return 0; - - if (!pskb_may_pull(skb, iph->ihl*4)) - return 0; - - return 1; -} - /* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT) * or the bridge port (NF_BRIDGE PREROUTING). 
*/ @@ -72,29 +48,11 @@ static void nft_reject_br_send_v4_tcp_reset(struct net *net, int hook) { struct sk_buff *nskb; - struct iphdr *niph; - const struct tcphdr *oth; - struct tcphdr _oth; - if (!nft_bridge_iphdr_validate(oldskb)) - return; - - oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook); - if (!oth) - return; - - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + - LL_MAX_HEADER, GFP_ATOMIC); + nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, dev, hook); if (!nskb) return; - skb_reserve(nskb, LL_MAX_HEADER); - niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, - net->ipv4.sysctl_ip_default_ttl); - nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - niph->tot_len = htons(nskb->len); - ip_send_check(niph); - nft_reject_br_push_etherhdr(oldskb, nskb); br_forward(br_port_get_rcu(dev), nskb, false, true); @@ -106,139 +64,32 @@ static void nft_reject_br_send_v4_unreach(struct net *net, int hook, u8 code) { struct sk_buff *nskb; - struct iphdr *niph; - struct icmphdr *icmph; - unsigned int len; - __wsum csum; - u8 proto; - - if (!nft_bridge_iphdr_validate(oldskb)) - return; - - /* IP header checks: fragment. */ - if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) - return; - - /* RFC says return as much as we can without exceeding 576 bytes. 
*/ - len = min_t(unsigned int, 536, oldskb->len); - - if (!pskb_may_pull(oldskb, len)) - return; - - if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len))) - return; - - proto = ip_hdr(oldskb)->protocol; - - if (!skb_csum_unnecessary(oldskb) && - nf_reject_verify_csum(proto) && - nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto)) - return; - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmphdr) + - LL_MAX_HEADER + len, GFP_ATOMIC); + nskb = nf_reject_skb_v4_unreach(net, oldskb, dev, hook, code); if (!nskb) return; - skb_reserve(nskb, LL_MAX_HEADER); - niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP, - net->ipv4.sysctl_ip_default_ttl); - - skb_reset_transport_header(nskb); - icmph = skb_put_zero(nskb, sizeof(struct icmphdr)); - icmph->type = ICMP_DEST_UNREACH; - icmph->code = code; - - skb_put_data(nskb, skb_network_header(oldskb), len); - - csum = csum_partial((void *)icmph, len + sizeof(struct icmphdr), 0); - icmph->checksum = csum_fold(csum); - - niph->tot_len = htons(nskb->len); - ip_send_check(niph); - nft_reject_br_push_etherhdr(oldskb, nskb); br_forward(br_port_get_rcu(dev), nskb, false, true); } -static int nft_bridge_ip6hdr_validate(struct sk_buff *skb) -{ - struct ipv6hdr *hdr; - u32 pkt_len; - - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - return 0; - - hdr = ipv6_hdr(skb); - if (hdr->version != 6) - return 0; - - pkt_len = ntohs(hdr->payload_len); - if (pkt_len + sizeof(struct ipv6hdr) > skb->len) - return 0; - - return 1; -} - static void nft_reject_br_send_v6_tcp_reset(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, int hook) { struct sk_buff *nskb; - const struct tcphdr *oth; - struct tcphdr _oth; - unsigned int otcplen; - struct ipv6hdr *nip6h; - if (!nft_bridge_ip6hdr_validate(oldskb)) - return; - - oth = nf_reject_ip6_tcphdr_get(oldskb, &_oth, &otcplen, hook); - if (!oth) - return; - - nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct tcphdr) + - LL_MAX_HEADER, GFP_ATOMIC); + nskb = 
nf_reject_skb_v6_tcp_reset(net, oldskb, dev, hook); if (!nskb) return; - skb_reserve(nskb, LL_MAX_HEADER); - nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, - net->ipv6.devconf_all->hop_limit); - nf_reject_ip6_tcphdr_put(nskb, oldskb, oth, otcplen); - nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr)); - nft_reject_br_push_etherhdr(oldskb, nskb); br_forward(br_port_get_rcu(dev), nskb, false, true); } -static bool reject6_br_csum_ok(struct sk_buff *skb, int hook) -{ - const struct ipv6hdr *ip6h = ipv6_hdr(skb); - int thoff; - __be16 fo; - u8 proto = ip6h->nexthdr; - - if (skb_csum_unnecessary(skb)) - return true; - - if (ip6h->payload_len && - pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h))) - return false; - - ip6h = ipv6_hdr(skb); - thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo); - if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0) - return false; - - if (!nf_reject_verify_csum(proto)) - return true; - - return nf_ip6_checksum(skb, hook, thoff, proto) == 0; -} static void nft_reject_br_send_v6_unreach(struct net *net, struct sk_buff *oldskb, @@ -246,49 +97,11 @@ static void nft_reject_br_send_v6_unreach(struct net *net, int hook, u8 code) { struct sk_buff *nskb; - struct ipv6hdr *nip6h; - struct icmp6hdr *icmp6h; - unsigned int len; - - if (!nft_bridge_ip6hdr_validate(oldskb)) - return; - /* Include "As much of invoking packet as possible without the ICMPv6 - * packet exceeding the minimum IPv6 MTU" in the ICMP payload. 
- */ - len = min_t(unsigned int, 1220, oldskb->len); - - if (!pskb_may_pull(oldskb, len)) - return; - - if (!reject6_br_csum_ok(oldskb, hook)) - return; - - nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) + - LL_MAX_HEADER + len, GFP_ATOMIC); + nskb = nf_reject_skb_v6_unreach(net, oldskb, dev, hook, code); if (!nskb) return; - skb_reserve(nskb, LL_MAX_HEADER); - nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_ICMPV6, - net->ipv6.devconf_all->hop_limit); - - skb_reset_transport_header(nskb); - icmp6h = skb_put_zero(nskb, sizeof(struct icmp6hdr)); - icmp6h->icmp6_type = ICMPV6_DEST_UNREACH; - icmp6h->icmp6_code = code; - - skb_put_data(nskb, skb_network_header(oldskb), len); - nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr)); - - icmp6h->icmp6_cksum = - csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, - nskb->len - sizeof(struct ipv6hdr), - IPPROTO_ICMPV6, - csum_partial(icmp6h, - nskb->len - sizeof(struct ipv6hdr), - 0)); - nft_reject_br_push_etherhdr(oldskb, nskb); br_forward(br_port_get_rcu(dev), nskb, false, true); @@ -364,69 +177,13 @@ static int nft_reject_bridge_validate(const struct nft_ctx *ctx, (1 << NF_BR_LOCAL_IN)); } -static int nft_reject_bridge_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_reject *priv = nft_expr_priv(expr); - int icmp_code; - - if (tb[NFTA_REJECT_TYPE] == NULL) - return -EINVAL; - - priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - case NFT_REJECT_ICMPX_UNREACH: - if (tb[NFTA_REJECT_ICMP_CODE] == NULL) - return -EINVAL; - - icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); - if (priv->type == NFT_REJECT_ICMPX_UNREACH && - icmp_code > NFT_REJECT_ICMPX_MAX) - return -EINVAL; - - priv->icmp_code = icmp_code; - break; - case NFT_REJECT_TCP_RST: - break; - default: - return -EINVAL; - } - return 0; -} - -static int nft_reject_bridge_dump(struct sk_buff *skb, - const struct 
nft_expr *expr) -{ - const struct nft_reject *priv = nft_expr_priv(expr); - - if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) - goto nla_put_failure; - - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - case NFT_REJECT_ICMPX_UNREACH: - if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) - goto nla_put_failure; - break; - default: - break; - } - - return 0; - -nla_put_failure: - return -1; -} - static struct nft_expr_type nft_reject_bridge_type; static const struct nft_expr_ops nft_reject_bridge_ops = { .type = &nft_reject_bridge_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), .eval = nft_reject_bridge_eval, - .init = nft_reject_bridge_init, - .dump = nft_reject_bridge_dump, + .init = nft_reject_init, + .dump = nft_reject_dump, .validate = nft_reject_bridge_validate, }; diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 79b6a04d8eb6..fadc7c8a3107 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -115,10 +115,7 @@ static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) else skb->ip_summed = CHECKSUM_NONE; - if (in_interrupt()) - netif_rx(skb); - else - netif_rx_ni(skb); + netif_rx_any_context(skb); /* Update statistics. */ priv->netdev->stats.rx_packets++; diff --git a/net/can/Kconfig b/net/can/Kconfig index 7c9958df91d3..a9ac5ffab286 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -4,7 +4,6 @@ # menuconfig CAN - depends on NET tristate "CAN bus subsystem support" help Controller Area Network (CAN) is a slow (up to 1Mbit/s) serial diff --git a/net/can/af_can.c b/net/can/af_can.c index 6373ab9c5507..837bb8af0ec3 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -541,10 +541,13 @@ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id, /* Check for bugs in CAN protocol implementations using af_can.c: * 'rcv' will be NULL if no matching list item was found for removal. 
+ * As this case may potentially happen when closing a socket while + * the notifier for removing the CAN netdev is running we just print + * a warning here. */ if (!rcv) { - WARN(1, "BUG: receive list entry not found for dev %s, id %03X, mask %03X\n", - DNAME(dev), can_id, mask); + pr_warn("can: receive list entry not found for dev %s, id %03X, mask %03X\n", + DNAME(dev), can_id, mask); goto out; } @@ -677,16 +680,25 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - if (unlikely(dev->type != ARPHRD_CAN || skb->len != CAN_MTU || - cfd->len > CAN_MAX_DLEN)) { - pr_warn_once("PF_CAN: dropped non conform CAN skbuf: dev type %d, len %d, datalen %d\n", + if (unlikely(dev->type != ARPHRD_CAN || skb->len != CAN_MTU)) { + pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n", + dev->type, skb->len); + goto free_skb; + } + + /* This check is made separately since cfd->len would be uninitialized if skb->len = 0. 
*/ + if (unlikely(cfd->len > CAN_MAX_DLEN)) { + pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d, datalen %d\n", dev->type, skb->len, cfd->len); - kfree_skb(skb); - return NET_RX_DROP; + goto free_skb; } can_receive(skb, dev); return NET_RX_SUCCESS; + +free_skb: + kfree_skb(skb); + return NET_RX_DROP; } static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, @@ -694,16 +706,25 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - if (unlikely(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU || - cfd->len > CANFD_MAX_DLEN)) { - pr_warn_once("PF_CAN: dropped non conform CAN FD skbuf: dev type %d, len %d, datalen %d\n", + if (unlikely(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU)) { + pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n", + dev->type, skb->len); + goto free_skb; + } + + /* This check is made separately since cfd->len would be uninitialized if skb->len = 0. 
*/ + if (unlikely(cfd->len > CANFD_MAX_DLEN)) { + pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d, datalen %d\n", dev->type, skb->len, cfd->len); - kfree_skb(skb); - return NET_RX_DROP; + goto free_skb; } can_receive(skb, dev); return NET_RX_SUCCESS; + +free_skb: + kfree_skb(skb); + return NET_RX_DROP; } /* af_can protocol functions */ @@ -870,7 +891,7 @@ static __init int can_init(void) int err; /* check for correct padding to be able to use the structs similarly */ - BUILD_BUG_ON(offsetof(struct can_frame, can_dlc) != + BUILD_BUG_ON(offsetof(struct can_frame, len) != offsetof(struct canfd_frame, len) || offsetof(struct can_frame, data) != offsetof(struct canfd_frame, data)); diff --git a/net/can/gw.c b/net/can/gw.c index 6b790b6ff8d2..ba4124805602 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -199,6 +199,68 @@ static void mod_set_fddata(struct canfd_frame *cf, struct cf_mod *mod) memcpy(cf->data, mod->modframe.set.data, CANFD_MAX_DLEN); } +/* retrieve valid CC DLC value and store it into 'len' */ +static void mod_retrieve_ccdlc(struct canfd_frame *cf) +{ + struct can_frame *ccf = (struct can_frame *)cf; + + /* len8_dlc is only valid if len == CAN_MAX_DLEN */ + if (ccf->len != CAN_MAX_DLEN) + return; + + /* do we have a valid len8_dlc value from 9 .. 15 ? */ + if (ccf->len8_dlc > CAN_MAX_DLEN && ccf->len8_dlc <= CAN_MAX_RAW_DLC) + ccf->len = ccf->len8_dlc; +} + +/* convert valid CC DLC value in 'len' into struct can_frame elements */ +static void mod_store_ccdlc(struct canfd_frame *cf) +{ + struct can_frame *ccf = (struct can_frame *)cf; + + /* clear potential leftovers */ + ccf->len8_dlc = 0; + + /* plain data length 0 .. 8 - that was easy */ + if (ccf->len <= CAN_MAX_DLEN) + return; + + /* potentially broken values are caught in can_can_gw_rcv() */ + if (ccf->len > CAN_MAX_RAW_DLC) + return; + + /* we have a valid dlc value from 9 .. 
15 in ccf->len */ + ccf->len8_dlc = ccf->len; + ccf->len = CAN_MAX_DLEN; +} + +static void mod_and_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) +{ + mod_retrieve_ccdlc(cf); + mod_and_len(cf, mod); + mod_store_ccdlc(cf); +} + +static void mod_or_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) +{ + mod_retrieve_ccdlc(cf); + mod_or_len(cf, mod); + mod_store_ccdlc(cf); +} + +static void mod_xor_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) +{ + mod_retrieve_ccdlc(cf); + mod_xor_len(cf, mod); + mod_store_ccdlc(cf); +} + +static void mod_set_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) +{ + mod_set_len(cf, mod); + mod_store_ccdlc(cf); +} + static void canframecpy(struct canfd_frame *dst, struct can_frame *src) { /* Copy the struct members separately to ensure that no uninitialized @@ -207,7 +269,7 @@ static void canframecpy(struct canfd_frame *dst, struct can_frame *src) */ dst->can_id = src->can_id; - dst->len = src->can_dlc; + dst->len = src->len; *(u64 *)dst->data = *(u64 *)src->data; } @@ -842,8 +904,8 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_and_id; - if (mb.modtype & CGW_MOD_LEN) - mod->modfunc[modidx++] = mod_and_len; + if (mb.modtype & CGW_MOD_DLC) + mod->modfunc[modidx++] = mod_and_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_and_data; @@ -858,8 +920,8 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_or_id; - if (mb.modtype & CGW_MOD_LEN) - mod->modfunc[modidx++] = mod_or_len; + if (mb.modtype & CGW_MOD_DLC) + mod->modfunc[modidx++] = mod_or_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_or_data; @@ -874,8 +936,8 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_xor_id; - if (mb.modtype & CGW_MOD_LEN) - mod->modfunc[modidx++] = mod_xor_len; + if (mb.modtype & 
CGW_MOD_DLC) + mod->modfunc[modidx++] = mod_xor_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_xor_data; @@ -890,8 +952,8 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_set_id; - if (mb.modtype & CGW_MOD_LEN) - mod->modfunc[modidx++] = mod_set_len; + if (mb.modtype & CGW_MOD_DLC) + mod->modfunc[modidx++] = mod_set_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_set_data; diff --git a/net/can/isotp.c b/net/can/isotp.c index d78ab13bd8be..3ef7f78e553b 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -865,6 +865,14 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (!size || size > MAX_MSG_LENGTH) return -EINVAL; + /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ + off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; + + /* does the given data fit into a single frame for SF_BROADCAST? */ + if ((so->opt.flags & CAN_ISOTP_SF_BROADCAST) && + (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) + return -EINVAL; + err = memcpy_from_msg(so->tx.buf, msg, size); if (err < 0) return err; @@ -891,9 +899,6 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) cf = (struct canfd_frame *)skb->data; skb_put(skb, so->ll.mtu); - /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ - off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; - /* check for single frame transmission depending on TX_DL */ if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) { /* The message size generally fits into a SingleFrame - good. 
@@ -1016,7 +1021,7 @@ static int isotp_release(struct socket *sock) hrtimer_cancel(&so->rxtimer); /* remove current filters & unregister */ - if (so->bound) { + if (so->bound && (!(so->opt.flags & CAN_ISOTP_SF_BROADCAST))) { if (so->ifindex) { struct net_device *dev; @@ -1052,15 +1057,25 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) struct net_device *dev; int err = 0; int notify_enetdown = 0; + int do_rx_reg = 1; if (len < CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.tp)) return -EINVAL; - if (addr->can_addr.tp.rx_id == addr->can_addr.tp.tx_id) - return -EADDRNOTAVAIL; + /* do not register frame reception for functional addressing */ + if (so->opt.flags & CAN_ISOTP_SF_BROADCAST) + do_rx_reg = 0; - if ((addr->can_addr.tp.rx_id | addr->can_addr.tp.tx_id) & - (CAN_ERR_FLAG | CAN_RTR_FLAG)) + /* do not validate rx address for functional addressing */ + if (do_rx_reg) { + if (addr->can_addr.tp.rx_id == addr->can_addr.tp.tx_id) + return -EADDRNOTAVAIL; + + if (addr->can_addr.tp.rx_id & (CAN_ERR_FLAG | CAN_RTR_FLAG)) + return -EADDRNOTAVAIL; + } + + if (addr->can_addr.tp.tx_id & (CAN_ERR_FLAG | CAN_RTR_FLAG)) return -EADDRNOTAVAIL; if (!addr->can_ifindex) @@ -1093,13 +1108,14 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) ifindex = dev->ifindex; - can_rx_register(net, dev, addr->can_addr.tp.rx_id, - SINGLE_MASK(addr->can_addr.tp.rx_id), isotp_rcv, sk, - "isotp", sk); + if (do_rx_reg) + can_rx_register(net, dev, addr->can_addr.tp.rx_id, + SINGLE_MASK(addr->can_addr.tp.rx_id), + isotp_rcv, sk, "isotp", sk); dev_put(dev); - if (so->bound) { + if (so->bound && do_rx_reg) { /* unregister old filter */ if (so->ifindex) { dev = dev_get_by_index(net, so->ifindex); @@ -1139,6 +1155,7 @@ static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) if (peer) return -EOPNOTSUPP; + memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = so->ifindex; addr->can_addr.tp.rx_id = 
so->rxid; @@ -1157,6 +1174,9 @@ static int isotp_setsockopt(struct socket *sock, int level, int optname, if (level != SOL_CAN_ISOTP) return -EINVAL; + if (so->bound) + return -EISCONN; + switch (optname) { case CAN_ISOTP_OPTS: if (optlen != sizeof(struct can_isotp_options)) @@ -1299,7 +1319,7 @@ static int isotp_notifier(struct notifier_block *nb, unsigned long msg, case NETDEV_UNREGISTER: lock_sock(sk); /* remove current filters & unregister */ - if (so->bound) + if (so->bound && (!(so->opt.flags & CAN_ISOTP_SF_BROADCAST))) can_rx_unregister(dev_net(dev), dev, so->rxid, SINGLE_MASK(so->rxid), isotp_rcv, sk); diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 137054bff9ec..bb914d8b4216 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -62,7 +62,7 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data) skb_pull(skb, J1939_CAN_HDR); /* fix length, set to dlc, with 8 maximum */ - skb_trim(skb, min_t(uint8_t, cf->can_dlc, 8)); + skb_trim(skb, min_t(uint8_t, cf->len, 8)); /* set addr */ skcb = j1939_skb_to_cb(skb); @@ -335,7 +335,7 @@ int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb) canid |= skcb->addr.da << 8; cf->can_id = canid; - cf->can_dlc = dlc; + cf->len = dlc; return can_send(skb, 1); diff --git a/net/can/raw.c b/net/can/raw.c index 6ec8aa1d0da4..37b47a39a3ed 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -665,10 +665,18 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, if (ro->count > 0) { int fsize = ro->count * sizeof(struct can_filter); - if (len > fsize) - len = fsize; - if (copy_to_user(optval, ro->filter, len)) - err = -EFAULT; + /* user space buffer to small for filter list? 
*/ + if (len < fsize) { + /* return -ERANGE and needed space in optlen */ + err = -ERANGE; + if (put_user(fsize, optlen)) + err = -EFAULT; + } else { + if (len > fsize) + len = fsize; + if (copy_to_user(optval, ro->filter, len)) + err = -EFAULT; + } } else { len = 0; } diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index f36f9a3a4e20..c5c4eef3a9ff 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -5,6 +5,9 @@ config CEPH_LIB select LIBCRC32C select CRYPTO_AES select CRYPTO_CBC + select CRYPTO_GCM + select CRYPTO_HMAC + select CRYPTO_SHA256 select CRYPTO select KEYS default n diff --git a/net/ceph/Makefile b/net/ceph/Makefile index ce09bb4fb249..8802a0c0155d 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -14,4 +14,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ crypto.o armor.o \ auth_x.o \ ceph_strings.o ceph_hash.o \ - pagevec.o snapshot.o string_table.o + pagevec.o snapshot.o string_table.o \ + messenger_v1.o messenger_v2.o diff --git a/net/ceph/auth.c b/net/ceph/auth.c index fbeee068ea14..eb261aa5fe18 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -21,28 +21,31 @@ static u32 supported_protocols[] = { CEPH_AUTH_CEPHX }; -static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) +static int init_protocol(struct ceph_auth_client *ac, int proto) { - switch (protocol) { + dout("%s proto %d\n", __func__, proto); + + switch (proto) { case CEPH_AUTH_NONE: return ceph_auth_none_init(ac); case CEPH_AUTH_CEPHX: return ceph_x_init(ac); default: - return -ENOENT; + pr_err("bad auth protocol %d\n", proto); + return -EINVAL; } } /* * setup, teardown. 
*/ -struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_crypto_key *key) +struct ceph_auth_client *ceph_auth_init(const char *name, + const struct ceph_crypto_key *key, + const int *con_modes) { struct ceph_auth_client *ac; int ret; - dout("auth_init name '%s'\n", name); - ret = -ENOMEM; ac = kzalloc(sizeof(*ac), GFP_NOFS); if (!ac) @@ -54,8 +57,12 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp ac->name = name; else ac->name = CEPH_AUTH_NAME_DEFAULT; - dout("auth_init name %s\n", ac->name); ac->key = key; + ac->preferred_mode = con_modes[0]; + ac->fallback_mode = con_modes[1]; + + dout("%s name '%s' preferred_mode %d fallback_mode %d\n", __func__, + ac->name, ac->preferred_mode, ac->fallback_mode); return ac; out: @@ -145,31 +152,35 @@ bad: goto out; } -static int ceph_build_auth_request(struct ceph_auth_client *ac, - void *msg_buf, size_t msg_len) +static int build_request(struct ceph_auth_client *ac, bool add_header, + void *buf, int buf_len) { - struct ceph_mon_request_header *monhdr = msg_buf; - void *p = monhdr + 1; - void *end = msg_buf + msg_len; + void *end = buf + buf_len; + void *p; int ret; - monhdr->have_version = 0; - monhdr->session_mon = cpu_to_le16(-1); - monhdr->session_mon_tid = 0; - - ceph_encode_32(&p, ac->protocol); + p = buf; + if (add_header) { + /* struct ceph_mon_request_header + protocol */ + ceph_encode_64_safe(&p, end, 0, e_range); + ceph_encode_16_safe(&p, end, -1, e_range); + ceph_encode_64_safe(&p, end, 0, e_range); + ceph_encode_32_safe(&p, end, ac->protocol, e_range); + } + ceph_encode_need(&p, end, sizeof(u32), e_range); ret = ac->ops->build_request(ac, p + sizeof(u32), end); if (ret < 0) { - pr_err("error %d building auth method %s request\n", ret, - ac->ops->name); - goto out; + pr_err("auth protocol '%s' building request failed: %d\n", + ceph_auth_proto_name(ac->protocol), ret); + return ret; } dout(" built request %d bytes\n", ret); ceph_encode_32(&p, ret); - ret = p 
+ ret - msg_buf; -out: - return ret; + return p + ret - buf; + +e_range: + return -ERANGE; } /* @@ -229,10 +240,10 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ac->ops = NULL; } if (ac->protocol != protocol) { - ret = ceph_auth_init_protocol(ac, protocol); + ret = init_protocol(ac, protocol); if (ret) { - pr_err("error %d on auth protocol %d init\n", - ret, protocol); + pr_err("auth protocol '%s' init failed: %d\n", + ceph_auth_proto_name(protocol), ret); goto out; } } @@ -240,12 +251,13 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ac->negotiating = false; } - ret = ac->ops->handle_reply(ac, result, payload, payload_end); - if (ret == -EAGAIN) { - ret = ceph_build_auth_request(ac, reply_buf, reply_len); - } else if (ret) { - pr_err("auth method '%s' error %d\n", ac->ops->name, ret); - } + ret = ac->ops->handle_reply(ac, result, payload, payload_end, + NULL, NULL, NULL, NULL); + if (ret == -EAGAIN) + ret = build_request(ac, true, reply_buf, reply_len); + else if (ret) + pr_err("auth protocol '%s' mauth authentication failed: %d\n", + ceph_auth_proto_name(ac->protocol), result); out: mutex_unlock(&ac->mutex); @@ -264,7 +276,7 @@ int ceph_build_auth(struct ceph_auth_client *ac, mutex_lock(&ac->mutex); if (ac->ops->should_authenticate(ac)) - ret = ceph_build_auth_request(ac, msg_buf, msg_len); + ret = build_request(ac, true, msg_buf, msg_len); mutex_unlock(&ac->mutex); return ret; } @@ -281,19 +293,38 @@ int ceph_auth_is_authenticated(struct ceph_auth_client *ac) } EXPORT_SYMBOL(ceph_auth_is_authenticated); -int ceph_auth_create_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *auth) +int __ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, bool force_new, + int *proto, int *pref_mode, int *fallb_mode) { - int ret = 0; + int ret; mutex_lock(&ac->mutex); - if (ac->ops && ac->ops->create_authorizer) + if (force_new && auth->authorizer) { + 
ceph_auth_destroy_authorizer(auth->authorizer); + auth->authorizer = NULL; + } + if (!auth->authorizer) ret = ac->ops->create_authorizer(ac, peer_type, auth); + else if (ac->ops->update_authorizer) + ret = ac->ops->update_authorizer(ac, peer_type, auth); + else + ret = 0; + if (ret) + goto out; + + *proto = ac->protocol; + if (pref_mode && fallb_mode) { + *pref_mode = ac->preferred_mode; + *fallb_mode = ac->fallback_mode; + } + +out: mutex_unlock(&ac->mutex); return ret; } -EXPORT_SYMBOL(ceph_auth_create_authorizer); +EXPORT_SYMBOL(__ceph_auth_get_authorizer); void ceph_auth_destroy_authorizer(struct ceph_authorizer *a) { @@ -301,20 +332,6 @@ void ceph_auth_destroy_authorizer(struct ceph_authorizer *a) } EXPORT_SYMBOL(ceph_auth_destroy_authorizer); -int ceph_auth_update_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *a) -{ - int ret = 0; - - mutex_lock(&ac->mutex); - if (ac->ops && ac->ops->update_authorizer) - ret = ac->ops->update_authorizer(ac, peer_type, a); - mutex_unlock(&ac->mutex); - return ret; -} -EXPORT_SYMBOL(ceph_auth_update_authorizer); - int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, void *challenge_buf, @@ -332,13 +349,18 @@ int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, EXPORT_SYMBOL(ceph_auth_add_authorizer_challenge); int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a) + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { int ret = 0; mutex_lock(&ac->mutex); if (ac->ops && ac->ops->verify_authorizer_reply) - ret = ac->ops->verify_authorizer_reply(ac, a); + ret = ac->ops->verify_authorizer_reply(ac, a, + reply, reply_len, session_key, session_key_len, + con_secret, con_secret_len); mutex_unlock(&ac->mutex); return ret; } @@ -352,3 +374,279 @@ void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int 
peer_type) mutex_unlock(&ac->mutex); } EXPORT_SYMBOL(ceph_auth_invalidate_authorizer); + +/* + * msgr2 authentication + */ + +static bool contains(const int *arr, int cnt, int val) +{ + int i; + + for (i = 0; i < cnt; i++) { + if (arr[i] == val) + return true; + } + + return false; +} + +static int encode_con_modes(void **p, void *end, int pref_mode, int fallb_mode) +{ + WARN_ON(pref_mode == CEPH_CON_MODE_UNKNOWN); + if (fallb_mode != CEPH_CON_MODE_UNKNOWN) { + ceph_encode_32_safe(p, end, 2, e_range); + ceph_encode_32_safe(p, end, pref_mode, e_range); + ceph_encode_32_safe(p, end, fallb_mode, e_range); + } else { + ceph_encode_32_safe(p, end, 1, e_range); + ceph_encode_32_safe(p, end, pref_mode, e_range); + } + + return 0; + +e_range: + return -ERANGE; +} + +/* + * Similar to ceph_auth_build_hello(). + */ +int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len) +{ + int proto = ac->key ? CEPH_AUTH_CEPHX : CEPH_AUTH_NONE; + void *end = buf + buf_len; + void *lenp; + void *p; + int ret; + + mutex_lock(&ac->mutex); + if (ac->protocol == CEPH_AUTH_UNKNOWN) { + ret = init_protocol(ac, proto); + if (ret) { + pr_err("auth protocol '%s' init failed: %d\n", + ceph_auth_proto_name(proto), ret); + goto out; + } + } else { + WARN_ON(ac->protocol != proto); + ac->ops->reset(ac); + } + + p = buf; + ceph_encode_32_safe(&p, end, ac->protocol, e_range); + ret = encode_con_modes(&p, end, ac->preferred_mode, ac->fallback_mode); + if (ret) + goto out; + + lenp = p; + p += 4; /* space for len */ + + ceph_encode_8_safe(&p, end, CEPH_AUTH_MODE_MON, e_range); + ret = ceph_auth_entity_name_encode(ac->name, &p, end); + if (ret) + goto out; + + ceph_encode_64_safe(&p, end, ac->global_id, e_range); + ceph_encode_32(&lenp, p - lenp - 4); + ret = p - buf; + +out: + mutex_unlock(&ac->mutex); + return ret; + +e_range: + ret = -ERANGE; + goto out; +} + +int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply, + int reply_len, void *buf, int buf_len) +{ + int 
ret; + + mutex_lock(&ac->mutex); + ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len, + NULL, NULL, NULL, NULL); + if (ret == -EAGAIN) + ret = build_request(ac, false, buf, buf_len); + else + WARN_ON(ret >= 0); + mutex_unlock(&ac->mutex); + return ret; +} + +int ceph_auth_handle_reply_done(struct ceph_auth_client *ac, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + int ret; + + mutex_lock(&ac->mutex); + if (global_id && ac->global_id != global_id) { + dout("%s global_id %llu -> %llu\n", __func__, ac->global_id, + global_id); + ac->global_id = global_id; + } + + ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len, + session_key, session_key_len, + con_secret, con_secret_len); + mutex_unlock(&ac->mutex); + return ret; +} + +bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + mutex_lock(&ac->mutex); + WARN_ON(used_proto != ac->protocol); + + if (result == -EOPNOTSUPP) { + if (!contains(allowed_protos, proto_cnt, ac->protocol)) { + pr_err("auth protocol '%s' not allowed\n", + ceph_auth_proto_name(ac->protocol)); + goto not_allowed; + } + if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) && + (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN || + !contains(allowed_modes, mode_cnt, ac->fallback_mode))) { + pr_err("preferred mode '%s' not allowed\n", + ceph_con_mode_name(ac->preferred_mode)); + if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN) + pr_err("no fallback mode\n"); + else + pr_err("fallback mode '%s' not allowed\n", + ceph_con_mode_name(ac->fallback_mode)); + goto not_allowed; + } + } + + WARN_ON(result == -EOPNOTSUPP || result >= 0); + pr_err("auth protocol '%s' msgr authentication failed: %d\n", + ceph_auth_proto_name(ac->protocol), result); + + mutex_unlock(&ac->mutex); + return true; + +not_allowed: + mutex_unlock(&ac->mutex); + 
return false; +} + +int ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, void *buf, int *buf_len) +{ + void *end = buf + *buf_len; + int pref_mode, fallb_mode; + int proto; + void *p; + int ret; + + ret = __ceph_auth_get_authorizer(ac, auth, peer_type, true, &proto, + &pref_mode, &fallb_mode); + if (ret) + return ret; + + p = buf; + ceph_encode_32_safe(&p, end, proto, e_range); + ret = encode_con_modes(&p, end, pref_mode, fallb_mode); + if (ret) + return ret; + + ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range); + *buf_len = p - buf; + return 0; + +e_range: + return -ERANGE; +} +EXPORT_SYMBOL(ceph_auth_get_authorizer); + +int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + void *buf, int *buf_len) +{ + void *end = buf + *buf_len; + void *p; + int ret; + + ret = ceph_auth_add_authorizer_challenge(ac, auth->authorizer, + reply, reply_len); + if (ret) + return ret; + + p = buf; + ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range); + *buf_len = p - buf; + return 0; + +e_range: + return -ERANGE; +} +EXPORT_SYMBOL(ceph_auth_handle_svc_reply_more); + +int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + reply, reply_len, session_key, session_key_len, + con_secret, con_secret_len); +} +EXPORT_SYMBOL(ceph_auth_handle_svc_reply_done); + +bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac, + int peer_type, int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + mutex_lock(&ac->mutex); + WARN_ON(used_proto != ac->protocol); + + if (result == -EOPNOTSUPP) { + if (!contains(allowed_protos, proto_cnt, ac->protocol)) 
{ + pr_err("auth protocol '%s' not allowed by %s\n", + ceph_auth_proto_name(ac->protocol), + ceph_entity_type_name(peer_type)); + goto not_allowed; + } + if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) && + (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN || + !contains(allowed_modes, mode_cnt, ac->fallback_mode))) { + pr_err("preferred mode '%s' not allowed by %s\n", + ceph_con_mode_name(ac->preferred_mode), + ceph_entity_type_name(peer_type)); + if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN) + pr_err("no fallback mode\n"); + else + pr_err("fallback mode '%s' not allowed by %s\n", + ceph_con_mode_name(ac->fallback_mode), + ceph_entity_type_name(peer_type)); + goto not_allowed; + } + } + + WARN_ON(result == -EOPNOTSUPP || result >= 0); + pr_err("auth protocol '%s' authorization to %s failed: %d\n", + ceph_auth_proto_name(ac->protocol), + ceph_entity_type_name(peer_type), result); + + if (ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, peer_type); + + mutex_unlock(&ac->mutex); + return true; + +not_allowed: + mutex_unlock(&ac->mutex); + return false; +} +EXPORT_SYMBOL(ceph_auth_handle_bad_authorizer); diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index edb7042479ed..70e86e462250 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -70,7 +70,9 @@ static int build_request(struct ceph_auth_client *ac, void *buf, void *end) * authenticate state, so nothing happens here. 
*/ static int handle_reply(struct ceph_auth_client *ac, int result, - void *buf, void *end) + void *buf, void *end, u8 *session_key, + int *session_key_len, u8 *con_secret, + int *con_secret_len) { struct ceph_auth_none_info *xi = ac->private; @@ -116,7 +118,6 @@ static int ceph_auth_none_create_authorizer( } static const struct ceph_auth_client_ops ceph_auth_none_ops = { - .name = "none", .reset = reset, .destroy = destroy, .is_authenticated = is_authenticated, diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index b52732337ca6..ca44c327bace 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -22,12 +22,15 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); static int ceph_x_is_authenticated(struct ceph_auth_client *ac) { struct ceph_x_info *xi = ac->private; - int need; + int missing; + int need; /* missing + need renewal */ ceph_x_validate_tickets(ac, &need); - dout("ceph_x_is_authenticated want=%d need=%d have=%d\n", - ac->want_keys, need, xi->have_keys); - return (ac->want_keys & xi->have_keys) == ac->want_keys; + missing = ac->want_keys & ~xi->have_keys; + WARN_ON((need & missing) != missing); + dout("%s want 0x%x have 0x%x missing 0x%x -> %d\n", __func__, + ac->want_keys, xi->have_keys, missing, !missing); + return !missing; } static int ceph_x_should_authenticate(struct ceph_auth_client *ac) @@ -36,9 +39,9 @@ static int ceph_x_should_authenticate(struct ceph_auth_client *ac) int need; ceph_x_validate_tickets(ac, &need); - dout("ceph_x_should_authenticate want=%d need=%d have=%d\n", - ac->want_keys, need, xi->have_keys); - return need != 0; + dout("%s want 0x%x have 0x%x need 0x%x -> %d\n", __func__, + ac->want_keys, xi->have_keys, need, !!need); + return !!need; } static int ceph_x_encrypt_offset(void) @@ -197,7 +200,7 @@ static int process_one_ticket(struct ceph_auth_client *ac, dout(" decrypted %d bytes\n", ret); dend = dp + ret; - tkt_struct_v = ceph_decode_8(&dp); + ceph_decode_8_safe(&dp, dend, tkt_struct_v, bad); 
if (tkt_struct_v != 1) goto bad; @@ -205,6 +208,7 @@ static int process_one_ticket(struct ceph_auth_client *ac, if (ret) goto out; + ceph_decode_need(&dp, dend, sizeof(struct ceph_timespec), bad); ceph_decode_timespec64(&validity, dp); dp += sizeof(struct ceph_timespec); new_expires = ktime_get_real_seconds() + validity.tv_sec; @@ -265,22 +269,21 @@ out: static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, struct ceph_crypto_key *secret, - void *buf, void *end) + void **p, void *end) { - void *p = buf; u8 reply_struct_v; u32 num; int ret; - ceph_decode_8_safe(&p, end, reply_struct_v, bad); + ceph_decode_8_safe(p, end, reply_struct_v, bad); if (reply_struct_v != 1) return -EINVAL; - ceph_decode_32_safe(&p, end, num, bad); + ceph_decode_32_safe(p, end, num, bad); dout("%d tickets\n", num); while (num--) { - ret = process_one_ticket(ac, secret, &p, end); + ret = process_one_ticket(ac, secret, p, end); if (ret) return ret; } @@ -379,6 +382,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, } } au->service = th->service; + WARN_ON(!th->secret_id); au->secret_id = th->secret_id; msg_a = au->buf->vec.iov_base; @@ -442,9 +446,10 @@ static bool need_key(struct ceph_x_ticket_handler *th) static bool have_key(struct ceph_x_ticket_handler *th) { - if (th->have_key) { - if (ktime_get_real_seconds() >= th->expires) - th->have_key = false; + if (th->have_key && ktime_get_real_seconds() >= th->expires) { + dout("ticket %d (%s) secret_id %llu expired\n", th->service, + ceph_entity_type_name(th->service), th->secret_id); + th->have_key = false; } return th->have_key; @@ -486,6 +491,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, struct ceph_x_info *xi = ac->private; int need; struct ceph_x_request_header *head = buf; + void *p; int ret; struct ceph_x_ticket_handler *th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); @@ -494,18 +500,17 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, return PTR_ERR(th); 
ceph_x_validate_tickets(ac, &need); - - dout("build_request want %x have %x need %x\n", - ac->want_keys, xi->have_keys, need); + dout("%s want 0x%x have 0x%x need 0x%x\n", __func__, ac->want_keys, + xi->have_keys, need); if (need & CEPH_ENTITY_TYPE_AUTH) { struct ceph_x_authenticate *auth = (void *)(head + 1); - void *p = auth + 1; void *enc_buf = xi->auth_authorizer.enc_buf; struct ceph_x_challenge_blob *blob = enc_buf + ceph_x_encrypt_offset(); u64 *u; + p = auth + 1; if (p > end) return -ERANGE; @@ -521,7 +526,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, if (ret < 0) return ret; - auth->struct_v = 1; + auth->struct_v = 2; /* nautilus+ */ auth->key = 0; for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++) auth->key ^= *(__le64 *)u; @@ -534,39 +539,137 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, if (ret < 0) return ret; + /* nautilus+: request service tickets at the same time */ + need = ac->want_keys & ~CEPH_ENTITY_TYPE_AUTH; + WARN_ON(!need); + ceph_encode_32_safe(&p, end, need, e_range); return p - buf; } if (need) { - void *p = head + 1; - struct ceph_x_service_ticket_request *req; - - if (p > end) - return -ERANGE; - head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); - + dout(" get_principal_session_key\n"); ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); if (ret) return ret; - ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base, - xi->auth_authorizer.buf->vec.iov_len); - req = p; - req->keys = cpu_to_le32(need); - p += sizeof(*req); + p = buf; + ceph_encode_16_safe(&p, end, CEPHX_GET_PRINCIPAL_SESSION_KEY, + e_range); + ceph_encode_copy_safe(&p, end, + xi->auth_authorizer.buf->vec.iov_base, + xi->auth_authorizer.buf->vec.iov_len, e_range); + ceph_encode_8_safe(&p, end, 1, e_range); + ceph_encode_32_safe(&p, end, need, e_range); return p - buf; } return 0; + +e_range: + return -ERANGE; +} + +static int decode_con_secret(void **p, void *end, u8 *con_secret, + int *con_secret_len) +{ + 
int len; + + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_need(p, end, len, bad); + + dout("%s len %d\n", __func__, len); + if (con_secret) { + if (len > CEPH_MAX_CON_SECRET_LEN) { + pr_err("connection secret too big %d\n", len); + goto bad_memzero; + } + memcpy(con_secret, *p, len); + *con_secret_len = len; + } + memzero_explicit(*p, len); + *p += len; + return 0; + +bad_memzero: + memzero_explicit(*p, len); +bad: + pr_err("failed to decode connection secret\n"); + return -EINVAL; +} + +static int handle_auth_session_key(struct ceph_auth_client *ac, + void **p, void *end, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_x_info *xi = ac->private; + struct ceph_x_ticket_handler *th; + void *dp, *dend; + int len; + int ret; + + /* AUTH ticket */ + ret = ceph_x_proc_ticket_reply(ac, &xi->secret, p, end); + if (ret) + return ret; + + if (*p == end) { + /* pre-nautilus (or didn't request service tickets!) */ + WARN_ON(session_key || con_secret); + return 0; + } + + th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); + if (IS_ERR(th)) + return PTR_ERR(th); + + if (session_key) { + memcpy(session_key, th->session_key.key, th->session_key.len); + *session_key_len = th->session_key.len; + } + + /* connection secret */ + ceph_decode_32_safe(p, end, len, e_inval); + dout("%s connection secret blob len %d\n", __func__, len); + if (len > 0) { + dp = *p + ceph_x_encrypt_offset(); + ret = ceph_x_decrypt(&th->session_key, p, *p + len); + if (ret < 0) + return ret; + + dout("%s decrypted %d bytes\n", __func__, ret); + dend = dp + ret; + + ret = decode_con_secret(&dp, dend, con_secret, con_secret_len); + if (ret) + return ret; + } + + /* service tickets */ + ceph_decode_32_safe(p, end, len, e_inval); + dout("%s service tickets blob len %d\n", __func__, len); + if (len > 0) { + ret = ceph_x_proc_ticket_reply(ac, &th->session_key, + p, *p + len); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; } static 
int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, - void *buf, void *end) + void *buf, void *end, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { struct ceph_x_info *xi = ac->private; - struct ceph_x_reply_header *head = buf; struct ceph_x_ticket_handler *th; int len = end - buf; + void *p; int op; int ret; @@ -587,22 +690,25 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, return -EAGAIN; } - op = le16_to_cpu(head->op); - result = le32_to_cpu(head->result); + p = buf; + ceph_decode_16_safe(&p, end, op, e_inval); + ceph_decode_32_safe(&p, end, result, e_inval); dout("handle_reply op %d result %d\n", op, result); switch (op) { case CEPHX_GET_AUTH_SESSION_KEY: - /* verify auth key */ - ret = ceph_x_proc_ticket_reply(ac, &xi->secret, - buf + sizeof(*head), end); + /* AUTH ticket + [connection secret] + service tickets */ + ret = handle_auth_session_key(ac, &p, end, session_key, + session_key_len, con_secret, + con_secret_len); break; case CEPHX_GET_PRINCIPAL_SESSION_KEY: th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); if (IS_ERR(th)) return PTR_ERR(th); - ret = ceph_x_proc_ticket_reply(ac, &th->session_key, - buf + sizeof(*head), end); + + /* service tickets */ + ret = ceph_x_proc_ticket_reply(ac, &th->session_key, &p, end); break; default: @@ -613,6 +719,9 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, if (ac->want_keys == xi->have_keys) return 0; return -EAGAIN; + +e_inval: + return -EINVAL; } static void ceph_x_destroy_authorizer(struct ceph_authorizer *a) @@ -678,40 +787,44 @@ static int ceph_x_update_authorizer( return 0; } -static int decrypt_authorize_challenge(struct ceph_x_authorizer *au, - void *challenge_buf, - int challenge_buf_len, - u64 *server_challenge) +/* + * CephXAuthorizeChallenge + */ +static int decrypt_authorizer_challenge(struct ceph_crypto_key *secret, + void *challenge, int challenge_len, + u64 *server_challenge) { - struct 
ceph_x_authorize_challenge *ch = - challenge_buf + sizeof(struct ceph_x_encrypt_header); + void *dp, *dend; int ret; /* no leading len */ - ret = __ceph_x_decrypt(&au->session_key, challenge_buf, - challenge_buf_len); + ret = __ceph_x_decrypt(secret, challenge, challenge_len); if (ret < 0) return ret; - if (ret < sizeof(*ch)) { - pr_err("bad size %d for ceph_x_authorize_challenge\n", ret); - return -EINVAL; - } - *server_challenge = le64_to_cpu(ch->server_challenge); + dout("%s decrypted %d bytes\n", __func__, ret); + dp = challenge + sizeof(struct ceph_x_encrypt_header); + dend = dp + ret; + + ceph_decode_skip_8(&dp, dend, e_inval); /* struct_v */ + ceph_decode_64_safe(&dp, dend, *server_challenge, e_inval); + dout("%s server_challenge %llu\n", __func__, *server_challenge); return 0; + +e_inval: + return -EINVAL; } static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, - void *challenge_buf, - int challenge_buf_len) + void *challenge, int challenge_len) { struct ceph_x_authorizer *au = (void *)a; u64 server_challenge; int ret; - ret = decrypt_authorize_challenge(au, challenge_buf, challenge_buf_len, - &server_challenge); + ret = decrypt_authorizer_challenge(&au->session_key, challenge, + challenge_len, &server_challenge); if (ret) { pr_err("failed to decrypt authorize challenge: %d", ret); return ret; @@ -726,29 +839,67 @@ static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac, return 0; } +/* + * CephXAuthorizeReply + */ +static int decrypt_authorizer_reply(struct ceph_crypto_key *secret, + void **p, void *end, u64 *nonce_plus_one, + u8 *con_secret, int *con_secret_len) +{ + void *dp, *dend; + u8 struct_v; + int ret; + + dp = *p + ceph_x_encrypt_offset(); + ret = ceph_x_decrypt(secret, p, end); + if (ret < 0) + return ret; + + dout("%s decrypted %d bytes\n", __func__, ret); + dend = dp + ret; + + ceph_decode_8_safe(&dp, dend, struct_v, e_inval); + ceph_decode_64_safe(&dp, dend, *nonce_plus_one, e_inval); 
+ dout("%s nonce_plus_one %llu\n", __func__, *nonce_plus_one); + if (struct_v >= 2) { + ret = decode_con_secret(&dp, dend, con_secret, con_secret_len); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a) + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { struct ceph_x_authorizer *au = (void *)a; - void *p = au->enc_buf; - struct ceph_x_authorize_reply *reply = p + ceph_x_encrypt_offset(); + u64 nonce_plus_one; int ret; - ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN); - if (ret < 0) + if (session_key) { + memcpy(session_key, au->session_key.key, au->session_key.len); + *session_key_len = au->session_key.len; + } + + ret = decrypt_authorizer_reply(&au->session_key, &reply, + reply + reply_len, &nonce_plus_one, + con_secret, con_secret_len); + if (ret) return ret; - if (ret < sizeof(*reply)) { - pr_err("bad size %d for ceph_x_authorize_reply\n", ret); - return -EINVAL; + + if (nonce_plus_one != au->nonce + 1) { + pr_err("failed to authenticate server\n"); + return -EPERM; } - if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one)) - ret = -EPERM; - else - ret = 0; - dout("verify_authorizer_reply nonce %llx got %llx ret %d\n", - au->nonce, le64_to_cpu(reply->nonce_plus_one), ret); - return ret; + return 0; } static void ceph_x_reset(struct ceph_auth_client *ac) @@ -785,8 +936,15 @@ static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type) struct ceph_x_ticket_handler *th; th = get_ticket_handler(ac, peer_type); - if (!IS_ERR(th)) + if (IS_ERR(th)) + return; + + if (th->have_key) { + dout("ticket %d (%s) secret_id %llu invalidated\n", + th->service, ceph_entity_type_name(th->service), + th->secret_id); th->have_key = false; + } } static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, @@ -911,7 +1069,6 @@ 
static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth, } static const struct ceph_auth_client_ops ceph_x_ops = { - .name = "x", .is_authenticated = ceph_x_is_authenticated, .should_authenticate = ceph_x_should_authenticate, .build_request = ceph_x_build_request, diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h index 24b0b74564d0..792fcb974dc3 100644 --- a/net/ceph/auth_x_protocol.h +++ b/net/ceph/auth_x_protocol.h @@ -38,7 +38,8 @@ struct ceph_x_authenticate { __u8 struct_v; __le64 client_challenge; __le64 key; - /* ticket blob */ + /* old_ticket blob */ + /* nautilus+: other_keys */ } __attribute__ ((packed)); struct ceph_x_service_ticket_request { diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4e7edd707a14..271287c5ec12 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -265,6 +265,7 @@ enum { Opt_ip, Opt_crush_location, Opt_read_from_replica, + Opt_ms_mode, /* string args above */ Opt_share, Opt_crc, @@ -287,6 +288,23 @@ static const struct constant_table ceph_param_read_from_replica[] = { {} }; +enum ceph_ms_mode { + Opt_ms_mode_legacy, + Opt_ms_mode_crc, + Opt_ms_mode_secure, + Opt_ms_mode_prefer_crc, + Opt_ms_mode_prefer_secure +}; + +static const struct constant_table ceph_param_ms_mode[] = { + {"legacy", Opt_ms_mode_legacy}, + {"crc", Opt_ms_mode_crc}, + {"secure", Opt_ms_mode_secure}, + {"prefer-crc", Opt_ms_mode_prefer_crc}, + {"prefer-secure", Opt_ms_mode_prefer_secure}, + {} +}; + static const struct fs_parameter_spec ceph_parameters[] = { fsparam_flag ("abort_on_full", Opt_abort_on_full), fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures), @@ -305,6 +323,8 @@ static const struct fs_parameter_spec ceph_parameters[] = { fs_param_deprecated, NULL), fsparam_enum ("read_from_replica", Opt_read_from_replica, ceph_param_read_from_replica), + fsparam_enum ("ms_mode", Opt_ms_mode, + ceph_param_ms_mode), fsparam_string ("secret", Opt_secret), fsparam_flag_no 
("share", Opt_share), fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay), @@ -333,6 +353,8 @@ struct ceph_options *ceph_alloc_options(void) opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT; opt->read_from_replica = CEPH_READ_FROM_REPLICA_DEFAULT; + opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; return opt; } EXPORT_SYMBOL(ceph_alloc_options); @@ -503,6 +525,32 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, BUG(); } break; + case Opt_ms_mode: + switch (result.uint_32) { + case Opt_ms_mode_legacy: + opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_crc: + opt->con_modes[0] = CEPH_CON_MODE_CRC; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_secure: + opt->con_modes[0] = CEPH_CON_MODE_SECURE; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_prefer_crc: + opt->con_modes[0] = CEPH_CON_MODE_CRC; + opt->con_modes[1] = CEPH_CON_MODE_SECURE; + break; + case Opt_ms_mode_prefer_secure: + opt->con_modes[0] = CEPH_CON_MODE_SECURE; + opt->con_modes[1] = CEPH_CON_MODE_CRC; + break; + default: + BUG(); + } + break; case Opt_osdtimeout: warn_plog(&log, "Ignoring osdtimeout"); @@ -616,6 +664,21 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, } else if (opt->read_from_replica == CEPH_OSD_FLAG_LOCALIZE_READS) { seq_puts(m, "read_from_replica=localize,"); } + if (opt->con_modes[0] != CEPH_CON_MODE_UNKNOWN) { + if (opt->con_modes[0] == CEPH_CON_MODE_CRC && + opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) { + seq_puts(m, "ms_mode=crc,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE && + opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) { + seq_puts(m, "ms_mode=secure,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_CRC && + opt->con_modes[1] == CEPH_CON_MODE_SECURE) { + seq_puts(m, "ms_mode=prefer-crc,"); + } 
else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE && + opt->con_modes[1] == CEPH_CON_MODE_CRC) { + seq_puts(m, "ms_mode=prefer-secure,"); + } + } if (opt->flags & CEPH_OPT_FSID) seq_printf(m, "fsid=%pU,", &opt->fsid); diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 10e01494993c..355fea272120 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -18,6 +18,34 @@ const char *ceph_entity_type_name(int type) } EXPORT_SYMBOL(ceph_entity_type_name); +const char *ceph_auth_proto_name(int proto) +{ + switch (proto) { + case CEPH_AUTH_UNKNOWN: + return "unknown"; + case CEPH_AUTH_NONE: + return "none"; + case CEPH_AUTH_CEPHX: + return "cephx"; + default: + return "???"; + } +} + +const char *ceph_con_mode_name(int mode) +{ + switch (mode) { + case CEPH_CON_MODE_UNKNOWN: + return "unknown"; + case CEPH_CON_MODE_CRC: + return "crc"; + case CEPH_CON_MODE_SECURE: + return "secure"; + default: + return "???"; + } +} + const char *ceph_osd_op_name(int op) { switch (op) { diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 4f75df40fb12..92d89b331645 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -96,6 +96,7 @@ int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end) key->len = ceph_decode_16(p); ceph_decode_need(p, end, key->len, bad); ret = set_secret(key, *p); + memzero_explicit(*p, key->len); *p += key->len; return ret; @@ -134,7 +135,7 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey) void ceph_crypto_key_destroy(struct ceph_crypto_key *key) { if (key) { - kfree(key->key); + kfree_sensitive(key->key); key->key = NULL; if (key->tfm) { crypto_free_sync_skcipher(key->tfm); diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index 96ef4d860bc9..13bd526349fa 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -5,6 +5,9 @@ #include <linux/ceph/types.h> #include <linux/ceph/buffer.h> +#define CEPH_KEY_LEN 16 +#define CEPH_MAX_CON_SECRET_LEN 64 + /* * cryptographic secret */ diff 
--git a/net/ceph/decode.c b/net/ceph/decode.c index eea529595a7a..b44f7651be04 100644 --- a/net/ceph/decode.c +++ b/net/ceph/decode.c @@ -1,4 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> + +#include <linux/inet.h> #include <linux/ceph/decode.h> @@ -82,3 +85,101 @@ bad: } EXPORT_SYMBOL(ceph_decode_entity_addr); +/* + * Return addr of desired type (MSGR2 or LEGACY) or error. + * Make sure there is only one match. + * + * Assume encoding with MSG_ADDR2. + */ +int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2, + struct ceph_entity_addr *addr) +{ + __le32 my_type = msgr2 ? CEPH_ENTITY_ADDR_TYPE_MSGR2 : + CEPH_ENTITY_ADDR_TYPE_LEGACY; + struct ceph_entity_addr tmp_addr; + int addr_cnt; + bool found; + u8 marker; + int ret; + int i; + + ceph_decode_8_safe(p, end, marker, e_inval); + if (marker != 2) { + pr_err("bad addrvec marker %d\n", marker); + return -EINVAL; + } + + ceph_decode_32_safe(p, end, addr_cnt, e_inval); + + found = false; + for (i = 0; i < addr_cnt; i++) { + ret = ceph_decode_entity_addr(p, end, &tmp_addr); + if (ret) + return ret; + + if (tmp_addr.type == my_type) { + if (found) { + pr_err("another match of type %d in addrvec\n", + le32_to_cpu(my_type)); + return -EINVAL; + } + + memcpy(addr, &tmp_addr, sizeof(*addr)); + found = true; + } + } + if (!found && addr_cnt != 0) { + pr_err("no match of type %d in addrvec\n", + le32_to_cpu(my_type)); + return -ENOENT; + } + + return 0; + +e_inval: + return -EINVAL; +} +EXPORT_SYMBOL(ceph_decode_entity_addrvec); + +static int get_sockaddr_encoding_len(sa_family_t family) +{ + union { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + } u; + + switch (family) { + case AF_INET: + return sizeof(u.sin); + case AF_INET6: + return sizeof(u.sin6); + default: + return sizeof(u); + } +} + +int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr) +{ + sa_family_t family = get_unaligned(&addr->in_addr.ss_family); + int addr_len = 
get_sockaddr_encoding_len(family); + + return 1 + CEPH_ENCODING_START_BLK_LEN + 4 + 4 + 4 + addr_len; +} + +void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr) +{ + sa_family_t family = get_unaligned(&addr->in_addr.ss_family); + int addr_len = get_sockaddr_encoding_len(family); + + ceph_encode_8(p, 1); /* marker */ + ceph_start_encoding(p, 1, 1, sizeof(addr->type) + + sizeof(addr->nonce) + + sizeof(u32) + addr_len); + ceph_encode_copy(p, &addr->type, sizeof(addr->type)); + ceph_encode_copy(p, &addr->nonce, sizeof(addr->nonce)); + + ceph_encode_32(p, addr_len); + ceph_encode_16(p, family); + ceph_encode_copy(p, addr->in_addr.__data, addr_len - sizeof(family)); +} diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index af0f1fa24937..57d043b382ed 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -82,71 +82,51 @@ #define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ #define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ -/* - * connection states - */ -#define CON_STATE_CLOSED 1 /* -> PREOPEN */ -#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */ -#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */ -#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */ -#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */ -#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */ - -/* - * ceph_connection flag bits - */ -#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop - * messages on errors */ -#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ -#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */ -#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ -#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ - static bool con_flag_valid(unsigned long con_flag) { switch (con_flag) { - case CON_FLAG_LOSSYTX: - case CON_FLAG_KEEPALIVE_PENDING: - case CON_FLAG_WRITE_PENDING: - case CON_FLAG_SOCK_CLOSED: - case CON_FLAG_BACKOFF: + case CEPH_CON_F_LOSSYTX: + case 
CEPH_CON_F_KEEPALIVE_PENDING: + case CEPH_CON_F_WRITE_PENDING: + case CEPH_CON_F_SOCK_CLOSED: + case CEPH_CON_F_BACKOFF: return true; default: return false; } } -static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag) +void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); clear_bit(con_flag, &con->flags); } -static void con_flag_set(struct ceph_connection *con, unsigned long con_flag) +void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); set_bit(con_flag, &con->flags); } -static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag) +bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_bit(con_flag, &con->flags); } -static bool con_flag_test_and_clear(struct ceph_connection *con, - unsigned long con_flag) +bool ceph_con_flag_test_and_clear(struct ceph_connection *con, + unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_and_clear_bit(con_flag, &con->flags); } -static bool con_flag_test_and_set(struct ceph_connection *con, - unsigned long con_flag) +bool ceph_con_flag_test_and_set(struct ceph_connection *con, + unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); @@ -157,12 +137,6 @@ static bool con_flag_test_and_set(struct ceph_connection *con, static struct kmem_cache *ceph_msg_cache; -/* static tag bytes (protocol control messages) */ -static char tag_msg = CEPH_MSGR_TAG_MSG; -static char tag_ack = CEPH_MSGR_TAG_ACK; -static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; -static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; - #ifdef CONFIG_LOCKDEP static struct lock_class_key socket_class; #endif @@ -184,7 +158,7 @@ static void con_fault(struct ceph_connection *con); static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; static atomic_t addr_str_seq = ATOMIC_INIT(0); -static struct page *zero_page; /* 
used in certain error cases */ +struct page *ceph_zero_page; /* used in certain error cases */ const char *ceph_pr_addr(const struct ceph_entity_addr *addr) { @@ -219,10 +193,13 @@ const char *ceph_pr_addr(const struct ceph_entity_addr *addr) } EXPORT_SYMBOL(ceph_pr_addr); -static void encode_my_addr(struct ceph_messenger *msgr) +void ceph_encode_my_addr(struct ceph_messenger *msgr) { - memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); - ceph_encode_banner_addr(&msgr->my_enc_addr); + if (!ceph_msgr2(from_msgr(msgr))) { + memcpy(&msgr->my_enc_addr, &msgr->inst.addr, + sizeof(msgr->my_enc_addr)); + ceph_encode_banner_addr(&msgr->my_enc_addr); + } } /* @@ -254,9 +231,9 @@ static void _ceph_msgr_exit(void) ceph_msgr_wq = NULL; } - BUG_ON(zero_page == NULL); - put_page(zero_page); - zero_page = NULL; + BUG_ON(!ceph_zero_page); + put_page(ceph_zero_page); + ceph_zero_page = NULL; ceph_msgr_slab_exit(); } @@ -266,9 +243,9 @@ int __init ceph_msgr_init(void) if (ceph_msgr_slab_init()) return -ENOMEM; - BUG_ON(zero_page != NULL); - zero_page = ZERO_PAGE(0); - get_page(zero_page); + BUG_ON(ceph_zero_page); + ceph_zero_page = ZERO_PAGE(0); + get_page(ceph_zero_page); /* * The number of active work items is limited by the number of @@ -372,7 +349,7 @@ static void ceph_sock_data_ready(struct sock *sk) } if (sk->sk_state != TCP_CLOSE_WAIT) { - dout("%s on %p state = %lu, queueing work\n", __func__, + dout("%s %p state = %d, queueing work\n", __func__, con, con->state); queue_con(con); } @@ -390,7 +367,7 @@ static void ceph_sock_write_space(struct sock *sk) * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). 
*/ - if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) { + if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { if (sk_stream_is_writeable(sk)) { dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); @@ -406,7 +383,7 @@ static void ceph_sock_state_change(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; - dout("%s %p state = %lu sk_state = %u\n", __func__, + dout("%s %p state = %d sk_state = %u\n", __func__, con, con->state, sk->sk_state); switch (sk->sk_state) { @@ -416,7 +393,7 @@ static void ceph_sock_state_change(struct sock *sk) case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); - con_flag_set(con, CON_FLAG_SOCK_CLOSED); + ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED); queue_con(con); break; case TCP_ESTABLISHED: @@ -450,13 +427,15 @@ static void set_sock_callbacks(struct socket *sock, /* * initiate connection to a remote socket. */ -static int ceph_tcp_connect(struct ceph_connection *con) +int ceph_tcp_connect(struct ceph_connection *con) { struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */ struct socket *sock; unsigned int noio_flag; int ret; + dout("%s con %p peer_addr %s\n", __func__, con, + ceph_pr_addr(&con->peer_addr)); BUG_ON(con->sock); /* sock_create_kern() allocates with GFP_KERNEL */ @@ -474,8 +453,6 @@ static int ceph_tcp_connect(struct ceph_connection *con) set_sock_callbacks(sock, con); - dout("connect %s\n", ceph_pr_addr(&con->peer_addr)); - con_sock_state_connecting(con); ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss), O_NONBLOCK); @@ -498,103 +475,13 @@ static int ceph_tcp_connect(struct ceph_connection *con) } /* - * If @buf is NULL, discard up to @len bytes. 
- */ -static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) -{ - struct kvec iov = {buf, len}; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - if (!buf) - msg.msg_flags |= MSG_TRUNC; - - iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len); - r = sock_recvmsg(sock, &msg, msg.msg_flags); - if (r == -EAGAIN) - r = 0; - return r; -} - -static int ceph_tcp_recvpage(struct socket *sock, struct page *page, - int page_offset, size_t length) -{ - struct bio_vec bvec = { - .bv_page = page, - .bv_offset = page_offset, - .bv_len = length - }; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - BUG_ON(page_offset + length > PAGE_SIZE); - iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length); - r = sock_recvmsg(sock, &msg, msg.msg_flags); - if (r == -EAGAIN) - r = 0; - return r; -} - -/* - * write something. @more is true if caller will be sending more data - * shortly. - */ -static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, - size_t kvlen, size_t len, bool more) -{ - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - if (more) - msg.msg_flags |= MSG_MORE; - else - msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ - - r = kernel_sendmsg(sock, &msg, iov, kvlen, len); - if (r == -EAGAIN) - r = 0; - return r; -} - -/* - * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST - */ -static int ceph_tcp_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int more) -{ - ssize_t (*sendpage)(struct socket *sock, struct page *page, - int offset, size_t size, int flags); - int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; - int ret; - - /* - * sendpage cannot properly handle pages with page_count == 0, - * we need to fall back to sendmsg if that's the case. - * - * Same goes for slab pages: skb_can_coalesce() allows - * coalescing neighboring slab objects into a single frag which - * triggers one of hardened usercopy checks. 
- */ - if (sendpage_ok(page)) - sendpage = sock->ops->sendpage; - else - sendpage = sock_no_sendpage; - - ret = sendpage(sock, page, offset, size, flags); - if (ret == -EAGAIN) - ret = 0; - - return ret; -} - -/* * Shutdown/close the socket for the given connection. */ -static int con_close_socket(struct ceph_connection *con) +int ceph_con_close_socket(struct ceph_connection *con) { int rc = 0; - dout("con_close_socket on %p sock %p\n", con, con->sock); + dout("%s con %p sock %p\n", __func__, con, con->sock); if (con->sock) { rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); sock_release(con->sock); @@ -607,12 +494,34 @@ static int con_close_socket(struct ceph_connection *con) * received a socket close event before we had the chance to * shut the socket down. */ - con_flag_clear(con, CON_FLAG_SOCK_CLOSED); + ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED); con_sock_state_closed(con); return rc; } +static void ceph_con_reset_protocol(struct ceph_connection *con) +{ + dout("%s con %p\n", __func__, con); + + ceph_con_close_socket(con); + if (con->in_msg) { + WARN_ON(con->in_msg->con != con); + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + } + if (con->out_msg) { + WARN_ON(con->out_msg->con != con); + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } + + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_reset_protocol(con); + else + ceph_con_v1_reset_protocol(con); +} + /* * Reset a connection. Discard all incoming and outgoing messages * and clear *_seq state. 
@@ -623,6 +532,7 @@ static void ceph_msg_remove(struct ceph_msg *msg) ceph_msg_put(msg); } + static void ceph_msg_remove_list(struct list_head *head) { while (!list_empty(head)) { @@ -632,31 +542,22 @@ static void ceph_msg_remove_list(struct list_head *head) } } -static void reset_connection(struct ceph_connection *con) +void ceph_con_reset_session(struct ceph_connection *con) { - /* reset connection, out_queue, msg_ and connect_seq */ - /* discard existing out_queue and msg_seq */ - dout("reset_connection %p\n", con); + dout("%s con %p\n", __func__, con); + + WARN_ON(con->in_msg); + WARN_ON(con->out_msg); ceph_msg_remove_list(&con->out_queue); ceph_msg_remove_list(&con->out_sent); - - if (con->in_msg) { - BUG_ON(con->in_msg->con != con); - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - } - - con->connect_seq = 0; con->out_seq = 0; - if (con->out_msg) { - BUG_ON(con->out_msg->con != con); - ceph_msg_put(con->out_msg); - con->out_msg = NULL; - } con->in_seq = 0; con->in_seq_acked = 0; - con->out_skip = 0; + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_reset_session(con); + else + ceph_con_v1_reset_session(con); } /* @@ -666,17 +567,17 @@ void ceph_con_close(struct ceph_connection *con) { mutex_lock(&con->mutex); dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; - con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */ - con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING); - con_flag_clear(con, CON_FLAG_WRITE_PENDING); - con_flag_clear(con, CON_FLAG_BACKOFF); + ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next + connect */ + ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING); + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF); - reset_connection(con); - con->peer_global_seq = 0; + ceph_con_reset_protocol(con); + ceph_con_reset_session(con); cancel_con(con); - con_close_socket(con); 
mutex_unlock(&con->mutex); } EXPORT_SYMBOL(ceph_con_close); @@ -691,8 +592,8 @@ void ceph_con_open(struct ceph_connection *con, mutex_lock(&con->mutex); dout("con_open %p %s\n", con, ceph_pr_addr(addr)); - WARN_ON(con->state != CON_STATE_CLOSED); - con->state = CON_STATE_PREOPEN; + WARN_ON(con->state != CEPH_CON_S_CLOSED); + con->state = CEPH_CON_S_PREOPEN; con->peer_name.type = (__u8) entity_type; con->peer_name.num = cpu_to_le64(entity_num); @@ -709,7 +610,10 @@ EXPORT_SYMBOL(ceph_con_open); */ bool ceph_con_opened(struct ceph_connection *con) { - return con->connect_seq > 0; + if (ceph_msgr2(from_msgr(con->msgr))) + return ceph_con_v2_opened(con); + + return ceph_con_v1_opened(con); } /* @@ -732,16 +636,15 @@ void ceph_con_init(struct ceph_connection *con, void *private, INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, ceph_con_workfn); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; } EXPORT_SYMBOL(ceph_con_init); - /* * We maintain a global counter to order connection attempts. Get * a unique seq greater than @gt. */ -static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) +u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt) { u32 ret; @@ -753,48 +656,53 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) return ret; } -static void con_out_kvec_reset(struct ceph_connection *con) -{ - BUG_ON(con->out_skip); - - con->out_kvec_left = 0; - con->out_kvec_bytes = 0; - con->out_kvec_cur = &con->out_kvec[0]; -} - -static void con_out_kvec_add(struct ceph_connection *con, - size_t size, void *data) +/* + * Discard messages that have been acked by the server. 
+ */ +void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq) { - int index = con->out_kvec_left; + struct ceph_msg *msg; + u64 seq; - BUG_ON(con->out_skip); - BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); + dout("%s con %p ack_seq %llu\n", __func__, con, ack_seq); + while (!list_empty(&con->out_sent)) { + msg = list_first_entry(&con->out_sent, struct ceph_msg, + list_head); + WARN_ON(msg->needs_out_seq); + seq = le64_to_cpu(msg->hdr.seq); + if (seq > ack_seq) + break; - con->out_kvec[index].iov_len = size; - con->out_kvec[index].iov_base = data; - con->out_kvec_left++; - con->out_kvec_bytes += size; + dout("%s con %p discarding msg %p seq %llu\n", __func__, con, + msg, seq); + ceph_msg_remove(msg); + } } /* - * Chop off a kvec from the end. Return residual number of bytes for - * that kvec, i.e. how many bytes would have been written if the kvec - * hadn't been nuked. + * Discard messages that have been requeued in con_fault(), up to + * reconnect_seq. This avoids gratuitously resending messages that + * the server had received and handled prior to reconnect. 
*/ -static int con_out_kvec_skip(struct ceph_connection *con) +void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq) { - int off = con->out_kvec_cur - con->out_kvec; - int skip = 0; + struct ceph_msg *msg; + u64 seq; - if (con->out_kvec_bytes > 0) { - skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; - BUG_ON(con->out_kvec_bytes < skip); - BUG_ON(!con->out_kvec_left); - con->out_kvec_bytes -= skip; - con->out_kvec_left--; - } + dout("%s con %p reconnect_seq %llu\n", __func__, con, reconnect_seq); + while (!list_empty(&con->out_queue)) { + msg = list_first_entry(&con->out_queue, struct ceph_msg, + list_head); + if (msg->needs_out_seq) + break; + seq = le64_to_cpu(msg->hdr.seq); + if (seq > reconnect_seq) + break; - return skip; + dout("%s con %p discarding msg %p seq %llu\n", __func__, con, + msg, seq); + ceph_msg_remove(msg); + } } #ifdef CONFIG_BLOCK @@ -1113,10 +1021,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) cursor->need_crc = true; } -static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) +void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, + struct ceph_msg *msg, size_t length) { - struct ceph_msg_data_cursor *cursor = &msg->cursor; - BUG_ON(!length); BUG_ON(length > msg->data_length); BUG_ON(!msg->num_data_items); @@ -1132,9 +1039,9 @@ static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) * data item, and supply the page offset and length of that piece. * Indicate whether this is the last piece in this data item. 
*/ -static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, - size_t *page_offset, size_t *length, - bool *last_piece) +struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length, + bool *last_piece) { struct page *page; @@ -1173,8 +1080,7 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, * Returns true if the result moves the cursor on to the next piece * of the data item. */ -static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, - size_t bytes) +void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { bool new_piece; @@ -1210,328 +1116,8 @@ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, cursor->need_crc = new_piece; } -static size_t sizeof_footer(struct ceph_connection *con) -{ - return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? - sizeof(struct ceph_msg_footer) : - sizeof(struct ceph_msg_footer_old); -} - -static void prepare_message_data(struct ceph_msg *msg, u32 data_len) -{ - /* Initialize data cursor */ - - ceph_msg_data_cursor_init(msg, (size_t)data_len); -} - -/* - * Prepare footer for currently outgoing message, and finish things - * off. Assumes out_kvec* are already valid.. we just add on to the end. - */ -static void prepare_write_message_footer(struct ceph_connection *con) -{ - struct ceph_msg *m = con->out_msg; - - m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; - - dout("prepare_write_message_footer %p\n", con); - con_out_kvec_add(con, sizeof_footer(con), &m->footer); - if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { - if (con->ops->sign_message) - con->ops->sign_message(m); - else - m->footer.sig = 0; - } else { - m->old_footer.flags = m->footer.flags; - } - con->out_more = m->more_to_follow; - con->out_msg_done = true; -} - -/* - * Prepare headers for the next outgoing message. 
- */ -static void prepare_write_message(struct ceph_connection *con) -{ - struct ceph_msg *m; - u32 crc; - - con_out_kvec_reset(con); - con->out_msg_done = false; - - /* Sneak an ack in there first? If we can get it into the same - * TCP packet that's a good thing. */ - if (con->in_seq > con->in_seq_acked) { - con->in_seq_acked = con->in_seq; - con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - } - - BUG_ON(list_empty(&con->out_queue)); - m = list_first_entry(&con->out_queue, struct ceph_msg, list_head); - con->out_msg = m; - BUG_ON(m->con != con); - - /* put message on sent list */ - ceph_msg_get(m); - list_move_tail(&m->list_head, &con->out_sent); - - /* - * only assign outgoing seq # if we haven't sent this message - * yet. if it is requeued, resend with it's original seq. - */ - if (m->needs_out_seq) { - m->hdr.seq = cpu_to_le64(++con->out_seq); - m->needs_out_seq = false; - - if (con->ops->reencode_message) - con->ops->reencode_message(m); - } - - dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", - m, con->out_seq, le16_to_cpu(m->hdr.type), - le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), - m->data_length); - WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len)); - WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); - - /* tag + hdr + front + middle */ - con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); - con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); - - if (m->middle) - con_out_kvec_add(con, m->middle->vec.iov_len, - m->middle->vec.iov_base); - - /* fill in hdr crc and finalize hdr */ - crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); - con->out_msg->hdr.crc = cpu_to_le32(crc); - memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); - - /* fill in front and middle crc, footer */ - crc = 
crc32c(0, m->front.iov_base, m->front.iov_len); - con->out_msg->footer.front_crc = cpu_to_le32(crc); - if (m->middle) { - crc = crc32c(0, m->middle->vec.iov_base, - m->middle->vec.iov_len); - con->out_msg->footer.middle_crc = cpu_to_le32(crc); - } else - con->out_msg->footer.middle_crc = 0; - dout("%s front_crc %u middle_crc %u\n", __func__, - le32_to_cpu(con->out_msg->footer.front_crc), - le32_to_cpu(con->out_msg->footer.middle_crc)); - con->out_msg->footer.flags = 0; - - /* is there a data payload? */ - con->out_msg->footer.data_crc = 0; - if (m->data_length) { - prepare_message_data(con->out_msg, m->data_length); - con->out_more = 1; /* data + footer will follow */ - } else { - /* no, queue up footer too and be done */ - prepare_write_message_footer(con); - } - - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -/* - * Prepare an ack. - */ -static void prepare_write_ack(struct ceph_connection *con) -{ - dout("prepare_write_ack %p %llu -> %llu\n", con, - con->in_seq_acked, con->in_seq); - con->in_seq_acked = con->in_seq; - - con_out_kvec_reset(con); - - con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); - - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - - con->out_more = 1; /* more will follow.. eventually.. */ - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -/* - * Prepare to share the seq during handshake - */ -static void prepare_write_seq(struct ceph_connection *con) -{ - dout("prepare_write_seq %p %llu -> %llu\n", con, - con->in_seq_acked, con->in_seq); - con->in_seq_acked = con->in_seq; - - con_out_kvec_reset(con); - - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -/* - * Prepare to write keepalive byte. 
- */ -static void prepare_write_keepalive(struct ceph_connection *con) -{ - dout("prepare_write_keepalive %p\n", con); - con_out_kvec_reset(con); - if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { - struct timespec64 now; - - ktime_get_real_ts64(&now); - con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); - ceph_encode_timespec64(&con->out_temp_keepalive2, &now); - con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), - &con->out_temp_keepalive2); - } else { - con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); - } - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -/* - * Connection negotiation. - */ - -static int get_connect_authorizer(struct ceph_connection *con) -{ - struct ceph_auth_handshake *auth; - int auth_proto; - - if (!con->ops->get_authorizer) { - con->auth = NULL; - con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; - con->out_connect.authorizer_len = 0; - return 0; - } - - auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry); - if (IS_ERR(auth)) - return PTR_ERR(auth); - - con->auth = auth; - con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); - con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len); - return 0; -} - -/* - * We connected to a peer and are saying hello. 
- */ -static void prepare_write_banner(struct ceph_connection *con) -{ - con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); - con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), - &con->msgr->my_enc_addr); - - con->out_more = 0; - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -static void __prepare_write_connect(struct ceph_connection *con) -{ - con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect); - if (con->auth) - con_out_kvec_add(con, con->auth->authorizer_buf_len, - con->auth->authorizer_buf); - - con->out_more = 0; - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -static int prepare_write_connect(struct ceph_connection *con) -{ - unsigned int global_seq = get_global_seq(con->msgr, 0); - int proto; - int ret; - - switch (con->peer_name.type) { - case CEPH_ENTITY_TYPE_MON: - proto = CEPH_MONC_PROTOCOL; - break; - case CEPH_ENTITY_TYPE_OSD: - proto = CEPH_OSDC_PROTOCOL; - break; - case CEPH_ENTITY_TYPE_MDS: - proto = CEPH_MDSC_PROTOCOL; - break; - default: - BUG(); - } - - dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, - con->connect_seq, global_seq, proto); - - con->out_connect.features = - cpu_to_le64(from_msgr(con->msgr)->supported_features); - con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); - con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); - con->out_connect.global_seq = cpu_to_le32(global_seq); - con->out_connect.protocol_version = cpu_to_le32(proto); - con->out_connect.flags = 0; - - ret = get_connect_authorizer(con); - if (ret) - return ret; - - __prepare_write_connect(con); - return 0; -} - -/* - * write as much of pending kvecs to the socket as we can. 
- * 1 -> done - * 0 -> socket full, but more to do - * <0 -> error - */ -static int write_partial_kvec(struct ceph_connection *con) -{ - int ret; - - dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); - while (con->out_kvec_bytes > 0) { - ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, - con->out_kvec_left, con->out_kvec_bytes, - con->out_more); - if (ret <= 0) - goto out; - con->out_kvec_bytes -= ret; - if (con->out_kvec_bytes == 0) - break; /* done */ - - /* account for full iov entries consumed */ - while (ret >= con->out_kvec_cur->iov_len) { - BUG_ON(!con->out_kvec_left); - ret -= con->out_kvec_cur->iov_len; - con->out_kvec_cur++; - con->out_kvec_left--; - } - /* and for a partially-consumed entry */ - if (ret) { - con->out_kvec_cur->iov_len -= ret; - con->out_kvec_cur->iov_base += ret; - } - } - con->out_kvec_left = 0; - ret = 1; -out: - dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, - con->out_kvec_bytes, con->out_kvec_left, ret); - return ret; /* done! */ -} - -static u32 ceph_crc32c_page(u32 crc, struct page *page, - unsigned int page_offset, - unsigned int length) +u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, + unsigned int length) { char *kaddr; @@ -1542,257 +1128,8 @@ static u32 ceph_crc32c_page(u32 crc, struct page *page, return crc; } -/* - * Write as much message data payload as we can. If we finish, queue - * up the footer. - * 1 -> done, footer is now queued in out_kvec[]. 
- * 0 -> socket full, but more to do - * <0 -> error - */ -static int write_partial_message_data(struct ceph_connection *con) -{ - struct ceph_msg *msg = con->out_msg; - struct ceph_msg_data_cursor *cursor = &msg->cursor; - bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); - int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; - u32 crc; - - dout("%s %p msg %p\n", __func__, con, msg); - - if (!msg->num_data_items) - return -EINVAL; - - /* - * Iterate through each page that contains data to be - * written, and send as much as possible for each. - * - * If we are calculating the data crc (the default), we will - * need to map the page. If we have no pages, they have - * been revoked, so use the zero page. - */ - crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0; - while (cursor->total_resid) { - struct page *page; - size_t page_offset; - size_t length; - int ret; - - if (!cursor->resid) { - ceph_msg_data_advance(cursor, 0); - continue; - } - - page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); - if (length == cursor->total_resid) - more = MSG_MORE; - ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, - more); - if (ret <= 0) { - if (do_datacrc) - msg->footer.data_crc = cpu_to_le32(crc); - - return ret; - } - if (do_datacrc && cursor->need_crc) - crc = ceph_crc32c_page(crc, page, page_offset, length); - ceph_msg_data_advance(cursor, (size_t)ret); - } - - dout("%s %p msg %p done\n", __func__, con, msg); - - /* prepare and queue up footer, too */ - if (do_datacrc) - msg->footer.data_crc = cpu_to_le32(crc); - else - msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; - con_out_kvec_reset(con); - prepare_write_message_footer(con); - - return 1; /* must return > 0 to indicate success */ -} - -/* - * write some zeros - */ -static int write_partial_skip(struct ceph_connection *con) -{ - int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; - int ret; - - dout("%s %p %d left\n", __func__, con, con->out_skip); - while (con->out_skip > 0) { - size_t size = 
min(con->out_skip, (int) PAGE_SIZE); - - if (size == con->out_skip) - more = MSG_MORE; - ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, more); - if (ret <= 0) - goto out; - con->out_skip -= ret; - } - ret = 1; -out: - return ret; -} - -/* - * Prepare to read connection handshake, or an ack. - */ -static void prepare_read_banner(struct ceph_connection *con) -{ - dout("prepare_read_banner %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_connect(struct ceph_connection *con) -{ - dout("prepare_read_connect %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_ack(struct ceph_connection *con) -{ - dout("prepare_read_ack %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_seq(struct ceph_connection *con) -{ - dout("prepare_read_seq %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_SEQ; -} - -static void prepare_read_tag(struct ceph_connection *con) -{ - dout("prepare_read_tag %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_READY; -} - -static void prepare_read_keepalive_ack(struct ceph_connection *con) -{ - dout("prepare_read_keepalive_ack %p\n", con); - con->in_base_pos = 0; -} - -/* - * Prepare to read a message. 
- */ -static int prepare_read_message(struct ceph_connection *con) -{ - dout("prepare_read_message %p\n", con); - BUG_ON(con->in_msg != NULL); - con->in_base_pos = 0; - con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; - return 0; -} - - -static int read_partial(struct ceph_connection *con, - int end, int size, void *object) -{ - while (con->in_base_pos < end) { - int left = end - con->in_base_pos; - int have = size - left; - int ret = ceph_tcp_recvmsg(con->sock, object + have, left); - if (ret <= 0) - return ret; - con->in_base_pos += ret; - } - return 1; -} - - -/* - * Read all or part of the connect-side handshake on a new connection - */ -static int read_partial_banner(struct ceph_connection *con) -{ - int size; - int end; - int ret; - - dout("read_partial_banner %p at %d\n", con, con->in_base_pos); - - /* peer's banner */ - size = strlen(CEPH_BANNER); - end = size; - ret = read_partial(con, end, size, con->in_banner); - if (ret <= 0) - goto out; - - size = sizeof (con->actual_peer_addr); - end += size; - ret = read_partial(con, end, size, &con->actual_peer_addr); - if (ret <= 0) - goto out; - ceph_decode_banner_addr(&con->actual_peer_addr); - - size = sizeof (con->peer_addr_for_me); - end += size; - ret = read_partial(con, end, size, &con->peer_addr_for_me); - if (ret <= 0) - goto out; - ceph_decode_banner_addr(&con->peer_addr_for_me); - -out: - return ret; -} - -static int read_partial_connect(struct ceph_connection *con) -{ - int size; - int end; - int ret; - - dout("read_partial_connect %p at %d\n", con, con->in_base_pos); - - size = sizeof (con->in_reply); - end = size; - ret = read_partial(con, end, size, &con->in_reply); - if (ret <= 0) - goto out; - - if (con->auth) { - size = le32_to_cpu(con->in_reply.authorizer_len); - if (size > con->auth->authorizer_reply_buf_len) { - pr_err("authorizer reply too big: %d > %zu\n", size, - con->auth->authorizer_reply_buf_len); - ret = -EINVAL; - goto out; - } - - end += size; - ret = read_partial(con, 
end, size, - con->auth->authorizer_reply_buf); - if (ret <= 0) - goto out; - } - - dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", - con, (int)con->in_reply.tag, - le32_to_cpu(con->in_reply.connect_seq), - le32_to_cpu(con->in_reply.global_seq)); -out: - return ret; -} -/* - * Verify the hello banner looks okay. - */ -static int verify_hello(struct ceph_connection *con) -{ - if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { - pr_err("connect to %s got bad banner\n", - ceph_pr_addr(&con->peer_addr)); - con->error_msg = "protocol error, bad banner"; - return -1; - } - return 0; -} - -static bool addr_is_blank(struct ceph_entity_addr *addr) +bool ceph_addr_is_blank(const struct ceph_entity_addr *addr) { struct sockaddr_storage ss = addr->in_addr; /* align */ struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr; @@ -1808,7 +1145,7 @@ static bool addr_is_blank(struct ceph_entity_addr *addr) } } -static int addr_port(struct ceph_entity_addr *addr) +int ceph_addr_port(const struct ceph_entity_addr *addr) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: @@ -1819,7 +1156,7 @@ static int addr_port(struct ceph_entity_addr *addr) return 0; } -static void addr_set_port(struct ceph_entity_addr *addr, int p) +void ceph_addr_set_port(struct ceph_entity_addr *addr, int p) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: @@ -1977,8 +1314,17 @@ int ceph_parse_ips(const char *c, const char *end, port = CEPH_MON_PORT; } - addr_set_port(&addr[i], port); + ceph_addr_set_port(&addr[i], port); + /* + * We want the type to be set according to ms_mode + * option, but options are normally parsed after mon + * addresses. Rather than complicating parsing, set + * to LEGACY and override in build_initial_monmap() + * for mon addresses and ceph_messenger_init() for + * ip option. 
+ */ addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; + addr[i].nonce = 0; dout("parse_ips got %s\n", ceph_pr_addr(&addr[i])); @@ -2000,521 +1346,12 @@ bad: return ret; } -static int process_banner(struct ceph_connection *con) -{ - dout("process_banner on %p\n", con); - - if (verify_hello(con) < 0) - return -1; - - /* - * Make sure the other end is who we wanted. note that the other - * end may not yet know their ip address, so if it's 0.0.0.0, give - * them the benefit of the doubt. - */ - if (memcmp(&con->peer_addr, &con->actual_peer_addr, - sizeof(con->peer_addr)) != 0 && - !(addr_is_blank(&con->actual_peer_addr) && - con->actual_peer_addr.nonce == con->peer_addr.nonce)) { - pr_warn("wrong peer, want %s/%u, got %s/%u\n", - ceph_pr_addr(&con->peer_addr), - le32_to_cpu(con->peer_addr.nonce), - ceph_pr_addr(&con->actual_peer_addr), - le32_to_cpu(con->actual_peer_addr.nonce)); - con->error_msg = "wrong peer at address"; - return -1; - } - - /* - * did we learn our address? - */ - if (addr_is_blank(&con->msgr->inst.addr)) { - int port = addr_port(&con->msgr->inst.addr); - - memcpy(&con->msgr->inst.addr.in_addr, - &con->peer_addr_for_me.in_addr, - sizeof(con->peer_addr_for_me.in_addr)); - addr_set_port(&con->msgr->inst.addr, port); - encode_my_addr(con->msgr); - dout("process_banner learned my addr is %s\n", - ceph_pr_addr(&con->msgr->inst.addr)); - } - - return 0; -} - -static int process_connect(struct ceph_connection *con) -{ - u64 sup_feat = from_msgr(con->msgr)->supported_features; - u64 req_feat = from_msgr(con->msgr)->required_features; - u64 server_feat = le64_to_cpu(con->in_reply.features); - int ret; - - dout("process_connect on %p tag %d\n", con, (int)con->in_tag); - - if (con->auth) { - int len = le32_to_cpu(con->in_reply.authorizer_len); - - /* - * Any connection that defines ->get_authorizer() - * should also define ->add_authorizer_challenge() and - * ->verify_authorizer_reply(). - * - * See get_connect_authorizer(). 
- */ - if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { - ret = con->ops->add_authorizer_challenge( - con, con->auth->authorizer_reply_buf, len); - if (ret < 0) - return ret; - - con_out_kvec_reset(con); - __prepare_write_connect(con); - prepare_read_connect(con); - return 0; - } - - if (len) { - ret = con->ops->verify_authorizer_reply(con); - if (ret < 0) { - con->error_msg = "bad authorize reply"; - return ret; - } - } - } - - switch (con->in_reply.tag) { - case CEPH_MSGR_TAG_FEATURES: - pr_err("%s%lld %s feature set mismatch," - " my %llx < server's %llx, missing %llx\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - sup_feat, server_feat, server_feat & ~sup_feat); - con->error_msg = "missing required protocol features"; - reset_connection(con); - return -1; - - case CEPH_MSGR_TAG_BADPROTOVER: - pr_err("%s%lld %s protocol version mismatch," - " my %d != server's %d\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - le32_to_cpu(con->out_connect.protocol_version), - le32_to_cpu(con->in_reply.protocol_version)); - con->error_msg = "protocol version mismatch"; - reset_connection(con); - return -1; - - case CEPH_MSGR_TAG_BADAUTHORIZER: - con->auth_retry++; - dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, - con->auth_retry); - if (con->auth_retry == 2) { - con->error_msg = "connect authorization failure"; - return -1; - } - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_RESETSESSION: - /* - * If we connected with a large connect_seq but the peer - * has no record of a session with us (no connection, or - * connect_seq == 0), they will send RESETSESION to indicate - * that they must have reset their session, and may have - * dropped messages. 
- */ - dout("process_connect got RESET peer seq %u\n", - le32_to_cpu(con->in_reply.connect_seq)); - pr_err("%s%lld %s connection reset\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr)); - reset_connection(con); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - - /* Tell ceph about it. */ - mutex_unlock(&con->mutex); - pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name)); - if (con->ops->peer_reset) - con->ops->peer_reset(con); - mutex_lock(&con->mutex); - if (con->state != CON_STATE_NEGOTIATING) - return -EAGAIN; - break; - - case CEPH_MSGR_TAG_RETRY_SESSION: - /* - * If we sent a smaller connect_seq than the peer has, try - * again with a larger value. - */ - dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", - le32_to_cpu(con->out_connect.connect_seq), - le32_to_cpu(con->in_reply.connect_seq)); - con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_RETRY_GLOBAL: - /* - * If we sent a smaller global_seq than the peer has, try - * again with a larger value. 
- */ - dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.global_seq)); - get_global_seq(con->msgr, - le32_to_cpu(con->in_reply.global_seq)); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_SEQ: - case CEPH_MSGR_TAG_READY: - if (req_feat & ~server_feat) { - pr_err("%s%lld %s protocol feature mismatch," - " my required %llx > server's %llx, need %llx\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - req_feat, server_feat, req_feat & ~server_feat); - con->error_msg = "missing required protocol features"; - reset_connection(con); - return -1; - } - - WARN_ON(con->state != CON_STATE_NEGOTIATING); - con->state = CON_STATE_OPEN; - con->auth_retry = 0; /* we authenticated; clear flag */ - con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); - con->connect_seq++; - con->peer_features = server_feat; - dout("process_connect got READY gseq %d cseq %d (%d)\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.connect_seq), - con->connect_seq); - WARN_ON(con->connect_seq != - le32_to_cpu(con->in_reply.connect_seq)); - - if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - con_flag_set(con, CON_FLAG_LOSSYTX); - - con->delay = 0; /* reset backoff memory */ - - if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { - prepare_write_seq(con); - prepare_read_seq(con); - } else { - prepare_read_tag(con); - } - break; - - case CEPH_MSGR_TAG_WAIT: - /* - * If there is a connection race (we are opening - * connections to each other), one of us may just have - * to WAIT. This shouldn't happen if we are the - * client. 
- */ - con->error_msg = "protocol error, got WAIT as client"; - return -1; - - default: - con->error_msg = "protocol error, garbage tag during connect"; - return -1; - } - return 0; -} - - -/* - * read (part of) an ack - */ -static int read_partial_ack(struct ceph_connection *con) -{ - int size = sizeof (con->in_temp_ack); - int end = size; - - return read_partial(con, end, size, &con->in_temp_ack); -} - -/* - * We can finally discard anything that's been acked. - */ -static void process_ack(struct ceph_connection *con) -{ - struct ceph_msg *m; - u64 ack = le64_to_cpu(con->in_temp_ack); - u64 seq; - bool reconnect = (con->in_tag == CEPH_MSGR_TAG_SEQ); - struct list_head *list = reconnect ? &con->out_queue : &con->out_sent; - - /* - * In the reconnect case, con_fault() has requeued messages - * in out_sent. We should cleanup old messages according to - * the reconnect seq. - */ - while (!list_empty(list)) { - m = list_first_entry(list, struct ceph_msg, list_head); - if (reconnect && m->needs_out_seq) - break; - seq = le64_to_cpu(m->hdr.seq); - if (seq > ack) - break; - dout("got ack for seq %llu type %d at %p\n", seq, - le16_to_cpu(m->hdr.type), m); - m->ack_stamp = jiffies; - ceph_msg_remove(m); - } - - prepare_read_tag(con); -} - - -static int read_partial_message_section(struct ceph_connection *con, - struct kvec *section, - unsigned int sec_len, u32 *crc) -{ - int ret, left; - - BUG_ON(!section); - - while (section->iov_len < sec_len) { - BUG_ON(section->iov_base == NULL); - left = sec_len - section->iov_len; - ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + - section->iov_len, left); - if (ret <= 0) - return ret; - section->iov_len += ret; - } - if (section->iov_len == sec_len) - *crc = crc32c(0, section->iov_base, section->iov_len); - - return 1; -} - -static int read_partial_msg_data(struct ceph_connection *con) -{ - struct ceph_msg *msg = con->in_msg; - struct ceph_msg_data_cursor *cursor = &msg->cursor; - bool do_datacrc = 
!ceph_test_opt(from_msgr(con->msgr), NOCRC); - struct page *page; - size_t page_offset; - size_t length; - u32 crc = 0; - int ret; - - if (!msg->num_data_items) - return -EIO; - - if (do_datacrc) - crc = con->in_data_crc; - while (cursor->total_resid) { - if (!cursor->resid) { - ceph_msg_data_advance(cursor, 0); - continue; - } - - page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); - ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); - if (ret <= 0) { - if (do_datacrc) - con->in_data_crc = crc; - - return ret; - } - - if (do_datacrc) - crc = ceph_crc32c_page(crc, page, page_offset, ret); - ceph_msg_data_advance(cursor, (size_t)ret); - } - if (do_datacrc) - con->in_data_crc = crc; - - return 1; /* must return > 0 to indicate success */ -} - -/* - * read (part of) a message. - */ -static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); - -static int read_partial_message(struct ceph_connection *con) -{ - struct ceph_msg *m = con->in_msg; - int size; - int end; - int ret; - unsigned int front_len, middle_len, data_len; - bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); - bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); - u64 seq; - u32 crc; - - dout("read_partial_message con %p msg %p\n", con, m); - - /* header */ - size = sizeof (con->in_hdr); - end = size; - ret = read_partial(con, end, size, &con->in_hdr); - if (ret <= 0) - return ret; - - crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); - if (cpu_to_le32(crc) != con->in_hdr.crc) { - pr_err("read_partial_message bad hdr crc %u != expected %u\n", - crc, con->in_hdr.crc); - return -EBADMSG; - } - - front_len = le32_to_cpu(con->in_hdr.front_len); - if (front_len > CEPH_MSG_MAX_FRONT_LEN) - return -EIO; - middle_len = le32_to_cpu(con->in_hdr.middle_len); - if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) - return -EIO; - data_len = le32_to_cpu(con->in_hdr.data_len); - if (data_len > CEPH_MSG_MAX_DATA_LEN) - return -EIO; - - /* verify 
seq# */ - seq = le64_to_cpu(con->in_hdr.seq); - if ((s64)seq - (s64)con->in_seq < 1) { - pr_info("skipping %s%lld %s seq %lld expected %lld\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - seq, con->in_seq + 1); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof_footer(con); - con->in_tag = CEPH_MSGR_TAG_READY; - return 1; - } else if ((s64)seq - (s64)con->in_seq > 1) { - pr_err("read_partial_message bad seq %lld expected %lld\n", - seq, con->in_seq + 1); - con->error_msg = "bad message sequence # for incoming message"; - return -EBADE; - } - - /* allocate message? */ - if (!con->in_msg) { - int skip = 0; - - dout("got hdr type %d front %d data %d\n", con->in_hdr.type, - front_len, data_len); - ret = ceph_con_in_msg_alloc(con, &skip); - if (ret < 0) - return ret; - - BUG_ON(!con->in_msg ^ skip); - if (skip) { - /* skip this message */ - dout("alloc_msg said skip message\n"); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof_footer(con); - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; - return 1; - } - - BUG_ON(!con->in_msg); - BUG_ON(con->in_msg->con != con); - m = con->in_msg; - m->front.iov_len = 0; /* haven't read it yet */ - if (m->middle) - m->middle->vec.iov_len = 0; - - /* prepare for data payload, if any */ - - if (data_len) - prepare_message_data(con->in_msg, data_len); - } - - /* front */ - ret = read_partial_message_section(con, &m->front, front_len, - &con->in_front_crc); - if (ret <= 0) - return ret; - - /* middle */ - if (m->middle) { - ret = read_partial_message_section(con, &m->middle->vec, - middle_len, - &con->in_middle_crc); - if (ret <= 0) - return ret; - } - - /* (page) data */ - if (data_len) { - ret = read_partial_msg_data(con); - if (ret <= 0) - return ret; - } - - /* footer */ - size = sizeof_footer(con); - end += size; - ret = read_partial(con, end, size, &m->footer); - if (ret <= 0) - return ret; - - if (!need_sign) { - m->footer.flags = m->old_footer.flags; - m->footer.sig = 0; 
- } - - dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", - m, front_len, m->footer.front_crc, middle_len, - m->footer.middle_crc, data_len, m->footer.data_crc); - - /* crc ok? */ - if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { - pr_err("read_partial_message %p front crc %u != exp. %u\n", - m, con->in_front_crc, m->footer.front_crc); - return -EBADMSG; - } - if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { - pr_err("read_partial_message %p middle crc %u != exp %u\n", - m, con->in_middle_crc, m->footer.middle_crc); - return -EBADMSG; - } - if (do_datacrc && - (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && - con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { - pr_err("read_partial_message %p data crc %u != exp. %u\n", m, - con->in_data_crc, le32_to_cpu(m->footer.data_crc)); - return -EBADMSG; - } - - if (need_sign && con->ops->check_message_signature && - con->ops->check_message_signature(m)) { - pr_err("read_partial_message %p signature check failed\n", m); - return -EBADMSG; - } - - return 1; /* done! */ -} - /* * Process message. This happens in the worker thread. The callback should * be careful not to do anything that waits on other incoming messages or it * may deadlock. 
*/ -static void process_message(struct ceph_connection *con) +void ceph_con_process_message(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; @@ -2528,12 +1365,13 @@ static void process_message(struct ceph_connection *con) con->in_seq++; mutex_unlock(&con->mutex); - dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", + dout("===== %p %llu from %s%lld %d=%s len %d+%d+%d (%u %u %u) =====\n", msg, le64_to_cpu(msg->hdr.seq), ENTITY_NAME(msg->hdr.src), le16_to_cpu(msg->hdr.type), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), le32_to_cpu(msg->hdr.front_len), + le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len), con->in_front_crc, con->in_middle_crc, con->in_data_crc); con->ops->dispatch(con, msg); @@ -2541,264 +1379,6 @@ static void process_message(struct ceph_connection *con) mutex_lock(&con->mutex); } -static int read_keepalive_ack(struct ceph_connection *con) -{ - struct ceph_timespec ceph_ts; - size_t size = sizeof(ceph_ts); - int ret = read_partial(con, size, size, &ceph_ts); - if (ret <= 0) - return ret; - ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts); - prepare_read_tag(con); - return 1; -} - -/* - * Write something to the socket. Called in a worker thread when the - * socket appears to be writeable and we have something ready to send. - */ -static int try_write(struct ceph_connection *con) -{ - int ret = 1; - - dout("try_write start %p state %lu\n", con, con->state); - if (con->state != CON_STATE_PREOPEN && - con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN) - return 0; - - /* open the socket first? 
*/ - if (con->state == CON_STATE_PREOPEN) { - BUG_ON(con->sock); - con->state = CON_STATE_CONNECTING; - - con_out_kvec_reset(con); - prepare_write_banner(con); - prepare_read_banner(con); - - BUG_ON(con->in_msg); - con->in_tag = CEPH_MSGR_TAG_READY; - dout("try_write initiating connect on %p new state %lu\n", - con, con->state); - ret = ceph_tcp_connect(con); - if (ret < 0) { - con->error_msg = "connect error"; - goto out; - } - } - -more: - dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); - BUG_ON(!con->sock); - - /* kvec data queued? */ - if (con->out_kvec_left) { - ret = write_partial_kvec(con); - if (ret <= 0) - goto out; - } - if (con->out_skip) { - ret = write_partial_skip(con); - if (ret <= 0) - goto out; - } - - /* msg pages? */ - if (con->out_msg) { - if (con->out_msg_done) { - ceph_msg_put(con->out_msg); - con->out_msg = NULL; /* we're done with this one */ - goto do_next; - } - - ret = write_partial_message_data(con); - if (ret == 1) - goto more; /* we need to send the footer, too! */ - if (ret == 0) - goto out; - if (ret < 0) { - dout("try_write write_partial_message_data err %d\n", - ret); - goto out; - } - } - -do_next: - if (con->state == CON_STATE_OPEN) { - if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { - prepare_write_keepalive(con); - goto more; - } - /* is anything else pending? */ - if (!list_empty(&con->out_queue)) { - prepare_write_message(con); - goto more; - } - if (con->in_seq > con->in_seq_acked) { - prepare_write_ack(con); - goto more; - } - } - - /* Nothing to do! */ - con_flag_clear(con, CON_FLAG_WRITE_PENDING); - dout("try_write nothing else to write.\n"); - ret = 0; -out: - dout("try_write done on %p ret %d\n", con, ret); - return ret; -} - -/* - * Read what we can from the socket. 
- */ -static int try_read(struct ceph_connection *con) -{ - int ret = -1; - -more: - dout("try_read start on %p state %lu\n", con, con->state); - if (con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN) - return 0; - - BUG_ON(!con->sock); - - dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, - con->in_base_pos); - - if (con->state == CON_STATE_CONNECTING) { - dout("try_read connecting\n"); - ret = read_partial_banner(con); - if (ret <= 0) - goto out; - ret = process_banner(con); - if (ret < 0) - goto out; - - con->state = CON_STATE_NEGOTIATING; - - /* - * Received banner is good, exchange connection info. - * Do not reset out_kvec, as sending our banner raced - * with receiving peer banner after connect completed. - */ - ret = prepare_write_connect(con); - if (ret < 0) - goto out; - prepare_read_connect(con); - - /* Send connection info before awaiting response */ - goto out; - } - - if (con->state == CON_STATE_NEGOTIATING) { - dout("try_read negotiating\n"); - ret = read_partial_connect(con); - if (ret <= 0) - goto out; - ret = process_connect(con); - if (ret < 0) - goto out; - goto more; - } - - WARN_ON(con->state != CON_STATE_OPEN); - - if (con->in_base_pos < 0) { - /* - * skipping + discarding content. - */ - ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos); - if (ret <= 0) - goto out; - dout("skipped %d / %d bytes\n", ret, -con->in_base_pos); - con->in_base_pos += ret; - if (con->in_base_pos) - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_READY) { - /* - * what's next? 
- */ - ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); - if (ret <= 0) - goto out; - dout("try_read got tag %d\n", (int)con->in_tag); - switch (con->in_tag) { - case CEPH_MSGR_TAG_MSG: - prepare_read_message(con); - break; - case CEPH_MSGR_TAG_ACK: - prepare_read_ack(con); - break; - case CEPH_MSGR_TAG_KEEPALIVE2_ACK: - prepare_read_keepalive_ack(con); - break; - case CEPH_MSGR_TAG_CLOSE: - con_close_socket(con); - con->state = CON_STATE_CLOSED; - goto out; - default: - goto bad_tag; - } - } - if (con->in_tag == CEPH_MSGR_TAG_MSG) { - ret = read_partial_message(con); - if (ret <= 0) { - switch (ret) { - case -EBADMSG: - con->error_msg = "bad crc/signature"; - fallthrough; - case -EBADE: - ret = -EIO; - break; - case -EIO: - con->error_msg = "io error"; - break; - } - goto out; - } - if (con->in_tag == CEPH_MSGR_TAG_READY) - goto more; - process_message(con); - if (con->state == CON_STATE_OPEN) - prepare_read_tag(con); - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_ACK || - con->in_tag == CEPH_MSGR_TAG_SEQ) { - /* - * the final handshake seq exchange is semantically - * equivalent to an ACK - */ - ret = read_partial_ack(con); - if (ret <= 0) - goto out; - process_ack(con); - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { - ret = read_keepalive_ack(con); - if (ret <= 0) - goto out; - goto more; - } - -out: - dout("try_read done on %p ret %d\n", con, ret); - return ret; - -bad_tag: - pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); - con->error_msg = "protocol error, garbage tag"; - ret = -1; - goto out; -} - - /* * Atomically queue work on a connection after the specified delay. * Bump @con reference to avoid races with connection teardown. 
@@ -2811,6 +1391,9 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay) return -ENOENT; } + if (delay >= HZ) + delay = round_jiffies_relative(delay); + dout("%s %p %lu\n", __func__, con, delay); if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { dout("%s %p - already queued\n", __func__, con); @@ -2836,27 +1419,30 @@ static void cancel_con(struct ceph_connection *con) static bool con_sock_closed(struct ceph_connection *con) { - if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED)) + if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED)) return false; #define CASE(x) \ - case CON_STATE_ ## x: \ + case CEPH_CON_S_ ## x: \ con->error_msg = "socket closed (con state " #x ")"; \ break; switch (con->state) { CASE(CLOSED); CASE(PREOPEN); - CASE(CONNECTING); - CASE(NEGOTIATING); + CASE(V1_BANNER); + CASE(V1_CONNECT_MSG); + CASE(V2_BANNER_PREFIX); + CASE(V2_BANNER_PAYLOAD); + CASE(V2_HELLO); + CASE(V2_AUTH); + CASE(V2_AUTH_SIGNATURE); + CASE(V2_SESSION_CONNECT); + CASE(V2_SESSION_RECONNECT); CASE(OPEN); CASE(STANDBY); default: - pr_warn("%s con %p unrecognized state %lu\n", - __func__, con, con->state); - con->error_msg = "unrecognized con state"; BUG(); - break; } #undef CASE @@ -2867,15 +1453,15 @@ static bool con_backoff(struct ceph_connection *con) { int ret; - if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF)) + if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF)) return false; - ret = queue_con_delay(con, round_jiffies_relative(con->delay)); + ret = queue_con_delay(con, con->delay); if (ret) { dout("%s: con %p FAILED to back off %lu\n", __func__, con, con->delay); BUG_ON(ret == -ENOENT); - con_flag_set(con, CON_FLAG_BACKOFF); + ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); } return true; @@ -2891,11 +1477,11 @@ static void con_fault_finish(struct ceph_connection *con) * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. 
*/ - if (con->auth_retry) { - dout("auth_retry %d, invalidating\n", con->auth_retry); + if (con->v1.auth_retry) { + dout("auth_retry %d, invalidating\n", con->v1.auth_retry); if (con->ops->invalidate_authorizer) con->ops->invalidate_authorizer(con); - con->auth_retry = 0; + con->v1.auth_retry = 0; } if (con->ops->fault) @@ -2923,21 +1509,24 @@ static void ceph_con_workfn(struct work_struct *work) dout("%s: con %p BACKOFF\n", __func__, con); break; } - if (con->state == CON_STATE_STANDBY) { + if (con->state == CEPH_CON_S_STANDBY) { dout("%s: con %p STANDBY\n", __func__, con); break; } - if (con->state == CON_STATE_CLOSED) { + if (con->state == CEPH_CON_S_CLOSED) { dout("%s: con %p CLOSED\n", __func__, con); BUG_ON(con->sock); break; } - if (con->state == CON_STATE_PREOPEN) { + if (con->state == CEPH_CON_S_PREOPEN) { dout("%s: con %p PREOPEN\n", __func__, con); BUG_ON(con->sock); } - ret = try_read(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ret = ceph_con_v2_try_read(con); + else + ret = ceph_con_v1_try_read(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -2947,7 +1536,10 @@ static void ceph_con_workfn(struct work_struct *work) break; } - ret = try_write(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ret = ceph_con_v2_try_write(con); + else + ret = ceph_con_v1_try_write(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -2974,64 +1566,54 @@ static void ceph_con_workfn(struct work_struct *work) */ static void con_fault(struct ceph_connection *con) { - dout("fault %p state %lu to peer %s\n", + dout("fault %p state %d to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr)); pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr), con->error_msg); con->error_msg = NULL; - WARN_ON(con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN); + WARN_ON(con->state == CEPH_CON_S_STANDBY || + con->state == CEPH_CON_S_CLOSED); - con_close_socket(con); + 
ceph_con_reset_protocol(con); - if (con_flag_test(con, CON_FLAG_LOSSYTX)) { + if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) { dout("fault on LOSSYTX channel, marking CLOSED\n"); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; return; } - if (con->in_msg) { - BUG_ON(con->in_msg->con != con); - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - } - if (con->out_msg) { - BUG_ON(con->out_msg->con != con); - ceph_msg_put(con->out_msg); - con->out_msg = NULL; - } - /* Requeue anything that hasn't been acked */ list_splice_init(&con->out_sent, &con->out_queue); /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && - !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) { + !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); - con_flag_clear(con, CON_FLAG_WRITE_PENDING); - con->state = CON_STATE_STANDBY; + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + con->state = CEPH_CON_S_STANDBY; } else { /* retry after a delay. 
*/ - con->state = CON_STATE_PREOPEN; - if (con->delay == 0) + con->state = CEPH_CON_S_PREOPEN; + if (!con->delay) { con->delay = BASE_DELAY_INTERVAL; - else if (con->delay < MAX_DELAY_INTERVAL) + } else if (con->delay < MAX_DELAY_INTERVAL) { con->delay *= 2; - con_flag_set(con, CON_FLAG_BACKOFF); + if (con->delay > MAX_DELAY_INTERVAL) + con->delay = MAX_DELAY_INTERVAL; + } + ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); queue_con(con); } } - void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) { u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; msgr->inst.addr.nonce = cpu_to_le32(nonce); - encode_my_addr(msgr); + ceph_encode_my_addr(msgr); } /* @@ -3042,26 +1624,35 @@ void ceph_messenger_init(struct ceph_messenger *msgr, { spin_lock_init(&msgr->global_seq_lock); - if (myaddr) - msgr->inst.addr = *myaddr; + if (myaddr) { + memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr, + sizeof(msgr->inst.addr.in_addr)); + ceph_addr_set_port(&msgr->inst.addr, 0); + } - /* select a random nonce */ - msgr->inst.addr.type = 0; - get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); - encode_my_addr(msgr); + /* + * Since nautilus, clients are identified using type ANY. + * For msgr1, ceph_encode_banner_addr() munges it to NONE. 
+ */ + msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY; + + /* generate a random non-zero nonce */ + do { + get_random_bytes(&msgr->inst.addr.nonce, + sizeof(msgr->inst.addr.nonce)); + } while (!msgr->inst.addr.nonce); + ceph_encode_my_addr(msgr); atomic_set(&msgr->stopping, 0); write_pnet(&msgr->net, get_net(current->nsproxy->net_ns)); dout("%s %p\n", __func__, msgr); } -EXPORT_SYMBOL(ceph_messenger_init); void ceph_messenger_fini(struct ceph_messenger *msgr) { put_net(read_pnet(&msgr->net)); } -EXPORT_SYMBOL(ceph_messenger_fini); static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) { @@ -3075,17 +1666,19 @@ static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ - if (con->state == CON_STATE_STANDBY) { + if (con->state == CEPH_CON_S_STANDBY) { dout("clear_standby %p and ++connect_seq\n", con); - con->state = CON_STATE_PREOPEN; - con->connect_seq++; - WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING)); - WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)); + con->state = CEPH_CON_S_PREOPEN; + con->v1.connect_seq++; + WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); + WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } } /* * Queue up an outgoing message on the given connection. + * + * Consumes a ref on @msg. 
*/ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) { @@ -3096,7 +1689,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) mutex_lock(&con->mutex); - if (con->state == CON_STATE_CLOSED) { + if (con->state == CEPH_CON_S_CLOSED) { dout("con_send %p closed, dropping %p\n", con, msg); ceph_msg_put(msg); mutex_unlock(&con->mutex); @@ -3119,7 +1712,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) /* if there wasn't anything waiting to send before, queue * new work */ - if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) + if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); @@ -3137,36 +1730,30 @@ void ceph_msg_revoke(struct ceph_msg *msg) } mutex_lock(&con->mutex); - if (!list_empty(&msg->list_head)) { - dout("%s %p msg %p - was on queue\n", __func__, con, msg); - list_del_init(&msg->list_head); - msg->hdr.seq = 0; - - ceph_msg_put(msg); + if (list_empty(&msg->list_head)) { + WARN_ON(con->out_msg == msg); + dout("%s con %p msg %p not linked\n", __func__, con, msg); + mutex_unlock(&con->mutex); + return; } + + dout("%s con %p msg %p was linked\n", __func__, con, msg); + msg->hdr.seq = 0; + ceph_msg_remove(msg); + if (con->out_msg == msg) { - BUG_ON(con->out_skip); - /* footer */ - if (con->out_msg_done) { - con->out_skip += con_out_kvec_skip(con); - } else { - BUG_ON(!msg->data_length); - con->out_skip += sizeof_footer(con); - } - /* data, middle, front */ - if (msg->data_length) - con->out_skip += msg->cursor.total_resid; - if (msg->middle) - con->out_skip += con_out_kvec_skip(con); - con->out_skip += con_out_kvec_skip(con); - - dout("%s %p msg %p - was sending, will write %d skip %d\n", - __func__, con, msg, con->out_kvec_bytes, con->out_skip); - msg->hdr.seq = 0; + WARN_ON(con->state != CEPH_CON_S_OPEN); + dout("%s con %p msg %p was sending\n", __func__, con, msg); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_revoke(con); + 
else + ceph_con_v1_revoke(con); + ceph_msg_put(con->out_msg); con->out_msg = NULL; - ceph_msg_put(msg); + } else { + dout("%s con %p msg %p not current, out_msg %p\n", __func__, + con, msg, con->out_msg); } - mutex_unlock(&con->mutex); } @@ -3184,25 +1771,17 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg) mutex_lock(&con->mutex); if (con->in_msg == msg) { - unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); - unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); - unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); - - /* skip rest of message */ - dout("%s %p msg %p revoked\n", __func__, con, msg); - con->in_base_pos = con->in_base_pos - - sizeof(struct ceph_msg_header) - - front_len - - middle_len - - data_len - - sizeof(struct ceph_msg_footer); + WARN_ON(con->state != CEPH_CON_S_OPEN); + dout("%s con %p msg %p was recving\n", __func__, con, msg); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_revoke_incoming(con); + else + ceph_con_v1_revoke_incoming(con); ceph_msg_put(con->in_msg); con->in_msg = NULL; - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; } else { - dout("%s %p in_msg %p msg %p no-op\n", - __func__, con, con->in_msg, msg); + dout("%s con %p msg %p not current, in_msg %p\n", __func__, + con, msg, con->in_msg); } mutex_unlock(&con->mutex); } @@ -3215,10 +1794,10 @@ void ceph_con_keepalive(struct ceph_connection *con) dout("con_keepalive %p\n", con); mutex_lock(&con->mutex); clear_standby(con); - con_flag_set(con, CON_FLAG_KEEPALIVE_PENDING); + ceph_con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING); mutex_unlock(&con->mutex); - if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) + if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); @@ -3424,9 +2003,9 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) * On error (ENOMEM, EAGAIN, ...), * - con->in_msg == NULL */ -static int ceph_con_in_msg_alloc(struct 
ceph_connection *con, int *skip) +int ceph_con_in_msg_alloc(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip) { - struct ceph_msg_header *hdr = &con->in_hdr; int middle_len = le32_to_cpu(hdr->middle_len); struct ceph_msg *msg; int ret = 0; @@ -3437,7 +2016,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); - if (con->state != CON_STATE_OPEN) { + if (con->state != CEPH_CON_S_OPEN) { if (msg) ceph_msg_put(msg); return -EAGAIN; @@ -3458,7 +2037,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) con->error_msg = "error allocating memory for incoming message"; return -ENOMEM; } - memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); + memcpy(&con->in_msg->hdr, hdr, sizeof(*hdr)); if (middle_len && !con->in_msg->middle) { ret = ceph_alloc_middle(con, con->in_msg); @@ -3471,6 +2050,39 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) return ret; } +void ceph_con_get_out_msg(struct ceph_connection *con) +{ + struct ceph_msg *msg; + + BUG_ON(list_empty(&con->out_queue)); + msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head); + WARN_ON(msg->con != con); + + /* + * Put the message on "sent" list using a ref from ceph_con_send(). + * It is put when the message is acked or revoked. + */ + list_move_tail(&msg->list_head, &con->out_sent); + + /* + * Only assign outgoing seq # if we haven't sent this message + * yet. If it is requeued, resend with it's original seq. + */ + if (msg->needs_out_seq) { + msg->hdr.seq = cpu_to_le64(++con->out_seq); + msg->needs_out_seq = false; + + if (con->ops->reencode_message) + con->ops->reencode_message(msg); + } + + /* + * Get a ref for out_msg. It is put when we are done sending the + * message or in case of a fault. + */ + WARN_ON(con->out_msg); + con->out_msg = ceph_msg_get(msg); +} /* * Free a generically kmalloc'd message. 
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c new file mode 100644 index 000000000000..2cb5ffdf071a --- /dev/null +++ b/net/ceph/messenger_v1.c @@ -0,0 +1,1506 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> + +#include <linux/bvec.h> +#include <linux/crc32c.h> +#include <linux/net.h> +#include <linux/socket.h> +#include <net/sock.h> + +#include <linux/ceph/ceph_features.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/libceph.h> +#include <linux/ceph/messenger.h> + +/* static tag bytes (protocol control messages) */ +static char tag_msg = CEPH_MSGR_TAG_MSG; +static char tag_ack = CEPH_MSGR_TAG_ACK; +static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; +static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; + +/* + * If @buf is NULL, discard up to @len bytes. + */ +static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) +{ + struct kvec iov = {buf, len}; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + if (!buf) + msg.msg_flags |= MSG_TRUNC; + + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len); + r = sock_recvmsg(sock, &msg, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; +} + +static int ceph_tcp_recvpage(struct socket *sock, struct page *page, + int page_offset, size_t length) +{ + struct bio_vec bvec = { + .bv_page = page, + .bv_offset = page_offset, + .bv_len = length + }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + BUG_ON(page_offset + length > PAGE_SIZE); + iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length); + r = sock_recvmsg(sock, &msg, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; +} + +/* + * write something. @more is true if caller will be sending more data + * shortly. 
+ */ +static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, + size_t kvlen, size_t len, bool more) +{ + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + if (more) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ + + r = kernel_sendmsg(sock, &msg, iov, kvlen, len); + if (r == -EAGAIN) + r = 0; + return r; +} + +/* + * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST + */ +static int ceph_tcp_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int more) +{ + ssize_t (*sendpage)(struct socket *sock, struct page *page, + int offset, size_t size, int flags); + int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; + int ret; + + /* + * sendpage cannot properly handle pages with page_count == 0, + * we need to fall back to sendmsg if that's the case. + * + * Same goes for slab pages: skb_can_coalesce() allows + * coalescing neighboring slab objects into a single frag which + * triggers one of hardened usercopy checks. + */ + if (sendpage_ok(page)) + sendpage = sock->ops->sendpage; + else + sendpage = sock_no_sendpage; + + ret = sendpage(sock, page, offset, size, flags); + if (ret == -EAGAIN) + ret = 0; + + return ret; +} + +static void con_out_kvec_reset(struct ceph_connection *con) +{ + BUG_ON(con->v1.out_skip); + + con->v1.out_kvec_left = 0; + con->v1.out_kvec_bytes = 0; + con->v1.out_kvec_cur = &con->v1.out_kvec[0]; +} + +static void con_out_kvec_add(struct ceph_connection *con, + size_t size, void *data) +{ + int index = con->v1.out_kvec_left; + + BUG_ON(con->v1.out_skip); + BUG_ON(index >= ARRAY_SIZE(con->v1.out_kvec)); + + con->v1.out_kvec[index].iov_len = size; + con->v1.out_kvec[index].iov_base = data; + con->v1.out_kvec_left++; + con->v1.out_kvec_bytes += size; +} + +/* + * Chop off a kvec from the end. Return residual number of bytes for + * that kvec, i.e. how many bytes would have been written if the kvec + * hadn't been nuked. 
+ */ +static int con_out_kvec_skip(struct ceph_connection *con) +{ + int skip = 0; + + if (con->v1.out_kvec_bytes > 0) { + skip = con->v1.out_kvec_cur[con->v1.out_kvec_left - 1].iov_len; + BUG_ON(con->v1.out_kvec_bytes < skip); + BUG_ON(!con->v1.out_kvec_left); + con->v1.out_kvec_bytes -= skip; + con->v1.out_kvec_left--; + } + + return skip; +} + +static size_t sizeof_footer(struct ceph_connection *con) +{ + return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? + sizeof(struct ceph_msg_footer) : + sizeof(struct ceph_msg_footer_old); +} + +static void prepare_message_data(struct ceph_msg *msg, u32 data_len) +{ + /* Initialize data cursor */ + + ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); +} + +/* + * Prepare footer for currently outgoing message, and finish things + * off. Assumes out_kvec* are already valid.. we just add on to the end. + */ +static void prepare_write_message_footer(struct ceph_connection *con) +{ + struct ceph_msg *m = con->out_msg; + + m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; + + dout("prepare_write_message_footer %p\n", con); + con_out_kvec_add(con, sizeof_footer(con), &m->footer); + if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { + if (con->ops->sign_message) + con->ops->sign_message(m); + else + m->footer.sig = 0; + } else { + m->old_footer.flags = m->footer.flags; + } + con->v1.out_more = m->more_to_follow; + con->v1.out_msg_done = true; +} + +/* + * Prepare headers for the next outgoing message. + */ +static void prepare_write_message(struct ceph_connection *con) +{ + struct ceph_msg *m; + u32 crc; + + con_out_kvec_reset(con); + con->v1.out_msg_done = false; + + /* Sneak an ack in there first? If we can get it into the same + * TCP packet that's a good thing. 
*/ + if (con->in_seq > con->in_seq_acked) { + con->in_seq_acked = con->in_seq; + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); + } + + ceph_con_get_out_msg(con); + m = con->out_msg; + + dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", + m, con->out_seq, le16_to_cpu(m->hdr.type), + le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), + m->data_length); + WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len)); + WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); + + /* tag + hdr + front + middle */ + con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); + con_out_kvec_add(con, sizeof(con->v1.out_hdr), &con->v1.out_hdr); + con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); + + if (m->middle) + con_out_kvec_add(con, m->middle->vec.iov_len, + m->middle->vec.iov_base); + + /* fill in hdr crc and finalize hdr */ + crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); + con->out_msg->hdr.crc = cpu_to_le32(crc); + memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr)); + + /* fill in front and middle crc, footer */ + crc = crc32c(0, m->front.iov_base, m->front.iov_len); + con->out_msg->footer.front_crc = cpu_to_le32(crc); + if (m->middle) { + crc = crc32c(0, m->middle->vec.iov_base, + m->middle->vec.iov_len); + con->out_msg->footer.middle_crc = cpu_to_le32(crc); + } else + con->out_msg->footer.middle_crc = 0; + dout("%s front_crc %u middle_crc %u\n", __func__, + le32_to_cpu(con->out_msg->footer.front_crc), + le32_to_cpu(con->out_msg->footer.middle_crc)); + con->out_msg->footer.flags = 0; + + /* is there a data payload? 
*/ + con->out_msg->footer.data_crc = 0; + if (m->data_length) { + prepare_message_data(con->out_msg, m->data_length); + con->v1.out_more = 1; /* data + footer will follow */ + } else { + /* no, queue up footer too and be done */ + prepare_write_message_footer(con); + } + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare an ack. + */ +static void prepare_write_ack(struct ceph_connection *con) +{ + dout("prepare_write_ack %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); + + con->v1.out_more = 1; /* more will follow.. eventually.. */ + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare to share the seq during handshake + */ +static void prepare_write_seq(struct ceph_connection *con) +{ + dout("prepare_write_seq %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare to write keepalive byte. 
+ */ +static void prepare_write_keepalive(struct ceph_connection *con) +{ + dout("prepare_write_keepalive %p\n", con); + con_out_kvec_reset(con); + if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { + struct timespec64 now; + + ktime_get_real_ts64(&now); + con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); + ceph_encode_timespec64(&con->v1.out_temp_keepalive2, &now); + con_out_kvec_add(con, sizeof(con->v1.out_temp_keepalive2), + &con->v1.out_temp_keepalive2); + } else { + con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); + } + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Connection negotiation. + */ + +static int get_connect_authorizer(struct ceph_connection *con) +{ + struct ceph_auth_handshake *auth; + int auth_proto; + + if (!con->ops->get_authorizer) { + con->v1.auth = NULL; + con->v1.out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; + con->v1.out_connect.authorizer_len = 0; + return 0; + } + + auth = con->ops->get_authorizer(con, &auth_proto, con->v1.auth_retry); + if (IS_ERR(auth)) + return PTR_ERR(auth); + + con->v1.auth = auth; + con->v1.out_connect.authorizer_protocol = cpu_to_le32(auth_proto); + con->v1.out_connect.authorizer_len = + cpu_to_le32(auth->authorizer_buf_len); + return 0; +} + +/* + * We connected to a peer and are saying hello. 
+ */ +static void prepare_write_banner(struct ceph_connection *con) +{ + con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); + con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), + &con->msgr->my_enc_addr); + + con->v1.out_more = 0; + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +static void __prepare_write_connect(struct ceph_connection *con) +{ + con_out_kvec_add(con, sizeof(con->v1.out_connect), + &con->v1.out_connect); + if (con->v1.auth) + con_out_kvec_add(con, con->v1.auth->authorizer_buf_len, + con->v1.auth->authorizer_buf); + + con->v1.out_more = 0; + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +static int prepare_write_connect(struct ceph_connection *con) +{ + unsigned int global_seq = ceph_get_global_seq(con->msgr, 0); + int proto; + int ret; + + switch (con->peer_name.type) { + case CEPH_ENTITY_TYPE_MON: + proto = CEPH_MONC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_OSD: + proto = CEPH_OSDC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_MDS: + proto = CEPH_MDSC_PROTOCOL; + break; + default: + BUG(); + } + + dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, + con->v1.connect_seq, global_seq, proto); + + con->v1.out_connect.features = + cpu_to_le64(from_msgr(con->msgr)->supported_features); + con->v1.out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); + con->v1.out_connect.connect_seq = cpu_to_le32(con->v1.connect_seq); + con->v1.out_connect.global_seq = cpu_to_le32(global_seq); + con->v1.out_connect.protocol_version = cpu_to_le32(proto); + con->v1.out_connect.flags = 0; + + ret = get_connect_authorizer(con); + if (ret) + return ret; + + __prepare_write_connect(con); + return 0; +} + +/* + * write as much of pending kvecs to the socket as we can. 
+ * 1 -> done + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_kvec(struct ceph_connection *con) +{ + int ret; + + dout("write_partial_kvec %p %d left\n", con, con->v1.out_kvec_bytes); + while (con->v1.out_kvec_bytes > 0) { + ret = ceph_tcp_sendmsg(con->sock, con->v1.out_kvec_cur, + con->v1.out_kvec_left, + con->v1.out_kvec_bytes, + con->v1.out_more); + if (ret <= 0) + goto out; + con->v1.out_kvec_bytes -= ret; + if (!con->v1.out_kvec_bytes) + break; /* done */ + + /* account for full iov entries consumed */ + while (ret >= con->v1.out_kvec_cur->iov_len) { + BUG_ON(!con->v1.out_kvec_left); + ret -= con->v1.out_kvec_cur->iov_len; + con->v1.out_kvec_cur++; + con->v1.out_kvec_left--; + } + /* and for a partially-consumed entry */ + if (ret) { + con->v1.out_kvec_cur->iov_len -= ret; + con->v1.out_kvec_cur->iov_base += ret; + } + } + con->v1.out_kvec_left = 0; + ret = 1; +out: + dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, + con->v1.out_kvec_bytes, con->v1.out_kvec_left, ret); + return ret; /* done! */ +} + +/* + * Write as much message data payload as we can. If we finish, queue + * up the footer. + * 1 -> done, footer is now queued in out_kvec[]. + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_message_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; + u32 crc; + + dout("%s %p msg %p\n", __func__, con, msg); + + if (!msg->num_data_items) + return -EINVAL; + + /* + * Iterate through each page that contains data to be + * written, and send as much as possible for each. + * + * If we are calculating the data crc (the default), we will + * need to map the page. If we have no pages, they have + * been revoked, so use the zero page. + */ + crc = do_datacrc ? 
le32_to_cpu(msg->footer.data_crc) : 0; + while (cursor->total_resid) { + struct page *page; + size_t page_offset; + size_t length; + int ret; + + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + + page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + if (length == cursor->total_resid) + more = MSG_MORE; + ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, + more); + if (ret <= 0) { + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + + return ret; + } + if (do_datacrc && cursor->need_crc) + crc = ceph_crc32c_page(crc, page, page_offset, length); + ceph_msg_data_advance(cursor, (size_t)ret); + } + + dout("%s %p msg %p done\n", __func__, con, msg); + + /* prepare and queue up footer, too */ + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + else + msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; + con_out_kvec_reset(con); + prepare_write_message_footer(con); + + return 1; /* must return > 0 to indicate success */ +} + +/* + * write some zeros + */ +static int write_partial_skip(struct ceph_connection *con) +{ + int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; + int ret; + + dout("%s %p %d left\n", __func__, con, con->v1.out_skip); + while (con->v1.out_skip > 0) { + size_t size = min(con->v1.out_skip, (int)PAGE_SIZE); + + if (size == con->v1.out_skip) + more = MSG_MORE; + ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size, + more); + if (ret <= 0) + goto out; + con->v1.out_skip -= ret; + } + ret = 1; +out: + return ret; +} + +/* + * Prepare to read connection handshake, or an ack. 
+ */ +static void prepare_read_banner(struct ceph_connection *con) +{ + dout("prepare_read_banner %p\n", con); + con->v1.in_base_pos = 0; +} + +static void prepare_read_connect(struct ceph_connection *con) +{ + dout("prepare_read_connect %p\n", con); + con->v1.in_base_pos = 0; +} + +static void prepare_read_ack(struct ceph_connection *con) +{ + dout("prepare_read_ack %p\n", con); + con->v1.in_base_pos = 0; +} + +static void prepare_read_seq(struct ceph_connection *con) +{ + dout("prepare_read_seq %p\n", con); + con->v1.in_base_pos = 0; + con->v1.in_tag = CEPH_MSGR_TAG_SEQ; +} + +static void prepare_read_tag(struct ceph_connection *con) +{ + dout("prepare_read_tag %p\n", con); + con->v1.in_base_pos = 0; + con->v1.in_tag = CEPH_MSGR_TAG_READY; +} + +static void prepare_read_keepalive_ack(struct ceph_connection *con) +{ + dout("prepare_read_keepalive_ack %p\n", con); + con->v1.in_base_pos = 0; +} + +/* + * Prepare to read a message. + */ +static int prepare_read_message(struct ceph_connection *con) +{ + dout("prepare_read_message %p\n", con); + BUG_ON(con->in_msg != NULL); + con->v1.in_base_pos = 0; + con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; + return 0; +} + +static int read_partial(struct ceph_connection *con, + int end, int size, void *object) +{ + while (con->v1.in_base_pos < end) { + int left = end - con->v1.in_base_pos; + int have = size - left; + int ret = ceph_tcp_recvmsg(con->sock, object + have, left); + if (ret <= 0) + return ret; + con->v1.in_base_pos += ret; + } + return 1; +} + +/* + * Read all or part of the connect-side handshake on a new connection + */ +static int read_partial_banner(struct ceph_connection *con) +{ + int size; + int end; + int ret; + + dout("read_partial_banner %p at %d\n", con, con->v1.in_base_pos); + + /* peer's banner */ + size = strlen(CEPH_BANNER); + end = size; + ret = read_partial(con, end, size, con->v1.in_banner); + if (ret <= 0) + goto out; + + size = sizeof(con->v1.actual_peer_addr); + end += size; + 
ret = read_partial(con, end, size, &con->v1.actual_peer_addr); + if (ret <= 0) + goto out; + ceph_decode_banner_addr(&con->v1.actual_peer_addr); + + size = sizeof(con->v1.peer_addr_for_me); + end += size; + ret = read_partial(con, end, size, &con->v1.peer_addr_for_me); + if (ret <= 0) + goto out; + ceph_decode_banner_addr(&con->v1.peer_addr_for_me); + +out: + return ret; +} + +static int read_partial_connect(struct ceph_connection *con) +{ + int size; + int end; + int ret; + + dout("read_partial_connect %p at %d\n", con, con->v1.in_base_pos); + + size = sizeof(con->v1.in_reply); + end = size; + ret = read_partial(con, end, size, &con->v1.in_reply); + if (ret <= 0) + goto out; + + if (con->v1.auth) { + size = le32_to_cpu(con->v1.in_reply.authorizer_len); + if (size > con->v1.auth->authorizer_reply_buf_len) { + pr_err("authorizer reply too big: %d > %zu\n", size, + con->v1.auth->authorizer_reply_buf_len); + ret = -EINVAL; + goto out; + } + + end += size; + ret = read_partial(con, end, size, + con->v1.auth->authorizer_reply_buf); + if (ret <= 0) + goto out; + } + + dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", + con, con->v1.in_reply.tag, + le32_to_cpu(con->v1.in_reply.connect_seq), + le32_to_cpu(con->v1.in_reply.global_seq)); +out: + return ret; +} + +/* + * Verify the hello banner looks okay. + */ +static int verify_hello(struct ceph_connection *con) +{ + if (memcmp(con->v1.in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { + pr_err("connect to %s got bad banner\n", + ceph_pr_addr(&con->peer_addr)); + con->error_msg = "protocol error, bad banner"; + return -1; + } + return 0; +} + +static int process_banner(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + + dout("process_banner on %p\n", con); + + if (verify_hello(con) < 0) + return -1; + + /* + * Make sure the other end is who we wanted. 
note that the other + * end may not yet know their ip address, so if it's 0.0.0.0, give + * them the benefit of the doubt. + */ + if (memcmp(&con->peer_addr, &con->v1.actual_peer_addr, + sizeof(con->peer_addr)) != 0 && + !(ceph_addr_is_blank(&con->v1.actual_peer_addr) && + con->v1.actual_peer_addr.nonce == con->peer_addr.nonce)) { + pr_warn("wrong peer, want %s/%u, got %s/%u\n", + ceph_pr_addr(&con->peer_addr), + le32_to_cpu(con->peer_addr.nonce), + ceph_pr_addr(&con->v1.actual_peer_addr), + le32_to_cpu(con->v1.actual_peer_addr.nonce)); + con->error_msg = "wrong peer at address"; + return -1; + } + + /* + * did we learn our address? + */ + if (ceph_addr_is_blank(my_addr)) { + memcpy(&my_addr->in_addr, + &con->v1.peer_addr_for_me.in_addr, + sizeof(con->v1.peer_addr_for_me.in_addr)); + ceph_addr_set_port(my_addr, 0); + ceph_encode_my_addr(con->msgr); + dout("process_banner learned my addr is %s\n", + ceph_pr_addr(my_addr)); + } + + return 0; +} + +static int process_connect(struct ceph_connection *con) +{ + u64 sup_feat = from_msgr(con->msgr)->supported_features; + u64 req_feat = from_msgr(con->msgr)->required_features; + u64 server_feat = le64_to_cpu(con->v1.in_reply.features); + int ret; + + dout("process_connect on %p tag %d\n", con, con->v1.in_tag); + + if (con->v1.auth) { + int len = le32_to_cpu(con->v1.in_reply.authorizer_len); + + /* + * Any connection that defines ->get_authorizer() + * should also define ->add_authorizer_challenge() and + * ->verify_authorizer_reply(). + * + * See get_connect_authorizer(). 
+ */ + if (con->v1.in_reply.tag == + CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { + ret = con->ops->add_authorizer_challenge( + con, con->v1.auth->authorizer_reply_buf, len); + if (ret < 0) + return ret; + + con_out_kvec_reset(con); + __prepare_write_connect(con); + prepare_read_connect(con); + return 0; + } + + if (len) { + ret = con->ops->verify_authorizer_reply(con); + if (ret < 0) { + con->error_msg = "bad authorize reply"; + return ret; + } + } + } + + switch (con->v1.in_reply.tag) { + case CEPH_MSGR_TAG_FEATURES: + pr_err("%s%lld %s feature set mismatch," + " my %llx < server's %llx, missing %llx\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + sup_feat, server_feat, server_feat & ~sup_feat); + con->error_msg = "missing required protocol features"; + return -1; + + case CEPH_MSGR_TAG_BADPROTOVER: + pr_err("%s%lld %s protocol version mismatch," + " my %d != server's %d\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + le32_to_cpu(con->v1.out_connect.protocol_version), + le32_to_cpu(con->v1.in_reply.protocol_version)); + con->error_msg = "protocol version mismatch"; + return -1; + + case CEPH_MSGR_TAG_BADAUTHORIZER: + con->v1.auth_retry++; + dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, + con->v1.auth_retry); + if (con->v1.auth_retry == 2) { + con->error_msg = "connect authorization failure"; + return -1; + } + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_RESETSESSION: + /* + * If we connected with a large connect_seq but the peer + * has no record of a session with us (no connection, or + * connect_seq == 0), they will send RESETSESION to indicate + * that they must have reset their session, and may have + * dropped messages. 
+ */ + dout("process_connect got RESET peer seq %u\n", + le32_to_cpu(con->v1.in_reply.connect_seq)); + pr_info("%s%lld %s session reset\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr)); + ceph_con_reset_session(con); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + + /* Tell ceph about it. */ + mutex_unlock(&con->mutex); + if (con->ops->peer_reset) + con->ops->peer_reset(con); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V1_CONNECT_MSG) + return -EAGAIN; + break; + + case CEPH_MSGR_TAG_RETRY_SESSION: + /* + * If we sent a smaller connect_seq than the peer has, try + * again with a larger value. + */ + dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", + le32_to_cpu(con->v1.out_connect.connect_seq), + le32_to_cpu(con->v1.in_reply.connect_seq)); + con->v1.connect_seq = le32_to_cpu(con->v1.in_reply.connect_seq); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_RETRY_GLOBAL: + /* + * If we sent a smaller global_seq than the peer has, try + * again with a larger value. 
+ */ + dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", + con->v1.peer_global_seq, + le32_to_cpu(con->v1.in_reply.global_seq)); + ceph_get_global_seq(con->msgr, + le32_to_cpu(con->v1.in_reply.global_seq)); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_SEQ: + case CEPH_MSGR_TAG_READY: + if (req_feat & ~server_feat) { + pr_err("%s%lld %s protocol feature mismatch," + " my required %llx > server's %llx, need %llx\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + req_feat, server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; + return -1; + } + + WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG); + con->state = CEPH_CON_S_OPEN; + con->v1.auth_retry = 0; /* we authenticated; clear flag */ + con->v1.peer_global_seq = + le32_to_cpu(con->v1.in_reply.global_seq); + con->v1.connect_seq++; + con->peer_features = server_feat; + dout("process_connect got READY gseq %d cseq %d (%d)\n", + con->v1.peer_global_seq, + le32_to_cpu(con->v1.in_reply.connect_seq), + con->v1.connect_seq); + WARN_ON(con->v1.connect_seq != + le32_to_cpu(con->v1.in_reply.connect_seq)); + + if (con->v1.in_reply.flags & CEPH_MSG_CONNECT_LOSSY) + ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); + + con->delay = 0; /* reset backoff memory */ + + if (con->v1.in_reply.tag == CEPH_MSGR_TAG_SEQ) { + prepare_write_seq(con); + prepare_read_seq(con); + } else { + prepare_read_tag(con); + } + break; + + case CEPH_MSGR_TAG_WAIT: + /* + * If there is a connection race (we are opening + * connections to each other), one of us may just have + * to WAIT. This shouldn't happen if we are the + * client. 
+ */ + con->error_msg = "protocol error, got WAIT as client"; + return -1; + + default: + con->error_msg = "protocol error, garbage tag during connect"; + return -1; + } + return 0; +} + +/* + * read (part of) an ack + */ +static int read_partial_ack(struct ceph_connection *con) +{ + int size = sizeof(con->v1.in_temp_ack); + int end = size; + + return read_partial(con, end, size, &con->v1.in_temp_ack); +} + +/* + * We can finally discard anything that's been acked. + */ +static void process_ack(struct ceph_connection *con) +{ + u64 ack = le64_to_cpu(con->v1.in_temp_ack); + + if (con->v1.in_tag == CEPH_MSGR_TAG_ACK) + ceph_con_discard_sent(con, ack); + else + ceph_con_discard_requeued(con, ack); + + prepare_read_tag(con); +} + +static int read_partial_message_section(struct ceph_connection *con, + struct kvec *section, + unsigned int sec_len, u32 *crc) +{ + int ret, left; + + BUG_ON(!section); + + while (section->iov_len < sec_len) { + BUG_ON(section->iov_base == NULL); + left = sec_len - section->iov_len; + ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + + section->iov_len, left); + if (ret <= 0) + return ret; + section->iov_len += ret; + } + if (section->iov_len == sec_len) + *crc = crc32c(0, section->iov_base, section->iov_len); + + return 1; +} + +static int read_partial_msg_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->in_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + struct page *page; + size_t page_offset; + size_t length; + u32 crc = 0; + int ret; + + if (!msg->num_data_items) + return -EIO; + + if (do_datacrc) + crc = con->in_data_crc; + while (cursor->total_resid) { + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + + page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); + if (ret <= 0) { + if (do_datacrc) + con->in_data_crc = crc; + + return 
ret; + } + + if (do_datacrc) + crc = ceph_crc32c_page(crc, page, page_offset, ret); + ceph_msg_data_advance(cursor, (size_t)ret); + } + if (do_datacrc) + con->in_data_crc = crc; + + return 1; /* must return > 0 to indicate success */ +} + +/* + * read (part of) a message. + */ +static int read_partial_message(struct ceph_connection *con) +{ + struct ceph_msg *m = con->in_msg; + int size; + int end; + int ret; + unsigned int front_len, middle_len, data_len; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); + u64 seq; + u32 crc; + + dout("read_partial_message con %p msg %p\n", con, m); + + /* header */ + size = sizeof(con->v1.in_hdr); + end = size; + ret = read_partial(con, end, size, &con->v1.in_hdr); + if (ret <= 0) + return ret; + + crc = crc32c(0, &con->v1.in_hdr, offsetof(struct ceph_msg_header, crc)); + if (cpu_to_le32(crc) != con->v1.in_hdr.crc) { + pr_err("read_partial_message bad hdr crc %u != expected %u\n", + crc, con->v1.in_hdr.crc); + return -EBADMSG; + } + + front_len = le32_to_cpu(con->v1.in_hdr.front_len); + if (front_len > CEPH_MSG_MAX_FRONT_LEN) + return -EIO; + middle_len = le32_to_cpu(con->v1.in_hdr.middle_len); + if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) + return -EIO; + data_len = le32_to_cpu(con->v1.in_hdr.data_len); + if (data_len > CEPH_MSG_MAX_DATA_LEN) + return -EIO; + + /* verify seq# */ + seq = le64_to_cpu(con->v1.in_hdr.seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("skipping %s%lld %s seq %lld expected %lld\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + seq, con->in_seq + 1); + con->v1.in_base_pos = -front_len - middle_len - data_len - + sizeof_footer(con); + con->v1.in_tag = CEPH_MSGR_TAG_READY; + return 1; + } else if ((s64)seq - (s64)con->in_seq > 1) { + pr_err("read_partial_message bad seq %lld expected %lld\n", + seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADE; + } 
+ + /* allocate message? */ + if (!con->in_msg) { + int skip = 0; + + dout("got hdr type %d front %d data %d\n", con->v1.in_hdr.type, + front_len, data_len); + ret = ceph_con_in_msg_alloc(con, &con->v1.in_hdr, &skip); + if (ret < 0) + return ret; + + BUG_ON((!con->in_msg) ^ skip); + if (skip) { + /* skip this message */ + dout("alloc_msg said skip message\n"); + con->v1.in_base_pos = -front_len - middle_len - + data_len - sizeof_footer(con); + con->v1.in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + return 1; + } + + BUG_ON(!con->in_msg); + BUG_ON(con->in_msg->con != con); + m = con->in_msg; + m->front.iov_len = 0; /* haven't read it yet */ + if (m->middle) + m->middle->vec.iov_len = 0; + + /* prepare for data payload, if any */ + + if (data_len) + prepare_message_data(con->in_msg, data_len); + } + + /* front */ + ret = read_partial_message_section(con, &m->front, front_len, + &con->in_front_crc); + if (ret <= 0) + return ret; + + /* middle */ + if (m->middle) { + ret = read_partial_message_section(con, &m->middle->vec, + middle_len, + &con->in_middle_crc); + if (ret <= 0) + return ret; + } + + /* (page) data */ + if (data_len) { + ret = read_partial_msg_data(con); + if (ret <= 0) + return ret; + } + + /* footer */ + size = sizeof_footer(con); + end += size; + ret = read_partial(con, end, size, &m->footer); + if (ret <= 0) + return ret; + + if (!need_sign) { + m->footer.flags = m->old_footer.flags; + m->footer.sig = 0; + } + + dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", + m, front_len, m->footer.front_crc, middle_len, + m->footer.middle_crc, data_len, m->footer.data_crc); + + /* crc ok? */ + if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { + pr_err("read_partial_message %p front crc %u != exp. 
%u\n", + m, con->in_front_crc, m->footer.front_crc); + return -EBADMSG; + } + if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { + pr_err("read_partial_message %p middle crc %u != exp %u\n", + m, con->in_middle_crc, m->footer.middle_crc); + return -EBADMSG; + } + if (do_datacrc && + (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && + con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { + pr_err("read_partial_message %p data crc %u != exp. %u\n", m, + con->in_data_crc, le32_to_cpu(m->footer.data_crc)); + return -EBADMSG; + } + + if (need_sign && con->ops->check_message_signature && + con->ops->check_message_signature(m)) { + pr_err("read_partial_message %p signature check failed\n", m); + return -EBADMSG; + } + + return 1; /* done! */ +} + +static int read_keepalive_ack(struct ceph_connection *con) +{ + struct ceph_timespec ceph_ts; + size_t size = sizeof(ceph_ts); + int ret = read_partial(con, size, size, &ceph_ts); + if (ret <= 0) + return ret; + ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts); + prepare_read_tag(con); + return 1; +} + +/* + * Read what we can from the socket. + */ +int ceph_con_v1_try_read(struct ceph_connection *con) +{ + int ret = -1; + +more: + dout("try_read start %p state %d\n", con, con->state); + if (con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN) + return 0; + + BUG_ON(!con->sock); + + dout("try_read tag %d in_base_pos %d\n", con->v1.in_tag, + con->v1.in_base_pos); + + if (con->state == CEPH_CON_S_V1_BANNER) { + ret = read_partial_banner(con); + if (ret <= 0) + goto out; + ret = process_banner(con); + if (ret < 0) + goto out; + + con->state = CEPH_CON_S_V1_CONNECT_MSG; + + /* + * Received banner is good, exchange connection info. + * Do not reset out_kvec, as sending our banner raced + * with receiving peer banner after connect completed. 
+ */ + ret = prepare_write_connect(con); + if (ret < 0) + goto out; + prepare_read_connect(con); + + /* Send connection info before awaiting response */ + goto out; + } + + if (con->state == CEPH_CON_S_V1_CONNECT_MSG) { + ret = read_partial_connect(con); + if (ret <= 0) + goto out; + ret = process_connect(con); + if (ret < 0) + goto out; + goto more; + } + + WARN_ON(con->state != CEPH_CON_S_OPEN); + + if (con->v1.in_base_pos < 0) { + /* + * skipping + discarding content. + */ + ret = ceph_tcp_recvmsg(con->sock, NULL, -con->v1.in_base_pos); + if (ret <= 0) + goto out; + dout("skipped %d / %d bytes\n", ret, -con->v1.in_base_pos); + con->v1.in_base_pos += ret; + if (con->v1.in_base_pos) + goto more; + } + if (con->v1.in_tag == CEPH_MSGR_TAG_READY) { + /* + * what's next? + */ + ret = ceph_tcp_recvmsg(con->sock, &con->v1.in_tag, 1); + if (ret <= 0) + goto out; + dout("try_read got tag %d\n", con->v1.in_tag); + switch (con->v1.in_tag) { + case CEPH_MSGR_TAG_MSG: + prepare_read_message(con); + break; + case CEPH_MSGR_TAG_ACK: + prepare_read_ack(con); + break; + case CEPH_MSGR_TAG_KEEPALIVE2_ACK: + prepare_read_keepalive_ack(con); + break; + case CEPH_MSGR_TAG_CLOSE: + ceph_con_close_socket(con); + con->state = CEPH_CON_S_CLOSED; + goto out; + default: + goto bad_tag; + } + } + if (con->v1.in_tag == CEPH_MSGR_TAG_MSG) { + ret = read_partial_message(con); + if (ret <= 0) { + switch (ret) { + case -EBADMSG: + con->error_msg = "bad crc/signature"; + fallthrough; + case -EBADE: + ret = -EIO; + break; + case -EIO: + con->error_msg = "io error"; + break; + } + goto out; + } + if (con->v1.in_tag == CEPH_MSGR_TAG_READY) + goto more; + ceph_con_process_message(con); + if (con->state == CEPH_CON_S_OPEN) + prepare_read_tag(con); + goto more; + } + if (con->v1.in_tag == CEPH_MSGR_TAG_ACK || + con->v1.in_tag == CEPH_MSGR_TAG_SEQ) { + /* + * the final handshake seq exchange is semantically + * equivalent to an ACK + */ + ret = read_partial_ack(con); + if (ret <= 0) + goto out; + 
process_ack(con); + goto more; + } + if (con->v1.in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { + ret = read_keepalive_ack(con); + if (ret <= 0) + goto out; + goto more; + } + +out: + dout("try_read done on %p ret %d\n", con, ret); + return ret; + +bad_tag: + pr_err("try_read bad tag %d\n", con->v1.in_tag); + con->error_msg = "protocol error, garbage tag"; + ret = -1; + goto out; +} + +/* + * Write something to the socket. Called in a worker thread when the + * socket appears to be writeable and we have something ready to send. + */ +int ceph_con_v1_try_write(struct ceph_connection *con) +{ + int ret = 1; + + dout("try_write start %p state %d\n", con, con->state); + if (con->state != CEPH_CON_S_PREOPEN && + con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN) + return 0; + + /* open the socket first? */ + if (con->state == CEPH_CON_S_PREOPEN) { + BUG_ON(con->sock); + con->state = CEPH_CON_S_V1_BANNER; + + con_out_kvec_reset(con); + prepare_write_banner(con); + prepare_read_banner(con); + + BUG_ON(con->in_msg); + con->v1.in_tag = CEPH_MSGR_TAG_READY; + dout("try_write initiating connect on %p new state %d\n", + con, con->state); + ret = ceph_tcp_connect(con); + if (ret < 0) { + con->error_msg = "connect error"; + goto out; + } + } + +more: + dout("try_write out_kvec_bytes %d\n", con->v1.out_kvec_bytes); + BUG_ON(!con->sock); + + /* kvec data queued? */ + if (con->v1.out_kvec_left) { + ret = write_partial_kvec(con); + if (ret <= 0) + goto out; + } + if (con->v1.out_skip) { + ret = write_partial_skip(con); + if (ret <= 0) + goto out; + } + + /* msg pages? */ + if (con->out_msg) { + if (con->v1.out_msg_done) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; /* we're done with this one */ + goto do_next; + } + + ret = write_partial_message_data(con); + if (ret == 1) + goto more; /* we need to send the footer, too! 
*/ + if (ret == 0) + goto out; + if (ret < 0) { + dout("try_write write_partial_message_data err %d\n", + ret); + goto out; + } + } + +do_next: + if (con->state == CEPH_CON_S_OPEN) { + if (ceph_con_flag_test_and_clear(con, + CEPH_CON_F_KEEPALIVE_PENDING)) { + prepare_write_keepalive(con); + goto more; + } + /* is anything else pending? */ + if (!list_empty(&con->out_queue)) { + prepare_write_message(con); + goto more; + } + if (con->in_seq > con->in_seq_acked) { + prepare_write_ack(con); + goto more; + } + } + + /* Nothing to do! */ + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + dout("try_write nothing else to write.\n"); + ret = 0; +out: + dout("try_write done on %p ret %d\n", con, ret); + return ret; +} + +void ceph_con_v1_revoke(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + WARN_ON(con->v1.out_skip); + /* footer */ + if (con->v1.out_msg_done) { + con->v1.out_skip += con_out_kvec_skip(con); + } else { + WARN_ON(!msg->data_length); + con->v1.out_skip += sizeof_footer(con); + } + /* data, middle, front */ + if (msg->data_length) + con->v1.out_skip += msg->cursor.total_resid; + if (msg->middle) + con->v1.out_skip += con_out_kvec_skip(con); + con->v1.out_skip += con_out_kvec_skip(con); + + dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con, + con->v1.out_kvec_bytes, con->v1.out_skip); +} + +void ceph_con_v1_revoke_incoming(struct ceph_connection *con) +{ + unsigned int front_len = le32_to_cpu(con->v1.in_hdr.front_len); + unsigned int middle_len = le32_to_cpu(con->v1.in_hdr.middle_len); + unsigned int data_len = le32_to_cpu(con->v1.in_hdr.data_len); + + /* skip rest of message */ + con->v1.in_base_pos = con->v1.in_base_pos - + sizeof(struct ceph_msg_header) - + front_len - + middle_len - + data_len - + sizeof(struct ceph_msg_footer); + + con->v1.in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + + dout("%s con %p in_base_pos %d\n", __func__, con, con->v1.in_base_pos); +} + +bool ceph_con_v1_opened(struct ceph_connection 
*con) +{ + return con->v1.connect_seq; +} + +void ceph_con_v1_reset_session(struct ceph_connection *con) +{ + con->v1.connect_seq = 0; + con->v1.peer_global_seq = 0; +} + +void ceph_con_v1_reset_protocol(struct ceph_connection *con) +{ + con->v1.out_skip = 0; +} diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c new file mode 100644 index 000000000000..cc40ce4e02fb --- /dev/null +++ b/net/ceph/messenger_v2.c @@ -0,0 +1,3459 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ceph msgr2 protocol implementation + * + * Copyright (C) 2020 Ilya Dryomov <idryomov@gmail.com> + */ + +#include <linux/ceph/ceph_debug.h> + +#include <crypto/aead.h> +#include <crypto/algapi.h> /* for crypto_memneq() */ +#include <crypto/hash.h> +#include <crypto/sha2.h> +#include <linux/bvec.h> +#include <linux/crc32c.h> +#include <linux/net.h> +#include <linux/scatterlist.h> +#include <linux/socket.h> +#include <linux/sched/mm.h> +#include <net/sock.h> +#include <net/tcp.h> + +#include <linux/ceph/ceph_features.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/libceph.h> +#include <linux/ceph/messenger.h> + +#include "crypto.h" /* for CEPH_KEY_LEN and CEPH_MAX_CON_SECRET_LEN */ + +#define FRAME_TAG_HELLO 1 +#define FRAME_TAG_AUTH_REQUEST 2 +#define FRAME_TAG_AUTH_BAD_METHOD 3 +#define FRAME_TAG_AUTH_REPLY_MORE 4 +#define FRAME_TAG_AUTH_REQUEST_MORE 5 +#define FRAME_TAG_AUTH_DONE 6 +#define FRAME_TAG_AUTH_SIGNATURE 7 +#define FRAME_TAG_CLIENT_IDENT 8 +#define FRAME_TAG_SERVER_IDENT 9 +#define FRAME_TAG_IDENT_MISSING_FEATURES 10 +#define FRAME_TAG_SESSION_RECONNECT 11 +#define FRAME_TAG_SESSION_RESET 12 +#define FRAME_TAG_SESSION_RETRY 13 +#define FRAME_TAG_SESSION_RETRY_GLOBAL 14 +#define FRAME_TAG_SESSION_RECONNECT_OK 15 +#define FRAME_TAG_WAIT 16 +#define FRAME_TAG_MESSAGE 17 +#define FRAME_TAG_KEEPALIVE2 18 +#define FRAME_TAG_KEEPALIVE2_ACK 19 +#define FRAME_TAG_ACK 20 + +#define FRAME_LATE_STATUS_ABORTED 0x1 +#define FRAME_LATE_STATUS_COMPLETE 0xe +#define 
FRAME_LATE_STATUS_ABORTED_MASK 0xf + +#define IN_S_HANDLE_PREAMBLE 1 +#define IN_S_HANDLE_CONTROL 2 +#define IN_S_HANDLE_CONTROL_REMAINDER 3 +#define IN_S_PREPARE_READ_DATA 4 +#define IN_S_PREPARE_READ_DATA_CONT 5 +#define IN_S_HANDLE_EPILOGUE 6 +#define IN_S_FINISH_SKIP 7 + +#define OUT_S_QUEUE_DATA 1 +#define OUT_S_QUEUE_DATA_CONT 2 +#define OUT_S_QUEUE_ENC_PAGE 3 +#define OUT_S_QUEUE_ZEROS 4 +#define OUT_S_FINISH_MESSAGE 5 +#define OUT_S_GET_NEXT 6 + +#define CTRL_BODY(p) ((void *)(p) + CEPH_PREAMBLE_LEN) +#define FRONT_PAD(p) ((void *)(p) + CEPH_EPILOGUE_SECURE_LEN) +#define MIDDLE_PAD(p) (FRONT_PAD(p) + CEPH_GCM_BLOCK_LEN) +#define DATA_PAD(p) (MIDDLE_PAD(p) + CEPH_GCM_BLOCK_LEN) + +#define CEPH_MSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) + +static int do_recvmsg(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + int ret; + + msg.msg_iter = *it; + while (iov_iter_count(it)) { + ret = sock_recvmsg(sock, &msg, msg.msg_flags); + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + WARN_ON(msg_data_left(&msg)); + return 1; +} + +/* + * Read as much as possible. + * + * Return: + * 1 - done, nothing (else) to read + * 0 - socket is empty, need to wait + * <0 - error + */ +static int ceph_tcp_recv(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p %s %zu\n", __func__, con, + iov_iter_is_discard(&con->v2.in_iter) ? 
"discard" : "need", + iov_iter_count(&con->v2.in_iter)); + ret = do_recvmsg(con->sock, &con->v2.in_iter); + dout("%s con %p ret %d left %zu\n", __func__, con, ret, + iov_iter_count(&con->v2.in_iter)); + return ret; +} + +static int do_sendmsg(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + int ret; + + msg.msg_iter = *it; + while (iov_iter_count(it)) { + ret = sock_sendmsg(sock, &msg); + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + WARN_ON(msg_data_left(&msg)); + return 1; +} + +static int do_try_sendpage(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + struct bio_vec bv; + int ret; + + if (WARN_ON(!iov_iter_is_bvec(it))) + return -EINVAL; + + while (iov_iter_count(it)) { + /* iov_iter_iovec() for ITER_BVEC */ + bv.bv_page = it->bvec->bv_page; + bv.bv_offset = it->bvec->bv_offset + it->iov_offset; + bv.bv_len = min(iov_iter_count(it), + it->bvec->bv_len - it->iov_offset); + + /* + * sendpage cannot properly handle pages with + * page_count == 0, we need to fall back to sendmsg if + * that's the case. + * + * Same goes for slab pages: skb_can_coalesce() allows + * coalescing neighboring slab objects into a single frag + * which triggers one of hardened usercopy checks. + */ + if (sendpage_ok(bv.bv_page)) { + ret = sock->ops->sendpage(sock, bv.bv_page, + bv.bv_offset, bv.bv_len, + CEPH_MSG_FLAGS); + } else { + iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, bv.bv_len); + ret = sock_sendmsg(sock, &msg); + } + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + return 1; +} + +/* + * Write as much as possible. The socket is expected to be corked, + * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here. 
+ * + * Return: + * 1 - done, nothing (else) to write + * 0 - socket is full, need to wait + * <0 - error + */ +static int ceph_tcp_send(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p have %zu try_sendpage %d\n", __func__, con, + iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage); + if (con->v2.out_iter_sendpage) + ret = do_try_sendpage(con->sock, &con->v2.out_iter); + else + ret = do_sendmsg(con->sock, &con->v2.out_iter); + dout("%s con %p ret %d left %zu\n", __func__, con, ret, + iov_iter_count(&con->v2.out_iter)); + return ret; +} + +static void add_in_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.in_kvec_cnt >= ARRAY_SIZE(con->v2.in_kvecs)); + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + + con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_base = buf; + con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_len = len; + con->v2.in_kvec_cnt++; + + con->v2.in_iter.nr_segs++; + con->v2.in_iter.count += len; +} + +static void reset_in_kvecs(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + con->v2.in_kvec_cnt = 0; + iov_iter_kvec(&con->v2.in_iter, READ, con->v2.in_kvecs, 0, 0); +} + +static void set_in_bvec(struct ceph_connection *con, const struct bio_vec *bv) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + con->v2.in_bvec = *bv; + iov_iter_bvec(&con->v2.in_iter, READ, &con->v2.in_bvec, 1, bv->bv_len); +} + +static void set_in_skip(struct ceph_connection *con, int len) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + dout("%s con %p len %d\n", __func__, con, len); + iov_iter_discard(&con->v2.in_iter, READ, len); +} + +static void add_out_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.out_kvec_cnt >= ARRAY_SIZE(con->v2.out_kvecs)); + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_base = buf; + con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_len = len; + con->v2.out_kvec_cnt++; + + 
con->v2.out_iter.nr_segs++; + con->v2.out_iter.count += len; +} + +static void reset_out_kvecs(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_kvec_cnt = 0; + + iov_iter_kvec(&con->v2.out_iter, WRITE, con->v2.out_kvecs, 0, 0); + con->v2.out_iter_sendpage = false; +} + +static void set_out_bvec(struct ceph_connection *con, const struct bio_vec *bv, + bool zerocopy) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_bvec = *bv; + con->v2.out_iter_sendpage = zerocopy; + iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1, + con->v2.out_bvec.bv_len); +} + +static void set_out_bvec_zero(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(!con->v2.out_zero); + + con->v2.out_bvec.bv_page = ceph_zero_page; + con->v2.out_bvec.bv_offset = 0; + con->v2.out_bvec.bv_len = min(con->v2.out_zero, (int)PAGE_SIZE); + con->v2.out_iter_sendpage = true; + iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1, + con->v2.out_bvec.bv_len); +} + +static void out_zero_add(struct ceph_connection *con, int len) +{ + dout("%s con %p len %d\n", __func__, con, len); + con->v2.out_zero += len; +} + +static void *alloc_conn_buf(struct ceph_connection *con, int len) +{ + void *buf; + + dout("%s con %p len %d\n", __func__, con, len); + + if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs))) + return NULL; + + buf = ceph_kvmalloc(len, GFP_NOIO); + if (!buf) + return NULL; + + con->v2.conn_bufs[con->v2.conn_buf_cnt++] = buf; + return buf; +} + +static void free_conn_bufs(struct ceph_connection *con) +{ + while (con->v2.conn_buf_cnt) + kvfree(con->v2.conn_bufs[--con->v2.conn_buf_cnt]); +} + +static void add_in_sign_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.in_sign_kvec_cnt >= ARRAY_SIZE(con->v2.in_sign_kvecs)); + + con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_base = buf; + 
con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_len = len; + con->v2.in_sign_kvec_cnt++; +} + +static void clear_in_sign_kvecs(struct ceph_connection *con) +{ + con->v2.in_sign_kvec_cnt = 0; +} + +static void add_out_sign_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.out_sign_kvec_cnt >= ARRAY_SIZE(con->v2.out_sign_kvecs)); + + con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_base = buf; + con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_len = len; + con->v2.out_sign_kvec_cnt++; +} + +static void clear_out_sign_kvecs(struct ceph_connection *con) +{ + con->v2.out_sign_kvec_cnt = 0; +} + +static bool con_secure(struct ceph_connection *con) +{ + return con->v2.con_mode == CEPH_CON_MODE_SECURE; +} + +static int front_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.front_len); +} + +static int middle_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.middle_len); +} + +static int data_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.data_len); +} + +static bool need_padding(int len) +{ + return !IS_ALIGNED(len, CEPH_GCM_BLOCK_LEN); +} + +static int padded_len(int len) +{ + return ALIGN(len, CEPH_GCM_BLOCK_LEN); +} + +static int padding_len(int len) +{ + return padded_len(len) - len; +} + +/* preamble + control segment */ +static int head_onwire_len(int ctrl_len, bool secure) +{ + int head_len; + int rem_len; + + if (secure) { + head_len = CEPH_PREAMBLE_SECURE_LEN; + if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) { + rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + head_len += padded_len(rem_len) + CEPH_GCM_TAG_LEN; + } + } else { + head_len = CEPH_PREAMBLE_PLAIN_LEN; + if (ctrl_len) + head_len += ctrl_len + CEPH_CRC_LEN; + } + return head_len; +} + +/* front, middle and data segments + epilogue */ +static int __tail_onwire_len(int front_len, int middle_len, int data_len, + bool secure) +{ + if (!front_len && !middle_len && !data_len) + return 0; + + if (!secure) + return front_len + 
middle_len + data_len + + CEPH_EPILOGUE_PLAIN_LEN; + + return padded_len(front_len) + padded_len(middle_len) + + padded_len(data_len) + CEPH_EPILOGUE_SECURE_LEN; +} + +static int tail_onwire_len(const struct ceph_msg *msg, bool secure) +{ + return __tail_onwire_len(front_len(msg), middle_len(msg), + data_len(msg), secure); +} + +/* head_onwire_len(sizeof(struct ceph_msg_header2), false) */ +#define MESSAGE_HEAD_PLAIN_LEN (CEPH_PREAMBLE_PLAIN_LEN + \ + sizeof(struct ceph_msg_header2) + \ + CEPH_CRC_LEN) + +static const int frame_aligns[] = { + sizeof(void *), + sizeof(void *), + sizeof(void *), + PAGE_SIZE +}; + +/* + * Discards trailing empty segments, unless there is just one segment. + * A frame always has at least one (possibly empty) segment. + */ +static int calc_segment_count(const int *lens, int len_cnt) +{ + int i; + + for (i = len_cnt - 1; i >= 0; i--) { + if (lens[i]) + return i + 1; + } + + return 1; +} + +static void init_frame_desc(struct ceph_frame_desc *desc, int tag, + const int *lens, int len_cnt) +{ + int i; + + memset(desc, 0, sizeof(*desc)); + + desc->fd_tag = tag; + desc->fd_seg_cnt = calc_segment_count(lens, len_cnt); + BUG_ON(desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT); + for (i = 0; i < desc->fd_seg_cnt; i++) { + desc->fd_lens[i] = lens[i]; + desc->fd_aligns[i] = frame_aligns[i]; + } +} + +/* + * Preamble crc covers everything up to itself (28 bytes) and + * is calculated and verified irrespective of the connection mode + * (i.e. even if the frame is encrypted). 
+ */ +static void encode_preamble(const struct ceph_frame_desc *desc, void *p) +{ + void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN; + void *start = p; + int i; + + memset(p, 0, CEPH_PREAMBLE_LEN); + + ceph_encode_8(&p, desc->fd_tag); + ceph_encode_8(&p, desc->fd_seg_cnt); + for (i = 0; i < desc->fd_seg_cnt; i++) { + ceph_encode_32(&p, desc->fd_lens[i]); + ceph_encode_16(&p, desc->fd_aligns[i]); + } + + put_unaligned_le32(crc32c(0, start, crcp - start), crcp); +} + +static int decode_preamble(void *p, struct ceph_frame_desc *desc) +{ + void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN; + u32 crc, expected_crc; + int i; + + crc = crc32c(0, p, crcp - p); + expected_crc = get_unaligned_le32(crcp); + if (crc != expected_crc) { + pr_err("bad preamble crc, calculated %u, expected %u\n", + crc, expected_crc); + return -EBADMSG; + } + + memset(desc, 0, sizeof(*desc)); + + desc->fd_tag = ceph_decode_8(&p); + desc->fd_seg_cnt = ceph_decode_8(&p); + if (desc->fd_seg_cnt < 1 || + desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT) { + pr_err("bad segment count %d\n", desc->fd_seg_cnt); + return -EINVAL; + } + for (i = 0; i < desc->fd_seg_cnt; i++) { + desc->fd_lens[i] = ceph_decode_32(&p); + desc->fd_aligns[i] = ceph_decode_16(&p); + } + + /* + * This would fire for FRAME_TAG_WAIT (it has one empty + * segment), but we should never get it as client. 
+ */ + if (!desc->fd_lens[desc->fd_seg_cnt - 1]) { + pr_err("last segment empty\n"); + return -EINVAL; + } + + if (desc->fd_lens[0] > CEPH_MSG_MAX_CONTROL_LEN) { + pr_err("control segment too big %d\n", desc->fd_lens[0]); + return -EINVAL; + } + if (desc->fd_lens[1] > CEPH_MSG_MAX_FRONT_LEN) { + pr_err("front segment too big %d\n", desc->fd_lens[1]); + return -EINVAL; + } + if (desc->fd_lens[2] > CEPH_MSG_MAX_MIDDLE_LEN) { + pr_err("middle segment too big %d\n", desc->fd_lens[2]); + return -EINVAL; + } + if (desc->fd_lens[3] > CEPH_MSG_MAX_DATA_LEN) { + pr_err("data segment too big %d\n", desc->fd_lens[3]); + return -EINVAL; + } + + return 0; +} + +static void encode_epilogue_plain(struct ceph_connection *con, bool aborted) +{ + con->v2.out_epil.late_status = aborted ? FRAME_LATE_STATUS_ABORTED : + FRAME_LATE_STATUS_COMPLETE; + cpu_to_le32s(&con->v2.out_epil.front_crc); + cpu_to_le32s(&con->v2.out_epil.middle_crc); + cpu_to_le32s(&con->v2.out_epil.data_crc); +} + +static void encode_epilogue_secure(struct ceph_connection *con, bool aborted) +{ + memset(&con->v2.out_epil, 0, sizeof(con->v2.out_epil)); + con->v2.out_epil.late_status = aborted ? 
FRAME_LATE_STATUS_ABORTED : + FRAME_LATE_STATUS_COMPLETE; +} + +static int decode_epilogue(void *p, u32 *front_crc, u32 *middle_crc, + u32 *data_crc) +{ + u8 late_status; + + late_status = ceph_decode_8(&p); + if ((late_status & FRAME_LATE_STATUS_ABORTED_MASK) != + FRAME_LATE_STATUS_COMPLETE) { + /* we should never get an aborted message as client */ + pr_err("bad late_status 0x%x\n", late_status); + return -EINVAL; + } + + if (front_crc && middle_crc && data_crc) { + *front_crc = ceph_decode_32(&p); + *middle_crc = ceph_decode_32(&p); + *data_crc = ceph_decode_32(&p); + } + + return 0; +} + +static void fill_header(struct ceph_msg_header *hdr, + const struct ceph_msg_header2 *hdr2, + int front_len, int middle_len, int data_len, + const struct ceph_entity_name *peer_name) +{ + hdr->seq = hdr2->seq; + hdr->tid = hdr2->tid; + hdr->type = hdr2->type; + hdr->priority = hdr2->priority; + hdr->version = hdr2->version; + hdr->front_len = cpu_to_le32(front_len); + hdr->middle_len = cpu_to_le32(middle_len); + hdr->data_len = cpu_to_le32(data_len); + hdr->data_off = hdr2->data_off; + hdr->src = *peer_name; + hdr->compat_version = hdr2->compat_version; + hdr->reserved = 0; + hdr->crc = 0; +} + +static void fill_header2(struct ceph_msg_header2 *hdr2, + const struct ceph_msg_header *hdr, u64 ack_seq) +{ + hdr2->seq = hdr->seq; + hdr2->tid = hdr->tid; + hdr2->type = hdr->type; + hdr2->priority = hdr->priority; + hdr2->version = hdr->version; + hdr2->data_pre_padding_len = 0; + hdr2->data_off = hdr->data_off; + hdr2->ack_seq = cpu_to_le64(ack_seq); + hdr2->flags = 0; + hdr2->compat_version = hdr->compat_version; + hdr2->reserved = 0; +} + +static int verify_control_crc(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + u32 crc, expected_crc; + + WARN_ON(con->v2.in_kvecs[0].iov_len != ctrl_len); + WARN_ON(con->v2.in_kvecs[1].iov_len != CEPH_CRC_LEN); + + crc = crc32c(-1, con->v2.in_kvecs[0].iov_base, ctrl_len); + expected_crc = 
get_unaligned_le32(con->v2.in_kvecs[1].iov_base); + if (crc != expected_crc) { + pr_err("bad control crc, calculated %u, expected %u\n", + crc, expected_crc); + return -EBADMSG; + } + + return 0; +} + +static int verify_epilogue_crcs(struct ceph_connection *con, u32 front_crc, + u32 middle_crc, u32 data_crc) +{ + if (front_len(con->in_msg)) { + con->in_front_crc = crc32c(-1, con->in_msg->front.iov_base, + front_len(con->in_msg)); + } else { + WARN_ON(!middle_len(con->in_msg) && !data_len(con->in_msg)); + con->in_front_crc = -1; + } + + if (middle_len(con->in_msg)) + con->in_middle_crc = crc32c(-1, + con->in_msg->middle->vec.iov_base, + middle_len(con->in_msg)); + else if (data_len(con->in_msg)) + con->in_middle_crc = -1; + else + con->in_middle_crc = 0; + + if (!data_len(con->in_msg)) + con->in_data_crc = 0; + + dout("%s con %p msg %p crcs %u %u %u\n", __func__, con, con->in_msg, + con->in_front_crc, con->in_middle_crc, con->in_data_crc); + + if (con->in_front_crc != front_crc) { + pr_err("bad front crc, calculated %u, expected %u\n", + con->in_front_crc, front_crc); + return -EBADMSG; + } + if (con->in_middle_crc != middle_crc) { + pr_err("bad middle crc, calculated %u, expected %u\n", + con->in_middle_crc, middle_crc); + return -EBADMSG; + } + if (con->in_data_crc != data_crc) { + pr_err("bad data crc, calculated %u, expected %u\n", + con->in_data_crc, data_crc); + return -EBADMSG; + } + + return 0; +} + +static int setup_crypto(struct ceph_connection *con, + const u8 *session_key, int session_key_len, + const u8 *con_secret, int con_secret_len) +{ + unsigned int noio_flag; + int ret; + + dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n", + __func__, con, con->v2.con_mode, session_key_len, con_secret_len); + WARN_ON(con->v2.hmac_tfm || con->v2.gcm_tfm || con->v2.gcm_req); + + if (con->v2.con_mode != CEPH_CON_MODE_CRC && + con->v2.con_mode != CEPH_CON_MODE_SECURE) { + pr_err("bad con_mode %d\n", con->v2.con_mode); + return -EINVAL; + } + + if 
(!session_key_len) { + WARN_ON(con->v2.con_mode != CEPH_CON_MODE_CRC); + WARN_ON(con_secret_len); + return 0; /* auth_none */ + } + + noio_flag = memalloc_noio_save(); + con->v2.hmac_tfm = crypto_alloc_shash("hmac(sha256)", 0, 0); + memalloc_noio_restore(noio_flag); + if (IS_ERR(con->v2.hmac_tfm)) { + ret = PTR_ERR(con->v2.hmac_tfm); + con->v2.hmac_tfm = NULL; + pr_err("failed to allocate hmac tfm context: %d\n", ret); + return ret; + } + + WARN_ON((unsigned long)session_key & + crypto_shash_alignmask(con->v2.hmac_tfm)); + ret = crypto_shash_setkey(con->v2.hmac_tfm, session_key, + session_key_len); + if (ret) { + pr_err("failed to set hmac key: %d\n", ret); + return ret; + } + + if (con->v2.con_mode == CEPH_CON_MODE_CRC) { + WARN_ON(con_secret_len); + return 0; /* auth_x, plain mode */ + } + + if (con_secret_len < CEPH_GCM_KEY_LEN + 2 * CEPH_GCM_IV_LEN) { + pr_err("con_secret too small %d\n", con_secret_len); + return -EINVAL; + } + + noio_flag = memalloc_noio_save(); + con->v2.gcm_tfm = crypto_alloc_aead("gcm(aes)", 0, 0); + memalloc_noio_restore(noio_flag); + if (IS_ERR(con->v2.gcm_tfm)) { + ret = PTR_ERR(con->v2.gcm_tfm); + con->v2.gcm_tfm = NULL; + pr_err("failed to allocate gcm tfm context: %d\n", ret); + return ret; + } + + WARN_ON((unsigned long)con_secret & + crypto_aead_alignmask(con->v2.gcm_tfm)); + ret = crypto_aead_setkey(con->v2.gcm_tfm, con_secret, CEPH_GCM_KEY_LEN); + if (ret) { + pr_err("failed to set gcm key: %d\n", ret); + return ret; + } + + WARN_ON(crypto_aead_ivsize(con->v2.gcm_tfm) != CEPH_GCM_IV_LEN); + ret = crypto_aead_setauthsize(con->v2.gcm_tfm, CEPH_GCM_TAG_LEN); + if (ret) { + pr_err("failed to set gcm tag size: %d\n", ret); + return ret; + } + + con->v2.gcm_req = aead_request_alloc(con->v2.gcm_tfm, GFP_NOIO); + if (!con->v2.gcm_req) { + pr_err("failed to allocate gcm request\n"); + return -ENOMEM; + } + + crypto_init_wait(&con->v2.gcm_wait); + aead_request_set_callback(con->v2.gcm_req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, 
&con->v2.gcm_wait); + + memcpy(&con->v2.in_gcm_nonce, con_secret + CEPH_GCM_KEY_LEN, + CEPH_GCM_IV_LEN); + memcpy(&con->v2.out_gcm_nonce, + con_secret + CEPH_GCM_KEY_LEN + CEPH_GCM_IV_LEN, + CEPH_GCM_IV_LEN); + return 0; /* auth_x, secure mode */ +} + +static int hmac_sha256(struct ceph_connection *con, const struct kvec *kvecs, + int kvec_cnt, u8 *hmac) +{ + SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm); /* tfm arg is ignored */ + int ret; + int i; + + dout("%s con %p hmac_tfm %p kvec_cnt %d\n", __func__, con, + con->v2.hmac_tfm, kvec_cnt); + + if (!con->v2.hmac_tfm) { + memset(hmac, 0, SHA256_DIGEST_SIZE); + return 0; /* auth_none */ + } + + desc->tfm = con->v2.hmac_tfm; + ret = crypto_shash_init(desc); + if (ret) + goto out; + + for (i = 0; i < kvec_cnt; i++) { + WARN_ON((unsigned long)kvecs[i].iov_base & + crypto_shash_alignmask(con->v2.hmac_tfm)); + ret = crypto_shash_update(desc, kvecs[i].iov_base, + kvecs[i].iov_len); + if (ret) + goto out; + } + + ret = crypto_shash_final(desc, hmac); + +out: + shash_desc_zero(desc); + return ret; /* auth_x, both plain and secure modes */ +} + +static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce) +{ + u64 counter; + + counter = le64_to_cpu(nonce->counter); + nonce->counter = cpu_to_le64(counter + 1); +} + +static int gcm_crypt(struct ceph_connection *con, bool encrypt, + struct scatterlist *src, struct scatterlist *dst, + int src_len) +{ + struct ceph_gcm_nonce *nonce; + int ret; + + nonce = encrypt ? &con->v2.out_gcm_nonce : &con->v2.in_gcm_nonce; + + aead_request_set_ad(con->v2.gcm_req, 0); /* no AAD */ + aead_request_set_crypt(con->v2.gcm_req, src, dst, src_len, (u8 *)nonce); + ret = crypto_wait_req(encrypt ? 
crypto_aead_encrypt(con->v2.gcm_req) : + crypto_aead_decrypt(con->v2.gcm_req), + &con->v2.gcm_wait); + if (ret) + return ret; + + gcm_inc_nonce(nonce); + return 0; +} + +static void get_bvec_at(struct ceph_msg_data_cursor *cursor, + struct bio_vec *bv) +{ + struct page *page; + size_t off, len; + + WARN_ON(!cursor->total_resid); + + /* skip zero-length data items */ + while (!cursor->resid) + ceph_msg_data_advance(cursor, 0); + + /* get a piece of data, cursor isn't advanced */ + page = ceph_msg_data_next(cursor, &off, &len, NULL); + + bv->bv_page = page; + bv->bv_offset = off; + bv->bv_len = len; +} + +static int calc_sg_cnt(void *buf, int buf_len) +{ + int sg_cnt; + + if (!buf_len) + return 0; + + sg_cnt = need_padding(buf_len) ? 1 : 0; + if (is_vmalloc_addr(buf)) { + WARN_ON(offset_in_page(buf)); + sg_cnt += PAGE_ALIGN(buf_len) >> PAGE_SHIFT; + } else { + sg_cnt++; + } + + return sg_cnt; +} + +static int calc_sg_cnt_cursor(struct ceph_msg_data_cursor *cursor) +{ + int data_len = cursor->total_resid; + struct bio_vec bv; + int sg_cnt; + + if (!data_len) + return 0; + + sg_cnt = need_padding(data_len) ? 
1 : 0; + do { + get_bvec_at(cursor, &bv); + sg_cnt++; + + ceph_msg_data_advance(cursor, bv.bv_len); + } while (cursor->total_resid); + + return sg_cnt; +} + +static void init_sgs(struct scatterlist **sg, void *buf, int buf_len, u8 *pad) +{ + void *end = buf + buf_len; + struct page *page; + int len; + void *p; + + if (!buf_len) + return; + + if (is_vmalloc_addr(buf)) { + p = buf; + do { + page = vmalloc_to_page(p); + len = min_t(int, end - p, PAGE_SIZE); + WARN_ON(!page || !len || offset_in_page(p)); + sg_set_page(*sg, page, len, 0); + *sg = sg_next(*sg); + p += len; + } while (p != end); + } else { + sg_set_buf(*sg, buf, buf_len); + *sg = sg_next(*sg); + } + + if (need_padding(buf_len)) { + sg_set_buf(*sg, pad, padding_len(buf_len)); + *sg = sg_next(*sg); + } +} + +static void init_sgs_cursor(struct scatterlist **sg, + struct ceph_msg_data_cursor *cursor, u8 *pad) +{ + int data_len = cursor->total_resid; + struct bio_vec bv; + + if (!data_len) + return; + + do { + get_bvec_at(cursor, &bv); + sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); + *sg = sg_next(*sg); + + ceph_msg_data_advance(cursor, bv.bv_len); + } while (cursor->total_resid); + + if (need_padding(data_len)) { + sg_set_buf(*sg, pad, padding_len(data_len)); + *sg = sg_next(*sg); + } +} + +static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg, + u8 *front_pad, u8 *middle_pad, u8 *data_pad, + void *epilogue, bool add_tag) +{ + struct ceph_msg_data_cursor cursor; + struct scatterlist *cur_sg; + int sg_cnt; + int ret; + + if (!front_len(msg) && !middle_len(msg) && !data_len(msg)) + return 0; + + sg_cnt = 1; /* epilogue + [auth tag] */ + if (front_len(msg)) + sg_cnt += calc_sg_cnt(msg->front.iov_base, + front_len(msg)); + if (middle_len(msg)) + sg_cnt += calc_sg_cnt(msg->middle->vec.iov_base, + middle_len(msg)); + if (data_len(msg)) { + ceph_msg_data_cursor_init(&cursor, msg, data_len(msg)); + sg_cnt += calc_sg_cnt_cursor(&cursor); + } + + ret = sg_alloc_table(sgt, sg_cnt, GFP_NOIO); 
+ if (ret) + return ret; + + cur_sg = sgt->sgl; + if (front_len(msg)) + init_sgs(&cur_sg, msg->front.iov_base, front_len(msg), + front_pad); + if (middle_len(msg)) + init_sgs(&cur_sg, msg->middle->vec.iov_base, middle_len(msg), + middle_pad); + if (data_len(msg)) { + ceph_msg_data_cursor_init(&cursor, msg, data_len(msg)); + init_sgs_cursor(&cur_sg, &cursor, data_pad); + } + + WARN_ON(!sg_is_last(cur_sg)); + sg_set_buf(cur_sg, epilogue, + CEPH_GCM_BLOCK_LEN + (add_tag ? CEPH_GCM_TAG_LEN : 0)); + return 0; +} + +static int decrypt_preamble(struct ceph_connection *con) +{ + struct scatterlist sg; + + sg_init_one(&sg, con->v2.in_buf, CEPH_PREAMBLE_SECURE_LEN); + return gcm_crypt(con, false, &sg, &sg, CEPH_PREAMBLE_SECURE_LEN); +} + +static int decrypt_control_remainder(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + int pt_len = padding_len(rem_len) + CEPH_GCM_TAG_LEN; + struct scatterlist sgs[2]; + + WARN_ON(con->v2.in_kvecs[0].iov_len != rem_len); + WARN_ON(con->v2.in_kvecs[1].iov_len != pt_len); + + sg_init_table(sgs, 2); + sg_set_buf(&sgs[0], con->v2.in_kvecs[0].iov_base, rem_len); + sg_set_buf(&sgs[1], con->v2.in_buf, pt_len); + + return gcm_crypt(con, false, sgs, sgs, + padded_len(rem_len) + CEPH_GCM_TAG_LEN); +} + +static int decrypt_message(struct ceph_connection *con) +{ + struct sg_table sgt = {}; + int ret; + + ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf), + MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf), + con->v2.in_buf, true); + if (ret) + goto out; + + ret = gcm_crypt(con, false, sgt.sgl, sgt.sgl, + tail_onwire_len(con->in_msg, true)); + +out: + sg_free_table(&sgt); + return ret; +} + +static int prepare_banner(struct ceph_connection *con) +{ + int buf_len = CEPH_BANNER_V2_LEN + 2 + 8 + 8; + void *buf, *p; + + buf = alloc_conn_buf(con, buf_len); + if (!buf) + return -ENOMEM; + + p = buf; + ceph_encode_copy(&p, CEPH_BANNER_V2, 
CEPH_BANNER_V2_LEN); + ceph_encode_16(&p, sizeof(u64) + sizeof(u64)); + ceph_encode_64(&p, CEPH_MSGR2_SUPPORTED_FEATURES); + ceph_encode_64(&p, CEPH_MSGR2_REQUIRED_FEATURES); + WARN_ON(p != buf + buf_len); + + add_out_kvec(con, buf, buf_len); + add_out_sign_kvec(con, buf, buf_len); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +/* + * base: + * preamble + * control body (ctrl_len bytes) + * space for control crc + * + * extdata (optional): + * control body (extdata_len bytes) + * + * Compute control crc and gather base and extdata into: + * + * preamble + * control body (ctrl_len + extdata_len bytes) + * control crc + * + * Preamble should already be encoded at the start of base. + */ +static void prepare_head_plain(struct ceph_connection *con, void *base, + int ctrl_len, void *extdata, int extdata_len, + bool to_be_signed) +{ + int base_len = CEPH_PREAMBLE_LEN + ctrl_len + CEPH_CRC_LEN; + void *crcp = base + base_len - CEPH_CRC_LEN; + u32 crc; + + crc = crc32c(-1, CTRL_BODY(base), ctrl_len); + if (extdata_len) + crc = crc32c(crc, extdata, extdata_len); + put_unaligned_le32(crc, crcp); + + if (!extdata_len) { + add_out_kvec(con, base, base_len); + if (to_be_signed) + add_out_sign_kvec(con, base, base_len); + return; + } + + add_out_kvec(con, base, crcp - base); + add_out_kvec(con, extdata, extdata_len); + add_out_kvec(con, crcp, CEPH_CRC_LEN); + if (to_be_signed) { + add_out_sign_kvec(con, base, crcp - base); + add_out_sign_kvec(con, extdata, extdata_len); + add_out_sign_kvec(con, crcp, CEPH_CRC_LEN); + } +} + +static int prepare_head_secure_small(struct ceph_connection *con, + void *base, int ctrl_len) +{ + struct scatterlist sg; + int ret; + + /* inline buffer padding? 
*/ + if (ctrl_len < CEPH_PREAMBLE_INLINE_LEN) + memset(CTRL_BODY(base) + ctrl_len, 0, + CEPH_PREAMBLE_INLINE_LEN - ctrl_len); + + sg_init_one(&sg, base, CEPH_PREAMBLE_SECURE_LEN); + ret = gcm_crypt(con, true, &sg, &sg, + CEPH_PREAMBLE_SECURE_LEN - CEPH_GCM_TAG_LEN); + if (ret) + return ret; + + add_out_kvec(con, base, CEPH_PREAMBLE_SECURE_LEN); + return 0; +} + +/* + * base: + * preamble + * control body (ctrl_len bytes) + * space for padding, if needed + * space for control remainder auth tag + * space for preamble auth tag + * + * Encrypt preamble and the inline portion, then encrypt the remainder + * and gather into: + * + * preamble + * control body (48 bytes) + * preamble auth tag + * control body (ctrl_len - 48 bytes) + * zero padding, if needed + * control remainder auth tag + * + * Preamble should already be encoded at the start of base. + */ +static int prepare_head_secure_big(struct ceph_connection *con, + void *base, int ctrl_len) +{ + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + void *rem = CTRL_BODY(base) + CEPH_PREAMBLE_INLINE_LEN; + void *rem_tag = rem + padded_len(rem_len); + void *pmbl_tag = rem_tag + CEPH_GCM_TAG_LEN; + struct scatterlist sgs[2]; + int ret; + + sg_init_table(sgs, 2); + sg_set_buf(&sgs[0], base, rem - base); + sg_set_buf(&sgs[1], pmbl_tag, CEPH_GCM_TAG_LEN); + ret = gcm_crypt(con, true, sgs, sgs, rem - base); + if (ret) + return ret; + + /* control remainder padding? 
*/ + if (need_padding(rem_len)) + memset(rem + rem_len, 0, padding_len(rem_len)); + + sg_init_one(&sgs[0], rem, pmbl_tag - rem); + ret = gcm_crypt(con, true, sgs, sgs, rem_tag - rem); + if (ret) + return ret; + + add_out_kvec(con, base, rem - base); + add_out_kvec(con, pmbl_tag, CEPH_GCM_TAG_LEN); + add_out_kvec(con, rem, pmbl_tag - rem); + return 0; +} + +static int __prepare_control(struct ceph_connection *con, int tag, + void *base, int ctrl_len, void *extdata, + int extdata_len, bool to_be_signed) +{ + int total_len = ctrl_len + extdata_len; + struct ceph_frame_desc desc; + int ret; + + dout("%s con %p tag %d len %d (%d+%d)\n", __func__, con, tag, + total_len, ctrl_len, extdata_len); + + /* extdata may be vmalloc'ed but not base */ + if (WARN_ON(is_vmalloc_addr(base) || !ctrl_len)) + return -EINVAL; + + init_frame_desc(&desc, tag, &total_len, 1); + encode_preamble(&desc, base); + + if (con_secure(con)) { + if (WARN_ON(extdata_len || to_be_signed)) + return -EINVAL; + + if (ctrl_len <= CEPH_PREAMBLE_INLINE_LEN) + /* fully inlined, inline buffer may need padding */ + ret = prepare_head_secure_small(con, base, ctrl_len); + else + /* partially inlined, inline buffer is full */ + ret = prepare_head_secure_big(con, base, ctrl_len); + if (ret) + return ret; + } else { + prepare_head_plain(con, base, ctrl_len, extdata, extdata_len, + to_be_signed); + } + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +static int prepare_control(struct ceph_connection *con, int tag, + void *base, int ctrl_len) +{ + return __prepare_control(con, tag, base, ctrl_len, NULL, 0, false); +} + +static int prepare_hello(struct ceph_connection *con) +{ + void *buf, *p; + int ctrl_len; + + ctrl_len = 1 + ceph_entity_addr_encoding_len(&con->peer_addr); + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, CEPH_ENTITY_TYPE_CLIENT); + ceph_encode_entity_addr(&p, &con->peer_addr); + WARN_ON(p != 
CTRL_BODY(buf) + ctrl_len); + + return __prepare_control(con, FRAME_TAG_HELLO, buf, ctrl_len, + NULL, 0, true); +} + +/* so that head_onwire_len(AUTH_BUF_LEN, false) is 512 */ +#define AUTH_BUF_LEN (512 - CEPH_CRC_LEN - CEPH_PREAMBLE_PLAIN_LEN) + +static int prepare_auth_request(struct ceph_connection *con) +{ + void *authorizer, *authorizer_copy; + int ctrl_len, authorizer_len; + void *buf; + int ret; + + ctrl_len = AUTH_BUF_LEN; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + + mutex_unlock(&con->mutex); + ret = con->ops->get_auth_request(con, CTRL_BODY(buf), &ctrl_len, + &authorizer, &authorizer_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_HELLO) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p get_auth_request ret %d\n", __func__, con, ret); + if (ret) + return ret; + + authorizer_copy = alloc_conn_buf(con, authorizer_len); + if (!authorizer_copy) + return -ENOMEM; + + memcpy(authorizer_copy, authorizer, authorizer_len); + + return __prepare_control(con, FRAME_TAG_AUTH_REQUEST, buf, ctrl_len, + authorizer_copy, authorizer_len, true); +} + +static int prepare_auth_request_more(struct ceph_connection *con, + void *reply, int reply_len) +{ + int ctrl_len, authorizer_len; + void *authorizer; + void *buf; + int ret; + + ctrl_len = AUTH_BUF_LEN; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + + mutex_unlock(&con->mutex); + ret = con->ops->handle_auth_reply_more(con, reply, reply_len, + CTRL_BODY(buf), &ctrl_len, + &authorizer, &authorizer_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p handle_auth_reply_more ret %d\n", __func__, con, ret); + if (ret) + return ret; + + return __prepare_control(con, FRAME_TAG_AUTH_REQUEST_MORE, buf, + ctrl_len, 
authorizer, authorizer_len, true); +} + +static int prepare_auth_signature(struct ceph_connection *con) +{ + void *buf; + int ret; + + buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE, + con_secure(con))); + if (!buf) + return -ENOMEM; + + ret = hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt, + CTRL_BODY(buf)); + if (ret) + return ret; + + return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf, + SHA256_DIGEST_SIZE); +} + +static int prepare_client_ident(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + struct ceph_client *client = from_msgr(con->msgr); + u64 global_id = ceph_client_gid(client); + void *buf, *p; + int ctrl_len; + + WARN_ON(con->v2.server_cookie); + WARN_ON(con->v2.connect_seq); + WARN_ON(con->v2.peer_global_seq); + + if (!con->v2.client_cookie) { + do { + get_random_bytes(&con->v2.client_cookie, + sizeof(con->v2.client_cookie)); + } while (!con->v2.client_cookie); + dout("%s con %p generated cookie 0x%llx\n", __func__, con, + con->v2.client_cookie); + } else { + dout("%s con %p cookie already set 0x%llx\n", __func__, con, + con->v2.client_cookie); + } + + dout("%s con %p my_addr %s/%u peer_addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx cookie 0x%llx\n", + __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce), + ceph_pr_addr(&con->peer_addr), le32_to_cpu(con->peer_addr.nonce), + global_id, con->v2.global_seq, client->supported_features, + client->required_features, con->v2.client_cookie); + + ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + + ceph_entity_addr_encoding_len(&con->peer_addr) + 6 * 8; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con))); + if (!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, 2); /* addrvec marker */ + ceph_encode_32(&p, 1); /* addr_cnt */ + ceph_encode_entity_addr(&p, my_addr); + ceph_encode_entity_addr(&p, &con->peer_addr); + ceph_encode_64(&p, 
global_id); + ceph_encode_64(&p, con->v2.global_seq); + ceph_encode_64(&p, client->supported_features); + ceph_encode_64(&p, client->required_features); + ceph_encode_64(&p, 0); /* flags */ + ceph_encode_64(&p, con->v2.client_cookie); + WARN_ON(p != CTRL_BODY(buf) + ctrl_len); + + return prepare_control(con, FRAME_TAG_CLIENT_IDENT, buf, ctrl_len); +} + +static int prepare_session_reconnect(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + void *buf, *p; + int ctrl_len; + + WARN_ON(!con->v2.client_cookie); + WARN_ON(!con->v2.server_cookie); + WARN_ON(!con->v2.connect_seq); + WARN_ON(!con->v2.peer_global_seq); + + dout("%s con %p my_addr %s/%u client_cookie 0x%llx server_cookie 0x%llx global_seq %llu connect_seq %llu in_seq %llu\n", + __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce), + con->v2.client_cookie, con->v2.server_cookie, con->v2.global_seq, + con->v2.connect_seq, con->in_seq); + + ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + 5 * 8; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con))); + if (!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, 2); /* entity_addrvec_t marker */ + ceph_encode_32(&p, 1); /* my_addrs len */ + ceph_encode_entity_addr(&p, my_addr); + ceph_encode_64(&p, con->v2.client_cookie); + ceph_encode_64(&p, con->v2.server_cookie); + ceph_encode_64(&p, con->v2.global_seq); + ceph_encode_64(&p, con->v2.connect_seq); + ceph_encode_64(&p, con->in_seq); + WARN_ON(p != CTRL_BODY(buf) + ctrl_len); + + return prepare_control(con, FRAME_TAG_SESSION_RECONNECT, buf, ctrl_len); +} + +static int prepare_keepalive2(struct ceph_connection *con) +{ + struct ceph_timespec *ts = CTRL_BODY(con->v2.out_buf); + struct timespec64 now; + + ktime_get_real_ts64(&now); + dout("%s con %p timestamp %lld.%09ld\n", __func__, con, now.tv_sec, + now.tv_nsec); + + ceph_encode_timespec64(ts, &now); + + reset_out_kvecs(con); + return prepare_control(con, 
FRAME_TAG_KEEPALIVE2, con->v2.out_buf, + sizeof(struct ceph_timespec)); +} + +static int prepare_ack(struct ceph_connection *con) +{ + void *p; + + dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + p = CTRL_BODY(con->v2.out_buf); + ceph_encode_64(&p, con->in_seq_acked); + + reset_out_kvecs(con); + return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8); +} + +static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted) +{ + dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con, + con->out_msg, aborted, con->v2.out_epil.front_crc, + con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc); + + encode_epilogue_plain(con, aborted); + add_out_kvec(con, &con->v2.out_epil, CEPH_EPILOGUE_PLAIN_LEN); +} + +/* + * For "used" empty segments, crc is -1. For unused (trailing) + * segments, crc is 0. + */ +static void prepare_message_plain(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + prepare_head_plain(con, con->v2.out_buf, + sizeof(struct ceph_msg_header2), NULL, 0, false); + + if (!front_len(msg) && !middle_len(msg)) { + if (!data_len(msg)) { + /* + * Empty message: once the head is written, + * we are done -- there is no epilogue. + */ + con->v2.out_state = OUT_S_FINISH_MESSAGE; + return; + } + + con->v2.out_epil.front_crc = -1; + con->v2.out_epil.middle_crc = -1; + con->v2.out_state = OUT_S_QUEUE_DATA; + return; + } + + if (front_len(msg)) { + con->v2.out_epil.front_crc = crc32c(-1, msg->front.iov_base, + front_len(msg)); + add_out_kvec(con, msg->front.iov_base, front_len(msg)); + } else { + /* middle (at least) is there, checked above */ + con->v2.out_epil.front_crc = -1; + } + + if (middle_len(msg)) { + con->v2.out_epil.middle_crc = + crc32c(-1, msg->middle->vec.iov_base, middle_len(msg)); + add_out_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); + } else { + con->v2.out_epil.middle_crc = data_len(msg) ? 
-1 : 0; + } + + if (data_len(msg)) { + con->v2.out_state = OUT_S_QUEUE_DATA; + } else { + con->v2.out_epil.data_crc = 0; + prepare_epilogue_plain(con, false); + con->v2.out_state = OUT_S_FINISH_MESSAGE; + } +} + +/* + * Unfortunately the kernel crypto API doesn't support streaming + * (piecewise) operation for AEAD algorithms, so we can't get away + * with a fixed size buffer and a couple sgs. Instead, we have to + * allocate pages for the entire tail of the message (currently up + * to ~32M) and two sgs arrays (up to ~256K each)... + */ +static int prepare_message_secure(struct ceph_connection *con) +{ + void *zerop = page_address(ceph_zero_page); + struct sg_table enc_sgt = {}; + struct sg_table sgt = {}; + struct page **enc_pages; + int enc_page_cnt; + int tail_len; + int ret; + + ret = prepare_head_secure_small(con, con->v2.out_buf, + sizeof(struct ceph_msg_header2)); + if (ret) + return ret; + + tail_len = tail_onwire_len(con->out_msg, true); + if (!tail_len) { + /* + * Empty message: once the head is written, + * we are done -- there is no epilogue. 
+ */ + con->v2.out_state = OUT_S_FINISH_MESSAGE; + return 0; + } + + encode_epilogue_secure(con, false); + ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop, + &con->v2.out_epil, false); + if (ret) + goto out; + + enc_page_cnt = calc_pages_for(0, tail_len); + enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO); + if (IS_ERR(enc_pages)) { + ret = PTR_ERR(enc_pages); + goto out; + } + + WARN_ON(con->v2.out_enc_pages || con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = enc_pages; + con->v2.out_enc_page_cnt = enc_page_cnt; + con->v2.out_enc_resid = tail_len; + con->v2.out_enc_i = 0; + + ret = sg_alloc_table_from_pages(&enc_sgt, enc_pages, enc_page_cnt, + 0, tail_len, GFP_NOIO); + if (ret) + goto out; + + ret = gcm_crypt(con, true, sgt.sgl, enc_sgt.sgl, + tail_len - CEPH_GCM_TAG_LEN); + if (ret) + goto out; + + dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con, + con->out_msg, sgt.orig_nents, enc_page_cnt); + con->v2.out_state = OUT_S_QUEUE_ENC_PAGE; + +out: + sg_free_table(&sgt); + sg_free_table(&enc_sgt); + return ret; +} + +static int prepare_message(struct ceph_connection *con) +{ + int lens[] = { + sizeof(struct ceph_msg_header2), + front_len(con->out_msg), + middle_len(con->out_msg), + data_len(con->out_msg) + }; + struct ceph_frame_desc desc; + int ret; + + dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con, + con->out_msg, lens[0], lens[1], lens[2], lens[3]); + + if (con->in_seq > con->in_seq_acked) { + dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + } + + reset_out_kvecs(con); + init_frame_desc(&desc, FRAME_TAG_MESSAGE, lens, 4); + encode_preamble(&desc, con->v2.out_buf); + fill_header2(CTRL_BODY(con->v2.out_buf), &con->out_msg->hdr, + con->in_seq_acked); + + if (con_secure(con)) { + ret = prepare_message_secure(con); + if (ret) + return ret; + } else { + prepare_message_plain(con); + } + + ceph_con_flag_set(con, 
CEPH_CON_F_WRITE_PENDING); + return 0; +} + +static int prepare_read_banner_prefix(struct ceph_connection *con) +{ + void *buf; + + buf = alloc_conn_buf(con, CEPH_BANNER_V2_PREFIX_LEN); + if (!buf) + return -ENOMEM; + + reset_in_kvecs(con); + add_in_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN); + add_in_sign_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN); + con->state = CEPH_CON_S_V2_BANNER_PREFIX; + return 0; +} + +static int prepare_read_banner_payload(struct ceph_connection *con, + int payload_len) +{ + void *buf; + + buf = alloc_conn_buf(con, payload_len); + if (!buf) + return -ENOMEM; + + reset_in_kvecs(con); + add_in_kvec(con, buf, payload_len); + add_in_sign_kvec(con, buf, payload_len); + con->state = CEPH_CON_S_V2_BANNER_PAYLOAD; + return 0; +} + +static void prepare_read_preamble(struct ceph_connection *con) +{ + reset_in_kvecs(con); + add_in_kvec(con, con->v2.in_buf, + con_secure(con) ? CEPH_PREAMBLE_SECURE_LEN : + CEPH_PREAMBLE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_PREAMBLE; +} + +static int prepare_read_control(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int head_len; + void *buf; + + reset_in_kvecs(con); + if (con->state == CEPH_CON_S_V2_HELLO || + con->state == CEPH_CON_S_V2_AUTH) { + head_len = head_onwire_len(ctrl_len, false); + buf = alloc_conn_buf(con, head_len); + if (!buf) + return -ENOMEM; + + /* preserve preamble */ + memcpy(buf, con->v2.in_buf, CEPH_PREAMBLE_LEN); + + add_in_kvec(con, CTRL_BODY(buf), ctrl_len); + add_in_kvec(con, CTRL_BODY(buf) + ctrl_len, CEPH_CRC_LEN); + add_in_sign_kvec(con, buf, head_len); + } else { + if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) { + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + add_in_kvec(con, buf, ctrl_len); + } else { + add_in_kvec(con, CTRL_BODY(con->v2.in_buf), ctrl_len); + } + add_in_kvec(con, con->v2.in_buf, CEPH_CRC_LEN); + } + con->v2.in_state = IN_S_HANDLE_CONTROL; + return 0; +} + +static int prepare_read_control_remainder(struct 
ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + void *buf; + + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + memcpy(buf, CTRL_BODY(con->v2.in_buf), CEPH_PREAMBLE_INLINE_LEN); + + reset_in_kvecs(con); + add_in_kvec(con, buf + CEPH_PREAMBLE_INLINE_LEN, rem_len); + add_in_kvec(con, con->v2.in_buf, + padding_len(rem_len) + CEPH_GCM_TAG_LEN); + con->v2.in_state = IN_S_HANDLE_CONTROL_REMAINDER; + return 0; +} + +static void prepare_read_data(struct ceph_connection *con) +{ + struct bio_vec bv; + + if (!con_secure(con)) + con->in_data_crc = -1; + ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg, + data_len(con->in_msg)); + + get_bvec_at(&con->v2.in_cursor, &bv); + set_in_bvec(con, &bv); + con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT; +} + +static void prepare_read_data_cont(struct ceph_connection *con) +{ + struct bio_vec bv; + + if (!con_secure(con)) + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, + con->v2.in_bvec.bv_page, + con->v2.in_bvec.bv_offset, + con->v2.in_bvec.bv_len); + + ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len); + if (con->v2.in_cursor.total_resid) { + get_bvec_at(&con->v2.in_cursor, &bv); + set_in_bvec(con, &bv); + WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT); + return; + } + + /* + * We've read all data. Prepare to read data padding (if any) + * and epilogue. 
+ */ + reset_in_kvecs(con); + if (con_secure(con)) { + if (need_padding(data_len(con->in_msg))) + add_in_kvec(con, DATA_PAD(con->v2.in_buf), + padding_len(data_len(con->in_msg))); + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_SECURE_LEN); + } else { + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); + } + con->v2.in_state = IN_S_HANDLE_EPILOGUE; +} + +static void __finish_skip(struct ceph_connection *con) +{ + con->in_seq++; + prepare_read_preamble(con); +} + +static void prepare_skip_message(struct ceph_connection *con) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + int tail_len; + + dout("%s con %p %d+%d+%d\n", __func__, con, desc->fd_lens[1], + desc->fd_lens[2], desc->fd_lens[3]); + + tail_len = __tail_onwire_len(desc->fd_lens[1], desc->fd_lens[2], + desc->fd_lens[3], con_secure(con)); + if (!tail_len) { + __finish_skip(con); + } else { + set_in_skip(con, tail_len); + con->v2.in_state = IN_S_FINISH_SKIP; + } +} + +static int process_banner_prefix(struct ceph_connection *con) +{ + int payload_len; + void *p; + + WARN_ON(con->v2.in_kvecs[0].iov_len != CEPH_BANNER_V2_PREFIX_LEN); + + p = con->v2.in_kvecs[0].iov_base; + if (memcmp(p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN)) { + if (!memcmp(p, CEPH_BANNER, CEPH_BANNER_LEN)) + con->error_msg = "server is speaking msgr1 protocol"; + else + con->error_msg = "protocol error, bad banner"; + return -EINVAL; + } + + p += CEPH_BANNER_V2_LEN; + payload_len = ceph_decode_16(&p); + dout("%s con %p payload_len %d\n", __func__, con, payload_len); + + return prepare_read_banner_payload(con, payload_len); +} + +static int process_banner_payload(struct ceph_connection *con) +{ + void *end = con->v2.in_kvecs[0].iov_base + con->v2.in_kvecs[0].iov_len; + u64 feat = CEPH_MSGR2_SUPPORTED_FEATURES; + u64 req_feat = CEPH_MSGR2_REQUIRED_FEATURES; + u64 server_feat, server_req_feat; + void *p; + int ret; + + p = con->v2.in_kvecs[0].iov_base; + ceph_decode_64_safe(&p, end, server_feat, bad); + ceph_decode_64_safe(&p, end, 
server_req_feat, bad); + + dout("%s con %p server_feat 0x%llx server_req_feat 0x%llx\n", + __func__, con, server_feat, server_req_feat); + + if (req_feat & ~server_feat) { + pr_err("msgr2 feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n", + server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + if (server_req_feat & ~feat) { + pr_err("msgr2 feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n", + feat, server_req_feat & ~feat); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + + /* no reset_out_kvecs() as our banner may still be pending */ + ret = prepare_hello(con); + if (ret) { + pr_err("prepare_hello failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_HELLO; + prepare_read_preamble(con); + return 0; + +bad: + pr_err("failed to decode banner payload\n"); + return -EINVAL; +} + +static int process_hello(struct ceph_connection *con, void *p, void *end) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + struct ceph_entity_addr addr_for_me; + u8 entity_type; + int ret; + + if (con->state != CEPH_CON_S_V2_HELLO) { + con->error_msg = "protocol error, unexpected hello"; + return -EINVAL; + } + + ceph_decode_8_safe(&p, end, entity_type, bad); + ret = ceph_decode_entity_addr(&p, end, &addr_for_me); + if (ret) { + pr_err("failed to decode addr_for_me: %d\n", ret); + return ret; + } + + dout("%s con %p entity_type %d addr_for_me %s\n", __func__, con, + entity_type, ceph_pr_addr(&addr_for_me)); + + if (entity_type != con->peer_name.type) { + pr_err("bad peer type, want %d, got %d\n", + con->peer_name.type, entity_type); + con->error_msg = "wrong peer at address"; + return -EINVAL; + } + + /* + * Set our address to the address our first peer (i.e. monitor) + * sees that we are connecting from. 
If we are behind some sort + * of NAT and want to be identified by some private (not NATed) + * address, ip option should be used. + */ + if (ceph_addr_is_blank(my_addr)) { + memcpy(&my_addr->in_addr, &addr_for_me.in_addr, + sizeof(my_addr->in_addr)); + ceph_addr_set_port(my_addr, 0); + dout("%s con %p set my addr %s, as seen by peer %s\n", + __func__, con, ceph_pr_addr(my_addr), + ceph_pr_addr(&con->peer_addr)); + } else { + dout("%s con %p my addr already set %s\n", + __func__, con, ceph_pr_addr(my_addr)); + } + + WARN_ON(ceph_addr_is_blank(my_addr) || ceph_addr_port(my_addr)); + WARN_ON(my_addr->type != CEPH_ENTITY_ADDR_TYPE_ANY); + WARN_ON(!my_addr->nonce); + + /* no reset_out_kvecs() as our hello may still be pending */ + ret = prepare_auth_request(con); + if (ret) { + if (ret != -EAGAIN) + pr_err("prepare_auth_request failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_AUTH; + return 0; + +bad: + pr_err("failed to decode hello\n"); + return -EINVAL; +} + +static int process_auth_bad_method(struct ceph_connection *con, + void *p, void *end) +{ + int allowed_protos[8], allowed_modes[8]; + int allowed_proto_cnt, allowed_mode_cnt; + int used_proto, result; + int ret; + int i; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_bad_method"; + return -EINVAL; + } + + ceph_decode_32_safe(&p, end, used_proto, bad); + ceph_decode_32_safe(&p, end, result, bad); + dout("%s con %p used_proto %d result %d\n", __func__, con, used_proto, + result); + + ceph_decode_32_safe(&p, end, allowed_proto_cnt, bad); + if (allowed_proto_cnt > ARRAY_SIZE(allowed_protos)) { + pr_err("allowed_protos too big %d\n", allowed_proto_cnt); + return -EINVAL; + } + for (i = 0; i < allowed_proto_cnt; i++) { + ceph_decode_32_safe(&p, end, allowed_protos[i], bad); + dout("%s con %p allowed_protos[%d] %d\n", __func__, con, + i, allowed_protos[i]); + } + + ceph_decode_32_safe(&p, end, allowed_mode_cnt, bad); + if (allowed_mode_cnt > 
ARRAY_SIZE(allowed_modes)) { + pr_err("allowed_modes too big %d\n", allowed_mode_cnt); + return -EINVAL; + } + for (i = 0; i < allowed_mode_cnt; i++) { + ceph_decode_32_safe(&p, end, allowed_modes[i], bad); + dout("%s con %p allowed_modes[%d] %d\n", __func__, con, + i, allowed_modes[i]); + } + + mutex_unlock(&con->mutex); + ret = con->ops->handle_auth_bad_method(con, used_proto, result, + allowed_protos, + allowed_proto_cnt, + allowed_modes, + allowed_mode_cnt); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p handle_auth_bad_method ret %d\n", __func__, con, ret); + return ret; + +bad: + pr_err("failed to decode auth_bad_method\n"); + return -EINVAL; +} + +static int process_auth_reply_more(struct ceph_connection *con, + void *p, void *end) +{ + int payload_len; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_reply_more"; + return -EINVAL; + } + + ceph_decode_32_safe(&p, end, payload_len, bad); + ceph_decode_need(&p, end, payload_len, bad); + + dout("%s con %p payload_len %d\n", __func__, con, payload_len); + + reset_out_kvecs(con); + ret = prepare_auth_request_more(con, p, payload_len); + if (ret) { + if (ret != -EAGAIN) + pr_err("prepare_auth_request_more failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + pr_err("failed to decode auth_reply_more\n"); + return -EINVAL; +} + +/* + * Align session_key and con_secret to avoid GFP_ATOMIC allocation + * inside crypto_shash_setkey() and crypto_aead_setkey() called from + * setup_crypto(). __aligned(16) isn't guaranteed to work for stack + * objects, so do it by hand. 
+ */ +static int process_auth_done(struct ceph_connection *con, void *p, void *end) +{ + u8 session_key_buf[CEPH_KEY_LEN + 16]; + u8 con_secret_buf[CEPH_MAX_CON_SECRET_LEN + 16]; + u8 *session_key = PTR_ALIGN(&session_key_buf[0], 16); + u8 *con_secret = PTR_ALIGN(&con_secret_buf[0], 16); + int session_key_len, con_secret_len; + int payload_len; + u64 global_id; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_done"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, global_id, bad); + ceph_decode_32_safe(&p, end, con->v2.con_mode, bad); + ceph_decode_32_safe(&p, end, payload_len, bad); + + dout("%s con %p global_id %llu con_mode %d payload_len %d\n", + __func__, con, global_id, con->v2.con_mode, payload_len); + + mutex_unlock(&con->mutex); + session_key_len = 0; + con_secret_len = 0; + ret = con->ops->handle_auth_done(con, global_id, p, payload_len, + session_key, &session_key_len, + con_secret, &con_secret_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + ret = -EAGAIN; + goto out; + } + + dout("%s con %p handle_auth_done ret %d\n", __func__, con, ret); + if (ret) + goto out; + + ret = setup_crypto(con, session_key, session_key_len, con_secret, + con_secret_len); + if (ret) + goto out; + + reset_out_kvecs(con); + ret = prepare_auth_signature(con); + if (ret) { + pr_err("prepare_auth_signature failed: %d\n", ret); + goto out; + } + + con->state = CEPH_CON_S_V2_AUTH_SIGNATURE; + +out: + memzero_explicit(session_key_buf, sizeof(session_key_buf)); + memzero_explicit(con_secret_buf, sizeof(con_secret_buf)); + return ret; + +bad: + pr_err("failed to decode auth_done\n"); + return -EINVAL; +} + +static int process_auth_signature(struct ceph_connection *con, + void *p, void *end) +{ + u8 hmac[SHA256_DIGEST_SIZE]; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH_SIGNATURE) { + con->error_msg = "protocol error, 
unexpected auth_signature"; + return -EINVAL; + } + + ret = hmac_sha256(con, con->v2.out_sign_kvecs, + con->v2.out_sign_kvec_cnt, hmac); + if (ret) + return ret; + + ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad); + if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) { + con->error_msg = "integrity error, bad auth signature"; + return -EBADMSG; + } + + dout("%s con %p auth signature ok\n", __func__, con); + + /* no reset_out_kvecs() as our auth_signature may still be pending */ + if (!con->v2.server_cookie) { + ret = prepare_client_ident(con); + if (ret) { + pr_err("prepare_client_ident failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_CONNECT; + } else { + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_RECONNECT; + } + + return 0; + +bad: + pr_err("failed to decode auth_signature\n"); + return -EINVAL; +} + +static int process_server_ident(struct ceph_connection *con, + void *p, void *end) +{ + struct ceph_client *client = from_msgr(con->msgr); + u64 features, required_features; + struct ceph_entity_addr addr; + u64 global_seq; + u64 global_id; + u64 cookie; + u64 flags; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) { + con->error_msg = "protocol error, unexpected server_ident"; + return -EINVAL; + } + + ret = ceph_decode_entity_addrvec(&p, end, true, &addr); + if (ret) { + pr_err("failed to decode server addrs: %d\n", ret); + return ret; + } + + ceph_decode_64_safe(&p, end, global_id, bad); + ceph_decode_64_safe(&p, end, global_seq, bad); + ceph_decode_64_safe(&p, end, features, bad); + ceph_decode_64_safe(&p, end, required_features, bad); + ceph_decode_64_safe(&p, end, flags, bad); + ceph_decode_64_safe(&p, end, cookie, bad); + + dout("%s con %p addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx flags 0x%llx cookie 0x%llx\n", + __func__, con, ceph_pr_addr(&addr), 
le32_to_cpu(addr.nonce), + global_id, global_seq, features, required_features, flags, cookie); + + /* is this who we intended to talk to? */ + if (memcmp(&addr, &con->peer_addr, sizeof(con->peer_addr))) { + pr_err("bad peer addr/nonce, want %s/%u, got %s/%u\n", + ceph_pr_addr(&con->peer_addr), + le32_to_cpu(con->peer_addr.nonce), + ceph_pr_addr(&addr), le32_to_cpu(addr.nonce)); + con->error_msg = "wrong peer at address"; + return -EINVAL; + } + + if (client->required_features & ~features) { + pr_err("RADOS feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n", + features, client->required_features & ~features); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + + /* + * Both name->type and name->num are set in ceph_con_open() but + * name->num may be bogus in the initial monmap. name->type is + * verified in handle_hello(). + */ + WARN_ON(!con->peer_name.type); + con->peer_name.num = cpu_to_le64(global_id); + con->v2.peer_global_seq = global_seq; + con->peer_features = features; + WARN_ON(required_features & ~client->supported_features); + con->v2.server_cookie = cookie; + + if (flags & CEPH_MSG_CONNECT_LOSSY) { + ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); + WARN_ON(con->v2.server_cookie); + } else { + WARN_ON(!con->v2.server_cookie); + } + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + con->delay = 0; /* reset backoff memory */ + + con->state = CEPH_CON_S_OPEN; + con->v2.out_state = OUT_S_GET_NEXT; + return 0; + +bad: + pr_err("failed to decode server_ident\n"); + return -EINVAL; +} + +static int process_ident_missing_features(struct ceph_connection *con, + void *p, void *end) +{ + struct ceph_client *client = from_msgr(con->msgr); + u64 missing_features; + + if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) { + con->error_msg = "protocol error, unexpected ident_missing_features"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, missing_features, bad); + pr_err("RADOS 
feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n", + client->supported_features, missing_features); + con->error_msg = "missing required protocol features"; + return -EINVAL; + +bad: + pr_err("failed to decode ident_missing_features\n"); + return -EINVAL; +} + +static int process_session_reconnect_ok(struct ceph_connection *con, + void *p, void *end) +{ + u64 seq; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_reconnect_ok"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, seq, bad); + + dout("%s con %p seq %llu\n", __func__, con, seq); + ceph_con_discard_requeued(con, seq); + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + con->delay = 0; /* reset backoff memory */ + + con->state = CEPH_CON_S_OPEN; + con->v2.out_state = OUT_S_GET_NEXT; + return 0; + +bad: + pr_err("failed to decode session_reconnect_ok\n"); + return -EINVAL; +} + +static int process_session_retry(struct ceph_connection *con, + void *p, void *end) +{ + u64 connect_seq; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_retry"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, connect_seq, bad); + + dout("%s con %p connect_seq %llu\n", __func__, con, connect_seq); + WARN_ON(connect_seq <= con->v2.connect_seq); + con->v2.connect_seq = connect_seq + 1; + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect (cseq) failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + pr_err("failed to decode session_retry\n"); + return -EINVAL; +} + +static int process_session_retry_global(struct ceph_connection *con, + void *p, void *end) +{ + u64 global_seq; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_retry_global"; + return -EINVAL; + } 
+ + ceph_decode_64_safe(&p, end, global_seq, bad); + + dout("%s con %p global_seq %llu\n", __func__, con, global_seq); + WARN_ON(global_seq <= con->v2.global_seq); + con->v2.global_seq = ceph_get_global_seq(con->msgr, global_seq); + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect (gseq) failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + pr_err("failed to decode session_retry_global\n"); + return -EINVAL; +} + +static int process_session_reset(struct ceph_connection *con, + void *p, void *end) +{ + bool full; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_reset"; + return -EINVAL; + } + + ceph_decode_8_safe(&p, end, full, bad); + if (!full) { + con->error_msg = "protocol error, bad session_reset"; + return -EINVAL; + } + + pr_info("%s%lld %s session reset\n", ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr)); + ceph_con_reset_session(con); + + mutex_unlock(&con->mutex); + if (con->ops->peer_reset) + con->ops->peer_reset(con); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_client_ident(con); + if (ret) { + pr_err("prepare_client_ident (rst) failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_CONNECT; + return 0; + +bad: + pr_err("failed to decode session_reset\n"); + return -EINVAL; +} + +static int process_keepalive2_ack(struct ceph_connection *con, + void *p, void *end) +{ + if (con->state != CEPH_CON_S_OPEN) { + con->error_msg = "protocol error, unexpected keepalive2_ack"; + return -EINVAL; + } + + ceph_decode_need(&p, end, sizeof(struct ceph_timespec), bad); + ceph_decode_timespec64(&con->last_keepalive_ack, p); + + dout("%s con %p timestamp %lld.%09ld\n", 
__func__, con, + con->last_keepalive_ack.tv_sec, con->last_keepalive_ack.tv_nsec); + + return 0; + +bad: + pr_err("failed to decode keepalive2_ack\n"); + return -EINVAL; +} + +static int process_ack(struct ceph_connection *con, void *p, void *end) +{ + u64 seq; + + if (con->state != CEPH_CON_S_OPEN) { + con->error_msg = "protocol error, unexpected ack"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, seq, bad); + + dout("%s con %p seq %llu\n", __func__, con, seq); + ceph_con_discard_sent(con, seq); + return 0; + +bad: + pr_err("failed to decode ack\n"); + return -EINVAL; +} + +static int process_control(struct ceph_connection *con, void *p, void *end) +{ + int tag = con->v2.in_desc.fd_tag; + int ret; + + dout("%s con %p tag %d len %d\n", __func__, con, tag, (int)(end - p)); + + switch (tag) { + case FRAME_TAG_HELLO: + ret = process_hello(con, p, end); + break; + case FRAME_TAG_AUTH_BAD_METHOD: + ret = process_auth_bad_method(con, p, end); + break; + case FRAME_TAG_AUTH_REPLY_MORE: + ret = process_auth_reply_more(con, p, end); + break; + case FRAME_TAG_AUTH_DONE: + ret = process_auth_done(con, p, end); + break; + case FRAME_TAG_AUTH_SIGNATURE: + ret = process_auth_signature(con, p, end); + break; + case FRAME_TAG_SERVER_IDENT: + ret = process_server_ident(con, p, end); + break; + case FRAME_TAG_IDENT_MISSING_FEATURES: + ret = process_ident_missing_features(con, p, end); + break; + case FRAME_TAG_SESSION_RECONNECT_OK: + ret = process_session_reconnect_ok(con, p, end); + break; + case FRAME_TAG_SESSION_RETRY: + ret = process_session_retry(con, p, end); + break; + case FRAME_TAG_SESSION_RETRY_GLOBAL: + ret = process_session_retry_global(con, p, end); + break; + case FRAME_TAG_SESSION_RESET: + ret = process_session_reset(con, p, end); + break; + case FRAME_TAG_KEEPALIVE2_ACK: + ret = process_keepalive2_ack(con, p, end); + break; + case FRAME_TAG_ACK: + ret = process_ack(con, p, end); + break; + default: + pr_err("bad tag %d\n", tag); + con->error_msg = "protocol 
error, bad tag"; + return -EINVAL; + } + if (ret) { + dout("%s con %p error %d\n", __func__, con, ret); + return ret; + } + + prepare_read_preamble(con); + return 0; +} + +/* + * Return: + * 1 - con->in_msg set, read message + * 0 - skip message + * <0 - error + */ +static int process_message_header(struct ceph_connection *con, + void *p, void *end) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + struct ceph_msg_header2 *hdr2 = p; + struct ceph_msg_header hdr; + int skip; + int ret; + u64 seq; + + /* verify seq# */ + seq = le64_to_cpu(hdr2->seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("%s%lld %s skipping old message: seq %llu, expected %llu\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + seq, con->in_seq + 1); + return 0; + } + if ((s64)seq - (s64)con->in_seq > 1) { + pr_err("bad seq %llu, expected %llu\n", seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADE; + } + + ceph_con_discard_sent(con, le64_to_cpu(hdr2->ack_seq)); + + fill_header(&hdr, hdr2, desc->fd_lens[1], desc->fd_lens[2], + desc->fd_lens[3], &con->peer_name); + ret = ceph_con_in_msg_alloc(con, &hdr, &skip); + if (ret) + return ret; + + WARN_ON(!con->in_msg ^ skip); + if (skip) + return 0; + + WARN_ON(!con->in_msg); + WARN_ON(con->in_msg->con != con); + return 1; +} + +static int process_message(struct ceph_connection *con) +{ + ceph_con_process_message(con); + + /* + * We could have been closed by ceph_con_close() because + * ceph_con_process_message() temporarily drops con->mutex. 
+ */ + if (con->state != CEPH_CON_S_OPEN) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + prepare_read_preamble(con); + return 0; +} + +static int __handle_control(struct ceph_connection *con, void *p) +{ + void *end = p + con->v2.in_desc.fd_lens[0]; + struct ceph_msg *msg; + int ret; + + if (con->v2.in_desc.fd_tag != FRAME_TAG_MESSAGE) + return process_control(con, p, end); + + ret = process_message_header(con, p, end); + if (ret < 0) + return ret; + if (ret == 0) { + prepare_skip_message(con); + return 0; + } + + msg = con->in_msg; /* set in process_message_header() */ + if (!front_len(msg) && !middle_len(msg)) { + if (!data_len(msg)) + return process_message(con); + + prepare_read_data(con); + return 0; + } + + reset_in_kvecs(con); + if (front_len(msg)) { + WARN_ON(front_len(msg) > msg->front_alloc_len); + add_in_kvec(con, msg->front.iov_base, front_len(msg)); + msg->front.iov_len = front_len(msg); + + if (con_secure(con) && need_padding(front_len(msg))) + add_in_kvec(con, FRONT_PAD(con->v2.in_buf), + padding_len(front_len(msg))); + } else { + msg->front.iov_len = 0; + } + if (middle_len(msg)) { + WARN_ON(middle_len(msg) > msg->middle->alloc_len); + add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); + msg->middle->vec.iov_len = middle_len(msg); + + if (con_secure(con) && need_padding(middle_len(msg))) + add_in_kvec(con, MIDDLE_PAD(con->v2.in_buf), + padding_len(middle_len(msg))); + } else if (msg->middle) { + msg->middle->vec.iov_len = 0; + } + + if (data_len(msg)) { + con->v2.in_state = IN_S_PREPARE_READ_DATA; + } else { + add_in_kvec(con, con->v2.in_buf, + con_secure(con) ? 
CEPH_EPILOGUE_SECURE_LEN : + CEPH_EPILOGUE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_EPILOGUE; + } + return 0; +} + +static int handle_preamble(struct ceph_connection *con) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + int ret; + + if (con_secure(con)) { + ret = decrypt_preamble(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad preamble auth tag"; + return ret; + } + } + + ret = decode_preamble(con->v2.in_buf, desc); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad crc"; + else + con->error_msg = "protocol error, bad preamble"; + return ret; + } + + dout("%s con %p tag %d seg_cnt %d %d+%d+%d+%d\n", __func__, + con, desc->fd_tag, desc->fd_seg_cnt, desc->fd_lens[0], + desc->fd_lens[1], desc->fd_lens[2], desc->fd_lens[3]); + + if (!con_secure(con)) + return prepare_read_control(con); + + if (desc->fd_lens[0] > CEPH_PREAMBLE_INLINE_LEN) + return prepare_read_control_remainder(con); + + return __handle_control(con, CTRL_BODY(con->v2.in_buf)); +} + +static int handle_control(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + void *buf; + int ret; + + WARN_ON(con_secure(con)); + + ret = verify_control_crc(con); + if (ret) { + con->error_msg = "integrity error, bad crc"; + return ret; + } + + if (con->state == CEPH_CON_S_V2_AUTH) { + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + memcpy(buf, con->v2.in_kvecs[0].iov_base, ctrl_len); + return __handle_control(con, buf); + } + + return __handle_control(con, con->v2.in_kvecs[0].iov_base); +} + +static int handle_control_remainder(struct ceph_connection *con) +{ + int ret; + + WARN_ON(!con_secure(con)); + + ret = decrypt_control_remainder(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad control remainder auth tag"; + return ret; + } + + return __handle_control(con, con->v2.in_kvecs[0].iov_base - + CEPH_PREAMBLE_INLINE_LEN); +} + +static int handle_epilogue(struct 
ceph_connection *con) +{ + u32 front_crc, middle_crc, data_crc; + int ret; + + if (con_secure(con)) { + ret = decrypt_message(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad epilogue auth tag"; + return ret; + } + + /* just late_status */ + ret = decode_epilogue(con->v2.in_buf, NULL, NULL, NULL); + if (ret) { + con->error_msg = "protocol error, bad epilogue"; + return ret; + } + } else { + ret = decode_epilogue(con->v2.in_buf, &front_crc, + &middle_crc, &data_crc); + if (ret) { + con->error_msg = "protocol error, bad epilogue"; + return ret; + } + + ret = verify_epilogue_crcs(con, front_crc, middle_crc, + data_crc); + if (ret) { + con->error_msg = "integrity error, bad crc"; + return ret; + } + } + + return process_message(con); +} + +static void finish_skip(struct ceph_connection *con) +{ + dout("%s con %p\n", __func__, con); + + if (con_secure(con)) + gcm_inc_nonce(&con->v2.in_gcm_nonce); + + __finish_skip(con); +} + +static int populate_in_iter(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d in_state %d\n", __func__, con, con->state, + con->v2.in_state); + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + if (con->state == CEPH_CON_S_V2_BANNER_PREFIX) { + ret = process_banner_prefix(con); + } else if (con->state == CEPH_CON_S_V2_BANNER_PAYLOAD) { + ret = process_banner_payload(con); + } else if ((con->state >= CEPH_CON_S_V2_HELLO && + con->state <= CEPH_CON_S_V2_SESSION_RECONNECT) || + con->state == CEPH_CON_S_OPEN) { + switch (con->v2.in_state) { + case IN_S_HANDLE_PREAMBLE: + ret = handle_preamble(con); + break; + case IN_S_HANDLE_CONTROL: + ret = handle_control(con); + break; + case IN_S_HANDLE_CONTROL_REMAINDER: + ret = handle_control_remainder(con); + break; + case IN_S_PREPARE_READ_DATA: + prepare_read_data(con); + ret = 0; + break; + case IN_S_PREPARE_READ_DATA_CONT: + prepare_read_data_cont(con); + ret = 0; + break; + case IN_S_HANDLE_EPILOGUE: + ret = handle_epilogue(con); + break; + case 
IN_S_FINISH_SKIP: + finish_skip(con); + ret = 0; + break; + default: + WARN(1, "bad in_state %d", con->v2.in_state); + return -EINVAL; + } + } else { + WARN(1, "bad state %d", con->state); + return -EINVAL; + } + if (ret) { + dout("%s con %p error %d\n", __func__, con, ret); + return ret; + } + + if (WARN_ON(!iov_iter_count(&con->v2.in_iter))) + return -ENODATA; + dout("%s con %p populated %zu\n", __func__, con, + iov_iter_count(&con->v2.in_iter)); + return 1; +} + +int ceph_con_v2_try_read(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d need %zu\n", __func__, con, con->state, + iov_iter_count(&con->v2.in_iter)); + + if (con->state == CEPH_CON_S_PREOPEN) + return 0; + + /* + * We should always have something pending here. If not, + * avoid calling populate_in_iter() as if we read something + * (ceph_tcp_recv() would immediately return 1). + */ + if (WARN_ON(!iov_iter_count(&con->v2.in_iter))) + return -ENODATA; + + for (;;) { + ret = ceph_tcp_recv(con); + if (ret <= 0) + return ret; + + ret = populate_in_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "read processing error"; + return ret; + } + } +} + +static void queue_data(struct ceph_connection *con) +{ + struct bio_vec bv; + + con->v2.out_epil.data_crc = -1; + ceph_msg_data_cursor_init(&con->v2.out_cursor, con->out_msg, + data_len(con->out_msg)); + + get_bvec_at(&con->v2.out_cursor, &bv); + set_out_bvec(con, &bv, true); + con->v2.out_state = OUT_S_QUEUE_DATA_CONT; +} + +static void queue_data_cont(struct ceph_connection *con) +{ + struct bio_vec bv; + + con->v2.out_epil.data_crc = ceph_crc32c_page( + con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page, + con->v2.out_bvec.bv_offset, con->v2.out_bvec.bv_len); + + ceph_msg_data_advance(&con->v2.out_cursor, con->v2.out_bvec.bv_len); + if (con->v2.out_cursor.total_resid) { + get_bvec_at(&con->v2.out_cursor, &bv); + set_out_bvec(con, &bv, true); + WARN_ON(con->v2.out_state != 
OUT_S_QUEUE_DATA_CONT); + return; + } + + /* + * We've written all data. Queue epilogue. Once it's written, + * we are done. + */ + reset_out_kvecs(con); + prepare_epilogue_plain(con, false); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} + +static void queue_enc_page(struct ceph_connection *con) +{ + struct bio_vec bv; + + dout("%s con %p i %d resid %d\n", __func__, con, con->v2.out_enc_i, + con->v2.out_enc_resid); + WARN_ON(!con->v2.out_enc_resid); + + bv.bv_page = con->v2.out_enc_pages[con->v2.out_enc_i]; + bv.bv_offset = 0; + bv.bv_len = min(con->v2.out_enc_resid, (int)PAGE_SIZE); + + set_out_bvec(con, &bv, false); + con->v2.out_enc_i++; + con->v2.out_enc_resid -= bv.bv_len; + + if (con->v2.out_enc_resid) { + WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE); + return; + } + + /* + * We've queued the last piece of ciphertext (ending with + * epilogue) + auth tag. Once it's written, we are done. + */ + WARN_ON(con->v2.out_enc_i != con->v2.out_enc_page_cnt); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} + +static void queue_zeros(struct ceph_connection *con) +{ + dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero); + + if (con->v2.out_zero) { + set_out_bvec_zero(con); + con->v2.out_zero -= con->v2.out_bvec.bv_len; + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + /* + * We've zero-filled everything up to epilogue. Queue epilogue + * with late_status set to ABORTED and crcs adjusted for zeros. + * Once it's written, we are done patching up for the revoke. 
+ */ + reset_out_kvecs(con); + prepare_epilogue_plain(con, true); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} + +static void finish_message(struct ceph_connection *con) +{ + dout("%s con %p msg %p\n", __func__, con, con->out_msg); + + /* we end up here both plain and secure modes */ + if (con->v2.out_enc_pages) { + WARN_ON(!con->v2.out_enc_page_cnt); + ceph_release_page_vector(con->v2.out_enc_pages, + con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = NULL; + con->v2.out_enc_page_cnt = 0; + } + /* message may have been revoked */ + if (con->out_msg) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } + + con->v2.out_state = OUT_S_GET_NEXT; +} + +static int populate_out_iter(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d out_state %d\n", __func__, con, con->state, + con->v2.out_state); + WARN_ON(iov_iter_count(&con->v2.out_iter)); + + if (con->state != CEPH_CON_S_OPEN) { + WARN_ON(con->state < CEPH_CON_S_V2_BANNER_PREFIX || + con->state > CEPH_CON_S_V2_SESSION_RECONNECT); + goto nothing_pending; + } + + switch (con->v2.out_state) { + case OUT_S_QUEUE_DATA: + WARN_ON(!con->out_msg); + queue_data(con); + goto populated; + case OUT_S_QUEUE_DATA_CONT: + WARN_ON(!con->out_msg); + queue_data_cont(con); + goto populated; + case OUT_S_QUEUE_ENC_PAGE: + queue_enc_page(con); + goto populated; + case OUT_S_QUEUE_ZEROS: + WARN_ON(con->out_msg); /* revoked */ + queue_zeros(con); + goto populated; + case OUT_S_FINISH_MESSAGE: + finish_message(con); + break; + case OUT_S_GET_NEXT: + break; + default: + WARN(1, "bad out_state %d", con->v2.out_state); + return -EINVAL; + } + + WARN_ON(con->v2.out_state != OUT_S_GET_NEXT); + if (ceph_con_flag_test_and_clear(con, CEPH_CON_F_KEEPALIVE_PENDING)) { + ret = prepare_keepalive2(con); + if (ret) { + pr_err("prepare_keepalive2 failed: %d\n", ret); + return ret; + } + } else if (!list_empty(&con->out_queue)) { + ceph_con_get_out_msg(con); + ret = prepare_message(con); + if (ret) { + pr_err("prepare_message 
failed: %d\n", ret); + return ret; + } + } else if (con->in_seq > con->in_seq_acked) { + ret = prepare_ack(con); + if (ret) { + pr_err("prepare_ack failed: %d\n", ret); + return ret; + } + } else { + goto nothing_pending; + } + +populated: + if (WARN_ON(!iov_iter_count(&con->v2.out_iter))) + return -ENODATA; + dout("%s con %p populated %zu\n", __func__, con, + iov_iter_count(&con->v2.out_iter)); + return 1; + +nothing_pending: + WARN_ON(iov_iter_count(&con->v2.out_iter)); + dout("%s con %p nothing pending\n", __func__, con); + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +int ceph_con_v2_try_write(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d have %zu\n", __func__, con, con->state, + iov_iter_count(&con->v2.out_iter)); + + /* open the socket first? */ + if (con->state == CEPH_CON_S_PREOPEN) { + WARN_ON(con->peer_addr.type != CEPH_ENTITY_ADDR_TYPE_MSGR2); + + /* + * Always bump global_seq. Bump connect_seq only if + * there is a session (i.e. we are reconnecting and will + * send session_reconnect instead of client_ident). 
+ */ + con->v2.global_seq = ceph_get_global_seq(con->msgr, 0); + if (con->v2.server_cookie) + con->v2.connect_seq++; + + ret = prepare_read_banner_prefix(con); + if (ret) { + pr_err("prepare_read_banner_prefix failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + + reset_out_kvecs(con); + ret = prepare_banner(con); + if (ret) { + pr_err("prepare_banner failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + + ret = ceph_tcp_connect(con); + if (ret) { + pr_err("ceph_tcp_connect failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + } + + if (!iov_iter_count(&con->v2.out_iter)) { + ret = populate_out_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "write processing error"; + return ret; + } + } + + tcp_sock_set_cork(con->sock->sk, true); + for (;;) { + ret = ceph_tcp_send(con); + if (ret <= 0) + break; + + ret = populate_out_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "write processing error"; + break; + } + } + + tcp_sock_set_cork(con->sock->sk, false); + return ret; +} + +static u32 crc32c_zeros(u32 crc, int zero_len) +{ + int len; + + while (zero_len) { + len = min(zero_len, (int)PAGE_SIZE); + crc = crc32c(crc, page_address(ceph_zero_page), len); + zero_len -= len; + } + + return crc; +} + +static void prepare_zero_front(struct ceph_connection *con, int resid) +{ + int sent; + + WARN_ON(!resid || resid > front_len(con->out_msg)); + sent = front_len(con->out_msg) - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.front_crc = + crc32c(-1, con->out_msg->front.iov_base, sent); + con->v2.out_epil.front_crc = + crc32c_zeros(con->v2.out_epil.front_crc, resid); + } else { + con->v2.out_epil.front_crc = crc32c_zeros(-1, resid); + } + + con->v2.out_iter.count -= resid; + out_zero_add(con, resid); +} + +static void prepare_zero_middle(struct 
ceph_connection *con, int resid) +{ + int sent; + + WARN_ON(!resid || resid > middle_len(con->out_msg)); + sent = middle_len(con->out_msg) - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.middle_crc = + crc32c(-1, con->out_msg->middle->vec.iov_base, sent); + con->v2.out_epil.middle_crc = + crc32c_zeros(con->v2.out_epil.middle_crc, resid); + } else { + con->v2.out_epil.middle_crc = crc32c_zeros(-1, resid); + } + + con->v2.out_iter.count -= resid; + out_zero_add(con, resid); +} + +static void prepare_zero_data(struct ceph_connection *con) +{ + dout("%s con %p\n", __func__, con); + con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(con->out_msg)); + out_zero_add(con, data_len(con->out_msg)); +} + +static void revoke_at_queue_data(struct ceph_connection *con) +{ + int boundary; + int resid; + + WARN_ON(!data_len(con->out_msg)); + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + + boundary = front_len(con->out_msg) + middle_len(con->out_msg); + if (resid > boundary) { + resid -= boundary; + WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head\n", __func__, con); + if (front_len(con->out_msg)) + prepare_zero_front(con, front_len(con->out_msg)); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + prepare_zero_data(con); + WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + boundary = middle_len(con->out_msg); + if (resid > boundary) { + resid -= boundary; + dout("%s con %p was sending front\n", __func__, con); + prepare_zero_front(con, resid); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + prepare_zero_data(con); + queue_zeros(con); + return; + } + + WARN_ON(!resid); + dout("%s con %p was sending middle\n", __func__, con); + prepare_zero_middle(con, resid); + prepare_zero_data(con); + queue_zeros(con); +} 
+ +static void revoke_at_queue_data_cont(struct ceph_connection *con) +{ + int sent, resid; /* current piece of data */ + + WARN_ON(!data_len(con->out_msg)); + WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + WARN_ON(!resid || resid > con->v2.out_bvec.bv_len); + sent = con->v2.out_bvec.bv_len - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.data_crc = ceph_crc32c_page( + con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page, + con->v2.out_bvec.bv_offset, sent); + ceph_msg_data_advance(&con->v2.out_cursor, sent); + } + WARN_ON(resid > con->v2.out_cursor.total_resid); + con->v2.out_epil.data_crc = crc32c_zeros(con->v2.out_epil.data_crc, + con->v2.out_cursor.total_resid); + + con->v2.out_iter.count -= resid; + out_zero_add(con, con->v2.out_cursor.total_resid); + queue_zeros(con); +} + +static void revoke_at_finish_message(struct ceph_connection *con) +{ + int boundary; + int resid; + + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + + if (!front_len(con->out_msg) && !middle_len(con->out_msg) && + !data_len(con->out_msg)) { + WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head (empty message) - noop\n", + __func__, con); + return; + } + + boundary = front_len(con->out_msg) + middle_len(con->out_msg) + + CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { + resid -= boundary; + WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head\n", __func__, con); + if (front_len(con->out_msg)) + prepare_zero_front(con, front_len(con->out_msg)); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + boundary = middle_len(con->out_msg) + CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { 
+ resid -= boundary; + dout("%s con %p was sending front\n", __func__, con); + prepare_zero_front(con, resid); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + queue_zeros(con); + return; + } + + boundary = CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { + resid -= boundary; + dout("%s con %p was sending middle\n", __func__, con); + prepare_zero_middle(con, resid); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + queue_zeros(con); + return; + } + + WARN_ON(!resid); + dout("%s con %p was sending epilogue - noop\n", __func__, con); +} + +void ceph_con_v2_revoke(struct ceph_connection *con) +{ + WARN_ON(con->v2.out_zero); + + if (con_secure(con)) { + WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE && + con->v2.out_state != OUT_S_FINISH_MESSAGE); + dout("%s con %p secure - noop\n", __func__, con); + return; + } + + switch (con->v2.out_state) { + case OUT_S_QUEUE_DATA: + revoke_at_queue_data(con); + break; + case OUT_S_QUEUE_DATA_CONT: + revoke_at_queue_data_cont(con); + break; + case OUT_S_FINISH_MESSAGE: + revoke_at_finish_message(con); + break; + default: + WARN(1, "bad out_state %d", con->v2.out_state); + break; + } +} + +static void revoke_at_prepare_read_data(struct ceph_connection *con) +{ + int remaining; /* data + [data padding] + epilogue */ + int resid; + + WARN_ON(!data_len(con->in_msg)); + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid); + + if (con_secure(con)) + remaining = padded_len(data_len(con->in_msg)) + + CEPH_EPILOGUE_SECURE_LEN; + else + remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN; + + dout("%s con %p resid %d remaining %d\n", __func__, con, resid, + remaining); + con->v2.in_iter.count -= resid; + set_in_skip(con, resid + remaining); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) +{ + int recved, 
resid; /* current piece of data */ + int remaining; /* [data padding] + epilogue */ + + WARN_ON(!data_len(con->in_msg)); + WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid || resid > con->v2.in_bvec.bv_len); + recved = con->v2.in_bvec.bv_len - resid; + dout("%s con %p recved %d resid %d\n", __func__, con, recved, resid); + + if (recved) + ceph_msg_data_advance(&con->v2.in_cursor, recved); + WARN_ON(resid > con->v2.in_cursor.total_resid); + + if (con_secure(con)) + remaining = padding_len(data_len(con->in_msg)) + + CEPH_EPILOGUE_SECURE_LEN; + else + remaining = CEPH_EPILOGUE_PLAIN_LEN; + + dout("%s con %p total_resid %zu remaining %d\n", __func__, con, + con->v2.in_cursor.total_resid, remaining); + con->v2.in_iter.count -= resid; + set_in_skip(con, con->v2.in_cursor.total_resid + remaining); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +static void revoke_at_handle_epilogue(struct ceph_connection *con) +{ + int resid; + + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid); + + dout("%s con %p resid %d\n", __func__, con, resid); + con->v2.in_iter.count -= resid; + set_in_skip(con, resid); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +void ceph_con_v2_revoke_incoming(struct ceph_connection *con) +{ + switch (con->v2.in_state) { + case IN_S_PREPARE_READ_DATA: + revoke_at_prepare_read_data(con); + break; + case IN_S_PREPARE_READ_DATA_CONT: + revoke_at_prepare_read_data_cont(con); + break; + case IN_S_HANDLE_EPILOGUE: + revoke_at_handle_epilogue(con); + break; + default: + WARN(1, "bad in_state %d", con->v2.in_state); + break; + } +} + +bool ceph_con_v2_opened(struct ceph_connection *con) +{ + return con->v2.peer_global_seq; +} + +void ceph_con_v2_reset_session(struct ceph_connection *con) +{ + con->v2.client_cookie = 0; + con->v2.server_cookie = 0; + con->v2.global_seq = 0; + con->v2.connect_seq = 0; + con->v2.peer_global_seq = 0; +} + +void 
ceph_con_v2_reset_protocol(struct ceph_connection *con) +{ + iov_iter_truncate(&con->v2.in_iter, 0); + iov_iter_truncate(&con->v2.out_iter, 0); + con->v2.out_zero = 0; + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + + if (con->v2.out_enc_pages) { + WARN_ON(!con->v2.out_enc_page_cnt); + ceph_release_page_vector(con->v2.out_enc_pages, + con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = NULL; + con->v2.out_enc_page_cnt = 0; + } + + con->v2.con_mode = CEPH_CON_MODE_UNKNOWN; + memzero_explicit(&con->v2.in_gcm_nonce, CEPH_GCM_IV_LEN); + memzero_explicit(&con->v2.out_gcm_nonce, CEPH_GCM_IV_LEN); + + if (con->v2.hmac_tfm) { + crypto_free_shash(con->v2.hmac_tfm); + con->v2.hmac_tfm = NULL; + } + if (con->v2.gcm_req) { + aead_request_free(con->v2.gcm_req); + con->v2.gcm_req = NULL; + } + if (con->v2.gcm_tfm) { + crypto_free_aead(con->v2.gcm_tfm); + con->v2.gcm_tfm = NULL; + } +} diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index c4cf2529d08b..195ceb8afb06 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -36,57 +36,122 @@ static const struct ceph_connection_operations mon_con_ops; static int __validate_auth(struct ceph_mon_client *monc); +static int decode_mon_info(void **p, void *end, bool msgr2, + struct ceph_entity_addr *addr) +{ + void *mon_info_end; + u32 struct_len; + u8 struct_v; + int ret; + + ret = ceph_start_decoding(p, end, 1, "mon_info_t", &struct_v, + &struct_len); + if (ret) + return ret; + + mon_info_end = *p + struct_len; + ceph_decode_skip_string(p, end, e_inval); /* skip mon name */ + ret = ceph_decode_entity_addrvec(p, end, msgr2, addr); + if (ret) + return ret; + + *p = mon_info_end; + return 0; + +e_inval: + return -EINVAL; +} + /* * Decode a monmap blob (e.g., during mount). + * + * Assume MonMap v3 (i.e. encoding with MONNAMES and MONENC). 
*/ -static struct ceph_monmap *ceph_monmap_decode(void *p, void *end) +static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2) { - struct ceph_monmap *m = NULL; - int i, err = -EINVAL; + struct ceph_monmap *monmap = NULL; struct ceph_fsid fsid; - u32 epoch, num_mon; - u32 len; + u32 struct_len; + int blob_len; + int num_mon; + u8 struct_v; + u32 epoch; + int ret; + int i; + + ceph_decode_32_safe(p, end, blob_len, e_inval); + ceph_decode_need(p, end, blob_len, e_inval); + + ret = ceph_start_decoding(p, end, 6, "monmap", &struct_v, &struct_len); + if (ret) + goto fail; - ceph_decode_32_safe(&p, end, len, bad); - ceph_decode_need(&p, end, len, bad); + dout("%s struct_v %d\n", __func__, struct_v); + ceph_decode_copy_safe(p, end, &fsid, sizeof(fsid), e_inval); + ceph_decode_32_safe(p, end, epoch, e_inval); + if (struct_v >= 6) { + u32 feat_struct_len; + u8 feat_struct_v; - dout("monmap_decode %p %p len %d (%d)\n", p, end, len, (int)(end-p)); - p += sizeof(u16); /* skip version */ + *p += sizeof(struct ceph_timespec); /* skip last_changed */ + *p += sizeof(struct ceph_timespec); /* skip created */ - ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); - ceph_decode_copy(&p, &fsid, sizeof(fsid)); - epoch = ceph_decode_32(&p); + ret = ceph_start_decoding(p, end, 1, "mon_feature_t", + &feat_struct_v, &feat_struct_len); + if (ret) + goto fail; - num_mon = ceph_decode_32(&p); + *p += feat_struct_len; /* skip persistent_features */ + + ret = ceph_start_decoding(p, end, 1, "mon_feature_t", + &feat_struct_v, &feat_struct_len); + if (ret) + goto fail; + + *p += feat_struct_len; /* skip optional_features */ + } + ceph_decode_32_safe(p, end, num_mon, e_inval); + dout("%s fsid %pU epoch %u num_mon %d\n", __func__, &fsid, epoch, + num_mon); if (num_mon > CEPH_MAX_MON) - goto bad; - m = kmalloc(struct_size(m, mon_inst, num_mon), GFP_NOFS); - if (m == NULL) - return ERR_PTR(-ENOMEM); - m->fsid = fsid; - m->epoch = epoch; - m->num_mon = num_mon; - for (i 
= 0; i < num_mon; ++i) { - struct ceph_entity_inst *inst = &m->mon_inst[i]; - - /* copy name portion */ - ceph_decode_copy_safe(&p, end, &inst->name, - sizeof(inst->name), bad); - err = ceph_decode_entity_addr(&p, end, &inst->addr); - if (err) - goto bad; + goto e_inval; + + monmap = kmalloc(struct_size(monmap, mon_inst, num_mon), GFP_NOIO); + if (!monmap) { + ret = -ENOMEM; + goto fail; } - dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, - m->num_mon); - for (i = 0; i < m->num_mon; i++) - dout("monmap_decode mon%d is %s\n", i, - ceph_pr_addr(&m->mon_inst[i].addr)); - return m; -bad: - dout("monmap_decode failed with %d\n", err); - kfree(m); - return ERR_PTR(err); + monmap->fsid = fsid; + monmap->epoch = epoch; + monmap->num_mon = num_mon; + + /* legacy_mon_addr map or mon_info map */ + for (i = 0; i < num_mon; i++) { + struct ceph_entity_inst *inst = &monmap->mon_inst[i]; + + ceph_decode_skip_string(p, end, e_inval); /* skip mon name */ + inst->name.type = CEPH_ENTITY_TYPE_MON; + inst->name.num = cpu_to_le64(i); + + if (struct_v >= 6) + ret = decode_mon_info(p, end, msgr2, &inst->addr); + else + ret = ceph_decode_entity_addr(p, end, &inst->addr); + if (ret) + goto fail; + + dout("%s mon%d addr %s\n", __func__, i, + ceph_pr_addr(&inst->addr)); + } + + return monmap; + +e_inval: + ret = -EINVAL; +fail: + kfree(monmap); + return ERR_PTR(ret); } /* @@ -96,9 +161,11 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) { int i; - for (i = 0; i < m->num_mon; i++) - if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) + for (i = 0; i < m->num_mon; i++) { + if (ceph_addr_equal_no_type(addr, &m->mon_inst[i].addr)) return 1; + } + return 0; } @@ -190,10 +257,16 @@ static void __open_session(struct ceph_mon_client *monc) &monc->monmap->mon_inst[monc->cur_mon].addr); /* - * send an initial keepalive to ensure our timestamp is valid - * by the time we are in an OPENED state + * Queue a keepalive to ensure that in case of an early fault 
+ * the messenger doesn't put us into STANDBY state and instead + * retries. This also ensures that our timestamp is valid by + * the time we finish hunting and delayed_work() checks it. */ ceph_con_keepalive(&monc->con); + if (ceph_msgr2(monc->client)) { + monc->pending_auth = 1; + return; + } /* initiate authentication handshake */ ret = ceph_auth_build_hello(monc->auth, @@ -476,7 +549,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, p = msg->front.iov_base; end = p + msg->front.iov_len; - monmap = ceph_monmap_decode(p, end); + monmap = ceph_monmap_decode(&p, end, ceph_msgr2(client)); if (IS_ERR(monmap)) { pr_err("problem decoding monmap, %d\n", (int)PTR_ERR(monmap)); @@ -1052,8 +1125,9 @@ static void delayed_work(struct work_struct *work) */ static int build_initial_monmap(struct ceph_mon_client *monc) { + __le32 my_type = ceph_msgr2(monc->client) ? + CEPH_ENTITY_ADDR_TYPE_MSGR2 : CEPH_ENTITY_ADDR_TYPE_LEGACY; struct ceph_options *opt = monc->client->options; - struct ceph_entity_addr *mon_addr = opt->mon_addr; int num_mon = opt->num_mon; int i; @@ -1062,12 +1136,16 @@ static int build_initial_monmap(struct ceph_mon_client *monc) GFP_KERNEL); if (!monc->monmap) return -ENOMEM; + for (i = 0; i < num_mon; i++) { - monc->monmap->mon_inst[i].addr = mon_addr[i]; - monc->monmap->mon_inst[i].addr.nonce = 0; - monc->monmap->mon_inst[i].name.type = - CEPH_ENTITY_TYPE_MON; - monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); + struct ceph_entity_inst *inst = &monc->monmap->mon_inst[i]; + + memcpy(&inst->addr.in_addr, &opt->mon_addr[i].in_addr, + sizeof(inst->addr.in_addr)); + inst->addr.type = my_type; + inst->addr.nonce = 0; + inst->name.type = CEPH_ENTITY_TYPE_MON; + inst->name.num = cpu_to_le64(i); } monc->monmap->num_mon = num_mon; return 0; @@ -1089,8 +1167,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) /* connection */ /* authentication */ - monc->auth = ceph_auth_init(cl->options->name, - cl->options->key); + 
monc->auth = ceph_auth_init(cl->options->name, cl->options->key, + cl->options->con_modes); if (IS_ERR(monc->auth)) { err = PTR_ERR(monc->auth); goto out_monmap; @@ -1194,30 +1272,22 @@ static void finish_hunting(struct ceph_mon_client *monc) } } -static void handle_auth_reply(struct ceph_mon_client *monc, - struct ceph_msg *msg) +static void finish_auth(struct ceph_mon_client *monc, int auth_err, + bool was_authed) { - int ret; - int was_auth = 0; + dout("%s auth_err %d was_authed %d\n", __func__, auth_err, was_authed); + WARN_ON(auth_err > 0); - mutex_lock(&monc->mutex); - was_auth = ceph_auth_is_authenticated(monc->auth); monc->pending_auth = 0; - ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, - msg->front.iov_len, - monc->m_auth->front.iov_base, - monc->m_auth->front_alloc_len); - if (ret > 0) { - __send_prepared_auth_request(monc, ret); - goto out; + if (auth_err) { + monc->client->auth_err = auth_err; + wake_up_all(&monc->client->auth_wq); + return; } - finish_hunting(monc); - - if (ret < 0) { - monc->client->auth_err = ret; - } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) { - dout("authenticated, starting session\n"); + if (!was_authed && ceph_auth_is_authenticated(monc->auth)) { + dout("%s authenticated, starting session global_id %llu\n", + __func__, monc->auth->global_id); monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT; monc->client->msgr.inst.name.num = @@ -1229,11 +1299,27 @@ static void handle_auth_reply(struct ceph_mon_client *monc, pr_info("mon%d %s session established\n", monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr)); } +} -out: +static void handle_auth_reply(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + bool was_authed; + int ret; + + mutex_lock(&monc->mutex); + was_authed = ceph_auth_is_authenticated(monc->auth); + ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, + msg->front.iov_len, + monc->m_auth->front.iov_base, + monc->m_auth->front_alloc_len); + if (ret > 0) { + 
__send_prepared_auth_request(monc, ret); + } else { + finish_auth(monc, ret, was_authed); + finish_hunting(monc); + } mutex_unlock(&monc->mutex); - if (monc->client->auth_err < 0) - wake_up_all(&monc->client->auth_wq); } static int __validate_auth(struct ceph_mon_client *monc) @@ -1262,10 +1348,92 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc) } EXPORT_SYMBOL(ceph_monc_validate_auth); +static int mon_get_auth_request(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mon_client *monc = con->private; + int ret; + + mutex_lock(&monc->mutex); + ret = ceph_auth_get_request(monc->auth, buf, *buf_len); + mutex_unlock(&monc->mutex); + if (ret < 0) + return ret; + + *buf_len = ret; + *authorizer = NULL; + *authorizer_len = 0; + return 0; +} + +static int mon_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mon_client *monc = con->private; + int ret; + + mutex_lock(&monc->mutex); + ret = ceph_auth_handle_reply_more(monc->auth, reply, reply_len, + buf, *buf_len); + mutex_unlock(&monc->mutex); + if (ret < 0) + return ret; + + *buf_len = ret; + *authorizer = NULL; + *authorizer_len = 0; + return 0; +} + +static int mon_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_mon_client *monc = con->private; + bool was_authed; + int ret; + + mutex_lock(&monc->mutex); + WARN_ON(!monc->hunting); + was_authed = ceph_auth_is_authenticated(monc->auth); + ret = ceph_auth_handle_reply_done(monc->auth, global_id, + reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); + finish_auth(monc, ret, was_authed); + if (!ret) + finish_hunting(monc); + mutex_unlock(&monc->mutex); + return 0; +} + +static int mon_handle_auth_bad_method(struct 
ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_mon_client *monc = con->private; + bool was_authed; + + mutex_lock(&monc->mutex); + WARN_ON(!monc->hunting); + was_authed = ceph_auth_is_authenticated(monc->auth); + ceph_auth_handle_bad_method(monc->auth, used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt); + finish_auth(monc, -EACCES, was_authed); + mutex_unlock(&monc->mutex); + return 0; +} + /* * handle incoming message */ -static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +static void mon_dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_mon_client *monc = con->private; int type = le16_to_cpu(msg->hdr.type); @@ -1397,19 +1565,23 @@ static void mon_fault(struct ceph_connection *con) * will come from the messenger workqueue, which is drained prior to * mon_client destruction. */ -static struct ceph_connection *con_get(struct ceph_connection *con) +static struct ceph_connection *mon_get_con(struct ceph_connection *con) { return con; } -static void con_put(struct ceph_connection *con) +static void mon_put_con(struct ceph_connection *con) { } static const struct ceph_connection_operations mon_con_ops = { - .get = con_get, - .put = con_put, - .dispatch = dispatch, - .fault = mon_fault, + .get = mon_get_con, + .put = mon_put_con, .alloc_msg = mon_alloc_msg, + .dispatch = mon_dispatch, + .fault = mon_fault, + .get_auth_request = mon_get_auth_request, + .handle_auth_reply_more = mon_handle_auth_reply_more, + .handle_auth_done = mon_handle_auth_done, + .handle_auth_bad_method = mon_handle_auth_bad_method, }; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 7901ab6c79fd..ff8624a7c964 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -3918,9 +3918,11 @@ static int handle_one_map(struct ceph_osd_client *osdc, set_pool_was_full(osdc); if (incremental) - newmap = 
osdmap_apply_incremental(&p, end, osdc->osdmap); + newmap = osdmap_apply_incremental(&p, end, + ceph_msgr2(osdc->client), + osdc->osdmap); else - newmap = ceph_osdmap_decode(&p, end); + newmap = ceph_osdmap_decode(&p, end, ceph_msgr2(osdc->client)); if (IS_ERR(newmap)) return PTR_ERR(newmap); @@ -5410,7 +5412,7 @@ void ceph_osdc_cleanup(void) /* * handle incoming message */ -static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +static void osd_dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_osd *osd = con->private; struct ceph_osd_client *osdc = osd->o_osdc; @@ -5532,9 +5534,9 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) return m; } -static struct ceph_msg *alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip) +static struct ceph_msg *osd_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) { struct ceph_osd *osd = con->private; int type = le16_to_cpu(hdr->type); @@ -5558,7 +5560,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, /* * Wrappers to refcount containing ceph_osd struct */ -static struct ceph_connection *get_osd_con(struct ceph_connection *con) +static struct ceph_connection *osd_get_con(struct ceph_connection *con) { struct ceph_osd *osd = con->private; if (get_osd(osd)) @@ -5566,7 +5568,7 @@ static struct ceph_connection *get_osd_con(struct ceph_connection *con) return NULL; } -static void put_osd_con(struct ceph_connection *con) +static void osd_put_con(struct ceph_connection *con) { struct ceph_osd *osd = con->private; put_osd(osd); @@ -5575,39 +5577,29 @@ static void put_osd_con(struct ceph_connection *con) /* * authentication */ + /* * Note: returned pointer is the address of a structure that's * managed separately. Caller must *not* attempt to free it. 
*/ -static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, - int *proto, int force_new) +static struct ceph_auth_handshake * +osd_get_authorizer(struct ceph_connection *con, int *proto, int force_new) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; struct ceph_auth_handshake *auth = &o->o_auth; + int ret; - if (force_new && auth->authorizer) { - ceph_auth_destroy_authorizer(auth->authorizer); - auth->authorizer = NULL; - } - if (!auth->authorizer) { - int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, - auth); - if (ret) - return ERR_PTR(ret); - } else { - int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD, - auth); - if (ret) - return ERR_PTR(ret); - } - *proto = ac->protocol; + ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD, + force_new, proto, NULL, NULL); + if (ret) + return ERR_PTR(ret); return auth; } -static int add_authorizer_challenge(struct ceph_connection *con, +static int osd_add_authorizer_challenge(struct ceph_connection *con, void *challenge_buf, int challenge_buf_len) { struct ceph_osd *o = con->private; @@ -5618,16 +5610,19 @@ static int add_authorizer_challenge(struct ceph_connection *con, challenge_buf, challenge_buf_len); } -static int verify_authorizer_reply(struct ceph_connection *con) +static int osd_verify_authorizer_reply(struct ceph_connection *con) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; - return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer); + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + auth->authorizer_reply_buf, auth->authorizer_reply_buf_len, + NULL, NULL, NULL, NULL); } -static int invalidate_authorizer(struct ceph_connection *con) +static int osd_invalidate_authorizer(struct ceph_connection *con) { struct 
ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; @@ -5637,6 +5632,80 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&osdc->client->monc); } +static int osd_get_auth_request(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + int ret; + + ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int osd_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + int ret; + + ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int osd_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + + return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); +} + +static int osd_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_osd *o = con->private; + struct ceph_mon_client *monc = 
&o->o_osdc->client->monc; + int ret; + + if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_OSD, + used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt)) { + ret = ceph_monc_validate_auth(monc); + if (ret) + return ret; + } + + return -EACCES; +} + static void osd_reencode_message(struct ceph_msg *msg) { int type = le16_to_cpu(msg->hdr.type); @@ -5662,16 +5731,20 @@ static int osd_check_message_signature(struct ceph_msg *msg) } static const struct ceph_connection_operations osd_con_ops = { - .get = get_osd_con, - .put = put_osd_con, - .dispatch = dispatch, - .get_authorizer = get_authorizer, - .add_authorizer_challenge = add_authorizer_challenge, - .verify_authorizer_reply = verify_authorizer_reply, - .invalidate_authorizer = invalidate_authorizer, - .alloc_msg = alloc_msg, + .get = osd_get_con, + .put = osd_put_con, + .alloc_msg = osd_alloc_msg, + .dispatch = osd_dispatch, + .fault = osd_fault, .reencode_message = osd_reencode_message, + .get_authorizer = osd_get_authorizer, + .add_authorizer_challenge = osd_add_authorizer_challenge, + .verify_authorizer_reply = osd_verify_authorizer_reply, + .invalidate_authorizer = osd_invalidate_authorizer, .sign_message = osd_sign_message, .check_message_signature = osd_check_message_signature, - .fault = osd_fault, + .get_auth_request = osd_get_auth_request, + .handle_auth_reply_more = osd_handle_auth_reply_more, + .handle_auth_done = osd_handle_auth_done, + .handle_auth_bad_method = osd_handle_auth_bad_method, }; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index fa08c15be0c0..2b1dd252f231 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1647,7 +1647,8 @@ static int decode_old_pg_upmap_items(void **p, void *end, /* * decode a full map. 
*/ -static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) +static int osdmap_decode(void **p, void *end, bool msgr2, + struct ceph_osdmap *map) { u8 struct_v; u32 epoch = 0; @@ -1718,9 +1719,16 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) goto e_inval; for (i = 0; i < map->max_osd; i++) { - err = ceph_decode_entity_addr(p, end, &map->osd_addr[i]); + struct ceph_entity_addr *addr = &map->osd_addr[i]; + + if (struct_v >= 8) + err = ceph_decode_entity_addrvec(p, end, msgr2, addr); + else + err = ceph_decode_entity_addr(p, end, addr); if (err) goto bad; + + dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr)); } /* pg_temp */ @@ -1790,7 +1798,7 @@ bad: /* * Allocate and decode a full map. */ -struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2) { struct ceph_osdmap *map; int ret; @@ -1799,7 +1807,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) if (!map) return ERR_PTR(-ENOMEM); - ret = osdmap_decode(p, end, map); + ret = osdmap_decode(p, end, msgr2, map); if (ret) { ceph_osdmap_destroy(map); return ERR_PTR(ret); @@ -1817,12 +1825,13 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) * new_state: { osd=6, xorstate=EXISTS } # clear osd_state */ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, - struct ceph_osdmap *map) + bool msgr2, struct ceph_osdmap *map) { void *new_up_client; void *new_state; void *new_weight_end; u32 len; + int ret; int i; new_up_client = *p; @@ -1831,8 +1840,12 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, struct ceph_entity_addr addr; ceph_decode_skip_32(p, end, e_inval); - if (ceph_decode_entity_addr(p, end, &addr)) - goto e_inval; + if (struct_v >= 7) + ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + ret = ceph_decode_entity_addr(p, end, &addr); + if (ret) + return ret; } new_state = *p; @@ -1874,7 +1887,6 @@ static 
int decode_new_up_state_weight(void **p, void *end, u8 struct_v, while (len--) { s32 osd; u32 xorstate; - int ret; osd = ceph_decode_32(p); if (struct_v >= 5) @@ -1910,8 +1922,15 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, osd = ceph_decode_32(p); BUG_ON(osd >= map->max_osd); - if (ceph_decode_entity_addr(p, end, &addr)) - goto e_inval; + if (struct_v >= 7) + ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + ret = ceph_decode_entity_addr(p, end, &addr); + if (ret) + return ret; + + dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr)); + pr_info("osd%d up\n", osd); map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; map->osd_addr[osd] = addr; @@ -1927,7 +1946,7 @@ e_inval: /* * decode and apply an incremental map update. */ -struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2, struct ceph_osdmap *map) { struct ceph_fsid fsid; @@ -1962,7 +1981,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); - return ceph_osdmap_decode(p, min(*p+len, end)); + return ceph_osdmap_decode(p, min(*p+len, end), msgr2); } /* new crush? 
*/ @@ -2014,7 +2033,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_up_client, new_state, new_weight */ - err = decode_new_up_state_weight(p, end, struct_v, map); + err = decode_new_up_state_weight(p, end, struct_v, msgr2, map); if (err) goto bad; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index c907f0dc7f87..4edd033e899c 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -6,6 +6,7 @@ #include <linux/types.h> #include <linux/spinlock.h> #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/bpf_local_storage.h> #include <net/bpf_sk_storage.h> @@ -15,20 +16,8 @@ DEFINE_BPF_STORAGE_CACHE(sk_cache); -static int omem_charge(struct sock *sk, unsigned int size) -{ - /* same check as in sock_kmalloc() */ - if (size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { - atomic_add(size, &sk->sk_omem_alloc); - return 0; - } - - return -ENOMEM; -} - static struct bpf_local_storage_data * -sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) +bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) { struct bpf_local_storage *sk_storage; struct bpf_local_storage_map *smap; @@ -41,11 +30,11 @@ sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit); } -static int sk_storage_delete(struct sock *sk, struct bpf_map *map) +static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) { struct bpf_local_storage_data *sdata; - sdata = sk_storage_lookup(sk, map, false); + sdata = bpf_sk_storage_lookup(sk, map, false); if (!sdata) return -ENOENT; @@ -94,7 +83,7 @@ void bpf_sk_storage_free(struct sock *sk) kfree_rcu(sk_storage, rcu); } -static void sk_storage_map_free(struct bpf_map *map) +static void bpf_sk_storage_map_free(struct bpf_map *map) { struct bpf_local_storage_map *smap; @@ -103,7 +92,7 @@ 
static void sk_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(smap); } -static struct bpf_map *sk_storage_map_alloc(union bpf_attr *attr) +static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) { struct bpf_local_storage_map *smap; @@ -130,7 +119,7 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { - sdata = sk_storage_lookup(sock->sk, map, true); + sdata = bpf_sk_storage_lookup(sock->sk, map, true); sockfd_put(sock); return sdata ? sdata->data : NULL; } @@ -166,7 +155,7 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { - err = sk_storage_delete(sock->sk, map); + err = bpf_sk_storage_del(sock->sk, map); sockfd_put(sock); return err; } @@ -272,7 +261,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE) return (unsigned long)NULL; - sdata = sk_storage_lookup(sk, map, true); + sdata = bpf_sk_storage_lookup(sk, map, true); if (sdata) return (unsigned long)sdata->data; @@ -305,7 +294,7 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) if (refcount_inc_not_zero(&sk->sk_refcnt)) { int err; - err = sk_storage_delete(sk, map); + err = bpf_sk_storage_del(sk, map); sock_put(sk); return err; } @@ -313,14 +302,23 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) return -ENOENT; } -static int sk_storage_charge(struct bpf_local_storage_map *smap, - void *owner, u32 size) +static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, + void *owner, u32 size) { - return omem_charge(owner, size); + struct sock *sk = (struct sock *)owner; + + /* same check as in sock_kmalloc() */ + if (size <= sysctl_optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + atomic_add(size, &sk->sk_omem_alloc); + 
return 0; + } + + return -ENOMEM; } -static void sk_storage_uncharge(struct bpf_local_storage_map *smap, - void *owner, u32 size) +static void bpf_sk_storage_uncharge(struct bpf_local_storage_map *smap, + void *owner, u32 size) { struct sock *sk = owner; @@ -328,7 +326,7 @@ static void sk_storage_uncharge(struct bpf_local_storage_map *smap, } static struct bpf_local_storage __rcu ** -sk_storage_ptr(void *owner) +bpf_sk_storage_ptr(void *owner) { struct sock *sk = owner; @@ -339,8 +337,8 @@ static int sk_storage_map_btf_id; const struct bpf_map_ops sk_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, - .map_alloc = sk_storage_map_alloc, - .map_free = sk_storage_map_free, + .map_alloc = bpf_sk_storage_map_alloc, + .map_free = bpf_sk_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, .map_update_elem = bpf_fd_sk_storage_update_elem, @@ -348,9 +346,9 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_check_btf = bpf_local_storage_map_check_btf, .map_btf_name = "bpf_local_storage_map", .map_btf_id = &sk_storage_map_btf_id, - .map_local_storage_charge = sk_storage_charge, - .map_local_storage_uncharge = sk_storage_uncharge, - .map_owner_storage_ptr = sk_storage_ptr, + .map_local_storage_charge = bpf_sk_storage_charge, + .map_local_storage_uncharge = bpf_sk_storage_uncharge, + .map_owner_storage_ptr = bpf_sk_storage_ptr, }; const struct bpf_func_proto bpf_sk_storage_get_proto = { @@ -381,6 +379,80 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; +static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) +{ + const struct btf *btf_vmlinux; + const struct btf_type *t; + const char *tname; + u32 btf_id; + + if (prog->aux->dst_prog) + return false; + + /* Ensure the tracing program is not tracing + * any bpf_sk_storage*() function and also + * use the bpf_sk_storage_(get|delete) 
helper. + */ + switch (prog->expected_attach_type) { + case BPF_TRACE_ITER: + case BPF_TRACE_RAW_TP: + /* bpf_sk_storage has no trace point */ + return true; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + btf_vmlinux = bpf_get_btf_vmlinux(); + btf_id = prog->aux->attach_btf_id; + t = btf_type_by_id(btf_vmlinux, btf_id); + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + return !!strncmp(tname, "bpf_sk_storage", + strlen("bpf_sk_storage")); + default: + return false; + } + + return false; +} + +BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags) +{ + if (in_irq() || in_nmi()) + return (unsigned long)NULL; + + return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags); +} + +BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, + struct sock *, sk) +{ + if (in_irq() || in_nmi()) + return -EPERM; + + return ____bpf_sk_storage_delete(map, sk); +} + +const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = { + .func = bpf_sk_storage_get_tracing, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, + .allowed = bpf_sk_storage_tracing_allowed, +}; + +const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = { + .func = bpf_sk_storage_delete_tracing, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .allowed = bpf_sk_storage_tracing_allowed, +}; + struct bpf_sk_storage_diag { u32 nr_maps; struct bpf_map *maps[]; diff --git a/net/core/datagram.c b/net/core/datagram.c index 9fcaa544f11a..15ab9ffb27fe 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -709,7 +709,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter 
*from) EXPORT_SYMBOL(zerocopy_sg_from_iter); /** - * skb_copy_and_csum_datagram_iter - Copy datagram to an iovec iterator + * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator * and update a checksum. * @skb: buffer to copy * @offset: offset in the buffer to start copying from @@ -721,8 +721,16 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, __wsum *csump) { - return __skb_datagram_iter(skb, offset, to, len, true, - csum_and_copy_to_iter, csump); + struct csum_state csdata = { .csum = *csump }; + int ret; + + ret = __skb_datagram_iter(skb, offset, to, len, true, + csum_and_copy_to_iter, &csdata); + if (ret) + return ret; + + *csump = csdata.csum; + return 0; } /** diff --git a/net/core/dev.c b/net/core/dev.c index 82dc6b48e45f..6c5967e80132 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -91,6 +91,7 @@ #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/skbuff.h> +#include <linux/kthread.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <net/net_namespace.h> @@ -101,6 +102,7 @@ #include <net/dsa.h> #include <net/dst.h> #include <net/dst_metadata.h> +#include <net/gro.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/checksum.h> @@ -1069,19 +1071,6 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); -struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) -{ - struct net_device *dev; - - ASSERT_RTNL(); - for_each_netdev(net, dev) - if (dev->type == type) - return dev; - - return NULL; -} -EXPORT_SYMBOL(__dev_getfirstbyhwtype); - struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; @@ -1470,6 +1459,25 @@ void netdev_state_change(struct net_device *dev) EXPORT_SYMBOL(netdev_state_change); /** + * __netdev_notify_peers - notify network peers about existence of @dev, + * to be called 
when rtnl lock is already held. + * @dev: network device + * + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. + */ +void __netdev_notify_peers(struct net_device *dev) +{ + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); +} +EXPORT_SYMBOL(__netdev_notify_peers); + +/** * netdev_notify_peers - notify network peers about existence of @dev * @dev: network device * @@ -1482,12 +1490,32 @@ EXPORT_SYMBOL(netdev_state_change); void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); - call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); - call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); + __netdev_notify_peers(dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); +static int napi_threaded_poll(void *data); + +static int napi_kthread_create(struct napi_struct *n) +{ + int err = 0; + + /* Create and wake up the kthread once to put it in + * TASK_INTERRUPTIBLE mode to avoid the blocked task + * warning and work with loadavg. 
+ */ + n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", + n->dev->name, n->napi_id); + if (IS_ERR(n->thread)) { + err = PTR_ERR(n->thread); + pr_err("kthread_run failed with err %d\n", err); + n->thread = NULL; + } + + return err; +} + static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; @@ -2189,28 +2217,14 @@ static inline void net_timestamp_set(struct sk_buff *skb) bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { - unsigned int len; - - if (!(dev->flags & IFF_UP)) - return false; - - len = dev->mtu + dev->hard_header_len + VLAN_HLEN; - if (skb->len <= len) - return true; - - /* if TSO is enabled, we don't care about the length as the packet - * could be forwarded without being segmented before - */ - if (skb_is_gso(skb)) - return true; - - return false; + return __is_skb_forwardable(dev, skb, true); } EXPORT_SYMBOL_GPL(is_skb_forwardable); -int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, + bool check_mtu) { - int ret = ____dev_forward_skb(dev, skb); + int ret = ____dev_forward_skb(dev, skb, check_mtu); if (likely(!ret)) { skb->protocol = eth_type_trans(skb, dev); @@ -2219,6 +2233,11 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) return ret; } + +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, true); +} EXPORT_SYMBOL_GPL(__dev_forward_skb); /** @@ -2245,6 +2264,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(dev_forward_skb); +int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); +} + static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) @@ -3206,7 +3230,7 @@ int 
skb_checksum_help(struct sk_buff *skb) if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed; - if (unlikely(skb_shinfo(skb)->gso_size)) { + if (unlikely(skb_is_gso(skb))) { skb_warn_bad_offload(skb); return -EINVAL; } @@ -3495,6 +3519,11 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (gso_segs > dev->gso_max_segs) return features & ~NETIF_F_GSO_MASK; + if (!skb_shinfo(skb)->gso_type) { + skb_warn_bad_offload(skb); + return features & ~NETIF_F_GSO_MASK; + } + /* Support for GSO partial features requires software * intervention before we can actually process the packets * so we need to strip support for any partial features now @@ -3607,11 +3636,22 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, int skb_csum_hwoffload_help(struct sk_buff *skb, const netdev_features_t features) { - if (unlikely(skb->csum_not_inet)) + if (unlikely(skb_csum_is_sctp(skb))) return !!(features & NETIF_F_SCTP_CRC) ? 0 : skb_crc32c_csum_help(skb); - return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb); + if (features & NETIF_F_HW_CSUM) + return 0; + + if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { + switch (skb->csum_offset) { + case offsetof(struct tcphdr, check): + case offsetof(struct udphdr, check): + return 0; + } + } + + return skb_checksum_help(skb); } EXPORT_SYMBOL(skb_csum_hwoffload_help); @@ -3867,6 +3907,8 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return skb; /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. 
*/ + qdisc_skb_cb(skb)->mru = 0; + qdisc_skb_cb(skb)->post_ct = false; mini_qdisc_bstats_cpu_update(miniq, skb); switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { @@ -4072,7 +4114,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_reset_mac_header(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) - __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); + __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. Also * stops preemption for RCU. @@ -4180,7 +4222,7 @@ int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) } EXPORT_SYMBOL(dev_queue_xmit_accel); -int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { struct net_device *dev = skb->dev; struct sk_buff *orig_skb = skb; @@ -4210,17 +4252,13 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) dev_xmit_recursion_dec(); local_bh_enable(); - - if (!dev_xmit_complete(ret)) - kfree_skb(skb); - return ret; drop: atomic_long_inc(&dev->tx_dropped); kfree_skb_list(skb); return NET_XMIT_DROP; } -EXPORT_SYMBOL(dev_direct_xmit); +EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines @@ -4245,6 +4283,22 @@ int gro_normal_batch __read_mostly = 8; static inline void ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi) { + struct task_struct *thread; + + if (test_bit(NAPI_STATE_THREADED, &napi->state)) { + /* Paired with smp_mb__before_atomic() in + * napi_enable()/dev_set_threaded(). + * Use READ_ONCE() to guarantee a complete + * read on napi->thread. Only call + * wake_up_process() when it's not NULL. 
+ */ + thread = READ_ONCE(napi->thread); + if (thread) { + wake_up_process(thread); + return; + } + } + list_add_tail(&napi->poll_list, &sd->poll_list); __raise_softirq_irqoff(NET_RX_SOFTIRQ); } @@ -4596,14 +4650,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { + void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; - void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; + u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; bool orig_bcast; - int hlen, off; - u32 mac_len; + int off; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. @@ -4635,15 +4689,16 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, * header. */ mac_len = skb->data - skb_mac_header(skb); - hlen = skb_headlen(skb) + mac_len; - xdp->data = skb->data - mac_len; - xdp->data_meta = xdp->data; - xdp->data_end = xdp->data + hlen; - xdp->data_hard_start = skb->data - skb_headroom(skb); + hard_start = skb->data - skb_headroom(skb); /* SKB "head" area always have tailroom for skb_shared_info */ - xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; - xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + frame_sz = (void *)skb_end_pointer(skb) - hard_start; + frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + rxqueue = netif_get_rxqueue(skb); + xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); + xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, + skb_headlen(skb) + mac_len, true); orig_data_end = xdp->data_end; orig_data = xdp->data; @@ -4651,9 +4706,6 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; - rxqueue = netif_get_rxqueue(skb); - xdp->rxq = &rxqueue->xdp_rxq; - act = bpf_prog_run_xdp(xdp_prog, xdp); /* check if bpf_xdp_adjust_head was used */ @@ -4888,8 +4940,6 @@ static 
__latent_entropy void net_tx_action(struct softirq_action *h) else __kfree_skb_defer(skb); } - - __kfree_skb_flush(); } if (sd->output_queue) { @@ -4954,6 +5004,8 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, } qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_skb_cb(skb)->mru = 0; + qdisc_skb_cb(skb)->post_ct = false; skb->tc_at_ingress = 1; mini_qdisc_bstats_cpu_update(miniq, skb); @@ -5143,8 +5195,7 @@ another_round: skb_reset_mac_len(skb); } - if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || - skb->protocol == cpu_to_be16(ETH_P_8021AD)) { + if (eth_type_vlan(skb->protocol)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; @@ -5228,8 +5279,7 @@ check_vlan_id: * find vlan device. */ skb->pkt_type = PACKET_OTHERHOST; - } else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || - skb->protocol == cpu_to_be16(ETH_P_8021AD)) { + } else if (eth_type_vlan(skb->protocol)) { /* Outer header is 802.1P with vlan 0, inner header is * 802.1Q or 802.1AD and vlan_do_receive() above could * not find vlan dev for vlan id 0. @@ -5705,7 +5755,7 @@ static void flush_all_backlogs(void) } /* we can have in flight packet[s] on the cpus we are not flushing, - * synchronize_net() in rollback_registered_many() will take care of + * synchronize_net() in unregister_netdevice_many() will take care of * them */ for_each_cpu(cpu, &flush_cpus) @@ -5727,15 +5777,14 @@ static void gro_normal_list(struct napi_struct *napi) /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, * pass the whole batch up to the stack. 
*/ -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) +static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs) { list_add_tail(&skb->list, &napi->rx_list); - if (++napi->rx_count >= gro_normal_batch) + napi->rx_count += segs; + if (napi->rx_count >= gro_normal_batch) gro_normal_list(napi); } -INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); -INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { struct packet_offload *ptype; @@ -5769,7 +5818,7 @@ static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) } out: - gro_normal_one(napi, skb); + gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); return NET_RX_SUCCESS; } @@ -5904,10 +5953,6 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) napi_gro_complete(napi, oldest); } -INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, - struct sk_buff *)); -INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, - struct sk_buff *)); static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); @@ -6046,31 +6091,20 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); -static void napi_skb_free_stolen_head(struct sk_buff *skb) -{ - skb_dst_drop(skb); - skb_ext_put(skb); - kmem_cache_free(skbuff_head_cache, skb); -} - static gro_result_t napi_skb_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) { switch (ret) { case GRO_NORMAL: - gro_normal_one(napi, skb); - break; - - case GRO_DROP: - kfree_skb(skb); + gro_normal_one(napi, skb, 1); break; case GRO_MERGED_FREE: if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) napi_skb_free_stolen_head(skb); else - __kfree_skb(skb); + __kfree_skb_defer(skb); 
break; case GRO_HELD: @@ -6147,11 +6181,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_NORMAL) - gro_normal_one(napi, skb); - break; - - case GRO_DROP: - napi_reuse_skb(napi, skb); + gro_normal_one(napi, skb, 1); break; case GRO_MERGED_FREE: @@ -6215,9 +6245,6 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) gro_result_t ret; struct sk_buff *skb = napi_frags_skb(napi); - if (!skb) - return GRO_DROP; - trace_napi_gro_frags_entry(skb); ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); @@ -6458,7 +6485,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done) WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); - new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | + NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, * because we will call napi->poll() one more time. @@ -6495,10 +6523,30 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) #if defined(CONFIG_NET_RX_BUSY_POLL) -#define BUSY_POLL_BUDGET 8 +static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) +{ + if (!skip_schedule) { + gro_normal_list(napi); + __napi_schedule(napi); + return; + } -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) + if (napi->gro_bitmask) { + /* flush too old packets + * If HZ < 1000, flush all packets. 
+ */ + napi_gro_flush(napi, HZ >= 1000); + } + + gro_normal_list(napi); + clear_bit(NAPI_STATE_SCHED, &napi->state); +} + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, + u16 budget) { + bool skip_schedule = false; + unsigned long timeout; int rc; /* Busy polling means there is a high chance device driver hard irq @@ -6515,29 +6563,33 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) local_bh_disable(); + if (prefer_busy_poll) { + napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); + timeout = READ_ONCE(napi->dev->gro_flush_timeout); + if (napi->defer_hard_irqs_count && timeout) { + hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); + skip_schedule = true; + } + } + /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ - rc = napi->poll(napi, BUSY_POLL_BUDGET); + rc = napi->poll(napi, budget); /* We can't gro_normal_list() here, because napi->poll() might have * rearmed the napi (napi_complete_done()) in which case it could * already be running on another CPU. */ - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); + trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); - if (rc == BUSY_POLL_BUDGET) { - /* As the whole budget was spent, we still own the napi so can - * safely handle the rx_list. - */ - gro_normal_list(napi); - __napi_schedule(napi); - } + if (rc == budget) + __busy_poll_stop(napi, skip_schedule); local_bh_enable(); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg) + void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); @@ -6565,17 +6617,23 @@ restart: * we avoid dirtying napi->state as much as we can. 
*/ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | - NAPIF_STATE_IN_BUSY_POLL)) + NAPIF_STATE_IN_BUSY_POLL)) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | - NAPIF_STATE_SCHED) != val) + NAPIF_STATE_SCHED) != val) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } - work = napi_poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, work, BUSY_POLL_BUDGET); + work = napi_poll(napi, budget); + trace_napi_poll(napi, work, budget); gro_normal_list(napi); count: if (work > 0) @@ -6588,7 +6646,7 @@ count: if (unlikely(need_resched())) { if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); preempt_enable(); rcu_read_unlock(); cond_resched(); @@ -6599,7 +6657,7 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); preempt_enable(); out: rcu_read_unlock(); @@ -6650,8 +6708,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) * NAPI_STATE_MISSED, since we do not react to a device IRQ. 
*/ if (!napi_disable_pending(napi) && - !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); __napi_schedule_irqoff(napi); + } return HRTIMER_NORESTART; } @@ -6667,6 +6727,49 @@ static void init_gro_hash(struct napi_struct *napi) napi->gro_bitmask = 0; } +int dev_set_threaded(struct net_device *dev, bool threaded) +{ + struct napi_struct *napi; + int err = 0; + + if (dev->threaded == threaded) + return 0; + + if (threaded) { + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (!napi->thread) { + err = napi_kthread_create(napi); + if (err) { + threaded = false; + break; + } + } + } + } + + dev->threaded = threaded; + + /* Make sure kthread is created before THREADED bit + * is set. + */ + smp_mb__before_atomic(); + + /* Setting/unsetting threaded mode on a napi might not immediately + * take effect, if the current napi instance is actively being + * polled. In this case, the switch between threaded mode and + * softirq mode will happen in the next round of napi_schedule(). + * This should not cause hiccups/stalls to the live traffic. + */ + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (threaded) + set_bit(NAPI_STATE_THREADED, &napi->state); + else + clear_bit(NAPI_STATE_THREADED, &napi->state); + } + + return err; +} + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { @@ -6694,6 +6797,12 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, set_bit(NAPI_STATE_NPSVC, &napi->state); list_add_rcu(&napi->dev_list, &dev->napi_list); napi_hash_add(napi); + /* Create kthread for this napi if dev->threaded is set. + * Clear dev->threaded if kthread creation failed so that + * threaded mode will not be enabled in napi_enable(). 
+ */ + if (dev->threaded && napi_kthread_create(napi)) + dev->threaded = 0; } EXPORT_SYMBOL(netif_napi_add); @@ -6709,10 +6818,30 @@ void napi_disable(struct napi_struct *n) hrtimer_cancel(&n->timer); + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); clear_bit(NAPI_STATE_DISABLE, &n->state); + clear_bit(NAPI_STATE_THREADED, &n->state); } EXPORT_SYMBOL(napi_disable); +/** + * napi_enable - enable NAPI scheduling + * @n: NAPI context + * + * Resume NAPI from being scheduled on this context. + * Must be paired with napi_disable. + */ +void napi_enable(struct napi_struct *n) +{ + BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); + smp_mb__before_atomic(); + clear_bit(NAPI_STATE_SCHED, &n->state); + clear_bit(NAPI_STATE_NPSVC, &n->state); + if (n->dev->threaded && n->thread) + set_bit(NAPI_STATE_THREADED, &n->state); +} +EXPORT_SYMBOL(napi_enable); + static void flush_gro_hash(struct napi_struct *napi) { int i; @@ -6738,18 +6867,18 @@ void __netif_napi_del(struct napi_struct *napi) flush_gro_hash(napi); napi->gro_bitmask = 0; + + if (napi->thread) { + kthread_stop(napi->thread); + napi->thread = NULL; + } } EXPORT_SYMBOL(__netif_napi_del); -static int napi_poll(struct napi_struct *n, struct list_head *repoll) +static int __napi_poll(struct napi_struct *n, bool *repoll) { - void *have; int work, weight; - list_del_init(&n->poll_list); - - have = netpoll_poll_lock(n); - weight = n->weight; /* This NAPI_STATE_SCHED test is for avoiding a race @@ -6769,7 +6898,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) n->poll, work, weight); if (likely(work < weight)) - goto out_unlock; + return work; /* Drivers must not modify the NAPI state if they * consume the entire weight. 
In such cases this code @@ -6778,7 +6907,20 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) */ if (unlikely(napi_disable_pending(n))) { napi_complete(n); - goto out_unlock; + return work; + } + + /* The NAPI context has more processing work, but busy-polling + * is preferred. Exit early. + */ + if (napi_prefer_busy_poll(n)) { + if (napi_complete_done(n, work)) { + /* If timeout is not set, we need to make sure + * that the NAPI is re-scheduled. + */ + napi_schedule(n); + } + return work; } if (n->gro_bitmask) { @@ -6796,17 +6938,78 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) if (unlikely(!list_empty(&n->poll_list))) { pr_warn_once("%s: Budget exhausted after napi rescheduled\n", n->dev ? n->dev->name : "backlog"); - goto out_unlock; + return work; } - list_add_tail(&n->poll_list, repoll); + *repoll = true; + + return work; +} + +static int napi_poll(struct napi_struct *n, struct list_head *repoll) +{ + bool do_repoll = false; + void *have; + int work; + + list_del_init(&n->poll_list); + + have = netpoll_poll_lock(n); + + work = __napi_poll(n, &do_repoll); + + if (do_repoll) + list_add_tail(&n->poll_list, repoll); -out_unlock: netpoll_poll_unlock(have); return work; } +static int napi_thread_wait(struct napi_struct *napi) +{ + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop() && !napi_disable_pending(napi)) { + if (test_bit(NAPI_STATE_SCHED, &napi->state)) { + WARN_ON(!list_empty(&napi->poll_list)); + __set_current_state(TASK_RUNNING); + return 0; + } + + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return -1; +} + +static int napi_threaded_poll(void *data) +{ + struct napi_struct *napi = data; + void *have; + + while (!napi_thread_wait(napi)) { + for (;;) { + bool repoll = false; + + local_bh_disable(); + + have = netpoll_poll_lock(napi); + __napi_poll(napi, &repoll); + netpoll_poll_unlock(have); + + local_bh_enable(); + + if (!repoll) + 
break; + + cond_resched(); + } + } + return 0; +} + static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -6825,7 +7028,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) if (list_empty(&list)) { if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) - goto out; + return; break; } @@ -6852,8 +7055,6 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); -out: - __kfree_skb_flush(); } struct netdev_adjacent { @@ -6919,7 +7120,7 @@ bool netdev_has_upper_dev(struct net_device *dev, EXPORT_SYMBOL(netdev_has_upper_dev); /** - * netdev_has_upper_dev_all - Check if device is linked to an upper device + * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * @@ -8065,6 +8266,39 @@ struct net_device *netdev_get_xmit_slave(struct net_device *dev, } EXPORT_SYMBOL(netdev_get_xmit_slave); +static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, + struct sock *sk) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_sk_get_lower_dev) + return NULL; + return ops->ndo_sk_get_lower_dev(dev, sk); +} + +/** + * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket + * @dev: device + * @sk: the socket + * + * %NULL is returned if no lower device is found. 
+ */ + +struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, + struct sock *sk) +{ + struct net_device *lower; + + lower = netdev_sk_get_lower_dev(dev, sk); + while (lower) { + dev = lower; + lower = netdev_sk_get_lower_dev(dev, sk); + } + + return dev; +} +EXPORT_SYMBOL(netdev_sk_get_lowest_dev); + static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; @@ -8157,7 +8391,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private); /** - * netdev_lower_change - Dispatch event about lower device state change + * netdev_lower_state_changed - Dispatch event about lower device state change * @lower_dev: device * @lower_state_info: state to dispatch * @@ -8687,6 +8921,48 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, } EXPORT_SYMBOL(dev_set_mac_address); +static DECLARE_RWSEM(dev_addr_sem); + +int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, + struct netlink_ext_ack *extack) +{ + int ret; + + down_write(&dev_addr_sem); + ret = dev_set_mac_address(dev, sa, extack); + up_write(&dev_addr_sem); + return ret; +} +EXPORT_SYMBOL(dev_set_mac_address_user); + +int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) +{ + size_t size = sizeof(sa->sa_data); + struct net_device *dev; + int ret = 0; + + down_read(&dev_addr_sem); + rcu_read_lock(); + + dev = dev_get_by_name_rcu(net, dev_name); + if (!dev) { + ret = -ENODEV; + goto unlock; + } + if (!dev->addr_len) + memset(sa->sa_data, 0, size); + else + memcpy(sa->sa_data, dev->dev_addr, + min_t(size_t, size, dev->addr_len)); + sa->sa_family = dev->type; + +unlock: + rcu_read_unlock(); + up_read(&dev_addr_sem); + return ret; +} +EXPORT_SYMBOL(dev_get_mac_address); + /** * dev_change_carrier - Change device carrier * @dev: device @@ -8902,7 +9178,7 @@ static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) return dev->netdev_ops->ndo_bpf; default: return NULL; - }; + } } static struct bpf_xdp_link 
*dev_xdp_link(struct net_device *dev, @@ -8921,6 +9197,17 @@ static struct bpf_prog *dev_xdp_prog(struct net_device *dev, return dev->xdp_state[mode].prog; } +static u8 dev_xdp_prog_count(struct net_device *dev) +{ + u8 count = 0; + int i; + + for (i = 0; i < __MAX_XDP_MODE; i++) + if (dev->xdp_state[i].prog || dev->xdp_state[i].link) + count++; + return count; +} + u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_prog *prog = dev_xdp_prog(dev, mode); @@ -9011,6 +9298,7 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack struct bpf_xdp_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog, u32 flags) { + unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); struct bpf_prog *cur_prog; enum bpf_xdp_mode mode; bpf_op_t bpf_op; @@ -9026,11 +9314,17 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); return -EINVAL; } - /* just one XDP mode bit should be set, zero defaults to SKB mode */ - if (hweight32(flags & XDP_FLAGS_MODES) > 1) { + /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ + if (num_modes > 1) { NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); return -EINVAL; } + /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ + if (!num_modes && dev_xdp_prog_count(dev) > 1) { + NL_SET_ERR_MSG(extack, + "More than one program loaded, unset mode is ambiguous"); + return -EINVAL; + } /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); @@ -9368,106 +9662,6 @@ static void net_set_todo(struct net_device *dev) dev_net(dev)->dev_unreg_count++; } -static void rollback_registered_many(struct list_head *head) -{ - struct net_device *dev, *tmp; - LIST_HEAD(close_head); - - BUG_ON(dev_boot_phase); - ASSERT_RTNL(); - - 
list_for_each_entry_safe(dev, tmp, head, unreg_list) { - /* Some devices call without registering - * for initialization unwind. Remove those - * devices and proceed with the remaining. - */ - if (dev->reg_state == NETREG_UNINITIALIZED) { - pr_debug("unregister_netdevice: device %s/%p never was registered\n", - dev->name, dev); - - WARN_ON(1); - list_del(&dev->unreg_list); - continue; - } - dev->dismantle = true; - BUG_ON(dev->reg_state != NETREG_REGISTERED); - } - - /* If device is running, close it first. */ - list_for_each_entry(dev, head, unreg_list) - list_add_tail(&dev->close_list, &close_head); - dev_close_many(&close_head, true); - - list_for_each_entry(dev, head, unreg_list) { - /* And unlink it from device chain. */ - unlist_netdevice(dev); - - dev->reg_state = NETREG_UNREGISTERING; - } - flush_all_backlogs(); - - synchronize_net(); - - list_for_each_entry(dev, head, unreg_list) { - struct sk_buff *skb = NULL; - - /* Shutdown queueing discipline. */ - dev_shutdown(dev); - - dev_xdp_uninstall(dev); - - /* Notify protocols, that we are about to destroy - * this device. They should clean all the things. - */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL, NULL, 0); - - /* - * Flush the unicast and multicast chains - */ - dev_uc_flush(dev); - dev_mc_flush(dev); - - netdev_name_node_alt_flush(dev); - netdev_name_node_free(dev->name_node); - - if (dev->netdev_ops->ndo_uninit) - dev->netdev_ops->ndo_uninit(dev); - - if (skb) - rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); - - /* Notifier chain MUST detach us all upper devices. 
*/ - WARN_ON(netdev_has_any_upper_dev(dev)); - WARN_ON(netdev_has_any_lower_dev(dev)); - - /* Remove entries from kobject tree */ - netdev_unregister_kobject(dev); -#ifdef CONFIG_XPS - /* Remove XPS queueing entries */ - netif_reset_xps_queues_gt(dev, 0); -#endif - } - - synchronize_net(); - - list_for_each_entry(dev, head, unreg_list) - dev_put(dev); -} - -static void rollback_registered(struct net_device *dev) -{ - LIST_HEAD(single); - - list_add(&dev->unreg_list, &single); - rollback_registered_many(&single); - list_del(&single); -} - static netdev_features_t netdev_sync_upper_features(struct net_device *lower, struct net_device *upper, netdev_features_t features) { @@ -9588,6 +9782,22 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } + if (features & NETIF_F_HW_TLS_TX) { + bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == + (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); + bool hw_csum = features & NETIF_F_HW_CSUM; + + if (!ip_csum && !hw_csum) { + netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_TX; + } + } + + if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { + netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_RX; + } + return features; } @@ -9763,7 +9973,7 @@ static int netif_alloc_rx_queues(struct net_device *dev) rx[i].dev = dev; /* XDP RX-queue setup */ - err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); if (err < 0) goto err_rxq_info; } @@ -9924,7 +10134,7 @@ int register_netdevice(struct net_device *dev) dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); dev->features |= NETIF_F_SOFT_FEATURES; - if (dev->netdev_ops->ndo_udp_tunnel_add) { + if (dev->udp_tunnel_nic_info) { dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; } @@ -9999,17 +10209,10 @@ int 
register_netdevice(struct net_device *dev) ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ret = notifier_to_errno(ret); if (ret) { - rollback_registered(dev); - rcu_barrier(); - - dev->reg_state = NETREG_UNREGISTERED; - /* We should put the kobject that hold in - * netdev_unregister_kobject(), otherwise - * the net device cannot be freed when - * driver calls free_netdev(), because the - * kobject is being hold. - */ - kobject_put(&dev->dev.kobj); + /* Expect explicit free_netdev() on failure */ + dev->needs_free_netdev = false; + unregister_netdevice_queue(dev, NULL); + goto out; } /* * Prevent userspace races by waiting until the network @@ -10366,6 +10569,21 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, } EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); +/** + * dev_get_tstats64 - ndo_get_stats64 implementation + * @dev: device to get statistics from + * @s: place to store stats + * + * Populate @s from dev->stats and dev->tstats. Can be used as + * ndo_get_stats64() callback. + */ +void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) +{ + netdev_stats_to_stats64(s, &dev->stats); + dev_fetch_sw_netstats(s, dev->tstats); +} +EXPORT_SYMBOL_GPL(dev_get_tstats64); + struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); @@ -10538,6 +10756,17 @@ void free_netdev(struct net_device *dev) struct napi_struct *p, *n; might_sleep(); + + /* When called immediately after register_netdevice() failed the unwind + * handling may still be dismantling the device. Handle that case by + * deferring the free. 
+ */ + if (dev->reg_state == NETREG_UNREGISTERING) { + ASSERT_RTNL(); + dev->needs_free_netdev = true; + return; + } + netif_free_tx_queues(dev); netif_free_rx_queues(dev); @@ -10604,9 +10833,10 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) if (head) { list_move_tail(&dev->unreg_list, head); } else { - rollback_registered(dev); - /* Finish processing unregister after unlock */ - net_set_todo(dev); + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + unregister_netdevice_many(&single); } } EXPORT_SYMBOL(unregister_netdevice_queue); @@ -10620,14 +10850,100 @@ EXPORT_SYMBOL(unregister_netdevice_queue); */ void unregister_netdevice_many(struct list_head *head) { - struct net_device *dev; + struct net_device *dev, *tmp; + LIST_HEAD(close_head); + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + if (list_empty(head)) + return; + + list_for_each_entry_safe(dev, tmp, head, unreg_list) { + /* Some devices call without registering + * for initialization unwind. Remove those + * devices and proceed with the remaining. + */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + pr_debug("unregister_netdevice: device %s/%p never was registered\n", + dev->name, dev); + + WARN_ON(1); + list_del(&dev->unreg_list); + continue; + } + dev->dismantle = true; + BUG_ON(dev->reg_state != NETREG_REGISTERED); + } + + /* If device is running, close it first. */ + list_for_each_entry(dev, head, unreg_list) + list_add_tail(&dev->close_list, &close_head); + dev_close_many(&close_head, true); + + list_for_each_entry(dev, head, unreg_list) { + /* And unlink it from device chain. */ + unlist_netdevice(dev); + + dev->reg_state = NETREG_UNREGISTERING; + } + flush_all_backlogs(); + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) { + struct sk_buff *skb = NULL; + + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + dev_xdp_uninstall(dev); + + /* Notify protocols, that we are about to destroy + * this device. 
They should clean all the things. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, + GFP_KERNEL, NULL, 0); + + /* + * Flush the unicast and multicast chains + */ + dev_uc_flush(dev); + dev_mc_flush(dev); + + netdev_name_node_alt_flush(dev); + netdev_name_node_free(dev->name_node); + + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); + + if (skb) + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); + + /* Notifier chain MUST detach us all upper devices. */ + WARN_ON(netdev_has_any_upper_dev(dev)); + WARN_ON(netdev_has_any_lower_dev(dev)); - if (!list_empty(head)) { - rollback_registered_many(head); - list_for_each_entry(dev, head, unreg_list) - net_set_todo(dev); - list_del(head); + /* Remove entries from kobject tree */ + netdev_unregister_kobject(dev); +#ifdef CONFIG_XPS + /* Remove XPS queueing entries */ + netif_reset_xps_queues_gt(dev, 0); +#endif + } + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) { + dev_put(dev); + net_set_todo(dev); } + + list_del(head); } EXPORT_SYMBOL(unregister_netdevice_many); @@ -11165,8 +11481,7 @@ static int __init net_dev_init(void) INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS - sd->csd.func = rps_trigger_softirq; - sd->csd.info = sd; + INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 205e92e604ef..478d032f34ac 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -123,17 +123,6 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm ifr->ifr_mtu = dev->mtu; return 0; - case SIOCGIFHWADDR: - if (!dev->addr_len) - memset(ifr->ifr_hwaddr.sa_data, 0, - sizeof(ifr->ifr_hwaddr.sa_data)); - else - memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, - min(sizeof(ifr->ifr_hwaddr.sa_data), - 
(size_t)dev->addr_len)); - ifr->ifr_hwaddr.sa_family = dev->type; - return 0; - case SIOCGIFSLAVE: err = -EINVAL; break; @@ -230,7 +219,7 @@ static int dev_do_ioctl(struct net_device *dev, struct ifreq *ifr, unsigned int cmd) { const struct net_device_ops *ops = dev->netdev_ops; - int err = -EOPNOTSUPP; + int err; err = dsa_ndo_do_ioctl(dev, ifr, cmd); if (err == 0 || err != -EOPNOTSUPP) @@ -274,7 +263,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) case SIOCSIFHWADDR: if (dev->addr_len > sizeof(struct sockaddr)) return -EINVAL; - return dev_set_mac_address(dev, &ifr->ifr_hwaddr, NULL); + return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) @@ -418,6 +407,12 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c */ switch (cmd) { + case SIOCGIFHWADDR: + dev_load(net, ifr->ifr_name); + ret = dev_get_mac_address(&ifr->ifr_hwaddr, net, ifr->ifr_name); + if (colon) + *colon = ':'; + return ret; /* * These ioctl calls: * - can be done by all. 
@@ -427,7 +422,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c case SIOCGIFFLAGS: case SIOCGIFMETRIC: case SIOCGIFMTU: - case SIOCGIFHWADDR: case SIOCGIFSLAVE: case SIOCGIFMAP: case SIOCGIFINDEX: diff --git a/net/core/devlink.c b/net/core/devlink.c index a932d95be798..737b61c2976e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -87,6 +87,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, + [DEVLINK_PORT_FN_ATTR_STATE] = + NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE, + DEVLINK_PORT_FN_STATE_ACTIVE), }; static LIST_HEAD(devlink_list); @@ -517,7 +520,7 @@ devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_l return test_bit(limit, &devlink->ops->reload_limits); } -static int devlink_reload_stat_put(struct sk_buff *msg, enum devlink_reload_action action, +static int devlink_reload_stat_put(struct sk_buff *msg, enum devlink_reload_limit limit, u32 value) { struct nlattr *reload_stats_entry; @@ -526,8 +529,7 @@ static int devlink_reload_stat_put(struct sk_buff *msg, enum devlink_reload_acti if (!reload_stats_entry) return -EMSGSIZE; - if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, action) || - nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) || + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) || nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value)) goto nla_put_failure; nla_nest_end(msg, reload_stats_entry); @@ -540,7 +542,7 @@ nla_put_failure: static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote) { - struct nlattr *reload_stats_attr; + struct nlattr *reload_stats_attr, *act_info, *act_stats; int i, j, stat_idx; u32 value; @@ -552,17 +554,29 @@ static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink if (!reload_stats_attr) 
return -EMSGSIZE; - for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { - /* Remote stats are shown even if not locally supported. Stats - * of actions with unspecified limit are shown though drivers - * don't need to register unspecified limit. - */ - if (!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC && - !devlink_reload_limit_is_supported(devlink, j)) + for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) { + if ((!is_remote && + !devlink_reload_action_is_supported(devlink, i)) || + i == DEVLINK_RELOAD_ACTION_UNSPEC) continue; - for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) { - if ((!is_remote && !devlink_reload_action_is_supported(devlink, i)) || - i == DEVLINK_RELOAD_ACTION_UNSPEC || + act_info = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_INFO); + if (!act_info) + goto nla_put_failure; + + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, i)) + goto action_info_nest_cancel; + act_stats = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_STATS); + if (!act_stats) + goto action_info_nest_cancel; + + for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { + /* Remote stats are shown even if not locally supported. + * Stats of actions with unspecified limit are shown + * though drivers don't need to register unspecified + * limit. 
+ */ + if ((!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC && + !devlink_reload_limit_is_supported(devlink, j)) || devlink_reload_combination_is_invalid(i, j)) continue; @@ -571,13 +585,19 @@ static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink value = devlink->stats.reload_stats[stat_idx]; else value = devlink->stats.remote_reload_stats[stat_idx]; - if (devlink_reload_stat_put(msg, i, j, value)) - goto nla_put_failure; + if (devlink_reload_stat_put(msg, j, value)) + goto action_stats_nest_cancel; } + nla_nest_end(msg, act_stats); + nla_nest_end(msg, act_info); } nla_nest_end(msg, reload_stats_attr); return 0; +action_stats_nest_cancel: + nla_nest_cancel(msg, act_stats); +action_info_nest_cancel: + nla_nest_cancel(msg, act_info); nla_put_failure: nla_nest_cancel(msg, reload_stats_attr); return -EMSGSIZE; @@ -673,6 +693,15 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) return -EMSGSIZE; break; + case DEVLINK_PORT_FLAVOUR_PCI_SF: + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, + attrs->pci_sf.controller) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, + attrs->pci_sf.pf) || + nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER, + attrs->pci_sf.sf)) + return -EMSGSIZE; + break; case DEVLINK_PORT_FLAVOUR_PHYSICAL: case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: @@ -696,42 +725,105 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, } static int +devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops *ops, + struct devlink_port *port, struct sk_buff *msg, + struct netlink_ext_ack *extack, bool *msg_updated) +{ + u8 hw_addr[MAX_ADDR_LEN]; + int hw_addr_len; + int err; + + if (!ops->port_function_hw_addr_get) + return 0; + + err = ops->port_function_hw_addr_get(devlink, port, hw_addr, &hw_addr_len, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + err = nla_put(msg, 
DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); + if (err) + return err; + *msg_updated = true; + return 0; +} + +static bool +devlink_port_fn_state_valid(enum devlink_port_fn_state state) +{ + return state == DEVLINK_PORT_FN_STATE_INACTIVE || + state == DEVLINK_PORT_FN_STATE_ACTIVE; +} + +static bool +devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate) +{ + return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED || + opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED; +} + +static int +devlink_port_fn_state_fill(struct devlink *devlink, + const struct devlink_ops *ops, + struct devlink_port *port, struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) +{ + enum devlink_port_fn_opstate opstate; + enum devlink_port_fn_state state; + int err; + + if (!ops->port_fn_state_get) + return 0; + + err = ops->port_fn_state_get(devlink, port, &state, &opstate, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + if (!devlink_port_fn_state_valid(state)) { + WARN_ON_ONCE(1); + NL_SET_ERR_MSG_MOD(extack, "Invalid state read from driver"); + return -EINVAL; + } + if (!devlink_port_fn_opstate_valid(opstate)) { + WARN_ON_ONCE(1); + NL_SET_ERR_MSG_MOD(extack, + "Invalid operational state read from driver"); + return -EINVAL; + } + if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) || + nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate)) + return -EMSGSIZE; + *msg_updated = true; + return 0; +} + +static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) { struct devlink *devlink = port->devlink; const struct devlink_ops *ops; struct nlattr *function_attr; - bool empty_nest = true; - int err = 0; + bool msg_updated = false; + int err; function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION); if (!function_attr) return -EMSGSIZE; ops = devlink->ops; - if (ops->port_function_hw_addr_get) { - int hw_addr_len; - u8 hw_addr[MAX_ADDR_LEN]; 
- - err = ops->port_function_hw_addr_get(devlink, port, hw_addr, &hw_addr_len, extack); - if (err == -EOPNOTSUPP) { - /* Port function attributes are optional for a port. If port doesn't - * support function attribute, returning -EOPNOTSUPP is not an error. - */ - err = 0; - goto out; - } else if (err) { - goto out; - } - err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); - if (err) - goto out; - empty_nest = false; - } - + err = devlink_port_fn_hw_addr_fill(devlink, ops, port, msg, + extack, &msg_updated); + if (err) + goto out; + err = devlink_port_fn_state_fill(devlink, ops, port, msg, extack, + &msg_updated); out: - if (err || empty_nest) + if (err || !msg_updated) nla_nest_cancel(msg, function_attr); else nla_nest_end(msg, function_attr); @@ -755,6 +847,8 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) goto nla_put_failure; + /* Hold rtnl lock while accessing port's netdev attributes. 
*/ + rtnl_lock(); spin_lock_bh(&devlink_port->type_lock); if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) goto nla_put_failure_type_locked; @@ -763,9 +857,10 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, devlink_port->desired_type)) goto nla_put_failure_type_locked; if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { + struct net *net = devlink_net(devlink_port->devlink); struct net_device *netdev = devlink_port->type_dev; - if (netdev && + if (netdev && net_eq(net, dev_net(netdev)) && (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, netdev->ifindex) || nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, @@ -781,6 +876,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, goto nla_put_failure_type_locked; } spin_unlock_bh(&devlink_port->type_lock); + rtnl_unlock(); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) @@ -791,6 +887,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, nla_put_failure_type_locked: spin_unlock_bh(&devlink_port->type_lock); + rtnl_unlock(); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; @@ -964,7 +1061,6 @@ devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port * const struct devlink_ops *ops; const u8 *hw_addr; int hw_addr_len; - int err; hw_addr = nla_data(attr); hw_addr_len = nla_len(attr); @@ -989,12 +1085,25 @@ devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port * return -EOPNOTSUPP; } - err = ops->port_function_hw_addr_set(devlink, port, hw_addr, hw_addr_len, extack); - if (err) - return err; + return ops->port_function_hw_addr_set(devlink, port, hw_addr, hw_addr_len, extack); +} - devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); - return 0; +static int devlink_port_fn_state_set(struct devlink *devlink, + struct devlink_port *port, + const struct nlattr *attr, + struct 
netlink_ext_ack *extack) +{ + enum devlink_port_fn_state state; + const struct devlink_ops *ops; + + state = nla_get_u8(attr); + ops = devlink->ops; + if (!ops->port_fn_state_set) { + NL_SET_ERR_MSG_MOD(extack, + "Function does not support state setting"); + return -EOPNOTSUPP; + } + return ops->port_fn_state_set(devlink, port, state, extack); } static int @@ -1012,9 +1121,21 @@ devlink_port_function_set(struct devlink *devlink, struct devlink_port *port, } attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; - if (attr) + if (attr) { err = devlink_port_function_hw_addr_set(devlink, port, attr, extack); + if (err) + return err; + } + /* Keep this as the last function attribute set, so that when + * multiple port function attributes are set along with state, + * Those can be applied first before activating the state. + */ + attr = tb[DEVLINK_PORT_FN_ATTR_STATE]; + if (attr) + err = devlink_port_fn_state_set(devlink, port, attr, extack); + if (!err) + devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); return err; } @@ -1114,6 +1235,111 @@ static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, return devlink_port_unsplit(devlink, port_index, info->extack); } +static int devlink_port_new_notifiy(struct devlink *devlink, + unsigned int port_index, + struct genl_info *info) +{ + struct devlink_port *devlink_port; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&devlink->lock); + devlink_port = devlink_port_get_by_index(devlink, port_index); + if (!devlink_port) { + err = -ENODEV; + goto out; + } + + err = devlink_nl_port_fill(msg, devlink, devlink_port, + DEVLINK_CMD_NEW, info->snd_portid, + info->snd_seq, 0, NULL); + if (err) + goto out; + + err = genlmsg_reply(msg, info); + mutex_unlock(&devlink->lock); + return err; + +out: + mutex_unlock(&devlink->lock); + nlmsg_free(msg); + return err; +} + +static int devlink_nl_cmd_port_new_doit(struct sk_buff *skb, + struct genl_info *info) +{ + 
struct netlink_ext_ack *extack = info->extack; + struct devlink_port_new_attrs new_attrs = {}; + struct devlink *devlink = info->user_ptr[0]; + unsigned int new_port_index; + int err; + + if (!devlink->ops->port_new || !devlink->ops->port_del) + return -EOPNOTSUPP; + + if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] || + !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) { + NL_SET_ERR_MSG_MOD(extack, "Port flavour or PCI PF are not specified"); + return -EINVAL; + } + new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]); + new_attrs.pfnum = + nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]); + + if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { + /* Port index of the new port being created by driver. */ + new_attrs.port_index = + nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + new_attrs.port_index_valid = true; + } + if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) { + new_attrs.controller = + nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]); + new_attrs.controller_valid = true; + } + if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF && + info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) { + new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]); + new_attrs.sfnum_valid = true; + } + + err = devlink->ops->port_new(devlink, &new_attrs, extack, + &new_port_index); + if (err) + return err; + + err = devlink_port_new_notifiy(devlink, new_port_index, info); + if (err && err != -ENODEV) { + /* Fail to send the response; destroy newly created port. 
*/ + devlink->ops->port_del(devlink, new_port_index, extack); + } + return err; +} + +static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + unsigned int port_index; + + if (!devlink->ops->port_del) + return -EOPNOTSUPP; + + if (!info->attrs[DEVLINK_ATTR_PORT_INDEX]) { + NL_SET_ERR_MSG_MOD(extack, "Port index is not specified"); + return -EINVAL; + } + port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + + return devlink->ops->port_del(devlink, port_index, extack); +} + static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, enum devlink_command cmd, u32 portid, @@ -1448,7 +1674,7 @@ static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg, err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index, pool_index, &cur, &max); if (err && err != -EOPNOTSUPP) - return err; + goto sb_occ_get_failure; if (!err) { if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur)) goto nla_put_failure; @@ -1461,8 +1687,10 @@ static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg, return 0; nla_put_failure: + err = -EMSGSIZE; +sb_occ_get_failure: genlmsg_cancel(msg, hdr); - return -EMSGSIZE; + return err; } static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb, @@ -3370,7 +3598,7 @@ out_free_msg: nlmsg_free(msg); } -void devlink_flash_update_begin_notify(struct devlink *devlink) +static void devlink_flash_update_begin_notify(struct devlink *devlink) { struct devlink_flash_notify params = { 0 }; @@ -3378,9 +3606,8 @@ void devlink_flash_update_begin_notify(struct devlink *devlink) DEVLINK_CMD_FLASH_UPDATE, ¶ms); } -EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify); -void devlink_flash_update_end_notify(struct devlink *devlink) +static void devlink_flash_update_end_notify(struct devlink *devlink) { struct devlink_flash_notify params = { 0 }; @@ -3388,7 +3615,6 @@ void 
devlink_flash_update_end_notify(struct devlink *devlink) DEVLINK_CMD_FLASH_UPDATE_END, ¶ms); } -EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify); void devlink_flash_update_status_notify(struct devlink *devlink, const char *status_msg, @@ -3429,10 +3655,12 @@ EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify); static int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *nla_component, *nla_overwrite_mask; + struct nlattr *nla_component, *nla_overwrite_mask, *nla_file_name; struct devlink_flash_update_params params = {}; struct devlink *devlink = info->user_ptr[0]; + const char *file_name; u32 supported_params; + int ret; if (!devlink->ops->flash_update) return -EOPNOTSUPP; @@ -3442,8 +3670,6 @@ static int devlink_nl_cmd_flash_update(struct sk_buff *skb, supported_params = devlink->ops->supported_flash_update_params; - params.file_name = nla_data(info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]); - nla_component = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT]; if (nla_component) { if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT)) { @@ -3467,7 +3693,21 @@ static int devlink_nl_cmd_flash_update(struct sk_buff *skb, params.overwrite_mask = sections.value & sections.selector; } - return devlink->ops->flash_update(devlink, ¶ms, info->extack); + nla_file_name = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]; + file_name = nla_data(nla_file_name); + ret = request_firmware(¶ms.fw, file_name, devlink->dev); + if (ret) { + NL_SET_ERR_MSG_ATTR(info->extack, nla_file_name, "failed to locate the requested firmware file"); + return ret; + } + + devlink_flash_update_begin_notify(devlink); + ret = devlink->ops->flash_update(devlink, ¶ms, info->extack); + devlink_flash_update_end_notify(devlink); + + release_firmware(params.fw); + + return ret; } static const struct devlink_param devlink_param_generic[] = { @@ -4110,7 +4350,7 @@ out: static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, struct genl_info 
*info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink_param_item *param_item; struct sk_buff *msg; int err; @@ -4139,7 +4379,7 @@ static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; return __devlink_nl_cmd_param_set_doit(devlink_port->devlink, devlink_port->index, @@ -6957,7 +7197,6 @@ static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb, struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_item *trap_item; - int err; if (list_empty(&devlink->trap_list)) return -EOPNOTSUPP; @@ -6968,11 +7207,7 @@ static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb, return -ENOENT; } - err = devlink_trap_action_set(devlink, trap_item, info); - if (err) - return err; - - return 0; + return devlink_trap_action_set(devlink, trap_item, info); } static struct devlink_trap_group_item * @@ -7563,6 +7798,10 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, DEVLINK_RELOAD_ACTION_MAX), [DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(DEVLINK_RELOAD_LIMITS_VALID_MASK), + [DEVLINK_ATTR_PORT_FLAVOUR] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 }, + [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 }, }; static const struct genl_small_ops devlink_nl_ops[] = { @@ -7603,6 +7842,18 @@ static const struct genl_small_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { + .cmd = DEVLINK_CMD_PORT_NEW, + .doit = devlink_nl_cmd_port_new_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = 
DEVLINK_NL_FLAG_NO_LOCK, + }, + { + .cmd = DEVLINK_CMD_PORT_DEL, + .doit = devlink_nl_cmd_port_del_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, + }, + { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_get_doit, @@ -8254,8 +8505,6 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port, { struct devlink_port_attrs *attrs = &devlink_port->attrs; - if (WARN_ON(devlink_port->registered)) - return -EEXIST; devlink_port->attrs_set = true; attrs->flavour = flavour; if (attrs->switch_id.id_len) { @@ -8279,6 +8528,8 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, { int ret; + if (WARN_ON(devlink_port->registered)) + return; devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) @@ -8301,6 +8552,8 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; + if (WARN_ON(devlink_port->registered)) + return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) @@ -8326,6 +8579,8 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; + if (WARN_ON(devlink_port->registered)) + return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) @@ -8337,6 +8592,32 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); +/** + * devlink_port_attrs_pci_sf_set - Set PCI SF port attributes + * + * @devlink_port: devlink port + * @controller: associated controller number for the devlink port instance + * @pf: associated PF for the devlink port instance + * @sf: associated SF of a PF for the devlink port instance + */ +void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, 
u32 controller, + u16 pf, u32 sf) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + int ret; + + if (WARN_ON(devlink_port->registered)) + return; + ret = __devlink_port_attrs_set(devlink_port, + DEVLINK_PORT_FLAVOUR_PCI_SF); + if (ret) + return; + attrs->pci_sf.controller = controller; + attrs->pci_sf.pf = pf; + attrs->pci_sf.sf = sf; +} +EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); + static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { @@ -8385,6 +8666,10 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, n = snprintf(name, len, "pf%uvf%u", attrs->pci_vf.pf, attrs->pci_vf.vf); break; + case DEVLINK_PORT_FLAVOUR_PCI_SF: + n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, + attrs->pci_sf.sf); + break; } if (n >= len) @@ -8582,6 +8867,10 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister); * @resource_id: resource's id * @parent_resource_id: resource's parent id * @size_params: size parameters + * + * Generic resources should reuse the same names across drivers. 
+ * Please see the generic resources list at: + * Documentation/networking/devlink/devlink-resource.rst */ int devlink_resource_register(struct devlink *devlink, const char *resource_name, @@ -9472,6 +9761,8 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(DCCP_PARSING, DROP), DEVLINK_TRAP(GTP_PARSING, DROP), DEVLINK_TRAP(ESP_PARSING, DROP), + DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP), + DEVLINK_TRAP(DMAC_FILTER, DROP), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -10221,12 +10512,18 @@ int devlink_compat_flash_update(struct net_device *dev, const char *file_name) goto out; } - params.file_name = file_name; + ret = request_firmware(¶ms.fw, file_name, devlink->dev); + if (ret) + goto out; mutex_lock(&devlink->lock); + devlink_flash_update_begin_notify(devlink); ret = devlink->ops->flash_update(devlink, ¶ms, NULL); + devlink_flash_update_end_notify(devlink); mutex_unlock(&devlink->lock); + release_firmware(params.fw); + out: rtnl_lock(); dev_put(dev); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 7bcfb16854cb..cd80ffed6d26 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -563,7 +563,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev; nlrule->iifindex = -1; - nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); + nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->iifname); if (dev) nlrule->iifindex = dev->ifindex; @@ -573,7 +573,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev; nlrule->oifindex = -1; - nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); + nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->oifname); if (dev) nlrule->oifindex = dev->ifindex; diff --git a/net/core/filter.c b/net/core/filter.c index 2ca5eecebacf..adfdad234674 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2083,13 +2083,13 @@ static const struct 
bpf_func_proto bpf_csum_level_proto = { static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { - return dev_forward_skb(dev, skb); + return dev_forward_skb_nomtu(dev, skb); } static inline int __bpf_rx_skb_no_mac(struct net_device *dev, struct sk_buff *skb) { - int ret = ____dev_forward_skb(dev, skb); + int ret = ____dev_forward_skb(dev, skb, false); if (likely(!ret)) { skb->dev = dev; @@ -2480,7 +2480,7 @@ int skb_do_redirect(struct sk_buff *skb) goto out_drop; dev = ops->ndo_get_peer_dev(dev); if (unlikely(!dev || - !is_skb_forwardable(dev, skb) || + !(dev->flags & IFF_UP) || net_eq(net, dev_net(dev)))) goto out_drop; skb->dev = dev; @@ -3552,11 +3552,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, return 0; } -static u32 __bpf_skb_max_len(const struct sk_buff *skb) -{ - return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len : - SKB_MAX_ALLOC; -} +#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) @@ -3605,7 +3601,7 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, { u32 len_cur, len_diff_abs = abs(len_diff); u32 len_min = bpf_skb_net_base_len(skb); - u32 len_max = __bpf_skb_max_len(skb); + u32 len_max = BPF_SKB_MAX_LEN; __be16 proto = skb->protocol; bool shrink = len_diff < 0; u32 off; @@ -3688,7 +3684,7 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, u64 flags) { - u32 max_len = __bpf_skb_max_len(skb); + u32 max_len = BPF_SKB_MAX_LEN; u32 min_len = __bpf_skb_min_len(skb); int ret; @@ -3764,7 +3760,7 @@ static const struct bpf_func_proto sk_skb_change_tail_proto = { static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, u64 flags) { - u32 max_len = __bpf_skb_max_len(skb); + u32 max_len = BPF_SKB_MAX_LEN; u32 new_len = skb->len + head_room; int ret; @@ -4631,6 +4627,18 @@ static const 
struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk) +{ + return sk ? sock_gen_cookie(sk) : 0; +} + +const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { + .func = bpf_get_socket_ptr_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +}; + BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __sock_gen_cookie(ctx->sk); @@ -4645,11 +4653,9 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { static u64 __bpf_get_netns_cookie(struct sock *sk) { -#ifdef CONFIG_NET_NS - return __net_gen_cookie(sk ? sk->sk_net.net : &init_net); -#else - return 0; -#endif + const struct net *net = sk ? sock_net(sk) : &init_net; + + return net->net_cookie; } BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) @@ -4770,6 +4776,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, ifindex = dev->ifindex; dev_put(dev); } + fallthrough; + case SO_BINDTOIFINDEX: + if (optname == SO_BINDTOIFINDEX) + ifindex = val; ret = sock_bindtoindex(sk, ifindex, false); break; case SO_KEEPALIVE: @@ -4910,6 +4920,9 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, tp->notsent_lowat = val; sk->sk_write_space(sk); break; + case TCP_WINDOW_CLAMP: + ret = tcp_set_window_clamp(sk, val); + break; default: ret = -EINVAL; } @@ -4929,8 +4942,25 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, sock_owned_by_me(sk); + if (level == SOL_SOCKET) { + if (optlen != sizeof(int)) + goto err_clear; + + switch (optname) { + case SO_MARK: + *((int *)optval) = sk->sk_mark; + break; + case SO_PRIORITY: + *((int *)optval) = sk->sk_priority; + break; + case SO_BINDTOIFINDEX: + *((int *)optval) = sk->sk_bound_dev_if; + break; + default: + goto err_clear; + } #ifdef CONFIG_INET - if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { + } 
else if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { struct inet_connection_sock *icsk; struct tcp_sock *tp; @@ -4984,11 +5014,11 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, goto err_clear; } #endif +#endif } else { goto err_clear; } return 0; -#endif err_clear: memset(optval, 0, optlen); return -EINVAL; @@ -5269,12 +5299,14 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, const struct neighbour *neigh, - const struct net_device *dev) + const struct net_device *dev, u32 mtu) { memcpy(params->dmac, neigh->ha, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; + if (mtu) + params->mtu_result = mtu; /* union with tot_len */ return 0; } @@ -5290,8 +5322,8 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct net_device *dev; struct fib_result res; struct flowi4 fl4; + u32 mtu = 0; int err; - u32 mtu; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) @@ -5358,8 +5390,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); - if (params->tot_len > mtu) + if (params->tot_len > mtu) { + params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; + } } nhc = res.nhc; @@ -5393,7 +5427,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (!neigh) return BPF_FIB_LKUP_RET_NO_NEIGH; - return bpf_fib_set_fwd_params(params, neigh, dev); + return bpf_fib_set_fwd_params(params, neigh, dev, mtu); } #endif @@ -5410,7 +5444,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct flowi6 fl6; int strict = 0; int oif, err; - u32 mtu; + u32 mtu = 0; /* link local addresses are never forwarded */ if 
(rt6_need_strict(dst) || rt6_need_strict(src)) @@ -5485,8 +5519,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src); - if (params->tot_len > mtu) + if (params->tot_len > mtu) { + params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; + } } if (res.nh->fib_nh_lws) @@ -5506,7 +5542,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (!neigh) return BPF_FIB_LKUP_RET_NO_NEIGH; - return bpf_fib_set_fwd_params(params, neigh, dev); + return bpf_fib_set_fwd_params(params, neigh, dev, mtu); } #endif @@ -5549,6 +5585,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, { struct net *net = dev_net(skb->dev); int rc = -EAFNOSUPPORT; + bool check_mtu = false; if (plen < sizeof(*params)) return -EINVAL; @@ -5556,25 +5593,33 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) return -EINVAL; + if (params->tot_len) + check_mtu = true; + switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: - rc = bpf_ipv4_fib_lookup(net, params, flags, false); + rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu); break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - rc = bpf_ipv6_fib_lookup(net, params, flags, false); + rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu); break; #endif } - if (!rc) { + if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) { struct net_device *dev; + /* When tot_len isn't provided by user, check skb + * against MTU of FIB lookup resulting net_device + */ dev = dev_get_by_index_rcu(net, params->ifindex); if (!is_skb_forwardable(dev, skb)) rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; + + params->mtu_result = dev->mtu; /* union with tot_len */ } return rc; @@ -5590,6 +5635,116 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .arg4_type = ARG_ANYTHING, }; +static struct net_device 
*__dev_via_ifindex(struct net_device *dev_curr, + u32 ifindex) +{ + struct net *netns = dev_net(dev_curr); + + /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */ + if (ifindex == 0) + return dev_curr; + + return dev_get_by_index_rcu(netns, ifindex); +} + +BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, + u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) +{ + int ret = BPF_MTU_CHK_RET_FRAG_NEEDED; + struct net_device *dev = skb->dev; + int skb_len, dev_len; + int mtu; + + if (unlikely(flags & ~(BPF_MTU_CHK_SEGS))) + return -EINVAL; + + if (unlikely(flags & BPF_MTU_CHK_SEGS && len_diff)) + return -EINVAL; + + dev = __dev_via_ifindex(dev, ifindex); + if (unlikely(!dev)) + return -ENODEV; + + mtu = READ_ONCE(dev->mtu); + + dev_len = mtu + dev->hard_header_len; + skb_len = skb->len + len_diff; /* minus result pass check */ + if (skb_len <= dev_len) { + ret = BPF_MTU_CHK_RET_SUCCESS; + goto out; + } + /* At this point, skb->len exceed MTU, but as it include length of all + * segments, it can still be below MTU. The SKB can possibly get + * re-segmented in transmit path (see validate_xmit_skb). Thus, user + * must choose if segs are to be MTU checked. 
+ */ + if (skb_is_gso(skb)) { + ret = BPF_MTU_CHK_RET_SUCCESS; + + if (flags & BPF_MTU_CHK_SEGS && + !skb_gso_validate_network_len(skb, mtu)) + ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; + } +out: + /* BPF verifier guarantees valid pointer */ + *mtu_len = mtu; + + return ret; +} + +BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, + u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) +{ + struct net_device *dev = xdp->rxq->dev; + int xdp_len = xdp->data_end - xdp->data; + int ret = BPF_MTU_CHK_RET_SUCCESS; + int mtu, dev_len; + + /* XDP variant doesn't support multi-buffer segment check (yet) */ + if (unlikely(flags)) + return -EINVAL; + + dev = __dev_via_ifindex(dev, ifindex); + if (unlikely(!dev)) + return -ENODEV; + + mtu = READ_ONCE(dev->mtu); + + /* Add L2-header as dev MTU is L3 size */ + dev_len = mtu + dev->hard_header_len; + + xdp_len += len_diff; /* minus result pass check */ + if (xdp_len > dev_len) + ret = BPF_MTU_CHK_RET_FRAG_NEEDED; + + /* BPF verifier guarantees valid pointer */ + *mtu_len = mtu; + + return ret; +} + +static const struct bpf_func_proto bpf_skb_check_mtu_proto = { + .func = bpf_skb_check_mtu, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_INT, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { + .func = bpf_xdp_check_mtu, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_INT, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) { @@ -6995,16 +7150,36 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_delete_proto; case BPF_FUNC_setsockopt: switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case 
BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: return &bpf_sock_addr_setsockopt_proto; default: return NULL; } case BPF_FUNC_getsockopt: switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: return &bpf_sock_addr_getsockopt_proto; default: return NULL; @@ -7155,6 +7330,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; + case BPF_FUNC_check_mtu: + return &bpf_skb_check_mtu_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: @@ -7224,6 +7401,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; + case BPF_FUNC_check_mtu: + return &bpf_xdp_check_mtu_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_udp: return &bpf_xdp_sk_lookup_udp_proto; @@ -8788,7 +8967,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, target_size)); break; case offsetof(struct bpf_sock, rx_queue_mapping): -#ifdef CONFIG_XPS +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), si->dst_reg, si->src_reg, @@ -10406,6 +10585,24 @@ const struct bpf_func_proto 
bpf_skc_to_udp6_sock_proto = { .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], }; +BPF_CALL_1(bpf_sock_from_file, struct file *, file) +{ + return (unsigned long)sock_from_file(file); +} + +BTF_ID_LIST(bpf_sock_from_file_btf_ids) +BTF_ID(struct, socket) +BTF_ID(struct, file) + +const struct bpf_func_proto bpf_sock_from_file_proto = { + .func = bpf_sock_from_file, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &bpf_sock_from_file_btf_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], +}; + static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id) { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index e21950a2c897..2ef2224b3bff 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -23,6 +23,7 @@ #include <linux/if_ether.h> #include <linux/mpls.h> #include <linux/tcp.h> +#include <linux/ptp_classify.h> #include <net/flow_dissector.h> #include <scsi/fc/fc_fcoe.h> #include <uapi/linux/batadv_packet.h> @@ -48,7 +49,7 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, memset(flow_dissector, 0, sizeof(*flow_dissector)); for (i = 0; i < key_count; i++, key++) { - /* User should make sure that every key target offset is withing + /* User should make sure that every key target offset is within * boundaries of unsigned short. 
*/ BUG_ON(key->offset > USHRT_MAX); @@ -236,9 +237,8 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, - u16 *ctinfo_map, - size_t mapsize) + void *target_container, u16 *ctinfo_map, + size_t mapsize, bool post_ct) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct flow_dissector_key_ct *key; @@ -250,13 +250,19 @@ skb_flow_dissect_ct(const struct sk_buff *skb, return; ct = nf_ct_get(skb, &ctinfo); - if (!ct) + if (!ct && !post_ct) return; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CT, target_container); + if (!ct) { + key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_INVALID; + return; + } + if (ctinfo < mapsize) key->ct_state = ctinfo_map[ctinfo]; #if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) @@ -1050,6 +1056,9 @@ proto_again: key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } + __skb_flow_dissect_ipv4(skb, flow_dissector, + target_container, data, iph); + if (ip_is_fragment(iph)) { key_control->flags |= FLOW_DIS_IS_FRAGMENT; @@ -1066,9 +1075,6 @@ proto_again: } } - __skb_flow_dissect_ipv4(skb, flow_dissector, - target_container, data, iph); - break; } case htons(ETH_P_IPV6): { @@ -1251,6 +1257,21 @@ proto_again: &proto, &nhoff, hlen, flags); break; + case htons(ETH_P_1588): { + struct ptp_header *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, + hlen, &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + nhoff += ntohs(hdr->message_length); + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index d4474c812b64..715b67f6c62f 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -381,10 +381,8 @@ static void __flow_block_indr_cleanup(void (*release)(void *cb_priv), list_for_each_entry_safe(this, next, 
&flow_block_indr_list, indr.list) { if (this->release == release && - this->indr.cb_priv == cb_priv) { + this->indr.cb_priv == cb_priv) list_move(&this->indr.list, cleanup_list); - return; - } } } diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 80dbf2f4016e..8e582e29a41e 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -80,11 +80,11 @@ static void est_timer(struct timer_list *t) u64 rate, brate; est_fetch_counters(est, &b); - brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log); - brate -= (est->avbps >> est->ewma_log); + brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log); + brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log); - rate = (b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); - rate -= (est->avpps >> est->ewma_log); + rate = (b.packets - est->last_packets) << (10 - est->intvl_log); + rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); est->avbps += brate; @@ -143,6 +143,9 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, if (parm->interval < -2 || parm->interval > 3) return -EINVAL; + if (parm->ewma_log == 0 || parm->ewma_log >= 31) + return -EINVAL; + est = kzalloc(sizeof(*est), GFP_KERNEL); if (!est) return -ENOBUFS; diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index e095fb871d91..6eb2e5ec2c50 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -99,9 +99,14 @@ void gro_cells_destroy(struct gro_cells *gcells) struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); napi_disable(&cell->napi); - netif_napi_del(&cell->napi); + __netif_napi_del(&cell->napi); __skb_queue_purge(&cell->napi_skbs); } + /* This barrier is needed because netpoll could access dev->napi_list + * under rcu protection. 
+ */ + synchronize_net(); + free_percpu(gcells->cells); gcells->cells = NULL; } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 7d3438215f32..2f7940bcf715 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -39,12 +39,11 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, { int ret; - /* Preempt disable is needed to protect per-cpu redirect_info between - * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and - * access to maps strictly require a rcu_read_lock() for protection, - * mixing with BH RCU lock doesn't work. + /* Migration disable and BH disable are needed to protect per-cpu + * redirect_info between BPF prog and skb_do_redirect(). */ - preempt_disable(); + migrate_disable(); + local_bh_disable(); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); @@ -78,7 +77,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, break; } - preempt_enable(); + local_bh_enable(); + migrate_enable(); return ret; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8e39e28b0a8d..e2982b3970b8 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -41,7 +41,6 @@ #include <trace/events/neigh.h> -#define DEBUG #define NEIGH_DEBUG 1 #define neigh_dbg(level, fmt, ...) 
\ do { \ @@ -235,6 +234,8 @@ static int neigh_forced_gc(struct neigh_table *tbl) write_lock(&n->lock); if ((n->nud_state == NUD_FAILED) || + (tbl->is_multicast && + tbl->is_multicast(n->primary_key)) || time_after(tref, n->updated)) remove = true; write_unlock(&n->lock); @@ -1243,13 +1244,14 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, old = neigh->nud_state; err = -EPERM; - if (!(flags & NEIGH_UPDATE_F_ADMIN) && - (old & (NUD_NOARP | NUD_PERMANENT))) - goto out; if (neigh->dead) { NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); + new = old; goto out; } + if (!(flags & NEIGH_UPDATE_F_ADMIN) && + (old & (NUD_NOARP | NUD_PERMANENT))) + goto out; ext_learn_change = neigh_update_ext_learned(neigh, flags, ¬ify); @@ -1567,10 +1569,8 @@ static void neigh_proxy_process(struct timer_list *t) void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { - unsigned long now = jiffies; - - unsigned long sched_next = now + (prandom_u32() % - NEIGH_VAR(p, PROXY_DELAY)); + unsigned long sched_next = jiffies + + prandom_u32_max(NEIGH_VAR(p, PROXY_DELAY)); if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 94fff0700bdd..307628fdf380 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -538,6 +538,45 @@ static ssize_t phys_switch_id_show(struct device *dev, } static DEVICE_ATTR_RO(phys_switch_id); +static ssize_t threaded_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) + ret = sprintf(buf, fmt_dec, netdev->threaded); + + rtnl_unlock(); + return ret; +} + +static int modify_napi_threaded(struct net_device *dev, unsigned long val) +{ + int ret; + + if (list_empty(&dev->napi_list)) + return -EOPNOTSUPP; + + if (val != 0 && val != 1) + return -EOPNOTSUPP; + 
+ ret = dev_set_threaded(dev, val); + + return ret; +} + +static ssize_t threaded_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, modify_napi_threaded); +} +static DEVICE_ATTR_RW(threaded); + static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_netdev_group.attr, &dev_attr_type.attr, @@ -570,6 +609,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_proto_down.attr, &dev_attr_carrier_up_count.attr, &dev_attr_carrier_down_count.attr, + &dev_attr_threaded.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); @@ -1027,7 +1067,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct kobject *kobj = &dev->_rx[i].kobj; - if (!refcount_read(&dev_net(dev)->count)) + if (!refcount_read(&dev_net(dev)->ns.count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); @@ -1136,18 +1176,25 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; + int num_tc, tc; int index; - int tc; if (!netif_is_multiqueue(dev)) return -ENOENT; + if (!rtnl_trylock()) + return restart_syscall(); + index = get_netdev_queue_index(queue); /* If queue belongs to subordinate dev use its TC mapping */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + num_tc = dev->num_tc; tc = netdev_txq_to_tc(dev, index); + + rtnl_unlock(); + if (tc < 0) return -EINVAL; @@ -1158,8 +1205,8 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, * belongs to the root device it will be reported with just the * traffic class, so just "0" for TC 0 for example. */ - return dev->num_tc < 0 ? sprintf(buf, "%d%d\n", tc, dev->num_tc) : - sprintf(buf, "%d\n", tc); + return num_tc < 0 ? 
sprintf(buf, "%d%d\n", tc, num_tc) : + sprintf(buf, "%d\n", tc); } #ifdef CONFIG_XPS @@ -1317,8 +1364,8 @@ static const struct attribute_group dql_group = { static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) { + int cpu, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; - int cpu, len, num_tc = 1, tc = 0; struct xps_dev_maps *dev_maps; cpumask_var_t mask; unsigned long index; @@ -1328,22 +1375,31 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, index = get_netdev_queue_index(queue); + if (!rtnl_trylock()) + return restart_syscall(); + if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; - if (num_tc < 0) - return -EINVAL; + if (num_tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); - if (tc < 0) - return -EINVAL; + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } } - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_rtnl_unlock; + } rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_cpus_map); @@ -1366,9 +1422,15 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, } rcu_read_unlock(); + rtnl_unlock(); + len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); free_cpumask_var(mask); return len < PAGE_SIZE ? 
len : -EINVAL; + +err_rtnl_unlock: + rtnl_unlock(); + return ret; } static ssize_t xps_cpus_store(struct netdev_queue *queue, @@ -1396,7 +1458,13 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, return err; } + if (!rtnl_trylock()) { + free_cpumask_var(mask); + return restart_syscall(); + } + err = netif_set_xps_queue(dev, mask, index); + rtnl_unlock(); free_cpumask_var(mask); @@ -1408,22 +1476,29 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) { + int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; unsigned long *mask, index; - int j, len, num_tc = 1, tc = 0; index = get_netdev_queue_index(queue); + if (!rtnl_trylock()) + return restart_syscall(); + if (dev->num_tc) { num_tc = dev->num_tc; tc = netdev_txq_to_tc(dev, index); - if (tc < 0) - return -EINVAL; + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } } mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); - if (!mask) - return -ENOMEM; + if (!mask) { + ret = -ENOMEM; + goto err_rtnl_unlock; + } rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_rxqs_map); @@ -1449,10 +1524,16 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) out_no_maps: rcu_read_unlock(); + rtnl_unlock(); + len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; + +err_rtnl_unlock: + rtnl_unlock(); + return ret; } static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, @@ -1478,10 +1559,17 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, return err; } + if (!rtnl_trylock()) { + bitmap_free(mask); + return restart_syscall(); + } + cpus_read_lock(); err = __netif_set_xps_queue(dev, mask, index, true); cpus_read_unlock(); + rtnl_unlock(); + bitmap_free(mask); return err ? 
: len; } @@ -1605,7 +1693,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; - if (!refcount_read(&dev_net(dev)->count)) + if (!refcount_read(&dev_net(dev)->ns.count)) queue->kobj.uevent_suppress = 1; #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group); @@ -1852,7 +1940,7 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; - if (!refcount_read(&dev_net(ndev)->count)) + if (!refcount_read(&dev_net(ndev)->ns.count)) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index dbc66b896287..43b6ac4c4439 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -45,7 +45,7 @@ static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) }; #endif struct net init_net = { - .count = REFCOUNT_INIT(1), + .ns.count = REFCOUNT_INIT(1), .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), #ifdef CONFIG_KEYS .key_domain = &init_net_key_domain, @@ -72,18 +72,6 @@ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; DEFINE_COOKIE(net_cookie); -u64 __net_gen_cookie(struct net *net) -{ - while (1) { - u64 res = atomic64_read(&net->net_cookie); - - if (res) - return res; - res = gen_cookie_next(&net_cookie); - atomic64_cmpxchg(&net->net_cookie, 0, res); - } -} - static struct net_generic *net_alloc_generic(void) { struct net_generic *ng; @@ -249,7 +237,7 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { int id; - if (refcount_read(&net->count) == 0) + if (refcount_read(&net->ns.count) == 0) return NETNSA_NSID_NOT_ASSIGNED; spin_lock_bh(&net->nsid_lock); @@ -329,9 +317,12 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) int error = 0; LIST_HEAD(net_exit_list); - refcount_set(&net->count, 1); + refcount_set(&net->ns.count, 1); refcount_set(&net->passive, 1); 
get_random_bytes(&net->hash_mix, sizeof(u32)); + preempt_disable(); + net->net_cookie = gen_cookie_next(&net_cookie); + preempt_enable(); net->dev_base_seq = 1; net->user_ns = user_ns; idr_init(&net->netns_ids); @@ -1103,10 +1094,6 @@ static int __init net_ns_init(void) rcu_assign_pointer(init_net.gen, ng); - preempt_disable(); - __net_gen_cookie(&init_net); - preempt_enable(); - down_write(&pernet_ops_rwsem); if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 41b24cd31562..b49c57d35a88 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -68,9 +68,8 @@ struct update_classid_context { static int update_classid_sock(const void *v, struct file *file, unsigned n) { - int err; struct update_classid_context *ctx = (void *)v; - struct socket *sock = sock_from_file(file, &err); + struct socket *sock = sock_from_file(file); if (sock) { spin_lock(&cgroup_sk_update_lock); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 9bd4cab7d510..99a431c56f23 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -220,8 +220,7 @@ static ssize_t write_priomap(struct kernfs_open_file *of, static int update_netprio(const void *v, struct file *file, unsigned n) { - int err; - struct socket *sock = sock_from_file(file, &err); + struct socket *sock = sock_from_file(file); if (sock) { spin_lock(&cgroup_sk_update_lock); sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, diff --git a/net/core/page_pool.c b/net/core/page_pool.c index ef98372facf6..ad8b0707af04 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -11,6 +11,8 @@ #include <linux/device.h> #include <net/page_pool.h> +#include <net/xdp.h> + #include <linux/dma-direction.h> #include <linux/dma-mapping.h> #include <linux/page-flags.h> @@ -348,46 +350,38 @@ static bool page_pool_recycle_in_cache(struct page *page, return true; } -/* page 
is NOT reusable when: - * 1) allocated when system is under some pressure. (page_is_pfmemalloc) - */ -static bool pool_page_reusable(struct page_pool *pool, struct page *page) -{ - return !page_is_pfmemalloc(page); -} - /* If the page refcnt == 1, this will try to recycle the page. * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for * the configured size min(dma_sync_size, pool->max_len). * If the page refcnt != 1, then the page will be returned to memory * subsystem. */ -void page_pool_put_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +static __always_inline struct page * +__page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. * * refcnt == 1 means page_pool owns page, and can recycle it. + * + * page is NOT reusable when allocated when system is under + * some pressure. (page_is_pfmemalloc) */ - if (likely(page_ref_count(page) == 1 && - pool_page_reusable(pool, page))) { + if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) { /* Read barrier done in page_ref_count / READ_ONCE */ if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) page_pool_dma_sync_for_device(pool, page, dma_sync_size); - if (allow_direct && in_serving_softirq()) - if (page_pool_recycle_in_cache(page, pool)) - return; + if (allow_direct && in_serving_softirq() && + page_pool_recycle_in_cache(page, pool)) + return NULL; - if (!page_pool_recycle_in_ring(pool, page)) { - /* Cache full, fallback to free pages */ - page_pool_return_page(pool, page); - } - return; + /* Page found as candidate for recycling */ + return page; } /* Fallback/non-XDP mode: API user have elevated refcnt. 
* @@ -405,9 +399,59 @@ void page_pool_put_page(struct page_pool *pool, struct page *page, /* Do not replace this with page_pool_return_page() */ page_pool_release_page(pool, page); put_page(page); + + return NULL; +} + +void page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) +{ + page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); + if (page && !page_pool_recycle_in_ring(pool, page)) { + /* Cache full, fallback to free pages */ + page_pool_return_page(pool, page); + } } EXPORT_SYMBOL(page_pool_put_page); +/* Caller must not use data area after call, as this function overwrites it */ +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ + int i, bulk_len = 0; + + for (i = 0; i < count; i++) { + struct page *page = virt_to_head_page(data[i]); + + page = __page_pool_put_page(pool, page, -1, false); + /* Approved for bulk recycling in ptr_ring cache */ + if (page) + data[bulk_len++] = page; + } + + if (unlikely(!bulk_len)) + return; + + /* Bulk producer into ptr_ring page_pool cache */ + page_pool_ring_lock(pool); + for (i = 0; i < bulk_len; i++) { + if (__ptr_ring_produce(&pool->ring, data[i])) + break; /* ring full */ + } + page_pool_ring_unlock(pool); + + /* Hopefully all pages was return into ptr_ring */ + if (likely(i == bulk_len)) + return; + + /* ptr_ring cache full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation + */ + for (; i < bulk_len; i++) + page_pool_return_page(pool, data[i]); +} +EXPORT_SYMBOL(page_pool_put_page_bulk); + static void page_pool_empty_ring(struct page_pool *pool) { struct page *page; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 105978604ffd..3fba429f1f57 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3464,7 +3464,7 @@ static int pktgen_thread_worker(void *arg) struct pktgen_dev *pkt_dev = NULL; int cpu = t->cpu; - BUG_ON(smp_processor_id() != cpu); 
+ WARN_ON(smp_processor_id() != cpu); init_waitqueue_head(&t->queue); complete(&t->start_done); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7d7223691783..0edc0b2baaa4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -55,7 +55,7 @@ #include <net/net_namespace.h> #define RTNL_MAX_TYPE 50 -#define RTNL_SLAVE_MAX_TYPE 36 +#define RTNL_SLAVE_MAX_TYPE 40 struct rtnl_link { rtnl_doit_func doit; @@ -139,7 +139,7 @@ bool lockdep_rtnl_is_held(void) EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ -static struct rtnl_link *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; +static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { @@ -157,7 +157,7 @@ static inline int rtm_msgindex(int msgtype) static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) { - struct rtnl_link **tab; + struct rtnl_link __rcu **tab; if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) protocol = PF_UNSPEC; @@ -166,7 +166,7 @@ static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) if (!tab) tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); - return tab[msgtype]; + return rcu_dereference_rtnl(tab[msgtype]); } static int rtnl_register_internal(struct module *owner, @@ -183,7 +183,7 @@ static int rtnl_register_internal(struct module *owner, msgindex = rtm_msgindex(msgtype); rtnl_lock(); - tab = rtnl_msg_handlers[protocol]; + tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (tab == NULL) { tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); if (!tab) @@ -286,7 +286,8 @@ void rtnl_register(int protocol, int msgtype, */ int rtnl_unregister(int protocol, int msgtype) { - struct rtnl_link **tab, *link; + struct rtnl_link __rcu **tab; + struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); @@ -299,7 +300,7 @@ int rtnl_unregister(int protocol, int msgtype) return -ENOENT; } - link = tab[msgindex]; + link = 
rtnl_dereference(tab[msgindex]); rcu_assign_pointer(tab[msgindex], NULL); rtnl_unlock(); @@ -318,20 +319,21 @@ EXPORT_SYMBOL_GPL(rtnl_unregister); */ void rtnl_unregister_all(int protocol) { - struct rtnl_link **tab, *link; + struct rtnl_link __rcu **tab; + struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); - tab = rtnl_msg_handlers[protocol]; + tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (!tab) { rtnl_unlock(); return; } RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { - link = tab[msgindex]; + link = rtnl_dereference(tab[msgindex]); if (!link) continue; @@ -1939,7 +1941,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla if (linfo[IFLA_INFO_KIND]) { char kind[MODULE_NAME_LEN]; - nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); + nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } @@ -2658,7 +2660,7 @@ static int do_setlink(const struct sk_buff *skb, sa->sa_family = dev->type; memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); - err = dev_set_mac_address(dev, sa, extack); + err = dev_set_mac_address_user(dev, sa, extack); kfree(sa); if (err) goto errout; @@ -2953,9 +2955,9 @@ static struct net_device *rtnl_dev_get(struct net *net, if (!ifname) { ifname = buffer; if (ifname_attr) - nla_strlcpy(ifname, ifname_attr, IFNAMSIZ); + nla_strscpy(ifname, ifname_attr, IFNAMSIZ); else if (altifname_attr) - nla_strlcpy(ifname, altifname_attr, ALTIFNAMSIZ); + nla_strscpy(ifname, altifname_attr, ALTIFNAMSIZ); else return NULL; } @@ -2983,7 +2985,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; @@ -3264,7 +3266,7 @@ replay: return err; if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, 
tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; @@ -3296,7 +3298,7 @@ replay: memset(linkinfo, 0, sizeof(linkinfo)); if (linkinfo[IFLA_INFO_KIND]) { - nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); + nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } else { kind[0] = '\0'; @@ -3437,26 +3439,15 @@ replay: dev->ifindex = ifm->ifi_index; - if (ops->newlink) { + if (ops->newlink) err = ops->newlink(link_net ? : net, dev, tb, data, extack); - /* Drivers should call free_netdev() in ->destructor - * and unregister it on failure after registration - * so that device could be finally freed in rtnl_unlock. - */ - if (err < 0) { - /* If device is not registered at all, free it now */ - if (dev->reg_state == NETREG_UNINITIALIZED || - dev->reg_state == NETREG_UNREGISTERED) - free_netdev(dev); - goto out; - } - } else { + else err = register_netdevice(dev); - if (err < 0) { - free_netdev(dev); - goto out; - } + if (err < 0) { + free_netdev(dev); + goto out; } + err = rtnl_configure_link(dev, ifm); if (err < 0) goto out_unregister; @@ -3754,7 +3745,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) s_idx = 1; for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { - struct rtnl_link **tab; + struct rtnl_link __rcu **tab; struct rtnl_link *link; rtnl_dumpit_func dumpit; @@ -3768,7 +3759,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) if (!tab) continue; - link = tab[type]; + link = rcu_dereference_rtnl(tab[type]); if (!link) continue; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1ba8f0163744..545a472273a5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -119,148 +119,75 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) skb_panic(skb, sz, addr, __func__); } -/* - * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells - * the caller if emergency pfmemalloc 
reserves are being used. If it is and - * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves - * may be used. Otherwise, the packet data may be discarded until enough - * memory is free - */ -#define kmalloc_reserve(size, gfp, node, pfmemalloc) \ - __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc) - -static void *__kmalloc_reserve(size_t size, gfp_t flags, int node, - unsigned long ip, bool *pfmemalloc) -{ - void *obj; - bool ret_pfmemalloc = false; +#define NAPI_SKB_CACHE_SIZE 64 +#define NAPI_SKB_CACHE_BULK 16 +#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) - /* - * Try a regular allocation, when that fails and we're not entitled - * to the reserves, fail. - */ - obj = kmalloc_node_track_caller(size, - flags | __GFP_NOMEMALLOC | __GFP_NOWARN, - node); - if (obj || !(gfp_pfmemalloc_allowed(flags))) - goto out; +struct napi_alloc_cache { + struct page_frag_cache page; + unsigned int skb_count; + void *skb_cache[NAPI_SKB_CACHE_SIZE]; +}; - /* Try again but now we are using pfmemalloc reserves */ - ret_pfmemalloc = true; - obj = kmalloc_node_track_caller(size, flags, node); +static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); +static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); -out: - if (pfmemalloc) - *pfmemalloc = ret_pfmemalloc; +static void *__alloc_frag_align(unsigned int fragsz, gfp_t gfp_mask, + unsigned int align_mask) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - return obj; + return page_frag_alloc_align(&nc->page, fragsz, gfp_mask, align_mask); } -/* Allocate a new skbuff. We do this ourselves so we can fill in a few - * 'private' fields and also do memory statistics to find all the - * [BEEP] leaks. - * - */ - -/** - * __alloc_skb - allocate a network buffer - * @size: size to allocate - * @gfp_mask: allocation mask - * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache - * instead of head cache and allocate a cloned (child) skb. 
- * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for - * allocations in case the data is required for writeback - * @node: numa node to allocate memory on - * - * Allocate a new &sk_buff. The returned buffer has no headroom and a - * tail room of at least size bytes. The object has a reference count - * of one. The return is the buffer. On a failure the return is %NULL. - * - * Buffers may only be allocated from interrupts using a @gfp_mask of - * %GFP_ATOMIC. - */ -struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, - int flags, int node) +void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { - struct kmem_cache *cache; - struct skb_shared_info *shinfo; - struct sk_buff *skb; - u8 *data; - bool pfmemalloc; - - cache = (flags & SKB_ALLOC_FCLONE) - ? skbuff_fclone_cache : skbuff_head_cache; - - if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) - gfp_mask |= __GFP_MEMALLOC; - - /* Get the HEAD */ - skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); - if (!skb) - goto out; - prefetchw(skb); + fragsz = SKB_DATA_ALIGN(fragsz); - /* We do our best to align skb_shared_info on a separate cache - * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives - * aligned memory blocks, unless SLUB/SLAB debug is enabled. - * Both skb->head and skb_shared_info are cache line aligned. - */ - size = SKB_DATA_ALIGN(size); - size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); - if (!data) - goto nodata; - /* kmalloc(size) might give us more room than requested. - * Put skb_shared_info exactly at the end of allocated zone, - * to allow max possible filling before reallocation. - */ - size = SKB_WITH_OVERHEAD(ksize(data)); - prefetchw(data + size); + return __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask); +} +EXPORT_SYMBOL(__napi_alloc_frag_align); - /* - * Only clear those fields we need to clear, not those that we will - * actually initialise below. 
Hence, don't put any more fields after - * the tail pointer in struct sk_buff! - */ - memset(skb, 0, offsetof(struct sk_buff, tail)); - /* Account for allocated memory : skb + skb->head */ - skb->truesize = SKB_TRUESIZE(size); - skb->pfmemalloc = pfmemalloc; - refcount_set(&skb->users, 1); - skb->head = data; - skb->data = data; - skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; - skb->mac_header = (typeof(skb->mac_header))~0U; - skb->transport_header = (typeof(skb->transport_header))~0U; +void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) +{ + struct page_frag_cache *nc; + void *data; - /* make sure we initialize shinfo sequentially */ - shinfo = skb_shinfo(skb); - memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); - atomic_set(&shinfo->dataref, 1); + fragsz = SKB_DATA_ALIGN(fragsz); + if (in_irq() || irqs_disabled()) { + nc = this_cpu_ptr(&netdev_alloc_cache); + data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); + } else { + local_bh_disable(); + data = __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask); + local_bh_enable(); + } + return data; +} +EXPORT_SYMBOL(__netdev_alloc_frag_align); - if (flags & SKB_ALLOC_FCLONE) { - struct sk_buff_fclones *fclones; +static struct sk_buff *napi_skb_cache_get(void) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct sk_buff *skb; - fclones = container_of(skb, struct sk_buff_fclones, skb1); + if (unlikely(!nc->skb_count)) + nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache, + GFP_ATOMIC, + NAPI_SKB_CACHE_BULK, + nc->skb_cache); + if (unlikely(!nc->skb_count)) + return NULL; - skb->fclone = SKB_FCLONE_ORIG; - refcount_set(&fclones->fclone_ref, 1); + skb = nc->skb_cache[--nc->skb_count]; + kasan_unpoison_object_data(skbuff_head_cache, skb); - fclones->skb2.fclone = SKB_FCLONE_CLONE; - } -out: return skb; -nodata: - kmem_cache_free(cache, skb); - skb = NULL; - goto out; } -EXPORT_SYMBOL(__alloc_skb); /* Caller must provide SKB that is 
memset cleared */ -static struct sk_buff *__build_skb_around(struct sk_buff *skb, - void *data, unsigned int frag_size) +static void __build_skb_around(struct sk_buff *skb, void *data, + unsigned int frag_size) { struct skb_shared_info *shinfo; unsigned int size = frag_size ? : ksize(data); @@ -282,7 +209,7 @@ static struct sk_buff *__build_skb_around(struct sk_buff *skb, memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); - return skb; + skb_set_kcov_handle(skb, kcov_common_handle()); } /** @@ -313,8 +240,9 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) return NULL; memset(skb, 0, offsetof(struct sk_buff, tail)); + __build_skb_around(skb, data, frag_size); - return __build_skb_around(skb, data, frag_size); + return skb; } /* build_skb() is wrapper over __build_skb(), that specifically @@ -347,9 +275,9 @@ struct sk_buff *build_skb_around(struct sk_buff *skb, if (unlikely(!skb)) return NULL; - skb = __build_skb_around(skb, data, frag_size); + __build_skb_around(skb, data, frag_size); - if (skb && frag_size) { + if (frag_size) { skb->head_frag = 1; if (page_is_pfmemalloc(virt_to_head_page(data))) skb->pfmemalloc = 1; @@ -358,56 +286,178 @@ struct sk_buff *build_skb_around(struct sk_buff *skb, } EXPORT_SYMBOL(build_skb_around); -#define NAPI_SKB_CACHE_SIZE 64 +/** + * __napi_build_skb - build a network buffer + * @data: data buffer provided by caller + * @frag_size: size of data, or 0 if head was kmalloced + * + * Version of __build_skb() that uses NAPI percpu caches to obtain + * skbuff_head instead of inplace allocation. + * + * Returns a new &sk_buff on success, %NULL on allocation failure. 
+ */ +static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb; -struct napi_alloc_cache { - struct page_frag_cache page; - unsigned int skb_count; - void *skb_cache[NAPI_SKB_CACHE_SIZE]; -}; + skb = napi_skb_cache_get(); + if (unlikely(!skb)) + return NULL; -static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); + memset(skb, 0, offsetof(struct sk_buff, tail)); + __build_skb_around(skb, data, frag_size); + + return skb; +} -static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +/** + * napi_build_skb - build a network buffer + * @data: data buffer provided by caller + * @frag_size: size of data, or 0 if head was kmalloced + * + * Version of __napi_build_skb() that takes care of skb->head_frag + * and skb->pfmemalloc when the data is a page or page fragment. + * + * Returns a new &sk_buff on success, %NULL on allocation failure. + */ +struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) { - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct sk_buff *skb = __napi_build_skb(data, frag_size); + + if (likely(skb) && frag_size) { + skb->head_frag = 1; + skb_propagate_pfmemalloc(virt_to_head_page(data), skb); + } - return page_frag_alloc(&nc->page, fragsz, gfp_mask); + return skb; } +EXPORT_SYMBOL(napi_build_skb); -void *napi_alloc_frag(unsigned int fragsz) +/* + * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells + * the caller if emergency pfmemalloc reserves are being used. If it is and + * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves + * may be used. 
Otherwise, the packet data may be discarded until enough + * memory is free + */ +static void *kmalloc_reserve(size_t size, gfp_t flags, int node, + bool *pfmemalloc) { - fragsz = SKB_DATA_ALIGN(fragsz); + void *obj; + bool ret_pfmemalloc = false; - return __napi_alloc_frag(fragsz, GFP_ATOMIC); + /* + * Try a regular allocation, when that fails and we're not entitled + * to the reserves, fail. + */ + obj = kmalloc_node_track_caller(size, + flags | __GFP_NOMEMALLOC | __GFP_NOWARN, + node); + if (obj || !(gfp_pfmemalloc_allowed(flags))) + goto out; + + /* Try again but now we are using pfmemalloc reserves */ + ret_pfmemalloc = true; + obj = kmalloc_node_track_caller(size, flags, node); + +out: + if (pfmemalloc) + *pfmemalloc = ret_pfmemalloc; + + return obj; } -EXPORT_SYMBOL(napi_alloc_frag); + +/* Allocate a new skbuff. We do this ourselves so we can fill in a few + * 'private' fields and also do memory statistics to find all the + * [BEEP] leaks. + * + */ /** - * netdev_alloc_frag - allocate a page fragment - * @fragsz: fragment size + * __alloc_skb - allocate a network buffer + * @size: size to allocate + * @gfp_mask: allocation mask + * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache + * instead of head cache and allocate a cloned (child) skb. + * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for + * allocations in case the data is required for writeback + * @node: numa node to allocate memory on * - * Allocates a frag from a page for receive buffer. - * Uses GFP_ATOMIC allocations. + * Allocate a new &sk_buff. The returned buffer has no headroom and a + * tail room of at least size bytes. The object has a reference count + * of one. The return is the buffer. On a failure the return is %NULL. + * + * Buffers may only be allocated from interrupts using a @gfp_mask of + * %GFP_ATOMIC. 
*/ -void *netdev_alloc_frag(unsigned int fragsz) +struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, + int flags, int node) { - struct page_frag_cache *nc; - void *data; + struct kmem_cache *cache; + struct sk_buff *skb; + u8 *data; + bool pfmemalloc; - fragsz = SKB_DATA_ALIGN(fragsz); - if (in_irq() || irqs_disabled()) { - nc = this_cpu_ptr(&netdev_alloc_cache); - data = page_frag_alloc(nc, fragsz, GFP_ATOMIC); - } else { - local_bh_disable(); - data = __napi_alloc_frag(fragsz, GFP_ATOMIC); - local_bh_enable(); + cache = (flags & SKB_ALLOC_FCLONE) + ? skbuff_fclone_cache : skbuff_head_cache; + + if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) + gfp_mask |= __GFP_MEMALLOC; + + /* Get the HEAD */ + if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI && + likely(node == NUMA_NO_NODE || node == numa_mem_id())) + skb = napi_skb_cache_get(); + else + skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); + if (unlikely(!skb)) + return NULL; + prefetchw(skb); + + /* We do our best to align skb_shared_info on a separate cache + * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives + * aligned memory blocks, unless SLUB/SLAB debug is enabled. + * Both skb->head and skb_shared_info are cache line aligned. + */ + size = SKB_DATA_ALIGN(size); + size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); + if (unlikely(!data)) + goto nodata; + /* kmalloc(size) might give us more room than requested. + * Put skb_shared_info exactly at the end of allocated zone, + * to allow max possible filling before reallocation. + */ + size = SKB_WITH_OVERHEAD(ksize(data)); + prefetchw(data + size); + + /* + * Only clear those fields we need to clear, not those that we will + * actually initialise below. Hence, don't put any more fields after + * the tail pointer in struct sk_buff! 
+ */ + memset(skb, 0, offsetof(struct sk_buff, tail)); + __build_skb_around(skb, data, 0); + skb->pfmemalloc = pfmemalloc; + + if (flags & SKB_ALLOC_FCLONE) { + struct sk_buff_fclones *fclones; + + fclones = container_of(skb, struct sk_buff_fclones, skb1); + + skb->fclone = SKB_FCLONE_ORIG; + refcount_set(&fclones->fclone_ref, 1); + + fclones->skb2.fclone = SKB_FCLONE_CLONE; } - return data; + + return skb; + +nodata: + kmem_cache_free(cache, skb); + return NULL; } -EXPORT_SYMBOL(netdev_alloc_frag); +EXPORT_SYMBOL(__alloc_skb); /** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device @@ -432,7 +482,11 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, len += NET_SKB_PAD; - if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. + */ + if (len <= SKB_WITH_OVERHEAD(1024) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) @@ -496,20 +550,26 @@ EXPORT_SYMBOL(__netdev_alloc_skb); struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, gfp_t gfp_mask) { - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc; struct sk_buff *skb; void *data; len += NET_SKB_PAD + NET_IP_ALIGN; - if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. 
+ */ + if (len <= SKB_WITH_OVERHEAD(1024) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { - skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, + NUMA_NO_NODE); if (!skb) goto skb_fail; goto skb_success; } + nc = this_cpu_ptr(&napi_alloc_cache); len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); len = SKB_DATA_ALIGN(len); @@ -520,7 +580,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, if (unlikely(!data)) return NULL; - skb = __build_skb(data, len); + skb = __napi_build_skb(data, len); if (unlikely(!skb)) { skb_free_frag(data); return NULL; @@ -600,13 +660,14 @@ static void skb_release_data(struct sk_buff *skb) &shinfo->dataref)) return; + skb_zcopy_clear(skb, true); + for (i = 0; i < shinfo->nr_frags; i++) __skb_frag_unref(&shinfo->frags[i]); if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); - skb_zcopy_clear(skb, true); skb_free_head(skb); } @@ -837,7 +898,7 @@ EXPORT_SYMBOL(consume_skb); #endif /** - * consume_stateless_skb - free an skbuff, assuming it is stateless + * __consume_stateless_skb - free an skbuff, assuming it is stateless * @skb: buffer to free * * Alike consume_skb(), but this variant assumes that this is the last @@ -850,43 +911,36 @@ void __consume_stateless_skb(struct sk_buff *skb) kfree_skbmem(skb); } -void __kfree_skb_flush(void) +static void napi_skb_cache_put(struct sk_buff *skb) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + u32 i; - /* flush skb_cache if containing objects */ - if (nc->skb_count) { - kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count, - nc->skb_cache); - nc->skb_count = 0; - } -} - -static inline void _kfree_skb_defer(struct sk_buff *skb) -{ - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - - /* drop skb->head and call any destructors for packet */ - skb_release_all(skb); - - /* record skb to CPU local list */ + 
kasan_poison_object_data(skbuff_head_cache, skb); nc->skb_cache[nc->skb_count++] = skb; -#ifdef CONFIG_SLUB - /* SLUB writes into objects when freeing */ - prefetchw(skb); -#endif - - /* flush skb_cache if it is filled */ if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { - kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE, - nc->skb_cache); - nc->skb_count = 0; + for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) + kasan_unpoison_object_data(skbuff_head_cache, + nc->skb_cache[i]); + + kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF, + nc->skb_cache + NAPI_SKB_CACHE_HALF); + nc->skb_count = NAPI_SKB_CACHE_HALF; } } + void __kfree_skb_defer(struct sk_buff *skb) { - _kfree_skb_defer(skb); + skb_release_all(skb); + napi_skb_cache_put(skb); +} + +void napi_skb_free_stolen_head(struct sk_buff *skb) +{ + skb_dst_drop(skb); + skb_ext_put(skb); + napi_skb_cache_put(skb); } void napi_consume_skb(struct sk_buff *skb, int budget) @@ -897,6 +951,8 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } + lockdep_assert_in_softirq(); + if (!skb_unref(skb)) return; @@ -909,7 +965,8 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } - _kfree_skb_defer(skb); + skb_release_all(skb); + napi_skb_cache_put(skb); } EXPORT_SYMBOL(napi_consume_skb); @@ -1086,7 +1143,7 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp) } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); -struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) +struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) { struct ubuf_info *uarg; struct sk_buff *skb; @@ -1106,25 +1163,26 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) return NULL; } - uarg->callback = sock_zerocopy_callback; + uarg->callback = msg_zerocopy_callback; uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; uarg->len = 1; uarg->bytelen = size; uarg->zerocopy = 1; + uarg->flags = SKBFL_ZEROCOPY_FRAG; refcount_set(&uarg->refcnt, 1); 
sock_hold(sk); return uarg; } -EXPORT_SYMBOL_GPL(sock_zerocopy_alloc); +EXPORT_SYMBOL_GPL(msg_zerocopy_alloc); static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) { return container_of((void *)uarg, struct sk_buff, cb); } -struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg) +struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, + struct ubuf_info *uarg) { if (uarg) { const u32 byte_limit = 1 << 19; /* limit to a few TSO */ @@ -1156,16 +1214,16 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, /* no extra ref when appending to datagram (MSG_MORE) */ if (sk->sk_type == SOCK_STREAM) - sock_zerocopy_get(uarg); + net_zcopy_get(uarg); return uarg; } } new_alloc: - return sock_zerocopy_alloc(sk, size); + return msg_zerocopy_alloc(sk, size); } -EXPORT_SYMBOL_GPL(sock_zerocopy_realloc); +EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) { @@ -1187,7 +1245,7 @@ static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) return true; } -void sock_zerocopy_callback(struct ubuf_info *uarg, bool success) +static void __msg_zerocopy_callback(struct ubuf_info *uarg) { struct sk_buff *tail, *skb = skb_from_uarg(uarg); struct sock_exterr_skb *serr; @@ -1215,7 +1273,7 @@ void sock_zerocopy_callback(struct ubuf_info *uarg, bool success) serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; serr->ee.ee_data = hi; serr->ee.ee_info = lo; - if (!success) + if (!uarg->zerocopy) serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; q = &sk->sk_error_queue; @@ -1234,32 +1292,28 @@ release: consume_skb(skb); sock_put(sk); } -EXPORT_SYMBOL_GPL(sock_zerocopy_callback); -void sock_zerocopy_put(struct ubuf_info *uarg) +void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) { - if (uarg && refcount_dec_and_test(&uarg->refcnt)) { - if (uarg->callback) - uarg->callback(uarg, uarg->zerocopy); - else - 
consume_skb(skb_from_uarg(uarg)); - } + uarg->zerocopy = uarg->zerocopy & success; + + if (refcount_dec_and_test(&uarg->refcnt)) + __msg_zerocopy_callback(uarg); } -EXPORT_SYMBOL_GPL(sock_zerocopy_put); +EXPORT_SYMBOL_GPL(msg_zerocopy_callback); -void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) +void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { - if (uarg) { - struct sock *sk = skb_from_uarg(uarg)->sk; + struct sock *sk = skb_from_uarg(uarg)->sk; - atomic_dec(&sk->sk_zckey); - uarg->len--; + atomic_dec(&sk->sk_zckey); + uarg->len--; - if (have_uref) - sock_zerocopy_put(uarg); - } + if (have_uref) + msg_zerocopy_callback(NULL, uarg, true); } -EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); +EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { @@ -1323,7 +1377,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, * @skb: the skb to modify * @gfp_mask: allocation priority * - * This must be called on SKBTX_DEV_ZEROCOPY skb. + * This must be called on skb with SKBFL_ZEROCOPY_ENABLE. * It will copy all frags into kernel and drop the reference * to userspace pages. * @@ -2011,6 +2065,12 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) skb->csum = csum_block_sub(skb->csum, skb_checksum(skb, len, delta, 0), len); + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + int hdlen = (len > skb_headlen(skb)) ? 
skb_headlen(skb) : len; + int offset = skb_checksum_start_offset(skb) + skb->csum_offset; + + if (offset + sizeof(__sum16) > hdlen) + return -EINVAL; } return __pskb_trim(skb, len); } @@ -3254,8 +3314,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); - skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags & - SKBTX_SHARED_FRAG; + skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); @@ -3270,7 +3329,19 @@ EXPORT_SYMBOL(skb_split); */ static int skb_prepare_for_shift(struct sk_buff *skb) { - return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + int ret = 0; + + if (skb_cloned(skb)) { + /* Save and restore truesize: pskb_expand_head() may reallocate + * memory where ksize(kmalloc(S)) != ksize(kmalloc(S)), but we + * cannot change truesize at this point. + */ + unsigned int save_truesize = skb->truesize; + + ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + skb->truesize = save_truesize; + } + return ret; } /** @@ -3429,6 +3500,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, st->root_skb = st->cur_skb = skb; st->frag_idx = st->stepped_offset = 0; st->frag_data = NULL; + st->frag_off = 0; } EXPORT_SYMBOL(skb_prepare_seq_read); @@ -3483,14 +3555,27 @@ next_skb: st->stepped_offset += skb_headlen(st->cur_skb); while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { + unsigned int pg_idx, pg_off, pg_sz; + frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; - block_limit = skb_frag_size(frag) + st->stepped_offset; + pg_idx = 0; + pg_off = skb_frag_off(frag); + pg_sz = skb_frag_size(frag); + + if (skb_frag_must_loop(skb_frag_page(frag))) { + pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; + pg_off = offset_in_page(pg_off + st->frag_off); + pg_sz = min_t(unsigned int, pg_sz - st->frag_off, + PAGE_SIZE - pg_off); + } + + block_limit 
= pg_sz + st->stepped_offset; if (abs_offset < block_limit) { if (!st->frag_data) - st->frag_data = kmap_atomic(skb_frag_page(frag)); + st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); - *data = (u8 *) st->frag_data + skb_frag_off(frag) + + *data = (u8 *)st->frag_data + pg_off + (abs_offset - st->stepped_offset); return block_limit - abs_offset; @@ -3501,8 +3586,12 @@ next_skb: st->frag_data = NULL; } - st->frag_idx++; - st->stepped_offset += skb_frag_size(frag); + st->stepped_offset += pg_sz; + st->frag_off += pg_sz; + if (st->frag_off == skb_frag_size(frag)) { + st->frag_off = 0; + st->frag_idx++; + } } if (st->frag_data) { @@ -3642,7 +3731,8 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, unsigned int delta_truesize = 0; unsigned int delta_len = 0; struct sk_buff *tail = NULL; - struct sk_buff *nskb; + struct sk_buff *nskb, *tmp; + int err; skb_push(skb, -skb_network_offset(skb) + offset); @@ -3652,11 +3742,28 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, nskb = list_skb; list_skb = list_skb->next; + err = 0; + if (skb_shared(nskb)) { + tmp = skb_clone(nskb, GFP_ATOMIC); + if (tmp) { + consume_skb(nskb); + nskb = tmp; + err = skb_unclone(nskb, GFP_ATOMIC); + } else { + err = -ENOMEM; + } + } + if (!tail) skb->next = nskb; else tail->next = nskb; + if (unlikely(err)) { + nskb->next = list_skb; + goto err_linearize; + } + tail = nskb; delta_len += nskb->len; @@ -3843,12 +3950,8 @@ normal: } hsize = skb_headlen(head_skb) - offset; - if (hsize < 0) - hsize = 0; - if (hsize > len || !sg) - hsize = len; - if (!hsize && i >= nfrags && skb_headlen(list_skb) && + if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) && (skb_headlen(list_skb) == len || sg)) { BUG_ON(skb_headlen(list_skb) > len); @@ -3891,6 +3994,11 @@ normal: skb_release_head_state(nskb); __skb_push(nskb, doffset); } else { + if (hsize < 0) + hsize = 0; + if (hsize > len || !sg) + hsize = len; + nskb = __alloc_skb(hsize + doffset + headroom, GFP_ATOMIC, 
skb_alloc_rx_flag(head_skb), NUMA_NO_NODE); @@ -3944,8 +4052,8 @@ normal: skb_copy_from_linear_data_offset(head_skb, offset, skb_put(nskb, hsize), hsize); - skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & - SKBTX_SHARED_FRAG; + skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & + SKBFL_SHARED_FRAG; if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) @@ -4549,7 +4657,7 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk) if (skb && (skb_next = skb_peek(q))) { icmp_next = is_icmp_err_skb(skb_next); if (icmp_next) - sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_origin; + sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; } spin_unlock_irqrestore(&q->lock, flags); @@ -4665,6 +4773,7 @@ err: EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); void __skb_tstamp_tx(struct sk_buff *orig_skb, + const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, struct sock *sk, int tstype) { @@ -4687,7 +4796,8 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk->sk_protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) { - skb = tcp_get_timestamping_opt_stats(sk, orig_skb); + skb = tcp_get_timestamping_opt_stats(sk, orig_skb, + ack_skb); opt_stats = true; } else #endif @@ -4716,7 +4826,7 @@ EXPORT_SYMBOL_GPL(__skb_tstamp_tx); void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps) { - return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk, + return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, SCM_TSTAMP_SND); } EXPORT_SYMBOL_GPL(skb_tstamp_tx); @@ -5430,7 +5540,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb) goto err_free; skb_reset_network_header(skb); - skb_reset_transport_header(skb); + if (!skb_transport_header_was_set(skb)) + skb_reset_transport_header(skb); skb_reset_mac_len(skb); return skb; @@ -5786,6 +5897,9 @@ int skb_mpls_dec_ttl(struct sk_buff *skb) if (unlikely(!eth_p_mpls(skb->protocol))) return 
-EINVAL; + if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) + return -ENOMEM; + lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; if (!--ttl) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 654182ecf87b..1261512d6807 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -170,10 +170,12 @@ static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i, struct scatterlist *sge = sk_msg_elem(msg, i); u32 len = sge->length; - if (charge) - sk_mem_uncharge(sk, len); - if (!msg->skb) + /* When the skb owns the memory we free it from consume_skb path. */ + if (!msg->skb) { + if (charge) + sk_mem_uncharge(sk, len); put_page(sg_page(sge)); + } memset(sge, 0, sizeof(*sge)); return len; } @@ -397,28 +399,45 @@ out: } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) +static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, + struct sk_buff *skb) { - struct sock *sk = psock->sk; - int copied = 0, num_sge; struct sk_msg *msg; + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) + return NULL; + + if (!sk_rmem_schedule(sk, skb, skb->truesize)) + return NULL; + msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); if (unlikely(!msg)) - return -EAGAIN; - if (!sk_rmem_schedule(sk, skb, skb->len)) { - kfree(msg); - return -EAGAIN; - } + return NULL; sk_msg_init(msg); + return msg; +} + +static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, + struct sk_psock *psock, + struct sock *sk, + struct sk_msg *msg) +{ + int num_sge, copied; + + /* skb linearize may fail with ENOMEM, but lets simply try again + * later if this happens. Under memory pressure we don't want to + * drop the skb. We need to linearize the skb so that the mapping + * in skb_to_sgvec can not error. 
+ */ + if (skb_linearize(skb)) + return -EAGAIN; num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len); if (unlikely(num_sge < 0)) { kfree(msg); return num_sge; } - sk_mem_charge(sk, skb->len); copied = skb->len; msg->sg.start = 0; msg->sg.size = copied; @@ -430,6 +449,48 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) return copied; } +static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb); + +static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) +{ + struct sock *sk = psock->sk; + struct sk_msg *msg; + + /* If we are receiving on the same sock skb->sk is already assigned, + * skip memory accounting and owner transition seeing it already set + * correctly. + */ + if (unlikely(skb->sk == sk)) + return sk_psock_skb_ingress_self(psock, skb); + msg = sk_psock_create_ingress_msg(sk, skb); + if (!msg) + return -EAGAIN; + + /* This will transition ownership of the data from the socket where + * the BPF program was run initiating the redirect to the socket + * we will eventually receive this data on. The data will be released + * from skb_consume found in __tcp_bpf_recvmsg() after its been copied + * into user buffers. + */ + skb_set_owner_r(skb, sk); + return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg); +} + +/* Puts an skb on the ingress queue of the socket already assigned to the + * skb. In this case we do not need to check memory limits or skb_set_owner_r + * because the skb is already accounted for here. 
+ */ +static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb) +{ + struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); + struct sock *sk = psock->sk; + + if (unlikely(!msg)) + return -EAGAIN; + sk_msg_init(msg); + return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg); +} + static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { @@ -608,14 +669,13 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) kfree(psock); } -void sk_psock_destroy(struct rcu_head *rcu) +static void sk_psock_destroy(struct rcu_head *rcu) { struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu); INIT_WORK(&psock->gc, sk_psock_destroy_deferred); schedule_work(&psock->gc); } -EXPORT_SYMBOL_GPL(sk_psock_destroy); void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { @@ -789,7 +849,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, * retrying later from workqueue. */ if (skb_queue_empty(&psock->ingress_skb)) { - err = sk_psock_skb_ingress(psock, skb); + err = sk_psock_skb_ingress_self(psock, skb); } if (err < 0) { skb_queue_tail(&psock->ingress_skb, skb); diff --git a/net/core/sock.c b/net/core/sock.c index 727ea1cc633c..0ed98f20448a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -526,11 +526,17 @@ discard_and_relse: } EXPORT_SYMBOL(__sk_receive_skb); +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, + u32)); +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, + u32)); struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); - if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + if (dst && dst->obsolete && + INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, + dst, cookie) == NULL) { sk_tx_queue_clear(sk); sk->sk_dst_pending_confirm = 0; RCU_INIT_POINTER(sk->sk_dst_cache, NULL); @@ -546,7 +552,9 @@ struct 
dst_entry *sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk_dst_get(sk); - if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + if (dst && dst->obsolete && + INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, + dst, cookie) == NULL) { sk_dst_reset(sk); dst_release(dst); return NULL; @@ -1159,6 +1167,22 @@ set_sndbuf: sk->sk_ll_usec = val; } break; + case SO_PREFER_BUSY_POLL: + if (valbool && !capable(CAP_NET_ADMIN)) + ret = -EPERM; + else + WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); + break; + case SO_BUSY_POLL_BUDGET: + if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { + ret = -EPERM; + } else { + if (val < 0 || val > U16_MAX) + ret = -EINVAL; + else + WRITE_ONCE(sk->sk_busy_poll_budget, val); + } + break; #endif case SO_MAX_PACING_RATE: @@ -1523,6 +1547,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_BUSY_POLL: v.val = sk->sk_ll_usec; break; + case SO_PREFER_BUSY_POLL: + v.val = READ_ONCE(sk->sk_prefer_busy_poll); + break; #endif case SO_MAX_PACING_RATE: @@ -1638,6 +1665,16 @@ static void sock_copy(struct sock *nsk, const struct sock *osk) #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif + + /* If we move sk_tx_queue_mapping out of the private section, + * we must check if sk_tx_queue_clear() is called after + * sock_copy() in sk_clone_lock(). 
+ */ + BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < + offsetof(struct sock, sk_dontcopy_begin) || + offsetof(struct sock, sk_tx_queue_mapping) >= + offsetof(struct sock, sk_dontcopy_end)); + memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, @@ -1671,7 +1708,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; - sk_tx_queue_clear(sk); } return sk; @@ -1857,123 +1893,120 @@ static void sk_init_common(struct sock *sk) struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { struct proto *prot = READ_ONCE(sk->sk_prot); - struct sock *newsk; + struct sk_filter *filter; bool is_charged = true; + struct sock *newsk; newsk = sk_prot_alloc(prot, priority, sk->sk_family); - if (newsk != NULL) { - struct sk_filter *filter; + if (!newsk) + goto out; - sock_copy(newsk, sk); + sock_copy(newsk, sk); - newsk->sk_prot_creator = prot; + newsk->sk_prot_creator = prot; - /* SANITY */ - if (likely(newsk->sk_net_refcnt)) - get_net(sock_net(newsk)); - sk_node_init(&newsk->sk_node); - sock_lock_init(newsk); - bh_lock_sock(newsk); - newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; - newsk->sk_backlog.len = 0; + /* SANITY */ + if (likely(newsk->sk_net_refcnt)) + get_net(sock_net(newsk)); + sk_node_init(&newsk->sk_node); + sock_lock_init(newsk); + bh_lock_sock(newsk); + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; + newsk->sk_backlog.len = 0; - atomic_set(&newsk->sk_rmem_alloc, 0); - /* - * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) - */ - refcount_set(&newsk->sk_wmem_alloc, 1); - atomic_set(&newsk->sk_omem_alloc, 0); - sk_init_common(newsk); + atomic_set(&newsk->sk_rmem_alloc, 0); - newsk->sk_dst_cache = NULL; - newsk->sk_dst_pending_confirm = 0; - newsk->sk_wmem_queued = 0; - newsk->sk_forward_alloc = 0; - atomic_set(&newsk->sk_drops, 0); - newsk->sk_send_head = NULL; - newsk->sk_userlocks = 
sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; - atomic_set(&newsk->sk_zckey, 0); + /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ + refcount_set(&newsk->sk_wmem_alloc, 1); - sock_reset_flag(newsk, SOCK_DONE); + atomic_set(&newsk->sk_omem_alloc, 0); + sk_init_common(newsk); - /* sk->sk_memcg will be populated at accept() time */ - newsk->sk_memcg = NULL; + newsk->sk_dst_cache = NULL; + newsk->sk_dst_pending_confirm = 0; + newsk->sk_wmem_queued = 0; + newsk->sk_forward_alloc = 0; + atomic_set(&newsk->sk_drops, 0); + newsk->sk_send_head = NULL; + newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + atomic_set(&newsk->sk_zckey, 0); - cgroup_sk_clone(&newsk->sk_cgrp_data); + sock_reset_flag(newsk, SOCK_DONE); - rcu_read_lock(); - filter = rcu_dereference(sk->sk_filter); - if (filter != NULL) - /* though it's an empty new sock, the charging may fail - * if sysctl_optmem_max was changed between creation of - * original socket and cloning - */ - is_charged = sk_filter_charge(newsk, filter); - RCU_INIT_POINTER(newsk->sk_filter, filter); - rcu_read_unlock(); + /* sk->sk_memcg will be populated at accept() time */ + newsk->sk_memcg = NULL; - if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { - /* We need to make sure that we don't uncharge the new - * socket if we couldn't charge it in the first place - * as otherwise we uncharge the parent's filter. 
- */ - if (!is_charged) - RCU_INIT_POINTER(newsk->sk_filter, NULL); - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; - } - RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); + cgroup_sk_clone(&newsk->sk_cgrp_data); - if (bpf_sk_storage_clone(sk, newsk)) { - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; - } + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); + if (filter != NULL) + /* though it's an empty new sock, the charging may fail + * if sysctl_optmem_max was changed between creation of + * original socket and cloning + */ + is_charged = sk_filter_charge(newsk, filter); + RCU_INIT_POINTER(newsk->sk_filter, filter); + rcu_read_unlock(); - /* Clear sk_user_data if parent had the pointer tagged - * as not suitable for copying when cloning. + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { + /* We need to make sure that we don't uncharge the new + * socket if we couldn't charge it in the first place + * as otherwise we uncharge the parent's filter. */ - if (sk_user_data_is_nocopy(newsk)) - newsk->sk_user_data = NULL; + if (!is_charged) + RCU_INIT_POINTER(newsk->sk_filter, NULL); + sk_free_unlock_clone(newsk); + newsk = NULL; + goto out; + } + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); - newsk->sk_err = 0; - newsk->sk_err_soft = 0; - newsk->sk_priority = 0; - newsk->sk_incoming_cpu = raw_smp_processor_id(); - if (likely(newsk->sk_net_refcnt)) - sock_inuse_add(sock_net(newsk), 1); + if (bpf_sk_storage_clone(sk, newsk)) { + sk_free_unlock_clone(newsk); + newsk = NULL; + goto out; + } - /* - * Before updating sk_refcnt, we must commit prior changes to memory - * (Documentation/RCU/rculist_nulls.rst for details) - */ - smp_wmb(); - refcount_set(&newsk->sk_refcnt, 2); + /* Clear sk_user_data if parent had the pointer tagged + * as not suitable for copying when cloning. 
+ */ + if (sk_user_data_is_nocopy(newsk)) + newsk->sk_user_data = NULL; - /* - * Increment the counter in the same struct proto as the master - * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that - * is the same as sk->sk_prot->socks, as this field was copied - * with memcpy). - * - * This _changes_ the previous behaviour, where - * tcp_create_openreq_child always was incrementing the - * equivalent to tcp_prot->socks (inet_sock_nr), so this have - * to be taken into account in all callers. -acme - */ - sk_refcnt_debug_inc(newsk); - sk_set_socket(newsk, NULL); - sk_tx_queue_clear(newsk); - RCU_INIT_POINTER(newsk->sk_wq, NULL); + newsk->sk_err = 0; + newsk->sk_err_soft = 0; + newsk->sk_priority = 0; + newsk->sk_incoming_cpu = raw_smp_processor_id(); + if (likely(newsk->sk_net_refcnt)) + sock_inuse_add(sock_net(newsk), 1); - if (newsk->sk_prot->sockets_allocated) - sk_sockets_allocated_inc(newsk); + /* Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.rst for details) + */ + smp_wmb(); + refcount_set(&newsk->sk_refcnt, 2); - if (sock_needs_netstamp(sk) && - newsk->sk_flags & SK_FLAGS_TIMESTAMP) - net_enable_timestamp(); - } + /* Increment the counter in the same struct proto as the master + * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that + * is the same as sk->sk_prot->socks, as this field was copied + * with memcpy). + * + * This _changes_ the previous behaviour, where + * tcp_create_openreq_child always was incrementing the + * equivalent to tcp_prot->socks (inet_sock_nr), so this have + * to be taken into account in all callers. 
-acme + */ + sk_refcnt_debug_inc(newsk); + sk_set_socket(newsk, NULL); + sk_tx_queue_clear(newsk); + RCU_INIT_POINTER(newsk->sk_wq, NULL); + + if (newsk->sk_prot->sockets_allocated) + sk_sockets_allocated_inc(newsk); + + if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) + net_enable_timestamp(); out: return newsk; } @@ -2486,7 +2519,7 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); -static void __lock_sock(struct sock *sk) +void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { @@ -2808,14 +2841,8 @@ EXPORT_SYMBOL(sock_no_mmap); void __receive_sock(struct file *file) { struct socket *sock; - int error; - /* - * The resulting value of "error" is ignored here since we only - * need to take action when the file is a socket and testing - * "sock" for NULL is sufficient. - */ - sock = sock_from_file(file, &error); + sock = sock_from_file(file); if (sock) { sock_update_netprioidx(&sock->sk->sk_cgrp_data); sock_update_classid(&sock->sk->sk_cgrp_data); @@ -3078,7 +3105,7 @@ EXPORT_SYMBOL(release_sock); * * sk_lock.slock unlocked, owned = 1, BH enabled */ -bool lock_sock_fast(struct sock *sk) +bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); @@ -3096,6 +3123,7 @@ bool lock_sock_fast(struct sock *sk) * The sk_lock has mutex_lock() semantics here: */ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); + __acquire(&sk->sk_lock.slock); local_bh_enable(); return true; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index ddc899e83313..d758fb83c884 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -27,8 +27,6 @@ struct bpf_stab { static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; - u64 cost; - int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -39,29 +37,22 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) 
attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); - stab = kzalloc(sizeof(*stab), GFP_USER); + stab = kzalloc(sizeof(*stab), GFP_USER | __GFP_ACCOUNT); if (!stab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); raw_spin_lock_init(&stab->lock); - /* Make sure page count doesn't overflow. */ - cost = (u64) stab->map.max_entries * sizeof(struct sock *); - err = bpf_map_charge_init(&stab->map.memory, cost); - if (err) - goto free_stab; - stab->sks = bpf_map_area_alloc(stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); - if (stab->sks) - return &stab->map; - err = -ENOMEM; - bpf_map_charge_finish(&stab->map.memory); -free_stab: - kfree(stab); - return ERR_PTR(err); + if (!stab->sks) { + kfree(stab); + return ERR_PTR(-ENOMEM); + } + + return &stab->map; } int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) @@ -611,7 +602,7 @@ int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, ret = sock_hash_update_common(map, key, sk, flags); sock_map_sk_release(sk); out: - fput(sock->file); + sockfd_put(sock); return ret; } @@ -975,8 +966,9 @@ static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab, } } - new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, - htab->map.numa_node); + new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, + GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); if (!new) { atomic_dec(&htab->count); return ERR_PTR(-ENOMEM); @@ -1103,7 +1095,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) { struct bpf_shtab *htab; int i, err; - u64 cost; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -1116,7 +1107,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) if (attr->key_size > MAX_BPF_STACK) return ERR_PTR(-E2BIG); - htab = kzalloc(sizeof(*htab), GFP_USER); + htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); if (!htab) return ERR_PTR(-ENOMEM); @@ -1131,21 +1122,10 @@ static struct bpf_map 
*sock_hash_alloc(union bpf_attr *attr) goto free_htab; } - cost = (u64) htab->buckets_num * sizeof(struct bpf_shtab_bucket) + - (u64) htab->elem_size * htab->map.max_entries; - if (cost >= U32_MAX - PAGE_SIZE) { - err = -EINVAL; - goto free_htab; - } - err = bpf_map_charge_init(&htab->map.memory, cost); - if (err) - goto free_htab; - htab->buckets = bpf_map_area_alloc(htab->buckets_num * sizeof(struct bpf_shtab_bucket), htab->map.numa_node); if (!htab->buckets) { - bpf_map_charge_finish(&htab->map.memory); err = -ENOMEM; goto free_htab; } diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index bbdd3c7b6cb5..b065f0a103ed 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -293,7 +293,7 @@ select_by_hash: i = j = reciprocal_scale(hash, socks); while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { i++; - if (i >= reuse->num_socks) + if (i >= socks) i = 0; if (i == j) goto out; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d86d8d11cfe4..4567de519603 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -309,7 +309,6 @@ proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, #endif static struct ctl_table net_core_table[] = { -#ifdef CONFIG_NET { .procname = "wmem_max", .data = &sysctl_wmem_max, @@ -507,7 +506,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = set_default_qdisc }, #endif -#endif /* CONFIG_NET */ { .procname = "netdev_budget", .data = &netdev_budget, diff --git a/net/core/xdp.c b/net/core/xdp.c index 48aba933a5a8..05354976c1fc 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -158,7 +158,7 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) /* Returns 0 on success, negative on failure */ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index) + struct net_device *dev, u32 queue_index, unsigned int napi_id) { if (xdp_rxq->reg_state == REG_STATE_UNUSED) { WARN(1, "Driver promised not 
to register this"); @@ -179,6 +179,7 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, xdp_rxq_info_init(xdp_rxq); xdp_rxq->dev = dev; xdp_rxq->queue_index = queue_index; + xdp_rxq->napi_id = napi_id; xdp_rxq->reg_state = REG_STATE_REGISTERED; return 0; @@ -335,11 +336,10 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling - * of xdp_frames/pages in those cases. This path is never used by the - * MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of - * the switch-statement. + * of xdp_frames/pages in those cases. */ -static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, + struct xdp_buff *xdp) { struct xdp_mem_allocator *xa; struct page *page; @@ -361,6 +361,10 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); break; + case MEM_TYPE_XSK_BUFF_POOL: + /* NB! Only valid from an xdp_buff! 
*/ + xsk_buff_free(xdp); + break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ WARN(1, "Incorrect XDP memory type (%d) usage", mem->type); @@ -370,19 +374,73 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false); + __xdp_return(xdpf->data, &xdpf->mem, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true); + __xdp_return(xdpf->data, &xdpf->mem, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); +/* XDP bulk APIs introduce a defer/flush mechanism to return + * pages belonging to the same xdp_mem_allocator object + * (identified via the mem.id field) in bulk to optimize + * I-cache and D-cache. + * The bulk queue size is set to 16 to be aligned to how + * XDP_REDIRECT bulking works. The bulk is flushed when + * it is full or when mem.id changes. + * xdp_frame_bulk is usually stored/allocated on the function + * call-stack to avoid locking penalties. 
+ */ +void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) +{ + struct xdp_mem_allocator *xa = bq->xa; + + if (unlikely(!xa || !bq->count)) + return; + + page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count); + /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */ + bq->count = 0; +} +EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk); + +/* Must be called with rcu_read_lock held */ +void xdp_return_frame_bulk(struct xdp_frame *xdpf, + struct xdp_frame_bulk *bq) +{ + struct xdp_mem_info *mem = &xdpf->mem; + struct xdp_mem_allocator *xa; + + if (mem->type != MEM_TYPE_PAGE_POOL) { + __xdp_return(xdpf->data, &xdpf->mem, false, NULL); + return; + } + + xa = bq->xa; + if (unlikely(!xa)) { + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + bq->count = 0; + bq->xa = xa; + } + + if (bq->count == XDP_BULK_QUEUE_SIZE) + xdp_flush_frame_bulk(bq); + + if (unlikely(mem->id != xa->mem.id)) { + xdp_flush_frame_bulk(bq); + bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + } + + bq->q[bq->count++] = xdpf->data; +} +EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); + void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true); + __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); } /* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ @@ -400,18 +458,6 @@ void __xdp_release_frame(void *data, struct xdp_mem_info *mem) } EXPORT_SYMBOL_GPL(__xdp_release_frame); -bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, - struct netdev_bpf *bpf) -{ - if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) { - NL_SET_ERR_MSG(bpf->extack, - "program loaded with different flags"); - return false; - } - return true; -} -EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok); - void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { @@ -467,3 +513,73 @@ void xdp_warn(const char *msg, const char *func, const int line) WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); }; 
EXPORT_SYMBOL_GPL(xdp_warn); + +int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp) +{ + n_skb = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, + n_skb, skbs); + if (unlikely(!n_skb)) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk); + +struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct sk_buff *skb, + struct net_device *dev) +{ + unsigned int headroom, frame_size; + void *hard_start; + + /* Part of headroom was reserved to xdpf */ + headroom = sizeof(*xdpf) + xdpf->headroom; + + /* Memory size backing xdp_frame data already have reserved + * room for build_skb to place skb_shared_info in tailroom. + */ + frame_size = xdpf->frame_sz; + + hard_start = xdpf->data - headroom; + skb = build_skb_around(skb, hard_start, frame_size); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, headroom); + __skb_put(skb, xdpf->len); + if (xdpf->metasize) + skb_metadata_set(skb, xdpf->metasize); + + /* Essential SKB info: protocol and skb->dev */ + skb->protocol = eth_type_trans(skb, dev); + + /* Optional SKB info, currently missing: + * - HW checksum info (skb->ip_summed) + * - HW RX hash (skb_set_hash) + * - RX ring dev queue index (skb_record_rx_queue) + */ + + /* Until page_pool get SKB return path, release DMA here */ + xdp_release_frame(xdpf); + + /* Allow SKB to reuse area used by xdp_frame */ + xdp_scrub_frame(xdpf); + + return skb; +} +EXPORT_SYMBOL_GPL(__xdp_build_skb_from_frame); + +struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct net_device *dev) +{ + struct sk_buff *skb; + + skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + memset(skb, 0, offsetof(struct sk_buff, tail)); + + return __xdp_build_skb_from_frame(xdpf, skb, dev); +} +EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame); diff --git a/net/dcb/Makefile b/net/dcb/Makefile index 3016e5a7716a..2c0fa16ee2a9 100644 --- a/net/dcb/Makefile +++ b/net/dcb/Makefile @@ -1,2 +1,2 @@ # 
SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_DCB) += dcbnl.o dcbevent.o +obj-y += dcbnl.o dcbevent.o diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 16014ad19406..653e3bc9c87b 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1765,6 +1765,8 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, fn = &reply_funcs[dcb->cmd]; if (!fn->cb) return -EOPNOTSUPP; + if (fn->type == RTM_SETDCB && !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; if (!tb[DCB_ATTR_IFNAME]) return -EINVAL; @@ -1827,6 +1829,8 @@ static int dcb_app_add(const struct dcb_app *app, int ifindex) /** * dcb_getapp - retrieve the DCBX application user priority + * @dev: network interface + * @app: application to get user priority of * * On success returns a non-zero 802.1p user priority bitmap * otherwise returns 0 as the invalid user priority bitmap to @@ -1849,6 +1853,8 @@ EXPORT_SYMBOL(dcb_getapp); /** * dcb_setapp - add CEE dcb application data to app list + * @dev: network interface + * @new: application data to add * * Priority 0 is an invalid priority in CEE spec. This routine * removes applications from the app list if the priority is @@ -1890,6 +1896,8 @@ EXPORT_SYMBOL(dcb_setapp); /** * dcb_ieee_getapp_mask - retrieve the IEEE DCB application priority + * @dev: network interface + * @app: where to store the retrieve application data * * Helper routine which on success returns a non-zero 802.1Qaz user * priority bitmap otherwise returns 0 to indicate the dcb_app was @@ -1912,6 +1920,8 @@ EXPORT_SYMBOL(dcb_ieee_getapp_mask); /** * dcb_ieee_setapp - add IEEE dcb application data to app list + * @dev: network interface + * @new: application data to add * * This adds Application data to the list. 
Multiple application * entries may exists for the same selector and protocol as long @@ -1946,6 +1956,8 @@ EXPORT_SYMBOL(dcb_ieee_setapp); /** * dcb_ieee_delapp - delete IEEE dcb application data from list + * @dev: network interface + * @del: application data to delete * * This removes a matching APP data from the APP list */ @@ -1975,7 +1987,7 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) } EXPORT_SYMBOL(dcb_ieee_delapp); -/** +/* * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. Initialize p_map * such that each map element holds a bit mask of DSCP values configured for @@ -2004,7 +2016,7 @@ void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev, } EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map); -/** +/* * dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from * DSCP values to the priorities assigned to that DSCP value. Initialize p_map * such that each map element holds a bit mask of priorities configured for a @@ -2031,7 +2043,7 @@ dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev, } EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map); -/** +/* * Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet * type, with valid PID values >= 1536. A special meaning is then assigned to * protocol value of 0: "default priority. 
For use when priority is not diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 8f3dd3b1d2d0..c4bbac99740d 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -242,6 +242,8 @@ static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, /** * dccp_ackvec_input - Register incoming packet in the buffer + * @av: Ack Vector to register packet to + * @skb: Packet to register */ void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) { @@ -273,6 +275,9 @@ void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) /** * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection + * @av: Ack Vector record to clean + * @ackno: last Ack Vector which has been acknowledged + * * This routine is called when the peer acknowledges the receipt of Ack Vectors * up to and including @ackno. While based on section A.3 of RFC 4340, here * are additional precautions to prevent corrupted buffer state. In particular, diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 1e9bb121ba72..6beac5d348e2 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -76,7 +76,7 @@ int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, return err; } -static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...) +static __printf(3, 4) struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...) { struct kmem_cache *slab; va_list args; diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 3da1f77bd039..4d9823d6dced 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -181,6 +181,9 @@ MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation"); /** * ccid2_update_used_window - Track how much of cwnd is actually used + * @hc: socket to update window + * @new_wnd: new window values to add into the filter + * * This is done in addition to CWV. 
The sender needs to have an idea of how many * packets may be in flight, to set the local Sequence Window value accordingly * (RFC 4340, 7.5.2). The CWV mechanism is exploited to keep track of the @@ -349,6 +352,8 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) /** * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm + * @sk: socket to perform estimator on + * * This code is almost identical with TCP's tcp_rtt_estimator(), since * - it has a higher sampling frequency (recommended by RFC 1323), * - the RTO does not collapse into RTT due to RTTVAR going towards zero, diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index b9ee1a4a8955..ca8670f78ac6 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -79,6 +79,8 @@ static inline u64 rfc3390_initial_rate(struct sock *sk) /** * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst + * @hc: socket to have the send interval updated + * * This respects the granularity of X_inst (64 * bytes/second). */ static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc) @@ -99,6 +101,7 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now) /** * ccid3_hc_tx_update_x - Update allowed sending rate X + * @sk: socket to be updated * @stamp: most recent time if available - can be left NULL. * * This function tracks draft rfc3448bis, check there for latest details. @@ -151,6 +154,7 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) /** * ccid3_hc_tx_update_s - Track the mean packet size `s' + * @hc: socket to be updated * @len: DCCP packet payload size in bytes * * cf. 
RFC 4342, 5.3 and RFC 3448, 4.1 @@ -259,6 +263,7 @@ out: /** * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets + * @sk: socket to send packet from * @skb: next packet candidate to send on @sk * * This function uses the convention of ccid_packet_dequeue_eval() and @@ -655,6 +660,7 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) /** * ccid3_first_li - Implements [RFC 5348, 6.3.1] + * @sk: socket to calculate loss interval for * * Determine the length of the first loss interval via inverse lookup. * Assume that X_recv can be computed by the throughput equation diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index 67abad695e66..da95319842bb 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -79,6 +79,9 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) /** * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 + * @lh: histogram to update + * @skb: received socket triggering loss interval update + * * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev */ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index af08e2df7108..0cdda3c66fb5 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -385,6 +385,9 @@ static inline struct tfrc_rx_hist_entry * /** * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal + * @h: receive histogram + * @skb: packet containing timestamp. + * * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able * to compute a sample with given data - calling function should check this. 
*/ diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 788dd629c420..54086bb05c42 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -371,7 +371,7 @@ static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) fval->sp.vec = kmemdup(val, len, gfp_any()); if (fval->sp.vec == NULL) { fval->sp.len = 0; - return -ENOBUFS; + return -ENOMEM; } } return 0; @@ -996,6 +996,8 @@ int dccp_feat_finalise_settings(struct dccp_sock *dp) /** * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features + * @dreq: server socket to resolve + * * It is the server which resolves the dependencies once the CCID has been * fully negotiated. If no CCID has been negotiated, it uses the default CCID. */ @@ -1033,6 +1035,10 @@ static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) /** * dccp_feat_prefer - Move preferred entry to the start of array + * @preferred_value: entry to move to start of array + * @array: array of preferred entries + * @array_len: size of the array + * * Reorder the @array_len elements in @array so that @preferred_value comes * first. Returns >0 to indicate that @preferred_value does occur in @array. 
*/ diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index bb3d70664dde..2455b0c0e486 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -427,7 +427,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); if (*own_req) ireq->ireq_opt = NULL; else @@ -464,7 +464,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, .fl4_dport = dccp_hdr(skb)->dccph_sport, }; - security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ef4ab28cfde0..1f73603913f5 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -203,7 +203,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req fl6.flowi6_oif = ireq->ir_iif; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = htons(ireq->ir_num); - security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); rcu_read_lock(); @@ -279,7 +279,7 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) fl6.flowi6_oif = inet6_iif(rxskb); fl6.fl6_dport = dccp_hdr(skb)->dccph_dport; fl6.fl6_sport = dccp_hdr(skb)->dccph_sport; - security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(rxskb, flowi6_to_flowi_common(&fl6)); /* sk = NULL, but it is safe for now. RST socket required. 
*/ dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); @@ -533,7 +533,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, dccp_done(newsk); goto out; } - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); /* Clone pktoptions received with SYN, if we own the req */ if (*own_req && ireq->pktopts) { newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC); @@ -907,7 +907,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); diff --git a/net/dccp/output.c b/net/dccp/output.c index 50e6d5699bb2..b8a24734385e 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -143,6 +143,8 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) /** * dccp_determine_ccmps - Find out about CCID-specific packet-size limits + * @dp: socket to find packet size limits of + * * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.), * since the RX CCID is restricted to feedback packets (Acks), which are small * in comparison with the data traffic. A value of 0 means "no current CCMPS". @@ -236,6 +238,8 @@ static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) /** * dccp_xmit_packet - Send data packet under control of CCID + * @sk: socket to send data packet on + * * Transmits next-queued payload and informs CCID to account for the packet. 
*/ static void dccp_xmit_packet(struct sock *sk) @@ -296,6 +300,9 @@ static void dccp_xmit_packet(struct sock *sk) /** * dccp_flush_write_queue - Drain queue at end of connection + * @sk: socket to be drained + * @time_budget: time allowed to drain the queue + * * Since dccp_sendmsg queues packets without waiting for them to be sent, it may * happen that the TX queue is not empty at the end of a connection. We give the * HC-sender CCID a grace period of up to @time_budget jiffies. If this function @@ -367,6 +374,8 @@ void dccp_write_xmit(struct sock *sk) /** * dccp_retransmit_skb - Retransmit Request, Close, or CloseReq packets + * @sk: socket to perform retransmit on + * * There are only four retransmittable packet types in DCCP: * - Request in client-REQUEST state (sec. 8.1.1), * - CloseReq in server-CLOSEREQ state (sec. 8.3), diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c index db2448c33a62..5ba204ec0aca 100644 --- a/net/dccp/qpolicy.c +++ b/net/dccp/qpolicy.c @@ -65,14 +65,16 @@ static bool qpolicy_prio_full(struct sock *sk) * @push: add a new @skb to the write queue * @full: indicates that no more packets will be admitted * @top: peeks at whatever the queueing policy defines as its `top' + * @params: parameter passed to policy operation */ -static struct dccp_qpolicy_operations { +struct dccp_qpolicy_operations { void (*push) (struct sock *sk, struct sk_buff *skb); bool (*full) (struct sock *sk); struct sk_buff* (*top) (struct sock *sk); __be32 params; +}; -} qpol_table[DCCPQ_POLICY_MAX] = { +static struct dccp_qpolicy_operations qpol_table[DCCPQ_POLICY_MAX] = { [DCCPQ_POLICY_SIMPLE] = { .push = qpolicy_simple_push, .full = qpolicy_simple_full, diff --git a/net/dccp/timer.c b/net/dccp/timer.c index a934d2932373..db768f223ef7 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -215,13 +215,14 @@ out: /** * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface - * @data: Socket to act on + * @t: pointer to the tasklet associated with this 
handler * * See the comments above %ccid_dequeueing_decision for supported modes. */ -static void dccp_write_xmitlet(unsigned long data) +static void dccp_write_xmitlet(struct tasklet_struct *t) { - struct sock *sk = (struct sock *)data; + struct dccp_sock *dp = from_tasklet(dp, t, dccps_xmitlet); + struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk; bh_lock_sock(sk); if (sock_owned_by_user(sk)) @@ -235,16 +236,15 @@ static void dccp_write_xmitlet(unsigned long data) static void dccp_write_xmit_timer(struct timer_list *t) { struct dccp_sock *dp = from_timer(dp, t, dccps_xmit_timer); - struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk; - dccp_write_xmitlet((unsigned long)sk); + dccp_write_xmitlet(&dp->dccps_xmitlet); } void dccp_init_xmit_timers(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); - tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk); + tasklet_setup(&dp->dccps_xmitlet, dccp_write_xmitlet); timer_setup(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 0); inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, &dccp_keepalive_timer); diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 15d42353f1a3..d1c50a48614b 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -658,7 +658,7 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, ifa->ifa_dev = dn_db; if (tb[IFA_LABEL]) - nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); + nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 4cac31d22a50..2193ae529e75 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1035,7 +1035,7 @@ source_ok: fld.saddr = dnet_select_source(dev_out, 0, RT_SCOPE_HOST); if (!fld.daddr) - goto out; + goto done; } fld.flowidn_oif = LOOPBACK_IFINDEX; res.type = RTN_LOCAL; diff --git a/net/dns_resolver/Kconfig b/net/dns_resolver/Kconfig index 255df9b6e9e8..155b06163409 
100644 --- a/net/dns_resolver/Kconfig +++ b/net/dns_resolver/Kconfig @@ -4,7 +4,7 @@ # config DNS_RESOLVER tristate "DNS Resolver support" - depends on NET && KEYS + depends on KEYS help Saying Y here will include support for the DNS Resolver key type which can be used to make upcalls to perform DNS lookups in diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 1f9b9b11008c..3589224c8da9 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -9,6 +9,7 @@ menuconfig NET_DSA tristate "Distributed Switch Architecture" depends on HAVE_NET_DSA depends on BRIDGE || BRIDGE=n + depends on HSR || HSR=n select GRO_CELLS select NET_SWITCHDEV select PHYLINK @@ -56,20 +57,31 @@ config NET_DSA_TAG_BRCM_PREPEND Broadcom switches which places the tag before the Ethernet header (prepended). +config NET_DSA_TAG_HELLCREEK + tristate "Tag driver for Hirschmann Hellcreek TSN switches" + help + Say Y or M if you want to enable support for tagging frames + for the Hirschmann Hellcreek TSN switches. + config NET_DSA_TAG_GSWIP tristate "Tag driver for Lantiq / Intel GSWIP switches" help Say Y or M if you want to enable support for tagging frames for the Lantiq / Intel GSWIP switches. +config NET_DSA_TAG_DSA_COMMON + tristate + config NET_DSA_TAG_DSA tristate "Tag driver for Marvell switches using DSA headers" + select NET_DSA_TAG_DSA_COMMON help Say Y or M if you want to enable support for tagging frames for the Marvell switches which use DSA headers. config NET_DSA_TAG_EDSA tristate "Tag driver for Marvell switches using EtherType DSA headers" + select NET_DSA_TAG_DSA_COMMON help Say Y or M if you want to enable support for tagging frames for the Marvell switches which use EtherType DSA headers. @@ -94,11 +106,26 @@ config NET_DSA_TAG_RTL4_A the Realtek RTL8366RB. 
config NET_DSA_TAG_OCELOT - tristate "Tag driver for Ocelot family of switches" + tristate "Tag driver for Ocelot family of switches, using NPI port" select PACKING help - Say Y or M if you want to enable support for tagging frames for the - Ocelot switches (VSC7511, VSC7512, VSC7513, VSC7514, VSC9959). + Say Y or M if you want to enable NPI tagging for the Ocelot switches + (VSC7511, VSC7512, VSC7513, VSC7514, VSC9953, VSC9959). In this mode, + the frames over the Ethernet CPU port are prepended with a + hardware-defined injection/extraction frame header. Flow control + (PAUSE frames) over the CPU port is not supported when operating in + this mode. + +config NET_DSA_TAG_OCELOT_8021Q + tristate "Tag driver for Ocelot family of switches, using VLAN" + select NET_DSA_TAG_8021Q + help + Say Y or M if you want to enable support for tagging frames with a + custom VLAN-based header. Frames that require timestamping, such as + PTP, are not delivered over Ethernet but over register-based MMIO. + Flow control over the CPU port is functional in this mode. When using + this mode, less TCAM resources (VCAP IS1, IS2, ES0) are available for + use with tc-flower. config NET_DSA_TAG_QCA tristate "Tag driver for Qualcomm Atheros QCA8K switches" @@ -128,4 +155,10 @@ config NET_DSA_TAG_TRAILER Say Y or M if you want to enable support for tagging frames at with a trailed. e.g. Marvell 88E6060. +config NET_DSA_TAG_XRS700X + tristate "Tag driver for XRS700x switches" + help + Say Y or M if you want to enable support for tagging frames for + Arrow SpeedChips XRS700x switches that use a single byte tag trailer. 
+ endif diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 4f47b2025ff5..44bc79952b8b 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -7,14 +7,16 @@ dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o obj-$(CONFIG_NET_DSA_TAG_8021Q) += tag_8021q.o obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o obj-$(CONFIG_NET_DSA_TAG_BRCM_COMMON) += tag_brcm.o -obj-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o -obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o +obj-$(CONFIG_NET_DSA_TAG_DSA_COMMON) += tag_dsa.o obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o +obj-$(CONFIG_NET_DSA_TAG_HELLCREEK) += tag_hellcreek.o obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o obj-$(CONFIG_NET_DSA_TAG_RTL4_A) += tag_rtl4_a.o obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o +obj-$(CONFIG_NET_DSA_TAG_OCELOT_8021Q) += tag_ocelot_8021q.o obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o +obj-$(CONFIG_NET_DSA_TAG_XRS700X) += tag_xrs700x.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 2131bf2b3a67..84cad1be9ce4 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -84,6 +84,32 @@ const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops) return ops->name; }; +/* Function takes a reference on the module owning the tagger, + * so dsa_tag_driver_put must be called afterwards. 
+ */ +const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf) +{ + const struct dsa_device_ops *ops = ERR_PTR(-ENOPROTOOPT); + struct dsa_tag_driver *dsa_tag_driver; + + mutex_lock(&dsa_tag_drivers_lock); + list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { + const struct dsa_device_ops *tmp = dsa_tag_driver->ops; + + if (!sysfs_streq(buf, tmp->name)) + continue; + + if (!try_module_get(dsa_tag_driver->owner)) + break; + + ops = tmp; + break; + } + mutex_unlock(&dsa_tag_drivers_lock); + + return ops; +} + const struct dsa_device_ops *dsa_tag_driver_get(int tag_protocol) { struct dsa_tag_driver *dsa_tag_driver; @@ -201,7 +227,6 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, { struct dsa_port *cpu_dp = dev->dsa_ptr; struct sk_buff *nskb = NULL; - struct pcpu_sw_netstats *s; struct dsa_slave_priv *p; if (unlikely(!cpu_dp)) { @@ -220,11 +245,21 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, } skb = nskb; - p = netdev_priv(skb->dev); skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); + if (unlikely(!dsa_slave_dev_check(skb->dev))) { + /* Packet is to be injected directly on an upper + * device, e.g. a team/bond, so skip all DSA-port + * specific actions. 
+ */ + netif_rx(skb); + return 0; + } + + p = netdev_priv(skb->dev); + if (unlikely(cpu_dp->ds->untag_bridge_pvid)) { nskb = dsa_untag_bridge_pvid(skb); if (!nskb) { @@ -234,11 +269,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, skb = nskb; } - s = this_cpu_ptr(p->stats64); - u64_stats_update_begin(&s->syncp); - s->rx_packets++; - s->rx_bytes += skb->len; - u64_stats_update_end(&s->syncp); + dev_sw_netstats_rx_add(skb->dev, skb->len); if (dsa_skb_defer_rx_timestamp(p, skb)) return 0; @@ -314,28 +345,6 @@ bool dsa_schedule_work(struct work_struct *work) return queue_work(dsa_owq, work); } -static ATOMIC_NOTIFIER_HEAD(dsa_notif_chain); - -int register_dsa_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_register(&dsa_notif_chain, nb); -} -EXPORT_SYMBOL_GPL(register_dsa_notifier); - -int unregister_dsa_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&dsa_notif_chain, nb); -} -EXPORT_SYMBOL_GPL(unregister_dsa_notifier); - -int call_dsa_notifiers(unsigned long val, struct net_device *dev, - struct dsa_notifier_info *info) -{ - info->dev = dev; - return atomic_notifier_call_chain(&dsa_notif_chain, val, info); -} -EXPORT_SYMBOL_GPL(call_dsa_notifiers); - int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx) { diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 183003e45762..4d4956ed303b 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -21,6 +21,108 @@ static DEFINE_MUTEX(dsa2_mutex); LIST_HEAD(dsa_tree_list); +/** + * dsa_tree_notify - Execute code for all switches in a DSA switch tree. + * @dst: collection of struct dsa_switch devices to notify. + * @e: event, must be of type DSA_NOTIFIER_* + * @v: event-specific value. + * + * Given a struct dsa_switch_tree, this can be used to run a function once for + * each member DSA switch. The other alternative of traversing the tree is only + * through its ports list, which does not uniquely list the switches. 
+ */ +int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v) +{ + struct raw_notifier_head *nh = &dst->nh; + int err; + + err = raw_notifier_call_chain(nh, e, v); + + return notifier_to_errno(err); +} + +/** + * dsa_broadcast - Notify all DSA trees in the system. + * @e: event, must be of type DSA_NOTIFIER_* + * @v: event-specific value. + * + * Can be used to notify the switching fabric of events such as cross-chip + * bridging between disjoint trees (such as islands of tagger-compatible + * switches bridged by an incompatible middle switch). + */ +int dsa_broadcast(unsigned long e, void *v) +{ + struct dsa_switch_tree *dst; + int err = 0; + + list_for_each_entry(dst, &dsa_tree_list, list) { + err = dsa_tree_notify(dst, e, v); + if (err) + break; + } + + return err; +} + +/** + * dsa_lag_map() - Map LAG netdev to a linear LAG ID + * @dst: Tree in which to record the mapping. + * @lag: Netdev that is to be mapped to an ID. + * + * dsa_lag_id/dsa_lag_dev can then be used to translate between the + * two spaces. The size of the mapping space is determined by the + * driver by setting ds->num_lag_ids. It is perfectly legal to leave + * it unset if it is not needed, in which case these functions become + * no-ops. + */ +void dsa_lag_map(struct dsa_switch_tree *dst, struct net_device *lag) +{ + unsigned int id; + + if (dsa_lag_id(dst, lag) >= 0) + /* Already mapped */ + return; + + for (id = 0; id < dst->lags_len; id++) { + if (!dsa_lag_dev(dst, id)) { + dst->lags[id] = lag; + return; + } + } + + /* No IDs left, which is OK. Some drivers do not need it. The + * ones that do, e.g. mv88e6xxx, will discover that dsa_lag_id + * returns an error for this device when joining the LAG. The + * driver can then return -EOPNOTSUPP back to DSA, which will + * fall back to a software LAG. + */ +} + +/** + * dsa_lag_unmap() - Remove a LAG ID mapping + * @dst: Tree in which the mapping is recorded. + * @lag: Netdev that was mapped. 
+ * + * As there may be multiple users of the mapping, it is only removed + * if there are no other references to it. + */ +void dsa_lag_unmap(struct dsa_switch_tree *dst, struct net_device *lag) +{ + struct dsa_port *dp; + unsigned int id; + + dsa_lag_foreach_port(dp, dst, lag) + /* There are remaining users of this mapping */ + return; + + dsa_lags_foreach_id(id, dst) { + if (dsa_lag_dev(dst, id) == lag) { + dst->lags[id] = NULL; + break; + } + } +} + struct dsa_switch *dsa_switch_find(int tree_index, int sw_index) { struct dsa_switch_tree *dst; @@ -77,6 +179,8 @@ static struct dsa_switch_tree *dsa_tree_alloc(int index) static void dsa_tree_free(struct dsa_switch_tree *dst) { + if (dst->tag_ops) + dsa_tag_driver_put(dst->tag_ops); list_del(&dst->list); kfree(dst); } @@ -353,15 +457,18 @@ static int dsa_port_devlink_setup(struct dsa_port *dp) static void dsa_port_teardown(struct dsa_port *dp) { + struct devlink_port *dlp = &dp->devlink_port; + if (!dp->setup) return; + devlink_port_type_clear(dlp); + switch (dp->type) { case DSA_PORT_TYPE_UNUSED: break; case DSA_PORT_TYPE_CPU: dsa_port_disable(dp); - dsa_tag_driver_put(dp->tag_ops); dsa_port_link_unregister_of(dp); break; case DSA_PORT_TYPE_DSA: @@ -400,8 +507,165 @@ static int dsa_devlink_info_get(struct devlink *dl, return -EOPNOTSUPP; } +static int dsa_devlink_sb_pool_get(struct devlink *dl, + unsigned int sb_index, u16 pool_index, + struct devlink_sb_pool_info *pool_info) +{ + struct dsa_switch *ds = dsa_devlink_to_ds(dl); + + if (!ds->ops->devlink_sb_pool_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_pool_get(ds, sb_index, pool_index, + pool_info); +} + +static int dsa_devlink_sb_pool_set(struct devlink *dl, unsigned int sb_index, + u16 pool_index, u32 size, + enum devlink_sb_threshold_type threshold_type, + struct netlink_ext_ack *extack) +{ + struct dsa_switch *ds = dsa_devlink_to_ds(dl); + + if (!ds->ops->devlink_sb_pool_set) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_pool_set(ds, 
sb_index, pool_index, size, + threshold_type, extack); +} + +static int dsa_devlink_sb_port_pool_get(struct devlink_port *dlp, + unsigned int sb_index, u16 pool_index, + u32 *p_threshold) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_port_pool_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_port_pool_get(ds, port, sb_index, + pool_index, p_threshold); +} + +static int dsa_devlink_sb_port_pool_set(struct devlink_port *dlp, + unsigned int sb_index, u16 pool_index, + u32 threshold, + struct netlink_ext_ack *extack) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_port_pool_set) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_port_pool_set(ds, port, sb_index, + pool_index, threshold, extack); +} + +static int +dsa_devlink_sb_tc_pool_bind_get(struct devlink_port *dlp, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 *p_pool_index, u32 *p_threshold) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_tc_pool_bind_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_tc_pool_bind_get(ds, port, sb_index, + tc_index, pool_type, + p_pool_index, p_threshold); +} + +static int +dsa_devlink_sb_tc_pool_bind_set(struct devlink_port *dlp, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 pool_index, u32 threshold, + struct netlink_ext_ack *extack) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_tc_pool_bind_set) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_tc_pool_bind_set(ds, port, sb_index, + tc_index, pool_type, + pool_index, threshold, + extack); +} + +static int dsa_devlink_sb_occ_snapshot(struct devlink *dl, + unsigned int sb_index) +{ + struct dsa_switch 
*ds = dsa_devlink_to_ds(dl); + + if (!ds->ops->devlink_sb_occ_snapshot) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_occ_snapshot(ds, sb_index); +} + +static int dsa_devlink_sb_occ_max_clear(struct devlink *dl, + unsigned int sb_index) +{ + struct dsa_switch *ds = dsa_devlink_to_ds(dl); + + if (!ds->ops->devlink_sb_occ_max_clear) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_occ_max_clear(ds, sb_index); +} + +static int dsa_devlink_sb_occ_port_pool_get(struct devlink_port *dlp, + unsigned int sb_index, + u16 pool_index, u32 *p_cur, + u32 *p_max) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_occ_port_pool_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_occ_port_pool_get(ds, port, sb_index, + pool_index, p_cur, p_max); +} + +static int +dsa_devlink_sb_occ_tc_port_bind_get(struct devlink_port *dlp, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u32 *p_cur, u32 *p_max) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_occ_tc_port_bind_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_occ_tc_port_bind_get(ds, port, + sb_index, tc_index, + pool_type, p_cur, + p_max); +} + static const struct devlink_ops dsa_devlink_ops = { - .info_get = dsa_devlink_info_get, + .info_get = dsa_devlink_info_get, + .sb_pool_get = dsa_devlink_sb_pool_get, + .sb_pool_set = dsa_devlink_sb_pool_set, + .sb_port_pool_get = dsa_devlink_sb_port_pool_get, + .sb_port_pool_set = dsa_devlink_sb_port_pool_set, + .sb_tc_pool_bind_get = dsa_devlink_sb_tc_pool_bind_get, + .sb_tc_pool_bind_set = dsa_devlink_sb_tc_pool_bind_set, + .sb_occ_snapshot = dsa_devlink_sb_occ_snapshot, + .sb_occ_max_clear = dsa_devlink_sb_occ_max_clear, + .sb_occ_port_pool_get = dsa_devlink_sb_occ_port_pool_get, + .sb_occ_tc_port_bind_get = dsa_devlink_sb_occ_tc_port_bind_get, }; static int 
dsa_switch_setup(struct dsa_switch *ds) @@ -448,6 +712,8 @@ static int dsa_switch_setup(struct dsa_switch *ds) if (err) goto unregister_devlink_ports; + ds->configure_vlan_while_not_filtering = true; + err = ds->ops->setup(ds); if (err < 0) goto unregister_notifier; @@ -458,20 +724,23 @@ static int dsa_switch_setup(struct dsa_switch *ds) ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) { err = -ENOMEM; - goto unregister_notifier; + goto teardown; } dsa_slave_mii_bus_init(ds); err = mdiobus_register(ds->slave_mii_bus); if (err < 0) - goto unregister_notifier; + goto teardown; } ds->setup = true; return 0; +teardown: + if (ds->ops->teardown) + ds->ops->teardown(ds); unregister_notifier: dsa_switch_unregister_notifier(ds); unregister_devlink_ports: @@ -578,6 +847,32 @@ static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) dsa_master_teardown(dp->master); } +static int dsa_tree_setup_lags(struct dsa_switch_tree *dst) +{ + unsigned int len = 0; + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) { + if (dp->ds->num_lag_ids > len) + len = dp->ds->num_lag_ids; + } + + if (!len) + return 0; + + dst->lags = kcalloc(len, sizeof(*dst->lags), GFP_KERNEL); + if (!dst->lags) + return -ENOMEM; + + dst->lags_len = len; + return 0; +} + +static void dsa_tree_teardown_lags(struct dsa_switch_tree *dst) +{ + kfree(dst->lags); +} + static int dsa_tree_setup(struct dsa_switch_tree *dst) { bool complete; @@ -605,12 +900,18 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst) if (err) goto teardown_switches; + err = dsa_tree_setup_lags(dst); + if (err) + goto teardown_master; + dst->setup = true; pr_info("DSA: tree %d setup\n", dst->index); return 0; +teardown_master: + dsa_tree_teardown_master(dst); teardown_switches: dsa_tree_teardown_switches(dst); teardown_default_cpu: @@ -626,6 +927,8 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst) if (!dst->setup) return; + dsa_tree_teardown_lags(dst); + 
dsa_tree_teardown_master(dst); dsa_tree_teardown_switches(dst); @@ -642,6 +945,57 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst) dst->setup = false; } +/* Since the dsa/tagging sysfs device attribute is per master, the assumption + * is that all DSA switches within a tree share the same tagger, otherwise + * they would have formed disjoint trees (different "dsa,member" values). + */ +int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, + struct net_device *master, + const struct dsa_device_ops *tag_ops, + const struct dsa_device_ops *old_tag_ops) +{ + struct dsa_notifier_tag_proto_info info; + struct dsa_port *dp; + int err = -EBUSY; + + if (!rtnl_trylock()) + return restart_syscall(); + + /* At the moment we don't allow changing the tag protocol under + * traffic. The rtnl_mutex also happens to serialize concurrent + * attempts to change the tagging protocol. If we ever lift the IFF_UP + * restriction, there needs to be another mutex which serializes this. + */ + if (master->flags & IFF_UP) + goto out_unlock; + + list_for_each_entry(dp, &dst->ports, list) { + if (!dsa_is_user_port(dp->ds, dp->index)) + continue; + + if (dp->slave->flags & IFF_UP) + goto out_unlock; + } + + info.tag_ops = tag_ops; + err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info); + if (err) + goto out_unwind_tagger; + + dst->tag_ops = tag_ops; + + rtnl_unlock(); + + return 0; + +out_unwind_tagger: + info.tag_ops = old_tag_ops; + dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info); +out_unlock: + rtnl_unlock(); + return err; +} + static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) { struct dsa_switch_tree *dst = ds->dst; @@ -712,24 +1066,33 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) { struct dsa_switch *ds = dp->ds; struct dsa_switch_tree *dst = ds->dst; - const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; tag_protocol = dsa_get_tag_protocol(dp, master); - tag_ops = 
dsa_tag_driver_get(tag_protocol); - if (IS_ERR(tag_ops)) { - if (PTR_ERR(tag_ops) == -ENOPROTOOPT) - return -EPROBE_DEFER; - dev_warn(ds->dev, "No tagger for this switch\n"); - dp->master = NULL; - return PTR_ERR(tag_ops); + if (dst->tag_ops) { + if (dst->tag_ops->proto != tag_protocol) { + dev_err(ds->dev, + "A DSA switch tree can have only one tagging protocol\n"); + return -EINVAL; + } + /* In the case of multiple CPU ports per switch, the tagging + * protocol is still reference-counted only per switch tree, so + * nothing to do here. + */ + } else { + dst->tag_ops = dsa_tag_driver_get(tag_protocol); + if (IS_ERR(dst->tag_ops)) { + if (PTR_ERR(dst->tag_ops) == -ENOPROTOOPT) + return -EPROBE_DEFER; + dev_warn(ds->dev, "No tagger for this switch\n"); + dp->master = NULL; + return PTR_ERR(dst->tag_ops); + } } dp->master = master; dp->type = DSA_PORT_TYPE_CPU; - dp->filter = tag_ops->filter; - dp->rcv = tag_ops->rcv; - dp->tag_ops = tag_ops; + dsa_port_set_tag_protocol(dp, dst->tag_ops); dp->dst = dst; return 0; @@ -783,6 +1146,8 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, goto out_put_node; if (reg >= ds->num_ports) { + dev_err(ds->dev, "port %pOF index %u exceeds num_ports (%zu)\n", + port, reg, ds->num_ports); err = -EINVAL; goto out_put_node; } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 12998bf04e55..2eeaa42f2e08 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -20,16 +20,25 @@ enum { DSA_NOTIFIER_BRIDGE_LEAVE, DSA_NOTIFIER_FDB_ADD, DSA_NOTIFIER_FDB_DEL, + DSA_NOTIFIER_HSR_JOIN, + DSA_NOTIFIER_HSR_LEAVE, + DSA_NOTIFIER_LAG_CHANGE, + DSA_NOTIFIER_LAG_JOIN, + DSA_NOTIFIER_LAG_LEAVE, DSA_NOTIFIER_MDB_ADD, DSA_NOTIFIER_MDB_DEL, DSA_NOTIFIER_VLAN_ADD, DSA_NOTIFIER_VLAN_DEL, DSA_NOTIFIER_MTU, + DSA_NOTIFIER_TAG_PROTO, + DSA_NOTIFIER_MRP_ADD, + DSA_NOTIFIER_MRP_DEL, + DSA_NOTIFIER_MRP_ADD_RING_ROLE, + DSA_NOTIFIER_MRP_DEL_RING_ROLE, }; /* DSA_NOTIFIER_AGEING_TIME */ struct dsa_notifier_ageing_time_info { - struct 
switchdev_trans *trans; unsigned int ageing_time; }; @@ -52,17 +61,25 @@ struct dsa_notifier_fdb_info { /* DSA_NOTIFIER_MDB_* */ struct dsa_notifier_mdb_info { const struct switchdev_obj_port_mdb *mdb; - struct switchdev_trans *trans; int sw_index; int port; }; +/* DSA_NOTIFIER_LAG_* */ +struct dsa_notifier_lag_info { + struct net_device *lag; + int sw_index; + int port; + + struct netdev_lag_upper_info *info; +}; + /* DSA_NOTIFIER_VLAN_* */ struct dsa_notifier_vlan_info { const struct switchdev_obj_port_vlan *vlan; - struct switchdev_trans *trans; int sw_index; int port; + struct netlink_ext_ack *extack; }; /* DSA_NOTIFIER_MTU */ @@ -73,13 +90,49 @@ struct dsa_notifier_mtu_info { int mtu; }; +/* DSA_NOTIFIER_TAG_PROTO_* */ +struct dsa_notifier_tag_proto_info { + const struct dsa_device_ops *tag_ops; +}; + +/* DSA_NOTIFIER_MRP_* */ +struct dsa_notifier_mrp_info { + const struct switchdev_obj_mrp *mrp; + int sw_index; + int port; +}; + +/* DSA_NOTIFIER_MRP_* */ +struct dsa_notifier_mrp_ring_role_info { + const struct switchdev_obj_ring_role_mrp *mrp; + int sw_index; + int port; +}; + +struct dsa_switchdev_event_work { + struct dsa_switch *ds; + int port; + struct work_struct work; + unsigned long event; + /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and + * SWITCHDEV_FDB_DEL_TO_DEVICE + */ + unsigned char addr[ETH_ALEN]; + u16 vid; +}; + +/* DSA_NOTIFIER_HSR_* */ +struct dsa_notifier_hsr_info { + struct net_device *hsr; + int sw_index; + int port; +}; + struct dsa_slave_priv { /* Copy of CPU port xmit for faster access in slave transmit hot path */ struct sk_buff * (*xmit)(struct sk_buff *skb, struct net_device *dev); - struct pcpu_sw_netstats __percpu *stats64; - struct gro_cells gcells; /* DSA port data, such as switch, port index, etc. 
*/ @@ -96,19 +149,11 @@ struct dsa_slave_priv { /* dsa.c */ const struct dsa_device_ops *dsa_tag_driver_get(int tag_protocol); void dsa_tag_driver_put(const struct dsa_device_ops *ops); +const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf); bool dsa_schedule_work(struct work_struct *work); const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); -int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid, - u16 flags, - struct netlink_ext_ack *extack); -int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid); - /* master.c */ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp); void dsa_master_teardown(struct net_device *dev); @@ -129,19 +174,24 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, } /* port.c */ -int dsa_port_set_state(struct dsa_port *dp, u8 state, - struct switchdev_trans *trans); +void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, + const struct dsa_device_ops *tag_ops); +int dsa_port_set_state(struct dsa_port *dp, u8 state); int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy); int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy); void dsa_port_disable_rt(struct dsa_port *dp); void dsa_port_disable(struct dsa_port *dp); int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); +int dsa_port_lag_change(struct dsa_port *dp, + struct netdev_lag_lower_state_info *linfo); +int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev, + struct netdev_lag_upper_info *uinfo); +void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev); int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, - struct switchdev_trans *trans); + struct netlink_ext_ack *extack); bool 
dsa_port_skip_vlan_configuration(struct dsa_port *dp); -int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, - struct switchdev_trans *trans); +int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock); int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu, bool propagate_upstream); int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, @@ -150,35 +200,84 @@ int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, u16 vid); int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data); int dsa_port_mdb_add(const struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb, - struct switchdev_trans *trans); + const struct switchdev_obj_port_mdb *mdb); int dsa_port_mdb_del(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); -int dsa_port_pre_bridge_flags(const struct dsa_port *dp, unsigned long flags, - struct switchdev_trans *trans); -int dsa_port_bridge_flags(const struct dsa_port *dp, unsigned long flags, - struct switchdev_trans *trans); +int dsa_port_pre_bridge_flags(const struct dsa_port *dp, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack); +int dsa_port_bridge_flags(const struct dsa_port *dp, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack); int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, - struct switchdev_trans *trans); + struct netlink_ext_ack *extack); int dsa_port_vlan_add(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan, - struct switchdev_trans *trans); + struct netlink_ext_ack *extack); int dsa_port_vlan_del(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan); +int dsa_port_mrp_add(const struct dsa_port *dp, + const struct switchdev_obj_mrp *mrp); +int dsa_port_mrp_del(const struct dsa_port *dp, + const struct switchdev_obj_mrp *mrp); +int dsa_port_mrp_add_ring_role(const struct dsa_port *dp, + const struct switchdev_obj_ring_role_mrp *mrp); +int dsa_port_mrp_del_ring_role(const 
struct dsa_port *dp, + const struct switchdev_obj_ring_role_mrp *mrp); int dsa_port_link_register_of(struct dsa_port *dp); void dsa_port_link_unregister_of(struct dsa_port *dp); +int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr); +void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr); extern const struct phylink_mac_ops dsa_port_phylink_mac_ops; +static inline bool dsa_port_offloads_netdev(struct dsa_port *dp, + struct net_device *dev) +{ + /* Switchdev offloading can be configured on: */ + + if (dev == dp->slave) + /* DSA ports directly connected to a bridge, and event + * was emitted for the ports themselves. + */ + return true; + + if (dp->bridge_dev == dev) + /* DSA ports connected to a bridge, and event was emitted + * for the bridge. + */ + return true; + + if (dp->lag_dev == dev) + /* DSA ports connected to a bridge via a LAG */ + return true; + + return false; +} + +/* Returns true if any port of this tree offloads the given net_device */ +static inline bool dsa_tree_offloads_netdev(struct dsa_switch_tree *dst, + struct net_device *dev) +{ + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_offloads_netdev(dp, dev)) + return true; + + return false; +} + /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); int dsa_slave_create(struct dsa_port *dp); void dsa_slave_destroy(struct net_device *slave_dev); -bool dsa_slave_dev_check(const struct net_device *dev); int dsa_slave_suspend(struct net_device *slave_dev); int dsa_slave_resume(struct net_device *slave_dev); int dsa_slave_register_notifier(void); void dsa_slave_unregister_notifier(void); +void dsa_slave_setup_tagger(struct net_device *slave); +int dsa_slave_change_mtu(struct net_device *dev, int new_mtu); static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev) { @@ -259,6 +358,15 @@ int dsa_switch_register_notifier(struct dsa_switch *ds); void 
dsa_switch_unregister_notifier(struct dsa_switch *ds); /* dsa2.c */ +void dsa_lag_map(struct dsa_switch_tree *dst, struct net_device *lag); +void dsa_lag_unmap(struct dsa_switch_tree *dst, struct net_device *lag); +int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v); +int dsa_broadcast(unsigned long e, void *v); +int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, + struct net_device *master, + const struct dsa_device_ops *tag_ops, + const struct dsa_device_ops *old_tag_ops); + extern struct list_head dsa_tree_list; #endif diff --git a/net/dsa/master.c b/net/dsa/master.c index c91de041a91d..052a977914a6 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -280,7 +280,44 @@ static ssize_t tagging_show(struct device *d, struct device_attribute *attr, return sprintf(buf, "%s\n", dsa_tag_protocol_to_str(cpu_dp->tag_ops)); } -static DEVICE_ATTR_RO(tagging); + +static ssize_t tagging_store(struct device *d, struct device_attribute *attr, + const char *buf, size_t count) +{ + const struct dsa_device_ops *new_tag_ops, *old_tag_ops; + struct net_device *dev = to_net_dev(d); + struct dsa_port *cpu_dp = dev->dsa_ptr; + int err; + + old_tag_ops = cpu_dp->tag_ops; + new_tag_ops = dsa_find_tagger_by_name(buf); + /* Bad tagger name, or module is not loaded? */ + if (IS_ERR(new_tag_ops)) + return PTR_ERR(new_tag_ops); + + if (new_tag_ops == old_tag_ops) + /* Drop the temporarily held duplicate reference, since + * the DSA switch tree uses this tagger. + */ + goto out; + + err = dsa_tree_change_tag_proto(cpu_dp->ds->dst, dev, new_tag_ops, + old_tag_ops); + if (err) { + /* On failure the old tagger is restored, so we don't need the + * driver for the new one. 
+ */ + dsa_tag_driver_put(new_tag_ops); + return err; + } + + /* On success we no longer need the module for the old tagging protocol + */ +out: + dsa_tag_driver_put(old_tag_ops); + return count; +} +static DEVICE_ATTR_RW(tagging); static struct attribute *dsa_slave_attrs[] = { &dev_attr_tagging.attr, @@ -308,14 +345,25 @@ static struct lock_class_key dsa_master_addr_list_lock_key; int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { + int mtu = ETH_DATA_LEN + cpu_dp->tag_ops->overhead; + struct dsa_switch *ds = cpu_dp->ds; + struct device_link *consumer_link; int ret; + /* The DSA master must use SET_NETDEV_DEV for this to work. */ + consumer_link = device_link_add(ds->dev, dev->dev.parent, + DL_FLAG_AUTOREMOVE_CONSUMER); + if (!consumer_link) + netdev_err(dev, + "Failed to create a device link to DSA switch %s\n", + dev_name(ds->dev)); + rtnl_lock(); - ret = dev_set_mtu(dev, ETH_DATA_LEN + cpu_dp->tag_ops->overhead); + ret = dev_set_mtu(dev, mtu); rtnl_unlock(); if (ret) - netdev_warn(dev, "error %d setting MTU to include DSA overhead\n", - ret); + netdev_warn(dev, "error %d setting MTU to %d to include DSA overhead\n", + ret, mtu); /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get diff --git a/net/dsa/port.c b/net/dsa/port.c index 73569c9af3cc..c9c6d7ab3f47 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -13,44 +13,32 @@ #include "dsa_priv.h" -static int dsa_broadcast(unsigned long e, void *v) -{ - struct dsa_switch_tree *dst; - int err = 0; - - list_for_each_entry(dst, &dsa_tree_list, list) { - struct raw_notifier_head *nh = &dst->nh; - - err = raw_notifier_call_chain(nh, e, v); - err = notifier_to_errno(err); - if (err) - break; - } - - return err; -} - +/** + * dsa_port_notify - Notify the switching fabric of changes to a port + * @dp: port on which change occurred + * @e: event, must be of type DSA_NOTIFIER_* + * @v: event-specific value. 
+ * + * Notify all switches in the DSA tree that this port's switch belongs to, + * including this switch itself, of an event. Allows the other switches to + * reconfigure themselves for cross-chip operations. Can also be used to + * reconfigure ports without net_devices (CPU ports, DSA links) whenever + * a user port's state changes. + */ static int dsa_port_notify(const struct dsa_port *dp, unsigned long e, void *v) { - struct raw_notifier_head *nh = &dp->ds->dst->nh; - int err; - - err = raw_notifier_call_chain(nh, e, v); - - return notifier_to_errno(err); + return dsa_tree_notify(dp->ds->dst, e, v); } -int dsa_port_set_state(struct dsa_port *dp, u8 state, - struct switchdev_trans *trans) +int dsa_port_set_state(struct dsa_port *dp, u8 state) { struct dsa_switch *ds = dp->ds; int port = dp->index; - if (switchdev_trans_ph_prepare(trans)) - return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP; + if (!ds->ops->port_stp_state_set) + return -EOPNOTSUPP; - if (ds->ops->port_stp_state_set) - ds->ops->port_stp_state_set(ds, port, state); + ds->ops->port_stp_state_set(ds, port, state); if (ds->ops->port_fast_age) { /* Fast age FDB entries or flush appropriate forwarding database @@ -75,7 +63,7 @@ static void dsa_port_set_state_now(struct dsa_port *dp, u8 state) { int err; - err = dsa_port_set_state(dp, state, NULL); + err = dsa_port_set_state(dp, state); if (err) pr_err("DSA: failed to set STP state %u (%d)\n", state, err); } @@ -134,6 +122,28 @@ void dsa_port_disable(struct dsa_port *dp) rtnl_unlock(); } +static void dsa_port_change_brport_flags(struct dsa_port *dp, + bool bridge_offload) +{ + struct switchdev_brport_flags flags; + int flag; + + flags.mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; + if (bridge_offload) + flags.val = flags.mask; + else + flags.val = flags.mask & ~BR_LEARNING; + + for_each_set_bit(flag, &flags.mask, 32) { + struct switchdev_brport_flags tmp; + + tmp.val = flags.val & BIT(flag); + tmp.mask = BIT(flag); + + 
dsa_port_bridge_flags(dp, tmp, NULL); + } +} + int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) { struct dsa_notifier_bridge_info info = { @@ -144,10 +154,10 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) }; int err; - /* Set the flooding mode before joining the port in the switch */ - err = dsa_port_bridge_flags(dp, BR_FLOOD | BR_MCAST_FLOOD, NULL); - if (err) - return err; + /* Notify the port driver to set its configurable flags in a way that + * matches the initial settings of a bridge port. + */ + dsa_port_change_brport_flags(dp, true); /* Here the interface is already bridged. Reflect the current * configuration so that drivers can program their chips accordingly. @@ -158,7 +168,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) /* The bridging is rolled back on error */ if (err) { - dsa_port_bridge_flags(dp, 0, NULL); + dsa_port_change_brport_flags(dp, false); dp->bridge_dev = NULL; } @@ -184,8 +194,18 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) if (err) pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); - /* Port is leaving the bridge, disable flooding */ - dsa_port_bridge_flags(dp, 0, NULL); + /* Configure the port for standalone mode (no address learning, + * flood everything). + * The bridge only emits SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS events + * when the user requests it through netlink or sysfs, but not + * automatically at port join or leave, so we need to handle resetting + * the brport flags ourselves. But we even prefer it that way, because + * otherwise, some setups might never get the notification they need, + * for example, when a port leaves a LAG that offloads the bridge, + * it becomes standalone, but as far as the bridge is concerned, no + * port ever left. 
+ */ + dsa_port_change_brport_flags(dp, false); /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, * so allow it to be in BR_STATE_FORWARDING to be kept functional @@ -193,9 +213,89 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) dsa_port_set_state_now(dp, BR_STATE_FORWARDING); } +int dsa_port_lag_change(struct dsa_port *dp, + struct netdev_lag_lower_state_info *linfo) +{ + struct dsa_notifier_lag_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + }; + bool tx_enabled; + + if (!dp->lag_dev) + return 0; + + /* On statically configured aggregates (e.g. loadbalance + * without LACP) ports will always be tx_enabled, even if the + * link is down. Thus we require both link_up and tx_enabled + * in order to include it in the tx set. + */ + tx_enabled = linfo->link_up && linfo->tx_enabled; + + if (tx_enabled == dp->lag_tx_enabled) + return 0; + + dp->lag_tx_enabled = tx_enabled; + + return dsa_port_notify(dp, DSA_NOTIFIER_LAG_CHANGE, &info); +} + +int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag, + struct netdev_lag_upper_info *uinfo) +{ + struct dsa_notifier_lag_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .lag = lag, + .info = uinfo, + }; + int err; + + dsa_lag_map(dp->ds->dst, lag); + dp->lag_dev = lag; + + err = dsa_port_notify(dp, DSA_NOTIFIER_LAG_JOIN, &info); + if (err) { + dp->lag_dev = NULL; + dsa_lag_unmap(dp->ds->dst, lag); + } + + return err; +} + +void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag) +{ + struct dsa_notifier_lag_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .lag = lag, + }; + int err; + + if (!dp->lag_dev) + return; + + /* Port might have been part of a LAG that in turn was + * attached to a bridge. 
+ */ + if (dp->bridge_dev) + dsa_port_bridge_leave(dp, dp->bridge_dev); + + dp->lag_tx_enabled = false; + dp->lag_dev = NULL; + + err = dsa_port_notify(dp, DSA_NOTIFIER_LAG_LEAVE, &info); + if (err) + pr_err("DSA: failed to notify DSA_NOTIFIER_LAG_LEAVE: %d\n", + err); + + dsa_lag_unmap(dp->ds->dst, lag); +} + /* Must be called under rcu_read_lock() */ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, - bool vlan_filtering) + bool vlan_filtering, + struct netlink_ext_ack *extack) { struct dsa_switch *ds = dp->ds; int err, i; @@ -225,8 +325,8 @@ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, */ err = br_vlan_get_info(br, vid, &br_info); if (err == 0) { - dev_err(ds->dev, "Must remove upper %s first\n", - upper_dev->name); + NL_SET_ERR_MSG_MOD(extack, + "Must first remove VLAN uppers having VIDs also present in bridge"); return false; } } @@ -252,7 +352,8 @@ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, if (other_bridge == dp->bridge_dev) continue; if (br_vlan_enabled(other_bridge) != vlan_filtering) { - dev_err(ds->dev, "VLAN filtering is a global setting\n"); + NL_SET_ERR_MSG_MOD(extack, + "VLAN filtering is a global setting"); return false; } } @@ -260,42 +361,37 @@ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, } int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, - struct switchdev_trans *trans) + struct netlink_ext_ack *extack) { struct dsa_switch *ds = dp->ds; + bool apply; int err; - if (switchdev_trans_ph_prepare(trans)) { - bool apply; - - if (!ds->ops->port_vlan_filtering) - return -EOPNOTSUPP; + if (!ds->ops->port_vlan_filtering) + return -EOPNOTSUPP; - /* We are called from dsa_slave_switchdev_blocking_event(), - * which is not under rcu_read_lock(), unlike - * dsa_slave_switchdev_event(). 
- */ - rcu_read_lock(); - apply = dsa_port_can_apply_vlan_filtering(dp, vlan_filtering); - rcu_read_unlock(); - if (!apply) - return -EINVAL; - } + /* We are called from dsa_slave_switchdev_blocking_event(), + * which is not under rcu_read_lock(), unlike + * dsa_slave_switchdev_event(). + */ + rcu_read_lock(); + apply = dsa_port_can_apply_vlan_filtering(dp, vlan_filtering, extack); + rcu_read_unlock(); + if (!apply) + return -EINVAL; if (dsa_port_is_vlan_filtering(dp) == vlan_filtering) return 0; err = ds->ops->port_vlan_filtering(ds, dp->index, vlan_filtering, - trans); + extack); if (err) return err; - if (switchdev_trans_ph_commit(trans)) { - if (ds->vlan_filtering_is_global) - ds->vlan_filtering = vlan_filtering; - else - dp->vlan_filtering = vlan_filtering; - } + if (ds->vlan_filtering_is_global) + ds->vlan_filtering = vlan_filtering; + else + dp->vlan_filtering = vlan_filtering; return 0; } @@ -314,63 +410,57 @@ bool dsa_port_skip_vlan_configuration(struct dsa_port *dp) !br_vlan_enabled(dp->bridge_dev)); } -int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, - struct switchdev_trans *trans) +int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock) { unsigned long ageing_jiffies = clock_t_to_jiffies(ageing_clock); unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies); - struct dsa_notifier_ageing_time_info info = { - .ageing_time = ageing_time, - .trans = trans, - }; + struct dsa_notifier_ageing_time_info info; + int err; + + info.ageing_time = ageing_time; - if (switchdev_trans_ph_prepare(trans)) - return dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, &info); + err = dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, &info); + if (err) + return err; dp->ageing_time = ageing_time; - return dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, &info); + return 0; } -int dsa_port_pre_bridge_flags(const struct dsa_port *dp, unsigned long flags, - struct switchdev_trans *trans) +int dsa_port_pre_bridge_flags(const struct dsa_port *dp, + struct 
switchdev_brport_flags flags, + struct netlink_ext_ack *extack) { struct dsa_switch *ds = dp->ds; - if (!ds->ops->port_egress_floods || - (flags & ~(BR_FLOOD | BR_MCAST_FLOOD))) + if (!ds->ops->port_pre_bridge_flags) return -EINVAL; - return 0; + return ds->ops->port_pre_bridge_flags(ds, dp->index, flags, extack); } -int dsa_port_bridge_flags(const struct dsa_port *dp, unsigned long flags, - struct switchdev_trans *trans) +int dsa_port_bridge_flags(const struct dsa_port *dp, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack) { struct dsa_switch *ds = dp->ds; - int port = dp->index; - int err = 0; - if (switchdev_trans_ph_prepare(trans)) - return 0; - - if (ds->ops->port_egress_floods) - err = ds->ops->port_egress_floods(ds, port, flags & BR_FLOOD, - flags & BR_MCAST_FLOOD); + if (!ds->ops->port_bridge_flags) + return -EINVAL; - return err; + return ds->ops->port_bridge_flags(ds, dp->index, flags, extack); } int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, - struct switchdev_trans *trans) + struct netlink_ext_ack *extack) { struct dsa_switch *ds = dp->ds; - int port = dp->index; - if (switchdev_trans_ph_prepare(trans)) - return ds->ops->port_egress_floods ? 
0 : -EOPNOTSUPP; + if (!ds->ops->port_set_mrouter) + return -EOPNOTSUPP; - return ds->ops->port_egress_floods(ds, port, true, mrouter); + return ds->ops->port_set_mrouter(ds, dp->index, mrouter, extack); } int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu, @@ -425,13 +515,11 @@ int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data) } int dsa_port_mdb_add(const struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb, - struct switchdev_trans *trans) + const struct switchdev_obj_port_mdb *mdb) { struct dsa_notifier_mdb_info info = { .sw_index = dp->ds->index, .port = dp->index, - .trans = trans, .mdb = mdb, }; @@ -452,13 +540,13 @@ int dsa_port_mdb_del(const struct dsa_port *dp, int dsa_port_vlan_add(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan, - struct switchdev_trans *trans) + struct netlink_ext_ack *extack) { struct dsa_notifier_vlan_info info = { .sw_index = dp->ds->index, .port = dp->index, - .trans = trans, .vlan = vlan, + .extack = extack, }; return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); @@ -476,6 +564,62 @@ int dsa_port_vlan_del(struct dsa_port *dp, return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); } +int dsa_port_mrp_add(const struct dsa_port *dp, + const struct switchdev_obj_mrp *mrp) +{ + struct dsa_notifier_mrp_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mrp = mrp, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_MRP_ADD, &info); +} + +int dsa_port_mrp_del(const struct dsa_port *dp, + const struct switchdev_obj_mrp *mrp) +{ + struct dsa_notifier_mrp_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mrp = mrp, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_MRP_DEL, &info); +} + +int dsa_port_mrp_add_ring_role(const struct dsa_port *dp, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + struct dsa_notifier_mrp_ring_role_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mrp = mrp, + }; + + return dsa_port_notify(dp, 
DSA_NOTIFIER_MRP_ADD_RING_ROLE, &info); +} + +int dsa_port_mrp_del_ring_role(const struct dsa_port *dp, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + struct dsa_notifier_mrp_ring_role_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mrp = mrp, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_MRP_DEL_RING_ROLE, &info); +} + +void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, + const struct dsa_device_ops *tag_ops) +{ + cpu_dp->filter = tag_ops->filter; + cpu_dp->rcv = tag_ops->rcv; + cpu_dp->tag_ops = tag_ops; +} + static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp) { struct device_node *phy_dn; @@ -810,3 +954,37 @@ int dsa_port_get_phy_sset_count(struct dsa_port *dp) return ret; } EXPORT_SYMBOL_GPL(dsa_port_get_phy_sset_count); + +int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr) +{ + struct dsa_notifier_hsr_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .hsr = hsr, + }; + int err; + + dp->hsr_dev = hsr; + + err = dsa_port_notify(dp, DSA_NOTIFIER_HSR_JOIN, &info); + if (err) + dp->hsr_dev = NULL; + + return err; +} + +void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr) +{ + struct dsa_notifier_hsr_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .hsr = hsr, + }; + int err; + + dp->hsr_dev = NULL; + + err = dsa_port_notify(dp, DSA_NOTIFIER_HSR_LEAVE, &info); + if (err) + pr_err("DSA: failed to notify DSA_NOTIFIER_HSR_LEAVE\n"); +} diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 3bc5ca40c9fb..491e3761b5f4 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -17,6 +17,7 @@ #include <net/pkt_cls.h> #include <net/tc_act/tc_mirred.h> #include <linux/if_bridge.h> +#include <linux/if_hsr.h> #include <linux/netpoll.h> #include <linux/ptp_classify.h> @@ -68,8 +69,11 @@ static int dsa_slave_open(struct net_device *dev) struct dsa_port *dp = dsa_slave_to_port(dev); int err; - if (!(master->flags & IFF_UP)) - return -ENETDOWN; + err = dev_open(master, 
NULL); + if (err < 0) { + netdev_err(dev, "failed to open master %s\n", master->name); + goto out; + } if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) { err = dev_uc_add(master, dev->dev_addr); @@ -269,31 +273,34 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) static int dsa_slave_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, - struct switchdev_trans *trans) + struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_slave_to_port(dev); int ret; + if (!dsa_port_offloads_netdev(dp, attr->orig_dev)) + return -EOPNOTSUPP; + switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: - ret = dsa_port_set_state(dp, attr->u.stp_state, trans); + ret = dsa_port_set_state(dp, attr->u.stp_state); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering, - trans); + extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: - ret = dsa_port_ageing_time(dp, attr->u.ageing_time, trans); + ret = dsa_port_ageing_time(dp, attr->u.ageing_time); break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags, - trans); + extack); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: - ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, trans); + ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_MROUTER: - ret = dsa_port_mrouter(dp->cpu_dp, attr->u.mrouter, trans); + ret = dsa_port_mrouter(dp->cpu_dp, attr->u.mrouter, extack); break; default: ret = -EOPNOTSUPP; @@ -318,7 +325,7 @@ dsa_slave_vlan_check_for_8021q_uppers(struct net_device *slave, continue; vid = vlan_dev_vlan_id(upper_dev); - if (vid >= vlan->vid_begin && vid <= vlan->vid_end) + if (vid == vlan->vid) return -EBUSY; } @@ -327,33 +334,38 @@ dsa_slave_vlan_check_for_8021q_uppers(struct net_device *slave, static int dsa_slave_vlan_add(struct net_device *dev, const struct switchdev_obj *obj, - struct 
switchdev_trans *trans) + struct netlink_ext_ack *extack) { struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan vlan; - int vid, err; + int err; - if (obj->orig_dev != dev) + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) return -EOPNOTSUPP; - if (dsa_port_skip_vlan_configuration(dp)) + if (dsa_port_skip_vlan_configuration(dp)) { + NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN"); return 0; + } vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj); /* Deny adding a bridge VLAN when there is already an 802.1Q upper with * the same VID. */ - if (trans->ph_prepare && br_vlan_enabled(dp->bridge_dev)) { + if (br_vlan_enabled(dp->bridge_dev)) { rcu_read_lock(); err = dsa_slave_vlan_check_for_8021q_uppers(dev, &vlan); rcu_read_unlock(); - if (err) + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Port already has a VLAN upper with this VID"); return err; + } } - err = dsa_port_vlan_add(dp, &vlan, trans); + err = dsa_port_vlan_add(dp, &vlan, extack); if (err) return err; @@ -363,47 +375,45 @@ static int dsa_slave_vlan_add(struct net_device *dev, */ vlan.flags &= ~BRIDGE_VLAN_INFO_PVID; - err = dsa_port_vlan_add(dp->cpu_dp, &vlan, trans); + err = dsa_port_vlan_add(dp->cpu_dp, &vlan, extack); if (err) return err; - for (vid = vlan.vid_begin; vid <= vlan.vid_end; vid++) { - err = vlan_vid_add(master, htons(ETH_P_8021Q), vid); - if (err) - return err; - } - - return 0; + return vlan_vid_add(master, htons(ETH_P_8021Q), vlan.vid); } static int dsa_slave_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, - struct switchdev_trans *trans, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_slave_to_port(dev); int err; - /* For the prepare phase, ensure the full set of changes is feasable in - * one go in order to signal a failure properly. If an operation is not - * supported, return -EOPNOTSUPP. 
- */ - switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: - if (obj->orig_dev != dev) + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) return -EOPNOTSUPP; - err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans); + err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: /* DSA can directly translate this to a normal MDB add, * but on the CPU port. */ - err = dsa_port_mdb_add(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj), - trans); + err = dsa_port_mdb_add(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: - err = dsa_slave_vlan_add(dev, obj, trans); + err = dsa_slave_vlan_add(dev, obj, extack); + break; + case SWITCHDEV_OBJ_ID_MRP: + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + return -EOPNOTSUPP; + err = dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj)); + break; + case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + return -EOPNOTSUPP; + err = dsa_port_mrp_add_ring_role(dp, + SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); break; default: err = -EOPNOTSUPP; @@ -419,9 +429,9 @@ static int dsa_slave_vlan_del(struct net_device *dev, struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan *vlan; - int vid, err; + int err; - if (obj->orig_dev != dev) + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) return -EOPNOTSUPP; if (dsa_port_skip_vlan_configuration(dp)) @@ -436,8 +446,7 @@ static int dsa_slave_vlan_del(struct net_device *dev, if (err) return err; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) - vlan_vid_del(master, htons(ETH_P_8021Q), vid); + vlan_vid_del(master, htons(ETH_P_8021Q), vlan->vid); return 0; } @@ -450,7 +459,7 @@ static int dsa_slave_port_obj_del(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: - if (obj->orig_dev != dev) + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mdb_del(dp, 
SWITCHDEV_OBJ_PORT_MDB(obj)); break; @@ -463,6 +472,17 @@ static int dsa_slave_port_obj_del(struct net_device *dev, case SWITCHDEV_OBJ_ID_PORT_VLAN: err = dsa_slave_vlan_del(dev, obj); break; + case SWITCHDEV_OBJ_ID_MRP: + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + return -EOPNOTSUPP; + err = dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj)); + break; + case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + return -EOPNOTSUPP; + err = dsa_port_mrp_del_ring_role(dp, + SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); + break; default: err = -EOPNOTSUPP; break; @@ -522,10 +542,10 @@ static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, if (!clone) return; - DSA_SKB_CB(skb)->clone = clone; - - if (ds->ops->port_txtstamp(ds, p->dp->index, clone, type)) + if (ds->ops->port_txtstamp(ds, p->dp->index, clone, type)) { + DSA_SKB_CB(skb)->clone = clone; return; + } kfree_skb(clone); } @@ -548,17 +568,36 @@ netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev) } EXPORT_SYMBOL_GPL(dsa_enqueue_skb); +static int dsa_realloc_skb(struct sk_buff *skb, struct net_device *dev) +{ + int needed_headroom = dev->needed_headroom; + int needed_tailroom = dev->needed_tailroom; + + /* For tail taggers, we need to pad short frames ourselves, to ensure + * that the tail tag does not fail at its role of being at the end of + * the packet, once the master interface pads the frame. Account for + * that pad length here, and pad later. + */ + if (unlikely(needed_tailroom && skb->len < ETH_ZLEN)) + needed_tailroom += ETH_ZLEN - skb->len; + /* skb_headroom() returns unsigned int... */ + needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0); + needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0); + + if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb))) + /* No reallocation needed, yay! 
*/ + return 0; + + return pskb_expand_head(skb, needed_headroom, needed_tailroom, + GFP_ATOMIC); +} + static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct pcpu_sw_netstats *s; struct sk_buff *nskb; - s = this_cpu_ptr(p->stats64); - u64_stats_update_begin(&s->syncp); - s->tx_packets++; - s->tx_bytes += skb->len; - u64_stats_update_end(&s->syncp); + dev_sw_netstats_tx_add(dev, 1, skb->len); DSA_SKB_CB(skb)->clone = NULL; @@ -567,6 +606,17 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) */ dsa_skb_tx_timestamp(p, skb); + if (dsa_realloc_skb(skb, dev)) { + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + + /* needed_tailroom should still be 'warm' in the cache line from + * dsa_realloc_skb(), which has also ensured that padding is safe. + */ + if (dev->needed_tailroom) + eth_skb_pad(skb); + /* Transmit function may have to reallocate the original SKB, * in which case it must have freed it. Only free it here on error. 
*/ @@ -679,7 +729,6 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, uint64_t *data) { struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = dp->ds; struct pcpu_sw_netstats *s; unsigned int start; @@ -688,7 +737,7 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, for_each_possible_cpu(i) { u64 tx_packets, tx_bytes, rx_packets, rx_bytes; - s = per_cpu_ptr(p->stats64, i); + s = per_cpu_ptr(dev->tstats, i); do { start = u64_stats_fetch_begin_irq(&s->syncp); tx_packets = s->tx_packets; @@ -1217,15 +1266,6 @@ static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type, return ds->ops->port_setup_tc(ds, dp->index, type, type_data); } -static void dsa_slave_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *stats) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - - netdev_stats_to_stats64(stats, &dev->stats); - dev_fetch_sw_netstats(stats, p->stats64); -} - static int dsa_slave_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc, u32 *rule_locs) { @@ -1269,35 +1309,29 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan vlan = { .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, - .vid_begin = vid, - .vid_end = vid, + .vid = vid, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; - struct switchdev_trans trans; + struct netlink_ext_ack extack = {0}; int ret; /* User port... */ - trans.ph_prepare = true; - ret = dsa_port_vlan_add(dp, &vlan, &trans); - if (ret) - return ret; - - trans.ph_prepare = false; - ret = dsa_port_vlan_add(dp, &vlan, &trans); - if (ret) + ret = dsa_port_vlan_add(dp, &vlan, &extack); + if (ret) { + if (extack._msg) + netdev_err(dev, "%s\n", extack._msg); return ret; + } /* And CPU port... 
*/ - trans.ph_prepare = true; - ret = dsa_port_vlan_add(dp->cpu_dp, &vlan, &trans); - if (ret) - return ret; - - trans.ph_prepare = false; - ret = dsa_port_vlan_add(dp->cpu_dp, &vlan, &trans); - if (ret) + ret = dsa_port_vlan_add(dp->cpu_dp, &vlan, &extack); + if (ret) { + if (extack._msg) + netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index, + extack._msg); return ret; + } return vlan_vid_add(master, proto, vid); } @@ -1308,8 +1342,7 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan vlan = { - .vid_begin = vid, - .vid_end = vid, + .vid = vid, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; @@ -1437,7 +1470,7 @@ out: dsa_hw_port_list_free(&hw_port_list); } -static int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) +int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) { struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); @@ -1555,20 +1588,20 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { }; /* legacy way, bypassing the bridge *****************************************/ -int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid, - u16 flags, - struct netlink_ext_ack *extack) +static int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 vid, + u16 flags, + struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_slave_to_port(dev); return dsa_port_fdb_add(dp, addr, vid); } -int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid) +static int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 vid) { struct dsa_port *dp = 
dsa_slave_to_port(dev); @@ -1582,6 +1615,18 @@ static struct devlink_port *dsa_slave_get_devlink_port(struct net_device *dev) return dp->ds->devlink ? &dp->devlink_port : NULL; } +static void dsa_slave_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *s) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->get_stats64) + ds->ops->get_stats64(ds, dp->index, s); + else + dev_get_tstats64(dev, s); +} + static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, @@ -1703,6 +1748,27 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return ret; } +void dsa_slave_setup_tagger(struct net_device *slave) +{ + struct dsa_port *dp = dsa_slave_to_port(slave); + struct dsa_slave_priv *p = netdev_priv(slave); + const struct dsa_port *cpu_dp = dp->cpu_dp; + struct net_device *master = cpu_dp->master; + + if (cpu_dp->tag_ops->tail_tag) + slave->needed_tailroom = cpu_dp->tag_ops->overhead; + else + slave->needed_headroom = cpu_dp->tag_ops->overhead; + /* Try to save one extra realloc later in the TX path (in the master) + * by also inheriting the master's needed headroom and tailroom. + * The 8021q driver also does this. 
+ */ + slave->needed_headroom += master->needed_headroom; + slave->needed_tailroom += master->needed_tailroom; + + p->xmit = cpu_dp->tag_ops->xmit; +} + static struct lock_class_key dsa_slave_netdev_xmit_lock_key; static void dsa_slave_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, @@ -1744,20 +1810,6 @@ int dsa_slave_resume(struct net_device *slave_dev) return 0; } -static void dsa_slave_notify(struct net_device *dev, unsigned long val) -{ - struct net_device *master = dsa_slave_to_master(dev); - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_notifier_register_info rinfo = { - .switch_number = dp->ds->index, - .port_number = dp->index, - .master = master, - .info.dev = dev, - }; - - call_dsa_notifiers(val, dev, &rinfo.info); -} - int dsa_slave_create(struct dsa_port *port) { const struct dsa_port *cpu_dp = port->cpu_dp; @@ -1801,8 +1853,8 @@ int dsa_slave_create(struct dsa_port *port) slave_dev->vlan_features = master->vlan_features; p = netdev_priv(slave_dev); - p->stats64 = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!p->stats64) { + slave_dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!slave_dev->tstats) { free_netdev(slave_dev); return -ENOMEM; } @@ -1813,15 +1865,15 @@ int dsa_slave_create(struct dsa_port *port) p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); - p->xmit = cpu_dp->tag_ops->xmit; port->slave = slave_dev; + dsa_slave_setup_tagger(slave_dev); rtnl_lock(); ret = dsa_slave_change_mtu(slave_dev, ETH_DATA_LEN); rtnl_unlock(); if (ret && ret != -EOPNOTSUPP) - dev_warn(ds->dev, "nonfatal error %d setting MTU on port %d\n", - ret, port->index); + dev_warn(ds->dev, "nonfatal error %d setting MTU to %d on port %d\n", + ret, ETH_DATA_LEN, port->index); netif_carrier_off(slave_dev); @@ -1833,8 +1885,6 @@ int dsa_slave_create(struct dsa_port *port) goto out_gcells; } - dsa_slave_notify(slave_dev, DSA_PORT_REGISTER); - rtnl_lock(); ret = register_netdevice(slave_dev); @@ -1864,7 +1914,7 @@ 
out_phy: out_gcells: gro_cells_destroy(&p->gcells); out_free: - free_percpu(p->stats64); + free_percpu(slave_dev->tstats); free_netdev(slave_dev); port->slave = NULL; return ret; @@ -1883,10 +1933,9 @@ void dsa_slave_destroy(struct net_device *slave_dev) phylink_disconnect_phy(dp->pl); rtnl_unlock(); - dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); phylink_destroy(dp->pl); gro_cells_destroy(&p->gcells); - free_percpu(p->stats64); + free_percpu(slave_dev->tstats); free_netdev(slave_dev); } @@ -1894,6 +1943,7 @@ bool dsa_slave_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_slave_netdev_ops; } +EXPORT_SYMBOL_GPL(dsa_slave_dev_check); static int dsa_slave_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) @@ -1911,6 +1961,59 @@ static int dsa_slave_changeupper(struct net_device *dev, dsa_port_bridge_leave(dp, info->upper_dev); err = NOTIFY_OK; } + } else if (netif_is_lag_master(info->upper_dev)) { + if (info->linking) { + err = dsa_port_lag_join(dp, info->upper_dev, + info->upper_info); + if (err == -EOPNOTSUPP) { + NL_SET_ERR_MSG_MOD(info->info.extack, + "Offloading not supported"); + err = 0; + } + err = notifier_from_errno(err); + } else { + dsa_port_lag_leave(dp, info->upper_dev); + err = NOTIFY_OK; + } + } else if (is_hsr_master(info->upper_dev)) { + if (info->linking) { + err = dsa_port_hsr_join(dp, info->upper_dev); + if (err == -EOPNOTSUPP) { + NL_SET_ERR_MSG_MOD(info->info.extack, + "Offloading not supported"); + err = 0; + } + err = notifier_from_errno(err); + } else { + dsa_port_hsr_leave(dp, info->upper_dev); + err = NOTIFY_OK; + } + } + + return err; +} + +static int +dsa_slave_lag_changeupper(struct net_device *dev, + struct netdev_notifier_changeupper_info *info) +{ + struct net_device *lower; + struct list_head *iter; + int err = NOTIFY_DONE; + struct dsa_port *dp; + + netdev_for_each_lower_dev(dev, lower, iter) { + if (!dsa_slave_dev_check(lower)) + continue; + + dp = 
dsa_slave_to_port(lower); + if (!dp->lag_dev) + /* Software LAG */ + continue; + + err = dsa_slave_changeupper(lower, info); + if (notifier_to_errno(err)) + break; } return err; @@ -1987,137 +2090,245 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, switch (event) { case NETDEV_PRECHANGEUPPER: { struct netdev_notifier_changeupper_info *info = ptr; + struct dsa_switch *ds; + struct dsa_port *dp; + int err; if (!dsa_slave_dev_check(dev)) return dsa_prevent_bridging_8021q_upper(dev, ptr); + dp = dsa_slave_to_port(dev); + ds = dp->ds; + + if (ds->ops->port_prechangeupper) { + err = ds->ops->port_prechangeupper(ds, dp->index, info); + if (err) + return notifier_from_errno(err); + } + if (is_vlan_dev(info->upper_dev)) return dsa_slave_check_8021q_upper(dev, ptr); break; } case NETDEV_CHANGEUPPER: + if (dsa_slave_dev_check(dev)) + return dsa_slave_changeupper(dev, ptr); + + if (netif_is_lag_master(dev)) + return dsa_slave_lag_changeupper(dev, ptr); + + break; + case NETDEV_CHANGELOWERSTATE: { + struct netdev_notifier_changelowerstate_info *info = ptr; + struct dsa_port *dp; + int err; + if (!dsa_slave_dev_check(dev)) + break; + + dp = dsa_slave_to_port(dev); + + err = dsa_port_lag_change(dp, info->lower_state_info); + return notifier_from_errno(err); + } + case NETDEV_GOING_DOWN: { + struct dsa_port *dp, *cpu_dp; + struct dsa_switch_tree *dst; + LIST_HEAD(close_list); + + if (!netdev_uses_dsa(dev)) return NOTIFY_DONE; - return dsa_slave_changeupper(dev, ptr); + cpu_dp = dev->dsa_ptr; + dst = cpu_dp->ds->dst; + + list_for_each_entry(dp, &dst->ports, list) { + if (!dsa_is_user_port(dp->ds, dp->index)) + continue; + + list_add(&dp->slave->close_list, &close_list); + } + + dev_close_many(&close_list, true); + + return NOTIFY_OK; + } + default: + break; } return NOTIFY_DONE; } -struct dsa_switchdev_event_work { - struct work_struct work; - struct switchdev_notifier_fdb_info fdb_info; - struct net_device *dev; - unsigned long event; -}; +static void 
+dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work) +{ + struct dsa_switch *ds = switchdev_work->ds; + struct switchdev_notifier_fdb_info info; + struct dsa_port *dp; + + if (!dsa_is_user_port(ds, switchdev_work->port)) + return; + + info.addr = switchdev_work->addr; + info.vid = switchdev_work->vid; + info.offloaded = true; + dp = dsa_to_port(ds, switchdev_work->port); + call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, + dp->slave, &info.info, NULL); +} static void dsa_slave_switchdev_event_work(struct work_struct *work) { struct dsa_switchdev_event_work *switchdev_work = container_of(work, struct dsa_switchdev_event_work, work); - struct net_device *dev = switchdev_work->dev; - struct switchdev_notifier_fdb_info *fdb_info; - struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = switchdev_work->ds; + struct dsa_port *dp; int err; + dp = dsa_to_port(ds, switchdev_work->port); + rtnl_lock(); switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: - fdb_info = &switchdev_work->fdb_info; - if (!fdb_info->added_by_user) - break; - - err = dsa_port_fdb_add(dp, fdb_info->addr, fdb_info->vid); + err = dsa_port_fdb_add(dp, switchdev_work->addr, + switchdev_work->vid); if (err) { - netdev_dbg(dev, "fdb add failed err=%d\n", err); + dev_err(ds->dev, + "port %d failed to add %pM vid %d to fdb: %d\n", + dp->index, switchdev_work->addr, + switchdev_work->vid, err); break; } - fdb_info->offloaded = true; - call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, dev, - &fdb_info->info, NULL); + dsa_fdb_offload_notify(switchdev_work); break; case SWITCHDEV_FDB_DEL_TO_DEVICE: - fdb_info = &switchdev_work->fdb_info; - if (!fdb_info->added_by_user) - break; - - err = dsa_port_fdb_del(dp, fdb_info->addr, fdb_info->vid); + err = dsa_port_fdb_del(dp, switchdev_work->addr, + switchdev_work->vid); if (err) { - netdev_dbg(dev, "fdb del failed err=%d\n", err); - dev_close(dev); + dev_err(ds->dev, + "port %d failed to delete %pM vid %d from fdb: 
%d\n", + dp->index, switchdev_work->addr, + switchdev_work->vid, err); } + break; } rtnl_unlock(); - kfree(switchdev_work->fdb_info.addr); kfree(switchdev_work); - dev_put(dev); + if (dsa_is_user_port(ds, dp->index)) + dev_put(dp->slave); } -static int -dsa_slave_switchdev_fdb_work_init(struct dsa_switchdev_event_work * - switchdev_work, - const struct switchdev_notifier_fdb_info * - fdb_info) -{ - memcpy(&switchdev_work->fdb_info, fdb_info, - sizeof(switchdev_work->fdb_info)); - switchdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC); - if (!switchdev_work->fdb_info.addr) - return -ENOMEM; - ether_addr_copy((u8 *)switchdev_work->fdb_info.addr, - fdb_info->addr); +static int dsa_lower_dev_walk(struct net_device *lower_dev, + struct netdev_nested_priv *priv) +{ + if (dsa_slave_dev_check(lower_dev)) { + priv->data = (void *)netdev_priv(lower_dev); + return 1; + } + return 0; } +static struct dsa_slave_priv *dsa_slave_dev_lower_find(struct net_device *dev) +{ + struct netdev_nested_priv priv = { + .data = NULL, + }; + + netdev_walk_all_lower_dev_rcu(dev, dsa_lower_dev_walk, &priv); + + return (struct dsa_slave_priv *)priv.data; +} + /* Called under rcu_read_lock() */ static int dsa_slave_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); + const struct switchdev_notifier_fdb_info *fdb_info; struct dsa_switchdev_event_work *switchdev_work; + struct dsa_port *dp; int err; - if (event == SWITCHDEV_PORT_ATTR_SET) { + switch (event) { + case SWITCHDEV_PORT_ATTR_SET: err = switchdev_handle_port_attr_set(dev, ptr, dsa_slave_dev_check, dsa_slave_port_attr_set); return notifier_from_errno(err); - } + case SWITCHDEV_FDB_ADD_TO_DEVICE: + case SWITCHDEV_FDB_DEL_TO_DEVICE: + fdb_info = ptr; - if (!dsa_slave_dev_check(dev)) - return NOTIFY_DONE; + if (dsa_slave_dev_check(dev)) { + if (!fdb_info->added_by_user) + return NOTIFY_OK; - switchdev_work = kzalloc(sizeof(*switchdev_work), 
GFP_ATOMIC); - if (!switchdev_work) - return NOTIFY_BAD; + dp = dsa_slave_to_port(dev); + } else { + /* Snoop addresses learnt on foreign interfaces + * bridged with us, for switches that don't + * automatically learn SA from CPU-injected traffic + */ + struct net_device *br_dev; + struct dsa_slave_priv *p; + + br_dev = netdev_master_upper_dev_get_rcu(dev); + if (!br_dev) + return NOTIFY_DONE; + + if (!netif_is_bridge_master(br_dev)) + return NOTIFY_DONE; + + p = dsa_slave_dev_lower_find(br_dev); + if (!p) + return NOTIFY_DONE; + + dp = p->dp->cpu_dp; + + if (!dp->ds->assisted_learning_on_cpu_port) + return NOTIFY_DONE; + + /* When the bridge learns an address on an offloaded + * LAG we don't want to send traffic to the CPU, the + * other ports bridged with the LAG should be able to + * autonomously forward towards it. + */ + if (dsa_tree_offloads_netdev(dp->ds->dst, dev)) + return NOTIFY_DONE; + } - INIT_WORK(&switchdev_work->work, - dsa_slave_switchdev_event_work); - switchdev_work->dev = dev; - switchdev_work->event = event; + if (!dp->ds->ops->port_fdb_add || !dp->ds->ops->port_fdb_del) + return NOTIFY_DONE; - switch (event) { - case SWITCHDEV_FDB_ADD_TO_DEVICE: - case SWITCHDEV_FDB_DEL_TO_DEVICE: - if (dsa_slave_switchdev_fdb_work_init(switchdev_work, ptr)) - goto err_fdb_work_init; - dev_hold(dev); + switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); + if (!switchdev_work) + return NOTIFY_BAD; + + INIT_WORK(&switchdev_work->work, + dsa_slave_switchdev_event_work); + switchdev_work->ds = dp->ds; + switchdev_work->port = dp->index; + switchdev_work->event = event; + + ether_addr_copy(switchdev_work->addr, + fdb_info->addr); + switchdev_work->vid = fdb_info->vid; + + /* Hold a reference on the slave for dsa_fdb_offload_notify */ + if (dsa_is_user_port(dp->ds, dp->index)) + dev_hold(dev); + dsa_schedule_work(&switchdev_work->work); break; default: - kfree(switchdev_work); return NOTIFY_DONE; } - dsa_schedule_work(&switchdev_work->work); return 
NOTIFY_OK; - -err_fdb_work_init: - kfree(switchdev_work); - return NOTIFY_BAD; } static int dsa_slave_switchdev_blocking_event(struct notifier_block *unused, diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 3fb362b6874e..4b5da89dc27a 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -33,15 +33,12 @@ static int dsa_switch_ageing_time(struct dsa_switch *ds, struct dsa_notifier_ageing_time_info *info) { unsigned int ageing_time = info->ageing_time; - struct switchdev_trans *trans = info->trans; - - if (switchdev_trans_ph_prepare(trans)) { - if (ds->ageing_time_min && ageing_time < ds->ageing_time_min) - return -ERANGE; - if (ds->ageing_time_max && ageing_time > ds->ageing_time_max) - return -ERANGE; - return 0; - } + + if (ds->ageing_time_min && ageing_time < ds->ageing_time_min) + return -ERANGE; + + if (ds->ageing_time_max && ageing_time > ds->ageing_time_max) + return -ERANGE; /* Program the fastest ageing time in case of multiple bridges */ ageing_time = dsa_switch_fastest_ageing_time(ds, ageing_time); @@ -109,6 +106,7 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, { bool unset_vlan_filtering = br_vlan_enabled(info->br); struct dsa_switch_tree *dst = ds->dst; + struct netlink_ext_ack extack = {0}; int err, i; if (dst->index == info->tree_index && ds->index == info->sw_index && @@ -139,17 +137,11 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, } } if (unset_vlan_filtering) { - struct switchdev_trans trans; - - trans.ph_prepare = true; err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port), - false, &trans); - if (err && err != EOPNOTSUPP) - return err; - - trans.ph_prepare = false; - err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port), - false, &trans); + false, &extack); + if (extack._msg) + dev_err(ds->dev, "port %d: %s\n", info->port, + extack._msg); if (err && err != EOPNOTSUPP) return err; } @@ -178,6 +170,65 @@ static int dsa_switch_fdb_del(struct dsa_switch *ds, return ds->ops->port_fdb_del(ds, port, 
info->addr, info->vid); } +static int dsa_switch_hsr_join(struct dsa_switch *ds, + struct dsa_notifier_hsr_info *info) +{ + if (ds->index == info->sw_index && ds->ops->port_hsr_join) + return ds->ops->port_hsr_join(ds, info->port, info->hsr); + + return -EOPNOTSUPP; +} + +static int dsa_switch_hsr_leave(struct dsa_switch *ds, + struct dsa_notifier_hsr_info *info) +{ + if (ds->index == info->sw_index && ds->ops->port_hsr_leave) + return ds->ops->port_hsr_leave(ds, info->port, info->hsr); + + return -EOPNOTSUPP; +} + +static int dsa_switch_lag_change(struct dsa_switch *ds, + struct dsa_notifier_lag_info *info) +{ + if (ds->index == info->sw_index && ds->ops->port_lag_change) + return ds->ops->port_lag_change(ds, info->port); + + if (ds->index != info->sw_index && ds->ops->crosschip_lag_change) + return ds->ops->crosschip_lag_change(ds, info->sw_index, + info->port); + + return 0; +} + +static int dsa_switch_lag_join(struct dsa_switch *ds, + struct dsa_notifier_lag_info *info) +{ + if (ds->index == info->sw_index && ds->ops->port_lag_join) + return ds->ops->port_lag_join(ds, info->port, info->lag, + info->info); + + if (ds->index != info->sw_index && ds->ops->crosschip_lag_join) + return ds->ops->crosschip_lag_join(ds, info->sw_index, + info->port, info->lag, + info->info); + + return 0; +} + +static int dsa_switch_lag_leave(struct dsa_switch *ds, + struct dsa_notifier_lag_info *info) +{ + if (ds->index == info->sw_index && ds->ops->port_lag_leave) + return ds->ops->port_lag_leave(ds, info->port, info->lag); + + if (ds->index != info->sw_index && ds->ops->crosschip_lag_leave) + return ds->ops->crosschip_lag_leave(ds, info->sw_index, + info->port, info->lag); + + return 0; +} + static bool dsa_switch_mdb_match(struct dsa_switch *ds, int port, struct dsa_notifier_mdb_info *info) { @@ -190,41 +241,24 @@ static bool dsa_switch_mdb_match(struct dsa_switch *ds, int port, return false; } -static int dsa_switch_mdb_prepare(struct dsa_switch *ds, - struct dsa_notifier_mdb_info 
*info) +static int dsa_switch_mdb_add(struct dsa_switch *ds, + struct dsa_notifier_mdb_info *info) { - int port, err; + int err = 0; + int port; - if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) + if (!ds->ops->port_mdb_add) return -EOPNOTSUPP; for (port = 0; port < ds->num_ports; port++) { if (dsa_switch_mdb_match(ds, port, info)) { - err = ds->ops->port_mdb_prepare(ds, port, info->mdb); + err = ds->ops->port_mdb_add(ds, port, info->mdb); if (err) - return err; + break; } } - return 0; -} - -static int dsa_switch_mdb_add(struct dsa_switch *ds, - struct dsa_notifier_mdb_info *info) -{ - int port; - - if (switchdev_trans_ph_prepare(info->trans)) - return dsa_switch_mdb_prepare(ds, info); - - if (!ds->ops->port_mdb_add) - return 0; - - for (port = 0; port < ds->num_ports; port++) - if (dsa_switch_mdb_match(ds, port, info)) - ds->ops->port_mdb_add(ds, port, info->mdb); - - return 0; + return err; } static int dsa_switch_mdb_del(struct dsa_switch *ds, @@ -251,17 +285,18 @@ static bool dsa_switch_vlan_match(struct dsa_switch *ds, int port, return false; } -static int dsa_switch_vlan_prepare(struct dsa_switch *ds, - struct dsa_notifier_vlan_info *info) +static int dsa_switch_vlan_add(struct dsa_switch *ds, + struct dsa_notifier_vlan_info *info) { int port, err; - if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) + if (!ds->ops->port_vlan_add) return -EOPNOTSUPP; for (port = 0; port < ds->num_ports; port++) { if (dsa_switch_vlan_match(ds, port, info)) { - err = ds->ops->port_vlan_prepare(ds, port, info->vlan); + err = ds->ops->port_vlan_add(ds, port, info->vlan, + info->extack); if (err) return err; } @@ -270,36 +305,163 @@ static int dsa_switch_vlan_prepare(struct dsa_switch *ds, return 0; } -static int dsa_switch_vlan_add(struct dsa_switch *ds, +static int dsa_switch_vlan_del(struct dsa_switch *ds, struct dsa_notifier_vlan_info *info) { + if (!ds->ops->port_vlan_del) + return -EOPNOTSUPP; + + if (ds->index == info->sw_index) + return 
ds->ops->port_vlan_del(ds, info->port, info->vlan); + + /* Do not deprogram the DSA links as they may be used as conduit + * for other VLAN members in the fabric. + */ + return 0; +} + +static bool dsa_switch_tag_proto_match(struct dsa_switch *ds, int port, + struct dsa_notifier_tag_proto_info *info) +{ + if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) + return true; + + return false; +} + +static int dsa_switch_change_tag_proto(struct dsa_switch *ds, + struct dsa_notifier_tag_proto_info *info) +{ + const struct dsa_device_ops *tag_ops = info->tag_ops; + int port, err; + + if (!ds->ops->change_tag_protocol) + return -EOPNOTSUPP; + + ASSERT_RTNL(); + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_tag_proto_match(ds, port, info)) { + err = ds->ops->change_tag_protocol(ds, port, + tag_ops->proto); + if (err) + return err; + + if (dsa_is_cpu_port(ds, port)) + dsa_port_set_tag_protocol(dsa_to_port(ds, port), + tag_ops); + } + } + + /* Now that changing the tag protocol can no longer fail, let's update + * the remaining bits which are "duplicated for faster access", and the + * bits that depend on the tagger, such as the MTU. 
+ */ + for (port = 0; port < ds->num_ports; port++) { + if (dsa_is_user_port(ds, port)) { + struct net_device *slave; + + slave = dsa_to_port(ds, port)->slave; + dsa_slave_setup_tagger(slave); + + /* rtnl_mutex is held in dsa_tree_change_tag_proto */ + dsa_slave_change_mtu(slave, slave->mtu); + } + } + + return 0; +} + +static bool dsa_switch_mrp_match(struct dsa_switch *ds, int port, + struct dsa_notifier_mrp_info *info) +{ + if (ds->index == info->sw_index && port == info->port) + return true; + + if (dsa_is_dsa_port(ds, port)) + return true; + + return false; +} + +static int dsa_switch_mrp_add(struct dsa_switch *ds, + struct dsa_notifier_mrp_info *info) +{ + int err = 0; int port; - if (switchdev_trans_ph_prepare(info->trans)) - return dsa_switch_vlan_prepare(ds, info); + if (!ds->ops->port_mrp_add) + return -EOPNOTSUPP; - if (!ds->ops->port_vlan_add) - return 0; + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_mrp_match(ds, port, info)) { + err = ds->ops->port_mrp_add(ds, port, info->mrp); + if (err) + break; + } + } + + return err; +} - for (port = 0; port < ds->num_ports; port++) - if (dsa_switch_vlan_match(ds, port, info)) - ds->ops->port_vlan_add(ds, port, info->vlan); +static int dsa_switch_mrp_del(struct dsa_switch *ds, + struct dsa_notifier_mrp_info *info) +{ + if (!ds->ops->port_mrp_del) + return -EOPNOTSUPP; + + if (ds->index == info->sw_index) + return ds->ops->port_mrp_del(ds, info->port, info->mrp); return 0; } -static int dsa_switch_vlan_del(struct dsa_switch *ds, - struct dsa_notifier_vlan_info *info) +static bool +dsa_switch_mrp_ring_role_match(struct dsa_switch *ds, int port, + struct dsa_notifier_mrp_ring_role_info *info) { - if (!ds->ops->port_vlan_del) + if (ds->index == info->sw_index && port == info->port) + return true; + + if (dsa_is_dsa_port(ds, port)) + return true; + + return false; +} + +static int +dsa_switch_mrp_add_ring_role(struct dsa_switch *ds, + struct dsa_notifier_mrp_ring_role_info *info) +{ + int err = 0; + 
int port; + + if (!ds->ops->port_mrp_add) + return -EOPNOTSUPP; + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_mrp_ring_role_match(ds, port, info)) { + err = ds->ops->port_mrp_add_ring_role(ds, port, + info->mrp); + if (err) + break; + } + } + + return err; +} + +static int +dsa_switch_mrp_del_ring_role(struct dsa_switch *ds, + struct dsa_notifier_mrp_ring_role_info *info) +{ + if (!ds->ops->port_mrp_del) return -EOPNOTSUPP; if (ds->index == info->sw_index) - return ds->ops->port_vlan_del(ds, info->port, info->vlan); + return ds->ops->port_mrp_del_ring_role(ds, info->port, + info->mrp); - /* Do not deprogram the DSA links as they may be used as conduit - * for other VLAN members in the fabric. - */ return 0; } @@ -325,6 +487,21 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_FDB_DEL: err = dsa_switch_fdb_del(ds, info); break; + case DSA_NOTIFIER_HSR_JOIN: + err = dsa_switch_hsr_join(ds, info); + break; + case DSA_NOTIFIER_HSR_LEAVE: + err = dsa_switch_hsr_leave(ds, info); + break; + case DSA_NOTIFIER_LAG_CHANGE: + err = dsa_switch_lag_change(ds, info); + break; + case DSA_NOTIFIER_LAG_JOIN: + err = dsa_switch_lag_join(ds, info); + break; + case DSA_NOTIFIER_LAG_LEAVE: + err = dsa_switch_lag_leave(ds, info); + break; case DSA_NOTIFIER_MDB_ADD: err = dsa_switch_mdb_add(ds, info); break; @@ -340,15 +517,26 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_MTU: err = dsa_switch_mtu(ds, info); break; + case DSA_NOTIFIER_TAG_PROTO: + err = dsa_switch_change_tag_proto(ds, info); + break; + case DSA_NOTIFIER_MRP_ADD: + err = dsa_switch_mrp_add(ds, info); + break; + case DSA_NOTIFIER_MRP_DEL: + err = dsa_switch_mrp_del(ds, info); + break; + case DSA_NOTIFIER_MRP_ADD_RING_ROLE: + err = dsa_switch_mrp_add_ring_role(ds, info); + break; + case DSA_NOTIFIER_MRP_DEL_RING_ROLE: + err = dsa_switch_mrp_del_ring_role(ds, info); + break; default: err = -EOPNOTSUPP; break; } - /* Non-switchdev operations cannot 
be rolled back. If a DSA driver - * returns an error during the chained call, switch chips may be in an - * inconsistent state. - */ if (err) dev_dbg(ds->dev, "breaking chain for DSA event %lu (%d)\n", event, err); diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 8e3e8a5b8559..008c1ec6e20c 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -133,10 +133,21 @@ u16 dsa_8021q_rx_subvlan(u16 vid) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_subvlan); +bool vid_is_dsa_8021q_rxvlan(u16 vid) +{ + return (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX; +} +EXPORT_SYMBOL_GPL(vid_is_dsa_8021q_rxvlan); + +bool vid_is_dsa_8021q_txvlan(u16 vid) +{ + return (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_TX; +} +EXPORT_SYMBOL_GPL(vid_is_dsa_8021q_txvlan); + bool vid_is_dsa_8021q(u16 vid) { - return ((vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX || - (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_TX); + return vid_is_dsa_8021q_rxvlan(vid) || vid_is_dsa_8021q_txvlan(vid); } EXPORT_SYMBOL_GPL(vid_is_dsa_8021q); diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c index 55b00694cdba..002cf7f952e2 100644 --- a/net/dsa/tag_ar9331.c +++ b/net/dsa/tag_ar9331.c @@ -31,9 +31,6 @@ static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb, __le16 *phdr; u16 hdr; - if (skb_cow_head(skb, AR9331_HDR_LEN) < 0) - return NULL; - phdr = skb_push(skb, AR9331_HDR_LEN); hdr = FIELD_PREP(AR9331_HDR_VERSION_MASK, AR9331_HDR_VERSION); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index ad72dff8d524..e2577a7dcbca 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -5,6 +5,7 @@ * Copyright (C) 2014 Broadcom Corporation */ +#include <linux/dsa/brcm.h> #include <linux/etherdevice.h> #include <linux/list.h> #include <linux/slab.h> @@ -66,9 +67,6 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb, u16 queue = skb_get_queue_mapping(skb); u8 *brcm_tag; - if (skb_cow_head(skb, BRCM_TAG_LEN) < 0) - return NULL; - /* The Ethernet switch we are interfaced with needs packets to be 
at * least 64 bytes (including FCS) otherwise they will be discarded when * they enter the switch port logic. When Broadcom tags are enabled, we diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 0b756fae68a5..7e7b7decdf39 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -1,7 +1,48 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * net/dsa/tag_dsa.c - (Non-ethertype) DSA tagging + * Regular and Ethertype DSA tagging * Copyright (c) 2008-2009 Marvell Semiconductor + * + * Regular DSA + * ----------- + + * For untagged (in 802.1Q terms) packets, the switch will splice in + * the tag between the SA and the ethertype of the original + * packet. Tagged frames will instead have their outermost .1Q tag + * converted to a DSA tag. It expects the same layout when receiving + * packets from the CPU. + * + * Example: + * + * .----.----.----.--------- + * Pu: | DA | SA | ET | Payload ... + * '----'----'----'--------- + * 6 6 2 N + * .----.----.--------.-----.----.--------- + * Pt: | DA | SA | 0x8100 | TCI | ET | Payload ... + * '----'----'--------'-----'----'--------- + * 6 6 2 2 2 N + * .----.----.-----.----.--------- + * Pd: | DA | SA | DSA | ET | Payload ... + * '----'----'-----'----'--------- + * 6 6 4 2 N + * + * No matter if a packet is received untagged (Pu) or tagged (Pt), + * they will both have the same layout (Pd) when they are sent to the + * CPU. This is done by ignoring 802.3, replacing the ethertype field + * with more metadata, among which is a bit to signal if the original + * packet was tagged or not. + * + * Ethertype DSA + * ------------- + * Uses the exact same tag format as regular DSA, but also includes a + * proper ethertype field (which the mv88e6xxx driver sets to + * ETH_P_EDSA/0xdada) followed by two zero bytes: + * + * .----.----.--------.--------.-----.----.--------- + * | DA | SA | 0xdada | 0x0000 | DSA | ET | Payload ... 
+ * '----'----'--------'--------'-----'----'--------- + * 6 6 2 2 4 2 N */ #include <linux/etherdevice.h> @@ -12,46 +53,104 @@ #define DSA_HLEN 4 -static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) +/** + * enum dsa_cmd - DSA Command + * @DSA_CMD_TO_CPU: Set on packets that were trapped or mirrored to + * the CPU port. This is needed to implement control protocols, + * e.g. STP and LLDP, that must not allow those control packets to + * be switched according to the normal rules. + * @DSA_CMD_FROM_CPU: Used by the CPU to send a packet to a specific + * port, ignoring all the barriers that the switch normally + * enforces (VLANs, STP port states etc.). No source address + * learning takes place. "sudo send packet" + * @DSA_CMD_TO_SNIFFER: Set on the copies of packets that matched some + * user configured ingress or egress monitor criteria. These are + * forwarded by the switch tree to the user configured ingress or + * egress monitor port, which can be set to the CPU port or a + * regular port. If the destination is a regular port, the tag + * will be removed before egressing the port. If the destination + * is the CPU port, the tag will not be removed. + * @DSA_CMD_FORWARD: This tag is used on all bulk traffic passing + * through the switch tree, including the flows that are directed + * towards the CPU. Its device/port tuple encodes the original + * source port on which the packet ingressed. It can also be used + * on transmit by the CPU to defer the forwarding decision to the + * hardware, based on the current config of PVT/VTU/ATU + * etc. Source address learning takes places if enabled on the + * receiving DSA/CPU port. + */ +enum dsa_cmd { + DSA_CMD_TO_CPU = 0, + DSA_CMD_FROM_CPU = 1, + DSA_CMD_TO_SNIFFER = 2, + DSA_CMD_FORWARD = 3 +}; + +/** + * enum dsa_code - TO_CPU Code + * + * @DSA_CODE_MGMT_TRAP: DA was classified as a management + * address. Typical examples include STP BPDUs and LLDP. 
+ * @DSA_CODE_FRAME2REG: Response to a "remote management" request. + * @DSA_CODE_IGMP_MLD_TRAP: IGMP/MLD signaling. + * @DSA_CODE_POLICY_TRAP: Frame matched some policy configuration on + * the device. Typical examples are matching on DA/SA/VID and DHCP + * snooping. + * @DSA_CODE_ARP_MIRROR: The name says it all really. + * @DSA_CODE_POLICY_MIRROR: Same as @DSA_CODE_POLICY_TRAP, but the + * particular policy was set to trigger a mirror instead of a + * trap. + * @DSA_CODE_RESERVED_6: Unused on all devices up to at least 6393X. + * @DSA_CODE_RESERVED_7: Unused on all devices up to at least 6393X. + * + * A 3-bit code is used to relay why a particular frame was sent to + * the CPU. We only use this to determine if the packet was mirrored + * or trapped, i.e. whether the packet has been forwarded by hardware + * or not. + * + * This is the superset of all possible codes. Any particular device + * may only implement a subset. + */ +enum dsa_code { + DSA_CODE_MGMT_TRAP = 0, + DSA_CODE_FRAME2REG = 1, + DSA_CODE_IGMP_MLD_TRAP = 2, + DSA_CODE_POLICY_TRAP = 3, + DSA_CODE_ARP_MIRROR = 4, + DSA_CODE_POLICY_MIRROR = 5, + DSA_CODE_RESERVED_6 = 6, + DSA_CODE_RESERVED_7 = 7 +}; + +static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, + u8 extra) { struct dsa_port *dp = dsa_slave_to_port(dev); u8 *dsa_header; - /* - * Convert the outermost 802.1q tag to a DSA tag for tagged - * packets, or insert a DSA tag between the addresses and - * the ethertype field for untagged packets. - */ if (skb->protocol == htons(ETH_P_8021Q)) { - if (skb_cow_head(skb, 0) < 0) - return NULL; + if (extra) { + skb_push(skb, extra); + memmove(skb->data, skb->data + extra, 2 * ETH_ALEN); + } - /* - * Construct tagged FROM_CPU DSA tag from 802.1q tag. - */ - dsa_header = skb->data + 2 * ETH_ALEN; - dsa_header[0] = 0x60 | dp->ds->index; + /* Construct tagged FROM_CPU DSA tag from 802.1Q tag. 
*/ + dsa_header = skb->data + 2 * ETH_ALEN + extra; + dsa_header[0] = (DSA_CMD_FROM_CPU << 6) | 0x20 | dp->ds->index; dsa_header[1] = dp->index << 3; - /* - * Move CFI field from byte 2 to byte 1. - */ + /* Move CFI field from byte 2 to byte 1. */ if (dsa_header[2] & 0x10) { dsa_header[1] |= 0x01; dsa_header[2] &= ~0x10; } } else { - if (skb_cow_head(skb, DSA_HLEN) < 0) - return NULL; - skb_push(skb, DSA_HLEN); - - memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN); + skb_push(skb, DSA_HLEN + extra); + memmove(skb->data, skb->data + DSA_HLEN + extra, 2 * ETH_ALEN); - /* - * Construct untagged FROM_CPU DSA tag. - */ - dsa_header = skb->data + 2 * ETH_ALEN; - dsa_header[0] = 0x40 | dp->ds->index; + /* Construct untagged FROM_CPU DSA tag. */ + dsa_header = skb->data + 2 * ETH_ALEN + extra; + dsa_header[0] = (DSA_CMD_FROM_CPU << 6) | dp->ds->index; dsa_header[1] = dp->index << 3; dsa_header[2] = 0x00; dsa_header[3] = 0x00; @@ -60,47 +159,91 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) return skb; } -static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, + u8 extra) { + int source_device, source_port; + bool trunk = false; + enum dsa_code code; + enum dsa_cmd cmd; u8 *dsa_header; - int source_device; - int source_port; - if (unlikely(!pskb_may_pull(skb, DSA_HLEN))) - return NULL; - - /* - * The ethertype field is part of the DSA header. - */ + /* The ethertype field is part of the DSA header. */ dsa_header = skb->data - 2; - /* - * Check that frame type is either TO_CPU or FORWARD. 
- */ - if ((dsa_header[0] & 0xc0) != 0x00 && (dsa_header[0] & 0xc0) != 0xc0) + cmd = dsa_header[0] >> 6; + switch (cmd) { + case DSA_CMD_FORWARD: + skb->offload_fwd_mark = 1; + + trunk = !!(dsa_header[1] & 7); + break; + + case DSA_CMD_TO_CPU: + code = (dsa_header[1] & 0x6) | ((dsa_header[2] >> 4) & 1); + + switch (code) { + case DSA_CODE_FRAME2REG: + /* Remote management is not implemented yet, + * drop. + */ + return NULL; + case DSA_CODE_ARP_MIRROR: + case DSA_CODE_POLICY_MIRROR: + /* Mark mirrored packets to notify any upper + * device (like a bridge) that forwarding has + * already been done by hardware. + */ + skb->offload_fwd_mark = 1; + break; + case DSA_CODE_MGMT_TRAP: + case DSA_CODE_IGMP_MLD_TRAP: + case DSA_CODE_POLICY_TRAP: + /* Traps have, by definition, not been + * forwarded by hardware, so don't mark them. + */ + break; + default: + /* Reserved code, this could be anything. Drop + * seems like the safest option. + */ + return NULL; + } + + break; + + default: return NULL; + } - /* - * Determine source device and port. - */ source_device = dsa_header[0] & 0x1f; source_port = (dsa_header[1] >> 3) & 0x1f; - skb->dev = dsa_master_find_slave(dev, source_device, source_port); + if (trunk) { + struct dsa_port *cpu_dp = dev->dsa_ptr; + + /* The exact source port is not available in the tag, + * so we inject the frame directly on the upper + * team/bond. + */ + skb->dev = dsa_lag_dev(cpu_dp->dst, source_port); + } else { + skb->dev = dsa_master_find_slave(dev, source_device, + source_port); + } + if (!skb->dev) return NULL; - /* - * Convert the DSA header to an 802.1q header if the 'tagged' - * bit in the DSA header is set. If the 'tagged' bit is clear, - * delete the DSA header entirely. + /* If the 'tagged' bit is set; convert the DSA tag to a 802.1Q + * tag, and delete the ethertype (extra) if applicable. If the + * 'tagged' bit is cleared; delete the DSA tag, and ethertype + * if applicable. 
*/ if (dsa_header[0] & 0x20) { u8 new_header[4]; - /* - * Insert 802.1q ethertype and copy the VLAN-related + /* Insert 802.1Q ethertype and copy the VLAN-related * fields, but clear the bit that will hold CFI (since * DSA uses that bit location for another purpose). */ @@ -109,16 +252,13 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, new_header[2] = dsa_header[2] & ~0x10; new_header[3] = dsa_header[3]; - /* - * Move CFI bit from its place in the DSA header to - * its 802.1q-designated place. + /* Move CFI bit from its place in the DSA header to + * its 802.1Q-designated place. */ if (dsa_header[1] & 0x01) new_header[2] |= 0x10; - /* - * Update packet checksum if skb is CHECKSUM_COMPLETE. - */ + /* Update packet checksum if skb is CHECKSUM_COMPLETE. */ if (skb->ip_summed == CHECKSUM_COMPLETE) { __wsum c = skb->csum; c = csum_add(c, csum_partial(new_header + 2, 2, 0)); @@ -127,30 +267,101 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, } memcpy(dsa_header, new_header, DSA_HLEN); + + if (extra) + memmove(skb->data - ETH_HLEN, + skb->data - ETH_HLEN - extra, + 2 * ETH_ALEN); } else { - /* - * Remove DSA tag and update checksum. 
- */ skb_pull_rcsum(skb, DSA_HLEN); memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - DSA_HLEN, + skb->data - ETH_HLEN - DSA_HLEN - extra, 2 * ETH_ALEN); } - skb->offload_fwd_mark = 1; - return skb; } +#if IS_ENABLED(CONFIG_NET_DSA_TAG_DSA) + +static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) +{ + return dsa_xmit_ll(skb, dev, 0); +} + +static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + if (unlikely(!pskb_may_pull(skb, DSA_HLEN))) + return NULL; + + return dsa_rcv_ll(skb, dev, 0); +} + static const struct dsa_device_ops dsa_netdev_ops = { - .name = "dsa", - .proto = DSA_TAG_PROTO_DSA, - .xmit = dsa_xmit, - .rcv = dsa_rcv, + .name = "dsa", + .proto = DSA_TAG_PROTO_DSA, + .xmit = dsa_xmit, + .rcv = dsa_rcv, .overhead = DSA_HLEN, }; -MODULE_LICENSE("GPL"); +DSA_TAG_DRIVER(dsa_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_DSA); +#endif /* CONFIG_NET_DSA_TAG_DSA */ + +#if IS_ENABLED(CONFIG_NET_DSA_TAG_EDSA) -module_dsa_tag_driver(dsa_netdev_ops); +#define EDSA_HLEN 8 + +static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) +{ + u8 *edsa_header; + + skb = dsa_xmit_ll(skb, dev, EDSA_HLEN - DSA_HLEN); + if (!skb) + return NULL; + + edsa_header = skb->data + 2 * ETH_ALEN; + edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff; + edsa_header[1] = ETH_P_EDSA & 0xff; + edsa_header[2] = 0x00; + edsa_header[3] = 0x00; + return skb; +} + +static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + if (unlikely(!pskb_may_pull(skb, EDSA_HLEN))) + return NULL; + + skb_pull_rcsum(skb, EDSA_HLEN - DSA_HLEN); + + return dsa_rcv_ll(skb, dev, EDSA_HLEN - DSA_HLEN); +} + +static const struct dsa_device_ops edsa_netdev_ops = { + .name = "edsa", + .proto = DSA_TAG_PROTO_EDSA, + .xmit = edsa_xmit, + .rcv = edsa_rcv, + .overhead = EDSA_HLEN, +}; + +DSA_TAG_DRIVER(edsa_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_EDSA); +#endif 
/* CONFIG_NET_DSA_TAG_EDSA */ + +static struct dsa_tag_driver *dsa_tag_drivers[] = { +#if IS_ENABLED(CONFIG_NET_DSA_TAG_DSA) + &DSA_TAG_DRIVER_NAME(dsa_netdev_ops), +#endif +#if IS_ENABLED(CONFIG_NET_DSA_TAG_EDSA) + &DSA_TAG_DRIVER_NAME(edsa_netdev_ops), +#endif +}; + +module_dsa_tag_drivers(dsa_tag_drivers); + +MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c deleted file mode 100644 index 120614240319..000000000000 --- a/net/dsa/tag_edsa.c +++ /dev/null @@ -1,206 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * net/dsa/tag_edsa.c - Ethertype DSA tagging - * Copyright (c) 2008-2009 Marvell Semiconductor - */ - -#include <linux/etherdevice.h> -#include <linux/list.h> -#include <linux/slab.h> - -#include "dsa_priv.h" - -#define DSA_HLEN 4 -#define EDSA_HLEN 8 - -#define FRAME_TYPE_TO_CPU 0x00 -#define FRAME_TYPE_FORWARD 0x03 - -#define TO_CPU_CODE_MGMT_TRAP 0x00 -#define TO_CPU_CODE_FRAME2REG 0x01 -#define TO_CPU_CODE_IGMP_MLD_TRAP 0x02 -#define TO_CPU_CODE_POLICY_TRAP 0x03 -#define TO_CPU_CODE_ARP_MIRROR 0x04 -#define TO_CPU_CODE_POLICY_MIRROR 0x05 - -static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - u8 *edsa_header; - - /* - * Convert the outermost 802.1q tag to a DSA tag and prepend - * a DSA ethertype field is the packet is tagged, or insert - * a DSA ethertype plus DSA tag between the addresses and the - * current ethertype field if the packet is untagged. - */ - if (skb->protocol == htons(ETH_P_8021Q)) { - if (skb_cow_head(skb, DSA_HLEN) < 0) - return NULL; - skb_push(skb, DSA_HLEN); - - memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN); - - /* - * Construct tagged FROM_CPU DSA tag from 802.1q tag. 
- */ - edsa_header = skb->data + 2 * ETH_ALEN; - edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff; - edsa_header[1] = ETH_P_EDSA & 0xff; - edsa_header[2] = 0x00; - edsa_header[3] = 0x00; - edsa_header[4] = 0x60 | dp->ds->index; - edsa_header[5] = dp->index << 3; - - /* - * Move CFI field from byte 6 to byte 5. - */ - if (edsa_header[6] & 0x10) { - edsa_header[5] |= 0x01; - edsa_header[6] &= ~0x10; - } - } else { - if (skb_cow_head(skb, EDSA_HLEN) < 0) - return NULL; - skb_push(skb, EDSA_HLEN); - - memmove(skb->data, skb->data + EDSA_HLEN, 2 * ETH_ALEN); - - /* - * Construct untagged FROM_CPU DSA tag. - */ - edsa_header = skb->data + 2 * ETH_ALEN; - edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff; - edsa_header[1] = ETH_P_EDSA & 0xff; - edsa_header[2] = 0x00; - edsa_header[3] = 0x00; - edsa_header[4] = 0x40 | dp->ds->index; - edsa_header[5] = dp->index << 3; - edsa_header[6] = 0x00; - edsa_header[7] = 0x00; - } - - return skb; -} - -static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) -{ - u8 *edsa_header; - int frame_type; - int code; - int source_device; - int source_port; - - if (unlikely(!pskb_may_pull(skb, EDSA_HLEN))) - return NULL; - - /* - * Skip the two null bytes after the ethertype. - */ - edsa_header = skb->data + 2; - - /* - * Check that frame type is either TO_CPU or FORWARD. - */ - frame_type = edsa_header[0] >> 6; - - switch (frame_type) { - case FRAME_TYPE_TO_CPU: - code = (edsa_header[1] & 0x6) | ((edsa_header[2] >> 4) & 1); - - /* - * Mark the frame to never egress on any port of the same switch - * unless it's a trapped IGMP/MLD packet, in which case the - * bridge might want to forward it. - */ - if (code != TO_CPU_CODE_IGMP_MLD_TRAP) - skb->offload_fwd_mark = 1; - - break; - - case FRAME_TYPE_FORWARD: - skb->offload_fwd_mark = 1; - break; - - default: - return NULL; - } - - /* - * Determine source device and port. 
- */ - source_device = edsa_header[0] & 0x1f; - source_port = (edsa_header[1] >> 3) & 0x1f; - - skb->dev = dsa_master_find_slave(dev, source_device, source_port); - if (!skb->dev) - return NULL; - - /* - * If the 'tagged' bit is set, convert the DSA tag to a 802.1q - * tag and delete the ethertype part. If the 'tagged' bit is - * clear, delete the ethertype and the DSA tag parts. - */ - if (edsa_header[0] & 0x20) { - u8 new_header[4]; - - /* - * Insert 802.1q ethertype and copy the VLAN-related - * fields, but clear the bit that will hold CFI (since - * DSA uses that bit location for another purpose). - */ - new_header[0] = (ETH_P_8021Q >> 8) & 0xff; - new_header[1] = ETH_P_8021Q & 0xff; - new_header[2] = edsa_header[2] & ~0x10; - new_header[3] = edsa_header[3]; - - /* - * Move CFI bit from its place in the DSA header to - * its 802.1q-designated place. - */ - if (edsa_header[1] & 0x01) - new_header[2] |= 0x10; - - skb_pull_rcsum(skb, DSA_HLEN); - - /* - * Update packet checksum if skb is CHECKSUM_COMPLETE. - */ - if (skb->ip_summed == CHECKSUM_COMPLETE) { - __wsum c = skb->csum; - c = csum_add(c, csum_partial(new_header + 2, 2, 0)); - c = csum_sub(c, csum_partial(edsa_header + 2, 2, 0)); - skb->csum = c; - } - - memcpy(edsa_header, new_header, DSA_HLEN); - - memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - DSA_HLEN, - 2 * ETH_ALEN); - } else { - /* - * Remove DSA tag and update checksum. 
- */ - skb_pull_rcsum(skb, EDSA_HLEN); - memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - EDSA_HLEN, - 2 * ETH_ALEN); - } - - return skb; -} - -static const struct dsa_device_ops edsa_netdev_ops = { - .name = "edsa", - .proto = DSA_TAG_PROTO_EDSA, - .xmit = edsa_xmit, - .rcv = edsa_rcv, - .overhead = EDSA_HLEN, -}; - -MODULE_LICENSE("GPL"); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_EDSA); - -module_dsa_tag_driver(edsa_netdev_ops); diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index 408d4af390a0..2f5bd5e338ab 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -60,13 +60,8 @@ static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - int err; u8 *gswip_tag; - err = skb_cow_head(skb, GSWIP_TX_HEADER_LEN); - if (err) - return NULL; - skb_push(skb, GSWIP_TX_HEADER_LEN); gswip_tag = skb->data; diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c new file mode 100644 index 000000000000..a09805c8e1ab --- /dev/null +++ b/net/dsa/tag_hellcreek.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* + * net/dsa/tag_hellcreek.c - Hirschmann Hellcreek switch tag format handling + * + * Copyright (C) 2019,2020 Linutronix GmbH + * Author Kurt Kanzenbach <kurt@linutronix.de> + * + * Based on tag_ksz.c. 
+ */ + +#include <linux/skbuff.h> +#include <net/dsa.h> + +#include "dsa_priv.h" + +#define HELLCREEK_TAG_LEN 1 + +static struct sk_buff *hellcreek_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + u8 *tag; + + /* Tag encoding */ + tag = skb_put(skb, HELLCREEK_TAG_LEN); + *tag = BIT(dp->index); + + return skb; +} + +static struct sk_buff *hellcreek_rcv(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt) +{ + /* Tag decoding */ + u8 *tag = skb_tail_pointer(skb) - HELLCREEK_TAG_LEN; + unsigned int port = tag[0] & 0x03; + + skb->dev = dsa_master_find_slave(dev, 0, port); + if (!skb->dev) { + netdev_warn(dev, "Failed to get source port: %d\n", port); + return NULL; + } + + pskb_trim_rcsum(skb, skb->len - HELLCREEK_TAG_LEN); + + skb->offload_fwd_mark = true; + + return skb; +} + +static const struct dsa_device_ops hellcreek_netdev_ops = { + .name = "hellcreek", + .proto = DSA_TAG_PROTO_HELLCREEK, + .xmit = hellcreek_xmit, + .rcv = hellcreek_rcv, + .overhead = HELLCREEK_TAG_LEN, + .tail_tag = true, +}; + +MODULE_LICENSE("Dual MIT/GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_HELLCREEK); + +module_dsa_tag_driver(hellcreek_netdev_ops); diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 0a5aa982c60d..4820dbcedfa2 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -14,46 +14,6 @@ #define KSZ_EGRESS_TAG_LEN 1 #define KSZ_INGRESS_TAG_LEN 1 -static struct sk_buff *ksz_common_xmit(struct sk_buff *skb, - struct net_device *dev, int len) -{ - struct sk_buff *nskb; - int padlen; - - padlen = (skb->len >= ETH_ZLEN) ? 
0 : ETH_ZLEN - skb->len; - - if (skb_tailroom(skb) >= padlen + len) { - /* Let dsa_slave_xmit() free skb */ - if (__skb_put_padto(skb, skb->len + padlen, false)) - return NULL; - - nskb = skb; - } else { - nskb = alloc_skb(NET_IP_ALIGN + skb->len + - padlen + len, GFP_ATOMIC); - if (!nskb) - return NULL; - skb_reserve(nskb, NET_IP_ALIGN); - - skb_reset_mac_header(nskb); - skb_set_network_header(nskb, - skb_network_header(skb) - skb->head); - skb_set_transport_header(nskb, - skb_transport_header(skb) - skb->head); - skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len)); - - /* Let skb_put_padto() free nskb, and let dsa_slave_xmit() free - * skb - */ - if (skb_put_padto(nskb, nskb->len + padlen)) - return NULL; - - consume_skb(skb); - } - - return nskb; -} - static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, struct net_device *dev, unsigned int port, unsigned int len) @@ -90,23 +50,18 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - struct sk_buff *nskb; u8 *tag; u8 *addr; - nskb = ksz_common_xmit(skb, dev, KSZ_INGRESS_TAG_LEN); - if (!nskb) - return NULL; - /* Tag encoding */ - tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN); - addr = skb_mac_header(nskb); + tag = skb_put(skb, KSZ_INGRESS_TAG_LEN); + addr = skb_mac_header(skb); *tag = 1 << dp->index; if (is_link_local_ether_addr(addr)) *tag |= KSZ8795_TAIL_TAG_OVERRIDE; - return nskb; + return skb; } static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev, @@ -156,18 +111,13 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - struct sk_buff *nskb; __be16 *tag; u8 *addr; u16 val; - nskb = ksz_common_xmit(skb, dev, KSZ9477_INGRESS_TAG_LEN); - if (!nskb) - return NULL; - /* Tag encoding */ - tag = skb_put(nskb, KSZ9477_INGRESS_TAG_LEN); - addr = skb_mac_header(nskb); + tag = 
skb_put(skb, KSZ9477_INGRESS_TAG_LEN); + addr = skb_mac_header(skb); val = BIT(dp->index); @@ -176,7 +126,7 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, *tag = cpu_to_be16(val); - return nskb; + return skb; } static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev, @@ -213,24 +163,19 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - struct sk_buff *nskb; u8 *addr; u8 *tag; - nskb = ksz_common_xmit(skb, dev, KSZ_INGRESS_TAG_LEN); - if (!nskb) - return NULL; - /* Tag encoding */ - tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN); - addr = skb_mac_header(nskb); + tag = skb_put(skb, KSZ_INGRESS_TAG_LEN); + addr = skb_mac_header(skb); *tag = BIT(dp->index); if (is_link_local_ether_addr(addr)) *tag |= KSZ9893_TAIL_TAG_OVERRIDE; - return nskb; + return skb; } static const struct dsa_device_ops ksz9893_netdev_ops = { diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index ccfb6f641bbf..aa1318dccaf0 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -58,15 +58,6 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) __be16 *lan9303_tag; u16 tag; - /* insert a special VLAN tag between the MAC addresses - * and the current ethertype field. - */ - if (skb_cow_head(skb, LAN9303_TAG_LEN) < 0) { - dev_dbg(&dev->dev, - "Cannot make room for the special tag. Dropping packet\n"); - return NULL; - } - /* provide 'LAN9303_TAG_LEN' bytes additional space */ skb_push(skb, LAN9303_TAG_LEN); diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 4cdd9cf428fb..38dcdded74c0 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -34,9 +34,6 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, * table with VID. 
*/ if (!skb_vlan_tagged(skb)) { - if (skb_cow_head(skb, MTK_HDR_LEN) < 0) - return NULL; - skb_push(skb, MTK_HDR_LEN); memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN); is_vlan_skb = false; diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 3b468aca5c53..743809b5806b 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -1,181 +1,74 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright 2019 NXP Semiconductors */ +#include <linux/dsa/ocelot.h> #include <soc/mscc/ocelot.h> -#include <linux/packing.h> #include "dsa_priv.h" -/* The CPU injection header and the CPU extraction header can have 3 types of - * prefixes: long, short and no prefix. The format of the header itself is the - * same in all 3 cases. - * - * Extraction with long prefix: - * - * +-------------------+-------------------+------+------+------------+-------+ - * | ff:ff:ff:ff:ff:ff | ff:ff:ff:ff:ff:ff | 8880 | 000a | extraction | frame | - * | | | | | header | | - * +-------------------+-------------------+------+------+------------+-------+ - * 48 bits 48 bits 16 bits 16 bits 128 bits - * - * Extraction with short prefix: - * - * +------+------+------------+-------+ - * | 8880 | 000a | extraction | frame | - * | | | header | | - * +------+------+------------+-------+ - * 16 bits 16 bits 128 bits - * - * Extraction with no prefix: - * - * +------------+-------+ - * | extraction | frame | - * | header | | - * +------------+-------+ - * 128 bits - * - * - * Injection with long prefix: - * - * +-------------------+-------------------+------+------+------------+-------+ - * | any dmac | any smac | 8880 | 000a | injection | frame | - * | | | | | header | | - * +-------------------+-------------------+------+------+------------+-------+ - * 48 bits 48 bits 16 bits 16 bits 128 bits - * - * Injection with short prefix: - * - * +------+------+------------+-------+ - * | 8880 | 000a | injection | frame | - * | | | header | | - * +------+------+------------+-------+ - * 16 bits 16 bits 
128 bits - * - * Injection with no prefix: - * - * +------------+-------+ - * | injection | frame | - * | header | | - * +------------+-------+ - * 128 bits - * - * The injection header looks like this (network byte order, bit 127 - * is part of lowest address byte in memory, bit 0 is part of highest - * address byte): - * - * +------+------+------+------+------+------+------+------+ - * 127:120 |BYPASS| MASQ | MASQ_PORT |REW_OP|REW_OP| - * +------+------+------+------+------+------+------+------+ - * 119:112 | REW_OP | - * +------+------+------+------+------+------+------+------+ - * 111:104 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 103: 96 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 95: 88 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 87: 80 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 79: 72 | RSV | - * +------+------+------+------+------+------+------+------+ - * 71: 64 | RSV | DEST | - * +------+------+------+------+------+------+------+------+ - * 63: 56 | DEST | - * +------+------+------+------+------+------+------+------+ - * 55: 48 | RSV | - * +------+------+------+------+------+------+------+------+ - * 47: 40 | RSV | SRC_PORT | RSV |TFRM_TIMER| - * +------+------+------+------+------+------+------+------+ - * 39: 32 | TFRM_TIMER | RSV | - * +------+------+------+------+------+------+------+------+ - * 31: 24 | RSV | DP | POP_CNT | CPUQ | - * +------+------+------+------+------+------+------+------+ - * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE| - * +------+------+------+------+------+------+------+------+ - * 15: 8 | PCP | DEI | VID | - * +------+------+------+------+------+------+------+------+ - * 7: 0 | VID | - * +------+------+------+------+------+------+------+------+ - * - * And the extraction header looks like this: - * - * +------+------+------+------+------+------+------+------+ - * 127:120 | RSV | REW_OP | - * 
+------+------+------+------+------+------+------+------+ - * 119:112 | REW_OP | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 111:104 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 103: 96 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 95: 88 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 87: 80 | REW_VAL | LLEN | - * +------+------+------+------+------+------+------+------+ - * 79: 72 | LLEN | WLEN | - * +------+------+------+------+------+------+------+------+ - * 71: 64 | WLEN | RSV | - * +------+------+------+------+------+------+------+------+ - * 63: 56 | RSV | - * +------+------+------+------+------+------+------+------+ - * 55: 48 | RSV | - * +------+------+------+------+------+------+------+------+ - * 47: 40 | RSV | SRC_PORT | ACL_ID | - * +------+------+------+------+------+------+------+------+ - * 39: 32 | ACL_ID | RSV | SFLOW_ID | - * +------+------+------+------+------+------+------+------+ - * 31: 24 |ACL_HIT| DP | LRN_FLAGS | CPUQ | - * +------+------+------+------+------+------+------+------+ - * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE| - * +------+------+------+------+------+------+------+------+ - * 15: 8 | PCP | DEI | VID | - * +------+------+------+------+------+------+------+------+ - * 7: 0 | VID | - * +------+------+------+------+------+------+------+------+ - */ +static void ocelot_xmit_ptp(struct dsa_port *dp, void *injection, + struct sk_buff *clone) +{ + struct ocelot *ocelot = dp->ds->priv; + struct ocelot_port *ocelot_port; + u64 rew_op; -static struct sk_buff *ocelot_xmit(struct sk_buff *skb, - struct net_device *netdev) + ocelot_port = ocelot->ports[dp->index]; + rew_op = ocelot_port->ptp_cmd; + + /* Retrieve timestamp ID populated inside skb->cb[0] of the + * clone by ocelot_port_add_txtstamp_skb + */ + if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) + rew_op |= clone->cb[0] << 3; + + 
ocelot_ifh_set_rew_op(injection, rew_op); +} + +static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, + __be32 ifh_prefix, void **ifh) { struct dsa_port *dp = dsa_slave_to_port(netdev); struct sk_buff *clone = DSA_SKB_CB(skb)->clone; struct dsa_switch *ds = dp->ds; - struct ocelot *ocelot = ds->priv; - struct ocelot_port *ocelot_port; - u8 *prefix, *injection; - u64 qos_class, rew_op; - int err; + void *injection; + __be32 *prefix; - err = skb_cow_head(skb, OCELOT_TOTAL_TAG_LEN); - if (unlikely(err < 0)) { - netdev_err(netdev, "Cannot make room for tag.\n"); - return NULL; - } + injection = skb_push(skb, OCELOT_TAG_LEN); + prefix = skb_push(skb, OCELOT_SHORT_PREFIX_LEN); - ocelot_port = ocelot->ports[dp->index]; + *prefix = ifh_prefix; + memset(injection, 0, OCELOT_TAG_LEN); + ocelot_ifh_set_bypass(injection, 1); + ocelot_ifh_set_src(injection, ds->num_ports); + ocelot_ifh_set_qos_class(injection, skb->priority); - injection = skb_push(skb, OCELOT_TAG_LEN); + /* TX timestamping was requested */ + if (clone) + ocelot_xmit_ptp(dp, injection, clone); - prefix = skb_push(skb, OCELOT_SHORT_PREFIX_LEN); + *ifh = injection; +} + +static struct sk_buff *ocelot_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct dsa_port *dp = dsa_slave_to_port(netdev); + void *injection; - memcpy(prefix, ocelot_port->xmit_template, OCELOT_TOTAL_TAG_LEN); + ocelot_xmit_common(skb, netdev, cpu_to_be32(0x8880000a), &injection); + ocelot_ifh_set_dest(injection, BIT_ULL(dp->index)); - /* Fix up the fields which are not statically determined - * in the template - */ - qos_class = skb->priority; - packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0); + return skb; +} - /* TX timestamping was requested */ - if (clone) { - rew_op = ocelot_port->ptp_cmd; - /* Retrieve timestamp ID populated inside skb->cb[0] of the - * clone by ocelot_port_add_txtstamp_skb - */ - if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) - rew_op |= clone->cb[0] << 3; 
+static struct sk_buff *seville_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct dsa_port *dp = dsa_slave_to_port(netdev); + void *injection; - packing(injection, &rew_op, 125, 117, OCELOT_TAG_LEN, PACK, 0); - } + ocelot_xmit_common(skb, netdev, cpu_to_be32(0x88800005), &injection); + seville_ifh_set_dest(injection, BIT_ULL(dp->index)); return skb; } @@ -184,14 +77,13 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) { - struct dsa_port *cpu_dp = netdev->dsa_ptr; - struct dsa_switch *ds = cpu_dp->ds; - struct ocelot *ocelot = ds->priv; u64 src_port, qos_class; u64 vlan_tci, tag_type; u8 *start = skb->data; + struct dsa_port *dp; u8 *extraction; u16 vlan_tpid; + u64 cpuq; /* Revert skb->data by the amount consumed by the DSA master, * so it points to the beginning of the frame. @@ -217,10 +109,11 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, /* Remove from inet csum the extraction header */ skb_postpull_rcsum(skb, start, OCELOT_TOTAL_TAG_LEN); - packing(extraction, &src_port, 46, 43, OCELOT_TAG_LEN, UNPACK, 0); - packing(extraction, &qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0); - packing(extraction, &tag_type, 16, 16, OCELOT_TAG_LEN, UNPACK, 0); - packing(extraction, &vlan_tci, 15, 0, OCELOT_TAG_LEN, UNPACK, 0); + ocelot_xfh_get_src_port(extraction, &src_port); + ocelot_xfh_get_qos_class(extraction, &qos_class); + ocelot_xfh_get_tag_type(extraction, &tag_type); + ocelot_xfh_get_vlan_tci(extraction, &vlan_tci); + ocelot_xfh_get_cpuq(extraction, &cpuq); skb->dev = dsa_master_find_slave(netdev, 0, src_port); if (!skb->dev) @@ -235,6 +128,12 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, skb->offload_fwd_mark = 1; skb->priority = qos_class; +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + if (eth_hdr(skb)->h_proto == cpu_to_be16(ETH_P_MRP) && + cpuq & BIT(OCELOT_MRP_CPUQ)) + skb->offload_fwd_mark = 0; +#endif + /* Ocelot switches copy frames unmodified to the CPU. 
However, it is * possible for the user to request a VLAN modification through * VCAP_IS1_ACT_VID_REPLACE_ENA. In this case, what will happen is that @@ -250,9 +149,10 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, * equal to the pvid of the ingress port and should not be used for * processing. */ + dp = dsa_slave_to_port(skb->dev); vlan_tpid = tag_type ? ETH_P_8021AD : ETH_P_8021Q; - if (ocelot->ports[src_port]->vlan_aware && + if (dsa_port_is_vlan_filtering(dp) && eth_hdr(skb)->h_proto == htons(vlan_tpid)) { u16 dummy_vlan_tci; @@ -274,7 +174,26 @@ static const struct dsa_device_ops ocelot_netdev_ops = { .promisc_on_master = true, }; -MODULE_LICENSE("GPL v2"); +DSA_TAG_DRIVER(ocelot_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT); -module_dsa_tag_driver(ocelot_netdev_ops); +static const struct dsa_device_ops seville_netdev_ops = { + .name = "seville", + .proto = DSA_TAG_PROTO_SEVILLE, + .xmit = seville_xmit, + .rcv = ocelot_rcv, + .overhead = OCELOT_TOTAL_TAG_LEN, + .promisc_on_master = true, +}; + +DSA_TAG_DRIVER(seville_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SEVILLE); + +static struct dsa_tag_driver *ocelot_tag_driver_array[] = { + &DSA_TAG_DRIVER_NAME(ocelot_netdev_ops), + &DSA_TAG_DRIVER_NAME(seville_netdev_ops), +}; + +module_dsa_tag_drivers(ocelot_tag_driver_array); + +MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c new file mode 100644 index 000000000000..5f3e8e124a82 --- /dev/null +++ b/net/dsa/tag_ocelot_8021q.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2020-2021 NXP Semiconductors + * + * An implementation of the software-defined tag_8021q.c tagger format, which + * also preserves full functionality under a vlan_filtering bridge. 
It does + * this by using the TCAM engines for: + * - pushing the RX VLAN as a second, outer tag, on egress towards the CPU port + * - redirecting towards the correct front port based on TX VLAN and popping + * that on egress + */ +#include <linux/dsa/8021q.h> +#include <soc/mscc/ocelot.h> +#include <soc/mscc/ocelot_ptp.h> +#include "dsa_priv.h" + +static struct sk_buff *ocelot_xmit_ptp(struct dsa_port *dp, + struct sk_buff *skb, + struct sk_buff *clone) +{ + struct ocelot *ocelot = dp->ds->priv; + struct ocelot_port *ocelot_port; + int port = dp->index; + u32 rew_op; + + if (!ocelot_can_inject(ocelot, 0)) + return NULL; + + ocelot_port = ocelot->ports[port]; + rew_op = ocelot_port->ptp_cmd; + + /* Retrieve timestamp ID populated inside skb->cb[0] of the + * clone by ocelot_port_add_txtstamp_skb + */ + if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) + rew_op |= clone->cb[0] << 3; + + ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); + + return NULL; +} + +static struct sk_buff *ocelot_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct dsa_port *dp = dsa_slave_to_port(netdev); + u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); + u16 queue_mapping = skb_get_queue_mapping(skb); + u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); + struct sk_buff *clone = DSA_SKB_CB(skb)->clone; + + /* TX timestamping was requested, so inject through MMIO */ + if (clone) + return ocelot_xmit_ptp(dp, skb, clone); + + return dsa_8021q_xmit(skb, netdev, ETH_P_8021Q, + ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); +} + +static struct sk_buff *ocelot_rcv(struct sk_buff *skb, + struct net_device *netdev, + struct packet_type *pt) +{ + int src_port, switch_id, qos_class; + u16 vid, tci; + + skb_push_rcsum(skb, ETH_HLEN); + if (skb_vlan_tag_present(skb)) { + tci = skb_vlan_tag_get(skb); + __vlan_hwaccel_clear_tag(skb); + } else { + __skb_vlan_pop(skb, &tci); + } + skb_pull_rcsum(skb, ETH_HLEN); + + vid = tci & VLAN_VID_MASK; + src_port = dsa_8021q_rx_source_port(vid); + 
switch_id = dsa_8021q_rx_switch_id(vid); + qos_class = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + + skb->dev = dsa_master_find_slave(netdev, switch_id, src_port); + if (!skb->dev) + return NULL; + + skb->offload_fwd_mark = 1; + skb->priority = qos_class; + + return skb; +} + +static const struct dsa_device_ops ocelot_8021q_netdev_ops = { + .name = "ocelot-8021q", + .proto = DSA_TAG_PROTO_OCELOT_8021Q, + .xmit = ocelot_xmit, + .rcv = ocelot_rcv, + .overhead = VLAN_HLEN, + .promisc_on_master = true, +}; + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT_8021Q); + +module_dsa_tag_driver(ocelot_8021q_netdev_ops); diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 1b9e8507112b..88181b52f480 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -34,9 +34,6 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) __be16 *phdr; u16 hdr; - if (skb_cow_head(skb, QCA_HDR_LEN) < 0) - return NULL; - skb_push(skb, QCA_HDR_LEN); memmove(skb->data, skb->data + QCA_HDR_LEN, 2 * ETH_ALEN); diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c index 2646abe5a69e..c17d39b4a1a0 100644 --- a/net/dsa/tag_rtl4_a.c +++ b/net/dsa/tag_rtl4_a.c @@ -12,9 +12,7 @@ * * The 2 bytes tag form a 16 bit big endian word. The exact * meaning has been guessed from packet dumps from ingress - * frames, as no working egress traffic has been available - * we do not know the format of the egress tags or if they - * are even supported. + * frames. */ #include <linux/etherdevice.h> @@ -36,17 +34,34 @@ static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb, struct net_device *dev) { - /* - * Just let it pass thru, we don't know if it is possible - * to tag a frame with the 0x8899 ethertype and direct it - * to a specific port, all attempts at reverse-engineering have - * ended up with the frames getting dropped. - * - * The VLAN set-up needs to restrict the frames to the right port. 
- * - * If you have documentation on the tagging format for RTL8366RB - * (tag type A) then please contribute. - */ + struct dsa_port *dp = dsa_slave_to_port(dev); + u8 *tag; + u16 *p; + u16 out; + + /* Pad out to at least 60 bytes */ + if (unlikely(eth_skb_pad(skb))) + return NULL; + if (skb_cow_head(skb, RTL4_A_HDR_LEN) < 0) + return NULL; + + netdev_dbg(dev, "add realtek tag to package to port %d\n", + dp->index); + skb_push(skb, RTL4_A_HDR_LEN); + + memmove(skb->data, skb->data + RTL4_A_HDR_LEN, 2 * ETH_ALEN); + tag = skb->data + 2 * ETH_ALEN; + + /* Set Ethertype */ + p = (u16 *)tag; + *p = htons(RTL4_A_ETHERTYPE); + + out = (RTL4_A_PROTOCOL_RTL8366RB << 12) | (2 << 8); + /* The lower bits is the port numer */ + out |= (u8)dp->index; + p = (u16 *)(tag + 2); + *p = htons(out); + return skb; } diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 3a1cc24a4f0a..5b97ede56a0f 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -13,42 +13,15 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - struct sk_buff *nskb; - int padlen; u8 *trailer; - /* - * We have to make sure that the trailer ends up as the very - * last 4 bytes of the packet. This means that we have to pad - * the packet to the minimum ethernet frame size, if necessary, - * before adding the trailer. 
- */ - padlen = 0; - if (skb->len < 60) - padlen = 60 - skb->len; - - nskb = alloc_skb(NET_IP_ALIGN + skb->len + padlen + 4, GFP_ATOMIC); - if (!nskb) - return NULL; - skb_reserve(nskb, NET_IP_ALIGN); - - skb_reset_mac_header(nskb); - skb_set_network_header(nskb, skb_network_header(skb) - skb->head); - skb_set_transport_header(nskb, skb_transport_header(skb) - skb->head); - skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len)); - consume_skb(skb); - - if (padlen) { - skb_put_zero(nskb, padlen); - } - - trailer = skb_put(nskb, 4); + trailer = skb_put(skb, 4); trailer[0] = 0x80; trailer[1] = 1 << dp->index; trailer[2] = 0x10; trailer[3] = 0x00; - return nskb; + return skb; } static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c new file mode 100644 index 000000000000..858cdf9d2913 --- /dev/null +++ b/net/dsa/tag_xrs700x.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * XRS700x tag format handling + * Copyright (c) 2008-2009 Marvell Semiconductor + * Copyright (c) 2020 NovaTech LLC + */ + +#include <linux/bitops.h> + +#include "dsa_priv.h" + +static struct sk_buff *xrs700x_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_port *partner, *dp = dsa_slave_to_port(dev); + u8 *trailer; + + trailer = skb_put(skb, 1); + trailer[0] = BIT(dp->index); + + if (dp->hsr_dev) + dsa_hsr_foreach_port(partner, dp->ds, dp->hsr_dev) + if (partner != dp) + trailer[0] |= BIT(partner->index); + + return skb; +} + +static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + int source_port; + u8 *trailer; + + trailer = skb_tail_pointer(skb) - 1; + + source_port = ffs((int)trailer[0]) - 1; + + if (source_port < 0) + return NULL; + + skb->dev = dsa_master_find_slave(dev, 0, source_port); + if (!skb->dev) + return NULL; + + if (pskb_trim_rcsum(skb, skb->len - 1)) + return NULL; + + /* Frame is forwarded by hardware, don't forward 
in software. */ + skb->offload_fwd_mark = 1; + + return skb; +} + +static const struct dsa_device_ops xrs700x_netdev_ops = { + .name = "xrs700x", + .proto = DSA_TAG_PROTO_XRS700X, + .xmit = xrs700x_xmit, + .rcv = xrs700x_rcv, + .overhead = 1, + .tail_tag = true, +}; + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_XRS700X); + +module_dsa_tag_driver(xrs700x_netdev_ops); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index dac65180c4ef..4106373180c6 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -272,7 +272,7 @@ void eth_header_cache_update(struct hh_cache *hh, EXPORT_SYMBOL(eth_header_cache_update); /** - * eth_header_parser_protocol - extract protocol from L2 header + * eth_header_parse_protocol - extract protocol from L2 header * @skb: packet to extract protocol from */ __be16 eth_header_parse_protocol(const struct sk_buff *skb) @@ -523,8 +523,8 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) EXPORT_SYMBOL(eth_platform_get_mac_address); /** - * Obtain the MAC address from an nvmem cell named 'mac-address' associated - * with given device. + * nvmem_get_mac_address - Obtain the MAC address from an nvmem cell named + * 'mac-address' associated with given device. * * @dev: Device with which the mac-address cell is associated. * @addrbuf: Buffer to which the MAC address will be copied on success. 
diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c index 1fb3603d92ad..0515d6604b3b 100644 --- a/net/ethtool/bitset.c +++ b/net/ethtool/bitset.c @@ -628,6 +628,8 @@ int ethnl_parse_bitset(unsigned long *val, unsigned long *mask, return ret; change_bits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); + if (change_bits > nbits) + change_bits = nbits; bitmap_from_arr32(val, nla_data(tb[ETHTOOL_A_BITSET_VALUE]), change_bits); if (change_bits < nbits) diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 5635604cb9ba..25a9e566ef5c 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -194,8 +194,9 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) if (netif_is_rxfh_configured(dev) && !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) && (channels.combined_count + channels.rx_count) <= max_rx_in_use) { + ret = -EINVAL; GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing indirection table settings"); - return -EINVAL; + goto out_ops; } /* Disabling channels, query zero-copy AF_XDP sockets */ @@ -203,8 +204,9 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) min(channels.rx_count, channels.tx_count); for (i = from_channel; i < old_total; i++) if (xsk_get_pool_from_qid(dev, i)) { + ret = -EINVAL; GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets"); - return -EINVAL; + goto out_ops; } ret = dev->ethtool_ops->set_channels(dev, &channels); diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 24036e3055a1..c6a383dfd6c2 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -68,6 +68,11 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list", [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload", + [NETIF_F_GRO_UDP_FWD_BIT] = "rx-udp-gro-forwarding", + [NETIF_F_HW_HSR_TAG_INS_BIT] = "hsr-tag-ins-offload", + 
[NETIF_F_HW_HSR_TAG_RM_BIT] = "hsr-tag-rm-offload", + [NETIF_F_HW_HSR_FWD_BIT] = "hsr-fwd-offload", + [NETIF_F_HW_HSR_DUP_BIT] = "hsr-dup-offload", }; const char @@ -197,6 +202,153 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); +#define __LINK_MODE_LANES_CR 1 +#define __LINK_MODE_LANES_CR2 2 +#define __LINK_MODE_LANES_CR4 4 +#define __LINK_MODE_LANES_CR8 8 +#define __LINK_MODE_LANES_DR 1 +#define __LINK_MODE_LANES_DR2 2 +#define __LINK_MODE_LANES_DR4 4 +#define __LINK_MODE_LANES_DR8 8 +#define __LINK_MODE_LANES_KR 1 +#define __LINK_MODE_LANES_KR2 2 +#define __LINK_MODE_LANES_KR4 4 +#define __LINK_MODE_LANES_KR8 8 +#define __LINK_MODE_LANES_SR 1 +#define __LINK_MODE_LANES_SR2 2 +#define __LINK_MODE_LANES_SR4 4 +#define __LINK_MODE_LANES_SR8 8 +#define __LINK_MODE_LANES_ER 1 +#define __LINK_MODE_LANES_KX 1 +#define __LINK_MODE_LANES_KX4 4 +#define __LINK_MODE_LANES_LR 1 +#define __LINK_MODE_LANES_LR4 4 +#define __LINK_MODE_LANES_LR4_ER4 4 +#define __LINK_MODE_LANES_LR_ER_FR 1 +#define __LINK_MODE_LANES_LR2_ER2_FR2 2 +#define __LINK_MODE_LANES_LR4_ER4_FR4 4 +#define __LINK_MODE_LANES_LR8_ER8_FR8 8 +#define __LINK_MODE_LANES_LRM 1 +#define __LINK_MODE_LANES_MLD2 2 +#define __LINK_MODE_LANES_T 1 +#define __LINK_MODE_LANES_T1 1 +#define __LINK_MODE_LANES_X 1 +#define __LINK_MODE_LANES_FX 1 + +#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \ + [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ + .speed = SPEED_ ## _speed, \ + .lanes = __LINK_MODE_LANES_ ## _type, \ + .duplex = __DUPLEX_ ## _duplex \ + } +#define __DUPLEX_Half DUPLEX_HALF +#define __DUPLEX_Full DUPLEX_FULL +#define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \ + [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = { \ + .speed = SPEED_UNKNOWN, \ + .lanes = 0, \ + .duplex = DUPLEX_UNKNOWN, \ + } + +const struct link_mode_info link_mode_params[] = { + __DEFINE_LINK_MODE_PARAMS(10, T, Half), + __DEFINE_LINK_MODE_PARAMS(10, T, Full), + 
__DEFINE_LINK_MODE_PARAMS(100, T, Half), + __DEFINE_LINK_MODE_PARAMS(100, T, Full), + __DEFINE_LINK_MODE_PARAMS(1000, T, Half), + __DEFINE_LINK_MODE_PARAMS(1000, T, Full), + __DEFINE_SPECIAL_MODE_PARAMS(Autoneg), + __DEFINE_SPECIAL_MODE_PARAMS(TP), + __DEFINE_SPECIAL_MODE_PARAMS(AUI), + __DEFINE_SPECIAL_MODE_PARAMS(MII), + __DEFINE_SPECIAL_MODE_PARAMS(FIBRE), + __DEFINE_SPECIAL_MODE_PARAMS(BNC), + __DEFINE_LINK_MODE_PARAMS(10000, T, Full), + __DEFINE_SPECIAL_MODE_PARAMS(Pause), + __DEFINE_SPECIAL_MODE_PARAMS(Asym_Pause), + __DEFINE_LINK_MODE_PARAMS(2500, X, Full), + __DEFINE_SPECIAL_MODE_PARAMS(Backplane), + __DEFINE_LINK_MODE_PARAMS(1000, KX, Full), + __DEFINE_LINK_MODE_PARAMS(10000, KX4, Full), + __DEFINE_LINK_MODE_PARAMS(10000, KR, Full), + [ETHTOOL_LINK_MODE_10000baseR_FEC_BIT] = { + .speed = SPEED_10000, + .duplex = DUPLEX_FULL, + }, + __DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full), + __DEFINE_LINK_MODE_PARAMS(20000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(40000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(40000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(40000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(40000, LR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, LR4, Full), + __DEFINE_LINK_MODE_PARAMS(25000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(25000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(25000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(50000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR4_ER4, Full), + __DEFINE_LINK_MODE_PARAMS(50000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(1000, X, Full), + __DEFINE_LINK_MODE_PARAMS(10000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(10000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(10000, LR, Full), + 
__DEFINE_LINK_MODE_PARAMS(10000, LRM, Full), + __DEFINE_LINK_MODE_PARAMS(10000, ER, Full), + __DEFINE_LINK_MODE_PARAMS(2500, T, Full), + __DEFINE_LINK_MODE_PARAMS(5000, T, Full), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_NONE), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_RS), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_BASER), + __DEFINE_LINK_MODE_PARAMS(50000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, DR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, DR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(100, T1, Full), + __DEFINE_LINK_MODE_PARAMS(1000, T1, Full), + __DEFINE_LINK_MODE_PARAMS(400000, KR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, SR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS), + __DEFINE_LINK_MODE_PARAMS(100000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, DR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full), 
+ __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(100, FX, Half), + __DEFINE_LINK_MODE_PARAMS(100, FX, Full), +}; +static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS); + const char netif_msg_class_names[][ETH_GSTRING_LEN] = { [NETIF_MSG_DRV_BIT] = "drv", [NETIF_MSG_PROBE_BIT] = "probe", diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 3d9251c95a8b..a9d071248698 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -14,6 +14,12 @@ #define __SOF_TIMESTAMPING_CNT (const_ilog2(SOF_TIMESTAMPING_LAST) + 1) +struct link_mode_info { + int speed; + u8 lanes; + u8 duplex; +}; + extern const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]; extern const char @@ -23,6 +29,7 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN]; extern const char phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN]; extern const char link_mode_names[][ETH_GSTRING_LEN]; +extern const struct link_mode_info link_mode_params[]; extern const char netif_msg_class_names[][ETH_GSTRING_LEN]; extern const char wol_mode_names[][ETH_GSTRING_LEN]; extern const char sof_timestamping_names[][ETH_GSTRING_LEN]; diff --git a/net/ethtool/features.c b/net/ethtool/features.c index 8ee4cdbd6b82..1c9f4df273bd 100644 --- a/net/ethtool/features.c +++ b/net/ethtool/features.c @@ -280,7 +280,7 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) active_diff_mask, compact); } if (mod) - ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF, NULL); + netdev_features_change(dev); out_rtnl: rtnl_unlock(); diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index ec2cd7aab5ad..24783b71c584 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -426,13 +426,29 @@ struct ethtool_link_usettings { int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings 
*link_ksettings) { + const struct link_mode_info *link_info; + int err; + ASSERT_RTNL(); if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; memset(link_ksettings, 0, sizeof(*link_ksettings)); - return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings); + + link_ksettings->link_mode = -1; + err = dev->ethtool_ops->get_link_ksettings(dev, link_ksettings); + if (err) + return err; + + if (link_ksettings->link_mode != -1) { + link_info = &link_mode_params[link_ksettings->link_mode]; + link_ksettings->base.speed = link_info->speed; + link_ksettings->lanes = link_info->lanes; + link_ksettings->base.duplex = link_info->duplex; + } + + return 0; } EXPORT_SYMBOL(__ethtool_get_link_ksettings); @@ -2433,7 +2449,7 @@ static int noinline_for_stack ethtool_set_per_queue(struct net_device *dev, return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt); default: return -EOPNOTSUPP; - }; + } } static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna) diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index c5bcb9abc8b9..f9eda596f301 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -4,6 +4,8 @@ #include "common.h" #include "bitset.h" +/* LINKMODES_GET */ + struct linkmodes_req_info { struct ethnl_req_info base; }; @@ -43,6 +45,9 @@ static int linkmodes_prepare_data(const struct ethnl_req_info *req_base, goto out; } + if (!dev->ethtool_ops->cap_link_lanes_supported) + data->ksettings.lanes = 0; + data->peer_empty = bitmap_empty(data->ksettings.link_modes.lp_advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -63,6 +68,7 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base, len = nla_total_size(sizeof(u8)) /* LINKMODES_AUTONEG */ + nla_total_size(sizeof(u32)) /* LINKMODES_SPEED */ + + nla_total_size(sizeof(u32)) /* LINKMODES_LANES */ + nla_total_size(sizeof(u8)) /* LINKMODES_DUPLEX */ + 0; ret = ethnl_bitset_size(ksettings->link_modes.advertising, @@ -123,6 +129,10 @@ static int 
linkmodes_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_LINKMODES_DUPLEX, lsettings->duplex)) return -EMSGSIZE; + if (ksettings->lanes && + nla_put_u32(skb, ETHTOOL_A_LINKMODES_LANES, ksettings->lanes)) + return -EMSGSIZE; + if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED && nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, lsettings->master_slave_cfg)) @@ -150,122 +160,6 @@ const struct ethnl_request_ops ethnl_linkmodes_request_ops = { /* LINKMODES_SET */ -struct link_mode_info { - int speed; - u8 duplex; -}; - -#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \ - [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ - .speed = SPEED_ ## _speed, \ - .duplex = __DUPLEX_ ## _duplex \ - } -#define __DUPLEX_Half DUPLEX_HALF -#define __DUPLEX_Full DUPLEX_FULL -#define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \ - [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = { \ - .speed = SPEED_UNKNOWN, \ - .duplex = DUPLEX_UNKNOWN, \ - } - -static const struct link_mode_info link_mode_params[] = { - __DEFINE_LINK_MODE_PARAMS(10, T, Half), - __DEFINE_LINK_MODE_PARAMS(10, T, Full), - __DEFINE_LINK_MODE_PARAMS(100, T, Half), - __DEFINE_LINK_MODE_PARAMS(100, T, Full), - __DEFINE_LINK_MODE_PARAMS(1000, T, Half), - __DEFINE_LINK_MODE_PARAMS(1000, T, Full), - __DEFINE_SPECIAL_MODE_PARAMS(Autoneg), - __DEFINE_SPECIAL_MODE_PARAMS(TP), - __DEFINE_SPECIAL_MODE_PARAMS(AUI), - __DEFINE_SPECIAL_MODE_PARAMS(MII), - __DEFINE_SPECIAL_MODE_PARAMS(FIBRE), - __DEFINE_SPECIAL_MODE_PARAMS(BNC), - __DEFINE_LINK_MODE_PARAMS(10000, T, Full), - __DEFINE_SPECIAL_MODE_PARAMS(Pause), - __DEFINE_SPECIAL_MODE_PARAMS(Asym_Pause), - __DEFINE_LINK_MODE_PARAMS(2500, X, Full), - __DEFINE_SPECIAL_MODE_PARAMS(Backplane), - __DEFINE_LINK_MODE_PARAMS(1000, KX, Full), - __DEFINE_LINK_MODE_PARAMS(10000, KX4, Full), - __DEFINE_LINK_MODE_PARAMS(10000, KR, Full), - [ETHTOOL_LINK_MODE_10000baseR_FEC_BIT] = { - .speed = SPEED_10000, - .duplex = DUPLEX_FULL, - }, - __DEFINE_LINK_MODE_PARAMS(20000, MLD2, 
Full), - __DEFINE_LINK_MODE_PARAMS(20000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(40000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(40000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(40000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(40000, LR4, Full), - __DEFINE_LINK_MODE_PARAMS(56000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(56000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(56000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(56000, LR4, Full), - __DEFINE_LINK_MODE_PARAMS(25000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(25000, KR, Full), - __DEFINE_LINK_MODE_PARAMS(25000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, CR2, Full), - __DEFINE_LINK_MODE_PARAMS(50000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(100000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(100000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(100000, LR4_ER4, Full), - __DEFINE_LINK_MODE_PARAMS(50000, SR2, Full), - __DEFINE_LINK_MODE_PARAMS(1000, X, Full), - __DEFINE_LINK_MODE_PARAMS(10000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(10000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(10000, LR, Full), - __DEFINE_LINK_MODE_PARAMS(10000, LRM, Full), - __DEFINE_LINK_MODE_PARAMS(10000, ER, Full), - __DEFINE_LINK_MODE_PARAMS(2500, T, Full), - __DEFINE_LINK_MODE_PARAMS(5000, T, Full), - __DEFINE_SPECIAL_MODE_PARAMS(FEC_NONE), - __DEFINE_SPECIAL_MODE_PARAMS(FEC_RS), - __DEFINE_SPECIAL_MODE_PARAMS(FEC_BASER), - __DEFINE_LINK_MODE_PARAMS(50000, KR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, LR_ER_FR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, DR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, SR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, CR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, LR2_ER2_FR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, DR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(200000, SR4, Full), - 
__DEFINE_LINK_MODE_PARAMS(200000, LR4_ER4_FR4, Full), - __DEFINE_LINK_MODE_PARAMS(200000, DR4, Full), - __DEFINE_LINK_MODE_PARAMS(200000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(100, T1, Full), - __DEFINE_LINK_MODE_PARAMS(1000, T1, Full), - __DEFINE_LINK_MODE_PARAMS(400000, KR8, Full), - __DEFINE_LINK_MODE_PARAMS(400000, SR8, Full), - __DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full), - __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full), - __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full), - __DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS), - __DEFINE_LINK_MODE_PARAMS(100000, KR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, LR_ER_FR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, DR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, LR2_ER2_FR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full), - __DEFINE_LINK_MODE_PARAMS(400000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full), - __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full), - __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(100, FX, Half), - __DEFINE_LINK_MODE_PARAMS(100, FX, Full), -}; - const struct nla_policy ethnl_linkmodes_set_policy[] = { [ETHTOOL_A_LINKMODES_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), @@ -274,25 +168,23 @@ const struct nla_policy ethnl_linkmodes_set_policy[] = { [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_U32 }, [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_U8 }, [ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKMODES_LANES] = NLA_POLICY_RANGE(NLA_U32, 1, 8), }; -/* Set advertised link modes to all supported modes matching requested speed - * and duplex values. Called when autonegotiation is on, speed or duplex is - * requested but no link mode change. 
This is done in userspace with ioctl() - * interface, move it into kernel for netlink. +/* Set advertised link modes to all supported modes matching requested speed, + * lanes and duplex values. Called when autonegotiation is on, speed, lanes or + * duplex is requested but no link mode change. This is done in userspace with + * ioctl() interface, move it into kernel for netlink. * Returns true if advertised modes bitmap was modified. */ static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings, - bool req_speed, bool req_duplex) + bool req_speed, bool req_lanes, bool req_duplex) { unsigned long *advertising = ksettings->link_modes.advertising; unsigned long *supported = ksettings->link_modes.supported; DECLARE_BITMAP(old_adv, __ETHTOOL_LINK_MODE_MASK_NBITS); unsigned int i; - BUILD_BUG_ON(ARRAY_SIZE(link_mode_params) != - __ETHTOOL_LINK_MODE_MASK_NBITS); - bitmap_copy(old_adv, advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); for (i = 0; i < __ETHTOOL_LINK_MODE_MASK_NBITS; i++) { @@ -302,6 +194,7 @@ static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings, continue; if (test_bit(i, supported) && (!req_speed || info->speed == ksettings->base.speed) && + (!req_lanes || info->lanes == ksettings->lanes) && (!req_duplex || info->duplex == ksettings->base.duplex)) set_bit(i, advertising); else @@ -325,38 +218,72 @@ static bool ethnl_validate_master_slave_cfg(u8 cfg) return false; } +static int ethnl_check_linkmodes(struct genl_info *info, struct nlattr **tb) +{ + const struct nlattr *master_slave_cfg, *lanes_cfg; + + master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG]; + if (master_slave_cfg && + !ethnl_validate_master_slave_cfg(nla_get_u8(master_slave_cfg))) { + NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, + "master/slave value is invalid"); + return -EOPNOTSUPP; + } + + lanes_cfg = tb[ETHTOOL_A_LINKMODES_LANES]; + if (lanes_cfg && !is_power_of_2(nla_get_u32(lanes_cfg))) { + NL_SET_ERR_MSG_ATTR(info->extack, lanes_cfg, + 
"lanes value is invalid"); + return -EINVAL; + } + + return 0; +} + static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, struct ethtool_link_ksettings *ksettings, - bool *mod) + bool *mod, const struct net_device *dev) { struct ethtool_link_settings *lsettings = &ksettings->base; - bool req_speed, req_duplex; - const struct nlattr *master_slave_cfg; + bool req_speed, req_lanes, req_duplex; + const struct nlattr *master_slave_cfg, *lanes_cfg; int ret; master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG]; if (master_slave_cfg) { - u8 cfg = nla_get_u8(master_slave_cfg); - if (lsettings->master_slave_cfg == MASTER_SLAVE_CFG_UNSUPPORTED) { NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, "master/slave configuration not supported by device"); return -EOPNOTSUPP; } - - if (!ethnl_validate_master_slave_cfg(cfg)) { - NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, - "master/slave value is invalid"); - return -EOPNOTSUPP; - } } *mod = false; req_speed = tb[ETHTOOL_A_LINKMODES_SPEED]; + req_lanes = tb[ETHTOOL_A_LINKMODES_LANES]; req_duplex = tb[ETHTOOL_A_LINKMODES_DUPLEX]; ethnl_update_u8(&lsettings->autoneg, tb[ETHTOOL_A_LINKMODES_AUTONEG], mod); + + lanes_cfg = tb[ETHTOOL_A_LINKMODES_LANES]; + if (lanes_cfg) { + /* If autoneg is off and lanes parameter is not supported by the + * driver, return an error. + */ + if (!lsettings->autoneg && + !dev->ethtool_ops->cap_link_lanes_supported) { + NL_SET_ERR_MSG_ATTR(info->extack, lanes_cfg, + "lanes configuration not supported by device"); + return -EOPNOTSUPP; + } + } else if (!lsettings->autoneg) { + /* If autoneg is off and lanes parameter is not passed from user, + * set the lanes parameter to 0. 
+ */ + ksettings->lanes = 0; + } + ret = ethnl_update_bitset(ksettings->link_modes.advertising, __ETHTOOL_LINK_MODE_MASK_NBITS, tb[ETHTOOL_A_LINKMODES_OURS], link_mode_names, @@ -365,13 +292,14 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, return ret; ethnl_update_u32(&lsettings->speed, tb[ETHTOOL_A_LINKMODES_SPEED], mod); + ethnl_update_u32(&ksettings->lanes, lanes_cfg, mod); ethnl_update_u8(&lsettings->duplex, tb[ETHTOOL_A_LINKMODES_DUPLEX], mod); ethnl_update_u8(&lsettings->master_slave_cfg, master_slave_cfg, mod); if (!tb[ETHTOOL_A_LINKMODES_OURS] && lsettings->autoneg && - (req_speed || req_duplex) && - ethnl_auto_linkmodes(ksettings, req_speed, req_duplex)) + (req_speed || req_lanes || req_duplex) && + ethnl_auto_linkmodes(ksettings, req_speed, req_lanes, req_duplex)) *mod = true; return 0; @@ -386,6 +314,10 @@ int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info) bool mod = false; int ret; + ret = ethnl_check_linkmodes(info, tb); + if (ret < 0) + return ret; + ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_LINKMODES_HEADER], genl_info_net(info), info->extack, @@ -409,7 +341,7 @@ int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info) goto out_ops; } - ret = ethnl_update_linkmodes(info, tb, &ksettings, &mod); + ret = ethnl_update_linkmodes(info, tb, &ksettings, &mod, dev); if (ret < 0) goto out_ops; diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index d8efec516d86..6eabd58d81bf 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -351,7 +351,7 @@ extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_O extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1]; extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1]; extern const struct nla_policy ethnl_linkmodes_get_policy[ETHTOOL_A_LINKMODES_HEADER + 1]; -extern const struct nla_policy 
ethnl_linkmodes_set_policy[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG + 1]; +extern const struct nla_policy ethnl_linkmodes_set_policy[ETHTOOL_A_LINKMODES_LANES + 1]; extern const struct nla_policy ethnl_linkstate_get_policy[ETHTOOL_A_LINKSTATE_HEADER + 1]; extern const struct nla_policy ethnl_debug_get_policy[ETHTOOL_A_DEBUG_HEADER + 1]; extern const struct nla_policy ethnl_debug_set_policy[ETHTOOL_A_DEBUG_MSGMASK + 1]; diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 0baad0ce1832..c3a5489964cd 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -182,7 +182,7 @@ static int strset_parse_request(struct ethnl_req_info *req_base, ret = strset_get_id(attr, &id, extack); if (ret < 0) return ret; - if (ret >= ETH_SS_COUNT) { + if (id >= ETH_SS_COUNT) { NL_SET_ERR_MSG_ATTR(extack, attr, "unknown string set id"); return -EOPNOTSUPP; diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index ab953a1a0d6c..7444ec6e298e 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -230,7 +230,7 @@ static const struct header_ops hsr_header_ops = { .parse = eth_header_parse, }; -static struct sk_buff *hsr_init_skb(struct hsr_port *master, u16 proto) +static struct sk_buff *hsr_init_skb(struct hsr_port *master) { struct hsr_priv *hsr = master->hsr; struct sk_buff *skb; @@ -242,8 +242,7 @@ static struct sk_buff *hsr_init_skb(struct hsr_port *master, u16 proto) * being, for PRP it is a trailer and for HSR it is a * header */ - skb = dev_alloc_skb(sizeof(struct hsr_tag) + - sizeof(struct hsr_sup_tag) + + skb = dev_alloc_skb(sizeof(struct hsr_sup_tag) + sizeof(struct hsr_sup_payload) + hlen + tlen); if (!skb) @@ -251,10 +250,9 @@ static struct sk_buff *hsr_init_skb(struct hsr_port *master, u16 proto) skb_reserve(skb, hlen); skb->dev = master->dev; - skb->protocol = htons(proto); skb->priority = TC_PRIO_CONTROL; - if (dev_hard_header(skb, skb->dev, proto, + if (dev_hard_header(skb, skb->dev, ETH_P_PRP, hsr->sup_multicast_addr, skb->dev->dev_addr, skb->len) 
<= 0) goto out; @@ -275,12 +273,10 @@ static void send_hsr_supervision_frame(struct hsr_port *master, { struct hsr_priv *hsr = master->hsr; __u8 type = HSR_TLV_LIFE_CHECK; - struct hsr_tag *hsr_tag = NULL; struct hsr_sup_payload *hsr_sp; struct hsr_sup_tag *hsr_stag; unsigned long irqflags; struct sk_buff *skb; - u16 proto; *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); if (hsr->announce_count < 3 && hsr->prot_version == 0) { @@ -289,23 +285,12 @@ static void send_hsr_supervision_frame(struct hsr_port *master, hsr->announce_count++; } - if (!hsr->prot_version) - proto = ETH_P_PRP; - else - proto = ETH_P_HSR; - - skb = hsr_init_skb(master, proto); + skb = hsr_init_skb(master); if (!skb) { WARN_ONCE(1, "HSR: Could not send supervision frame\n"); return; } - if (hsr->prot_version > 0) { - hsr_tag = skb_put(skb, sizeof(struct hsr_tag)); - hsr_tag->encap_proto = htons(ETH_P_PRP); - set_hsr_tag_LSDU_size(hsr_tag, HSR_V1_SUP_LSDUSIZE); - } - hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag)); set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 
0x0 : 0xf)); set_hsr_stag_HSR_ver(hsr_stag, hsr->prot_version); @@ -315,8 +300,6 @@ static void send_hsr_supervision_frame(struct hsr_port *master, if (hsr->prot_version > 0) { hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr); hsr->sup_sequence_nr++; - hsr_tag->sequence_nr = htons(hsr->sequence_nr); - hsr->sequence_nr++; } else { hsr_stag->sequence_nr = htons(hsr->sequence_nr); hsr->sequence_nr++; @@ -332,7 +315,7 @@ static void send_hsr_supervision_frame(struct hsr_port *master, hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr); - if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN)) + if (skb_put_padto(skb, ETH_ZLEN)) return; hsr_forward_skb(skb, master); @@ -348,10 +331,8 @@ static void send_prp_supervision_frame(struct hsr_port *master, struct hsr_sup_tag *hsr_stag; unsigned long irqflags; struct sk_buff *skb; - struct prp_rct *rct; - u8 *tail; - skb = hsr_init_skb(master, ETH_P_PRP); + skb = hsr_init_skb(master); if (!skb) { WARN_ONCE(1, "PRP: Could not send supervision frame\n"); return; @@ -373,17 +354,11 @@ static void send_prp_supervision_frame(struct hsr_port *master, hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr); - if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN)) { + if (skb_put_padto(skb, ETH_ZLEN)) { spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags); return; } - tail = skb_tail_pointer(skb) - HSR_HLEN; - rct = (struct prp_rct *)tail; - rct->PRP_suffix = htons(ETH_P_PRP); - set_prp_LSDU_size(rct, HSR_V1_SUP_LSDUSIZE); - rct->sequence_nr = htons(hsr->sequence_nr); - hsr->sequence_nr++; spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags); hsr_forward_skb(skb, master); @@ -442,6 +417,7 @@ static struct hsr_proto_ops hsr_ops = { .send_sv_frame = send_hsr_supervision_frame, .create_tagged_frame = hsr_create_tagged_frame, .get_untagged_frame = hsr_get_untagged_frame, + .drop_frame = hsr_drop_frame, .fill_frame_info = 
hsr_fill_frame_info, .invalid_dan_ingress_frame = hsr_invalid_dan_ingress_frame, }; @@ -489,10 +465,11 @@ void hsr_dev_setup(struct net_device *dev) /* Return true if dev is a HSR master; return false otherwise. */ -inline bool is_hsr_master(struct net_device *dev) +bool is_hsr_master(struct net_device *dev) { return (dev->netdev_ops->ndo_start_xmit == hsr_dev_xmit); } +EXPORT_SYMBOL(is_hsr_master); /* Default multicast address for HSR Supervision frames */ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = { @@ -545,16 +522,6 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], hsr->prot_version = protocol_version; - /* FIXME: should I modify the value of these? - * - * - hsr_dev->flags - i.e. - * IFF_MASTER/SLAVE? - * - hsr_dev->priv_flags - i.e. - * IFF_EBRIDGE? - * IFF_TX_SKB_SHARING? - * IFF_HSR_MASTER/SLAVE? - */ - /* Make sure the 1st call to netif_carrier_on() gets through */ netif_carrier_off(hsr_dev); diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h index 868373822ee4..9060c92168f9 100644 --- a/net/hsr/hsr_device.h +++ b/net/hsr/hsr_device.h @@ -19,6 +19,5 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], unsigned char multicast_spec, u8 protocol_version, struct netlink_ext_ack *extack); void hsr_check_carrier_and_operstate(struct hsr_priv *hsr); -bool is_hsr_master(struct net_device *dev); int hsr_get_max_mtu(struct hsr_priv *hsr); #endif /* __HSR_DEVICE_H */ diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index cadfccd7876e..ed82a470b6e1 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -186,6 +186,7 @@ static struct sk_buff *prp_fill_rct(struct sk_buff *skb, set_prp_LSDU_size(trailer, lsdu_size); trailer->sequence_nr = htons(frame->sequence_nr); trailer->PRP_suffix = htons(ETH_P_PRP); + skb->protocol = eth_hdr(skb)->h_proto; return skb; } @@ -226,6 +227,7 @@ static struct sk_buff *hsr_fill_tag(struct sk_buff *skb, 
hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto; hsr_ethhdr->ethhdr.h_proto = htons(proto_version ? ETH_P_HSR : ETH_P_PRP); + skb->protocol = hsr_ethhdr->ethhdr.h_proto; return skb; } @@ -247,6 +249,8 @@ struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame, /* set the lane id properly */ hsr_set_path_id(hsr_ethhdr, port); return skb_clone(frame->skb_hsr, GFP_ATOMIC); + } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) { + return skb_clone(frame->skb_std, GFP_ATOMIC); } /* Create the new skb with enough headroom to fit the HSR tag */ @@ -289,6 +293,8 @@ struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame, return NULL; } return skb_clone(frame->skb_prp, GFP_ATOMIC); + } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) { + return skb_clone(frame->skb_std, GFP_ATOMIC); } skb = skb_copy_expand(frame->skb_std, 0, @@ -341,6 +347,14 @@ bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) port->type == HSR_PT_SLAVE_A)); } +bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) +{ + if (port->dev->features & NETIF_F_HW_HSR_FWD) + return prp_drop_frame(frame, port); + + return false; +} + /* Forward the frame through all devices except: * - Back through the receiving device * - If it's a HSR frame: through a device where it has passed before @@ -357,6 +371,7 @@ static void hsr_forward_do(struct hsr_frame_info *frame) { struct hsr_port *port; struct sk_buff *skb; + bool sent = false; hsr_for_each_port(frame->port_rcv->hsr, port) { struct hsr_priv *hsr = port->hsr; @@ -372,6 +387,12 @@ static void hsr_forward_do(struct hsr_frame_info *frame) if (port->type != HSR_PT_MASTER && frame->is_local_exclusive) continue; + /* If hardware duplicate generation is enabled, only send out + * one port. + */ + if ((port->dev->features & NETIF_F_HW_HSR_DUP) && sent) + continue; + /* Don't send frame over port where it has been sent before. * Also fro SAN, this shouldn't be done. 
*/ @@ -403,10 +424,12 @@ static void hsr_forward_do(struct hsr_frame_info *frame) } skb->dev = port->dev; - if (port->type == HSR_PT_MASTER) + if (port->type == HSR_PT_MASTER) { hsr_deliver_master(skb, port->dev, frame->node_src); - else - hsr_xmit(skb, port, frame); + } else { + if (!hsr_xmit(skb, port, frame)) + sent = true; + } } } @@ -454,7 +477,11 @@ static void handle_std_frame(struct sk_buff *skb, void hsr_fill_frame_info(__be16 proto, struct sk_buff *skb, struct hsr_frame_info *frame) { - if (proto == htons(ETH_P_PRP) || + struct hsr_port *port = frame->port_rcv; + struct hsr_priv *hsr = port->hsr; + + /* HSRv0 supervisory frames double as a tag so treat them as tagged. */ + if ((!hsr->prot_version && proto == htons(ETH_P_PRP)) || proto == htons(ETH_P_HSR)) { /* HSR tagged frame :- Data or Supervision */ frame->skb_std = NULL; diff --git a/net/hsr/hsr_forward.h b/net/hsr/hsr_forward.h index 618140d484ad..b6acaafa83fc 100644 --- a/net/hsr/hsr_forward.h +++ b/net/hsr/hsr_forward.h @@ -23,6 +23,7 @@ struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame, struct sk_buff *prp_get_untagged_frame(struct hsr_frame_info *frame, struct hsr_port *port); bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port); +bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port); void prp_fill_frame_info(__be16 proto, struct sk_buff *skb, struct hsr_frame_info *frame); void hsr_fill_frame_info(__be16 proto, struct sk_buff *skb, diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 5c97de459905..f9a8cc82ae2e 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -277,6 +277,8 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) skb = frame->skb_hsr; else if (frame->skb_prp) skb = frame->skb_prp; + else if (frame->skb_std) + skb = frame->skb_std; if (!skb) return; diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 2fd1976e5b1c..f7e284f23b1f 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c 
@@ -131,6 +131,17 @@ struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt) return NULL; } +int hsr_get_version(struct net_device *dev, enum hsr_version *ver) +{ + struct hsr_priv *hsr; + + hsr = netdev_priv(dev); + *ver = hsr->prot_version; + + return 0; +} +EXPORT_SYMBOL(hsr_get_version); + static struct notifier_block hsr_nb = { .notifier_call = hsr_netdev_notify, /* Slave event notifications */ }; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 7dc92ce5a134..a169808ee78a 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -13,6 +13,7 @@ #include <linux/netdevice.h> #include <linux/list.h> #include <linux/if_vlan.h> +#include <linux/if_hsr.h> /* Time constants as specified in the HSR specification (IEC-62439-3 2010) * Table 8. @@ -171,13 +172,6 @@ struct hsr_port { enum hsr_port_type type; }; -/* used by driver internally to differentiate various protocols */ -enum hsr_version { - HSR_V0 = 0, - HSR_V1, - PRP_V1, -}; - struct hsr_frame_info; struct hsr_node; @@ -217,7 +211,10 @@ struct hsr_priv { u8 net_id; /* for PRP, it occupies most significant 3 bits * of lan_id */ - unsigned char sup_multicast_addr[ETH_ALEN]; + unsigned char sup_multicast_addr[ETH_ALEN] __aligned(sizeof(u16)); + /* Align to u16 boundary to avoid unaligned access + * in ether_addr_equal + */ #ifdef CONFIG_DEBUG_FS struct dentry *node_tbl_root; #endif diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 36d5fcf09c61..c5227d42faf5 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -48,12 +48,14 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) goto finish_consume; } - /* For HSR, only tagged frames are expected, but for PRP - * there could be non tagged frames as well from Single - * attached nodes (SANs). + /* For HSR, only tagged frames are expected (unless the device offloads + * HSR tag removal), but for PRP there could be non tagged frames as + * well from Single attached nodes (SANs). 
*/ protocol = eth_hdr(skb)->h_proto; - if (hsr->proto_ops->invalid_dan_ingress_frame && + + if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && + hsr->proto_ops->invalid_dan_ingress_frame && hsr->proto_ops->invalid_dan_ingress_frame(protocol)) goto finish_pass; diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index 6d091e419d3e..9c640d670ffe 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -149,7 +149,7 @@ static struct net_device *ieee802154_nl_get_dev(struct genl_info *info) if (info->attrs[IEEE802154_ATTR_DEV_NAME]) { char name[IFNAMSIZ + 1]; - nla_strlcpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME], + nla_strscpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME], sizeof(name)); dev = dev_get_by_name(&init_net, name); } else if (info->attrs[IEEE802154_ATTR_DEV_INDEX]) { diff --git a/net/ife/Kconfig b/net/ife/Kconfig index bcf650564db4..de36a5b91e50 100644 --- a/net/ife/Kconfig +++ b/net/ife/Kconfig @@ -4,7 +4,6 @@ # menuconfig NET_IFE - depends on NET tristate "Inter-FE based on IETF ForCES InterFE LFB" default n help diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b7260c8cef2e..a02ce89b56b5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -438,6 +438,7 @@ EXPORT_SYMBOL(inet_release); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; + u32 flags = BIND_WITH_LOCK; int err; /* If the socket has its own bind function then use it. (RAW) */ @@ -450,11 +451,12 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. 
*/ - err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, + BPF_CGROUP_INET4_BIND, &flags); if (err) return err; - return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); + return __inet_bind(sk, uaddr, addr_len, flags); } EXPORT_SYMBOL(inet_bind); @@ -499,7 +501,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, snum = ntohs(addr->sin_port); err = -EACCES; - if (snum && inet_port_requires_bind_service(net, snum) && + if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && + snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) goto out; @@ -777,18 +780,19 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, return -ENOTCONN; sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + BPF_CGROUP_INET4_GETPEERNAME, + NULL); } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; - } - if (cgroup_bpf_enabled) BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - peer ? 
BPF_CGROUP_INET4_GETPEERNAME : - BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET4_GETSOCKNAME, NULL); + } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } @@ -1419,7 +1423,6 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, out: return segs; } -EXPORT_SYMBOL(inet_gso_segment); static struct sk_buff *ipip_gso_segment(struct sk_buff *skb, netdev_features_t features) @@ -1550,7 +1553,6 @@ out: return pp; } -EXPORT_SYMBOL(inet_gro_receive); static struct sk_buff *ipip_gro_receive(struct list_head *head, struct sk_buff *skb) @@ -1636,7 +1638,6 @@ out_unlock: return err; } -EXPORT_SYMBOL(inet_gro_complete); static int ipip_gro_complete(struct sk_buff *skb, int nhoff) { @@ -1871,6 +1872,8 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_igmp_llm_reports = 1; net->ipv4.sysctl_igmp_qrv = 2; + net->ipv4.sysctl_fib_notify_on_flag_change = 0; + return 0; } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 687971d83b4e..922dd73e5740 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -125,6 +125,7 @@ static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); static void parp_redo(struct sk_buff *skb); +static int arp_is_multicast(const void *pkey); static const struct neigh_ops arp_generic_ops = { .family = AF_INET, @@ -156,6 +157,7 @@ struct neigh_table arp_tbl = { .key_eq = arp_key_eq, .constructor = arp_constructor, .proxy_redo = parp_redo, + .is_multicast = arp_is_multicast, .id = "arp_cache", .parms = { .tbl = &arp_tbl, @@ -928,6 +930,10 @@ static void parp_redo(struct sk_buff *skb) arp_process(dev_net(skb->dev), NULL, skb); } +static int arp_is_multicast(const void *pkey) +{ + return ipv4_is_multicast(*((__be32 *)pkey)); +} /* * Receive an arp request from the device layer. 
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 618954f82764..d520e61649c8 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -95,6 +95,7 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size, } static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id) @@ -102,7 +103,7 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, size_t end; if (atype == BPF_READ) - return btf_struct_access(log, t, off, size, atype, next_btf_id); + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id); if (t != tcp_sock_type) { bpf_log(log, "only read is supported\n"); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 123a6d39438f..75f67994fc85 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -650,8 +650,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, struct in_device *in_dev; struct ifaddrmsg *ifm; struct in_ifaddr *ifa; - - int err = -EINVAL; + int err; ASSERT_RTNL(); @@ -881,7 +880,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]); if (tb[IFA_LABEL]) - nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); + nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 8b07f3a4f2db..a3271ec3e162 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -443,7 +443,6 @@ static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb, int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { u8 *tail; - u8 *vaddr; int nfrags; int esph_offset; struct page *page; @@ -485,14 +484,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * page = pfrag->page; get_page(page); - vaddr = kmap_atomic(page); - - tail 
= vaddr + pfrag->offset; + tail = page_address(page) + pfrag->offset; esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); - kunmap_atomic(vaddr); - nfrags = skb_shinfo(skb)->nr_frags; __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 5bda5aeda579..601f5fbfc63f 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -285,7 +285,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ esp.esph = ip_esp_hdr(skb); - if (!hw_offload || (hw_offload && !skb_is_gso(skb))) { + if (!hw_offload || !skb_is_gso(skb)) { esp.nfrags = esp_output_head(x, skb, &esp); if (esp.nfrags < 0) return esp.nfrags; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 86a23e4a6a50..84bb707bd88d 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -292,7 +292,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_oif = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, - .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), + .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK, .flowi4_scope = scope, .flowi4_mark = vmark ? 
skb->mark : 0, }; @@ -696,7 +696,7 @@ int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, cfg->fc_gw4 = *((__be32 *)via->rtvia_addr); break; case AF_INET6: -#ifdef CONFIG_IPV6 +#if IS_ENABLED(CONFIG_IPV6) if (alen != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA"); return -EINVAL; @@ -825,7 +825,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); - goto errout; + return -EINVAL; } return 0; diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 818916b2a04d..b58db1ca4bfb 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -18,7 +18,8 @@ struct fib_alias { s16 fa_default; u8 offload:1, trap:1, - unused:6; + offload_failed:1, + unused:5; struct rcu_head rcu; }; @@ -39,9 +40,10 @@ int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack); bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi); int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - struct fib_rt_info *fri, unsigned int flags); + const struct fib_rt_info *fri, unsigned int flags); void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); +size_t fib_nlmsg_size(struct fib_info *fi); static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 1f75dc686b6b..a632b66bc13a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -452,7 +452,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev) return -1; } -static inline size_t fib_nlmsg_size(struct fib_info *fi) +size_t fib_nlmsg_size(struct fib_info *fi) { size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ @@ -521,6 +521,7 @@ void rtmsg_fib(int 
event, __be32 key, struct fib_alias *fa, fri.type = fa->fa_type; fri.offload = fa->offload; fri.trap = fa->trap; + fri.offload_failed = fa->offload_failed; err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ @@ -973,7 +974,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) char tmp[TCP_CA_NAME_MAX]; bool ecn_ca = false; - nla_strlcpy(tmp, nla, sizeof(tmp)); + nla_strscpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); } else { if (nla_len(nla) != sizeof(u32)) @@ -1641,9 +1642,8 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, break; } - *flags |= (nhc->nhc_flags & RTNH_F_ONLINK); - if (nhc->nhc_flags & RTNH_F_OFFLOAD) - *flags |= RTNH_F_OFFLOAD; + *flags |= (nhc->nhc_flags & + (RTNH_F_ONLINK | RTNH_F_OFFLOAD | RTNH_F_TRAP)); if (!skip_oif && nhc->nhc_dev && nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex)) @@ -1734,7 +1734,7 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) #endif int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, - struct fib_rt_info *fri, unsigned int flags) + const struct fib_rt_info *fri, unsigned int flags) { unsigned int nhs = fib_info_num_path(fri->fi); struct fib_info *fi = fri->fi; @@ -1812,6 +1812,8 @@ offload: rtm->rtm_flags |= RTM_F_OFFLOAD; if (fri->trap) rtm->rtm_flags |= RTM_F_TRAP; + if (fri->offload_failed) + rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; nlmsg_end(skb, nlh); return 0; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ffc5332f1390..25cf387cca5b 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1038,6 +1038,8 @@ fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri) void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) { struct fib_alias *fa_match; + struct sk_buff *skb; + int err; rcu_read_lock(); @@ -1045,9 +1047,42 @@ void fib_alias_hw_flags_set(struct 
net *net, const struct fib_rt_info *fri) if (!fa_match) goto out; + if (fa_match->offload == fri->offload && fa_match->trap == fri->trap && + fa_match->offload_failed == fri->offload_failed) + goto out; + fa_match->offload = fri->offload; fa_match->trap = fri->trap; + /* 2 means send notifications only if offload_failed was changed. */ + if (net->ipv4.sysctl_fib_notify_on_flag_change == 2 && + fa_match->offload_failed == fri->offload_failed) + goto out; + + fa_match->offload_failed = fri->offload_failed; + + if (!net->ipv4.sysctl_fib_notify_on_flag_change) + goto out; + + skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC); + if (!skb) { + err = -ENOBUFS; + goto errout; + } + + err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC); + goto out; + +errout: + rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err); out: rcu_read_unlock(); } @@ -1263,6 +1298,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; + new_fa->offload_failed = 0; hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); @@ -1323,6 +1359,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; + new_fa->offload_failed = 0; /* Insert new entry to the list. 
*/ err = fib_insert_alias(t, tp, l, new_fa, fa, key); @@ -2100,15 +2137,6 @@ static void __fib_info_notify_update(struct net *net, struct fib_table *tb, rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa, KEYLENGTH - fa->fa_slen, tb->tb_id, info, NLM_F_REPLACE); - - /* call_fib_entry_notifiers will be removed when - * in-kernel notifier is implemented and supported - * for nexthop objects - */ - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, - n->key, - KEYLENGTH - fa->fa_slen, fa, - NULL); } } } @@ -2271,6 +2299,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, fri.type = fa->fa_type; fri.offload = fa->offload; fri.trap = fa->trap; + fri.offload_failed = fa->offload_failed; err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 66fdbfe5447c..5d1e6fe9d838 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -128,7 +128,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, * to 0 and sets the configured key in the * inner erspan header field */ - if (greh->protocol == htons(ETH_P_ERSPAN) || + if ((greh->protocol == htons(ETH_P_ERSPAN) && hdr_len != 4) || greh->protocol == htons(ETH_P_ERSPAN2)) { struct erspan_base_hdr *ershdr; diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index e0a246575887..1121a9d5fed9 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -15,7 +15,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); - bool need_csum, need_recompute_csum, gso_partial; + bool need_csum, offload_csum, gso_partial, need_ipsec; struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; __be16 protocol = skb->protocol; @@ -41,10 +41,16 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, skb->protocol = skb->inner_protocol; need_csum = !!(skb_shinfo(skb)->gso_type & 
SKB_GSO_GRE_CSUM); - need_recompute_csum = skb->csum_not_inet; skb->encap_hdr_csum = need_csum; features &= skb->dev->hw_enc_features; + if (need_csum) + features &= ~NETIF_F_SCTP_CRC; + + need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); + /* Try to offload checksum if possible */ + offload_csum = !!(need_csum && !need_ipsec && + (skb->dev->features & NETIF_F_HW_CSUM)); /* segment inner packet. */ segs = skb_mac_gso_segment(skb, features); @@ -99,14 +105,12 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, } *(pcsum + 1) = 0; - if (need_recompute_csum && !skb_is_gso(skb)) { - __wsum csum; - - csum = skb_checksum(skb, gre_offset, - skb->len - gre_offset, 0); - *pcsum = csum_fold(csum); - } else { + if (skb->encapsulation || !offload_csum) { *pcsum = gso_make_checksum(skb, 0); + } else { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = sizeof(*greh); } } while ((skb = skb->next)); out: diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 005faea415a4..616e2dc1c8fa 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -447,7 +447,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); - security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) goto out_unlock; @@ -503,7 +503,7 @@ static struct rtable *icmp_route_lookup(struct net *net, route_lookup_dev = icmp_get_route_lookup_dev(skb_in); fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev); - security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); + security_skb_classify_flow(skb_in, flowi4_to_flowi_common(fl4)); rt = ip_route_output_key_hash(net, fl4, skb_in); if (IS_ERR(rt)) return rt; @@ -775,13 +775,14 @@ EXPORT_SYMBOL(__icmp_send); void icmp_ndo_send(struct 
sk_buff *skb_in, int type, int code, __be32 info) { struct sk_buff *cloned_skb = NULL; + struct ip_options opts = { 0 }; enum ip_conntrack_info ctinfo; struct nf_conn *ct; __be32 orig_ip; ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(ct->status & IPS_SRC_NAT)) { - icmp_send(skb_in, type, code, info); + __icmp_send(skb_in, type, code, info, &opts); return; } @@ -796,7 +797,7 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) orig_ip = ip_hdr(skb_in)->saddr; ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip; - icmp_send(skb_in, type, code, info); + __icmp_send(skb_in, type, code, info, &opts); ip_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4148f5f78f31..6bd7ca09af03 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -602,7 +602,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk->sk_uid); - security_req_classify_flow(req, flowi4_to_flowi(fl4)); + security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; @@ -640,7 +640,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, (opt && opt->opt.srr) ? 
opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk->sk_uid); - security_req_classify_flow(req, flowi4_to_flowi(fl4)); + security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; @@ -787,7 +787,7 @@ static void reqsk_queue_hash_req(struct request_sock *req, timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); - inet_ehash_insert(req_to_sk(req), NULL); + inet_ehash_insert(req_to_sk(req), NULL, NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ @@ -851,6 +851,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; newicsk->icsk_probes_out = 0; + newicsk->icsk_probes_tstamp = 0; /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 366a4507b5a3..93474b1bea4e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -479,8 +479,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, r->idiag_inode = 0; if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, - inet_rsk(reqsk)->ir_mark)) + inet_rsk(reqsk)->ir_mark)) { + nlmsg_cancel(skb, nlh); return -EMSGSIZE; + } nlmsg_end(skb, nlh); return 0; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 10d31733297d..05cd198d7a6b 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -145,12 +145,16 @@ static void inet_frags_free_cb(void *ptr, void *arg) inet_frag_destroy(fq); } -static void fqdir_work_fn(struct work_struct *work) +static LLIST_HEAD(fqdir_free_list); + +static void fqdir_free_fn(struct work_struct *work) { - struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work); - struct inet_frags *f = fqdir->f; 
+ struct llist_node *kill_list; + struct fqdir *fqdir, *tmp; + struct inet_frags *f; - rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); + /* Atomically snapshot the list of fqdirs to free */ + kill_list = llist_del_all(&fqdir_free_list); /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu) * have completed, since they need to dereference fqdir. @@ -158,10 +162,25 @@ static void fqdir_work_fn(struct work_struct *work) */ rcu_barrier(); - if (refcount_dec_and_test(&f->refcnt)) - complete(&f->completion); + llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) { + f = fqdir->f; + if (refcount_dec_and_test(&f->refcnt)) + complete(&f->completion); - kfree(fqdir); + kfree(fqdir); + } +} + +static DECLARE_WORK(fqdir_free_work, fqdir_free_fn); + +static void fqdir_work_fn(struct work_struct *work) +{ + struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work); + + rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); + + if (llist_add(&fqdir->free_list, &fqdir_free_list)) + queue_work(system_wq, &fqdir_free_work); } int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) @@ -184,10 +203,22 @@ int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) } EXPORT_SYMBOL(fqdir_init); +static struct workqueue_struct *inet_frag_wq; + +static int __init inet_frag_wq_init(void) +{ + inet_frag_wq = create_workqueue("inet_frag_wq"); + if (!inet_frag_wq) + panic("Could not create inet frag workq"); + return 0; +} + +pure_initcall(inet_frag_wq_init); + void fqdir_exit(struct fqdir *fqdir) { INIT_WORK(&fqdir->destroy_work, fqdir_work_fn); - queue_work(system_wq, &fqdir->destroy_work); + queue_work(inet_frag_wq, &fqdir->destroy_work); } EXPORT_SYMBOL(fqdir_exit); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 8cbe74313f38..c96866a53a66 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -20,6 +20,9 @@ #include 
<net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/inet6_hashtables.h> +#endif #include <net/secure_seq.h> #include <net/ip.h> #include <net/tcp.h> @@ -508,10 +511,52 @@ static u32 inet_sk_port_offset(const struct sock *sk) inet->inet_dport); } -/* insert a socket into ehash, and eventually remove another one - * (The another one can be a SYN_RECV or TIMEWAIT +/* Searches for an exsiting socket in the ehash bucket list. + * Returns true if found, false otherwise. */ -bool inet_ehash_insert(struct sock *sk, struct sock *osk) +static bool inet_ehash_lookup_by_sk(struct sock *sk, + struct hlist_nulls_head *list) +{ + const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); + const int sdif = sk->sk_bound_dev_if; + const int dif = sk->sk_bound_dev_if; + const struct hlist_nulls_node *node; + struct net *net = sock_net(sk); + struct sock *esk; + + INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); + + sk_nulls_for_each_rcu(esk, node, list) { + if (esk->sk_hash != sk->sk_hash) + continue; + if (sk->sk_family == AF_INET) { + if (unlikely(INET_MATCH(esk, net, acookie, + sk->sk_daddr, + sk->sk_rcv_saddr, + ports, dif, sdif))) { + return true; + } + } +#if IS_ENABLED(CONFIG_IPV6) + else if (sk->sk_family == AF_INET6) { + if (unlikely(INET6_MATCH(esk, net, + &sk->sk_v6_daddr, + &sk->sk_v6_rcv_saddr, + ports, dif, sdif))) { + return true; + } + } +#endif + } + return false; +} + +/* Insert a socket into ehash, and eventually remove another one + * (The another one can be a SYN_RECV or TIMEWAIT) + * If an existing socket already exists, socket sk is not inserted, + * and sets found_dup_sk parameter to true. 
+ */ +bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; @@ -530,16 +575,23 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk) if (osk) { WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); ret = sk_nulls_del_node_init_rcu(osk); + } else if (found_dup_sk) { + *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); + if (*found_dup_sk) + ret = false; } + if (ret) __sk_nulls_add_node_rcu(sk, list); + spin_unlock(lock); + return ret; } -bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) +bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) { - bool ok = inet_ehash_insert(sk, osk); + bool ok = inet_ehash_insert(sk, osk, found_dup_sk); if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -583,7 +635,7 @@ int __inet_hash(struct sock *sk, struct sock *osk) int err = 0; if (sk->sk_state != TCP_LISTEN) { - inet_ehash_nolisten(sk, osk); + inet_ehash_nolisten(sk, osk, NULL); return 0; } WARN_ON(!sk_unhashed(sk)); @@ -657,6 +709,17 @@ unlock: } EXPORT_SYMBOL_GPL(inet_unhash); +/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm + * Note that we use 32bit integers (vs RFC 'short integers') + * because 2^16 is not a multiple of num_ephemeral and this + * property might be used by clever attacker. + * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, + * we use 256 instead to really give more isolation and + * privacy, this only consumes 1 KB of kernel memory. 
+ */ +#define INET_TABLE_PERTURB_SHIFT 8 +static u32 table_perturb[1 << INET_TABLE_PERTURB_SHIFT]; + int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u32 port_offset, int (*check_established)(struct inet_timewait_death_row *, @@ -670,8 +733,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct inet_bind_bucket *tb; u32 remaining, offset; int ret, i, low, high; - static u32 hint; int l3mdev; + u32 index; if (port) { head = &hinfo->bhash[inet_bhashfn(net, port, @@ -679,7 +742,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - inet_ehash_nolisten(sk, NULL); + inet_ehash_nolisten(sk, NULL, NULL); spin_unlock_bh(&head->lock); return 0; } @@ -698,7 +761,10 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, if (likely(remaining > 1)) remaining &= ~1U; - offset = (hint + port_offset) % remaining; + net_get_random_once(table_perturb, sizeof(table_perturb)); + index = hash_32(port_offset, INET_TABLE_PERTURB_SHIFT); + + offset = (READ_ONCE(table_perturb[index]) + port_offset) % remaining; /* In first pass we try ports of @low parity. * inet_csk_get_port() does the opposite choice. */ @@ -752,13 +818,18 @@ next_port: return -EADDRNOTAVAIL; ok: - hint += i + 2; + /* If our first attempt found a candidate, skip next candidate + * in 1/16 of cases to add some noise. 
+ */ + if (!i && !(prandom_u32() % 16)) + i = 2; + WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); - inet_ehash_nolisten(sk, (struct sock *)tw); + inet_ehash_nolisten(sk, (struct sock *)tw, NULL); } if (tw) inet_twsk_bind_unhash(tw, hinfo); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index c411c87ae865..437afe392e66 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -272,14 +272,14 @@ restart: continue; tw = inet_twsk(sk); if ((tw->tw_family != family) || - refcount_read(&twsk_net(tw)->count)) + refcount_read(&twsk_net(tw)->ns.count)) continue; if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt))) continue; if (unlikely((tw->tw_family != family) || - refcount_read(&twsk_net(tw)->count))) { + refcount_read(&twsk_net(tw)->ns.count))) { inet_twsk_put(tw); goto restart; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index e70291748889..a68bf4c6fe9b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -920,7 +920,7 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_start_xmit = ipgre_xmit, .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipgre_tunnel_ctl, }; @@ -1275,7 +1275,7 @@ static const struct net_device_ops gre_tap_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip_tunnel_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_fill_metadata_dst = gre_fill_metadata_dst, }; @@ -1308,7 +1308,7 @@ static const struct net_device_ops erspan_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = 
eth_validate_addr, .ndo_change_mtu = ip_tunnel_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_fill_metadata_dst = gre_fill_metadata_dst, }; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index b0c244af1e4d..3a025c011971 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -253,6 +253,7 @@ int ip_local_deliver(struct sk_buff *skb) net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } +EXPORT_SYMBOL(ip_local_deliver); static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 879b76ae4435..3aab53beb4ea 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -302,7 +302,7 @@ static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff * if (skb_is_gso(skb)) return ip_finish_output_gso(net, sk, skb, mtu); - if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) + if (skb->len > mtu || IPCB(skb)->frag_max_size) return ip_fragment(net, sk, skb, mtu, ip_finish_output2); return ip_finish_output2(net, sk, skb); @@ -434,6 +434,7 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } +EXPORT_SYMBOL(ip_output); /* * copy saddr and daddr, possibly using 64bit load/stores @@ -1018,7 +1019,7 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ @@ -1230,8 +1231,7 @@ alloc_new_skb: error_efault: err = -EFAULT; error: - if (uarg) - sock_zerocopy_put_abort(uarg, extra_uref); + net_zcopy_put_abort(uarg, extra_uref); cork->length -= length; IP_INC_STATS(sock_net(sk), 
IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); @@ -1700,7 +1700,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, daddr, saddr, tcp_hdr(skb)->source, tcp_hdr(skb)->dest, arg->uid); - security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index ee65c9225178..76a420c76f16 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -317,7 +317,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) } dev->needed_headroom = t_hlen + hlen; - mtu -= (dev->hard_header_len + t_hlen); + mtu -= t_hlen; if (mtu < IPV4_MIN_MTU) mtu = IPV4_MIN_MTU; @@ -347,7 +347,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, nt = netdev_priv(dev); t_hlen = nt->hlen + sizeof(struct iphdr); dev->min_mtu = ETH_MIN_MTU; - dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen; + dev->max_mtu = IP_MAX_MTU - t_hlen; ip_tunnel_add(itn, nt); return nt; @@ -488,11 +488,10 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, int mtu; tunnel_hlen = md ? tunnel_hlen : tunnel->hlen; - pkt_size = skb->len - tunnel_hlen - dev->hard_header_len; + pkt_size = skb->len - tunnel_hlen; if (df) - mtu = dst_mtu(&rt->dst) - dev->hard_header_len - - sizeof(struct iphdr) - tunnel_hlen; + mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen); else mtu = skb_valid_dst(skb) ? 
dst_mtu(skb_dst(skb)) : dev->mtu; @@ -759,8 +758,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph, - 0, 0, false)) { + df = tnl_params->frag_off; + if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) + df |= (inner_iph->frag_off & htons(IP_DF)); + + if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) { ip_rt_put(rt); goto tx_error; } @@ -788,10 +790,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ttl = ip4_dst_hoplimit(&rt->dst); } - df = tnl_params->frag_off; - if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) - df |= (inner_iph->frag_off&htons(IP_DF)); - max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); if (max_headroom > dev->needed_headroom) @@ -973,7 +971,7 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) { struct ip_tunnel *tunnel = netdev_priv(dev); int t_hlen = tunnel->hlen + sizeof(struct iphdr); - int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen; + int max_mtu = IP_MAX_MTU - t_hlen; if (new_mtu < ETH_MIN_MTU) return -EINVAL; @@ -1150,10 +1148,9 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], mtu = ip_tunnel_bind_dev(dev); if (tb[IFLA_MTU]) { - unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen; + unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr)); - mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, - (unsigned int)(max - sizeof(struct iphdr))); + mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max); } err = dev_set_mtu(dev, mtu); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 25f1caf5abf9..6b2dc7b2b612 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -222,7 +222,7 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) .code = ICMP_FRAG_NEEDED, .checksum = 0, .un.frag.__unused 
= 0, - .un.frag.mtu = ntohs(mtu), + .un.frag.mtu = htons(mtu), }; icmph->checksum = ip_compute_csum(icmph, len); skb_reset_transport_header(skb); @@ -245,7 +245,7 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) skb->ip_summed = CHECKSUM_NONE; - eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0); + eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0); skb_reset_mac_header(skb); return skb->len; @@ -263,7 +263,7 @@ static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu) const struct icmphdr *icmph = icmp_hdr(skb); const struct iphdr *iph = ip_hdr(skb); - if (mtu <= 576 || iph->frag_off != htons(IP_DF)) + if (mtu < 576 || iph->frag_off != htons(IP_DF)) return 0; if (ipv4_is_lbcast(iph->daddr) || ipv4_is_multicast(iph->daddr) || @@ -338,7 +338,7 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) skb->ip_summed = CHECKSUM_NONE; - eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0); + eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0); skb_reset_mac_header(skb); return skb->len; @@ -359,7 +359,7 @@ static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu) __be16 frag_off; int offset; - if (mtu <= IPV6_MIN_MTU) + if (mtu < IPV6_MIN_MTU) return 0; if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST || @@ -429,15 +429,6 @@ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, } EXPORT_SYMBOL(skb_tunnel_check_pmtu); -/* Often modified stats are per cpu, other are shared (netdev->stats) */ -void ip_tunnel_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot) -{ - netdev_stats_to_stats64(tot, &dev->stats); - dev_fetch_sw_netstats(tot, dev->tstats); -} -EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); - static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { [LWTUNNEL_IP_UNSPEC] = { .strict_start_type = LWTUNNEL_IP_OPTS }, [LWTUNNEL_IP_ID] = { .type = NLA_U64 }, @@ -592,8 +583,9 @@ 
static int ip_tun_parse_opts_erspan(struct nlattr *attr, static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, struct netlink_ext_ack *extack) { - int err, rem, opt_len, opts_len = 0, type = 0; + int err, rem, opt_len, opts_len = 0; struct nlattr *nla; + __be16 type = 0; if (!attr) return 0; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index b957cbee2cf7..abc171e79d3e 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -404,7 +404,7 @@ static const struct net_device_ops vti_netdev_ops = { .ndo_start_xmit = vti_tunnel_xmit, .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = vti_tunnel_ctl, }; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 561f15b5a944..47db1bfdaaa0 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -61,7 +61,6 @@ #include <linux/export.h> #include <net/net_namespace.h> #include <net/arp.h> -#include <net/dsa.h> #include <net/ip.h> #include <net/ipconfig.h> #include <net/route.h> @@ -218,9 +217,9 @@ static int __init ic_open_devs(void) last = &ic_first_dev; rtnl_lock(); - /* bring loopback and DSA master network devices up first */ + /* bring loopback device up first */ for_each_netdev(&init_net, dev) { - if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev)) + if (!(dev->flags & IFF_LOOPBACK)) continue; if (dev_change_flags(dev, dev->flags | IFF_UP, NULL) < 0) pr_err("IP-Config: Failed to open %s\n", dev->name); @@ -305,17 +304,32 @@ have_carrier: return 0; } +/* Close all network interfaces except the one we've autoconfigured, and its + * lowers, in case it's a stacked virtual interface. 
+ */ static void __init ic_close_devs(void) { + struct net_device *selected_dev = ic_dev->dev; struct ic_device *d, *next; struct net_device *dev; rtnl_lock(); next = ic_first_dev; while ((d = next)) { + bool bring_down = (d != ic_dev); + struct net_device *lower_dev; + struct list_head *iter; + next = d->next; dev = d->dev; - if (d != ic_dev && !netdev_uses_dsa(dev)) { + + netdev_for_each_lower_dev(selected_dev, lower_dev, iter) { + if (dev == lower_dev) { + bring_down = false; + break; + } + } + if (bring_down) { pr_debug("IP-Config: Downing %s\n", dev->name); dev_change_flags(dev, d->flags, NULL); } @@ -1441,7 +1455,7 @@ static int __init ip_auto_config(void) int retries = CONF_OPEN_RETRIES; #endif int err; - unsigned int i; + unsigned int i, count; /* Initialise all name servers and NTP servers to NONE (but only if the * "ip=" or "nfsaddrs=" kernel command line parameters weren't decoded, @@ -1575,7 +1589,7 @@ static int __init ip_auto_config(void) if (ic_dev_mtu) pr_cont(", mtu=%d", ic_dev_mtu); /* Name servers (if any): */ - for (i = 0; i < CONF_NAMESERVERS_MAX; i++) { + for (i = 0, count = 0; i < CONF_NAMESERVERS_MAX; i++) { if (ic_nameservers[i] != NONE) { if (i == 0) pr_info(" nameserver%u=%pI4", @@ -1583,12 +1597,14 @@ static int __init ip_auto_config(void) else pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]); + + count++; } - if (i + 1 == CONF_NAMESERVERS_MAX) + if ((i + 1 == CONF_NAMESERVERS_MAX) && count > 0) pr_cont("\n"); } /* NTP servers (if any): */ - for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) { + for (i = 0, count = 0; i < CONF_NTP_SERVERS_MAX; i++) { if (ic_ntp_servers[i] != NONE) { if (i == 0) pr_info(" ntpserver%u=%pI4", @@ -1596,8 +1612,10 @@ static int __init ip_auto_config(void) else pr_cont(", ntpserver%u=%pI4", i, &ic_ntp_servers[i]); + + count++; } - if (i + 1 == CONF_NTP_SERVERS_MAX) + if ((i + 1 == CONF_NTP_SERVERS_MAX) && count > 0) pr_cont("\n"); } #endif /* !SILENT */ diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 
75d35e76bec2..d5bfa087c23a 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -347,7 +347,7 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_start_xmit = ipip_tunnel_xmit, .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip_tunnel_ctl, }; diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 3205d5f7c8c9..25ea6ac44db9 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -31,7 +31,7 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; - nla_strlcpy(tmp, nla, sizeof(tmp)); + nla_strscpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) { NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm"); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index d1e04d2b5170..c576a63d09db 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -203,7 +203,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); - private = READ_ONCE(table->private); /* Address dependency. 
*/ + private = rcu_access_pointer(table->private); cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; @@ -649,7 +649,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -673,7 +673,7 @@ static int copy_entries_to_user(unsigned int total_size, unsigned int off, num; const struct arpt_entry *e; struct xt_counters *counters; - struct xt_table_info *private = table->private; + struct xt_table_info *private = xt_table_get_private_protected(table); int ret = 0; void *loc_cpu_entry; @@ -807,7 +807,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, NFPROTO_ARP, name); if (!IS_ERR(t)) { struct arpt_getinfo info; - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); #ifdef CONFIG_COMPAT struct xt_table_info tmp; @@ -860,7 +860,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); if (get.size == private->size) ret = copy_entries_to_user(private->size, @@ -1017,7 +1017,7 @@ static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = t->private; + private = xt_table_get_private_protected(t); if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1330,7 +1330,7 @@ static int compat_copy_entries_to_user(unsigned int total_size, void __user *userptr) { 
struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); void __user *pos; unsigned int size; int ret = 0; @@ -1379,7 +1379,7 @@ static int compat_get_entries(struct net *net, xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); struct xt_table_info info; ret = compat_table_info(private, &info); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index f15bc21d7301..e8f6f9d86237 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -258,7 +258,7 @@ ipt_do_table(struct sk_buff *skb, WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); - private = READ_ONCE(table->private); /* Address dependency. */ + private = rcu_access_pointer(table->private); cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; @@ -791,7 +791,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -815,7 +815,7 @@ copy_entries_to_user(unsigned int total_size, unsigned int off, num; const struct ipt_entry *e; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); int ret = 0; const void *loc_cpu_entry; @@ -964,7 +964,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, 
AF_INET, name); if (!IS_ERR(t)) { struct ipt_getinfo info; - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); #ifdef CONFIG_COMPAT struct xt_table_info tmp; @@ -1018,7 +1018,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); @@ -1173,7 +1173,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = t->private; + private = xt_table_get_private_protected(t); if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1543,7 +1543,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); void __user *pos; unsigned int size; int ret = 0; @@ -1589,7 +1589,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index e16b98ee6266..4b8840734762 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -56,7 +56,8 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par) nf_send_unreach(skb, ICMP_PKT_FILTERED, hook); break; case IPT_TCP_RESET: - 
nf_send_reset(xt_net(par), skb, hook); + nf_send_reset(xt_net(par), par->state->sk, skb, hook); + break; case IPT_ICMP_ECHOREPLY: /* Doesn't happen. */ break; diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index cc23f1ce239c..8cd3224d913e 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -76,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; - flow.flowi4_tos = RT_TOS(iph->tos); + flow.flowi4_tos = iph->tos & IPTOS_RT_MASK; flow.flowi4_scope = RT_SCOPE_UNIVERSE; flow.flowi4_oif = l3mdev_master_ifindex_rcu(xt_in(par)); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 93b07739807b..4eed5afca392 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -12,6 +12,128 @@ #include <linux/netfilter_ipv4.h> #include <linux/netfilter_bridge.h> +static int nf_reject_iphdr_validate(struct sk_buff *skb) +{ + struct iphdr *iph; + u32 len; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return 0; + + iph = ip_hdr(skb); + if (iph->ihl < 5 || iph->version != 4) + return 0; + + len = ntohs(iph->tot_len); + if (skb->len < len) + return 0; + else if (len < (iph->ihl*4)) + return 0; + + if (!pskb_may_pull(skb, iph->ihl*4)) + return 0; + + return 1; +} + +struct sk_buff *nf_reject_skb_v4_tcp_reset(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook) +{ + const struct tcphdr *oth; + struct sk_buff *nskb; + struct iphdr *niph; + struct tcphdr _oth; + + if (!nf_reject_iphdr_validate(oldskb)) + return NULL; + + oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook); + if (!oth) + return NULL; + + nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + + LL_MAX_HEADER, GFP_ATOMIC); + if (!nskb) + return NULL; + + 
nskb->dev = (struct net_device *)dev; + + skb_reserve(nskb, LL_MAX_HEADER); + niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, + net->ipv4.sysctl_ip_default_ttl); + nf_reject_ip_tcphdr_put(nskb, oldskb, oth); + niph->tot_len = htons(nskb->len); + ip_send_check(niph); + + return nskb; +} +EXPORT_SYMBOL_GPL(nf_reject_skb_v4_tcp_reset); + +struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook, u8 code) +{ + struct sk_buff *nskb; + struct iphdr *niph; + struct icmphdr *icmph; + unsigned int len; + __wsum csum; + u8 proto; + + if (!nf_reject_iphdr_validate(oldskb)) + return NULL; + + /* IP header checks: fragment. */ + if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) + return NULL; + + /* RFC says return as much as we can without exceeding 576 bytes. */ + len = min_t(unsigned int, 536, oldskb->len); + + if (!pskb_may_pull(oldskb, len)) + return NULL; + + if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len))) + return NULL; + + proto = ip_hdr(oldskb)->protocol; + + if (!skb_csum_unnecessary(oldskb) && + nf_reject_verify_csum(proto) && + nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto)) + return NULL; + + nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmphdr) + + LL_MAX_HEADER + len, GFP_ATOMIC); + if (!nskb) + return NULL; + + nskb->dev = (struct net_device *)dev; + + skb_reserve(nskb, LL_MAX_HEADER); + niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP, + net->ipv4.sysctl_ip_default_ttl); + + skb_reset_transport_header(nskb); + icmph = skb_put_zero(nskb, sizeof(struct icmphdr)); + icmph->type = ICMP_DEST_UNREACH; + icmph->code = code; + + skb_put_data(nskb, skb_network_header(oldskb), len); + + csum = csum_partial((void *)icmph, len + sizeof(struct icmphdr), 0); + icmph->checksum = csum_fold(csum); + + niph->tot_len = htons(nskb->len); + ip_send_check(niph); + + return nskb; +} +EXPORT_SYMBOL_GPL(nf_reject_skb_v4_unreach); + const struct tcphdr 
*nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, struct tcphdr *_oth, int hook) { @@ -112,7 +234,8 @@ static int nf_reject_fill_skb_dst(struct sk_buff *skb_in) } /* Send RST reply */ -void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) +void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, + int hook) { struct net_device *br_indev __maybe_unused; struct sk_buff *nskb; @@ -124,7 +247,8 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) if (!oth) return; - if (hook == NF_INET_PRE_ROUTING && nf_reject_fill_skb_dst(oldskb)) + if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) && + nf_reject_fill_skb_dst(oldskb) < 0) return; if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -144,8 +268,7 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, ip4_dst_hoplimit(skb_dst(nskb))); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - - if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC)) + if (ip_route_me_harder(net, sk, nskb, RTN_UNSPEC)) goto free_nskb; niph = ip_hdr(nskb); @@ -193,7 +316,8 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) if (iph->frag_off & htons(IP_OFFSET)) return; - if (hook == NF_INET_PRE_ROUTING && nf_reject_fill_skb_dst(skb_in)) + if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) && + nf_reject_fill_skb_dst(skb_in) < 0) return; if (skb_csum_unnecessary(skb_in) || !nf_reject_verify_csum(proto)) { diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index bcdb37f86a94..aeb631760eb9 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -13,8 +13,8 @@ #include <net/netfilter/ipv4/nf_dup_ipv4.h> struct nft_dup_ipv4 { - enum nft_registers sreg_addr:8; - enum nft_registers sreg_dev:8; + u8 sreg_addr; + u8 sreg_dev; }; static void nft_dup_ipv4_eval(const struct nft_expr *expr, @@ -40,16 +40,16 @@ static 
int nft_dup_ipv4_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_ADDR] == NULL) return -EINVAL; - priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]); - err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in_addr)); + err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, + sizeof(struct in_addr)); if (err < 0) return err; - if (tb[NFTA_DUP_SREG_DEV] != NULL) { - priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); - return nft_validate_register_load(priv->sreg_dev, sizeof(int)); - } - return 0; + if (tb[NFTA_DUP_SREG_DEV]) + err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], + &priv->sreg_dev, sizeof(int)); + + return err; } static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index e408f813f5d8..ff437e4ed6db 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -27,7 +27,8 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr, nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt)); + nf_send_reset(nft_net(pkt), pkt->xt.state->sk, pkt->skb, + nft_hook(pkt)); break; default: break; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 0dc43ad28eb9..f1c6cbdb9e43 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -22,7 +22,7 @@ static void remove_nexthop(struct net *net, struct nexthop *nh, #define NH_DEV_HASHBITS 8 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) -static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { +static const struct nla_policy rtm_nh_policy_new[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_GROUP] = { .type = NLA_BINARY }, [NHA_GROUP_TYPE] = { .type = NLA_U16 }, @@ -31,19 +31,174 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { [NHA_GATEWAY] = { .type = NLA_BINARY }, [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, 
[NHA_ENCAP] = { .type = NLA_NESTED }, + [NHA_FDB] = { .type = NLA_FLAG }, +}; + +static const struct nla_policy rtm_nh_policy_get[] = { + [NHA_ID] = { .type = NLA_U32 }, +}; + +static const struct nla_policy rtm_nh_policy_dump[] = { + [NHA_OIF] = { .type = NLA_U32 }, [NHA_GROUPS] = { .type = NLA_FLAG }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_FDB] = { .type = NLA_FLAG }, }; +static bool nexthop_notifiers_is_empty(struct net *net) +{ + return !net->nexthop.notifier_chain.head; +} + +static void +__nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, + const struct nexthop *nh) +{ + struct nh_info *nhi = rtnl_dereference(nh->nh_info); + + nh_info->dev = nhi->fib_nhc.nhc_dev; + nh_info->gw_family = nhi->fib_nhc.nhc_gw_family; + if (nh_info->gw_family == AF_INET) + nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4; + else if (nh_info->gw_family == AF_INET6) + nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6; + + nh_info->is_reject = nhi->reject_nh; + nh_info->is_fdb = nhi->fdb_nh; + nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate; +} + +static int nh_notifier_single_info_init(struct nh_notifier_info *info, + const struct nexthop *nh) +{ + info->type = NH_NOTIFIER_INFO_TYPE_SINGLE; + info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL); + if (!info->nh) + return -ENOMEM; + + __nh_notifier_single_info_init(info->nh, nh); + + return 0; +} + +static void nh_notifier_single_info_fini(struct nh_notifier_info *info) +{ + kfree(info->nh); +} + +static int nh_notifier_mp_info_init(struct nh_notifier_info *info, + struct nh_group *nhg) +{ + u16 num_nh = nhg->num_nh; + int i; + + info->type = NH_NOTIFIER_INFO_TYPE_GRP; + info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh), + GFP_KERNEL); + if (!info->nh_grp) + return -ENOMEM; + + info->nh_grp->num_nh = num_nh; + info->nh_grp->is_fdb = nhg->fdb_nh; + + for (i = 0; i < num_nh; i++) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + + info->nh_grp->nh_entries[i].id = nhge->nh->id; + 
info->nh_grp->nh_entries[i].weight = nhge->weight; + __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh, + nhge->nh); + } + + return 0; +} + +static int nh_notifier_grp_info_init(struct nh_notifier_info *info, + const struct nexthop *nh) +{ + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + + if (nhg->mpath) + return nh_notifier_mp_info_init(info, nhg); + return -EINVAL; +} + +static void nh_notifier_grp_info_fini(struct nh_notifier_info *info, + const struct nexthop *nh) +{ + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + + if (nhg->mpath) + kfree(info->nh_grp); +} + +static int nh_notifier_info_init(struct nh_notifier_info *info, + const struct nexthop *nh) +{ + info->id = nh->id; + + if (nh->is_group) + return nh_notifier_grp_info_init(info, nh); + else + return nh_notifier_single_info_init(info, nh); +} + +static void nh_notifier_info_fini(struct nh_notifier_info *info, + const struct nexthop *nh) +{ + if (nh->is_group) + nh_notifier_grp_info_fini(info, nh); + else + nh_notifier_single_info_fini(info); +} + static int call_nexthop_notifiers(struct net *net, enum nexthop_event_type event_type, - struct nexthop *nh) + struct nexthop *nh, + struct netlink_ext_ack *extack) { + struct nh_notifier_info info = { + .net = net, + .extack = extack, + }; int err; + ASSERT_RTNL(); + + if (nexthop_notifiers_is_empty(net)) + return 0; + + err = nh_notifier_info_init(&info, nh); + if (err) { + NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); + return err; + } + err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, - event_type, nh); + event_type, &info); + nh_notifier_info_fini(&info, nh); + + return notifier_to_errno(err); +} + +static int call_nexthop_notifier(struct notifier_block *nb, struct net *net, + enum nexthop_event_type event_type, + struct nexthop *nh, + struct netlink_ext_ack *extack) +{ + struct nh_notifier_info info = { + .net = net, + .extack = extack, + }; + int err; + + err = 
nh_notifier_info_init(&info, nh); + if (err) + return err; + + err = nb->notifier_call(nb, event_type, &info); + nh_notifier_info_fini(&info, nh); + return notifier_to_errno(err); } @@ -69,7 +224,7 @@ static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) hlist_add_head(&nhi->dev_hash, head); } -static void nexthop_free_mpath(struct nexthop *nh) +static void nexthop_free_group(struct nexthop *nh) { struct nh_group *nhg; int i; @@ -109,7 +264,7 @@ void nexthop_free_rcu(struct rcu_head *head) struct nexthop *nh = container_of(head, struct nexthop, rcu); if (nh->is_group) - nexthop_free_mpath(nh); + nexthop_free_group(nh); else nexthop_free_single(nh); @@ -434,7 +589,8 @@ static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, return 0; } -static int nh_check_attr_group(struct net *net, struct nlattr *tb[], +static int nh_check_attr_group(struct net *net, + struct nlattr *tb[], size_t tb_size, struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); @@ -493,10 +649,10 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], return -EINVAL; } } - for (i = NHA_GROUP_TYPE + 1; i < __NHA_MAX; ++i) { + for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { if (!tb[i]) continue; - if (tb[NHA_FDB]) + if (i == NHA_FDB) continue; NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); @@ -539,21 +695,16 @@ static bool ipv4_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } -struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) +static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash) { struct nexthop *rc = NULL; - struct nh_group *nhg; int i; - if (!nh->is_group) - return nh; - - nhg = rcu_dereference(nh->nh_grp); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi; - if (hash > atomic_read(&nhge->upper_bound)) + if (hash > atomic_read(&nhge->mpath.upper_bound)) continue; nhi = 
rcu_dereference(nhge->nh->nh_info); @@ -580,6 +731,21 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) return rc; } + +struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) +{ + struct nh_group *nhg; + + if (!nh->is_group) + return nh; + + nhg = rcu_dereference(nh->nh_grp); + if (nhg->mpath) + return nexthop_select_path_mp(nhg, hash); + + /* Unreachable. */ + return NULL; +} EXPORT_SYMBOL_GPL(nexthop_select_path); int nexthop_for_each_fib6_nh(struct nexthop *nh, @@ -773,7 +939,7 @@ static void nh_group_rebalance(struct nh_group *nhg) w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; - atomic_set(&nhge->upper_bound, upper_bound); + atomic_set(&nhge->mpath.upper_bound, upper_bound); } } @@ -782,9 +948,10 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, { struct nh_grp_entry *nhges, *new_nhges; struct nexthop *nhp = nhge->nh_parent; + struct netlink_ext_ack extack; struct nexthop *nh = nhge->nh; struct nh_group *nhg, *newg; - int i, j; + int i, j, err; WARN_ON(!nh); @@ -832,6 +999,10 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, list_del(&nhge->nh_list); nexthop_put(nhge->nh); + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack); + if (err) + pr_err("%s\n", extack._msg); + if (nlinfo) nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo); } @@ -907,7 +1078,7 @@ static void __remove_nexthop(struct net *net, struct nexthop *nh, static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { - call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); + call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL); /* remove from the tree */ rb_erase(&nh->rb_node, &net->nexthop.rb_root); @@ -940,13 +1111,17 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old, struct netlink_ext_ack *extack) { struct nh_group *oldg, *newg; - int i; + int i, err; if (!new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace 
a nexthop group with a nexthop."); return -EINVAL; } + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); + if (err) + return err; + oldg = rtnl_dereference(old->nh_grp); newg = rtnl_dereference(new->nh_grp); @@ -985,31 +1160,54 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { + u8 old_protocol, old_nh_flags; struct nh_info *oldi, *newi; + struct nh_grp_entry *nhge; + int err; if (new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group."); return -EINVAL; } + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); + if (err) + return err; + + /* Hardware flags were set on 'old' as 'new' is not in the red-black + * tree. Therefore, inherit the flags from 'old' to 'new'. + */ + new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP); + oldi = rtnl_dereference(old->nh_info); newi = rtnl_dereference(new->nh_info); newi->nh_parent = old; oldi->nh_parent = new; + old_protocol = old->protocol; + old_nh_flags = old->nh_flags; + old->protocol = new->protocol; old->nh_flags = new->nh_flags; rcu_assign_pointer(old->nh_info, newi); rcu_assign_pointer(new->nh_info, oldi); + /* Send a replace notification for all the groups using the nexthop. */ + list_for_each_entry(nhge, &old->grp_list, nh_list) { + struct nexthop *nhp = nhge->nh_parent; + + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, + extack); + if (err) + goto err_notify; + } + /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially * update IPv4 indication in all the groups using the nexthop. 
*/ if (oldi->family == AF_INET && newi->family == AF_INET6) { - struct nh_grp_entry *nhge; - list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; struct nh_group *nhg; @@ -1020,6 +1218,21 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old, } return 0; + +err_notify: + rcu_assign_pointer(new->nh_info, newi); + rcu_assign_pointer(old->nh_info, oldi); + old->nh_flags = old_nh_flags; + old->protocol = old_protocol; + oldi->nh_parent = old; + newi->nh_parent = new; + list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) { + struct nexthop *nhp = nhge->nh_parent; + + call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, extack); + } + call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack); + return err; } static void __nexthop_replace_notify(struct net *net, struct nexthop *nh, @@ -1168,7 +1381,11 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh, rb_link_node_rcu(&new_nh->rb_node, parent, pp); rb_insert_color(&new_nh->rb_node, root); - rc = 0; + + rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack); + if (rc) + rb_erase(&new_nh->rb_node, &net->nexthop.rb_root); + out: if (!rc) { nh_base_seq_inc(net); @@ -1264,10 +1481,13 @@ static struct nexthop *nexthop_create_group(struct net *net, nhg->nh_entries[i].nh_parent = nh; } - if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { + if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) nhg->mpath = 1; + + WARN_ON_ONCE(nhg->mpath != 1); + + if (nhg->mpath) nh_group_rebalance(nhg); - } if (cfg->nh_fdb) nhg->fdb_nh = 1; @@ -1277,8 +1497,10 @@ static struct nexthop *nexthop_create_group(struct net *net, return nh; out_no_nh: - for (; i >= 0; --i) + for (i--; i >= 0; --i) { + list_del(&nhg->nh_entries[i].nh_list); nexthop_put(nhg->nh_entries[i].nh); + } kfree(nhg->spare); kfree(nhg); @@ -1459,11 +1681,12 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, struct netlink_ext_ack *extack) { struct 
nhmsg *nhm = nlmsg_data(nlh); - struct nlattr *tb[NHA_MAX + 1]; + struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)]; int err; - err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, - extack); + err = nlmsg_parse(nlh, sizeof(*nhm), tb, + ARRAY_SIZE(rtm_nh_policy_new) - 1, + rtm_nh_policy_new, extack); if (err < 0) return err; @@ -1490,11 +1713,6 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, goto out; } - if (tb[NHA_GROUPS] || tb[NHA_MASTER]) { - NL_SET_ERR_MSG(extack, "Invalid attributes in request"); - goto out; - } - memset(cfg, 0, sizeof(*cfg)); cfg->nlflags = nlh->nlmsg_flags; cfg->nlinfo.portid = NETLINK_CB(skb).portid; @@ -1536,7 +1754,7 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, NL_SET_ERR_MSG(extack, "Invalid group type"); goto out; } - err = nh_check_attr_group(net, tb, extack); + err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), extack); /* no other attributes should be set */ goto out; @@ -1654,49 +1872,44 @@ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } -static int nh_valid_get_del_req(struct nlmsghdr *nlh, u32 *id, - struct netlink_ext_ack *extack) +static int __nh_valid_get_del_req(const struct nlmsghdr *nlh, + struct nlattr **tb, u32 *id, + struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); - struct nlattr *tb[NHA_MAX + 1]; - int err, i; - err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, - extack); - if (err < 0) - return err; - - err = -EINVAL; - for (i = 0; i < __NHA_MAX; ++i) { - if (!tb[i]) - continue; - - switch (i) { - case NHA_ID: - break; - default: - NL_SET_ERR_MSG_ATTR(extack, tb[i], - "Unexpected attribute in request"); - goto out; - } - } if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header"); - goto out; + return -EINVAL; } if (!tb[NHA_ID]) { NL_SET_ERR_MSG(extack, "Nexthop id is missing"); - goto out; + return -EINVAL; } *id = 
nla_get_u32(tb[NHA_ID]); - if (!(*id)) + if (!(*id)) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); - else - err = 0; -out: - return err; + return -EINVAL; + } + + return 0; +} + +static int nh_valid_get_del_req(const struct nlmsghdr *nlh, u32 *id, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)]; + int err; + + err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, + ARRAY_SIZE(rtm_nh_policy_get) - 1, + rtm_nh_policy_get, extack); + if (err < 0) + return err; + + return __nh_valid_get_del_req(nlh, tb, id, extack); } /* rtnl */ @@ -1765,16 +1978,23 @@ errout_free: goto out; } -static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx, - bool group_filter, u8 family) +struct nh_dump_filter { + int dev_idx; + int master_idx; + bool group_filter; + bool fdb_filter; +}; + +static bool nh_dump_filtered(struct nexthop *nh, + struct nh_dump_filter *filter, u8 family) { const struct net_device *dev; const struct nh_info *nhi; - if (group_filter && !nh->is_group) + if (filter->group_filter && !nh->is_group) return true; - if (!dev_idx && !master_idx && !family) + if (!filter->dev_idx && !filter->master_idx && !family) return false; if (nh->is_group) @@ -1785,70 +2005,48 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx, return true; dev = nhi->fib_nhc.nhc_dev; - if (dev_idx && (!dev || dev->ifindex != dev_idx)) + if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx)) return true; - if (master_idx) { + if (filter->master_idx) { struct net_device *master; if (!dev) return true; master = netdev_master_upper_dev_get((struct net_device *)dev); - if (!master || master->ifindex != master_idx) + if (!master || master->ifindex != filter->master_idx) return true; } return false; } -static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, - int *master_idx, bool *group_filter, - bool *fdb_filter, struct netlink_callback *cb) +static int __nh_valid_dump_req(const struct nlmsghdr 
*nlh, struct nlattr **tb, + struct nh_dump_filter *filter, + struct netlink_ext_ack *extack) { - struct netlink_ext_ack *extack = cb->extack; - struct nlattr *tb[NHA_MAX + 1]; struct nhmsg *nhm; - int err, i; u32 idx; - err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, - NULL); - if (err < 0) - return err; - - for (i = 0; i <= NHA_MAX; ++i) { - if (!tb[i]) - continue; - - switch (i) { - case NHA_OIF: - idx = nla_get_u32(tb[i]); - if (idx > INT_MAX) { - NL_SET_ERR_MSG(extack, "Invalid device index"); - return -EINVAL; - } - *dev_idx = idx; - break; - case NHA_MASTER: - idx = nla_get_u32(tb[i]); - if (idx > INT_MAX) { - NL_SET_ERR_MSG(extack, "Invalid master device index"); - return -EINVAL; - } - *master_idx = idx; - break; - case NHA_GROUPS: - *group_filter = true; - break; - case NHA_FDB: - *fdb_filter = true; - break; - default: - NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); + if (tb[NHA_OIF]) { + idx = nla_get_u32(tb[NHA_OIF]); + if (idx > INT_MAX) { + NL_SET_ERR_MSG(extack, "Invalid device index"); return -EINVAL; } + filter->dev_idx = idx; } + if (tb[NHA_MASTER]) { + idx = nla_get_u32(tb[NHA_MASTER]); + if (idx > INT_MAX) { + NL_SET_ERR_MSG(extack, "Invalid master device index"); + return -EINVAL; + } + filter->master_idx = idx; + } + filter->group_filter = nla_get_flag(tb[NHA_GROUPS]); + filter->fdb_filter = nla_get_flag(tb[NHA_FDB]); nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { @@ -1859,24 +2057,49 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, return 0; } -/* rtnl */ -static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) +static int nh_valid_dump_req(const struct nlmsghdr *nlh, + struct nh_dump_filter *filter, + struct netlink_callback *cb) { - bool group_filter = false, fdb_filter = false; - struct nhmsg *nhm = nlmsg_data(cb->nlh); - int dev_filter_idx = 0, master_idx = 0; - struct net *net = sock_net(skb->sk); - struct 
rb_root *root = &net->nexthop.rb_root; - struct rb_node *node; - int idx = 0, s_idx; + struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)]; int err; - err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx, - &group_filter, &fdb_filter, cb); + err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, + ARRAY_SIZE(rtm_nh_policy_dump) - 1, + rtm_nh_policy_dump, cb->extack); if (err < 0) return err; - s_idx = cb->args[0]; + return __nh_valid_dump_req(nlh, tb, filter, cb->extack); +} + +struct rtm_dump_nh_ctx { + u32 idx; +}; + +static struct rtm_dump_nh_ctx * +rtm_dump_nh_ctx(struct netlink_callback *cb) +{ + struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + return ctx; +} + +static int rtm_dump_walk_nexthops(struct sk_buff *skb, + struct netlink_callback *cb, + struct rb_root *root, + struct rtm_dump_nh_ctx *ctx, + int (*nh_cb)(struct sk_buff *skb, + struct netlink_callback *cb, + struct nexthop *nh, void *data), + void *data) +{ + struct rb_node *node; + int idx = 0, s_idx; + int err; + + s_idx = ctx->idx; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; @@ -1884,30 +2107,58 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) goto cont; nh = rb_entry(node, struct nexthop, rb_node); - if (nh_dump_filtered(nh, dev_filter_idx, master_idx, - group_filter, nhm->nh_family)) - goto cont; - - err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err < 0) { - if (likely(skb->len)) - goto out; - - goto out_err; - } + ctx->idx = idx; + err = nh_cb(skb, cb, nh, data); + if (err) + return err; cont: idx++; } + ctx->idx = idx; + return 0; +} + +static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb, + struct nexthop *nh, void *data) +{ + struct nhmsg *nhm = nlmsg_data(cb->nlh); + struct nh_dump_filter *filter = data; + + if (nh_dump_filtered(nh, filter, nhm->nh_family)) + return 0; + + return 
nh_fill_node(skb, nh, RTM_NEWNEXTHOP, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); +} + +/* rtnl */ +static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb); + struct net *net = sock_net(skb->sk); + struct rb_root *root = &net->nexthop.rb_root; + struct nh_dump_filter filter = {}; + int err; + + err = nh_valid_dump_req(cb->nlh, &filter, cb); + if (err < 0) + return err; + + err = rtm_dump_walk_nexthops(skb, cb, root, ctx, + &rtm_dump_nexthop_cb, &filter); + if (err < 0) { + if (likely(skb->len)) + goto out; + goto out_err; + } + out: err = skb->len; out_err: - cb->args[0] = idx; cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); - return err; } @@ -1957,10 +2208,40 @@ static struct notifier_block nh_netdev_notifier = { .notifier_call = nh_netdev_event, }; -int register_nexthop_notifier(struct net *net, struct notifier_block *nb) +static int nexthops_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) +{ + struct rb_root *root = &net->nexthop.rb_root; + struct rb_node *node; + int err = 0; + + for (node = rb_first(root); node; node = rb_next(node)) { + struct nexthop *nh; + + nh = rb_entry(node, struct nexthop, rb_node); + err = call_nexthop_notifier(nb, net, NEXTHOP_EVENT_REPLACE, nh, + extack); + if (err) + break; + } + + return err; +} + +int register_nexthop_notifier(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return blocking_notifier_chain_register(&net->nexthop.notifier_chain, - nb); + int err; + + rtnl_lock(); + err = nexthops_dump(net, nb, extack); + if (err) + goto unlock; + err = blocking_notifier_chain_register(&net->nexthop.notifier_chain, + nb); +unlock: + rtnl_unlock(); + return err; } EXPORT_SYMBOL(register_nexthop_notifier); @@ -1971,6 +2252,27 @@ int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) } 
EXPORT_SYMBOL(unregister_nexthop_notifier); +void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap) +{ + struct nexthop *nexthop; + + rcu_read_lock(); + + nexthop = nexthop_find_by_id(net, id); + if (!nexthop) + goto out; + + nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); + if (offload) + nexthop->nh_flags |= RTNH_F_OFFLOAD; + if (trap) + nexthop->nh_flags |= RTNH_F_TRAP; + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(nexthop_set_hw_flags); + static void __net_exit nexthop_net_exit(struct net *net) { rtnl_lock(); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 248856b301c4..8b943f85fff9 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -778,7 +778,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4.fl4_icmp_type = user_icmph.type; fl4.fl4_icmp_code = user_icmph.code; - security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8d5e1695b9aa..6d46297a99f8 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -167,6 +167,7 @@ static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI), + SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS), SNMP_MIB_SENTINEL }; @@ -463,30 +464,52 @@ static int snmp_seq_show(struct seq_file *seq, void *v) */ static int netstat_seq_show(struct seq_file *seq, void *v) { - int i; + const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list) - 1; + const int tcp_cnt = ARRAY_SIZE(snmp4_net_list) - 1; struct net *net = seq->private; + unsigned long *buff; + int i; seq_puts(seq, "TcpExt:"); - for (i = 0; snmp4_net_list[i].name; i++) + for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %s", snmp4_net_list[i].name); seq_puts(seq, "\nTcpExt:"); - for (i = 0; 
snmp4_net_list[i].name; i++) - seq_printf(seq, " %lu", - snmp_fold_field(net->mib.net_statistics, - snmp4_net_list[i].entry)); - + buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)), + GFP_KERNEL); + if (buff) { + snmp_get_cpu_field_batch(buff, snmp4_net_list, + net->mib.net_statistics); + for (i = 0; i < tcp_cnt; i++) + seq_printf(seq, " %lu", buff[i]); + } else { + for (i = 0; i < tcp_cnt; i++) + seq_printf(seq, " %lu", + snmp_fold_field(net->mib.net_statistics, + snmp4_net_list[i].entry)); + } seq_puts(seq, "\nIpExt:"); - for (i = 0; snmp4_ipextstats_list[i].name; i++) + for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %s", snmp4_ipextstats_list[i].name); seq_puts(seq, "\nIpExt:"); - for (i = 0; snmp4_ipextstats_list[i].name; i++) - seq_printf(seq, " %llu", - snmp_fold_field64(net->mib.ip_statistics, - snmp4_ipextstats_list[i].entry, - offsetof(struct ipstats_mib, syncp))); - + if (buff) { + u64 *buff64 = (u64 *)buff; + + memset(buff64, 0, ip_cnt * sizeof(u64)); + snmp_get_cpu_field64_batch(buff64, snmp4_ipextstats_list, + net->mib.ip_statistics, + offsetof(struct ipstats_mib, syncp)); + for (i = 0; i < ip_cnt; i++) + seq_printf(seq, " %llu", buff64[i]); + } else { + for (i = 0; i < ip_cnt; i++) + seq_printf(seq, " %llu", + snmp_fold_field64(net->mib.ip_statistics, + snmp4_ipextstats_list[i].entry, + offsetof(struct ipstats_mib, syncp))); + } + kfree(buff); seq_putc(seq, '\n'); mptcp_seq_show(seq); return 0; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 7d26e0f8bdae..50a73178d63a 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -640,7 +640,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto done; } - security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index dc2a399cd9f4..02d81d79deeb 100644 --- a/net/ipv4/route.c +++ 
b/net/ipv4/route.c @@ -133,9 +133,11 @@ static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; * Interface to generic destination cache. */ -static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); +INDIRECT_CALLABLE_SCOPE +struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); -static unsigned int ipv4_mtu(const struct dst_entry *dst); +INDIRECT_CALLABLE_SCOPE +unsigned int ipv4_mtu(const struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, @@ -1187,7 +1189,8 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) } EXPORT_SYMBOL_GPL(ipv4_sk_redirect); -static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, + u32 cookie) { struct rtable *rt = (struct rtable *) dst; @@ -1203,6 +1206,7 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) return NULL; return dst; } +EXPORT_INDIRECT_CALLABLE(ipv4_dst_check); static void ipv4_send_dest_unreach(struct sk_buff *skb) { @@ -1311,7 +1315,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) return min(advmss, IPV4_MAX_PMTU - header_size); } -static unsigned int ipv4_mtu(const struct dst_entry *dst) +INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) { const struct rtable *rt = (const struct rtable *)dst; unsigned int mtu = rt->rt_pmtu; @@ -1333,6 +1337,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } +EXPORT_INDIRECT_CALLABLE(ipv4_mtu); static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr) { @@ -1741,7 +1746,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, flags |= 
RTCF_LOCAL; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false); + IN_DEV_ORCONF(in_dev, NOPOLICY), false); if (!rth) return -ENOBUFS; @@ -1857,8 +1862,8 @@ static int __mkroute_input(struct sk_buff *skb, } rth = rt_dst_alloc(out_dev->dev, 0, res->type, - IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(out_dev, NOXFRM)); + IN_DEV_ORCONF(in_dev, NOPOLICY), + IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { err = -ENOBUFS; goto cleanup; @@ -2227,7 +2232,7 @@ local_input: rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev, flags | RTCF_LOCAL, res->type, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false); + IN_DEV_ORCONF(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; @@ -2450,8 +2455,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res, add: rth = rt_dst_alloc(dev_out, flags, type, - IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(in_dev, NOXFRM)); + IN_DEV_ORCONF(in_dev, NOPOLICY), + IN_DEV_ORCONF(in_dev, NOXFRM)); if (!rth) return ERR_PTR(-ENOBUFS); @@ -2872,6 +2877,9 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (rt->dst.dev && nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) goto nla_put_failure; + if (rt->dst.lwtstate && + lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) + goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (rt->dst.tclassid && nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) @@ -3222,7 +3230,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.daddr = dst; fl4.saddr = src; - fl4.flowi4_tos = rtm->rtm_tos; + fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK; fl4.flowi4_oif = tb[RTA_OIF] ? 
nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; @@ -3246,8 +3254,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.flowi4_iif = iif; /* for rt_fill_info */ skb->dev = dev; skb->mark = mark; - err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos, - dev, &res); + err = ip_route_input_rcu(skb, dst, src, + rtm->rtm_tos & IPTOS_RT_MASK, dev, + &res); rt = skb_rtable(skb); if (err == 0 && rt->dst.error) @@ -3295,6 +3304,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fri.type = rt->rt_type; fri.offload = 0; fri.trap = 0; + fri.offload_failed = 0; if (res.fa_head) { struct fib_alias *fa; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 6ac473b47f30..33792cf55a79 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -331,7 +331,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) __u32 cookie = ntohl(th->ack_seq) - 1; struct sock *ret = sk; struct request_sock *req; - int mss; + int full_space, mss; struct rtable *rt; __u8 rcv_wscale; struct flowi4 fl4; @@ -418,7 +418,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) inet_sk_flowi_flags(sk), opt->srr ? opt->faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid); - security_req_classify_flow(req, flowi4_to_flowi(&fl4)); + security_req_classify_flow(req, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); if (IS_ERR(rt)) { reqsk_free(req); @@ -427,8 +427,13 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = tp->window_clamp ? 
:dst_metric(&rt->dst, RTAX_WINDOW); + /* limit the window selection if the user enforce a smaller rx buffer */ + full_space = tcp_full_space(sk); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; - tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, + tcp_select_initial_window(sk, full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3e5f4f2e705e..f55095d3ed16 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1354,6 +1354,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, + { + .procname = "fib_notify_on_flag_change", + .data = &init_net.ipv4.sysctl_fib_notify_on_flag_change, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b2bc3d7fe9e8..a3422e42784e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -280,6 +280,12 @@ #include <asm/ioctls.h> #include <net/busy_poll.h> +/* Track pending CMSGs. 
*/ +enum { + TCP_CMSG_INQ = 1, + TCP_CMSG_TS = 2 +}; + struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -475,19 +481,11 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) } } -static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, - int target, struct sock *sk) +static bool tcp_stream_is_readable(struct sock *sk, int target) { - int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq); - - if (avail > 0) { - if (avail >= target) - return true; - if (tcp_rmem_pressure(sk)) - return true; - if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss) - return true; - } + if (tcp_epollin_ready(sk, target)) + return true; + if (sk->sk_prot->stream_memory_read) return sk->sk_prot->stream_memory_read(sk); return false; @@ -562,7 +560,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) tp->urg_data) target++; - if (tcp_stream_is_readable(tp, target, sk)) + if (tcp_stream_is_readable(sk, target)) mask |= EPOLLIN | EPOLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { @@ -954,7 +952,7 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags) * importantly be able to generate EPOLLOUT for Edge Trigger epoll() * users. 
*/ -static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb) +void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb) { if (skb && !skb->len) { tcp_unlink_write_queue(skb, sk); @@ -964,6 +962,68 @@ static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb) } } +struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags, + struct page *page, int offset, size_t *size) +{ + struct sk_buff *skb = tcp_write_queue_tail(sk); + struct tcp_sock *tp = tcp_sk(sk); + bool can_coalesce; + int copy, i; + + if (!skb || (copy = size_goal - skb->len) <= 0 || + !tcp_skb_can_collapse_to(skb)) { +new_segment: + if (!sk_stream_memory_free(sk)) + return NULL; + + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, + tcp_rtx_and_write_queues_empty(sk)); + if (!skb) + return NULL; + +#ifdef CONFIG_TLS_DEVICE + skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); +#endif + skb_entail(sk, skb); + copy = size_goal; + } + + if (copy > *size) + copy = *size; + + i = skb_shinfo(skb)->nr_frags; + can_coalesce = skb_can_coalesce(skb, i, page, offset); + if (!can_coalesce && i >= sysctl_max_skb_frags) { + tcp_mark_push(tp, skb); + goto new_segment; + } + if (!sk_wmem_schedule(sk, copy)) + return NULL; + + if (can_coalesce) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); + } else { + get_page(page); + skb_fill_page_desc(skb, i, page, offset, copy); + } + + if (!(flags & MSG_NO_SHARED_FRAGS)) + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; + + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + sk_wmem_queued_add(sk, copy); + sk_mem_charge(sk, copy); + skb->ip_summed = CHECKSUM_PARTIAL; + WRITE_ONCE(tp->write_seq, tp->write_seq + copy); + TCP_SKB_CB(skb)->end_seq += copy; + tcp_skb_pcount_set(skb, 0); + + *size = copy; + return skb; +} + ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags) { @@ -999,60 +1059,13 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int 
offset, goto out_err; while (size > 0) { - struct sk_buff *skb = tcp_write_queue_tail(sk); - int copy, i; - bool can_coalesce; + struct sk_buff *skb; + size_t copy = size; - if (!skb || (copy = size_goal - skb->len) <= 0 || - !tcp_skb_can_collapse_to(skb)) { -new_segment: - if (!sk_stream_memory_free(sk)) - goto wait_for_space; - - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, - tcp_rtx_and_write_queues_empty(sk)); - if (!skb) - goto wait_for_space; - -#ifdef CONFIG_TLS_DEVICE - skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); -#endif - skb_entail(sk, skb); - copy = size_goal; - } - - if (copy > size) - copy = size; - - i = skb_shinfo(skb)->nr_frags; - can_coalesce = skb_can_coalesce(skb, i, page, offset); - if (!can_coalesce && i >= sysctl_max_skb_frags) { - tcp_mark_push(tp, skb); - goto new_segment; - } - if (!sk_wmem_schedule(sk, copy)) + skb = tcp_build_frag(sk, size_goal, flags, page, offset, ©); + if (!skb) goto wait_for_space; - if (can_coalesce) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); - } else { - get_page(page); - skb_fill_page_desc(skb, i, page, offset, copy); - } - - if (!(flags & MSG_NO_SHARED_FRAGS)) - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; - - skb->len += copy; - skb->data_len += copy; - skb->truesize += copy; - sk_wmem_queued_add(sk, copy); - sk_mem_charge(sk, copy); - skb->ip_summed = CHECKSUM_PARTIAL; - WRITE_ONCE(tp->write_seq, tp->write_seq + copy); - TCP_SKB_CB(skb)->end_seq += copy; - tcp_skb_pcount_set(skb, 0); - if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; @@ -1202,7 +1215,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) { skb = tcp_write_queue_tail(sk); - uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; goto out_err; @@ -1414,7 +1427,7 @@ out: tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } 
out_nopush: - sock_zerocopy_put(uarg); + net_zcopy_put(uarg); return copied + copied_syn; do_error: @@ -1425,7 +1438,7 @@ do_fault: if (copied + copied_syn) goto out; out_err: - sock_zerocopy_put_abort(uarg, true); + net_zcopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { @@ -1724,6 +1737,20 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_set_rcvlowat); +static void tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping_internal *tss) +{ + if (skb->tstamp) + tss->ts[0] = ktime_to_timespec64(skb->tstamp); + else + tss->ts[0] = (struct timespec64) {0}; + + if (skb_hwtstamps(skb)->hwtstamp) + tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); + else + tss->ts[2] = (struct timespec64) {0}; +} + #ifdef CONFIG_MMU static const struct vm_operations_struct tcp_vm_ops = { }; @@ -1743,52 +1770,308 @@ int tcp_mmap(struct file *file, struct socket *sock, } EXPORT_SYMBOL(tcp_mmap); +static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb, + u32 *offset_frag) +{ + skb_frag_t *frag; + + offset_skb -= skb_headlen(skb); + if ((int)offset_skb < 0 || skb_has_frag_list(skb)) + return NULL; + + frag = skb_shinfo(skb)->frags; + while (offset_skb) { + if (skb_frag_size(frag) > offset_skb) { + *offset_frag = offset_skb; + return frag; + } + offset_skb -= skb_frag_size(frag); + ++frag; + } + *offset_frag = 0; + return frag; +} + +static bool can_map_frag(const skb_frag_t *frag) +{ + return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag); +} + +static int find_next_mappable_frag(const skb_frag_t *frag, + int remaining_in_skb) +{ + int offset = 0; + + if (likely(can_map_frag(frag))) + return 0; + + while (offset < remaining_in_skb && !can_map_frag(frag)) { + offset += skb_frag_size(frag); + ++frag; + } + return offset; +} + +static void tcp_zerocopy_set_hint_for_skb(struct sock 
*sk, + struct tcp_zerocopy_receive *zc, + struct sk_buff *skb, u32 offset) +{ + u32 frag_offset, partial_frag_remainder = 0; + int mappable_offset; + skb_frag_t *frag; + + /* worst case: skip to next skb. try to improve on this case below */ + zc->recv_skip_hint = skb->len - offset; + + /* Find the frag containing this offset (and how far into that frag) */ + frag = skb_advance_to_frag(skb, offset, &frag_offset); + if (!frag) + return; + + if (frag_offset) { + struct skb_shared_info *info = skb_shinfo(skb); + + /* We read part of the last frag, must recvmsg() rest of skb. */ + if (frag == &info->frags[info->nr_frags - 1]) + return; + + /* Else, we must at least read the remainder in this frag. */ + partial_frag_remainder = skb_frag_size(frag) - frag_offset; + zc->recv_skip_hint -= partial_frag_remainder; + ++frag; + } + + /* partial_frag_remainder: If part way through a frag, must read rest. + * mappable_offset: Bytes till next mappable frag, *not* counting bytes + * in partial_frag_remainder. 
+ */ + mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint); + zc->recv_skip_hint = mappable_offset + partial_frag_remainder; +} + +static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, + struct scm_timestamping_internal *tss, + int *cmsg_flags); +static int receive_fallback_to_copy(struct sock *sk, + struct tcp_zerocopy_receive *zc, int inq, + struct scm_timestamping_internal *tss) +{ + unsigned long copy_address = (unsigned long)zc->copybuf_address; + struct msghdr msg = {}; + struct iovec iov; + int err; + + zc->length = 0; + zc->recv_skip_hint = 0; + + if (copy_address != zc->copybuf_address) + return -EINVAL; + + err = import_single_range(READ, (void __user *)copy_address, + inq, &iov, &msg.msg_iter); + if (err) + return err; + + err = tcp_recvmsg_locked(sk, &msg, inq, /*nonblock=*/1, /*flags=*/0, + tss, &zc->msg_flags); + if (err < 0) + return err; + + zc->copybuf_len = err; + if (likely(zc->copybuf_len)) { + struct sk_buff *skb; + u32 offset; + + skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset); + if (skb) + tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset); + } + return 0; +} + +static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, + struct sk_buff *skb, u32 copylen, + u32 *offset, u32 *seq) +{ + unsigned long copy_address = (unsigned long)zc->copybuf_address; + struct msghdr msg = {}; + struct iovec iov; + int err; + + if (copy_address != zc->copybuf_address) + return -EINVAL; + + err = import_single_range(READ, (void __user *)copy_address, + copylen, &iov, &msg.msg_iter); + if (err) + return err; + err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); + if (err) + return err; + zc->recv_skip_hint -= copylen; + *offset += copylen; + *seq += copylen; + return (__s32)copylen; +} + +static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc, + struct sock *sk, + struct sk_buff *skb, + u32 *seq, + s32 copybuf_len, + struct scm_timestamping_internal *tss) +{ + 
u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint); + + if (!copylen) + return 0; + /* skb is null if inq < PAGE_SIZE. */ + if (skb) { + offset = *seq - TCP_SKB_CB(skb)->seq; + } else { + skb = tcp_recv_skb(sk, *seq, &offset); + if (TCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, tss); + zc->msg_flags |= TCP_CMSG_TS; + } + } + + zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset, + seq); + return zc->copybuf_len < 0 ? 0 : copylen; +} + +static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, + struct page **pending_pages, + unsigned long pages_remaining, + unsigned long *address, + u32 *length, + u32 *seq, + struct tcp_zerocopy_receive *zc, + u32 total_bytes_to_map, + int err) +{ + /* At least one page did not map. Try zapping if we skipped earlier. */ + if (err == -EBUSY && + zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) { + u32 maybe_zap_len; + + maybe_zap_len = total_bytes_to_map - /* All bytes to map */ + *length + /* Mapped or pending */ + (pages_remaining * PAGE_SIZE); /* Failed map. */ + zap_page_range(vma, *address, maybe_zap_len); + err = 0; + } + + if (!err) { + unsigned long leftover_pages = pages_remaining; + int bytes_mapped; + + /* We called zap_page_range, try to reinsert. */ + err = vm_insert_pages(vma, *address, + pending_pages, + &pages_remaining); + bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining); + *seq += bytes_mapped; + *address += bytes_mapped; + } + if (err) { + /* Either we were unable to zap, OR we zapped, retried an + * insert, and still had an issue. Either ways, pages_remaining + * is the number of pages we were unable to map, and we unroll + * some state we speculatively touched before. 
+ */ + const int bytes_not_mapped = PAGE_SIZE * pages_remaining; + + *length -= bytes_not_mapped; + zc->recv_skip_hint += bytes_not_mapped; + } + return err; +} + static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, struct page **pages, - unsigned long pages_to_map, - unsigned long *insert_addr, - u32 *length_with_pending, + unsigned int pages_to_map, + unsigned long *address, + u32 *length, u32 *seq, - struct tcp_zerocopy_receive *zc) + struct tcp_zerocopy_receive *zc, + u32 total_bytes_to_map) { unsigned long pages_remaining = pages_to_map; - int bytes_mapped; - int ret; + unsigned int pages_mapped; + unsigned int bytes_mapped; + int err; - ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining); - bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining); + err = vm_insert_pages(vma, *address, pages, &pages_remaining); + pages_mapped = pages_to_map - (unsigned int)pages_remaining; + bytes_mapped = PAGE_SIZE * pages_mapped; /* Even if vm_insert_pages fails, it may have partially succeeded in * mapping (some but not all of the pages). */ *seq += bytes_mapped; - *insert_addr += bytes_mapped; - if (ret) { - /* But if vm_insert_pages did fail, we have to unroll some state - * we speculatively touched before. - */ - const int bytes_not_mapped = PAGE_SIZE * pages_remaining; - *length_with_pending -= bytes_not_mapped; - zc->recv_skip_hint += bytes_not_mapped; + *address += bytes_mapped; + + if (likely(!err)) + return 0; + + /* Error: maybe zap and retry + rollback state for failed inserts. 
*/ + return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped, + pages_remaining, address, length, seq, zc, total_bytes_to_map, + err); +} + +#define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS) +static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping_internal *tss); +static void tcp_zc_finalize_rx_tstamp(struct sock *sk, + struct tcp_zerocopy_receive *zc, + struct scm_timestamping_internal *tss) +{ + unsigned long msg_control_addr; + struct msghdr cmsg_dummy; + + msg_control_addr = (unsigned long)zc->msg_control; + cmsg_dummy.msg_control = (void *)msg_control_addr; + cmsg_dummy.msg_controllen = + (__kernel_size_t)zc->msg_controllen; + cmsg_dummy.msg_flags = in_compat_syscall() + ? MSG_CMSG_COMPAT : 0; + zc->msg_flags = 0; + if (zc->msg_control == msg_control_addr && + zc->msg_controllen == cmsg_dummy.msg_controllen) { + tcp_recv_timestamp(&cmsg_dummy, sk, tss); + zc->msg_control = (__u64) + ((uintptr_t)cmsg_dummy.msg_control); + zc->msg_controllen = + (__u64)cmsg_dummy.msg_controllen; + zc->msg_flags = (__u32)cmsg_dummy.msg_flags; } - return ret; } +#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32 static int tcp_zerocopy_receive(struct sock *sk, - struct tcp_zerocopy_receive *zc) + struct tcp_zerocopy_receive *zc, + struct scm_timestamping_internal *tss) { + u32 length = 0, offset, vma_len, avail_len, copylen = 0; unsigned long address = (unsigned long)zc->address; - u32 length = 0, seq, offset, zap_len; - #define PAGE_BATCH_SIZE 8 - struct page *pages[PAGE_BATCH_SIZE]; + struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; + s32 copybuf_len = zc->copybuf_len; + struct tcp_sock *tp = tcp_sk(sk); const skb_frag_t *frags = NULL; + unsigned int pages_to_map = 0; struct vm_area_struct *vma; struct sk_buff *skb = NULL; - unsigned long pg_idx = 0; - unsigned long curr_addr; - struct tcp_sock *tp; - int inq; + u32 seq = tp->copied_seq; + u32 total_bytes_to_map; + int inq = tcp_inq(sk); int ret; + zc->copybuf_len = 0; + zc->msg_flags = 
0; + if (address & (PAGE_SIZE - 1) || address != zc->address) return -EINVAL; @@ -1797,7 +2080,16 @@ static int tcp_zerocopy_receive(struct sock *sk, sock_rps_record_flow(sk); - tp = tcp_sk(sk); + if (inq && inq <= copybuf_len) + return receive_fallback_to_copy(sk, zc, inq, tss); + + if (inq < PAGE_SIZE) { + zc->length = 0; + zc->recv_skip_hint = inq; + if (!inq && sock_flag(sk, SOCK_DONE)) + return -EIO; + return 0; + } mmap_read_lock(current->mm); @@ -1806,33 +2098,26 @@ static int tcp_zerocopy_receive(struct sock *sk, mmap_read_unlock(current->mm); return -EINVAL; } - zc->length = min_t(unsigned long, zc->length, vma->vm_end - address); - - seq = tp->copied_seq; - inq = tcp_inq(sk); - zc->length = min_t(u32, zc->length, inq); - zap_len = zc->length & ~(PAGE_SIZE - 1); - if (zap_len) { - zap_page_range(vma, address, zap_len); + vma_len = min_t(unsigned long, zc->length, vma->vm_end - address); + avail_len = min_t(u32, vma_len, inq); + total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); + if (total_bytes_to_map) { + if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) + zap_page_range(vma, address, total_bytes_to_map); + zc->length = total_bytes_to_map; zc->recv_skip_hint = 0; } else { - zc->recv_skip_hint = zc->length; + zc->length = avail_len; + zc->recv_skip_hint = avail_len; } ret = 0; - curr_addr = address; while (length + PAGE_SIZE <= zc->length) { + int mappable_offset; + struct page *page; + if (zc->recv_skip_hint < PAGE_SIZE) { - /* If we're here, finish the current batch. 
*/ - if (pg_idx) { - ret = tcp_zerocopy_vm_insert_batch(vma, pages, - pg_idx, - &curr_addr, - &length, - &seq, zc); - if (ret) - goto out; - pg_idx = 0; - } + u32 offset_frag; + if (skb) { if (zc->recv_skip_hint > 0) break; @@ -1841,57 +2126,62 @@ static int tcp_zerocopy_receive(struct sock *sk, } else { skb = tcp_recv_skb(sk, seq, &offset); } + + if (TCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, tss); + zc->msg_flags |= TCP_CMSG_TS; + } zc->recv_skip_hint = skb->len - offset; - offset -= skb_headlen(skb); - if ((int)offset < 0 || skb_has_frag_list(skb)) + frags = skb_advance_to_frag(skb, offset, &offset_frag); + if (!frags || offset_frag) break; - frags = skb_shinfo(skb)->frags; - while (offset) { - if (skb_frag_size(frags) > offset) - goto out; - offset -= skb_frag_size(frags); - frags++; - } } - if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) { - int remaining = zc->recv_skip_hint; - while (remaining && (skb_frag_size(frags) != PAGE_SIZE || - skb_frag_off(frags))) { - remaining -= skb_frag_size(frags); - frags++; - } - zc->recv_skip_hint -= remaining; + mappable_offset = find_next_mappable_frag(frags, + zc->recv_skip_hint); + if (mappable_offset) { + zc->recv_skip_hint = mappable_offset; break; } - pages[pg_idx] = skb_frag_page(frags); - pg_idx++; + page = skb_frag_page(frags); + prefetchw(page); + pages[pages_to_map++] = page; length += PAGE_SIZE; zc->recv_skip_hint -= PAGE_SIZE; frags++; - if (pg_idx == PAGE_BATCH_SIZE) { - ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx, - &curr_addr, &length, - &seq, zc); + if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE || + zc->recv_skip_hint < PAGE_SIZE) { + /* Either full batch, or we're about to go to next skb + * (and we cannot unroll failed ops across skbs). 
+ */ + ret = tcp_zerocopy_vm_insert_batch(vma, pages, + pages_to_map, + &address, &length, + &seq, zc, + total_bytes_to_map); if (ret) goto out; - pg_idx = 0; + pages_to_map = 0; } } - if (pg_idx) { - ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx, - &curr_addr, &length, &seq, - zc); + if (pages_to_map) { + ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, + &address, &length, &seq, + zc, total_bytes_to_map); } out: mmap_read_unlock(current->mm); - if (length) { + /* Try to copy straggler data. */ + if (!ret) + copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss); + + if (length + copylen) { WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ tcp_recv_skb(sk, seq, &offset); - tcp_cleanup_rbuf(sk, length); + tcp_cleanup_rbuf(sk, length + copylen); ret = 0; if (length == zc->length) zc->recv_skip_hint = 0; @@ -1904,20 +2194,6 @@ out: } #endif -static void tcp_update_recv_tstamps(struct sk_buff *skb, - struct scm_timestamping_internal *tss) -{ - if (skb->tstamp) - tss->ts[0] = ktime_to_timespec64(skb->tstamp); - else - tss->ts[0] = (struct timespec64) {0}; - - if (skb_hwtstamps(skb)->hwtstamp) - tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); - else - tss->ts[2] = (struct timespec64) {0}; -} - /* Similar to __sock_recv_timestamp, but does not require an skb */ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss) @@ -2013,36 +2289,28 @@ static int tcp_inq_hint(struct sock *sk) * Probably, code can be easily improved even more. 
*/ -int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, - int flags, int *addr_len) +static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, + struct scm_timestamping_internal *tss, + int *cmsg_flags) { struct tcp_sock *tp = tcp_sk(sk); int copied = 0; u32 peek_seq; u32 *seq; unsigned long used; - int err, inq; + int err; int target; /* Read at least this many bytes */ long timeo; struct sk_buff *skb, *last; u32 urg_hole = 0; - struct scm_timestamping_internal tss; - int cmsg_flags; - - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - - if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && - (sk->sk_state == TCP_ESTABLISHED)) - sk_busy_loop(sk, nonblock); - - lock_sock(sk); err = -ENOTCONN; if (sk->sk_state == TCP_LISTEN) goto out; - cmsg_flags = tp->recvmsg_inq ? 1 : 0; + if (tp->recvmsg_inq) + *cmsg_flags = TCP_CMSG_INQ; timeo = sock_rcvtimeo(sk, nonblock); /* Urgent data needs to be handled specially. */ @@ -2222,8 +2490,8 @@ skip_copy: } if (TCP_SKB_CB(skb)->has_rxtstamp) { - tcp_update_recv_tstamps(skb, &tss); - cmsg_flags |= 2; + tcp_update_recv_tstamps(skb, tss); + *cmsg_flags |= TCP_CMSG_TS; } if (used + offset < skb->len) @@ -2249,22 +2517,9 @@ found_fin_ok: /* Clean up data we have read: This will do ACK frames. 
*/ tcp_cleanup_rbuf(sk, copied); - - release_sock(sk); - - if (cmsg_flags) { - if (cmsg_flags & 2) - tcp_recv_timestamp(msg, sk, &tss); - if (cmsg_flags & 1) { - inq = tcp_inq_hint(sk); - put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); - } - } - return copied; out: - release_sock(sk); return err; recv_urg: @@ -2275,6 +2530,36 @@ recv_sndq: err = tcp_peek_sndq(sk, msg, len); goto out; } + +int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + int flags, int *addr_len) +{ + int cmsg_flags = 0, ret, inq; + struct scm_timestamping_internal tss; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + if (sk_can_busy_loop(sk) && + skb_queue_empty_lockless(&sk->sk_receive_queue) && + sk->sk_state == TCP_ESTABLISHED) + sk_busy_loop(sk, nonblock); + + lock_sock(sk); + ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss, + &cmsg_flags); + release_sock(sk); + + if (cmsg_flags && ret >= 0) { + if (cmsg_flags & TCP_CMSG_TS) + tcp_recv_timestamp(msg, sk, &tss); + if (cmsg_flags & TCP_CMSG_INQ) { + inq = tcp_inq_hint(sk); + put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); + } + } + return ret; +} EXPORT_SYMBOL(tcp_recvmsg); void tcp_set_state(struct sock *sk, int state) @@ -2405,13 +2690,12 @@ bool tcp_check_oom(struct sock *sk, int shift) return too_many_orphans || out_of_socket_memory; } -void tcp_close(struct sock *sk, long timeout) +void __tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; int data_was_unread = 0; int state; - lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (sk->sk_state == TCP_LISTEN) { @@ -2575,6 +2859,12 @@ adjudge_to_death: out: bh_unlock_sock(sk); local_bh_enable(); +} + +void tcp_close(struct sock *sk, long timeout) +{ + lock_sock(sk); + __tcp_close(sk, timeout); release_sock(sk); sock_put(sk); } @@ -2685,6 +2975,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_backoff = 0; icsk->icsk_probes_out = 0; + icsk->icsk_probes_tstamp = 0; 
icsk->icsk_rto = TCP_TIMEOUT_INIT; icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; @@ -3022,6 +3313,21 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_keepcnt); +int tcp_set_window_clamp(struct sock *sk, int val) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!val) { + if (sk->sk_state != TCP_CLOSE) + return -EINVAL; + tp->window_clamp = 0; + } else { + tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? + SOCK_MIN_RCVBUF / 2 : val; + } + return 0; +} + /* * Socket option code for TCP. */ @@ -3235,15 +3541,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, break; case TCP_WINDOW_CLAMP: - if (!val) { - if (sk->sk_state != TCP_CLOSE) { - err = -EINVAL; - break; - } - tp->window_clamp = 0; - } else - tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? - SOCK_MIN_RCVBUF / 2 : val; + err = tcp_set_window_clamp(sk, val); break; case TCP_QUICKACK: @@ -3507,11 +3805,24 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */ 0; } +/* Returns TTL or hop limit of an incoming packet from skb. 
*/ +static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb) +{ + if (skb->protocol == htons(ETH_P_IP)) + return ip_hdr(skb)->ttl; + else if (skb->protocol == htons(ETH_P_IPV6)) + return ipv6_hdr(skb)->hop_limit; + else + return 0; +} + struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, - const struct sk_buff *orig_skb) + const struct sk_buff *orig_skb, + const struct sk_buff *ack_skb) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; @@ -3567,6 +3878,9 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, max_t(int, 0, tp->write_seq - tp->snd_nxt)); nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, TCP_NLA_PAD); + if (ack_skb) + nla_put_u8(stats, TCP_NLA_TTL, + tcp_skb_ttl_or_hop_limit(ack_skb)); return stats; } @@ -3823,26 +4137,44 @@ static int do_tcp_getsockopt(struct sock *sk, int level, } #ifdef CONFIG_MMU case TCP_ZEROCOPY_RECEIVE: { - struct tcp_zerocopy_receive zc; + struct scm_timestamping_internal tss; + struct tcp_zerocopy_receive zc = {}; int err; if (get_user(len, optlen)) return -EFAULT; if (len < offsetofend(struct tcp_zerocopy_receive, length)) return -EINVAL; - if (len > sizeof(zc)) { + if (unlikely(len > sizeof(zc))) { + err = check_zeroed_user(optval + sizeof(zc), + len - sizeof(zc)); + if (err < 1) + return err == 0 ? 
-EINVAL : err; len = sizeof(zc); if (put_user(len, optlen)) return -EFAULT; } if (copy_from_user(&zc, optval, len)) return -EFAULT; + if (zc.reserved) + return -EINVAL; + if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS)) + return -EINVAL; lock_sock(sk); - err = tcp_zerocopy_receive(sk, &zc); + err = tcp_zerocopy_receive(sk, &zc, &tss); + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, + &zc, &len, err); release_sock(sk); - if (len == sizeof(zc)) - goto zerocopy_rcv_sk_err; + if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) + goto zerocopy_rcv_cmsg; switch (len) { + case offsetofend(struct tcp_zerocopy_receive, msg_flags): + goto zerocopy_rcv_cmsg; + case offsetofend(struct tcp_zerocopy_receive, msg_controllen): + case offsetofend(struct tcp_zerocopy_receive, msg_control): + case offsetofend(struct tcp_zerocopy_receive, flags): + case offsetofend(struct tcp_zerocopy_receive, copybuf_len): + case offsetofend(struct tcp_zerocopy_receive, copybuf_address): case offsetofend(struct tcp_zerocopy_receive, err): goto zerocopy_rcv_sk_err; case offsetofend(struct tcp_zerocopy_receive, inq): @@ -3851,6 +4183,11 @@ static int do_tcp_getsockopt(struct sock *sk, int level, default: goto zerocopy_rcv_out; } +zerocopy_rcv_cmsg: + if (zc.msg_flags & TCP_CMSG_TS) + tcp_zc_finalize_rx_tstamp(sk, &zc, &tss); + else + zc.msg_flags = 0; zerocopy_rcv_sk_err: if (!err) zc.err = sock_error(sk); @@ -3873,6 +4210,18 @@ zerocopy_rcv_out: return 0; } +bool tcp_bpf_bypass_getsockopt(int level, int optname) +{ + /* TCP do_tcp_getsockopt has optimized getsockopt implementation + * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. 
+ */ + if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) + return true; + + return false; +} +EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); + int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 6c4d79baff26..6ea3dc2e4219 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -945,7 +945,7 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) filter_expired = after(tcp_jiffies32, bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); if (rs->rtt_us >= 0 && - (rs->rtt_us <= bbr->min_rtt_us || + (rs->rtt_us < bbr->min_rtt_us || (filter_expired && !rs->is_ack_delayed))) { bbr->min_rtt_us = rs->rtt_us; bbr->min_rtt_stamp = tcp_jiffies32; diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 37f4cb2bba5c..bc7d2a586e18 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -15,8 +15,8 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, { struct iov_iter *iter = &msg->msg_iter; int peek = flags & MSG_PEEK; - int i, ret, copied = 0; struct sk_msg *msg_rx; + int i, copied = 0; msg_rx = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); @@ -37,17 +37,16 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, page = sg_page(sge); if (copied + copy > len) copy = len - copied; - ret = copy_page_to_iter(page, sge->offset, copy, iter); - if (ret != copy) { - msg_rx->sg.start = i; - return -EFAULT; - } + copy = copy_page_to_iter(page, sge->offset, copy, iter); + if (!copy) + return copied ? 
copied : -EFAULT; copied += copy; if (likely(!peek)) { sge->offset += copy; sge->length -= copy; - sk_mem_uncharge(sk, copy); + if (!msg_rx->skb) + sk_mem_uncharge(sk, copy); msg_rx->sg.size -= copy; if (!sge->length) { @@ -56,6 +55,11 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, put_page(page); } } else { + /* Lets not optimize peek case if copy_page_to_iter + * didn't copy the entire length lets just break. + */ + if (copy != sge->length) + return copied; sk_msg_iter_var_next(i); } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index db47ac24d057..563d016e7478 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -198,6 +198,11 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_setsockopt = 1; memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + if (ca->flags & TCP_CONG_NEEDS_ECN) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); + if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) tcp_init_congestion_control(sk); } diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index c7bf5b26bf0c..ffcbe46dacdb 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -104,16 +104,7 @@ struct bictcp { static inline void bictcp_reset(struct bictcp *ca) { - ca->cnt = 0; - ca->last_max_cwnd = 0; - ca->last_cwnd = 0; - ca->last_time = 0; - ca->bic_origin_point = 0; - ca->bic_K = 0; - ca->delay_min = 0; - ca->epoch_start = 0; - ca->ack_cnt = 0; - ca->tcp_cwnd = 0; + memset(ca, 0, offsetof(struct bictcp, unused)); ca->found = 0; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 389d1b340248..69a545db80d2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -510,7 +510,6 @@ static void tcp_init_buffer_space(struct sock *sk) if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) tcp_sndbuf_expand(sk); - tp->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss); tcp_mstamp_refresh(tp); tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; @@ 
-534,6 +533,8 @@ static void tcp_init_buffer_space(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_jiffies32; + tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd, + (u32)TCP_INIT_CWND * tp->advmss); } /* 4. Recalculate window clamp after socket hit its memory bounds. */ @@ -2546,7 +2547,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) * 1) If the packets in flight is larger than ssthresh, PRR spreads the * cwnd reductions across a full RTT. * 2) Otherwise PRR uses packet conservation to send as much as delivered. - * But when the retransmits are acked without further losses, PRR + * But when SND_UNA is acked without further losses, * slow starts cwnd up to ssthresh to speed up the recovery. */ static void tcp_init_cwnd_reduction(struct sock *sk) @@ -2563,7 +2564,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tcp_ecn_queue_cwr(tp); } -void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) +void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; @@ -2577,8 +2578,7 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + tp->prior_cwnd - 1; sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; - } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) == - FLAG_RETRANS_DATA_ACKED) { + } else if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) { sndcnt = min_t(int, delta, max_t(int, tp->prr_delivered - tp->prr_out, newly_acked_sacked) + 1); @@ -2689,7 +2689,22 @@ void tcp_simple_retransmit(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int mss = tcp_current_mss(sk); + int mss; + + /* A fastopen SYN request is stored as two separate packets within + * the retransmit queue, this is done by 
tcp_send_syn_data(). + * As a result simply checking the MSS of the frames in the queue + * will not work for the SYN packet. + * + * Us being here is an indication of a path MTU issue so we can + * assume that the fastopen SYN was lost and just mark all the + * frames in the retransmit queue as lost. We will use an MSS of + * -1 to mark all frames as lost, otherwise compute the current MSS. + */ + if (tp->syn_data && sk->sk_state == TCP_SYN_SENT) + mss = -1; + else + mss = tcp_current_mss(sk); skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { if (tcp_skb_seglen(skb) > mss) @@ -2844,7 +2859,8 @@ static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag) } else if (tcp_is_rack(sk)) { u32 prior_retrans = tp->retrans_out; - tcp_rack_mark_lost(sk); + if (tcp_rack_mark_lost(sk)) + *ack_flag &= ~FLAG_SET_XMIT_TIMER; if (prior_retrans > tp->retrans_out) *ack_flag |= FLAG_LOST_RETRANS; } @@ -3130,7 +3146,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) } static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, - u32 prior_snd_una) + const struct sk_buff *ack_skb, u32 prior_snd_una) { const struct skb_shared_info *shinfo; @@ -3142,7 +3158,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, if (!before(shinfo->tskey, prior_snd_una) && before(shinfo->tskey, tcp_sk(sk)->snd_una)) { tcp_skb_tsorted_save(skb) { - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + __skb_tstamp_tx(skb, ack_skb, NULL, sk, SCM_TSTAMP_ACK); } tcp_skb_tsorted_restore(skb); } } @@ -3151,8 +3167,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. 
*/ -static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, - u32 prior_snd_una, +static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, + u32 prior_fack, u32 prior_snd_una, struct tcp_sacktag_state *sack, bool ece_ack) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -3241,7 +3257,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (!fully_acked) break; - tcp_ack_tstamp(sk, skb, prior_snd_una); + tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una); next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) @@ -3259,7 +3275,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, tp->snd_up = tp->snd_una; if (skb) { - tcp_ack_tstamp(sk, skb, prior_snd_una); + tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) flag |= FLAG_SACK_RENEGING; } @@ -3369,6 +3385,7 @@ static void tcp_ack_probe(struct sock *sk) return; if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; + icsk->icsk_probes_tstamp = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). * This function is not for random using! 
@@ -3376,8 +3393,8 @@ static void tcp_ack_probe(struct sock *sk) } else { unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - when, TCP_RTO_MAX); + when = tcp_clamp_probe0_to_user_timeout(sk, when); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX); } } @@ -3419,7 +3436,7 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, if (tcp_in_cwnd_reduction(sk)) { /* Reduce cwnd if state mandates */ - tcp_cwnd_reduction(sk, acked_sacked, flag); + tcp_cwnd_reduction(sk, acked_sacked, rs->losses, flag); } else if (tcp_may_raise_cwnd(sk, flag)) { /* Advance cwnd if state allows */ tcp_cong_avoid(sk, ack, acked_sacked); @@ -3793,16 +3810,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state, - flag & FLAG_ECE); + flag |= tcp_clean_rtx_queue(sk, skb, prior_fack, prior_snd_una, + &sack_state, flag & FLAG_ECE); tcp_rack_update_reo_wnd(sk, &rs); if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); - /* If needed, reset TLP/RTO timer; RACK may later override this. */ - if (flag & FLAG_SET_XMIT_TIMER) - tcp_set_xmit_timer(sk); if (tcp_ack_is_dubious(sk, flag)) { if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) { @@ -3815,6 +3829,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) &rexmit); } + /* If needed, reset TLP/RTO timer when RACK doesn't set. */ + if (flag & FLAG_SET_XMIT_TIMER) + tcp_set_xmit_timer(sk); + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); @@ -4218,10 +4236,13 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) } /* When we get a reset we do this. 
*/ -void tcp_reset(struct sock *sk) +void tcp_reset(struct sock *sk, struct sk_buff *skb) { trace_tcp_receive_reset(sk); + if (sk_is_mptcp(sk)) + mptcp_incoming_options(sk, skb); + /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { case TCP_SYN_SENT: @@ -4378,10 +4399,9 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) * The receiver remembers and reflects via DSACKs. Leverage the * DSACK state and change the txhash to re-route speculatively. */ - if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) { - sk_rethink_txhash(sk); + if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq && + sk_rethink_txhash(sk)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); - } } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) @@ -4904,15 +4924,8 @@ err: void tcp_data_ready(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); - int avail = tp->rcv_nxt - tp->copied_seq; - - if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) && - !sock_flag(sk, SOCK_DONE) && - tcp_receive_window(tp) > inet_csk(sk)->icsk_ack.rcv_mss) - return; - - sk->sk_data_ready(sk); + if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE)) + sk->sk_data_ready(sk); } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) @@ -5604,7 +5617,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, &tp->last_oow_ack_time)) tcp_send_dupack(sk, skb); } else if (tcp_reset_check(sk, skb)) { - tcp_reset(sk); + tcp_reset(sk, skb); } goto discard; } @@ -5640,7 +5653,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, } if (rst_seq_match) - tcp_reset(sk); + tcp_reset(sk, skb); else { /* Disable TFO if RST is out-of-order * and no data has been received @@ -6077,7 +6090,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, */ if (th->rst) { - tcp_reset(sk); + tcp_reset(sk, skb); 
goto discard; } @@ -6519,7 +6532,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); - tcp_reset(sk); + tcp_reset(sk, skb); return 1; } } @@ -6800,18 +6813,13 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); - af_ops->init_req(req, sk, skb); - - if (security_inet_conn_request(sk, skb, req)) + dst = af_ops->route_req(sk, skb, &fl, req); + if (!dst) goto drop_and_free; if (tmp_opt.tstamp_ok) tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); - dst = af_ops->route_req(sk, &fl, req); - if (!dst) - goto drop_and_free; - if (!want_cookie && !isn) { /* Kill the following clause, if you dislike this way. */ if (!net->ipv4.sysctl_tcp_syncookies && diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7352c097ae48..daad4f99db32 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -980,17 +980,23 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); - tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? - tcp_rsk(req)->syn_tos : inet_sk(sk)->tos; - if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); + tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? 
+ (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | + (inet_sk(sk)->tos & INET_ECN_MASK) : + inet_sk(sk)->tos; + + if (!INET_ECN_is_capable(tos) && + tcp_bpf_ca_needs_ecn((struct sock *)req)) + tos |= INET_ECN_ECT_0; + rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, rcu_dereference(ireq->ireq_opt), - tos & ~INET_ECN_MASK); + tos); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -1439,9 +1445,15 @@ static void tcp_v4_init_req(struct request_sock *req, } static struct dst_entry *tcp_v4_route_req(const struct sock *sk, + struct sk_buff *skb, struct flowi *fl, - const struct request_sock *req) + struct request_sock *req) { + tcp_v4_init_req(req, sk, skb); + + if (security_inet_conn_request(sk, skb, req)) + return NULL; + return inet_csk_route_req(sk, &fl->u.ip4, req); } @@ -1461,7 +1473,6 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .req_md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, #endif - .init_req = tcp_v4_init_req, #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v4_init_sequence, #endif @@ -1498,6 +1509,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, bool *own_req) { struct inet_request_sock *ireq; + bool found_dup_sk = false; struct inet_sock *newinet; struct tcp_sock *newtp; struct sock *newsk; @@ -1535,7 +1547,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = prandom_u32(); - /* Set ToS of the new socket based upon the value of incoming SYN. */ + /* Set ToS of the new socket based upon the value of incoming SYN. + * ECT bits are set later in tcp_init_transfer(). 
+ */ if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; @@ -1575,12 +1589,22 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), + &found_dup_sk); if (likely(*own_req)) { tcp_move_syn(newtp, req); ireq->ireq_opt = NULL; } else { newinet->inet_opt = NULL; + + if (!req_unhash && found_dup_sk) { + /* This code path should only be executed in the + * syncookie case only + */ + bh_unlock_sock(newsk); + sock_put(newsk); + newsk = NULL; + } } return newsk; @@ -1625,6 +1649,8 @@ u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, return mss; } +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, + u32)); /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. 
* @@ -1644,7 +1670,8 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id(sk, skb); if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || - !dst->ops->check(dst, 0)) { + !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, + dst, 0)) { dst_release(dst); sk->sk_rx_dst = NULL; } @@ -1736,6 +1763,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); + u32 tail_gso_size, tail_gso_segs; struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; @@ -1743,6 +1771,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) unsigned int hdrlen; bool fragstolen; u32 gso_segs; + u32 gso_size; int delta; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), @@ -1768,13 +1797,6 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) */ th = (const struct tcphdr *)skb->data; hdrlen = th->doff * 4; - shinfo = skb_shinfo(skb); - - if (!shinfo->gso_size) - shinfo->gso_size = skb->len - hdrlen; - - if (!shinfo->gso_segs) - shinfo->gso_segs = 1; tail = sk->sk_backlog.tail; if (!tail) @@ -1797,6 +1819,15 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) goto no_coalesce; __skb_pull(skb, hdrlen); + + shinfo = skb_shinfo(skb); + gso_size = shinfo->gso_size ?: skb->len; + gso_segs = shinfo->gso_segs ?: 1; + + shinfo = skb_shinfo(tail); + tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); + tail_gso_segs = shinfo->gso_segs ?: 1; + if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; @@ -1823,11 +1854,8 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) } /* Not as strict as GRO. 
We only need to carry mss max value */ - skb_shinfo(tail)->gso_size = max(shinfo->gso_size, - skb_shinfo(tail)->gso_size); - - gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; - skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); + shinfo->gso_size = max(gso_size, tail_gso_size); + shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); sk->sk_backlog.len += delta; __NET_INC_STATS(sock_net(sk), @@ -2740,6 +2768,20 @@ void tcp4_proc_exit(void) } #endif /* CONFIG_PROC_FS */ +/* @wake is one when sk_stream_write_space() calls us. + * This sends EPOLLOUT only if notsent_bytes is half the limit. + * This mimics the strategy used in sock_def_write_space(). + */ +bool tcp_stream_memory_free(const struct sock *sk, int wake) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u32 notsent_bytes = READ_ONCE(tp->write_seq) - + READ_ONCE(tp->snd_nxt); + + return (notsent_bytes << wake) < tcp_notsent_lowat(tp); +} +EXPORT_SYMBOL(tcp_stream_memory_free); + struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, @@ -2754,6 +2796,7 @@ struct proto tcp_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 8c643a4ffad1..e6459537d4d2 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -89,6 +89,7 @@ struct lp { /** * tcp_lp_init + * @sk: socket to initialize congestion control algorithm for * * Init all required variables. * Clone the handling from Vegas module implementation. @@ -111,6 +112,7 @@ static void tcp_lp_init(struct sock *sk) /** * tcp_lp_cong_avoid + * @sk: socket to avoid congesting * * Implementation of cong_avoid. * Will only call newReno CA when away from inference. 
@@ -126,6 +128,7 @@ static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked) /** * tcp_lp_remote_hz_estimator + * @sk: socket which needs an estimate for the remote HZs * * Estimate remote HZ. * We keep on updating the estimated value, where original TCP-LP @@ -176,6 +179,7 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk) /** * tcp_lp_owd_calculator + * @sk: socket to calculate one way delay for * * Calculate one way delay (in relative format). * Original implement OWD as minus of remote time difference to local time @@ -210,6 +214,8 @@ static u32 tcp_lp_owd_calculator(struct sock *sk) /** * tcp_lp_rtt_sample + * @sk: socket to add a rtt sample to + * @rtt: round trip time, which is ignored! * * Implementation or rtt_sample. * Will take the following action, @@ -254,6 +260,7 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt) /** * tcp_lp_pkts_acked + * @sk: socket requiring congestion avoidance calculations * * Implementation of pkts_acked. * Deal with active drop under Early Congestion Indication. diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 6b27c481fe18..0588b004ddac 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -887,7 +887,7 @@ static void tcp_metrics_flush_all(struct net *net) pp = &hb->chain; for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { match = net ? 
net_eq(tm_net(tm), net) : - !refcount_read(&tm_net(tm)->count); + !refcount_read(&tm_net(tm)->ns.count); if (match) { *pp = tm->tcpm_next; kfree_rcu(tm, rcu_head); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 495dda2449fe..0055ae0a3bf8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -801,7 +801,7 @@ embryonic_reset: req->rsk_ops->send_reset(sk, skb); } else if (fastopen) { /* received a valid RST pkt */ reqsk_fastopen_remove(sk, req, true); - tcp_reset(sk); + tcp_reset(sk, skb); } if (!fastopen) { inet_csk_reqsk_queue_drop(sk, req); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bf48cd73e967..fbf140a770d8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -445,11 +445,12 @@ struct tcp_out_options { struct mptcp_out_options mptcp; }; -static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) +static void mptcp_options_write(__be32 *ptr, const struct tcp_sock *tp, + struct tcp_out_options *opts) { #if IS_ENABLED(CONFIG_MPTCP) if (unlikely(OPTION_MPTCP & opts->options)) - mptcp_write_options(ptr, &opts->mptcp); + mptcp_write_options(ptr, tp, &opts->mptcp); #endif } @@ -701,7 +702,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, smc_options_write(ptr, &options); - mptcp_options_write(ptr, opts); + mptcp_options_write(ptr, tp, opts); } static void smc_set_option(const struct tcp_sock *tp, @@ -1038,9 +1039,9 @@ static void tcp_tsq_handler(struct sock *sk) * transferring tsq->head because tcp_wfree() might * interrupt us (non NAPI drivers) */ -static void tcp_tasklet_func(unsigned long data) +static void tcp_tasklet_func(struct tasklet_struct *t) { - struct tsq_tasklet *tsq = (struct tsq_tasklet *)data; + struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet); LIST_HEAD(list); unsigned long flags; struct list_head *q, *n; @@ -1125,9 +1126,7 @@ void __init tcp_tasklet_init(void) struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); 
INIT_LIST_HEAD(&tsq->head); - tasklet_init(&tsq->tasklet, - tcp_tasklet_func, - (unsigned long)tsq); + tasklet_setup(&tsq->tasklet, tcp_tasklet_func); } } @@ -1320,7 +1319,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, skb_orphan(skb); skb->sk = sk; skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree; - skb_set_hash_from_sk(skb, sk); refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm); @@ -1348,7 +1346,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } } - tcp_options_write((__be32 *)(th + 1), tp, &opts); skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) { th->window = htons(tcp_select_window(sk)); @@ -1359,6 +1356,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, */ th->window = htons(min(tp->rcv_wnd, 65535U)); } + + tcp_options_write((__be32 *)(th + 1), tp, &opts); + #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ if (md5) { @@ -1389,6 +1389,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tcp_skb_pcount(skb)); tp->segs_out += tcp_skb_pcount(skb); + skb_set_hash_from_sk(skb, sk); /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); @@ -1569,6 +1570,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (!buff) return -ENOMEM; /* We'll just try again later. */ skb_copy_decrypted(buff, skb); + mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); @@ -1880,7 +1882,8 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) * window, and remember whether we were cwnd-limited then. 
*/ if (!before(tp->snd_una, tp->max_packets_seq) || - tp->packets_out > tp->max_packets_out) { + tp->packets_out > tp->max_packets_out || + is_cwnd_limited) { tp->max_packets_out = tp->packets_out; tp->max_packets_seq = tp->snd_nxt; tp->is_cwnd_limited = is_cwnd_limited; @@ -2123,6 +2126,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, if (unlikely(!buff)) return -ENOMEM; skb_copy_decrypted(buff, skb); + mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); @@ -2393,6 +2397,7 @@ static int tcp_mtu_probe(struct sock *sk) skb = tcp_send_head(sk); skb_copy_decrypted(nskb, skb); + mptcp_skb_ext_copy(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; @@ -2702,6 +2707,10 @@ repair: else tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED); + is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); + if (likely(sent_pkts || is_cwnd_limited)) + tcp_cwnd_validate(sk, is_cwnd_limited); + if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; @@ -2709,8 +2718,6 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk, false); - is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); - tcp_cwnd_validate(sk, is_cwnd_limited); return false; } return !tp->packets_out && !tcp_write_queue_empty(sk); @@ -4077,6 +4084,7 @@ void tcp_send_probe0(struct sock *sk) /* Cancel probe timer, if it is not required. 
*/ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; + icsk->icsk_probes_tstamp = 0; return; } @@ -4091,6 +4099,8 @@ void tcp_send_probe0(struct sock *sk) */ timeout = TCP_RESOURCE_PROBE_INTERVAL; } + + timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout); tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX); } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index f65a3ddd0d58..6f1b4ac7fe99 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -96,13 +96,13 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) } } -void tcp_rack_mark_lost(struct sock *sk) +bool tcp_rack_mark_lost(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 timeout; if (!tp->rack.advanced) - return; + return false; /* Reset the advanced flag to avoid unnecessary queue scanning */ tp->rack.advanced = 0; @@ -112,6 +112,7 @@ void tcp_rack_mark_lost(struct sock *sk) inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, timeout, inet_csk(sk)->icsk_rto); } + return !!timeout; } /* Record the most recently (re)sent time among the (s)acked packets @@ -153,6 +154,7 @@ void tcp_rack_reo_timeout(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 timeout, prior_inflight; + u32 lost = tp->lost; prior_inflight = tcp_packets_in_flight(tp); tcp_rack_detect_loss(sk, &timeout); @@ -160,7 +162,7 @@ void tcp_rack_reo_timeout(struct sock *sk) if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { tcp_enter_recovery(sk, false); if (!inet_csk(sk)->icsk_ca_ops->cong_control) - tcp_cwnd_reduction(sk, 1, 0); + tcp_cwnd_reduction(sk, 1, tp->lost - lost, 0); } tcp_xmit_retransmit_queue(sk); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6c62b9ea1320..4ef08079ccfa 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -40,6 +40,24 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining)); } +u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, 
u32 when) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + u32 remaining; + s32 elapsed; + + if (!icsk->icsk_user_timeout || !icsk->icsk_probes_tstamp) + return when; + + elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp; + if (unlikely(elapsed < 0)) + elapsed = 0; + remaining = msecs_to_jiffies(icsk->icsk_user_timeout) - elapsed; + remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN); + + return min_t(u32, remaining, when); +} + /** * tcp_write_err() - close socket and save error info * @sk: The socket the error has appeared on. @@ -219,14 +237,8 @@ static int tcp_write_timeout(struct sock *sk) int retry_until; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { - if (icsk->icsk_retransmits) { - dst_negative_advice(sk); - } else { - sk_rethink_txhash(sk); - tp->timeout_rehash++; - __NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPTIMEOUTREHASH); - } + if (icsk->icsk_retransmits) + __dst_negative_advice(sk); retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; expired = icsk->icsk_retransmits >= retry_until; } else { @@ -234,12 +246,7 @@ static int tcp_write_timeout(struct sock *sk) /* Black hole detection */ tcp_mtu_probing(icsk, sk); - dst_negative_advice(sk); - } else { - sk_rethink_txhash(sk); - tp->timeout_rehash++; - __NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPTIMEOUTREHASH); + __dst_negative_advice(sk); } retry_until = net->ipv4.sysctl_tcp_retries2; @@ -270,6 +277,11 @@ static int tcp_write_timeout(struct sock *sk) return 1; } + if (sk_rethink_txhash(sk)) { + tp->timeout_rehash++; + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH); + } + return 0; } @@ -349,6 +361,7 @@ static void tcp_probe_timer(struct sock *sk) if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; + icsk->icsk_probes_tstamp = 0; return; } @@ -360,13 +373,12 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). 
*/ - if (icsk->icsk_user_timeout) { - u32 elapsed = tcp_model_timeout(sk, icsk->icsk_probes_out, - tcp_probe0_base(sk)); - - if (elapsed >= icsk->icsk_user_timeout) - goto abort; - } + if (!icsk->icsk_probes_tstamp) + icsk->icsk_probes_tstamp = tcp_jiffies32; + else if (icsk->icsk_user_timeout && + (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >= + msecs_to_jiffies(icsk->icsk_user_timeout)) + goto abort; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 09f0a23d1a01..4a0478b17243 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -541,7 +541,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, inet_sdif(skb), udptable, skb); } -struct sock *udp4_lib_lookup_skb(struct sk_buff *skb, +struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport) { const struct iphdr *iph = ip_hdr(skb); @@ -550,7 +550,6 @@ struct sock *udp4_lib_lookup_skb(struct sk_buff *skb, iph->daddr, dport, inet_iif(skb), inet_sdif(skb), &udp_table, NULL); } -EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb); /* Must be called under rcu_read_lock(). * Does increment socket refcount. @@ -597,6 +596,12 @@ void udp_encap_enable(void) } EXPORT_SYMBOL(udp_encap_enable); +void udp_encap_disable(void) +{ + static_branch_dec(&udp_encap_needed_key); +} +EXPORT_SYMBOL(udp_encap_disable); + /* Handler for tunnels with arbitrary destination ports: no socket lookup, go * through error handlers in encapsulations looking for a match. 
*/ @@ -702,7 +707,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex, inet_sdif(skb), udptable, NULL); - if (!sk) { + if (!sk || udp_sk(sk)->encap_type) { /* No socket for error: try tunnels before discarding */ sk = ERR_PTR(-ENOENT); if (static_branch_unlikely(&udp_encap_needed_key)) { @@ -874,7 +879,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct udphdr *uh; - int err = 0; + int err; int is_udplite = IS_UDPLITE(sk); int offset = skb_transport_offset(skb); int len = skb->len - offset; @@ -1125,7 +1130,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - if (cgroup_bpf_enabled && !connected) { + if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &ipc.addr); if (err) @@ -1197,7 +1202,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) faddr, saddr, dport, inet->inet_sport, sk->sk_uid); - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -1859,9 +1864,8 @@ try_again: memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); - if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, - (struct sockaddr *)sin); + BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, + (struct sockaddr *)sin); } if (udp_sk(sk)->gro_enabled) @@ -2038,6 +2042,9 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (rc == -ENOMEM) UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); + else + UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS, + is_udplite); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); kfree_skb(skb); 
trace_udp_fail_queue_rcv_skb(rc, sk); @@ -2173,7 +2180,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) __skb_pull(skb, skb_transport_offset(skb)); ret = udp_queue_rcv_one_skb(sk, skb); if (ret > 0) - ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret); + ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret); } return 0; } @@ -2553,7 +2560,8 @@ int udp_v4_early_demux(struct sk_buff *skb) */ if (!inet_sk(sk)->inet_daddr && in_dev) return ip_mc_validate_source(skb, iph->daddr, - iph->saddr, iph->tos, + iph->saddr, + iph->tos & IPTOS_RT_MASK, skb->dev, in_dev, &itag); } return 0; diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 1dbece34496e..b2cee9a307d4 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -30,7 +30,7 @@ static int udp_dump_one(struct udp_table *tbl, const struct inet_diag_req_v2 *req) { struct sk_buff *in_skb = cb->skb; - int err = -EINVAL; + int err; struct sock *sk = NULL; struct sk_buff *rep; struct net *net = sock_net(in_skb->sk); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index e67a66fbf27b..b76c48efd37e 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -49,6 +49,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, __skb_pull(skb, tnl_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); + skb_set_transport_header(skb, skb_inner_transport_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); skb->protocol = new_protocol; @@ -67,6 +68,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)))); features &= skb->dev->hw_enc_features; + if (need_csum) + features &= ~NETIF_F_SCTP_CRC; /* The only checksum offload we care about from here on out is the * outer one so strip the existing checksum feature flags and @@ -184,8 +187,67 @@ out_unlock: } EXPORT_SYMBOL(skb_udp_tunnel_segment); +static void __udpv4_gso_segment_csum(struct sk_buff *seg, + __be32 
*oldip, __be32 *newip, + __be16 *oldport, __be16 *newport) +{ + struct udphdr *uh; + struct iphdr *iph; + + if (*oldip == *newip && *oldport == *newport) + return; + + uh = udp_hdr(seg); + iph = ip_hdr(seg); + + if (uh->check) { + inet_proto_csum_replace4(&uh->check, seg, *oldip, *newip, + true); + inet_proto_csum_replace2(&uh->check, seg, *oldport, *newport, + false); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + *oldport = *newport; + + csum_replace4(&iph->check, *oldip, *newip); + *oldip = *newip; +} + +static struct sk_buff *__udpv4_gso_segment_list_csum(struct sk_buff *segs) +{ + struct sk_buff *seg; + struct udphdr *uh, *uh2; + struct iphdr *iph, *iph2; + + seg = segs; + uh = udp_hdr(seg); + iph = ip_hdr(seg); + + if ((udp_hdr(seg)->dest == udp_hdr(seg->next)->dest) && + (udp_hdr(seg)->source == udp_hdr(seg->next)->source) && + (ip_hdr(seg)->daddr == ip_hdr(seg->next)->daddr) && + (ip_hdr(seg)->saddr == ip_hdr(seg->next)->saddr)) + return segs; + + while ((seg = seg->next)) { + uh2 = udp_hdr(seg); + iph2 = ip_hdr(seg); + + __udpv4_gso_segment_csum(seg, + &iph2->saddr, &iph->saddr, + &uh2->source, &uh->source); + __udpv4_gso_segment_csum(seg, + &iph2->daddr, &iph->daddr, + &uh2->dest, &uh->dest); + } + + return segs; +} + static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb, - netdev_features_t features) + netdev_features_t features, + bool is_ipv6) { unsigned int mss = skb_shinfo(skb)->gso_size; @@ -195,11 +257,11 @@ static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb, udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss); - return skb; + return is_ipv6 ? 
skb : __udpv4_gso_segment_list_csum(skb); } struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, - netdev_features_t features) + netdev_features_t features, bool is_ipv6) { struct sock *sk = gso_skb->sk; unsigned int sum_truesize = 0; @@ -211,7 +273,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, __be16 newlen; if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) - return __udp_gso_segment_list(gso_skb, features); + return __udp_gso_segment_list(gso_skb, features, is_ipv6); mss = skb_shinfo(gso_skb)->gso_size; if (gso_skb->len <= sizeof(*uh) + mss) @@ -325,7 +387,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, goto out; if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) - return __udp_gso_segment(skb, features); + return __udp_gso_segment(skb, features, false); mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) @@ -366,7 +428,7 @@ out: static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct sk_buff *skb) { - struct udphdr *uh = udp_hdr(skb); + struct udphdr *uh = udp_gro_udphdr(skb); struct sk_buff *pp = NULL; struct udphdr *uh2; struct sk_buff *p; @@ -457,7 +519,8 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, if (skb->dev->features & NETIF_F_GRO_FRAGLIST) NAPI_GRO_CB(skb)->is_flist = sk ? 
!udp_sk(sk)->gro_enabled: 1; - if ((sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) { + if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) || + (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) { pp = call_gro_receive(udp_gro_receive_segment, head, skb); return pp; } @@ -500,12 +563,22 @@ out: } EXPORT_SYMBOL(udp_gro_receive); +static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, + __be16 dport) +{ + const struct iphdr *iph = skb_gro_network_header(skb); + + return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + inet_sdif(skb), &udp_table, NULL); +} + INDIRECT_CALLABLE_SCOPE struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); + struct sock *sk = NULL; struct sk_buff *pp; - struct sock *sk; if (unlikely(!uh)) goto flush; @@ -523,7 +596,10 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) skip: NAPI_GRO_CB(skb)->is_ipv6 = 0; rcu_read_lock(); - sk = static_branch_unlikely(&udp_encap_needed_key) ? 
udp4_lib_lookup_skb(skb, uh->source, uh->dest) : NULL; + + if (static_branch_unlikely(&udp_encap_needed_key)) + sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest); + pp = udp_gro_receive(head, skb, uh, sk); rcu_read_unlock(); return pp; @@ -551,8 +627,8 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, { __be16 newlen = htons(skb->len - nhoff); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); - int err = -ENOSYS; struct sock *sk; + int err; uh->len = newlen; diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 3eecba0874aa..b97e3635acf5 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -90,15 +90,11 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, struct sock *sk = sock->sk; struct udp_tunnel_info ti; - if (!dev->netdev_ops->ndo_udp_tunnel_add || - !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - return; - ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; - dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); + udp_tunnel_nic_add_port(dev, &ti); } EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); @@ -108,15 +104,11 @@ void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, struct sock *sk = sock->sk; struct udp_tunnel_info ti; - if (!dev->netdev_ops->ndo_udp_tunnel_del || - !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - return; - ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; - dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); + udp_tunnel_nic_del_port(dev, &ti); } EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port); @@ -134,11 +126,7 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) rcu_read_lock(); for_each_netdev_rcu(net, dev) { - if (!dev->netdev_ops->ndo_udp_tunnel_add) - continue; - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - continue; - dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); + udp_tunnel_nic_add_port(dev, &ti); } rcu_read_unlock(); } @@ -158,11 +146,7 @@ void 
udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) rcu_read_lock(); for_each_netdev_rcu(net, dev) { - if (!dev->netdev_ops->ndo_udp_tunnel_del) - continue; - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - continue; - dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); + udp_tunnel_nic_del_port(dev, &ti); } rcu_read_unlock(); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 01146b66d666..f2337fb756ac 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -205,6 +205,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, + .ra_defrtr_metric = IP6_RT_PRIO_USER, .accept_ra_from_local = 0, .accept_ra_min_hop_limit= 1, .accept_ra_pinfo = 1, @@ -260,6 +261,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, + .ra_defrtr_metric = IP6_RT_PRIO_USER, .accept_ra_from_local = 0, .accept_ra_min_hop_limit= 1, .accept_ra_pinfo = 1, @@ -1997,6 +1999,7 @@ EXPORT_SYMBOL(ipv6_chk_prefix); * ipv6_dev_find - find the first device with a given source address. * @net: the net namespace * @addr: the source address + * @dev: used to find the L3 domain of interest * * The caller should be protected by RCU, or RTNL. 
*/ @@ -2466,8 +2469,9 @@ static void addrconf_add_mroute(struct net_device *dev) .fc_ifindex = dev->ifindex, .fc_dst_len = 8, .fc_flags = RTF_UP, - .fc_type = RTN_UNICAST, + .fc_type = RTN_MULTICAST, .fc_nlinfo.nl_net = dev_net(dev), + .fc_protocol = RTPROT_KERNEL, }; ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); @@ -5022,8 +5026,10 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, return -EMSGSIZE; if (args->netnsid >= 0 && - nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) + nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) { + nlmsg_cancel(skb, nlh); return -EMSGSIZE; + } put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 || @@ -5054,8 +5060,10 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, return -EMSGSIZE; if (args->netnsid >= 0 && - nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) + nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) { + nlmsg_cancel(skb, nlh); return -EMSGSIZE; + } put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0 || @@ -5470,6 +5478,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor; array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr; + array[DEVCONF_RA_DEFRTR_METRIC] = cnf->ra_defrtr_metric; array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = cnf->accept_ra_min_hop_limit; array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; #ifdef CONFIG_IPV6_ROUTER_PREF @@ -6663,6 +6672,14 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, { + .procname = "ra_defrtr_metric", + .data = &ipv6_devconf.ra_defrtr_metric, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = (void *)SYSCTL_ONE, + }, + { .procname = "accept_ra_min_hop_limit", 
.data = &ipv6_devconf.accept_ra_min_hop_limit, .maxlen = sizeof(int), diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 642fc6ac13d2..8a22486cf270 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -306,7 +306,9 @@ static int ip6addrlbl_del(struct net *net, /* add default label */ static int __net_init ip6addrlbl_net_init(struct net *net) { - int err = 0; + struct ip6addrlbl_entry *p = NULL; + struct hlist_node *n; + int err; int i; ADDRLABEL(KERN_DEBUG "%s\n", __func__); @@ -315,14 +317,20 @@ static int __net_init ip6addrlbl_net_init(struct net *net) INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head); for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) { - int ret = ip6addrlbl_add(net, - ip6addrlbl_init_table[i].prefix, - ip6addrlbl_init_table[i].prefixlen, - 0, - ip6addrlbl_init_table[i].label, 0); - /* XXX: should we free all rules when we catch an error? */ - if (ret && (!err || err != -ENOMEM)) - err = ret; + err = ip6addrlbl_add(net, + ip6addrlbl_init_table[i].prefix, + ip6addrlbl_init_table[i].prefixlen, + 0, + ip6addrlbl_init_table[i].label, 0); + if (err) + goto err_ip6addrlbl_add; + } + return 0; + +err_ip6addrlbl_add: + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { + hlist_del_rcu(&p->list); + kfree_rcu(p, rcu); } return err; } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e648fbebb167..1fb75f01756c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -295,7 +295,8 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, return -EINVAL; snum = ntohs(addr->sin6_port); - if (snum && inet_port_requires_bind_service(net, snum) && + if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && + snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; @@ -439,6 +440,7 @@ out_unlock: int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; + u32 flags = 
BIND_WITH_LOCK; int err = 0; /* If the socket has its own bind function then use it. */ @@ -451,11 +453,12 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ - err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, + BPF_CGROUP_INET6_BIND, &flags); if (err) return err; - return __inet6_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); + return __inet6_bind(sk, uaddr, addr_len, flags); } EXPORT_SYMBOL(inet6_bind); @@ -527,18 +530,19 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = sk->sk_v6_daddr; if (np->sndflow) sin->sin6_flowinfo = np->flow_label; + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + BPF_CGROUP_INET6_GETPEERNAME, + NULL); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; - } - if (cgroup_bpf_enabled) BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - peer ? 
BPF_CGROUP_INET6_GETPEERNAME : - BPF_CGROUP_INET6_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, NULL); + } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); return sizeof(*sin); @@ -819,7 +823,7 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; fl6.flowi6_uid = sk->sk_uid; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); rcu_read_lock(); final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), @@ -954,6 +958,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.max_hbh_opts_cnt = IP6_DEFAULT_MAX_HBH_OPTS_CNT; net->ipv6.sysctl.max_dst_opts_len = IP6_DEFAULT_MAX_DST_OPTS_LEN; net->ipv6.sysctl.max_hbh_opts_len = IP6_DEFAULT_MAX_HBH_OPTS_LEN; + net->ipv6.sysctl.fib_notify_on_flag_change = 0; atomic_set(&net->ipv6.fib6_sernum, 1); err = ipv6_init_mibs(net); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index d88d97617f7e..440080da805b 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -588,7 +588,8 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); memset(ah->auth_data, 0, ahp->icv_trunc_len); - if (ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN)) + err = ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN); + if (err) goto out_free; ip6h->priority = 0; diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 78f766019b7e..51184a70ac7e 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -423,7 +423,7 @@ static void calipso_doi_free_rcu(struct rcu_head *entry) /** * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine * @doi: the DOI value - * @audit_secid: the LSM secid to use in the audit message + * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CALIPSO engine. 
The NetLabel routines will @@ -1226,7 +1226,7 @@ static int calipso_req_setattr(struct request_sock *req, /** * calipso_req_delattr - Delete the CALIPSO option from a request socket - * @reg: the request socket + * @req: the request socket * * Description: * Removes the CALIPSO option from a request socket, if present. diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index cc8ad7ddecda..206f66310a88 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -60,7 +60,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) fl6->flowi6_oif = np->mcast_oif; - security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); } int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 52c2f063529f..153ad103ba74 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -478,7 +478,6 @@ static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { u8 *tail; - u8 *vaddr; int nfrags; int esph_offset; struct page *page; @@ -519,14 +518,10 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info page = pfrag->page; get_page(page); - vaddr = kmap_atomic(page); - - tail = vaddr + pfrag->offset; + tail = page_address(page) + pfrag->offset; esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); - kunmap_atomic(vaddr); - nfrags = skb_shinfo(skb)->nr_frags; __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, @@ -793,7 +788,7 @@ int esp6_input_done2(struct sk_buff *skb, int err) int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); int hdr_len = skb_network_header_len(skb); - if (!xo || (xo && !(xo->flags & CRYPTO_DONE))) + if (!xo || !(xo->flags & CRYPTO_DONE)) kfree(ESP_SKB_CB(skb)->tmp); if (unlikely(err)) diff --git a/net/ipv6/exthdrs.c 
b/net/ipv6/exthdrs.c index 374105e4394f..6126f8bf94b3 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -906,11 +906,6 @@ void ipv6_exthdrs_exit(void) /* * Note: we cannot rely on skb_dst(skb) before we assign it in ip6_route_input(). */ -static inline struct inet6_dev *ipv6_skb_idev(struct sk_buff *skb) -{ - return skb_dst(skb) ? ip6_dst_idev(skb_dst(skb)) : __in6_dev_get(skb->dev); -} - static inline struct net *ipv6_skb_net(struct sk_buff *skb) { return skb_dst(skb) ? dev_net(skb_dst(skb)->dev) : dev_net(skb->dev); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 8956144ea65e..fd1f896115c1 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -331,10 +331,9 @@ static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, st } #if IS_ENABLED(CONFIG_IPV6_MIP6) -static void mip6_addr_swap(struct sk_buff *skb) +static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) { struct ipv6hdr *iph = ipv6_hdr(skb); - struct inet6_skb_parm *opt = IP6CB(skb); struct ipv6_destopt_hao *hao; struct in6_addr tmp; int off; @@ -351,7 +350,7 @@ static void mip6_addr_swap(struct sk_buff *skb) } } #else -static inline void mip6_addr_swap(struct sk_buff *skb) {} +static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {} #endif static struct dst_entry *icmpv6_route_lookup(struct net *net, @@ -446,7 +445,8 @@ static int icmp6_iif(const struct sk_buff *skb) * Send an ICMP message in response to a packet in error */ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, - const struct in6_addr *force_saddr) + const struct in6_addr *force_saddr, + const struct inet6_skb_parm *parm) { struct inet6_dev *idev = NULL; struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -542,7 +542,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type)) goto out_bh_enable; - mip6_addr_swap(skb); + mip6_addr_swap(skb, parm); sk = 
icmpv6_xmit_lock(net); if (!sk) @@ -559,7 +559,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, /* select a more meaningful saddr from input if */ struct net_device *in_netdev; - in_netdev = dev_get_by_index(net, IP6CB(skb)->iif); + in_netdev = dev_get_by_index(net, parm->iif); if (in_netdev) { ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr, inet6_sk(sk)->srcprefs, @@ -573,7 +573,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.fl6_icmp_code = code; fl6.flowi6_uid = sock_net_uid(net, NULL); fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); - security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); np = inet6_sk(sk); @@ -640,7 +640,7 @@ EXPORT_SYMBOL(icmp6_send); */ void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) { - icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL); + icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb)); kfree_skb(skb); } @@ -697,10 +697,10 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, } if (type == ICMP_TIME_EXCEEDED) icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, - info, &temp_saddr); + info, &temp_saddr, IP6CB(skb2)); else icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, - info, &temp_saddr); + info, &temp_saddr, IP6CB(skb2)); if (rt) ip6_rt_put(rt); @@ -755,7 +755,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; fl6.flowi6_mark = mark; fl6.flowi6_uid = sock_net_uid(net, NULL); - security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); local_bh_disable(); sk = icmpv6_xmit_lock(net); @@ -1008,7 +1008,7 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, fl6->fl6_icmp_type = type; fl6->fl6_icmp_code = 0; fl6->flowi6_oif = oif; - security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); + security_sk_classify_flow(sk, 
flowi6_to_flowi_common(fl6)); } static void __net_exit icmpv6_sk_exit(struct net *net) diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index e315526fa244..5a9f4d722f35 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -46,7 +46,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->fl6_dport = ireq->ir_rmt_port; fl6->fl6_sport = htons(ireq->ir_num); fl6->flowi6_uid = sk->sk_uid; - security_req_classify_flow(req, flowi6_to_flowi(fl6)); + security_req_classify_flow(req, flowi6_to_flowi_common(fl6)); dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(dst)) @@ -95,7 +95,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->fl6_sport = inet->inet_sport; fl6->fl6_dport = inet->inet_dport; fl6->flowi6_uid = sk->sk_uid; - security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); rcu_read_lock(); final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 605cdd38a919..ef9d022e693f 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -499,7 +499,7 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb, hlist_for_each_entry_rcu(tb, head, tb6_hlist) { err = fib6_table_dump(net, tb, w); - if (err < 0) + if (err) goto out; } } @@ -507,7 +507,8 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb, out: kfree(w); - return err; + /* The tree traversal function should never return a positive value. */ + return err > 0 ? 
-EINVAL : err; } static int fib6_dump_node(struct fib6_walker *w) @@ -1025,6 +1026,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, { struct fib6_table *table = rt->fib6_table; + /* Flush all cached dst in exception table */ + rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt, table); if (rt->nh && !list_empty(&rt->nh_list)) @@ -1927,9 +1930,6 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; - /* Flush all cached dst in exception table */ - rt6_flush_exceptions(rt); - /* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 931b186d2e48..c3bc89b6b1a1 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1133,8 +1133,13 @@ static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu, return; if (rt->dst.dev) { - dev->needed_headroom = rt->dst.dev->hard_header_len + - t_hlen; + unsigned short dst_len = rt->dst.dev->hard_header_len + + t_hlen; + + if (t->dev->header_ops) + dev->hard_header_len = dst_len; + else + dev->needed_headroom = dst_len; if (set_mtu) { dev->mtu = rt->dst.dev->mtu - t_hlen; @@ -1159,7 +1164,12 @@ static int ip6gre_calc_hlen(struct ip6_tnl *tunnel) tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); - tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen; + + if (tunnel->dev->header_ops) + tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen; + else + tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen; + return t_hlen; } @@ -1391,7 +1401,7 @@ static const struct net_device_ops ip6gre_netdev_ops = { .ndo_start_xmit = ip6gre_tunnel_xmit, .ndo_do_ioctl = ip6gre_tunnel_ioctl, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ 
-1828,7 +1838,7 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1896,7 +1906,7 @@ static const struct net_device_ops ip6erspan_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c index 70c8c2f36c98..9e3574880cb0 100644 --- a/net/ipv6/ip6_icmp.c +++ b/net/ipv6/ip6_icmp.c @@ -33,23 +33,25 @@ int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) } EXPORT_SYMBOL(inet6_unregister_icmp_sender); -void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) +void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + const struct inet6_skb_parm *parm) { ip6_icmp_send_t *send; rcu_read_lock(); send = rcu_dereference(ip6_icmp_send); if (send) - send(skb, type, code, info, NULL); + send(skb, type, code, info, NULL, parm); rcu_read_unlock(); } -EXPORT_SYMBOL(icmpv6_send); +EXPORT_SYMBOL(__icmpv6_send); #endif #if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_conntrack.h> void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) { + struct inet6_skb_parm parm = { 0 }; struct sk_buff *cloned_skb = NULL; enum ip_conntrack_info ctinfo; struct in6_addr orig_ip; @@ -57,7 +59,7 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(ct->status & IPS_SRC_NAT)) { - icmpv6_send(skb_in, type, code, info); + __icmpv6_send(skb_in, type, code, info, &parm); return; } @@ -72,7 +74,7 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) orig_ip = 
ipv6_hdr(skb_in)->saddr; ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6; - icmpv6_send(skb_in, type, code, info); + __icmpv6_send(skb_in, type, code, info, &parm); ipv6_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index e96304d8a4a7..e9d2a4a409aa 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -32,6 +32,7 @@ #include <net/sock.h> #include <net/snmp.h> +#include <net/udp.h> #include <net/ipv6.h> #include <net/protocol.h> @@ -44,7 +45,6 @@ #include <net/inet_ecn.h> #include <net/dst_metadata.h> -INDIRECT_CALLABLE_DECLARE(void udp_v6_early_demux(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *)); static void ip6_rcv_finish_core(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -352,7 +352,6 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, ip6_sublist_rcv(&sublist, curr_dev, curr_net); } -INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *)); /* diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index a80f90bf3ae7..1b9827ff8ccf 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -15,6 +15,7 @@ #include <net/inet_common.h> #include <net/tcp.h> #include <net/udp.h> +#include <net/gro.h> #include "ip6_offload.h" diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 749ad72386b2..ff4f9ebcf7f6 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -125,8 +125,43 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * return -EINVAL; } +static int +ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, + struct sk_buff *skb, unsigned int mtu) +{ + struct sk_buff *segs, *nskb; + netdev_features_t features; + int ret = 0; + + /* Please see corresponding comment in ip_finish_output_gso + * describing the cases where GSO segment length exceeds the + * egress 
MTU. + */ + features = netif_skb_features(skb); + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR_OR_NULL(segs)) { + kfree_skb(skb); + return -ENOMEM; + } + + consume_skb(skb); + + skb_list_walk_safe(segs, segs, nskb) { + int err; + + skb_mark_not_on_list(segs); + err = ip6_fragment(net, sk, segs, ip6_finish_output2); + if (err && ret == 0) + ret = err; + } + + return ret; +} + static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { + unsigned int mtu; + #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { @@ -135,7 +170,11 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff } #endif - if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || + mtu = ip6_skb_dst_mtu(skb); + if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu)) + return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); + + if ((skb->len > mtu && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)) || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) return ip6_fragment(net, sk, skb, ip6_finish_output2); @@ -178,6 +217,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) ip6_finish_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } +EXPORT_SYMBOL(ip6_output); bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) { @@ -1471,7 +1511,7 @@ emsgsize: csummode = CHECKSUM_PARTIAL; if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ @@ -1715,8 +1755,7 @@ alloc_new_skb: error_efault: err = -EFAULT; error: - if (uarg) - sock_zerocopy_put_abort(uarg, extra_uref); + net_zcopy_put_abort(uarg, extra_uref); cork->length -= length; IP6_INC_STATS(sock_net(sk), 
rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 648db3fe508f..a7950baa05e5 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -94,36 +94,6 @@ static inline int ip6_tnl_mpls_supported(void) return IS_ENABLED(CONFIG_MPLS); } -static struct net_device_stats *ip6_get_stats(struct net_device *dev) -{ - struct pcpu_sw_netstats tmp, sum = { 0 }; - int i; - - for_each_possible_cpu(i) { - unsigned int start; - const struct pcpu_sw_netstats *tstats = - per_cpu_ptr(dev->tstats, i); - - do { - start = u64_stats_fetch_begin_irq(&tstats->syncp); - tmp.rx_packets = tstats->rx_packets; - tmp.rx_bytes = tstats->rx_bytes; - tmp.tx_packets = tstats->tx_packets; - tmp.tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); - - sum.rx_packets += tmp.rx_packets; - sum.rx_bytes += tmp.rx_bytes; - sum.tx_packets += tmp.tx_packets; - sum.tx_bytes += tmp.tx_bytes; - } - dev->stats.rx_packets = sum.rx_packets; - dev->stats.rx_bytes = sum.rx_bytes; - dev->stats.tx_packets = sum.tx_packets; - dev->stats.tx_bytes = sum.tx_bytes; - return &dev->stats; -} - #define for_each_ip6_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) @@ -204,6 +174,7 @@ ip6_tnl_lookup(struct net *net, int link, /** * ip6_tnl_bucket - get head of list matching given tunnel parameters + * @ip6n: the private data for ip6_vti in the netns * @p: parameters containing tunnel end-points * * Description: @@ -230,6 +201,7 @@ ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p) /** * ip6_tnl_link - add tunnel to hash table + * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be added **/ @@ -246,6 +218,7 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) /** * ip6_tnl_unlink - remove tunnel from hash table + * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be removed **/ @@ 
-417,6 +390,7 @@ ip6_tnl_dev_uninit(struct net_device *dev) /** * parse_tvl_tnl_enc_lim - handle encapsulation limit option * @skb: received socket buffer + * @raw: the ICMPv6 error message data * * Return: * 0 if none was found, @@ -485,14 +459,9 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) } EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim); -/** - * ip6_tnl_err - tunnel error handler - * - * Description: - * ip6_tnl_err() should handle errors in the tunnel according - * to the specifications in RFC 2473. - **/ - +/* ip6_tnl_err() should handle errors in the tunnel according to the + * specifications in RFC 2473. + */ static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, u8 *type, u8 *code, int *msg, __u32 *info, int offset) @@ -1835,7 +1804,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_start_xmit = ip6_tnl_start_xmit, .ndo_do_ioctl = ip6_tnl_ioctl, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats = ip6_get_stats, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 5f9c4fdc120d..0225fd694192 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -125,6 +125,7 @@ vti6_tnl_lookup(struct net *net, const struct in6_addr *remote, /** * vti6_tnl_bucket - get head of list matching given tunnel parameters + * @ip6n: the private data for ip6_vti in the netns * @p: parameters containing tunnel end-points * * Description: @@ -889,7 +890,7 @@ static const struct net_device_ops vti6_netdev_ops = { .ndo_uninit = vti6_dev_uninit, .ndo_start_xmit = vti6_tnl_xmit, .ndo_do_ioctl = vti6_ioctl, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 43a894bf9a1b..a6804a7e34c1 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1148,7 +1148,7 @@ static int 
do_ipv6_getsockopt(struct sock *sk, int level, int optname, if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; - msg.msg_control = optval; + msg.msg_control_user = optval; msg.msg_controllen = len; msg.msg_flags = flags; msg.msg_control_is_user = true; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 8cd2782a31e4..6c8604390266 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -548,7 +548,7 @@ done: } int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, - struct sockaddr_storage *p) + struct sockaddr_storage __user *p) { int err, i, count, copycount; const struct in6_addr *group; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 27f29b957ee7..c467c6419893 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -81,6 +81,7 @@ static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); static int pndisc_constructor(struct pneigh_entry *n); static void pndisc_destructor(struct pneigh_entry *n); static void pndisc_redo(struct sk_buff *skb); +static int ndisc_is_multicast(const void *pkey); static const struct neigh_ops ndisc_generic_ops = { .family = AF_INET6, @@ -115,6 +116,7 @@ struct neigh_table nd_tbl = { .pconstructor = pndisc_constructor, .pdestructor = pndisc_destructor, .proxy_redo = pndisc_redo, + .is_multicast = ndisc_is_multicast, .allow_add = ndisc_allow_add, .id = "ndisc_cache", .parms = { @@ -1171,6 +1173,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) struct neighbour *neigh = NULL; struct inet6_dev *in6_dev; struct fib6_info *rt = NULL; + u32 defrtr_usr_metric; struct net *net; int lifetime; struct ndisc_options ndopts; @@ -1301,18 +1304,21 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } } - if (rt && lifetime == 0) { + /* Set default route metric as specified by user */ + defrtr_usr_metric = in6_dev->cnf.ra_defrtr_metric; + /* delete the route if lifetime is 0 or if metric needs change */ + if (rt && (lifetime == 0 || rt->fib6_metric != defrtr_usr_metric)) { ip6_del_rt(net, 
rt, false); rt = NULL; } - ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, for dev: %s\n", - rt, lifetime, skb->dev->name); + ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, metric: %d, for dev: %s\n", + rt, lifetime, defrtr_usr_metric, skb->dev->name); if (!rt && lifetime) { ND_PRINTK(3, info, "RA: adding default router\n"); rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr, - skb->dev, pref); + skb->dev, pref, defrtr_usr_metric); if (!rt) { ND_PRINTK(0, err, "RA: %s failed to add default route\n", @@ -1706,6 +1712,11 @@ static void pndisc_redo(struct sk_buff *skb) kfree_skb(skb); } +static int ndisc_is_multicast(const void *pkey) +{ + return ipv6_addr_is_multicast((struct in6_addr *)pkey); +} + static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 2e2119bfcf13..0d453fa9e327 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -280,7 +280,7 @@ ip6t_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); - private = READ_ONCE(table->private); /* Address dependency. 
*/ + private = rcu_access_pointer(table->private); cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; @@ -807,7 +807,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -831,7 +831,7 @@ copy_entries_to_user(unsigned int total_size, unsigned int off, num; const struct ip6t_entry *e; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); int ret = 0; const void *loc_cpu_entry; @@ -980,7 +980,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, AF_INET6, name); if (!IS_ERR(t)) { struct ip6t_getinfo info; - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); #ifdef CONFIG_COMPAT struct xt_table_info tmp; @@ -1035,7 +1035,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { - struct xt_table_info *private = t->private; + struct xt_table_info *private = xt_table_get_private_protected(t); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); @@ -1189,7 +1189,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = t->private; + private = xt_table_get_private_protected(t); if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1552,7 +1552,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { 
struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); void __user *pos; unsigned int size; int ret = 0; @@ -1598,7 +1598,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 3ac5485049f0..a35019d2e480 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -61,7 +61,7 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) /* Do nothing */ break; case IP6T_TCP_RESET: - nf_send_reset6(net, skb, xt_hooknum(par)); + nf_send_reset6(net, par->state->sk, skb, xt_hooknum(par)); break; case IP6T_ICMP6_POLICY_FAIL: nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, xt_hooknum(par)); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 054d287eb13d..c129ad334eb3 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -440,6 +440,7 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) { u16 savethdr = skb->transport_header; + u8 nexthdr = NEXTHDR_FRAGMENT; int fhoff, nhoff, ret; struct frag_hdr *fhdr; struct frag_queue *fq; @@ -455,6 +456,14 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0) return 0; + /* Discard the first fragment if it does not include all headers + * RFC 8200, Section 4.5 + */ + if 
(ipv6frag_thdr_truncated(skb, fhoff, &nexthdr)) { + pr_debug("Drop incomplete fragment\n"); + return 0; + } + if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr))) return -ENOMEM; diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 4aef6baaa55e..dffeaaaadcde 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -12,6 +12,140 @@ #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> +static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + int thoff; + __be16 fo; + u8 proto = ip6h->nexthdr; + + if (skb_csum_unnecessary(skb)) + return true; + + if (ip6h->payload_len && + pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h))) + return false; + + ip6h = ipv6_hdr(skb); + thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo); + if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0) + return false; + + if (!nf_reject_verify_csum(proto)) + return true; + + return nf_ip6_checksum(skb, hook, thoff, proto) == 0; +} + +static int nf_reject_ip6hdr_validate(struct sk_buff *skb) +{ + struct ipv6hdr *hdr; + u32 pkt_len; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + return 0; + + hdr = ipv6_hdr(skb); + if (hdr->version != 6) + return 0; + + pkt_len = ntohs(hdr->payload_len); + if (pkt_len + sizeof(struct ipv6hdr) > skb->len) + return 0; + + return 1; +} + +struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook) +{ + struct sk_buff *nskb; + const struct tcphdr *oth; + struct tcphdr _oth; + unsigned int otcplen; + struct ipv6hdr *nip6h; + + if (!nf_reject_ip6hdr_validate(oldskb)) + return NULL; + + oth = nf_reject_ip6_tcphdr_get(oldskb, &_oth, &otcplen, hook); + if (!oth) + return NULL; + + nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct tcphdr) + + LL_MAX_HEADER, GFP_ATOMIC); + if (!nskb) + return NULL; + + 
nskb->dev = (struct net_device *)dev; + + skb_reserve(nskb, LL_MAX_HEADER); + nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, + net->ipv6.devconf_all->hop_limit); + nf_reject_ip6_tcphdr_put(nskb, oldskb, oth, otcplen); + nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr)); + + return nskb; +} +EXPORT_SYMBOL_GPL(nf_reject_skb_v6_tcp_reset); + +struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook, u8 code) +{ + struct sk_buff *nskb; + struct ipv6hdr *nip6h; + struct icmp6hdr *icmp6h; + unsigned int len; + + if (!nf_reject_ip6hdr_validate(oldskb)) + return NULL; + + /* Include "As much of invoking packet as possible without the ICMPv6 + * packet exceeding the minimum IPv6 MTU" in the ICMP payload. + */ + len = min_t(unsigned int, 1220, oldskb->len); + + if (!pskb_may_pull(oldskb, len)) + return NULL; + + if (!nf_reject_v6_csum_ok(oldskb, hook)) + return NULL; + + nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) + + LL_MAX_HEADER + len, GFP_ATOMIC); + if (!nskb) + return NULL; + + nskb->dev = (struct net_device *)dev; + + skb_reserve(nskb, LL_MAX_HEADER); + nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_ICMPV6, + net->ipv6.devconf_all->hop_limit); + + skb_reset_transport_header(nskb); + icmp6h = skb_put_zero(nskb, sizeof(struct icmp6hdr)); + icmp6h->icmp6_type = ICMPV6_DEST_UNREACH; + icmp6h->icmp6_code = code; + + skb_put_data(nskb, skb_network_header(oldskb), len); + nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr)); + + icmp6h->icmp6_cksum = + csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, + nskb->len - sizeof(struct ipv6hdr), + IPPROTO_ICMPV6, + csum_partial(icmp6h, + nskb->len - sizeof(struct ipv6hdr), + 0)); + + return nskb; +} +EXPORT_SYMBOL_GPL(nf_reject_skb_v6_unreach); + const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, struct tcphdr *otcph, unsigned int *otcplen, int hook) @@ -141,7 +275,8 @@ static int 
nf_reject6_fill_skb_dst(struct sk_buff *skb_in) return 0; } -void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) +void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, + int hook) { struct net_device *br_indev __maybe_unused; struct sk_buff *nskb; @@ -170,7 +305,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; - if (hook == NF_INET_PRE_ROUTING) { + if (hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) { nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); if (!dst) return; @@ -179,7 +314,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev); fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark); - security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(oldskb, flowi6_to_flowi_common(&fl6)); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); @@ -233,7 +368,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) dev_queue_xmit(nskb); } else #endif - ip6_local_out(net, nskb->sk, nskb); + ip6_local_out(net, sk, nskb); } EXPORT_SYMBOL_GPL(nf_send_reset6); @@ -268,7 +403,8 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) skb_in->dev = net->loopback_dev; - if (hooknum == NF_INET_PRE_ROUTING && nf_reject6_fill_skb_dst(skb_in)) + if ((hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_INGRESS) && + nf_reject6_fill_skb_dst(skb_in) < 0) return; icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0); diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c index 8b5193efb1f1..3a00d95e964e 100644 --- a/net/ipv6/netfilter/nft_dup_ipv6.c +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -13,8 +13,8 @@ #include <net/netfilter/ipv6/nf_dup_ipv6.h> struct nft_dup_ipv6 { - enum nft_registers sreg_addr:8; 
- enum nft_registers sreg_dev:8; + u8 sreg_addr; + u8 sreg_dev; }; static void nft_dup_ipv6_eval(const struct nft_expr *expr, @@ -38,16 +38,16 @@ static int nft_dup_ipv6_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_ADDR] == NULL) return -EINVAL; - priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]); - err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in6_addr)); + err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, + sizeof(struct in6_addr)); if (err < 0) return err; - if (tb[NFTA_DUP_SREG_DEV] != NULL) { - priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); - return nft_validate_register_load(priv->sreg_dev, sizeof(int)); - } - return 0; + if (tb[NFTA_DUP_SREG_DEV]) + err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], + &priv->sreg_dev, sizeof(int)); + + return err; } static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index c1098a1968e1..7969d1f3018d 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -28,7 +28,8 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt)); + nf_send_reset6(nft_net(pkt), pkt->xt.state->sk, pkt->skb, + nft_hook(pkt)); break; default: break; diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 6caa062f68e7..6ac88fe24a8e 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -111,7 +111,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_uid = sk->sk_uid; fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); ipcm6_init_sk(&ipc6, np); ipc6.sockc.mark = sk->sk_mark; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 
bbff3e02e302..d6306aa46bb1 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -126,6 +126,7 @@ static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI), + SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS), SNMP_MIB_SENTINEL }; @@ -137,6 +138,7 @@ static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS), + SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 6e4ab80a3b94..1f56d9aae589 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -915,7 +915,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_oif = np->mcast_oif; else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); if (hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index c8cf1bbad74a..47a0dc46cbdb 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -324,9 +324,8 @@ static int ipv6_frag_rcv(struct sk_buff *skb) struct frag_queue *fq; const struct ipv6hdr *hdr = ipv6_hdr(skb); struct net *net = dev_net(skb_dst(skb)->dev); - __be16 frag_off; - int iif, offset; u8 nexthdr; + int iif; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) goto fail_hdr; @@ -362,24 +361,11 @@ static int ipv6_frag_rcv(struct sk_buff *skb) * the source of the fragment, with the Pointer field set to zero. 
*/ nexthdr = hdr->nexthdr; - offset = ipv6_skip_exthdr(skb, skb_transport_offset(skb), &nexthdr, &frag_off); - if (offset >= 0) { - /* Check some common protocols' header */ - if (nexthdr == IPPROTO_TCP) - offset += sizeof(struct tcphdr); - else if (nexthdr == IPPROTO_UDP) - offset += sizeof(struct udphdr); - else if (nexthdr == IPPROTO_ICMPV6) - offset += sizeof(struct icmp6hdr); - else - offset += 1; - - if (!(frag_off & htons(IP6_OFFSET)) && offset > skb->len) { - __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), - IPSTATS_MIB_INHDRERRORS); - icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0); - return -1; - } + if (ipv6frag_thdr_truncated(skb, skb_transport_offset(skb), &nexthdr)) { + __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), + IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0); + return -1; } iif = skb->dev ? skb->dev->ifindex : 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7e0ce7af8234..1536f4948e86 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -81,9 +81,11 @@ enum rt6_nud_state { RT6_NUD_SUCCEED = 1 }; -static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); +INDIRECT_CALLABLE_SCOPE +struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); -static unsigned int ip6_mtu(const struct dst_entry *dst); +INDIRECT_CALLABLE_SCOPE +unsigned int ip6_mtu(const struct dst_entry *dst); static struct dst_entry *ip6_negative_advice(struct dst_entry *); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, @@ -2611,7 +2613,8 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, return NULL; } -static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) +INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, + u32 cookie) { struct dst_entry *dst_ret; struct fib6_info *from; @@ -2641,6 +2644,7 @@ static struct dst_entry 
*ip6_dst_check(struct dst_entry *dst, u32 cookie) return dst_ret; } +EXPORT_INDIRECT_CALLABLE(ip6_dst_check); static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) { @@ -3089,7 +3093,7 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) return mtu; } -static unsigned int ip6_mtu(const struct dst_entry *dst) +INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst) { struct inet6_dev *idev; unsigned int mtu; @@ -3111,6 +3115,7 @@ out: return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } +EXPORT_INDIRECT_CALLABLE(ip6_mtu); /* MTU selection: * 1. mtu on route is locked - use it @@ -4252,11 +4257,12 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, - unsigned int pref) + unsigned int pref, + u32 defrtr_usr_metric) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, - .fc_metric = IP6_RT_PRIO_USER, + .fc_metric = defrtr_usr_metric, .fc_ifindex = dev->ifindex, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), @@ -5558,6 +5564,10 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) goto nla_put_failure; + + if (dst->lwtstate && + lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) + goto nla_put_failure; } else if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; struct nlattr *mp; @@ -5609,6 +5619,8 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, rtm->rtm_flags |= RTM_F_OFFLOAD; if (rt->trap) rtm->rtm_flags |= RTM_F_TRAP; + if (rt->offload_failed) + rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; } if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? 
dst->error : 0) < 0) @@ -6039,11 +6051,6 @@ void fib6_rt_update(struct net *net, struct fib6_info *rt, struct sk_buff *skb; int err = -ENOBUFS; - /* call_fib6_entry_notifiers will be removed when in-kernel notifier - * is implemented and supported for nexthop objects - */ - call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL); - skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (!skb) goto errout; @@ -6064,6 +6071,58 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } +void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, + bool offload, bool trap, bool offload_failed) +{ + struct sk_buff *skb; + int err; + + if (f6i->offload == offload && f6i->trap == trap && + f6i->offload_failed == offload_failed) + return; + + f6i->offload = offload; + f6i->trap = trap; + + /* 2 means send notifications only if offload_failed was changed. */ + if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 && + f6i->offload_failed == offload_failed) + return; + + f6i->offload_failed = offload_failed; + + if (!rcu_access_pointer(f6i->fib6_node)) + /* The route was removed from the tree, do not send + * notfication. 
+ */ + return; + + if (!net->ipv6.sysctl.fib_notify_on_flag_change) + return; + + skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL); + if (!skb) { + err = -ENOBUFS; + goto errout; + } + + err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0, + 0, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL); + return; + +errout: + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); +} +EXPORT_SYMBOL(fib6_info_hw_flags_set); + static int ip6_route_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c index 307f336b5353..488aec9e1a74 100644 --- a/net/ipv6/rpl.c +++ b/net/ipv6/rpl.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/** +/* * Authors: * (C) 2020 Alexander Aring <alex.aring@gmail.com> */ diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index 5fdf3ebb953f..ff691d9f4a04 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/** +/* * Authors: * (C) 2020 Alexander Aring <alex.aring@gmail.com> */ @@ -190,18 +190,13 @@ static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt) { struct dst_entry *dst = skb_dst(skb); struct rpl_iptunnel_encap *tinfo; - int err = 0; if (skb->protocol != htons(ETH_P_IPV6)) return -EINVAL; tinfo = rpl_encap_lwtunnel(dst->lwtstate); - err = rpl_do_srh_inline(skb, rlwt, tinfo->srh); - if (err) - return err; - - return 0; + return rpl_do_srh_inline(skb, rlwt, tinfo->srh); } static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 85dddfe3a2c6..687d95dce085 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -35,7 +35,6 @@ #include <net/xfrm.h> #include <crypto/hash.h> -#include <crypto/sha.h> #include <net/seg6.h> 
#include <net/genetlink.h> #include <net/seg6_hmac.h> diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index eba23279912d..c2a0c78e84d4 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -31,13 +31,39 @@ #include <linux/etherdevice.h> #include <linux/bpf.h> +#define SEG6_F_ATTR(i) BIT(i) + struct seg6_local_lwt; +/* callbacks used for customizing the creation and destruction of a behavior */ +struct seg6_local_lwtunnel_ops { + int (*build_state)(struct seg6_local_lwt *slwt, const void *cfg, + struct netlink_ext_ack *extack); + void (*destroy_state)(struct seg6_local_lwt *slwt); +}; + struct seg6_action_desc { int action; unsigned long attrs; + + /* The optattrs field is used for specifying all the optional + * attributes supported by a specific behavior. + * It means that if one of these attributes is not provided in the + * netlink message during the behavior creation, no errors will be + * returned to the userspace. + * + * Each attribute can be only of two types (mutually exclusive): + * 1) required or 2) optional. + * Every user MUST obey to this rule! If you set an attribute as + * required the same attribute CANNOT be set as optional and vice + * versa. + */ + unsigned long optattrs; + int (*input)(struct sk_buff *skb, struct seg6_local_lwt *slwt); int static_headroom; + + struct seg6_local_lwtunnel_ops slwt_ops; }; struct bpf_lwt_prog { @@ -45,6 +71,28 @@ struct bpf_lwt_prog { char *name; }; +enum seg6_end_dt_mode { + DT_INVALID_MODE = -EINVAL, + DT_LEGACY_MODE = 0, + DT_VRF_MODE = 1, +}; + +struct seg6_end_dt_info { + enum seg6_end_dt_mode mode; + + struct net *net; + /* VRF device associated to the routing table used by the SRv6 + * End.DT4/DT6 behavior for routing IPv4/IPv6 packets. 
+ */ + int vrf_ifindex; + int vrf_table; + + /* tunneled packet proto and family (IPv4 or IPv6) */ + __be16 proto; + u16 family; + int hdrlen; +}; + struct seg6_local_lwt { int action; struct ipv6_sr_hdr *srh; @@ -54,9 +102,16 @@ struct seg6_local_lwt { int iif; int oif; struct bpf_lwt_prog bpf; +#ifdef CONFIG_NET_L3_MASTER_DEV + struct seg6_end_dt_info dt_info; +#endif int headroom; struct seg6_action_desc *desc; + /* unlike the required attrs, we have to track the optional attributes + * that have been effectively parsed. + */ + unsigned long parsed_optattrs; }; static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt) @@ -401,6 +456,248 @@ drop: return -EINVAL; } +#ifdef CONFIG_NET_L3_MASTER_DEV +static struct net *fib6_config_get_net(const struct fib6_config *fib6_cfg) +{ + const struct nl_info *nli = &fib6_cfg->fc_nlinfo; + + return nli->nl_net; +} + +static int __seg6_end_dt_vrf_build(struct seg6_local_lwt *slwt, const void *cfg, + u16 family, struct netlink_ext_ack *extack) +{ + struct seg6_end_dt_info *info = &slwt->dt_info; + int vrf_ifindex; + struct net *net; + + net = fib6_config_get_net(cfg); + + /* note that vrf_table was already set by parse_nla_vrftable() */ + vrf_ifindex = l3mdev_ifindex_lookup_by_table_id(L3MDEV_TYPE_VRF, net, + info->vrf_table); + if (vrf_ifindex < 0) { + if (vrf_ifindex == -EPERM) { + NL_SET_ERR_MSG(extack, + "Strict mode for VRF is disabled"); + } else if (vrf_ifindex == -ENODEV) { + NL_SET_ERR_MSG(extack, + "Table has no associated VRF device"); + } else { + pr_debug("seg6local: SRv6 End.DT* creation error=%d\n", + vrf_ifindex); + } + + return vrf_ifindex; + } + + info->net = net; + info->vrf_ifindex = vrf_ifindex; + + switch (family) { + case AF_INET: + info->proto = htons(ETH_P_IP); + info->hdrlen = sizeof(struct iphdr); + break; + case AF_INET6: + info->proto = htons(ETH_P_IPV6); + info->hdrlen = sizeof(struct ipv6hdr); + break; + default: + return -EINVAL; + } + + info->family = family; + info->mode = 
DT_VRF_MODE; + + return 0; +} + +/* The SRv6 End.DT4/DT6 behavior extracts the inner (IPv4/IPv6) packet and + * routes the IPv4/IPv6 packet by looking at the configured routing table. + * + * In the SRv6 End.DT4/DT6 use case, we can receive traffic (IPv6+Segment + * Routing Header packets) from several interfaces and the outer IPv6 + * destination address (DA) is used for retrieving the specific instance of the + * End.DT4/DT6 behavior that should process the packets. + * + * However, the inner IPv4/IPv6 packet is not really bound to any receiving + * interface and thus the End.DT4/DT6 sets the VRF (associated with the + * corresponding routing table) as the *receiving* interface. + * In other words, the End.DT4/DT6 processes a packet as if it has been received + * directly by the VRF (and not by one of its slave devices, if any). + * In this way, the VRF interface is used for routing the IPv4/IPv6 packet in + * according to the routing table configured by the End.DT4/DT6 instance. + * + * This design allows you to get some interesting features like: + * 1) the statistics on rx packets; + * 2) the possibility to install a packet sniffer on the receiving interface + * (the VRF one) for looking at the incoming packets; + * 3) the possibility to leverage the netfilter prerouting hook for the inner + * IPv4 packet. + * + * This function returns: + * - the sk_buff* when the VRF rcv handler has processed the packet correctly; + * - NULL when the skb is consumed by the VRF rcv handler; + * - a pointer which encodes a negative error number in case of error. + * Note that in this case, the function takes care of freeing the skb. 
+ */ +static struct sk_buff *end_dt_vrf_rcv(struct sk_buff *skb, u16 family, + struct net_device *dev) +{ + /* based on l3mdev_ip_rcv; we are only interested in the master */ + if (unlikely(!netif_is_l3_master(dev) && !netif_has_l3_rx_handler(dev))) + goto drop; + + if (unlikely(!dev->l3mdev_ops->l3mdev_l3_rcv)) + goto drop; + + /* the decap packet IPv4/IPv6 does not come with any mac header info. + * We must unset the mac header to allow the VRF device to rebuild it, + * just in case there is a sniffer attached on the device. + */ + skb_unset_mac_header(skb); + + skb = dev->l3mdev_ops->l3mdev_l3_rcv(dev, skb, family); + if (!skb) + /* the skb buffer was consumed by the handler */ + return NULL; + + /* when a packet is received by a VRF or by one of its slaves, the + * master device reference is set into the skb. + */ + if (unlikely(skb->dev != dev || skb->skb_iif != dev->ifindex)) + goto drop; + + return skb; + +drop: + kfree_skb(skb); + return ERR_PTR(-EINVAL); +} + +static struct net_device *end_dt_get_vrf_rcu(struct sk_buff *skb, + struct seg6_end_dt_info *info) +{ + int vrf_ifindex = info->vrf_ifindex; + struct net *net = info->net; + + if (unlikely(vrf_ifindex < 0)) + goto error; + + if (unlikely(!net_eq(dev_net(skb->dev), net))) + goto error; + + return dev_get_by_index_rcu(net, vrf_ifindex); + +error: + return NULL; +} + +static struct sk_buff *end_dt_vrf_core(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + struct seg6_end_dt_info *info = &slwt->dt_info; + struct net_device *vrf; + + vrf = end_dt_get_vrf_rcu(skb, info); + if (unlikely(!vrf)) + goto drop; + + skb->protocol = info->proto; + + skb_dst_drop(skb); + + skb_set_transport_header(skb, info->hdrlen); + + return end_dt_vrf_rcv(skb, info->family, vrf); + +drop: + kfree_skb(skb); + return ERR_PTR(-EINVAL); +} + +static int input_action_end_dt4(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + struct iphdr *iph; + int err; + + if (!decap_and_validate(skb, IPPROTO_IPIP)) + goto drop; + + 
if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto drop; + + skb = end_dt_vrf_core(skb, slwt); + if (!skb) + /* packet has been processed and consumed by the VRF */ + return 0; + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + iph = ip_hdr(skb); + + err = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev); + if (unlikely(err)) + goto drop; + + return dst_input(skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int seg6_end_dt4_build(struct seg6_local_lwt *slwt, const void *cfg, + struct netlink_ext_ack *extack) +{ + return __seg6_end_dt_vrf_build(slwt, cfg, AF_INET, extack); +} + +static enum +seg6_end_dt_mode seg6_end_dt6_parse_mode(struct seg6_local_lwt *slwt) +{ + unsigned long parsed_optattrs = slwt->parsed_optattrs; + bool legacy, vrfmode; + + legacy = !!(parsed_optattrs & SEG6_F_ATTR(SEG6_LOCAL_TABLE)); + vrfmode = !!(parsed_optattrs & SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE)); + + if (!(legacy ^ vrfmode)) + /* both are absent or present: invalid DT6 mode */ + return DT_INVALID_MODE; + + return legacy ? 
DT_LEGACY_MODE : DT_VRF_MODE; +} + +static enum seg6_end_dt_mode seg6_end_dt6_get_mode(struct seg6_local_lwt *slwt) +{ + struct seg6_end_dt_info *info = &slwt->dt_info; + + return info->mode; +} + +static int seg6_end_dt6_build(struct seg6_local_lwt *slwt, const void *cfg, + struct netlink_ext_ack *extack) +{ + enum seg6_end_dt_mode mode = seg6_end_dt6_parse_mode(slwt); + struct seg6_end_dt_info *info = &slwt->dt_info; + + switch (mode) { + case DT_LEGACY_MODE: + info->mode = DT_LEGACY_MODE; + return 0; + case DT_VRF_MODE: + return __seg6_end_dt_vrf_build(slwt, cfg, AF_INET6, extack); + default: + NL_SET_ERR_MSG(extack, "table or vrftable must be specified"); + return -EINVAL; + } +} +#endif + static int input_action_end_dt6(struct sk_buff *skb, struct seg6_local_lwt *slwt) { @@ -410,6 +707,28 @@ static int input_action_end_dt6(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; +#ifdef CONFIG_NET_L3_MASTER_DEV + if (seg6_end_dt6_get_mode(slwt) == DT_LEGACY_MODE) + goto legacy_mode; + + /* DT6_VRF_MODE */ + skb = end_dt_vrf_core(skb, slwt); + if (!skb) + /* packet has been processed and consumed by the VRF */ + return 0; + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + /* note: this time we do not need to specify the table because the VRF + * takes care of selecting the correct table. 
+ */ + seg6_lookup_any_nexthop(skb, NULL, 0, true); + + return dst_input(skb); + +legacy_mode: +#endif skb_set_transport_header(skb, sizeof(struct ipv6hdr)); seg6_lookup_any_nexthop(skb, NULL, slwt->table, true); @@ -566,48 +885,67 @@ static struct seg6_action_desc seg6_action_table[] = { }, { .action = SEG6_LOCAL_ACTION_END_X, - .attrs = (1 << SEG6_LOCAL_NH6), + .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH6), .input = input_action_end_x, }, { .action = SEG6_LOCAL_ACTION_END_T, - .attrs = (1 << SEG6_LOCAL_TABLE), + .attrs = SEG6_F_ATTR(SEG6_LOCAL_TABLE), .input = input_action_end_t, }, { .action = SEG6_LOCAL_ACTION_END_DX2, - .attrs = (1 << SEG6_LOCAL_OIF), + .attrs = SEG6_F_ATTR(SEG6_LOCAL_OIF), .input = input_action_end_dx2, }, { .action = SEG6_LOCAL_ACTION_END_DX6, - .attrs = (1 << SEG6_LOCAL_NH6), + .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH6), .input = input_action_end_dx6, }, { .action = SEG6_LOCAL_ACTION_END_DX4, - .attrs = (1 << SEG6_LOCAL_NH4), + .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH4), .input = input_action_end_dx4, }, { + .action = SEG6_LOCAL_ACTION_END_DT4, + .attrs = SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE), +#ifdef CONFIG_NET_L3_MASTER_DEV + .input = input_action_end_dt4, + .slwt_ops = { + .build_state = seg6_end_dt4_build, + }, +#endif + }, + { .action = SEG6_LOCAL_ACTION_END_DT6, - .attrs = (1 << SEG6_LOCAL_TABLE), +#ifdef CONFIG_NET_L3_MASTER_DEV + .attrs = 0, + .optattrs = SEG6_F_ATTR(SEG6_LOCAL_TABLE) | + SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE), + .slwt_ops = { + .build_state = seg6_end_dt6_build, + }, +#else + .attrs = SEG6_F_ATTR(SEG6_LOCAL_TABLE), +#endif .input = input_action_end_dt6, }, { .action = SEG6_LOCAL_ACTION_END_B6, - .attrs = (1 << SEG6_LOCAL_SRH), + .attrs = SEG6_F_ATTR(SEG6_LOCAL_SRH), .input = input_action_end_b6, }, { .action = SEG6_LOCAL_ACTION_END_B6_ENCAP, - .attrs = (1 << SEG6_LOCAL_SRH), + .attrs = SEG6_F_ATTR(SEG6_LOCAL_SRH), .input = input_action_end_b6_encap, .static_headroom = sizeof(struct ipv6hdr), }, { .action = SEG6_LOCAL_ACTION_END_BPF, - .attrs = (1 
<< SEG6_LOCAL_BPF), + .attrs = SEG6_F_ATTR(SEG6_LOCAL_BPF), .input = input_action_end_bpf, }, @@ -649,6 +987,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_ACTION] = { .type = NLA_U32 }, [SEG6_LOCAL_SRH] = { .type = NLA_BINARY }, [SEG6_LOCAL_TABLE] = { .type = NLA_U32 }, + [SEG6_LOCAL_VRFTABLE] = { .type = NLA_U32 }, [SEG6_LOCAL_NH4] = { .type = NLA_BINARY, .len = sizeof(struct in_addr) }, [SEG6_LOCAL_NH6] = { .type = NLA_BINARY, @@ -710,6 +1049,11 @@ static int cmp_nla_srh(struct seg6_local_lwt *a, struct seg6_local_lwt *b) return memcmp(a->srh, b->srh, len); } +static void destroy_attr_srh(struct seg6_local_lwt *slwt) +{ + kfree(slwt->srh); +} + static int parse_nla_table(struct nlattr **attrs, struct seg6_local_lwt *slwt) { slwt->table = nla_get_u32(attrs[SEG6_LOCAL_TABLE]); @@ -733,6 +1077,53 @@ static int cmp_nla_table(struct seg6_local_lwt *a, struct seg6_local_lwt *b) return 0; } +static struct +seg6_end_dt_info *seg6_possible_end_dt_info(struct seg6_local_lwt *slwt) +{ +#ifdef CONFIG_NET_L3_MASTER_DEV + return &slwt->dt_info; +#else + return ERR_PTR(-EOPNOTSUPP); +#endif +} + +static int parse_nla_vrftable(struct nlattr **attrs, + struct seg6_local_lwt *slwt) +{ + struct seg6_end_dt_info *info = seg6_possible_end_dt_info(slwt); + + if (IS_ERR(info)) + return PTR_ERR(info); + + info->vrf_table = nla_get_u32(attrs[SEG6_LOCAL_VRFTABLE]); + + return 0; +} + +static int put_nla_vrftable(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + struct seg6_end_dt_info *info = seg6_possible_end_dt_info(slwt); + + if (IS_ERR(info)) + return PTR_ERR(info); + + if (nla_put_u32(skb, SEG6_LOCAL_VRFTABLE, info->vrf_table)) + return -EMSGSIZE; + + return 0; +} + +static int cmp_nla_vrftable(struct seg6_local_lwt *a, struct seg6_local_lwt *b) +{ + struct seg6_end_dt_info *info_a = seg6_possible_end_dt_info(a); + struct seg6_end_dt_info *info_b = seg6_possible_end_dt_info(b); + + if (info_a->vrf_table != info_b->vrf_table) + return 
1; + + return 0; +} + static int parse_nla_nh4(struct nlattr **attrs, struct seg6_local_lwt *slwt) { memcpy(&slwt->nh4, nla_data(attrs[SEG6_LOCAL_NH4]), @@ -901,16 +1292,30 @@ static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b) return strcmp(a->bpf.name, b->bpf.name); } +static void destroy_attr_bpf(struct seg6_local_lwt *slwt) +{ + kfree(slwt->bpf.name); + if (slwt->bpf.prog) + bpf_prog_put(slwt->bpf.prog); +} + struct seg6_action_param { int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt); int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt); int (*cmp)(struct seg6_local_lwt *a, struct seg6_local_lwt *b); + + /* optional destroy() callback useful for releasing resources which + * have been previously acquired in the corresponding parse() + * function. + */ + void (*destroy)(struct seg6_local_lwt *slwt); }; static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_SRH] = { .parse = parse_nla_srh, .put = put_nla_srh, - .cmp = cmp_nla_srh }, + .cmp = cmp_nla_srh, + .destroy = destroy_attr_srh }, [SEG6_LOCAL_TABLE] = { .parse = parse_nla_table, .put = put_nla_table, @@ -934,14 +1339,130 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_BPF] = { .parse = parse_nla_bpf, .put = put_nla_bpf, - .cmp = cmp_nla_bpf }, + .cmp = cmp_nla_bpf, + .destroy = destroy_attr_bpf }, + + [SEG6_LOCAL_VRFTABLE] = { .parse = parse_nla_vrftable, + .put = put_nla_vrftable, + .cmp = cmp_nla_vrftable }, }; +/* call the destroy() callback (if available) for each set attribute in + * @parsed_attrs, starting from the first attribute up to the @max_parsed + * (excluded) attribute. 
+ */ +static void __destroy_attrs(unsigned long parsed_attrs, int max_parsed, + struct seg6_local_lwt *slwt) +{ + struct seg6_action_param *param; + int i; + + /* Every required seg6local attribute is identified by an ID which is + * encoded as a flag (i.e: 1 << ID) in the 'attrs' bitmask; + * + * We scan the 'parsed_attrs' bitmask, starting from the first attribute + * up to the @max_parsed (excluded) attribute. + * For each set attribute, we retrieve the corresponding destroy() + * callback. If the callback is not available, then we skip to the next + * attribute; otherwise, we call the destroy() callback. + */ + for (i = 0; i < max_parsed; ++i) { + if (!(parsed_attrs & SEG6_F_ATTR(i))) + continue; + + param = &seg6_action_params[i]; + + if (param->destroy) + param->destroy(slwt); + } +} + +/* release all the resources that may have been acquired during parsing + * operations. + */ +static void destroy_attrs(struct seg6_local_lwt *slwt) +{ + unsigned long attrs = slwt->desc->attrs | slwt->parsed_optattrs; + + __destroy_attrs(attrs, SEG6_LOCAL_MAX + 1, slwt); +} + +static int parse_nla_optional_attrs(struct nlattr **attrs, + struct seg6_local_lwt *slwt) +{ + struct seg6_action_desc *desc = slwt->desc; + unsigned long parsed_optattrs = 0; + struct seg6_action_param *param; + int err, i; + + for (i = 0; i < SEG6_LOCAL_MAX + 1; ++i) { + if (!(desc->optattrs & SEG6_F_ATTR(i)) || !attrs[i]) + continue; + + /* once here, the i-th attribute is provided by the + * userspace AND it is identified optional as well. + */ + param = &seg6_action_params[i]; + + err = param->parse(attrs, slwt); + if (err < 0) + goto parse_optattrs_err; + + /* current attribute has been correctly parsed */ + parsed_optattrs |= SEG6_F_ATTR(i); + } + + /* store in the tunnel state all the optional attributed successfully + * parsed. 
+ */ + slwt->parsed_optattrs = parsed_optattrs; + + return 0; + +parse_optattrs_err: + __destroy_attrs(parsed_optattrs, i, slwt); + + return err; +} + +/* call the custom constructor of the behavior during its initialization phase + * and after that all its attributes have been parsed successfully. + */ +static int +seg6_local_lwtunnel_build_state(struct seg6_local_lwt *slwt, const void *cfg, + struct netlink_ext_ack *extack) +{ + struct seg6_action_desc *desc = slwt->desc; + struct seg6_local_lwtunnel_ops *ops; + + ops = &desc->slwt_ops; + if (!ops->build_state) + return 0; + + return ops->build_state(slwt, cfg, extack); +} + +/* call the custom destructor of the behavior which is invoked before the + * tunnel is going to be destroyed. + */ +static void seg6_local_lwtunnel_destroy_state(struct seg6_local_lwt *slwt) +{ + struct seg6_action_desc *desc = slwt->desc; + struct seg6_local_lwtunnel_ops *ops; + + ops = &desc->slwt_ops; + if (!ops->destroy_state) + return; + + ops->destroy_state(slwt); +} + static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) { struct seg6_action_param *param; struct seg6_action_desc *desc; + unsigned long invalid_attrs; int i, err; desc = __get_action_desc(slwt->action); @@ -954,8 +1475,28 @@ static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) slwt->desc = desc; slwt->headroom += desc->static_headroom; + /* Forcing the desc->optattrs *set* and the desc->attrs *set* to be + * disjoined, this allow us to release acquired resources by optional + * attributes and by required attributes independently from each other + * without any interfarence. + * In other terms, we are sure that we do not release some the acquired + * resources twice. + * + * Note that if an attribute is configured both as required and as + * optional, it means that the user has messed something up in the + * seg6_action_table. Therefore, this check is required for SRv6 + * behaviors to work properly. 
+ */ + invalid_attrs = desc->attrs & desc->optattrs; + if (invalid_attrs) { + WARN_ONCE(1, + "An attribute cannot be both required AND optional"); + return -EINVAL; + } + + /* parse the required attributes */ for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) { - if (desc->attrs & (1 << i)) { + if (desc->attrs & SEG6_F_ATTR(i)) { if (!attrs[i]) return -EINVAL; @@ -963,11 +1504,24 @@ static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) err = param->parse(attrs, slwt); if (err < 0) - return err; + goto parse_attrs_err; } } + /* parse the optional attributes, if any */ + err = parse_nla_optional_attrs(attrs, slwt); + if (err < 0) + goto parse_attrs_err; + return 0; + +parse_attrs_err: + /* release any resource that may have been acquired during the i-1 + * parse() operations. + */ + __destroy_attrs(desc->attrs, i, slwt); + + return err; } static int seg6_local_build_state(struct net *net, struct nlattr *nla, @@ -1003,6 +1557,10 @@ static int seg6_local_build_state(struct net *net, struct nlattr *nla, if (err < 0) goto out_free; + err = seg6_local_lwtunnel_build_state(slwt, cfg, extack); + if (err < 0) + goto out_destroy_attrs; + newts->type = LWTUNNEL_ENCAP_SEG6_LOCAL; newts->flags = LWTUNNEL_STATE_INPUT_REDIRECT; newts->headroom = slwt->headroom; @@ -1011,8 +1569,9 @@ static int seg6_local_build_state(struct net *net, struct nlattr *nla, return 0; +out_destroy_attrs: + destroy_attrs(slwt); out_free: - kfree(slwt->srh); kfree(newts); return err; } @@ -1021,12 +1580,9 @@ static void seg6_local_destroy_state(struct lwtunnel_state *lwt) { struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt); - kfree(slwt->srh); + seg6_local_lwtunnel_destroy_state(slwt); - if (slwt->desc->attrs & (1 << SEG6_LOCAL_BPF)) { - kfree(slwt->bpf.name); - bpf_prog_put(slwt->bpf.prog); - } + destroy_attrs(slwt); return; } @@ -1036,13 +1592,16 @@ static int seg6_local_fill_encap(struct sk_buff *skb, { struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt); struct 
seg6_action_param *param; + unsigned long attrs; int i, err; if (nla_put_u32(skb, SEG6_LOCAL_ACTION, slwt->action)) return -EMSGSIZE; + attrs = slwt->desc->attrs | slwt->parsed_optattrs; + for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) { - if (slwt->desc->attrs & (1 << i)) { + if (attrs & SEG6_F_ATTR(i)) { param = &seg6_action_params[i]; err = param->put(skb, slwt); if (err < 0) @@ -1061,31 +1620,34 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt) nlsize = nla_total_size(4); /* action */ - attrs = slwt->desc->attrs; + attrs = slwt->desc->attrs | slwt->parsed_optattrs; - if (attrs & (1 << SEG6_LOCAL_SRH)) + if (attrs & SEG6_F_ATTR(SEG6_LOCAL_SRH)) nlsize += nla_total_size((slwt->srh->hdrlen + 1) << 3); - if (attrs & (1 << SEG6_LOCAL_TABLE)) + if (attrs & SEG6_F_ATTR(SEG6_LOCAL_TABLE)) nlsize += nla_total_size(4); - if (attrs & (1 << SEG6_LOCAL_NH4)) + if (attrs & SEG6_F_ATTR(SEG6_LOCAL_NH4)) nlsize += nla_total_size(4); - if (attrs & (1 << SEG6_LOCAL_NH6)) + if (attrs & SEG6_F_ATTR(SEG6_LOCAL_NH6)) nlsize += nla_total_size(16); - if (attrs & (1 << SEG6_LOCAL_IIF)) + if (attrs & SEG6_F_ATTR(SEG6_LOCAL_IIF)) nlsize += nla_total_size(4); - if (attrs & (1 << SEG6_LOCAL_OIF)) + if (attrs & SEG6_F_ATTR(SEG6_LOCAL_OIF)) nlsize += nla_total_size(4); - if (attrs & (1 << SEG6_LOCAL_BPF)) + if (attrs & SEG6_F_ATTR(SEG6_LOCAL_BPF)) nlsize += nla_total_size(sizeof(struct nlattr)) + nla_total_size(MAX_PROG_NAME) + nla_total_size(4); + if (attrs & SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE)) + nlsize += nla_total_size(4); + return nlsize; } @@ -1094,6 +1656,7 @@ static int seg6_local_cmp_encap(struct lwtunnel_state *a, { struct seg6_local_lwt *slwt_a, *slwt_b; struct seg6_action_param *param; + unsigned long attrs_a, attrs_b; int i; slwt_a = seg6_local_lwtunnel(a); @@ -1102,11 +1665,14 @@ static int seg6_local_cmp_encap(struct lwtunnel_state *a, if (slwt_a->action != slwt_b->action) return 1; - if (slwt_a->desc->attrs != slwt_b->desc->attrs) + attrs_a = slwt_a->desc->attrs | 
slwt_a->parsed_optattrs; + attrs_b = slwt_b->desc->attrs | slwt_b->parsed_optattrs; + + if (attrs_a != attrs_b) return 1; for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) { - if (slwt_a->desc->attrs & (1 << i)) { + if (attrs_a & SEG6_F_ATTR(i)) { param = &seg6_action_params[i]; if (param->cmp(slwt_a, slwt_b)) return 1; @@ -1128,6 +1694,15 @@ static const struct lwtunnel_encap_ops seg6_local_ops = { int __init seg6_local_init(void) { + /* If the max total number of defined attributes is reached, then your + * kernel build stops here. + * + * This check is required to avoid arithmetic overflows when processing + * behavior attributes and the maximum number of defined attributes + * exceeds the allowed value. + */ + BUILD_BUG_ON(SEG6_LOCAL_MAX + 1 > BITS_PER_TYPE(unsigned long)); + return lwtunnel_encap_add_ops(&seg6_local_ops, LWTUNNEL_ENCAP_SEG6_LOCAL); } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 5e2c34c0ac97..93636867aee2 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1128,7 +1128,6 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) if (tdev && !netif_is_l3_master(tdev)) { int t_hlen = tunnel->hlen + sizeof(struct iphdr); - dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); dev->mtu = tdev->mtu - t_hlen; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; @@ -1396,7 +1395,7 @@ static const struct net_device_ops ipip6_netdev_ops = { .ndo_uninit = ipip6_tunnel_uninit, .ndo_start_xmit = sit_tunnel_xmit, .ndo_do_ioctl = ipip6_tunnel_ioctl, - .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip6_tunnel_ctl, }; @@ -1426,7 +1425,6 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->priv_destructor = ipip6_dev_free; dev->type = ARPHRD_SIT; - dev->hard_header_len = LL_MAX_HEADER + t_hlen; dev->mtu = ETH_DATA_LEN - t_hlen; dev->min_mtu = IPV6_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - t_hlen; @@ -1647,8 +1645,11 @@ static int 
ipip6_newlink(struct net *src_net, struct net_device *dev, } #ifdef CONFIG_IPV6_SIT_6RD - if (ipip6_netlink_6rd_parms(data, &ip6rd)) + if (ipip6_netlink_6rd_parms(data, &ip6rd)) { err = ipip6_tunnel_update_6rd(nt, &ip6rd); + if (err < 0) + unregister_netdevice_queue(dev, NULL); + } #endif return err; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index e796a64be308..e8cfb9e997bf 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -136,7 +136,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) __u32 cookie = ntohl(th->ack_seq) - 1; struct sock *ret = sk; struct request_sock *req; - int mss; + int full_space, mss; struct dst_entry *dst; __u8 rcv_wscale; u32 tsoff = 0; @@ -233,7 +233,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; fl6.flowi6_uid = sk->sk_uid; - security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) @@ -241,7 +241,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) } req->rsk_window_clamp = tp->window_clamp ? 
:dst_metric(dst, RTAX_WINDOW); - tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, + /* limit the window selection if the user enforce a smaller rx buffer */ + full_space = tcp_full_space(sk); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; + + tcp_select_initial_window(sk, full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 5b60a4bdd36a..263ab43ed06b 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -160,6 +160,15 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "fib_notify_on_flag_change", + .data = &init_net.ipv6.sysctl.fib_notify_on_flag_change, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, { } }; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8db59f4e5f13..bd44ded7e50c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -278,7 +278,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -527,15 +527,21 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, if (np->repflow && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); + tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? 
+ (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | + (np->tclass & INET_ECN_MASK) : + np->tclass; + + if (!INET_ECN_is_capable(tclass) && + tcp_bpf_ca_needs_ecn((struct sock *)req)) + tclass |= INET_ECN_ECT_0; + rcu_read_lock(); opt = ireq->ipv6_opt; - tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? - tcp_rsk(req)->syn_tos : np->tclass; if (!opt) opt = rcu_dereference(np->opt); err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, - tclass & ~INET_ECN_MASK, - sk->sk_priority); + tclass, sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -823,9 +829,15 @@ static void tcp_v6_init_req(struct request_sock *req, } static struct dst_entry *tcp_v6_route_req(const struct sock *sk, + struct sk_buff *skb, struct flowi *fl, - const struct request_sock *req) + struct request_sock *req) { + tcp_v6_init_req(req, sk, skb); + + if (security_inet_conn_request(sk, skb, req)) + return NULL; + return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP); } @@ -846,7 +858,6 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .req_md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, #endif - .init_req = tcp_v6_init_req, #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v6_init_sequence, #endif @@ -954,7 +965,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? 
sk : NULL); - security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); /* Pass a socket to ip6_dst_lookup either it is for RST * Underlying function will use this to retrieve the network @@ -1193,6 +1204,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct inet_sock *newinet; + bool found_dup_sk = false; struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG @@ -1314,7 +1326,9 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * if (np->repflow) newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); - /* Set ToS of the new socket based upon the value of incoming SYN. */ + /* Set ToS of the new socket based upon the value of incoming SYN. + * ECT bits are set later in tcp_init_transfer(). + */ if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; @@ -1368,7 +1382,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * tcp_done(newsk); goto out; } - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), + &found_dup_sk); if (*own_req) { tcp_move_syn(newtp, req); @@ -1383,6 +1398,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * skb_set_owner_r(newnp->pktoptions, newsk); } } + } else { + if (!req_unhash && found_dup_sk) { + /* This code path should only be executed in the + * syncookie case only + */ + bh_unlock_sock(newsk); + sock_put(newsk); + newsk = NULL; + } } return newsk; @@ -1396,6 +1420,8 @@ out: return NULL; } +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, + u32)); /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. 
* @@ -1449,7 +1475,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id(sk, skb); if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || - dst->ops->check(dst, np->rx_dst_cookie) == NULL) { + INDIRECT_CALL_1(dst->ops->check, ip6_dst_check, + dst, np->rx_dst_cookie) == NULL) { dst_release(dst); sk->sk_rx_dst = NULL; } @@ -2097,6 +2124,7 @@ struct proto tcpv6_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 29d9691359b9..d25e5a9252fd 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -276,7 +276,7 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, inet6_sdif(skb), udptable, skb); } -struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, +struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport) { const struct ipv6hdr *iph = ipv6_hdr(skb); @@ -285,7 +285,6 @@ struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, &iph->daddr, dport, inet6_iif(skb), inet6_sdif(skb), &udp_table, NULL); } -EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb); /* Must be called under rcu_read_lock(). * Does increment socket refcount. 
@@ -410,9 +409,8 @@ try_again: } *addr_len = sizeof(*sin6); - if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, - (struct sockaddr *)sin6); + BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, + (struct sockaddr *)sin6); } if (udp_sk(sk)->gro_enabled) @@ -560,7 +558,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, inet6_iif(skb), inet6_sdif(skb), udptable, NULL); - if (!sk) { + if (!sk || udp_sk(sk)->encap_type) { /* No socket for error: try tunnels before discarding */ sk = ERR_PTR(-ENOENT); if (static_branch_unlikely(&udpv6_encap_needed_key)) { @@ -637,6 +635,9 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (rc == -ENOMEM) UDP6_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); + else + UDP6_INC_STATS(sock_net(sk), + UDP_MIB_MEMERRORS, is_udplite); UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); kfree_skb(skb); return -1; @@ -1460,7 +1461,7 @@ do_udp_sendmsg: fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; - if (cgroup_bpf_enabled && !connected) { + if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, &fl6.saddr); if (err) @@ -1496,7 +1497,7 @@ do_udp_sendmsg: } else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); if (ipc6.tclass < 0) ipc6.tclass = np->tclass; @@ -1606,8 +1607,10 @@ void udpv6_destroy_sock(struct sock *sk) if (encap_destroy) encap_destroy(sk); } - if (up->encap_enabled) + if (up->encap_enabled) { static_branch_dec(&udpv6_encap_needed_key); + udp_encap_disable(); + } } inet6_destroy_sock(sk); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 584157a07759..faa823c24292 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -28,10 +28,6 @@ static struct sk_buff 
*udp6_ufo_fragment(struct sk_buff *skb, int tnl_hlen; int err; - mss = skb_shinfo(skb)->gso_size; - if (unlikely(skb->len <= mss)) - goto out; - if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) segs = skb_udp_tunnel_segment(skb, features, true); @@ -46,7 +42,11 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, goto out; if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) - return __udp_gso_segment(skb, features); + return __udp_gso_segment(skb, features, true); + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; /* Do software UFO. Complete and fill in the UDP checksum as HW cannot * do checksum of UDP packets sent as multiple IP fragments. @@ -111,12 +111,22 @@ out: return segs; } +static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, + __be16 dport) +{ + const struct ipv6hdr *iph = skb_gro_network_header(skb); + + return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, + &iph->daddr, dport, inet6_iif(skb), + inet6_sdif(skb), &udp_table, NULL); +} + INDIRECT_CALLABLE_SCOPE struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); + struct sock *sk = NULL; struct sk_buff *pp; - struct sock *sk; if (unlikely(!uh)) goto flush; @@ -135,7 +145,10 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) skip: NAPI_GRO_CB(skb)->is_ipv6 = 1; rcu_read_lock(); - sk = static_branch_unlikely(&udpv6_encap_needed_key) ? 
udp6_lib_lookup_skb(skb, uh->source, uh->dest) : NULL; + + if (static_branch_unlikely(&udpv6_encap_needed_key)) + sk = udp6_gro_lookup_skb(skb, uh->source, uh->dest); + pp = udp_gro_receive(head, skb, uh, sk); rcu_read_unlock(); return pp; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index d80572074667..6092d5cb7168 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -89,7 +89,7 @@ static struct sock *iucv_accept_dequeue(struct sock *parent, static void iucv_sock_kill(struct sock *sk); static void iucv_sock_close(struct sock *sk); -static void afiucv_hs_callback_txnotify(struct sk_buff *, enum iucv_tx_notify); +static void afiucv_hs_callback_txnotify(struct sock *sk, enum iucv_tx_notify); /* Call Back functions */ static void iucv_callback_rx(struct iucv_path *, struct iucv_message *); @@ -182,7 +182,7 @@ static inline int iucv_below_msglim(struct sock *sk) if (sk->sk_state != IUCV_CONNECTED) return 1; if (iucv->transport == AF_IUCV_TRANS_IUCV) - return (skb_queue_len(&iucv->send_skb_q) < iucv->path->msglim); + return (atomic_read(&iucv->skbs_in_xmit) < iucv->path->msglim); else return ((atomic_read(&iucv->msg_sent) < iucv->msglimit_peer) && (atomic_read(&iucv->pendings) <= 0)); @@ -211,7 +211,6 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock, { struct iucv_sock *iucv = iucv_sk(sock); struct af_iucv_trans_hdr *phs_hdr; - struct sk_buff *nskb; int err, confirm_recv = 0; phs_hdr = skb_push(skb, sizeof(*phs_hdr)); @@ -257,22 +256,16 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock, err = -EMSGSIZE; goto err_free; } - skb_trim(skb, skb->dev->mtu); + err = pskb_trim(skb, skb->dev->mtu); + if (err) + goto err_free; } skb->protocol = cpu_to_be16(ETH_P_AF_IUCV); - __skb_header_release(skb); - nskb = skb_clone(skb, GFP_ATOMIC); - if (!nskb) { - err = -ENOMEM; - goto err_free; - } - - skb_queue_tail(&iucv->send_skb_q, nskb); + atomic_inc(&iucv->skbs_in_xmit); err = dev_queue_xmit(skb); if (net_xmit_eval(err)) 
{ - skb_unlink(nskb, &iucv->send_skb_q); - kfree_skb(nskb); + atomic_dec(&iucv->skbs_in_xmit); } else { atomic_sub(confirm_recv, &iucv->msg_recv); WARN_ON(atomic_read(&iucv->msg_recv) < 0); @@ -424,7 +417,7 @@ static void iucv_sock_close(struct sock *sk) sk->sk_state = IUCV_CLOSING; sk->sk_state_change(sk); - if (!err && !skb_queue_empty(&iucv->send_skb_q)) { + if (!err && atomic_read(&iucv->skbs_in_xmit) > 0) { if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) timeo = sk->sk_lingertime; else @@ -491,6 +484,7 @@ static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, atomic_set(&iucv->pendings, 0); iucv->flags = 0; iucv->msglimit = 0; + atomic_set(&iucv->skbs_in_xmit, 0); atomic_set(&iucv->msg_sent, 0); atomic_set(&iucv->msg_recv, 0); iucv->path = NULL; @@ -587,7 +581,7 @@ static void __iucv_auto_name(struct iucv_sock *iucv) static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { - struct sockaddr_iucv *sa = (struct sockaddr_iucv *) addr; + DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr); char uid[sizeof(sa->siucv_user_id)]; struct sock *sk = sock->sk; struct iucv_sock *iucv; @@ -691,7 +685,7 @@ static int iucv_sock_autobind(struct sock *sk) static int afiucv_path_connect(struct socket *sock, struct sockaddr *addr) { - struct sockaddr_iucv *sa = (struct sockaddr_iucv *) addr; + DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr); struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); unsigned char user_data[16]; @@ -738,7 +732,7 @@ done: static int iucv_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { - struct sockaddr_iucv *sa = (struct sockaddr_iucv *) addr; + DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr); struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); int err; @@ -874,7 +868,7 @@ done: static int iucv_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { - struct sockaddr_iucv *siucv = (struct sockaddr_iucv *) addr; + 
DECLARE_SOCKADDR(struct sockaddr_iucv *, siucv, addr); struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); @@ -1004,7 +998,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (iucv->transport == AF_IUCV_TRANS_HIPER) { headroom = sizeof(struct af_iucv_trans_hdr) + LL_RESERVED_SPACE(iucv->hs_dev); - linear = len; + linear = min(len, PAGE_SIZE - headroom); } else { if (len < PAGE_SIZE) { linear = len; @@ -1055,6 +1049,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, } } else { /* Classic VM IUCV transport */ skb_queue_tail(&iucv->send_skb_q, skb); + atomic_inc(&iucv->skbs_in_xmit); if (((iucv->path->flags & IUCV_IPRMDATA) & iucv->flags) && skb->len <= 7) { @@ -1063,6 +1058,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, /* on success: there is no message_complete callback */ /* for an IPRMDATA msg; remove skb from send queue */ if (err == 0) { + atomic_dec(&iucv->skbs_in_xmit); skb_unlink(skb, &iucv->send_skb_q); kfree_skb(skb); } @@ -1071,6 +1067,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, /* IUCV_IPRMDATA path flag is set... 
sever path */ if (err == 0x15) { pr_iucv->path_sever(iucv->path, NULL); + atomic_dec(&iucv->skbs_in_xmit); skb_unlink(skb, &iucv->send_skb_q); err = -EPIPE; goto fail; @@ -1109,6 +1106,8 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, } else { err = -EPIPE; } + + atomic_dec(&iucv->skbs_in_xmit); skb_unlink(skb, &iucv->send_skb_q); goto fail; } @@ -1434,7 +1433,8 @@ static int iucv_sock_shutdown(struct socket *sock, int how) break; } - if (how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) { + if ((how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) && + sk->sk_state == IUCV_CONNECTED) { if (iucv->transport == AF_IUCV_TRANS_IUCV) { txmsg.class = 0; txmsg.tag = 0; @@ -1644,7 +1644,7 @@ static int iucv_callback_connreq(struct iucv_path *path, } /* Create the new socket */ - nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); + nsk = iucv_sock_alloc(NULL, sk->sk_protocol, GFP_ATOMIC, 0); if (!nsk) { err = pr_iucv->path_sever(path, user_data); iucv_path_free(path); @@ -1747,10 +1747,14 @@ static void iucv_callback_txdone(struct iucv_path *path, { struct sock *sk = path->private; struct sk_buff *this = NULL; - struct sk_buff_head *list = &iucv_sk(sk)->send_skb_q; + struct sk_buff_head *list; struct sk_buff *list_skb; + struct iucv_sock *iucv; unsigned long flags; + iucv = iucv_sk(sk); + list = &iucv->send_skb_q; + bh_lock_sock(sk); spin_lock_irqsave(&list->lock, flags); @@ -1760,8 +1764,11 @@ static void iucv_callback_txdone(struct iucv_path *path, break; } } - if (this) + if (this) { + atomic_dec(&iucv->skbs_in_xmit); __skb_unlink(this, list); + } + spin_unlock_irqrestore(&list->lock, flags); if (this) { @@ -1771,7 +1778,7 @@ static void iucv_callback_txdone(struct iucv_path *path, } if (sk->sk_state == IUCV_CLOSING) { - if (skb_queue_empty(&iucv_sk(sk)->send_skb_q)) { + if (atomic_read(&iucv->skbs_in_xmit) == 0) { sk->sk_state = IUCV_CLOSED; sk->sk_state_change(sk); } @@ -1850,7 +1857,7 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct 
sk_buff *skb) goto out; } - nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); + nsk = iucv_sock_alloc(NULL, sk->sk_protocol, GFP_ATOMIC, 0); bh_lock_sock(sk); if ((sk->sk_state != IUCV_LISTEN) || sk_acceptq_is_full(sk) || @@ -2035,7 +2042,6 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, char nullstring[8]; if (!pskb_may_pull(skb, sizeof(*trans_hdr))) { - WARN_ONCE(1, "AF_IUCV failed to receive skb, len=%u", skb->len); kfree_skb(skb); return NET_RX_SUCCESS; } @@ -2131,73 +2137,40 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, * afiucv_hs_callback_txnotify() - handle send notifcations from HiperSockets * transport **/ -static void afiucv_hs_callback_txnotify(struct sk_buff *skb, - enum iucv_tx_notify n) +static void afiucv_hs_callback_txnotify(struct sock *sk, enum iucv_tx_notify n) { - struct sock *isk = skb->sk; - struct sock *sk = NULL; - struct iucv_sock *iucv = NULL; - struct sk_buff_head *list; - struct sk_buff *list_skb; - struct sk_buff *nskb; - unsigned long flags; - - read_lock_irqsave(&iucv_sk_list.lock, flags); - sk_for_each(sk, &iucv_sk_list.head) - if (sk == isk) { - iucv = iucv_sk(sk); - break; - } - read_unlock_irqrestore(&iucv_sk_list.lock, flags); + struct iucv_sock *iucv = iucv_sk(sk); - if (!iucv || sock_flag(sk, SOCK_ZAPPED)) + if (sock_flag(sk, SOCK_ZAPPED)) return; - list = &iucv->send_skb_q; - spin_lock_irqsave(&list->lock, flags); - skb_queue_walk_safe(list, list_skb, nskb) { - if (skb_shinfo(list_skb) == skb_shinfo(skb)) { - switch (n) { - case TX_NOTIFY_OK: - __skb_unlink(list_skb, list); - kfree_skb(list_skb); - iucv_sock_wake_msglim(sk); - break; - case TX_NOTIFY_PENDING: - atomic_inc(&iucv->pendings); - break; - case TX_NOTIFY_DELAYED_OK: - __skb_unlink(list_skb, list); - atomic_dec(&iucv->pendings); - if (atomic_read(&iucv->pendings) <= 0) - iucv_sock_wake_msglim(sk); - kfree_skb(list_skb); - break; - case TX_NOTIFY_UNREACHABLE: - case TX_NOTIFY_DELAYED_UNREACHABLE: - case 
TX_NOTIFY_TPQFULL: /* not yet used */ - case TX_NOTIFY_GENERALERROR: - case TX_NOTIFY_DELAYED_GENERALERROR: - __skb_unlink(list_skb, list); - kfree_skb(list_skb); - if (sk->sk_state == IUCV_CONNECTED) { - sk->sk_state = IUCV_DISCONN; - sk->sk_state_change(sk); - } - break; - } - break; + switch (n) { + case TX_NOTIFY_OK: + atomic_dec(&iucv->skbs_in_xmit); + iucv_sock_wake_msglim(sk); + break; + case TX_NOTIFY_PENDING: + atomic_inc(&iucv->pendings); + break; + case TX_NOTIFY_DELAYED_OK: + atomic_dec(&iucv->skbs_in_xmit); + if (atomic_dec_return(&iucv->pendings) <= 0) + iucv_sock_wake_msglim(sk); + break; + default: + atomic_dec(&iucv->skbs_in_xmit); + if (sk->sk_state == IUCV_CONNECTED) { + sk->sk_state = IUCV_DISCONN; + sk->sk_state_change(sk); } } - spin_unlock_irqrestore(&list->lock, flags); if (sk->sk_state == IUCV_CLOSING) { - if (skb_queue_empty(&iucv_sk(sk)->send_skb_q)) { + if (atomic_read(&iucv->skbs_in_xmit) == 0) { sk->sk_state = IUCV_CLOSED; sk->sk_state_change(sk); } } - } /* diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 56dad9565bc9..d0b56ffbb057 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -786,7 +786,7 @@ static ssize_t kcm_sendpage(struct socket *sock, struct page *page, if (skb_can_coalesce(skb, i, page, offset)) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; goto coalesced; } @@ -834,7 +834,7 @@ static ssize_t kcm_sendpage(struct socket *sock, struct page *page, get_page(page); skb_fill_page_desc(skb, i, page, offset, size); - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; coalesced: skb->len += size; @@ -1496,7 +1496,7 @@ static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) return 0; out: - fput(csock->file); + sockfd_put(csock); return err; } @@ -1644,7 +1644,7 @@ static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) 
spin_unlock_bh(&mux->lock); out: - fput(csock->file); + sockfd_put(csock); return err; } diff --git a/net/key/af_key.c b/net/key/af_key.c index c12dbc51ef5f..ef9b4ac03e7b 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2902,7 +2902,7 @@ static int count_ah_combs(const struct xfrm_tmpl *t) break; if (!aalg->pfkey_supported) continue; - if (aalg_tmpl_set(t, aalg) && aalg->available) + if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } return sz + sizeof(struct sadb_prop); @@ -2920,7 +2920,7 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!ealg->pfkey_supported) continue; - if (!(ealg_tmpl_set(t, ealg) && ealg->available)) + if (!(ealg_tmpl_set(t, ealg))) continue; for (k = 1; ; k++) { @@ -2931,7 +2931,7 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!aalg->pfkey_supported) continue; - if (aalg_tmpl_set(t, aalg) && aalg->available) + if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } } diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index e5e5036257b0..96f975777438 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -606,7 +606,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); if (ipc6.tclass < 0) ipc6.tclass = np->tclass; diff --git a/net/l3mdev/Makefile b/net/l3mdev/Makefile index 59755a9e2f9b..9e7da0acc58c 100644 --- a/net/l3mdev/Makefile +++ b/net/l3mdev/Makefile @@ -3,4 +3,4 @@ # Makefile for the L3 device API # -obj-$(CONFIG_NET_L3_MASTER_DEV) += l3mdev.o +obj-y += l3mdev.o diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 864326f150e2..ad7730b68772 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -241,6 +241,7 @@ EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); * L3 master device * @net: network namespace for device index lookup * @fl: flow struct + * @arg: store 
the table the rule matched with here */ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 3c03f6512c5f..0511bbe4af7b 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -122,6 +122,8 @@ static struct lapb_cb *lapb_create_cb(void) timer_setup(&lapb->t1timer, NULL, 0); timer_setup(&lapb->t2timer, NULL, 0); + lapb->t1timer_stop = true; + lapb->t2timer_stop = true; lapb->t1 = LAPB_DEFAULT_T1; lapb->t2 = LAPB_DEFAULT_T2; @@ -129,6 +131,8 @@ static struct lapb_cb *lapb_create_cb(void) lapb->mode = LAPB_DEFAULT_MODE; lapb->window = LAPB_DEFAULT_WINDOW; lapb->state = LAPB_STATE_0; + + spin_lock_init(&lapb->lock); refcount_set(&lapb->refcnt, 1); out: return lapb; @@ -178,11 +182,23 @@ int lapb_unregister(struct net_device *dev) goto out; lapb_put(lapb); + /* Wait for other refs to "lapb" to drop */ + while (refcount_read(&lapb->refcnt) > 2) + usleep_range(1, 10); + + spin_lock_bh(&lapb->lock); + lapb_stop_t1timer(lapb); lapb_stop_t2timer(lapb); lapb_clear_queues(lapb); + spin_unlock_bh(&lapb->lock); + + /* Wait for running timers to stop */ + del_timer_sync(&lapb->t1timer); + del_timer_sync(&lapb->t2timer); + __lapb_remove_cb(lapb); lapb_put(lapb); @@ -201,6 +217,8 @@ int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms) if (!lapb) goto out; + spin_lock_bh(&lapb->lock); + parms->t1 = lapb->t1 / HZ; parms->t2 = lapb->t2 / HZ; parms->n2 = lapb->n2; @@ -219,6 +237,7 @@ int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms) else parms->t2timer = (lapb->t2timer.expires - jiffies) / HZ; + spin_unlock_bh(&lapb->lock); lapb_put(lapb); rc = LAPB_OK; out: @@ -234,6 +253,8 @@ int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms) if (!lapb) goto out; + spin_lock_bh(&lapb->lock); + rc = LAPB_INVALUE; if (parms->t1 < 1 || parms->t2 < 1 || parms->n2 < 1) goto out_put; @@ -256,6 +277,7 @@ int lapb_setparms(struct net_device *dev, struct 
lapb_parms_struct *parms) rc = LAPB_OK; out_put: + spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; @@ -270,6 +292,8 @@ int lapb_connect_request(struct net_device *dev) if (!lapb) goto out; + spin_lock_bh(&lapb->lock); + rc = LAPB_OK; if (lapb->state == LAPB_STATE_1) goto out_put; @@ -285,24 +309,18 @@ int lapb_connect_request(struct net_device *dev) rc = LAPB_OK; out_put: + spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_connect_request); -int lapb_disconnect_request(struct net_device *dev) +static int __lapb_disconnect_request(struct lapb_cb *lapb) { - struct lapb_cb *lapb = lapb_devtostruct(dev); - int rc = LAPB_BADTOKEN; - - if (!lapb) - goto out; - switch (lapb->state) { case LAPB_STATE_0: - rc = LAPB_NOTCONNECTED; - goto out_put; + return LAPB_NOTCONNECTED; case LAPB_STATE_1: lapb_dbg(1, "(%p) S1 TX DISC(1)\n", lapb->dev); @@ -310,12 +328,10 @@ int lapb_disconnect_request(struct net_device *dev) lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); lapb->state = LAPB_STATE_0; lapb_start_t1timer(lapb); - rc = LAPB_NOTCONNECTED; - goto out_put; + return LAPB_NOTCONNECTED; case LAPB_STATE_2: - rc = LAPB_OK; - goto out_put; + return LAPB_OK; } lapb_clear_queues(lapb); @@ -328,8 +344,22 @@ int lapb_disconnect_request(struct net_device *dev) lapb_dbg(1, "(%p) S3 DISC(1)\n", lapb->dev); lapb_dbg(0, "(%p) S3 -> S2\n", lapb->dev); - rc = LAPB_OK; -out_put: + return LAPB_OK; +} + +int lapb_disconnect_request(struct net_device *dev) +{ + struct lapb_cb *lapb = lapb_devtostruct(dev); + int rc = LAPB_BADTOKEN; + + if (!lapb) + goto out; + + spin_lock_bh(&lapb->lock); + + rc = __lapb_disconnect_request(lapb); + + spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; @@ -344,6 +374,8 @@ int lapb_data_request(struct net_device *dev, struct sk_buff *skb) if (!lapb) goto out; + spin_lock_bh(&lapb->lock); + rc = LAPB_NOTCONNECTED; if (lapb->state != LAPB_STATE_3 && lapb->state != LAPB_STATE_4) goto out_put; @@ -352,6 
+384,7 @@ int lapb_data_request(struct net_device *dev, struct sk_buff *skb) lapb_kick(lapb); rc = LAPB_OK; out_put: + spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; @@ -364,7 +397,9 @@ int lapb_data_received(struct net_device *dev, struct sk_buff *skb) int rc = LAPB_BADTOKEN; if (lapb) { + spin_lock_bh(&lapb->lock); lapb_data_input(lapb, skb); + spin_unlock_bh(&lapb->lock); lapb_put(lapb); rc = LAPB_OK; } @@ -418,14 +453,98 @@ int lapb_data_transmit(struct lapb_cb *lapb, struct sk_buff *skb) return used; } +/* Handle device status changes. */ +static int lapb_device_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct lapb_cb *lapb; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + if (dev->type != ARPHRD_X25) + return NOTIFY_DONE; + + lapb = lapb_devtostruct(dev); + if (!lapb) + return NOTIFY_DONE; + + spin_lock_bh(&lapb->lock); + + switch (event) { + case NETDEV_UP: + lapb_dbg(0, "(%p) Interface up: %s\n", dev, dev->name); + + if (netif_carrier_ok(dev)) { + lapb_dbg(0, "(%p): Carrier is already up: %s\n", dev, + dev->name); + if (lapb->mode & LAPB_DCE) { + lapb_start_t1timer(lapb); + } else { + if (lapb->state == LAPB_STATE_0) { + lapb->state = LAPB_STATE_1; + lapb_establish_data_link(lapb); + } + } + } + break; + case NETDEV_GOING_DOWN: + if (netif_carrier_ok(dev)) + __lapb_disconnect_request(lapb); + break; + case NETDEV_DOWN: + lapb_dbg(0, "(%p) Interface down: %s\n", dev, dev->name); + lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state); + lapb_clear_queues(lapb); + lapb->state = LAPB_STATE_0; + lapb->n2count = 0; + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + break; + case NETDEV_CHANGE: + if (netif_carrier_ok(dev)) { + lapb_dbg(0, "(%p): Carrier detected: %s\n", dev, + dev->name); + if (lapb->mode & LAPB_DCE) { + lapb_start_t1timer(lapb); + } else { + if (lapb->state == LAPB_STATE_0) { + lapb->state = LAPB_STATE_1; + 
lapb_establish_data_link(lapb); + } + } + } else { + lapb_dbg(0, "(%p) Carrier lost: %s\n", dev, dev->name); + lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state); + lapb_clear_queues(lapb); + lapb->state = LAPB_STATE_0; + lapb->n2count = 0; + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + } + break; + } + + spin_unlock_bh(&lapb->lock); + lapb_put(lapb); + return NOTIFY_DONE; +} + +static struct notifier_block lapb_dev_notifier = { + .notifier_call = lapb_device_event, +}; + static int __init lapb_init(void) { - return 0; + return register_netdevice_notifier(&lapb_dev_notifier); } static void __exit lapb_exit(void) { WARN_ON(!list_empty(&lapb_list)); + + unregister_netdevice_notifier(&lapb_dev_notifier); } MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); diff --git a/net/lapb/lapb_out.c b/net/lapb/lapb_out.c index 7a4d0715d1c3..a966d29c772d 100644 --- a/net/lapb/lapb_out.c +++ b/net/lapb/lapb_out.c @@ -82,7 +82,8 @@ void lapb_kick(struct lapb_cb *lapb) skb = skb_dequeue(&lapb->write_queue); do { - if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skbn = skb_copy(skb, GFP_ATOMIC); + if (!skbn) { skb_queue_head(&lapb->write_queue, skb); break; } diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c index 8f5b17001a07..0230b272b7d1 100644 --- a/net/lapb/lapb_timer.c +++ b/net/lapb/lapb_timer.c @@ -40,6 +40,7 @@ void lapb_start_t1timer(struct lapb_cb *lapb) lapb->t1timer.function = lapb_t1timer_expiry; lapb->t1timer.expires = jiffies + lapb->t1; + lapb->t1timer_stop = false; add_timer(&lapb->t1timer); } @@ -50,16 +51,19 @@ void lapb_start_t2timer(struct lapb_cb *lapb) lapb->t2timer.function = lapb_t2timer_expiry; lapb->t2timer.expires = jiffies + lapb->t2; + lapb->t2timer_stop = false; add_timer(&lapb->t2timer); } void lapb_stop_t1timer(struct lapb_cb *lapb) { + lapb->t1timer_stop = true; del_timer(&lapb->t1timer); } void lapb_stop_t2timer(struct lapb_cb *lapb) { + lapb->t2timer_stop = true; del_timer(&lapb->t2timer); } @@ -72,24 +76,46 @@ 
static void lapb_t2timer_expiry(struct timer_list *t) { struct lapb_cb *lapb = from_timer(lapb, t, t2timer); + spin_lock_bh(&lapb->lock); + if (timer_pending(&lapb->t2timer)) /* A new timer has been set up */ + goto out; + if (lapb->t2timer_stop) /* The timer has been stopped */ + goto out; + if (lapb->condition & LAPB_ACK_PENDING_CONDITION) { lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; lapb_timeout_response(lapb); } + +out: + spin_unlock_bh(&lapb->lock); } static void lapb_t1timer_expiry(struct timer_list *t) { struct lapb_cb *lapb = from_timer(lapb, t, t1timer); + spin_lock_bh(&lapb->lock); + if (timer_pending(&lapb->t1timer)) /* A new timer has been set up */ + goto out; + if (lapb->t1timer_stop) /* The timer has been stopped */ + goto out; + switch (lapb->state) { /* - * If we are a DCE, keep going DM .. DM .. DM + * If we are a DCE, send DM up to N2 times, then switch to + * STATE_1 and send SABM(E). */ case LAPB_STATE_0: - if (lapb->mode & LAPB_DCE) + if (lapb->mode & LAPB_DCE && + lapb->n2count != lapb->n2) { + lapb->n2count++; lapb_send_control(lapb, LAPB_DM, LAPB_POLLOFF, LAPB_RESPONSE); + } else { + lapb->state = LAPB_STATE_1; + lapb_establish_data_link(lapb); + } break; /* @@ -101,7 +127,7 @@ static void lapb_t1timer_expiry(struct timer_list *t) lapb->state = LAPB_STATE_0; lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S1 -> S0\n", lapb->dev); - return; + goto out; } else { lapb->n2count++; if (lapb->mode & LAPB_EXTENDED) { @@ -125,7 +151,7 @@ static void lapb_t1timer_expiry(struct timer_list *t) lapb->state = LAPB_STATE_0; lapb_disconnect_confirmation(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S2 -> S0\n", lapb->dev); - return; + goto out; } else { lapb->n2count++; lapb_dbg(1, "(%p) S2 TX DISC(1)\n", lapb->dev); @@ -143,7 +169,7 @@ static void lapb_t1timer_expiry(struct timer_list *t) lapb_stop_t2timer(lapb); lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S3 -> S0\n", lapb->dev); - return; + goto out; } else { 
lapb->n2count++; lapb_requeue_frames(lapb); @@ -160,7 +186,7 @@ static void lapb_t1timer_expiry(struct timer_list *t) lapb->state = LAPB_STATE_0; lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S4 -> S0\n", lapb->dev); - return; + goto out; } else { lapb->n2count++; lapb_transmit_frmr(lapb); @@ -169,4 +195,7 @@ static void lapb_t1timer_expiry(struct timer_list *t) } lapb_start_t1timer(lapb); + +out: + spin_unlock_bh(&lapb->lock); } diff --git a/net/llc/Kconfig b/net/llc/Kconfig index b0e646ac47eb..7f79f5e134f9 100644 --- a/net/llc/Kconfig +++ b/net/llc/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config LLC tristate - depends on NET config LLC2 tristate "ANSI/IEEE 802.2 LLC type 2 Support" diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 1144cda2a0fc..912aa9bd5e29 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -909,6 +909,8 @@ static void llc_sk_init(struct sock *sk) * @net: network namespace * @family: upper layer protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * @prot: struct proto associated with this new sock instance + * @kern: is this to be a kernel socket? * * Allocates a LLC sock and initializes it. 
Returns the new LLC sock * or %NULL if there's no memory available for one diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index cd9a9bd242ba..51ec8256b7fa 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -69,7 +69,7 @@ config MAC80211_MESH config MAC80211_LEDS bool "Enable LED triggers" depends on MAC80211 - depends on LEDS_CLASS + depends on LEDS_CLASS=y || LEDS_CLASS=MAC80211 select LEDS_TRIGGERS help This option enables a few LED triggers for different diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index ad04c361cba5..23d25e8b2358 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -56,11 +56,9 @@ mac80211-$(CONFIG_PM) += pm.o CFLAGS_trace.o := -I$(src) rc80211_minstrel-y := \ - rc80211_minstrel.o \ rc80211_minstrel_ht.o rc80211_minstrel-$(CONFIG_MAC80211_DEBUGFS) += \ - rc80211_minstrel_debugfs.o \ rc80211_minstrel_ht_debugfs.o mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y) diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index cd4cf84a7f99..cce28e3b2232 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -250,10 +250,10 @@ static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid, mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP; mgmt->u.action.u.addba_resp.dialog_token = dialog_token; - capab = (u16)(amsdu << 0); /* bit 0 A-MSDU support */ - capab |= (u16)(policy << 1); /* bit 1 aggregation policy */ - capab |= (u16)(tid << 2); /* bit 5:2 TID number */ - capab |= (u16)(buf_size << 6); /* bit 15:6 max size of aggregation */ + capab = u16_encode_bits(amsdu, IEEE80211_ADDBA_PARAM_AMSDU_MASK); + capab |= u16_encode_bits(policy, IEEE80211_ADDBA_PARAM_POLICY_MASK); + capab |= u16_encode_bits(tid, IEEE80211_ADDBA_PARAM_TID_MASK); + capab |= u16_encode_bits(buf_size, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK); mgmt->u.action.u.addba_resp.capab = cpu_to_le16(capab); mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout); diff --git 
a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index b37c8a983d88..430a58587538 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -95,10 +95,10 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, mgmt->u.action.u.addba_req.action_code = WLAN_ACTION_ADDBA_REQ; mgmt->u.action.u.addba_req.dialog_token = dialog_token; - capab = (u16)(1 << 0); /* bit 0 A-MSDU support */ - capab |= (u16)(1 << 1); /* bit 1 aggregation policy */ - capab |= (u16)(tid << 2); /* bit 5:2 TID number */ - capab |= (u16)(agg_size << 6); /* bit 15:6 max size of aggergation */ + capab = IEEE80211_ADDBA_PARAM_AMSDU_MASK; + capab |= IEEE80211_ADDBA_PARAM_POLICY_MASK; + capab |= u16_encode_bits(tid, IEEE80211_ADDBA_PARAM_TID_MASK); + capab |= u16_encode_bits(agg_size, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK); mgmt->u.action.u.addba_req.capab = cpu_to_le16(capab); @@ -950,8 +950,8 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab); amsdu = capab & IEEE80211_ADDBA_PARAM_AMSDU_MASK; - tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; - buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; + tid = u16_get_bits(capab, IEEE80211_ADDBA_PARAM_TID_MASK); + buf_size = u16_get_bits(capab, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK); buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes); txq = sta->sta.txq[tid]; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 7276e66ae435..c4c70e30ad7f 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -405,6 +405,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, case WLAN_CIPHER_SUITE_WEP104: if (WARN_ON_ONCE(fips_enabled)) return -EINVAL; + break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: @@ -1121,10 +1122,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.enable_beacon = true; 
sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p; sdata->vif.bss_conf.twt_responder = params->twt_responder; - memcpy(&sdata->vif.bss_conf.he_obss_pd, &params->he_obss_pd, - sizeof(struct ieee80211_he_obss_pd)); - memcpy(&sdata->vif.bss_conf.he_bss_color, &params->he_bss_color, - sizeof(struct ieee80211_he_bss_color)); + sdata->vif.bss_conf.he_obss_pd = params->he_obss_pd; + sdata->vif.bss_conf.he_bss_color = params->he_bss_color; sdata->vif.bss_conf.s1g = params->chandef.chan->band == NL80211_BAND_S1GHZ; @@ -2708,16 +2707,6 @@ static int ieee80211_get_tx_power(struct wiphy *wiphy, return 0; } -static int ieee80211_set_wds_peer(struct wiphy *wiphy, struct net_device *dev, - const u8 *addr) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - - memcpy(&sdata->u.wds.remote_addr, addr, ETH_ALEN); - - return 0; -} - static void ieee80211_rfkill_poll(struct wiphy *wiphy) { struct ieee80211_local *local = wiphy_priv(wiphy); @@ -3307,6 +3296,7 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, if (cfg80211_get_chandef_type(&params->chandef) != cfg80211_get_chandef_type(&sdata->u.ibss.chandef)) return -EINVAL; + break; case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20_NOHT: @@ -3458,7 +3448,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, IEEE80211_QUEUE_STOP_REASON_CSA); cfg80211_ch_switch_started_notify(sdata->dev, &sdata->csa_chandef, - params->count); + params->count, params->block_tx); if (changed) { ieee80211_bss_info_change_notify(sdata, changed); @@ -4083,6 +4073,17 @@ static int ieee80211_reset_tid_config(struct wiphy *wiphy, return ret; } +static int ieee80211_set_sar_specs(struct wiphy *wiphy, + struct cfg80211_sar_specs *sar) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + + if (!local->ops->set_sar_specs) + return -EOPNOTSUPP; + + return local->ops->set_sar_specs(&local->hw, sar); +} + const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf
= ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -4138,7 +4139,6 @@ const struct cfg80211_ops mac80211_config_ops = { .set_wiphy_params = ieee80211_set_wiphy_params, .set_tx_power = ieee80211_set_tx_power, .get_tx_power = ieee80211_get_tx_power, - .set_wds_peer = ieee80211_set_wds_peer, .rfkill_poll = ieee80211_rfkill_poll, CFG80211_TESTMODE_CMD(ieee80211_testmode_cmd) CFG80211_TESTMODE_DUMP(ieee80211_testmode_dump) @@ -4186,4 +4186,5 @@ const struct cfg80211_ops mac80211_config_ops = { .probe_mesh_link = ieee80211_probe_mesh_link, .set_tid_config = ieee80211_set_tid_config, .reset_tid_config = ieee80211_reset_tid_config, + .set_sar_specs = ieee80211_set_sar_specs, }; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 8f48aff74c7b..907bb1f748a1 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -9,6 +9,7 @@ #include <net/cfg80211.h> #include "ieee80211_i.h" #include "driver-ops.h" +#include "rate.h" static int ieee80211_chanctx_num_assigned(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) @@ -191,11 +192,13 @@ ieee80211_find_reservation_chanctx(struct ieee80211_local *local, return NULL; } -enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta) +static enum nl80211_chan_width ieee80211_get_sta_bw(struct sta_info *sta) { - switch (sta->bandwidth) { + enum ieee80211_sta_rx_bandwidth width = ieee80211_sta_cap_rx_bw(sta); + + switch (width) { case IEEE80211_STA_RX_BW_20: - if (sta->ht_cap.ht_supported) + if (sta->sta.ht_cap.ht_supported) return NL80211_CHAN_WIDTH_20; else return NL80211_CHAN_WIDTH_20_NOHT; @@ -232,7 +235,7 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata) !(sta->sdata->bss && sta->sdata->bss == sdata->bss)) continue; - max_bw = max(max_bw, ieee80211_get_sta_bw(&sta->sta)); + max_bw = max(max_bw, ieee80211_get_sta_bw(sta)); } rcu_read_unlock(); @@ -275,11 +278,11 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, case NL80211_IFTYPE_NAN: 
continue; case NL80211_IFTYPE_ADHOC: - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_OCB: width = vif->bss_conf.chandef.width; break; + case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_MONITOR: @@ -343,10 +346,42 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, drv_change_chanctx(local, ctx, IEEE80211_CHANCTX_CHANGE_MIN_WIDTH); } +static void ieee80211_chan_bw_change(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) +{ + struct sta_info *sta; + struct ieee80211_supported_band *sband = + local->hw.wiphy->bands[ctx->conf.def.chan->band]; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &local->sta_list, + list) { + enum ieee80211_sta_rx_bandwidth new_sta_bw; + + if (!ieee80211_sdata_running(sta->sdata)) + continue; + + if (rcu_access_pointer(sta->sdata->vif.chanctx_conf) != + &ctx->conf) + continue; + + new_sta_bw = ieee80211_sta_cur_vht_bw(sta); + if (new_sta_bw == sta->sta.bandwidth) + continue; + + sta->sta.bandwidth = new_sta_bw; + rate_control_rate_update(local, sband, sta, + IEEE80211_RC_BW_CHANGED); + } + rcu_read_unlock(); +} + static void ieee80211_change_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, const struct cfg80211_chan_def *chandef) { + enum nl80211_chan_width width; + if (cfg80211_chandef_identical(&ctx->conf.def, chandef)) { ieee80211_recalc_chanctx_min_def(local, ctx); return; @@ -354,7 +389,25 @@ static void ieee80211_change_chanctx(struct ieee80211_local *local, WARN_ON(!cfg80211_chandef_compatible(&ctx->conf.def, chandef)); + width = ctx->conf.def.width; ctx->conf.def = *chandef; + + /* expected to handle only 20/40/80/160 channel widths */ + switch (chandef->width) { + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + case NL80211_CHAN_WIDTH_40: + case NL80211_CHAN_WIDTH_80: + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_160: + break; + default: + WARN_ON(1); + } + + if 
(chandef->width < width) + ieee80211_chan_bw_change(local, ctx); + drv_change_chanctx(local, ctx, IEEE80211_CHANCTX_CHANGE_WIDTH); ieee80211_recalc_chanctx_min_def(local, ctx); @@ -362,6 +415,9 @@ static void ieee80211_change_chanctx(struct ieee80211_local *local, local->_oper_chandef = *chandef; ieee80211_hw_config(local, 0); } + + if (chandef->width > width) + ieee80211_chan_bw_change(local, ctx); } static struct ieee80211_chanctx * @@ -743,7 +799,6 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, continue; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_ADHOC: - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_OCB: break; @@ -1052,8 +1107,14 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata) if (WARN_ON(!chandef)) return -EINVAL; + if (old_ctx->conf.def.width > new_ctx->conf.def.width) + ieee80211_chan_bw_change(local, new_ctx); + ieee80211_change_chanctx(local, new_ctx, chandef); + if (old_ctx->conf.def.width < new_ctx->conf.def.width) + ieee80211_chan_bw_change(local, new_ctx); + vif_chsw[0].vif = &sdata->vif; vif_chsw[0].old_ctx = &old_ctx->conf; vif_chsw[0].new_ctx = &new_ctx->conf; @@ -1444,6 +1505,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) ieee80211_recalc_smps_chanctx(local, ctx); ieee80211_recalc_radar_chanctx(local, ctx); ieee80211_recalc_chanctx_min_def(local, ctx); + ieee80211_chan_bw_change(local, ctx); list_for_each_entry_safe(sdata, sdata_tmp, &ctx->reserved_vifs, reserved_chanctx_list) { diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 90470392fdaa..5296898875ff 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -53,7 +53,7 @@ static const struct file_operations name## _ops = { \ DEBUGFS_READONLY_FILE_OPS(name) #define DEBUGFS_ADD(name) \ - debugfs_create_file(#name, 0400, phyd, local, &name## _ops); + debugfs_create_file(#name, 0400, phyd, local, &name## _ops) #define DEBUGFS_ADD_MODE(name, mode) \ 
debugfs_create_file(#name, mode, phyd, local, &name## _ops); @@ -120,18 +120,17 @@ static ssize_t aqm_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[100]; - size_t len; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; - len = strlen(buf); - if (len > 0 && buf[len-1] == '\n') - buf[len-1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1) return count; @@ -177,18 +176,17 @@ static ssize_t airtime_flags_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[16]; - size_t len; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = 0; - len = strlen(buf); - if (len > 0 && buf[len - 1] == '\n') - buf[len - 1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (kstrtou16(buf, 0, &local->airtime_flags)) return -EINVAL; @@ -237,20 +235,19 @@ static ssize_t aql_txq_limit_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[100]; - size_t len; u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old; struct sta_info *sta; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = 0; - len = strlen(buf); - if (len > 0 && buf[len - 1] == '\n') - buf[len - 1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3) return -EINVAL; @@ -284,6 +281,56 @@ static const struct file_operations aql_txq_limit_ops = { .llseek = default_llseek, }; +static ssize_t aql_enable_read(struct file *file, char __user 
*user_buf, + size_t count, loff_t *ppos) +{ + char buf[3]; + int len; + + len = scnprintf(buf, sizeof(buf), "%d\n", + !static_key_false(&aql_disable.key)); + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t aql_enable_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + bool aql_disabled = static_key_false(&aql_disable.key); + char buf[3]; + size_t len; + + if (count > sizeof(buf)) + return -EINVAL; + + if (copy_from_user(buf, user_buf, count)) + return -EFAULT; + + buf[sizeof(buf) - 1] = '\0'; + len = strlen(buf); + if (len > 0 && buf[len - 1] == '\n') + buf[len - 1] = 0; + + if (buf[0] == '0' && buf[1] == '\0') { + if (!aql_disabled) + static_branch_inc(&aql_disable); + } else if (buf[0] == '1' && buf[1] == '\0') { + if (aql_disabled) + static_branch_dec(&aql_disable); + } else { + return -EINVAL; + } + + return count; +} + +static const struct file_operations aql_enable_ops = { + .write = aql_enable_write, + .read = aql_enable_read, + .open = simple_open, + .llseek = default_llseek, +}; + static ssize_t force_tx_status_read(struct file *file, char __user *user_buf, size_t count, @@ -306,18 +353,17 @@ static ssize_t force_tx_status_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[3]; - size_t len; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; - len = strlen(buf); - if (len > 0 && buf[len - 1] == '\n') - buf[len - 1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (buf[0] == '0' && buf[1] == '\0') local->force_tx_status = 0; @@ -409,6 +455,7 @@ static const char *hw_flag_names[] = { FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID), FLAG(AMPDU_KEYBORDER_SUPPORT), FLAG(SUPPORTS_TX_ENCAP_OFFLOAD), + FLAG(SUPPORTS_RX_DECAP_OFFLOAD), #undef FLAG }; @@ -572,6 +619,7 @@ void debugfs_hw_add(struct 
ieee80211_local *local) DEBUGFS_ADD(power); DEBUGFS_ADD(hw_conf); DEBUGFS_ADD_MODE(force_tx_status, 0600); + DEBUGFS_ADD_MODE(aql_enable, 0600); if (local->ops->wake_tx_queue) DEBUGFS_ADD_MODE(aqm, 0600); diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index 98a713475e0f..f53dec8a3d5c 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -319,7 +319,7 @@ KEY_OPS(key); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, key->debugfs.dir, \ - key, &key_##name##_ops); + key, &key_##name##_ops) #define DEBUGFS_ADD_W(name) \ debugfs_create_file(#name, 0600, key->debugfs.dir, \ key, &key_##name##_ops); diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index fe8a7a87e513..0ad3860852ff 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -574,9 +574,6 @@ static ssize_t ieee80211_if_parse_tsf( IEEE80211_IF_FILE_RW(tsf); -/* WDS attributes */ -IEEE80211_IF_FILE(peer, u.wds.remote_addr, MAC); - #ifdef CONFIG_MAC80211_MESH IEEE80211_IF_FILE(estab_plinks, u.mesh.estab_plinks, ATOMIC); @@ -645,7 +642,7 @@ IEEE80211_IF_FILE(dot11MeshConnectedToAuthServer, #define DEBUGFS_ADD_MODE(name, mode) \ debugfs_create_file(#name, mode, sdata->vif.debugfs_dir, \ - sdata, &name##_ops); + sdata, &name##_ops) #define DEBUGFS_ADD(name) DEBUGFS_ADD_MODE(name, 0400) @@ -701,11 +698,6 @@ static void add_ibss_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD_MODE(tsf, 0600); } -static void add_wds_files(struct ieee80211_sub_if_data *sdata) -{ - DEBUGFS_ADD(peer); -} - #ifdef CONFIG_MAC80211_MESH static void add_mesh_files(struct ieee80211_sub_if_data *sdata) @@ -719,7 +711,7 @@ static void add_mesh_stats(struct ieee80211_sub_if_data *sdata) struct dentry *dir = debugfs_create_dir("mesh_stats", sdata->vif.debugfs_dir); #define MESHSTATS_ADD(name)\ - debugfs_create_file(#name, 0400, dir, sdata, &name##_ops); + debugfs_create_file(#name, 0400, dir, sdata, &name##_ops) 
MESHSTATS_ADD(fwded_mcast); MESHSTATS_ADD(fwded_unicast); @@ -736,7 +728,7 @@ static void add_mesh_config(struct ieee80211_sub_if_data *sdata) sdata->vif.debugfs_dir); #define MESHPARAMS_ADD(name) \ - debugfs_create_file(#name, 0600, dir, sdata, &name##_ops); + debugfs_create_file(#name, 0600, dir, sdata, &name##_ops) MESHPARAMS_ADD(dot11MeshMaxRetries); MESHPARAMS_ADD(dot11MeshRetryTimeout); @@ -805,9 +797,6 @@ static void add_files(struct ieee80211_sub_if_data *sdata) case NL80211_IFTYPE_AP_VLAN: add_vlan_files(sdata); break; - case NL80211_IFTYPE_WDS: - add_wds_files(sdata); - break; default: break; } diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 829dcad69c2c..5a27c61a7b38 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -79,6 +79,7 @@ static const char * const sta_flag_names[] = { FLAG(MPSP_RECIPIENT), FLAG(PS_DELIVER), FLAG(USES_ENCRYPTION), + FLAG(DECAP_OFFLOAD), #undef FLAG }; @@ -274,7 +275,7 @@ static ssize_t sta_aql_read(struct file *file, char __user *userbuf, "Q limit[low/high]: VO: %u/%u VI: %u/%u BE: %u/%u BK: %u/%u\n", q_depth[0], q_depth[1], q_depth[2], q_depth[3], q_limit_l[0], q_limit_h[0], q_limit_l[1], q_limit_h[1], - q_limit_l[2], q_limit_h[2], q_limit_l[3], q_limit_h[3]), + q_limit_l[2], q_limit_h[2], q_limit_l[3], q_limit_h[3]); rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); @@ -985,7 +986,7 @@ STA_OPS(he_capa); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, \ - sta->debugfs_dir, sta, &sta_ ##name## _ops); + sta->debugfs_dir, sta, &sta_ ##name## _ops) #define DEBUGFS_ADD_COUNTER(name, field) \ debugfs_create_ulong(#name, 0400, sta->debugfs_dir, &sta->field); diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index c9a8a2433e8a..48322e45e7dd 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -125,8 +125,11 @@ int drv_sta_state(struct ieee80211_local *local, } else if (old_state == IEEE80211_STA_AUTH && 
new_state == IEEE80211_STA_ASSOC) { ret = drv_sta_add(local, sdata, &sta->sta); - if (ret == 0) + if (ret == 0) { sta->uploaded = true; + if (rcu_access_pointer(sta->sta.rates)) + drv_sta_rate_tbl_update(local, sdata, &sta->sta); + } } else if (old_state == IEEE80211_STA_ASSOC && new_state == IEEE80211_STA_AUTH) { drv_sta_remove(local, sdata, &sta->sta); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index bcdfd19a596b..604ca59937f0 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1413,4 +1413,20 @@ static inline void drv_sta_set_4addr(struct ieee80211_local *local, trace_drv_return_void(local); } +static inline void drv_sta_set_decap_offload(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, + bool enabled) +{ + sdata = get_bss_sdata(sdata); + if (!check_sdata_in_driver(sdata)) + return; + + trace_drv_sta_set_decap_offload(local, sdata, sta, enabled); + if (local->ops->sta_set_decap_offload) + local->ops->sta_set_decap_offload(&local->hw, &sdata->vif, sta, + enabled); + trace_drv_return_void(local); +} + #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/he.c b/net/mac80211/he.c index cc26f239838b..0c0b970835ce 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -52,6 +52,57 @@ ieee80211_update_from_he_6ghz_capa(const struct ieee80211_he_6ghz_capa *he_6ghz_ sta->sta.he_6ghz_capa = *he_6ghz_capa; } +static void ieee80211_he_mcs_disable(__le16 *he_mcs) +{ + u32 i; + + for (i = 0; i < 8; i++) + *he_mcs |= cpu_to_le16(IEEE80211_HE_MCS_NOT_SUPPORTED << i * 2); +} + +static void ieee80211_he_mcs_intersection(__le16 *he_own_rx, __le16 *he_peer_rx, + __le16 *he_own_tx, __le16 *he_peer_tx) +{ + u32 i; + u16 own_rx, own_tx, peer_rx, peer_tx; + + for (i = 0; i < 8; i++) { + own_rx = le16_to_cpu(*he_own_rx); + own_rx = (own_rx >> i * 2) & IEEE80211_HE_MCS_NOT_SUPPORTED; + + own_tx = le16_to_cpu(*he_own_tx); + own_tx = (own_tx >> i * 2) & 
IEEE80211_HE_MCS_NOT_SUPPORTED; + + peer_rx = le16_to_cpu(*he_peer_rx); + peer_rx = (peer_rx >> i * 2) & IEEE80211_HE_MCS_NOT_SUPPORTED; + + peer_tx = le16_to_cpu(*he_peer_tx); + peer_tx = (peer_tx >> i * 2) & IEEE80211_HE_MCS_NOT_SUPPORTED; + + if (peer_tx != IEEE80211_HE_MCS_NOT_SUPPORTED) { + if (own_rx == IEEE80211_HE_MCS_NOT_SUPPORTED) + peer_tx = IEEE80211_HE_MCS_NOT_SUPPORTED; + else if (own_rx < peer_tx) + peer_tx = own_rx; + } + + if (peer_rx != IEEE80211_HE_MCS_NOT_SUPPORTED) { + if (own_tx == IEEE80211_HE_MCS_NOT_SUPPORTED) + peer_rx = IEEE80211_HE_MCS_NOT_SUPPORTED; + else if (own_tx < peer_rx) + peer_rx = own_tx; + } + + *he_peer_rx &= + ~cpu_to_le16(IEEE80211_HE_MCS_NOT_SUPPORTED << i * 2); + *he_peer_rx |= cpu_to_le16(peer_rx << i * 2); + + *he_peer_tx &= + ~cpu_to_le16(IEEE80211_HE_MCS_NOT_SUPPORTED << i * 2); + *he_peer_tx |= cpu_to_le16(peer_tx << i * 2); + } +} + void ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, @@ -60,10 +111,12 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap; + struct ieee80211_sta_he_cap own_he_cap = sband->iftype_data->he_cap; struct ieee80211_he_cap_elem *he_cap_ie_elem = (void *)he_cap_ie; u8 he_ppe_size; u8 mcs_nss_size; u8 he_total_size; + bool own_160, peer_160, own_80p80, peer_80p80; memset(he_cap, 0, sizeof(*he_cap)); @@ -101,6 +154,45 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, if (sband->band == NL80211_BAND_6GHZ && he_6ghz_capa) ieee80211_update_from_he_6ghz_capa(he_6ghz_capa, sta); + + ieee80211_he_mcs_intersection(&own_he_cap.he_mcs_nss_supp.rx_mcs_80, + &he_cap->he_mcs_nss_supp.rx_mcs_80, + &own_he_cap.he_mcs_nss_supp.tx_mcs_80, + &he_cap->he_mcs_nss_supp.tx_mcs_80); + + own_160 = own_he_cap.he_cap_elem.phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; + peer_160 = 
he_cap->he_cap_elem.phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; + + if (peer_160 && own_160) { + ieee80211_he_mcs_intersection(&own_he_cap.he_mcs_nss_supp.rx_mcs_160, + &he_cap->he_mcs_nss_supp.rx_mcs_160, + &own_he_cap.he_mcs_nss_supp.tx_mcs_160, + &he_cap->he_mcs_nss_supp.tx_mcs_160); + } else if (peer_160 && !own_160) { + ieee80211_he_mcs_disable(&he_cap->he_mcs_nss_supp.rx_mcs_160); + ieee80211_he_mcs_disable(&he_cap->he_mcs_nss_supp.tx_mcs_160); + he_cap->he_cap_elem.phy_cap_info[0] &= + ~IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; + } + + own_80p80 = own_he_cap.he_cap_elem.phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G; + peer_80p80 = he_cap->he_cap_elem.phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G; + + if (peer_80p80 && own_80p80) { + ieee80211_he_mcs_intersection(&own_he_cap.he_mcs_nss_supp.rx_mcs_80p80, + &he_cap->he_mcs_nss_supp.rx_mcs_80p80, + &own_he_cap.he_mcs_nss_supp.tx_mcs_80p80, + &he_cap->he_mcs_nss_supp.tx_mcs_80p80); + } else if (peer_80p80 && !own_80p80) { + ieee80211_he_mcs_disable(&he_cap->he_mcs_nss_supp.rx_mcs_80p80); + ieee80211_he_mcs_disable(&he_cap->he_mcs_nss_supp.tx_mcs_80p80); + he_cap->he_cap_elem.phy_cap_info[0] &= + ~IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G; + } } void diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 2a21226fb518..ecda126a7026 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -311,11 +311,6 @@ struct ieee80211_if_ap { bool multicast_to_unicast; }; -struct ieee80211_if_wds { - struct sta_info *sta; - u8 remote_addr[ETH_ALEN]; -}; - struct ieee80211_if_vlan { struct list_head list; /* write-protected with RTNL and local->mtx */ @@ -457,7 +452,9 @@ struct ieee80211_if_managed { unsigned long probe_timeout; int probe_send_count; bool nullfunc_failed; - bool connection_loss; + u8 connection_loss:1, + driver_disconnect:1, + reconnect:1; struct 
cfg80211_bss *associated; struct ieee80211_mgd_auth_data *auth_data; @@ -851,7 +848,6 @@ enum txq_info_flags { */ struct txq_info { struct fq_tin tin; - struct fq_flow def_flow; struct codel_vars def_cvars; struct codel_stats cstats; struct sk_buff_head frags; @@ -985,7 +981,6 @@ struct ieee80211_sub_if_data { union { struct ieee80211_if_ap ap; - struct ieee80211_if_wds wds; struct ieee80211_if_vlan vlan; struct ieee80211_if_managed mgd; struct ieee80211_if_ibss ibss; @@ -1082,6 +1077,7 @@ enum queue_stop_reason { IEEE80211_QUEUE_STOP_REASON_FLUSH, IEEE80211_QUEUE_STOP_REASON_TDLS_TEARDOWN, IEEE80211_QUEUE_STOP_REASON_RESERVE_TID, + IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE, IEEE80211_QUEUE_STOP_REASONS, }; @@ -1147,6 +1143,8 @@ enum mac80211_scan_state { SCAN_ABORT, }; +DECLARE_STATIC_KEY_FALSE(aql_disable); + struct ieee80211_local { /* embed the driver visible part. * don't cast (use the static inlines below), but we keep @@ -1593,13 +1591,8 @@ ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status) { WARN_ON_ONCE(status->flag & RX_FLAG_MACTIME_START && status->flag & RX_FLAG_MACTIME_END); - if (status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END)) - return true; - /* can't handle non-legacy preamble yet */ - if (status->flag & RX_FLAG_MACTIME_PLCP_START && - status->encoding == RX_ENC_LEGACY) - return true; - return false; + return !!(status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END | + RX_FLAG_MACTIME_PLCP_START)); } void ieee80211_vif_inc_num_mcast(struct ieee80211_sub_if_data *sdata); @@ -1795,7 +1788,7 @@ static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata) /* tx handling */ void ieee80211_clear_tx_pending(struct ieee80211_local *local); -void ieee80211_tx_pending(unsigned long data); +void ieee80211_tx_pending(struct tasklet_struct *t); netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, @@ -2146,7 +2139,7 @@ 
void ieee80211_txq_remove_vlan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats, struct txq_info *txqi); -void ieee80211_wake_txqs(unsigned long data); +void ieee80211_wake_txqs(struct tasklet_struct *t); void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *bssid, @@ -2286,7 +2279,6 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, enum ieee80211_chanctx_mode chanmode, u8 radar_detect); int ieee80211_max_num_channels(struct ieee80211_local *local); -enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta); void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, struct ieee80211_chanctx *ctx); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 1be775979132..b80c9b016b2b 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -230,10 +230,6 @@ static inline int identical_mac_addr_allowed(int type1, int type2) type2 == NL80211_IFTYPE_MONITOR || type1 == NL80211_IFTYPE_P2P_DEVICE || type2 == NL80211_IFTYPE_P2P_DEVICE || - (type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_WDS) || - (type1 == NL80211_IFTYPE_WDS && - (type2 == NL80211_IFTYPE_WDS || - type2 == NL80211_IFTYPE_AP)) || (type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_AP_VLAN) || (type1 == NL80211_IFTYPE_AP_VLAN && (type2 == NL80211_IFTYPE_AP || @@ -361,11 +357,14 @@ static int ieee80211_open(struct net_device *dev) if (err) return err; - return ieee80211_do_open(&sdata->wdev, true); + wiphy_lock(sdata->local->hw.wiphy); + err = ieee80211_do_open(&sdata->wdev, true); + wiphy_unlock(sdata->local->hw.wiphy); + + return err; } -static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, - bool going_down) +static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down) { struct ieee80211_local *local = sdata->local; unsigned 
long flags; @@ -417,15 +416,12 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, * (because if we remove a STA after ops->remove_interface() * the driver will have removed the vif info already!) * - * In WDS mode a station must exist here and be flushed, for - * AP_VLANs stations may exist since there's nothing else that + * For AP_VLANs stations may exist since there's nothing else that * would have removed them, but in other modes there shouldn't * be any stations. */ flushed = sta_info_flush(sdata); - WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP_VLAN && - ((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) || - (sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1))); + WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP_VLAN && flushed > 0); /* don't count this interface for allmulti while it is down */ if (sdata->flags & IEEE80211_SDATA_ALLMULTI) @@ -552,8 +548,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, * When we get here, the interface is marked down. 
* Free the remaining keys, if there are any * (which can happen in AP mode if userspace sets - * keys before the interface is operating, and maybe - * also in WDS mode) + * keys before the interface is operating) * * Force the key freeing to always synchronize_net() * to wait for the RX path in case it is using this @@ -645,7 +640,9 @@ static int ieee80211_stop(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + wiphy_lock(sdata->local->hw.wiphy); ieee80211_do_stop(sdata, true); + wiphy_unlock(sdata->local->hw.wiphy); return 0; } @@ -773,7 +770,7 @@ static const struct net_device_ops ieee80211_dataif_8023_ops = { .ndo_get_stats64 = ieee80211_get_stats64, }; -static bool ieee80211_iftype_supports_encap_offload(enum nl80211_iftype iftype) +static bool ieee80211_iftype_supports_hdr_offload(enum nl80211_iftype iftype) { switch (iftype) { /* P2P GO and client are mapped to AP/STATION types */ @@ -793,7 +790,7 @@ static bool ieee80211_set_sdata_offload_flags(struct ieee80211_sub_if_data *sdat flags = sdata->vif.offload_flags; if (ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD) && - ieee80211_iftype_supports_encap_offload(sdata->vif.type)) { + ieee80211_iftype_supports_hdr_offload(sdata->vif.type)) { flags |= IEEE80211_OFFLOAD_ENCAP_ENABLED; if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG) && @@ -806,10 +803,21 @@ static bool ieee80211_set_sdata_offload_flags(struct ieee80211_sub_if_data *sdat flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED; } + if (ieee80211_hw_check(&local->hw, SUPPORTS_RX_DECAP_OFFLOAD) && + ieee80211_iftype_supports_hdr_offload(sdata->vif.type)) { + flags |= IEEE80211_OFFLOAD_DECAP_ENABLED; + + if (local->monitors) + flags &= ~IEEE80211_OFFLOAD_DECAP_ENABLED; + } else { + flags &= ~IEEE80211_OFFLOAD_DECAP_ENABLED; + } + if (sdata->vif.offload_flags == flags) return false; sdata->vif.offload_flags = flags; + ieee80211_check_fast_rx_iface(sdata); return true; } @@ -827,7 +835,7 @@ static void 
ieee80211_set_vif_encap_ops(struct ieee80211_sub_if_data *sdata) } if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD) || - !ieee80211_iftype_supports_encap_offload(bss->vif.type)) + !ieee80211_iftype_supports_hdr_offload(bss->vif.type)) return; enabled = bss->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_ENABLED; @@ -948,6 +956,8 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) return ret; } + set_bit(SDATA_STATE_RUNNING, &sdata->state); + ret = ieee80211_check_queues(sdata, NL80211_IFTYPE_MONITOR); if (ret) { kfree(sdata); @@ -1020,16 +1030,11 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct net_device *dev = wdev->netdev; struct ieee80211_local *local = sdata->local; - struct sta_info *sta; u32 changed = 0; int res; u32 hw_reconf_flags = 0; switch (sdata->vif.type) { - case NL80211_IFTYPE_WDS: - if (!is_valid_ether_addr(sdata->u.wds.remote_addr)) - return -ENOLINK; - break; case NL80211_IFTYPE_AP_VLAN: { struct ieee80211_sub_if_data *master; @@ -1078,6 +1083,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_WDS: /* cannot happen */ WARN_ON(1); break; @@ -1196,7 +1202,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) case NL80211_IFTYPE_OCB: netif_carrier_off(dev); break; - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: break; @@ -1218,28 +1223,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) set_bit(SDATA_STATE_RUNNING, &sdata->state); switch (sdata->vif.type) { - case NL80211_IFTYPE_WDS: - /* Create STA entry for the WDS peer */ - sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr, - GFP_KERNEL); - if (!sta) { - res = -ENOMEM; - goto err_del_interface; - } - - sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); - 
sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); - sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED); - - res = sta_info_insert(sta); - if (res) { - /* STA has been freed */ - goto err_del_interface; - } - - rate_control_rate_init(sta); - netif_carrier_on(dev); - break; case NL80211_IFTYPE_P2P_DEVICE: rcu_assign_pointer(local->p2p_sdata, sdata); break; @@ -1356,6 +1339,7 @@ static void ieee80211_iface_work(struct work_struct *work) while ((skb = skb_dequeue(&sdata->skb_queue))) { struct ieee80211_mgmt *mgmt = (void *)skb->data; + kcov_remote_start_common(skb_get_kcov_handle(skb)); if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_BACK) { int len = skb->len; @@ -1465,6 +1449,7 @@ static void ieee80211_iface_work(struct work_struct *work) } kfree_skb(skb); + kcov_remote_stop(); } /* then other type-dependent work */ @@ -1574,9 +1559,6 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, sdata->u.mntr.flags = MONITOR_FLAG_CONTROL | MONITOR_FLAG_OTHER_BSS; break; - case NL80211_IFTYPE_WDS: - sdata->vif.bss_conf.bssid = NULL; - break; case NL80211_IFTYPE_NAN: idr_init(&sdata->u.nan.function_inst_ids); spin_lock_init(&sdata->u.nan.func_lock); @@ -1587,6 +1569,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.bssid = sdata->vif.addr; break; case NL80211_IFTYPE_UNSPECIFIED: + case NL80211_IFTYPE_WDS: case NUM_NL80211_IFTYPES: WARN_ON(1); break; @@ -1631,9 +1614,7 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_OCB: /* * Could probably support everything - * but WDS here (WDS do_open can fail - * under memory pressure, which this - * code isn't prepared to handle). + * but here. 
*/ break; case NL80211_IFTYPE_P2P_CLIENT: @@ -1652,6 +1633,10 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, if (ret) return ret; + ieee80211_stop_vif_queues(local, sdata, + IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE); + synchronize_net(); + ieee80211_do_stop(sdata, false); ieee80211_teardown_sdata(sdata); @@ -1674,6 +1659,8 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, err = ieee80211_do_open(&sdata->wdev, false); WARN(err, "type change: do_open returned %d", err); + ieee80211_wake_vif_queues(local, sdata, + IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE); return ret; } @@ -1726,7 +1713,6 @@ static void ieee80211_assign_perm_addr(struct ieee80211_local *local, case NL80211_IFTYPE_MONITOR: /* doesn't matter */ break; - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_AP_VLAN: /* match up with an AP interface */ list_for_each_entry(sdata, &local->interfaces, list) { @@ -2001,7 +1987,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, ndev->min_mtu = 256; ndev->max_mtu = local->hw.max_mtu; - ret = register_netdevice(ndev); + ret = cfg80211_register_netdevice(ndev); if (ret) { free_netdev(ndev); return ret; @@ -2031,10 +2017,9 @@ void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata) synchronize_rcu(); - if (sdata->dev) { - unregister_netdevice(sdata->dev); - } else { - cfg80211_unregister_wdev(&sdata->wdev); + cfg80211_unregister_wdev(&sdata->wdev); + + if (!sdata->dev) { ieee80211_teardown_sdata(sdata); kfree(sdata); } @@ -2083,13 +2068,16 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) list_add(&sdata->list, &wdev_list); } mutex_unlock(&local->iflist_mtx); + unregister_netdevice_many(&unreg_list); + wiphy_lock(local->hw.wiphy); list_for_each_entry_safe(sdata, tmp, &wdev_list, list) { list_del(&sdata->list); cfg80211_unregister_wdev(&sdata->wdev); kfree(sdata); } + wiphy_unlock(local->hw.wiphy); } static int netdev_notify(struct notifier_block *nb, diff 
--git a/net/mac80211/key.c b/net/mac80211/key.c index 8c5f829ff6d7..56c068cb49c4 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -887,7 +887,7 @@ void ieee80211_reenable_keys(struct ieee80211_sub_if_data *sdata) struct ieee80211_key *key; struct ieee80211_sub_if_data *vlan; - ASSERT_RTNL(); + lockdep_assert_wiphy(sdata->local->hw.wiphy); mutex_lock(&sdata->local->key_mtx); @@ -924,7 +924,7 @@ void ieee80211_iter_keys(struct ieee80211_hw *hw, struct ieee80211_key *key, *tmp; struct ieee80211_sub_if_data *sdata; - ASSERT_RTNL(); + lockdep_assert_wiphy(hw->wiphy); mutex_lock(&local->key_mtx); if (vif) { @@ -1300,3 +1300,52 @@ ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, return &key->conf; } EXPORT_SYMBOL_GPL(ieee80211_gtk_rekey_add); + +void ieee80211_key_mic_failure(struct ieee80211_key_conf *keyconf) +{ + struct ieee80211_key *key; + + key = container_of(keyconf, struct ieee80211_key, conf); + + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + key->u.aes_cmac.icverrors++; + break; + case WLAN_CIPHER_SUITE_BIP_GMAC_128: + case WLAN_CIPHER_SUITE_BIP_GMAC_256: + key->u.aes_gmac.icverrors++; + break; + default: + /* ignore the others for now, we don't keep counters now */ + break; + } +} +EXPORT_SYMBOL_GPL(ieee80211_key_mic_failure); + +void ieee80211_key_replay(struct ieee80211_key_conf *keyconf) +{ + struct ieee80211_key *key; + + key = container_of(keyconf, struct ieee80211_key, conf); + + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + key->u.ccmp.replays++; + break; + case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + key->u.aes_cmac.replays++; + break; + case WLAN_CIPHER_SUITE_BIP_GMAC_128: + case WLAN_CIPHER_SUITE_BIP_GMAC_256: + key->u.aes_gmac.replays++; + break; + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: + key->u.gcmp.replays++; + break; + } +} +EXPORT_SYMBOL_GPL(ieee80211_key_replay); diff --git 
a/net/mac80211/main.c b/net/mac80211/main.c index 523380aed92e..4f3f8bb58e76 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -220,9 +220,9 @@ u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata) BSS_CHANGED_ERP_SLOT; } -static void ieee80211_tasklet_handler(unsigned long data) +static void ieee80211_tasklet_handler(struct tasklet_struct *t) { - struct ieee80211_local *local = (struct ieee80211_local *) data; + struct ieee80211_local *local = from_tasklet(local, t, tasklet); struct sk_buff *skb; while ((skb = skb_dequeue(&local->skb_queue)) || @@ -261,7 +261,9 @@ static void ieee80211_restart_work(struct work_struct *work) "%s called with hardware scan in progress\n", __func__); flush_work(&local->radar_detected_work); + /* we might do interface manipulations, so need both */ rtnl_lock(); + wiphy_lock(local->hw.wiphy); list_for_each_entry(sdata, &local->interfaces, list) { /* * XXX: there may be more work for other vif types and even @@ -293,6 +295,7 @@ static void ieee80211_restart_work(struct work_struct *work) synchronize_net(); ieee80211_reconfig(local); + wiphy_unlock(local->hw.wiphy); rtnl_unlock(); } @@ -733,16 +736,12 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, skb_queue_head_init(&local->pending[i]); atomic_set(&local->agg_queue_stop[i], 0); } - tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending, - (unsigned long)local); + tasklet_setup(&local->tx_pending_tasklet, ieee80211_tx_pending); if (ops->wake_tx_queue) - tasklet_init(&local->wake_txqs_tasklet, ieee80211_wake_txqs, - (unsigned long)local); + tasklet_setup(&local->wake_txqs_tasklet, ieee80211_wake_txqs); - tasklet_init(&local->tasklet, - ieee80211_tasklet_handler, - (unsigned long) local); + tasklet_setup(&local->tasklet, ieee80211_tasklet_handler); skb_queue_head_init(&local->skb_queue); skb_queue_head_init(&local->skb_queue_unreliable); @@ -935,14 +934,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) return -EINVAL; } } else { - /* 
- * WDS is currently prohibited when channel contexts are used - * because there's no clear definition of which channel WDS - * type interfaces use - */ - if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_WDS)) - return -EINVAL; - /* DFS is not supported with multi-channel combinations yet */ for (i = 0; i < local->hw.wiphy->n_iface_combinations; i++) { const struct ieee80211_iface_combination *comb; @@ -1284,6 +1275,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) rate_control_add_debugfs(local); rtnl_lock(); + wiphy_lock(hw->wiphy); /* add one default STA interface if supported */ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION) && @@ -1297,6 +1289,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) "Failed to add default virtual iface\n"); } + wiphy_unlock(hw->wiphy); rtnl_unlock(); #ifdef CONFIG_INET diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index ce5825d6f1d1..97095b7c9c64 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -667,6 +667,35 @@ void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh) } } +static void +ieee80211_mesh_update_bss_params(struct ieee80211_sub_if_data *sdata, + u8 *ie, u8 ie_len) +{ + struct ieee80211_supported_band *sband; + const u8 *cap; + const struct ieee80211_he_operation *he_oper = NULL; + + sband = ieee80211_get_sband(sdata); + if (!sband) + return; + + if (!ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT) || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + return; + + sdata->vif.bss_conf.he_support = true; + + cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, ie, ie_len); + if (cap && cap[1] >= ieee80211_he_oper_size(&cap[3])) + he_oper = (void *)(cap + 3); + + if (he_oper) + sdata->vif.bss_conf.he_oper.params = + __le32_to_cpu(he_oper->he_oper_params); +} + /** * ieee80211_fill_mesh_addresses - fill 
addresses of a locally originated mesh frame * @hdr: 802.11 frame header @@ -943,6 +972,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) bcn->tail_len = skb->len; memcpy(bcn->tail, skb->data, bcn->tail_len); + ieee80211_mesh_update_bss_params(sdata, bcn->tail, bcn->tail_len); bcn->meshconf = (struct ieee80211_meshconf_ie *) (bcn->tail + ifmsh->meshconf_offset); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 313eee12410e..3db514c4c63a 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -356,7 +356,7 @@ u32 airtime_link_metric_get(struct ieee80211_local *local, */ tx_time = (device_constant + 10 * test_frame_len / rate); estimated_retx = ((1 << (2 * ARITH_SHIFT)) / (s_unit - err)); - result = (tx_time * estimated_retx) >> (2 * ARITH_SHIFT); + result = ((u64)tx_time * estimated_retx) >> (2 * ARITH_SHIFT); return (u32)result; } diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 48f31ac9233c..620ecf922408 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -60,6 +60,7 @@ static struct mesh_table *mesh_table_alloc(void) atomic_set(&newtbl->entries, 0); spin_lock_init(&newtbl->gates_lock); spin_lock_init(&newtbl->walk_lock); + rhashtable_init(&newtbl->rhead, &mesh_rht_params); return newtbl; } @@ -773,9 +774,6 @@ int mesh_pathtbl_init(struct ieee80211_sub_if_data *sdata) goto free_path; } - rhashtable_init(&tbl_path->rhead, &mesh_rht_params); - rhashtable_init(&tbl_mpp->rhead, &mesh_rht_params); - sdata->u.mesh.mesh_paths = tbl_path; sdata->u.mesh.mpp_paths = tbl_mpp; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 6adfcb9c06dc..2e33a1263518 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1417,6 +1417,17 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, return; } + if (sdata->vif.bss_conf.chandef.chan->band != + csa_ie.chandef.chan->band) { + sdata_info(sdata, + "AP %pM switches to different band (%d MHz, 
width:%d, CF1/2: %d/%d MHz), disconnecting\n", + ifmgd->associated->bssid, + csa_ie.chandef.chan->center_freq, + csa_ie.chandef.width, csa_ie.chandef.center_freq1, + csa_ie.chandef.center_freq2); + goto lock_and_drop_connection; + } + if (!cfg80211_chandef_usable(local->hw.wiphy, &csa_ie.chandef, IEEE80211_CHAN_DISABLED)) { sdata_info(sdata, @@ -1429,9 +1440,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, csa_ie.chandef.width, csa_ie.chandef.center_freq1, csa_ie.chandef.freq1_offset, csa_ie.chandef.center_freq2); - ieee80211_queue_work(&local->hw, - &ifmgd->csa_connection_drop_work); - return; + goto lock_and_drop_connection; } if (cfg80211_chandef_identical(&csa_ie.chandef, @@ -1493,6 +1502,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, sdata->csa_chandef = csa_ie.chandef; sdata->csa_block_tx = csa_ie.mode; ifmgd->csa_ignored_same_chan = false; + ifmgd->beacon_crc_valid = false; if (sdata->csa_block_tx) ieee80211_stop_vif_queues(local, sdata, @@ -1500,7 +1510,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, mutex_unlock(&local->mtx); cfg80211_ch_switch_started_notify(sdata->dev, &csa_ie.chandef, - csa_ie.count); + csa_ie.count, csa_ie.mode); if (local->ops->channel_switch) { /* use driver's channel switch callback */ @@ -1516,6 +1526,9 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, TU_TO_EXP_TIME((csa_ie.count - 1) * cbss->beacon_interval)); return; + lock_and_drop_connection: + mutex_lock(&local->mtx); + mutex_lock(&local->chanctx_mtx); drop_connection: /* * This is just so that the disconnect flow will know that @@ -1560,9 +1573,17 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, chan_increment = 1; break; case NL80211_BAND_5GHZ: - case NL80211_BAND_6GHZ: chan_increment = 4; break; + case NL80211_BAND_6GHZ: + /* + * In the 6 GHz band, the "maximum transmit power level" + * field in the triplets is reserved, and thus will be + * zero 
and we shouldn't use it to control TX power. + * The actual TX power will be given in the transmit + * power envelope element instead. + */ + return false; } /* find channel */ @@ -2382,6 +2403,8 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, /* clear bssid only after building the needed mgmt frames */ eth_zero_addr(ifmgd->bssid); + sdata->vif.bss_conf.ssid_len = 0; + /* remove AP and TDLS peers */ sta_info_flush(sdata); @@ -2720,7 +2743,7 @@ EXPORT_SYMBOL(ieee80211_ap_probereq_get); static void ieee80211_report_disconnect(struct ieee80211_sub_if_data *sdata, const u8 *buf, size_t len, bool tx, - u16 reason) + u16 reason, bool reconnect) { struct ieee80211_event event = { .type = MLME_EVENT, @@ -2729,7 +2752,7 @@ static void ieee80211_report_disconnect(struct ieee80211_sub_if_data *sdata, }; if (tx) - cfg80211_tx_mlme_mgmt(sdata->dev, buf, len); + cfg80211_tx_mlme_mgmt(sdata->dev, buf, len, reconnect); else cfg80211_rx_mlme_mgmt(sdata->dev, buf, len); @@ -2751,13 +2774,18 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) tx = !sdata->csa_block_tx; - /* AP is probably out of range (or not reachable for another reason) so - * remove the bss struct for that AP. - */ - cfg80211_unlink_bss(local->hw.wiphy, ifmgd->associated); + if (!ifmgd->driver_disconnect) { + /* + * AP is probably out of range (or not reachable for another + * reason) so remove the bss struct for that AP. + */ + cfg80211_unlink_bss(local->hw.wiphy, ifmgd->associated); + } ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, - WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, + ifmgd->driver_disconnect ? 
+ WLAN_REASON_DEAUTH_LEAVING : + WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, tx, frame_buf); mutex_lock(&local->mtx); sdata->vif.csa_active = false; @@ -2770,7 +2798,9 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) mutex_unlock(&local->mtx); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), tx, - WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY); + WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, + ifmgd->reconnect); + ifmgd->reconnect = false; sdata_unlock(sdata); } @@ -2789,6 +2819,13 @@ static void ieee80211_beacon_connection_loss_work(struct work_struct *work) sdata_info(sdata, "Connection to AP %pM lost\n", ifmgd->bssid); __ieee80211_disconnect(sdata); + ifmgd->connection_loss = false; + } else if (ifmgd->driver_disconnect) { + sdata_info(sdata, + "Driver requested disconnection from AP %pM\n", + ifmgd->bssid); + __ieee80211_disconnect(sdata); + ifmgd->driver_disconnect = false; } else { ieee80211_mgd_probe_ap(sdata, true); } @@ -2827,6 +2864,21 @@ void ieee80211_connection_loss(struct ieee80211_vif *vif) } EXPORT_SYMBOL(ieee80211_connection_loss); +void ieee80211_disconnect(struct ieee80211_vif *vif, bool reconnect) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_hw *hw = &sdata->local->hw; + + trace_api_disconnect(sdata, reconnect); + + if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) + return; + + sdata->u.mgd.driver_disconnect = true; + sdata->u.mgd.reconnect = reconnect; + ieee80211_queue_work(hw, &sdata->u.mgd.beacon_connection_loss_work); +} +EXPORT_SYMBOL(ieee80211_disconnect); static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata, bool assoc) @@ -3130,7 +3182,7 @@ static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, ieee80211_set_disassoc(sdata, 0, 0, false, NULL); ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, - reason_code); + reason_code, false); return; } @@ -3179,7 +3231,8 @@ static void ieee80211_rx_mgmt_disassoc(struct 
ieee80211_sub_if_data *sdata, ieee80211_set_disassoc(sdata, 0, 0, false, NULL); - ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, reason_code); + ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, reason_code, + false); } static void ieee80211_get_rates(struct ieee80211_supported_band *sband, @@ -3199,8 +3252,8 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband, *have_higher_than_11mbit = true; /* - * Skip HT, VHT and HE BSS membership selectors since they're - * not rates. + * Skip HT, VHT, HE and SAE H2E only BSS membership selectors + * since they're not rates. * * Note: Even though the membership selector and the basic * rate flag share the same bit, they are not exactly @@ -3208,7 +3261,8 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband, */ if (supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HT_PHY) || supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_VHT_PHY) || - supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HE_PHY)) + supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HE_PHY) || + supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_SAE_H2E)) continue; for (j = 0; j < sband->n_bitrates; j++) { @@ -3494,14 +3548,6 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); - bss_conf->multi_sta_back_32bit = - sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & - IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP; - - bss_conf->ack_enabled = - sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & - IEEE80211_HE_MAC_CAP2_ACK_EN; - bss_conf->uora_exists = !!elems->uora_element; if (elems->uora_element) bss_conf->uora_ocw_range = elems->uora_element[0]; @@ -4199,7 +4245,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, true, deauth_buf); ieee80211_report_disconnect(sdata, deauth_buf, sizeof(deauth_buf), true, - WLAN_REASON_DEAUTH_LEAVING); + WLAN_REASON_DEAUTH_LEAVING, + false); return; } 
@@ -4344,7 +4391,7 @@ static void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, tx, frame_buf); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, - reason); + reason, false); } static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) @@ -4716,7 +4763,8 @@ void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata) if (ifmgd->auth_data) ieee80211_destroy_auth_data(sdata, false); cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, - IEEE80211_DEAUTH_FRAME_LEN); + IEEE80211_DEAUTH_FRAME_LEN, + false); } /* This is a bit of a hack - we should find a better and more generic @@ -5430,7 +5478,8 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, - WLAN_REASON_UNSPECIFIED); + WLAN_REASON_UNSPECIFIED, + false); } sdata_info(sdata, "authenticate with %pM\n", req->bss->bssid); @@ -5471,6 +5520,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_assoc_data *assoc_data; const struct cfg80211_bss_ies *beacon_ies; struct ieee80211_supported_band *sband; + struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; const u8 *ssidie, *ht_ie, *vht_ie; int i, err; bool override = false; @@ -5488,6 +5538,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, } memcpy(assoc_data->ssid, ssidie + 2, ssidie[1]); assoc_data->ssid_len = ssidie[1]; + memcpy(bss_conf->ssid, assoc_data->ssid, assoc_data->ssid_len); + bss_conf->ssid_len = assoc_data->ssid_len; rcu_read_unlock(); if (ifmgd->associated) { @@ -5502,7 +5554,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, - WLAN_REASON_UNSPECIFIED); + WLAN_REASON_UNSPECIFIED, + false); } if (ifmgd->auth_data && !ifmgd->auth_data->done) { @@ -5701,6 +5754,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, if (req->flags & ASSOC_REQ_DISABLE_VHT) ifmgd->flags |= 
IEEE80211_STA_DISABLE_VHT; + if (req->flags & ASSOC_REQ_DISABLE_HE) + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; + err = ieee80211_prep_connection(sdata, req->bss, true, override); if (err) goto err_clear; @@ -5801,7 +5857,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, ieee80211_destroy_auth_data(sdata, false); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, - req->reason_code); + req->reason_code, false); return 0; } @@ -5821,7 +5877,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, ieee80211_destroy_assoc_data(sdata, false, true); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, - req->reason_code); + req->reason_code, false); return 0; } @@ -5836,7 +5892,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, req->reason_code, tx, frame_buf); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, - req->reason_code); + req->reason_code, false); return 0; } @@ -5869,7 +5925,7 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, frame_buf); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, - req->reason_code); + req->reason_code, false); return 0; } diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index 38c45e1dafd8..7809a906d7fe 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -1,4 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * Portions + * Copyright (C) 2020-2021 Intel Corporation + */ #include <net/mac80211.h> #include <net/rtnetlink.h> @@ -11,7 +15,7 @@ static void ieee80211_sched_scan_cancel(struct ieee80211_local *local) { if (ieee80211_request_sched_scan_stop(local)) return; - cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy, 0); + cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0); } int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) @@ -150,21 +154,6 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) case NL80211_IFTYPE_STATION: 
ieee80211_mgd_quiesce(sdata); break; - case NL80211_IFTYPE_WDS: - /* tear down aggregation sessions and remove STAs */ - mutex_lock(&local->sta_mtx); - sta = sdata->u.wds.sta; - if (sta && sta->uploaded) { - enum ieee80211_sta_state state; - - state = sta->sta_state; - for (; state > IEEE80211_STA_NOTEXIST; state--) - WARN_ON(drv_sta_state(local, sta->sdata, - sta, state, - state - 1)); - } - mutex_unlock(&local->sta_mtx); - break; default: break; } diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 45927202c71c..63652c39c8e0 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -960,7 +960,8 @@ int rate_control_set_rates(struct ieee80211_hw *hw, if (old) kfree_rcu(old, rcu_head); - drv_sta_rate_tbl_update(hw_to_local(hw), sta->sdata, pubsta); + if (sta->uploaded) + drv_sta_rate_tbl_update(hw_to_local(hw), sta->sdata, pubsta); ieee80211_sta_set_expected_throughput(pubsta, sta_get_expected_throughput(sta)); diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c deleted file mode 100644 index 86bc469a28bc..000000000000 --- a/net/mac80211/rc80211_minstrel.c +++ /dev/null @@ -1,591 +0,0 @@ -/* - * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on minstrel.c: - * Copyright (C) 2005-2007 Derek Smithies <derek@indranet.co.nz> - * Sponsored by Indranet Technologies Ltd - * - * Based on sample.c: - * Copyright (c) 2005 John Bicket - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer, - * without modification. - * 2. 
Redistributions in binary form must reproduce at minimum a disclaimer - * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any - * redistribution must be conditioned upon including a substantially - * similar Disclaimer requirement for further binary redistribution. - * 3. Neither the names of the above-listed copyright holders nor the names - * of any contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * NO WARRANTY - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY - * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, - * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER - * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGES. 
- */ -#include <linux/netdevice.h> -#include <linux/types.h> -#include <linux/skbuff.h> -#include <linux/debugfs.h> -#include <linux/random.h> -#include <linux/ieee80211.h> -#include <linux/slab.h> -#include <net/mac80211.h> -#include "rate.h" -#include "rc80211_minstrel.h" - -#define SAMPLE_TBL(_mi, _idx, _col) \ - _mi->sample_table[(_idx * SAMPLE_COLUMNS) + _col] - -/* convert mac80211 rate index to local array index */ -static inline int -rix_to_ndx(struct minstrel_sta_info *mi, int rix) -{ - int i = rix; - for (i = rix; i >= 0; i--) - if (mi->r[i].rix == rix) - break; - return i; -} - -/* return current EMWA throughput */ -int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_avg) -{ - int usecs; - - usecs = mr->perfect_tx_time; - if (!usecs) - usecs = 1000000; - - /* reset thr. below 10% success */ - if (mr->stats.prob_avg < MINSTREL_FRAC(10, 100)) - return 0; - - if (prob_avg > MINSTREL_FRAC(90, 100)) - return MINSTREL_TRUNC(100000 * (MINSTREL_FRAC(90, 100) / usecs)); - else - return MINSTREL_TRUNC(100000 * (prob_avg / usecs)); -} - -/* find & sort topmost throughput rates */ -static inline void -minstrel_sort_best_tp_rates(struct minstrel_sta_info *mi, int i, u8 *tp_list) -{ - int j; - struct minstrel_rate_stats *tmp_mrs; - struct minstrel_rate_stats *cur_mrs = &mi->r[i].stats; - - for (j = MAX_THR_RATES; j > 0; --j) { - tmp_mrs = &mi->r[tp_list[j - 1]].stats; - if (minstrel_get_tp_avg(&mi->r[i], cur_mrs->prob_avg) <= - minstrel_get_tp_avg(&mi->r[tp_list[j - 1]], tmp_mrs->prob_avg)) - break; - } - - if (j < MAX_THR_RATES - 1) - memmove(&tp_list[j + 1], &tp_list[j], MAX_THR_RATES - (j + 1)); - if (j < MAX_THR_RATES) - tp_list[j] = i; -} - -static void -minstrel_set_rate(struct minstrel_sta_info *mi, struct ieee80211_sta_rates *ratetbl, - int offset, int idx) -{ - struct minstrel_rate *r = &mi->r[idx]; - - ratetbl->rate[offset].idx = r->rix; - ratetbl->rate[offset].count = r->adjusted_retry_count; - ratetbl->rate[offset].count_cts = r->retry_count_cts; - 
ratetbl->rate[offset].count_rts = r->stats.retry_count_rtscts; -} - -static void -minstrel_update_rates(struct minstrel_priv *mp, struct minstrel_sta_info *mi) -{ - struct ieee80211_sta_rates *ratetbl; - int i = 0; - - ratetbl = kzalloc(sizeof(*ratetbl), GFP_ATOMIC); - if (!ratetbl) - return; - - /* Start with max_tp_rate */ - minstrel_set_rate(mi, ratetbl, i++, mi->max_tp_rate[0]); - - if (mp->hw->max_rates >= 3) { - /* At least 3 tx rates supported, use max_tp_rate2 next */ - minstrel_set_rate(mi, ratetbl, i++, mi->max_tp_rate[1]); - } - - if (mp->hw->max_rates >= 2) { - /* At least 2 tx rates supported, use max_prob_rate next */ - minstrel_set_rate(mi, ratetbl, i++, mi->max_prob_rate); - } - - /* Use lowest rate last */ - ratetbl->rate[i].idx = mi->lowest_rix; - ratetbl->rate[i].count = mp->max_retry; - ratetbl->rate[i].count_cts = mp->max_retry; - ratetbl->rate[i].count_rts = mp->max_retry; - - rate_control_set_rates(mp->hw, mi->sta, ratetbl); -} - -/* -* Recalculate statistics and counters of a given rate -*/ -void -minstrel_calc_rate_stats(struct minstrel_priv *mp, - struct minstrel_rate_stats *mrs) -{ - unsigned int cur_prob; - - if (unlikely(mrs->attempts > 0)) { - mrs->sample_skipped = 0; - cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts); - if (mp->new_avg) { - minstrel_filter_avg_add(&mrs->prob_avg, - &mrs->prob_avg_1, cur_prob); - } else if (unlikely(!mrs->att_hist)) { - mrs->prob_avg = cur_prob; - } else { - /*update exponential weighted moving avarage */ - mrs->prob_avg = minstrel_ewma(mrs->prob_avg, - cur_prob, - EWMA_LEVEL); - } - mrs->att_hist += mrs->attempts; - mrs->succ_hist += mrs->success; - } else { - mrs->sample_skipped++; - } - - mrs->last_success = mrs->success; - mrs->last_attempts = mrs->attempts; - mrs->success = 0; - mrs->attempts = 0; -} - -static void -minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) -{ - u8 tmp_tp_rate[MAX_THR_RATES]; - u8 tmp_prob_rate = 0; - int i, tmp_cur_tp, tmp_prob_tp; - - for 
(i = 0; i < MAX_THR_RATES; i++) - tmp_tp_rate[i] = 0; - - for (i = 0; i < mi->n_rates; i++) { - struct minstrel_rate *mr = &mi->r[i]; - struct minstrel_rate_stats *mrs = &mi->r[i].stats; - struct minstrel_rate_stats *tmp_mrs = &mi->r[tmp_prob_rate].stats; - - /* Update statistics of success probability per rate */ - minstrel_calc_rate_stats(mp, mrs); - - /* Sample less often below the 10% chance of success. - * Sample less often above the 95% chance of success. */ - if (mrs->prob_avg > MINSTREL_FRAC(95, 100) || - mrs->prob_avg < MINSTREL_FRAC(10, 100)) { - mr->adjusted_retry_count = mrs->retry_count >> 1; - if (mr->adjusted_retry_count > 2) - mr->adjusted_retry_count = 2; - mr->sample_limit = 4; - } else { - mr->sample_limit = -1; - mr->adjusted_retry_count = mrs->retry_count; - } - if (!mr->adjusted_retry_count) - mr->adjusted_retry_count = 2; - - minstrel_sort_best_tp_rates(mi, i, tmp_tp_rate); - - /* To determine the most robust rate (max_prob_rate) used at - * 3rd mmr stage we distinct between two cases: - * (1) if any success probabilitiy >= 95%, out of those rates - * choose the maximum throughput rate as max_prob_rate - * (2) if all success probabilities < 95%, the rate with - * highest success probability is chosen as max_prob_rate */ - if (mrs->prob_avg >= MINSTREL_FRAC(95, 100)) { - tmp_cur_tp = minstrel_get_tp_avg(mr, mrs->prob_avg); - tmp_prob_tp = minstrel_get_tp_avg(&mi->r[tmp_prob_rate], - tmp_mrs->prob_avg); - if (tmp_cur_tp >= tmp_prob_tp) - tmp_prob_rate = i; - } else { - if (mrs->prob_avg >= tmp_mrs->prob_avg) - tmp_prob_rate = i; - } - } - - /* Assign the new rate set */ - memcpy(mi->max_tp_rate, tmp_tp_rate, sizeof(mi->max_tp_rate)); - mi->max_prob_rate = tmp_prob_rate; - -#ifdef CONFIG_MAC80211_DEBUGFS - /* use fixed index if set */ - if (mp->fixed_rate_idx != -1) { - mi->max_tp_rate[0] = mp->fixed_rate_idx; - mi->max_tp_rate[1] = mp->fixed_rate_idx; - mi->max_prob_rate = mp->fixed_rate_idx; - } -#endif - - /* Reset update timer */ - 
mi->last_stats_update = jiffies; - - minstrel_update_rates(mp, mi); -} - -static void -minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband, - void *priv_sta, struct ieee80211_tx_status *st) -{ - struct ieee80211_tx_info *info = st->info; - struct minstrel_priv *mp = priv; - struct minstrel_sta_info *mi = priv_sta; - struct ieee80211_tx_rate *ar = info->status.rates; - int i, ndx; - int success; - - success = !!(info->flags & IEEE80211_TX_STAT_ACK); - - for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { - if (ar[i].idx < 0) - break; - - ndx = rix_to_ndx(mi, ar[i].idx); - if (ndx < 0) - continue; - - mi->r[ndx].stats.attempts += ar[i].count; - - if ((i != IEEE80211_TX_MAX_RATES - 1) && (ar[i + 1].idx < 0)) - mi->r[ndx].stats.success += success; - } - - if ((info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) && (i >= 0)) - mi->sample_packets++; - - if (mi->sample_deferred > 0) - mi->sample_deferred--; - - if (time_after(jiffies, mi->last_stats_update + - mp->update_interval / (mp->new_avg ? 
2 : 1))) - minstrel_update_stats(mp, mi); -} - - -static inline unsigned int -minstrel_get_retry_count(struct minstrel_rate *mr, - struct ieee80211_tx_info *info) -{ - u8 retry = mr->adjusted_retry_count; - - if (info->control.use_rts) - retry = max_t(u8, 2, min(mr->stats.retry_count_rtscts, retry)); - else if (info->control.use_cts_prot) - retry = max_t(u8, 2, min(mr->retry_count_cts, retry)); - return retry; -} - - -static int -minstrel_get_next_sample(struct minstrel_sta_info *mi) -{ - unsigned int sample_ndx; - sample_ndx = SAMPLE_TBL(mi, mi->sample_row, mi->sample_column); - mi->sample_row++; - if ((int) mi->sample_row >= mi->n_rates) { - mi->sample_row = 0; - mi->sample_column++; - if (mi->sample_column >= SAMPLE_COLUMNS) - mi->sample_column = 0; - } - return sample_ndx; -} - -static void -minstrel_get_rate(void *priv, struct ieee80211_sta *sta, - void *priv_sta, struct ieee80211_tx_rate_control *txrc) -{ - struct sk_buff *skb = txrc->skb; - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct minstrel_sta_info *mi = priv_sta; - struct minstrel_priv *mp = priv; - struct ieee80211_tx_rate *rate = &info->control.rates[0]; - struct minstrel_rate *msr, *mr; - unsigned int ndx; - bool mrr_capable; - bool prev_sample; - int delta; - int sampling_ratio; - - /* check multi-rate-retry capabilities & adjust lookaround_rate */ - mrr_capable = mp->has_mrr && - !txrc->rts && - !txrc->bss_conf->use_cts_prot; - if (mrr_capable) - sampling_ratio = mp->lookaround_rate_mrr; - else - sampling_ratio = mp->lookaround_rate; - - /* increase sum packet counter */ - mi->total_packets++; - -#ifdef CONFIG_MAC80211_DEBUGFS - if (mp->fixed_rate_idx != -1) - return; -#endif - - /* Don't use EAPOL frames for sampling on non-mrr hw */ - if (mp->hw->max_rates == 1 && - (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO)) - return; - - delta = (mi->total_packets * sampling_ratio / 100) - - (mi->sample_packets + mi->sample_deferred / 2); - - /* delta < 0: no sampling required 
*/ - prev_sample = mi->prev_sample; - mi->prev_sample = false; - if (delta < 0 || (!mrr_capable && prev_sample)) - return; - - if (mi->total_packets >= 10000) { - mi->sample_deferred = 0; - mi->sample_packets = 0; - mi->total_packets = 0; - } else if (delta > mi->n_rates * 2) { - /* With multi-rate retry, not every planned sample - * attempt actually gets used, due to the way the retry - * chain is set up - [max_tp,sample,prob,lowest] for - * sample_rate < max_tp. - * - * If there's too much sampling backlog and the link - * starts getting worse, minstrel would start bursting - * out lots of sampling frames, which would result - * in a large throughput loss. */ - mi->sample_packets += (delta - mi->n_rates * 2); - } - - /* get next random rate sample */ - ndx = minstrel_get_next_sample(mi); - msr = &mi->r[ndx]; - mr = &mi->r[mi->max_tp_rate[0]]; - - /* Decide if direct ( 1st mrr stage) or indirect (2nd mrr stage) - * rate sampling method should be used. - * Respect such rates that are not sampled for 20 interations. - */ - if (mrr_capable && - msr->perfect_tx_time > mr->perfect_tx_time && - msr->stats.sample_skipped < 20) { - /* Only use IEEE80211_TX_CTL_RATE_CTRL_PROBE to mark - * packets that have the sampling rate deferred to the - * second MRR stage. Increase the sample counter only - * if the deferred sample rate was actually used. 
- * Use the sample_deferred counter to make sure that - * the sampling is not done in large bursts */ - info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE; - rate++; - mi->sample_deferred++; - } else { - if (!msr->sample_limit) - return; - - mi->sample_packets++; - if (msr->sample_limit > 0) - msr->sample_limit--; - } - - /* If we're not using MRR and the sampling rate already - * has a probability of >95%, we shouldn't be attempting - * to use it, as this only wastes precious airtime */ - if (!mrr_capable && - (mi->r[ndx].stats.prob_avg > MINSTREL_FRAC(95, 100))) - return; - - mi->prev_sample = true; - - rate->idx = mi->r[ndx].rix; - rate->count = minstrel_get_retry_count(&mi->r[ndx], info); -} - - -static void -calc_rate_durations(enum nl80211_band band, - struct minstrel_rate *d, - struct ieee80211_rate *rate, - struct cfg80211_chan_def *chandef) -{ - int erp = !!(rate->flags & IEEE80211_RATE_ERP_G); - int shift = ieee80211_chandef_get_shift(chandef); - - d->perfect_tx_time = ieee80211_frame_duration(band, 1200, - DIV_ROUND_UP(rate->bitrate, 1 << shift), erp, 1, - shift); - d->ack_time = ieee80211_frame_duration(band, 10, - DIV_ROUND_UP(rate->bitrate, 1 << shift), erp, 1, - shift); -} - -static void -init_sample_table(struct minstrel_sta_info *mi) -{ - unsigned int i, col, new_idx; - u8 rnd[8]; - - mi->sample_column = 0; - mi->sample_row = 0; - memset(mi->sample_table, 0xff, SAMPLE_COLUMNS * mi->n_rates); - - for (col = 0; col < SAMPLE_COLUMNS; col++) { - prandom_bytes(rnd, sizeof(rnd)); - for (i = 0; i < mi->n_rates; i++) { - new_idx = (i + rnd[i & 7]) % mi->n_rates; - while (SAMPLE_TBL(mi, new_idx, col) != 0xff) - new_idx = (new_idx + 1) % mi->n_rates; - - SAMPLE_TBL(mi, new_idx, col) = i; - } - } -} - -static void -minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband, - struct cfg80211_chan_def *chandef, - struct ieee80211_sta *sta, void *priv_sta) -{ - struct minstrel_sta_info *mi = priv_sta; - struct minstrel_priv *mp = priv; - struct 
ieee80211_rate *ctl_rate; - unsigned int i, n = 0; - unsigned int t_slot = 9; /* FIXME: get real slot time */ - u32 rate_flags; - - mi->sta = sta; - mi->lowest_rix = rate_lowest_index(sband, sta); - ctl_rate = &sband->bitrates[mi->lowest_rix]; - mi->sp_ack_dur = ieee80211_frame_duration(sband->band, 10, - ctl_rate->bitrate, - !!(ctl_rate->flags & IEEE80211_RATE_ERP_G), 1, - ieee80211_chandef_get_shift(chandef)); - - rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); - memset(mi->max_tp_rate, 0, sizeof(mi->max_tp_rate)); - mi->max_prob_rate = 0; - - for (i = 0; i < sband->n_bitrates; i++) { - struct minstrel_rate *mr = &mi->r[n]; - struct minstrel_rate_stats *mrs = &mi->r[n].stats; - unsigned int tx_time = 0, tx_time_cts = 0, tx_time_rtscts = 0; - unsigned int tx_time_single; - unsigned int cw = mp->cw_min; - int shift; - - if (!rate_supported(sta, sband->band, i)) - continue; - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - continue; - - n++; - memset(mr, 0, sizeof(*mr)); - memset(mrs, 0, sizeof(*mrs)); - - mr->rix = i; - shift = ieee80211_chandef_get_shift(chandef); - mr->bitrate = DIV_ROUND_UP(sband->bitrates[i].bitrate, - (1 << shift) * 5); - calc_rate_durations(sband->band, mr, &sband->bitrates[i], - chandef); - - /* calculate maximum number of retransmissions before - * fallback (based on maximum segment size) */ - mr->sample_limit = -1; - mrs->retry_count = 1; - mr->retry_count_cts = 1; - mrs->retry_count_rtscts = 1; - tx_time = mr->perfect_tx_time + mi->sp_ack_dur; - do { - /* add one retransmission */ - tx_time_single = mr->ack_time + mr->perfect_tx_time; - - /* contention window */ - tx_time_single += (t_slot * cw) >> 1; - cw = min((cw << 1) | 1, mp->cw_max); - - tx_time += tx_time_single; - tx_time_cts += tx_time_single + mi->sp_ack_dur; - tx_time_rtscts += tx_time_single + 2 * mi->sp_ack_dur; - if ((tx_time_cts < mp->segment_size) && - (mr->retry_count_cts < mp->max_retry)) - mr->retry_count_cts++; - if ((tx_time_rtscts < 
mp->segment_size) && - (mrs->retry_count_rtscts < mp->max_retry)) - mrs->retry_count_rtscts++; - } while ((tx_time < mp->segment_size) && - (++mr->stats.retry_count < mp->max_retry)); - mr->adjusted_retry_count = mrs->retry_count; - if (!(sband->bitrates[i].flags & IEEE80211_RATE_ERP_G)) - mr->retry_count_cts = mrs->retry_count; - } - - for (i = n; i < sband->n_bitrates; i++) { - struct minstrel_rate *mr = &mi->r[i]; - mr->rix = -1; - } - - mi->n_rates = n; - mi->last_stats_update = jiffies; - - init_sample_table(mi); - minstrel_update_rates(mp, mi); -} - -static u32 minstrel_get_expected_throughput(void *priv_sta) -{ - struct minstrel_sta_info *mi = priv_sta; - struct minstrel_rate_stats *tmp_mrs; - int idx = mi->max_tp_rate[0]; - int tmp_cur_tp; - - /* convert pkt per sec in kbps (1200 is the average pkt size used for - * computing cur_tp - */ - tmp_mrs = &mi->r[idx].stats; - tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_avg) * 10; - tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024; - - return tmp_cur_tp; -} - -const struct rate_control_ops mac80211_minstrel = { - .tx_status_ext = minstrel_tx_status, - .get_rate = minstrel_get_rate, - .rate_init = minstrel_rate_init, - .get_expected_throughput = minstrel_get_expected_throughput, -}; diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h deleted file mode 100644 index dbb43bcd3c45..000000000000 --- a/net/mac80211/rc80211_minstrel.h +++ /dev/null @@ -1,185 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org> - */ - -#ifndef __RC_MINSTREL_H -#define __RC_MINSTREL_H - -#define EWMA_LEVEL 96 /* ewma weighting factor [/EWMA_DIV] */ -#define EWMA_DIV 128 -#define SAMPLE_COLUMNS 10 /* number of columns in sample table */ - -/* scaled fraction values */ -#define MINSTREL_SCALE 12 -#define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div) -#define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE) - -/* number of highest throughput 
rates to consider*/ -#define MAX_THR_RATES 4 - -/* - * Coefficients for moving average with noise filter (period=16), - * scaled by 10 bits - * - * a1 = exp(-pi * sqrt(2) / period) - * coeff2 = 2 * a1 * cos(sqrt(2) * 2 * pi / period) - * coeff3 = -sqr(a1) - * coeff1 = 1 - coeff2 - coeff3 - */ -#define MINSTREL_AVG_COEFF1 (MINSTREL_FRAC(1, 1) - \ - MINSTREL_AVG_COEFF2 - \ - MINSTREL_AVG_COEFF3) -#define MINSTREL_AVG_COEFF2 0x00001499 -#define MINSTREL_AVG_COEFF3 -0x0000092e - -/* - * Perform EWMA (Exponentially Weighted Moving Average) calculation - */ -static inline int -minstrel_ewma(int old, int new, int weight) -{ - int diff, incr; - - diff = new - old; - incr = (EWMA_DIV - weight) * diff / EWMA_DIV; - - return old + incr; -} - -static inline int minstrel_filter_avg_add(u16 *prev_1, u16 *prev_2, s32 in) -{ - s32 out_1 = *prev_1; - s32 out_2 = *prev_2; - s32 val; - - if (!in) - in += 1; - - if (!out_1) { - val = out_1 = in; - goto out; - } - - val = MINSTREL_AVG_COEFF1 * in; - val += MINSTREL_AVG_COEFF2 * out_1; - val += MINSTREL_AVG_COEFF3 * out_2; - val >>= MINSTREL_SCALE; - - if (val > 1 << MINSTREL_SCALE) - val = 1 << MINSTREL_SCALE; - if (val < 0) - val = 1; - -out: - *prev_2 = out_1; - *prev_1 = val; - - return val; -} - -struct minstrel_rate_stats { - /* current / last sampling period attempts/success counters */ - u16 attempts, last_attempts; - u16 success, last_success; - - /* total attempts/success counters */ - u32 att_hist, succ_hist; - - /* prob_avg - moving average of prob */ - u16 prob_avg; - u16 prob_avg_1; - - /* maximum retry counts */ - u8 retry_count; - u8 retry_count_rtscts; - - u8 sample_skipped; - bool retry_updated; -}; - -struct minstrel_rate { - int bitrate; - - s8 rix; - u8 retry_count_cts; - u8 adjusted_retry_count; - - unsigned int perfect_tx_time; - unsigned int ack_time; - - int sample_limit; - - struct minstrel_rate_stats stats; -}; - -struct minstrel_sta_info { - struct ieee80211_sta *sta; - - unsigned long last_stats_update; - 
unsigned int sp_ack_dur; - unsigned int rate_avg; - - unsigned int lowest_rix; - - u8 max_tp_rate[MAX_THR_RATES]; - u8 max_prob_rate; - unsigned int total_packets; - unsigned int sample_packets; - int sample_deferred; - - unsigned int sample_row; - unsigned int sample_column; - - int n_rates; - struct minstrel_rate *r; - bool prev_sample; - - /* sampling table */ - u8 *sample_table; -}; - -struct minstrel_priv { - struct ieee80211_hw *hw; - bool has_mrr; - bool new_avg; - u32 sample_switch; - unsigned int cw_min; - unsigned int cw_max; - unsigned int max_retry; - unsigned int segment_size; - unsigned int update_interval; - unsigned int lookaround_rate; - unsigned int lookaround_rate_mrr; - - u8 cck_rates[4]; - -#ifdef CONFIG_MAC80211_DEBUGFS - /* - * enable fixed rate processing per RC - * - write static index to debugfs:ieee80211/phyX/rc/fixed_rate_idx - * - write -1 to enable RC processing again - * - setting will be applied on next update - */ - u32 fixed_rate_idx; -#endif -}; - -struct minstrel_debugfs_info { - size_t len; - char buf[]; -}; - -extern const struct rate_control_ops mac80211_minstrel; -void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); - -/* Recalculate success probabilities and counters for a given rate using EWMA */ -void minstrel_calc_rate_stats(struct minstrel_priv *mp, - struct minstrel_rate_stats *mrs); -int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_avg); - -/* debugfs */ -int minstrel_stats_open(struct inode *inode, struct file *file); -int minstrel_stats_csv_open(struct inode *inode, struct file *file); - -#endif diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c deleted file mode 100644 index 9b8e0daeb7bb..000000000000 --- a/net/mac80211/rc80211_minstrel_debugfs.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of 
the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on minstrel.c: - * Copyright (C) 2005-2007 Derek Smithies <derek@indranet.co.nz> - * Sponsored by Indranet Technologies Ltd - * - * Based on sample.c: - * Copyright (c) 2005 John Bicket - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer, - * without modification. - * 2. Redistributions in binary form must reproduce at minimum a disclaimer - * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any - * redistribution must be conditioned upon including a substantially - * similar Disclaimer requirement for further binary redistribution. - * 3. Neither the names of the above-listed copyright holders nor the names - * of any contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * NO WARRANTY - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY - * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, - * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER - * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGES. - */ -#include <linux/netdevice.h> -#include <linux/types.h> -#include <linux/skbuff.h> -#include <linux/debugfs.h> -#include <linux/ieee80211.h> -#include <linux/slab.h> -#include <linux/export.h> -#include <net/mac80211.h> -#include "rc80211_minstrel.h" - -int -minstrel_stats_open(struct inode *inode, struct file *file) -{ - struct minstrel_sta_info *mi = inode->i_private; - struct minstrel_debugfs_info *ms; - unsigned int i, tp_max, tp_avg, eprob; - char *p; - - ms = kmalloc(2048, GFP_KERNEL); - if (!ms) - return -ENOMEM; - - file->private_data = ms; - p = ms->buf; - p += sprintf(p, "\n"); - p += sprintf(p, - "best __________rate_________ ____statistics___ ____last_____ ______sum-of________\n"); - p += sprintf(p, - "rate [name idx airtime max_tp] [avg(tp) avg(prob)] [retry|suc|att] [#success | #attempts]\n"); - - for (i = 0; i < mi->n_rates; i++) { - struct minstrel_rate *mr = &mi->r[i]; - struct minstrel_rate_stats *mrs = &mi->r[i].stats; - - *(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' '; - *(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' '; - *(p++) = (i == mi->max_tp_rate[2]) ? 'C' : ' '; - *(p++) = (i == mi->max_tp_rate[3]) ? 'D' : ' '; - *(p++) = (i == mi->max_prob_rate) ? 'P' : ' '; - - p += sprintf(p, " %3u%s ", mr->bitrate / 2, - (mr->bitrate & 1 ? 
".5" : " ")); - p += sprintf(p, "%3u ", i); - p += sprintf(p, "%6u ", mr->perfect_tx_time); - - tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); - tp_avg = minstrel_get_tp_avg(mr, mrs->prob_avg); - eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); - - p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u" - " %3u %3u %-3u " - "%9llu %-9llu\n", - tp_max / 10, tp_max % 10, - tp_avg / 10, tp_avg % 10, - eprob / 10, eprob % 10, - mrs->retry_count, - mrs->last_success, - mrs->last_attempts, - (unsigned long long)mrs->succ_hist, - (unsigned long long)mrs->att_hist); - } - p += sprintf(p, "\nTotal packet count:: ideal %d " - "lookaround %d\n\n", - mi->total_packets - mi->sample_packets, - mi->sample_packets); - ms->len = p - ms->buf; - - WARN_ON(ms->len + sizeof(*ms) > 2048); - - return 0; -} - -int -minstrel_stats_csv_open(struct inode *inode, struct file *file) -{ - struct minstrel_sta_info *mi = inode->i_private; - struct minstrel_debugfs_info *ms; - unsigned int i, tp_max, tp_avg, eprob; - char *p; - - ms = kmalloc(2048, GFP_KERNEL); - if (!ms) - return -ENOMEM; - - file->private_data = ms; - p = ms->buf; - - for (i = 0; i < mi->n_rates; i++) { - struct minstrel_rate *mr = &mi->r[i]; - struct minstrel_rate_stats *mrs = &mi->r[i].stats; - - p += sprintf(p, "%s" ,((i == mi->max_tp_rate[0]) ? "A" : "")); - p += sprintf(p, "%s" ,((i == mi->max_tp_rate[1]) ? "B" : "")); - p += sprintf(p, "%s" ,((i == mi->max_tp_rate[2]) ? "C" : "")); - p += sprintf(p, "%s" ,((i == mi->max_tp_rate[3]) ? "D" : "")); - p += sprintf(p, "%s" ,((i == mi->max_prob_rate) ? "P" : "")); - - p += sprintf(p, ",%u%s", mr->bitrate / 2, - (mr->bitrate & 1 ? 
".5," : ",")); - p += sprintf(p, "%u,", i); - p += sprintf(p, "%u,",mr->perfect_tx_time); - - tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); - tp_avg = minstrel_get_tp_avg(mr, mrs->prob_avg); - eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000); - - p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u,%u," - "%llu,%llu,%d,%d\n", - tp_max / 10, tp_max % 10, - tp_avg / 10, tp_avg % 10, - eprob / 10, eprob % 10, - mrs->retry_count, - mrs->last_success, - mrs->last_attempts, - (unsigned long long)mrs->succ_hist, - (unsigned long long)mrs->att_hist, - mi->total_packets - mi->sample_packets, - mi->sample_packets); - - } - ms->len = p - ms->buf; - - WARN_ON(ms->len + sizeof(*ms) > 2048); - - return 0; -} diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index b11a2af55b06..2f44f4919789 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -13,7 +13,6 @@ #include <net/mac80211.h> #include "rate.h" #include "sta_info.h" -#include "rc80211_minstrel.h" #include "rc80211_minstrel_ht.h" #define AVG_AMPDU_SIZE 16 @@ -136,20 +135,16 @@ __VHT_GROUP(_streams, _sgi, _bw, \ VHT_GROUP_SHIFT(_streams, _sgi, _bw)) -#define CCK_DURATION(_bitrate, _short, _len) \ +#define CCK_DURATION(_bitrate, _short) \ (1000 * (10 /* SIFS */ + \ (_short ? 72 + 24 : 144 + 48) + \ - (8 * (_len + 4) * 10) / (_bitrate))) - -#define CCK_ACK_DURATION(_bitrate, _short) \ - (CCK_DURATION((_bitrate > 10 ? 
20 : 10), false, 60) + \ - CCK_DURATION(_bitrate, _short, AVG_PKT_SIZE)) + (8 * (AVG_PKT_SIZE + 4) * 10) / (_bitrate))) #define CCK_DURATION_LIST(_short, _s) \ - CCK_ACK_DURATION(10, _short) >> _s, \ - CCK_ACK_DURATION(20, _short) >> _s, \ - CCK_ACK_DURATION(55, _short) >> _s, \ - CCK_ACK_DURATION(110, _short) >> _s + CCK_DURATION(10, _short) >> _s, \ + CCK_DURATION(20, _short) >> _s, \ + CCK_DURATION(55, _short) >> _s, \ + CCK_DURATION(110, _short) >> _s #define __CCK_GROUP(_s) \ [MINSTREL_CCK_GROUP] = { \ @@ -163,10 +158,42 @@ } #define CCK_GROUP_SHIFT \ - GROUP_SHIFT(CCK_ACK_DURATION(10, false)) + GROUP_SHIFT(CCK_DURATION(10, false)) #define CCK_GROUP __CCK_GROUP(CCK_GROUP_SHIFT) +#define OFDM_DURATION(_bitrate) \ + (1000 * (16 /* SIFS + signal ext */ + \ + 16 /* T_PREAMBLE */ + \ + 4 /* T_SIGNAL */ + \ + 4 * (((16 + 80 * (AVG_PKT_SIZE + 4) + 6) / \ + ((_bitrate) * 4))))) + +#define OFDM_DURATION_LIST(_s) \ + OFDM_DURATION(60) >> _s, \ + OFDM_DURATION(90) >> _s, \ + OFDM_DURATION(120) >> _s, \ + OFDM_DURATION(180) >> _s, \ + OFDM_DURATION(240) >> _s, \ + OFDM_DURATION(360) >> _s, \ + OFDM_DURATION(480) >> _s, \ + OFDM_DURATION(540) >> _s + +#define __OFDM_GROUP(_s) \ + [MINSTREL_OFDM_GROUP] = { \ + .streams = 1, \ + .flags = 0, \ + .shift = _s, \ + .duration = { \ + OFDM_DURATION_LIST(_s), \ + } \ + } + +#define OFDM_GROUP_SHIFT \ + GROUP_SHIFT(OFDM_DURATION(60)) + +#define OFDM_GROUP __OFDM_GROUP(OFDM_GROUP_SHIFT) + static bool minstrel_vht_only = true; module_param(minstrel_vht_only, bool, 0644); @@ -203,6 +230,7 @@ const struct mcs_group minstrel_mcs_groups[] = { MCS_GROUP(4, 1, BW_40), CCK_GROUP, + OFDM_GROUP, VHT_GROUP(1, 0, BW_20), VHT_GROUP(2, 0, BW_20), @@ -235,7 +263,17 @@ const struct mcs_group minstrel_mcs_groups[] = { VHT_GROUP(4, 1, BW_80), }; +const s16 minstrel_cck_bitrates[4] = { 10, 20, 55, 110 }; +const s16 minstrel_ofdm_bitrates[8] = { 60, 90, 120, 180, 240, 360, 480, 540 }; static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] 
__read_mostly; +static const u8 minstrel_sample_seq[] = { + MINSTREL_SAMPLE_TYPE_INC, + MINSTREL_SAMPLE_TYPE_JUMP, + MINSTREL_SAMPLE_TYPE_INC, + MINSTREL_SAMPLE_TYPE_JUMP, + MINSTREL_SAMPLE_TYPE_INC, + MINSTREL_SAMPLE_TYPE_SLOW, +}; static void minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi); @@ -279,6 +317,13 @@ minstrel_get_valid_vht_rates(int bw, int nss, __le16 mcs_map) return 0x3ff & ~mask; } +static bool +minstrel_ht_is_legacy_group(int group) +{ + return group == MINSTREL_CCK_GROUP || + group == MINSTREL_OFDM_GROUP; +} + /* * Look up an MCS group index based on mac80211 rate information */ @@ -308,37 +353,74 @@ minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, if (rate->flags & IEEE80211_TX_RC_MCS) { group = minstrel_ht_get_group_idx(rate); idx = rate->idx % 8; - } else if (rate->flags & IEEE80211_TX_RC_VHT_MCS) { + goto out; + } + + if (rate->flags & IEEE80211_TX_RC_VHT_MCS) { group = minstrel_vht_get_group_idx(rate); idx = ieee80211_rate_get_vht_mcs(rate); - } else { - group = MINSTREL_CCK_GROUP; + goto out; + } - for (idx = 0; idx < ARRAY_SIZE(mp->cck_rates); idx++) - if (rate->idx == mp->cck_rates[idx]) - break; + group = MINSTREL_CCK_GROUP; + for (idx = 0; idx < ARRAY_SIZE(mp->cck_rates); idx++) { + if (rate->idx != mp->cck_rates[idx]) + continue; /* short preamble */ if ((mi->supported[group] & BIT(idx + 4)) && (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE)) - idx += 4; + idx += 4; + goto out; } + + group = MINSTREL_OFDM_GROUP; + for (idx = 0; idx < ARRAY_SIZE(mp->ofdm_rates[0]); idx++) + if (rate->idx == mp->ofdm_rates[mi->band][idx]) + goto out; + + idx = 0; +out: return &mi->groups[group].rates[idx]; } static inline struct minstrel_rate_stats * minstrel_get_ratestats(struct minstrel_ht_sta *mi, int index) { - return &mi->groups[index / MCS_GROUP_RATES].rates[index % MCS_GROUP_RATES]; + return &mi->groups[MI_RATE_GROUP(index)].rates[MI_RATE_IDX(index)]; +} + +static inline int 
minstrel_get_duration(int index) +{ + const struct mcs_group *group = &minstrel_mcs_groups[MI_RATE_GROUP(index)]; + unsigned int duration = group->duration[MI_RATE_IDX(index)]; + + return duration << group->shift; } static unsigned int minstrel_ht_avg_ampdu_len(struct minstrel_ht_sta *mi) { - if (!mi->avg_ampdu_len) - return AVG_AMPDU_SIZE; + int duration; + + if (mi->avg_ampdu_len) + return MINSTREL_TRUNC(mi->avg_ampdu_len); + + if (minstrel_ht_is_legacy_group(MI_RATE_GROUP(mi->max_tp_rate[0]))) + return 1; + + duration = minstrel_get_duration(mi->max_tp_rate[0]); - return MINSTREL_TRUNC(mi->avg_ampdu_len); + if (duration > 400 * 1000) + return 2; + + if (duration > 250 * 1000) + return 4; + + if (duration > 150 * 1000) + return 8; + + return 16; } /* @@ -349,15 +431,19 @@ int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, int prob_avg) { - unsigned int nsecs = 0; + unsigned int nsecs = 0, overhead = mi->overhead; + unsigned int ampdu_len = 1; /* do not account throughput if sucess prob is below 10% */ if (prob_avg < MINSTREL_FRAC(10, 100)) return 0; - if (group != MINSTREL_CCK_GROUP) - nsecs = 1000 * mi->overhead / minstrel_ht_avg_ampdu_len(mi); + if (minstrel_ht_is_legacy_group(group)) + overhead = mi->overhead_legacy; + else + ampdu_len = minstrel_ht_avg_ampdu_len(mi); + nsecs = 1000 * overhead / ampdu_len; nsecs += minstrel_mcs_groups[group].duration[rate] << minstrel_mcs_groups[group].shift; @@ -367,10 +453,9 @@ minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, * (prob is scaled - see MINSTREL_FRAC above) */ if (prob_avg > MINSTREL_FRAC(90, 100)) - return MINSTREL_TRUNC(100000 * ((MINSTREL_FRAC(90, 100) * 1000) - / nsecs)); - else - return MINSTREL_TRUNC(100000 * ((prob_avg * 1000) / nsecs)); + prob_avg = MINSTREL_FRAC(90, 100); + + return MINSTREL_TRUNC(100 * ((prob_avg * 1000000) / nsecs)); } /* @@ -388,14 +473,14 @@ minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u16 index, int tmp_group, tmp_idx, 
tmp_tp_avg, tmp_prob; int j = MAX_THR_RATES; - cur_group = index / MCS_GROUP_RATES; - cur_idx = index % MCS_GROUP_RATES; + cur_group = MI_RATE_GROUP(index); + cur_idx = MI_RATE_IDX(index); cur_prob = mi->groups[cur_group].rates[cur_idx].prob_avg; cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, cur_prob); do { - tmp_group = tp_list[j - 1] / MCS_GROUP_RATES; - tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES; + tmp_group = MI_RATE_GROUP(tp_list[j - 1]); + tmp_idx = MI_RATE_IDX(tp_list[j - 1]); tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); @@ -417,41 +502,50 @@ minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u16 index, * Find and set the topmost probability rate per sta and per group */ static void -minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) +minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 *dest, u16 index) { struct minstrel_mcs_group_data *mg; struct minstrel_rate_stats *mrs; int tmp_group, tmp_idx, tmp_tp_avg, tmp_prob; - int max_tp_group, cur_tp_avg, cur_group, cur_idx; + int max_tp_group, max_tp_idx, max_tp_prob; + int cur_tp_avg, cur_group, cur_idx; int max_gpr_group, max_gpr_idx; int max_gpr_tp_avg, max_gpr_prob; - cur_group = index / MCS_GROUP_RATES; - cur_idx = index % MCS_GROUP_RATES; - mg = &mi->groups[index / MCS_GROUP_RATES]; - mrs = &mg->rates[index % MCS_GROUP_RATES]; + cur_group = MI_RATE_GROUP(index); + cur_idx = MI_RATE_IDX(index); + mg = &mi->groups[cur_group]; + mrs = &mg->rates[cur_idx]; - tmp_group = mi->max_prob_rate / MCS_GROUP_RATES; - tmp_idx = mi->max_prob_rate % MCS_GROUP_RATES; + tmp_group = MI_RATE_GROUP(*dest); + tmp_idx = MI_RATE_IDX(*dest); tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); /* if max_tp_rate[0] is from MCS_GROUP max_prob_rate get selected from * MCS_GROUP as well as CCK_GROUP rates do not allow 
aggregation */ - max_tp_group = mi->max_tp_rate[0] / MCS_GROUP_RATES; - if((index / MCS_GROUP_RATES == MINSTREL_CCK_GROUP) && - (max_tp_group != MINSTREL_CCK_GROUP)) + max_tp_group = MI_RATE_GROUP(mi->max_tp_rate[0]); + max_tp_idx = MI_RATE_IDX(mi->max_tp_rate[0]); + max_tp_prob = mi->groups[max_tp_group].rates[max_tp_idx].prob_avg; + + if (minstrel_ht_is_legacy_group(MI_RATE_GROUP(index)) && + !minstrel_ht_is_legacy_group(max_tp_group)) + return; + + /* skip rates faster than max tp rate with lower prob */ + if (minstrel_get_duration(mi->max_tp_rate[0]) > minstrel_get_duration(index) && + mrs->prob_avg < max_tp_prob) return; - max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES; - max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; + max_gpr_group = MI_RATE_GROUP(mg->max_group_prob_rate); + max_gpr_idx = MI_RATE_IDX(mg->max_group_prob_rate); max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_avg; if (mrs->prob_avg > MINSTREL_FRAC(75, 100)) { cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, mrs->prob_avg); if (cur_tp_avg > tmp_tp_avg) - mi->max_prob_rate = index; + *dest = index; max_gpr_tp_avg = minstrel_ht_get_tp_avg(mi, max_gpr_group, max_gpr_idx, @@ -460,7 +554,7 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) mg->max_group_prob_rate = index; } else { if (mrs->prob_avg > tmp_prob) - mi->max_prob_rate = index; + *dest = index; if (mrs->prob_avg > max_gpr_prob) mg->max_group_prob_rate = index; } @@ -476,24 +570,24 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) static void minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi, u16 tmp_mcs_tp_rate[MAX_THR_RATES], - u16 tmp_cck_tp_rate[MAX_THR_RATES]) + u16 tmp_legacy_tp_rate[MAX_THR_RATES]) { unsigned int tmp_group, tmp_idx, tmp_cck_tp, tmp_mcs_tp, tmp_prob; int i; - tmp_group = tmp_cck_tp_rate[0] / MCS_GROUP_RATES; - tmp_idx = tmp_cck_tp_rate[0] % MCS_GROUP_RATES; + tmp_group = MI_RATE_GROUP(tmp_legacy_tp_rate[0]); + 
tmp_idx = MI_RATE_IDX(tmp_legacy_tp_rate[0]); tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_cck_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); - tmp_group = tmp_mcs_tp_rate[0] / MCS_GROUP_RATES; - tmp_idx = tmp_mcs_tp_rate[0] % MCS_GROUP_RATES; + tmp_group = MI_RATE_GROUP(tmp_mcs_tp_rate[0]); + tmp_idx = MI_RATE_IDX(tmp_mcs_tp_rate[0]); tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_mcs_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); if (tmp_cck_tp > tmp_mcs_tp) { for(i = 0; i < MAX_THR_RATES; i++) { - minstrel_ht_sort_best_tp_rates(mi, tmp_cck_tp_rate[i], + minstrel_ht_sort_best_tp_rates(mi, tmp_legacy_tp_rate[i], tmp_mcs_tp_rate); } } @@ -511,14 +605,17 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) int tmp_max_streams, group, tmp_idx, tmp_prob; int tmp_tp = 0; - tmp_max_streams = minstrel_mcs_groups[mi->max_tp_rate[0] / - MCS_GROUP_RATES].streams; + if (!mi->sta->ht_cap.ht_supported) + return; + + group = MI_RATE_GROUP(mi->max_tp_rate[0]); + tmp_max_streams = minstrel_mcs_groups[group].streams; for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { mg = &mi->groups[group]; if (!mi->supported[group] || group == MINSTREL_CCK_GROUP) continue; - tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; + tmp_idx = MI_RATE_IDX(mg->max_group_prob_rate); tmp_prob = mi->groups[group].rates[tmp_idx].prob_avg; if (tmp_tp < minstrel_ht_get_tp_avg(mi, group, tmp_idx, tmp_prob) && @@ -531,133 +628,359 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) } } +static u16 +__minstrel_ht_get_sample_rate(struct minstrel_ht_sta *mi, + enum minstrel_sample_type type) +{ + u16 *rates = mi->sample[type].sample_rates; + u16 cur; + int i; + + for (i = 0; i < MINSTREL_SAMPLE_RATES; i++) { + if (!rates[i]) + continue; + + cur = rates[i]; + rates[i] = 0; + return cur; + } + + return 0; +} + static inline int -minstrel_get_duration(int index) +minstrel_ewma(int old, int new, int 
weight) { - const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES]; - unsigned int duration = group->duration[index % MCS_GROUP_RATES]; - return duration << group->shift; + int diff, incr; + + diff = new - old; + incr = (EWMA_DIV - weight) * diff / EWMA_DIV; + + return old + incr; } -static bool -minstrel_ht_probe_group(struct minstrel_ht_sta *mi, const struct mcs_group *tp_group, - int tp_idx, const struct mcs_group *group) +static inline int minstrel_filter_avg_add(u16 *prev_1, u16 *prev_2, s32 in) { - if (group->bw < tp_group->bw) - return false; + s32 out_1 = *prev_1; + s32 out_2 = *prev_2; + s32 val; - if (group->streams == tp_group->streams) - return true; + if (!in) + in += 1; - if (tp_idx < 4 && group->streams == tp_group->streams - 1) - return true; + if (!out_1) { + val = out_1 = in; + goto out; + } + + val = MINSTREL_AVG_COEFF1 * in; + val += MINSTREL_AVG_COEFF2 * out_1; + val += MINSTREL_AVG_COEFF3 * out_2; + val >>= MINSTREL_SCALE; + + if (val > 1 << MINSTREL_SCALE) + val = 1 << MINSTREL_SCALE; + if (val < 0) + val = 1; + +out: + *prev_2 = out_1; + *prev_1 = val; - return group->streams == tp_group->streams + 1; + return val; } +/* +* Recalculate statistics and counters of a given rate +*/ static void -minstrel_ht_find_probe_rates(struct minstrel_ht_sta *mi, u16 *rates, int *n_rates, - bool faster_rate) +minstrel_ht_calc_rate_stats(struct minstrel_priv *mp, + struct minstrel_rate_stats *mrs) { - const struct mcs_group *group, *tp_group; - int i, g, max_dur; - int tp_idx; + unsigned int cur_prob; + + if (unlikely(mrs->attempts > 0)) { + cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts); + minstrel_filter_avg_add(&mrs->prob_avg, + &mrs->prob_avg_1, cur_prob); + mrs->att_hist += mrs->attempts; + mrs->succ_hist += mrs->success; + } - tp_group = &minstrel_mcs_groups[mi->max_tp_rate[0] / MCS_GROUP_RATES]; - tp_idx = mi->max_tp_rate[0] % MCS_GROUP_RATES; + mrs->last_success = mrs->success; + mrs->last_attempts = mrs->attempts; + 
mrs->success = 0; + mrs->attempts = 0; +} - max_dur = minstrel_get_duration(mi->max_tp_rate[0]); - if (faster_rate) - max_dur -= max_dur / 16; +static bool +minstrel_ht_find_sample_rate(struct minstrel_ht_sta *mi, int type, int idx) +{ + int i; - for (g = 0; g < MINSTREL_GROUPS_NB; g++) { - u16 supported = mi->supported[g]; + for (i = 0; i < MINSTREL_SAMPLE_RATES; i++) { + u16 cur = mi->sample[type].sample_rates[i]; - if (!supported) - continue; + if (cur == idx) + return true; - group = &minstrel_mcs_groups[g]; - if (!minstrel_ht_probe_group(mi, tp_group, tp_idx, group)) - continue; + if (!cur) + break; + } - for (i = 0; supported; supported >>= 1, i++) { - int idx; + return false; +} - if (!(supported & 1)) - continue; +static int +minstrel_ht_move_sample_rates(struct minstrel_ht_sta *mi, int type, + u32 fast_rate_dur, u32 slow_rate_dur) +{ + u16 *rates = mi->sample[type].sample_rates; + int i, j; - if ((group->duration[i] << group->shift) > max_dur) - continue; + for (i = 0, j = 0; i < MINSTREL_SAMPLE_RATES; i++) { + u32 duration; + bool valid = false; + u16 cur; - idx = g * MCS_GROUP_RATES + i; - if (idx == mi->max_tp_rate[0]) - continue; + cur = rates[i]; + if (!cur) + continue; - rates[(*n_rates)++] = idx; + duration = minstrel_get_duration(cur); + switch (type) { + case MINSTREL_SAMPLE_TYPE_SLOW: + valid = duration > fast_rate_dur && + duration < slow_rate_dur; + break; + case MINSTREL_SAMPLE_TYPE_INC: + case MINSTREL_SAMPLE_TYPE_JUMP: + valid = duration < fast_rate_dur; + break; + default: + valid = false; break; } + + if (!valid) { + rates[i] = 0; + continue; + } + + if (i == j) + continue; + + rates[j++] = cur; + rates[i] = 0; } + + return j; } -static void -minstrel_ht_rate_sample_switch(struct minstrel_priv *mp, - struct minstrel_ht_sta *mi) +static int +minstrel_ht_group_min_rate_offset(struct minstrel_ht_sta *mi, int group, + u32 max_duration) { - struct minstrel_rate_stats *mrs; - u16 rates[MINSTREL_GROUPS_NB]; - int n_rates = 0; - int probe_rate = 
0; - bool faster_rate; + u16 supported = mi->supported[group]; int i; - u8 random; - /* - * Use rate switching instead of probing packets for devices with - * little control over retry fallback behavior - */ - if (mp->hw->max_rates > 1) - return; + for (i = 0; i < MCS_GROUP_RATES && supported; i++, supported >>= 1) { + if (!(supported & BIT(0))) + continue; - /* - * If the current EWMA prob is >75%, look for a rate that's 6.25% - * faster than the max tp rate. - * If that fails, look again for a rate that is at least as fast - */ - mrs = minstrel_get_ratestats(mi, mi->max_tp_rate[0]); - faster_rate = mrs->prob_avg > MINSTREL_FRAC(75, 100); - minstrel_ht_find_probe_rates(mi, rates, &n_rates, faster_rate); - if (!n_rates && faster_rate) - minstrel_ht_find_probe_rates(mi, rates, &n_rates, false); - - /* If no suitable rate was found, try to pick the next one in the group */ - if (!n_rates) { - int g_idx = mi->max_tp_rate[0] / MCS_GROUP_RATES; - u16 supported = mi->supported[g_idx]; - - supported >>= mi->max_tp_rate[0] % MCS_GROUP_RATES; - for (i = 0; supported; supported >>= 1, i++) { - if (!(supported & 1)) - continue; + if (minstrel_get_duration(MI_RATE(group, i)) >= max_duration) + continue; - probe_rate = mi->max_tp_rate[0] + i; + return i; + } + + return -1; +} + +/* + * Incremental update rates: + * Flip through groups and pick the first group rate that is faster than the + * highest currently selected rate + */ +static u16 +minstrel_ht_next_inc_rate(struct minstrel_ht_sta *mi, u32 fast_rate_dur) +{ + struct minstrel_mcs_group_data *mg; + u8 type = MINSTREL_SAMPLE_TYPE_INC; + int i, index = 0; + u8 group; + + group = mi->sample[type].sample_group; + for (i = 0; i < ARRAY_SIZE(minstrel_mcs_groups); i++) { + group = (group + 1) % ARRAY_SIZE(minstrel_mcs_groups); + mg = &mi->groups[group]; + + index = minstrel_ht_group_min_rate_offset(mi, group, + fast_rate_dur); + if (index < 0) + continue; + + index = MI_RATE(group, index & 0xf); + if 
(!minstrel_ht_find_sample_rate(mi, type, index)) goto out; + } + index = 0; + +out: + mi->sample[type].sample_group = group; + + return index; +} + +static int +minstrel_ht_next_group_sample_rate(struct minstrel_ht_sta *mi, int group, + u16 supported, int offset) +{ + struct minstrel_mcs_group_data *mg = &mi->groups[group]; + u16 idx; + int i; + + for (i = 0; i < MCS_GROUP_RATES; i++) { + idx = sample_table[mg->column][mg->index]; + if (++mg->index >= MCS_GROUP_RATES) { + mg->index = 0; + if (++mg->column >= ARRAY_SIZE(sample_table)) + mg->column = 0; } - return; + if (idx < offset) + continue; + + if (!(supported & BIT(idx))) + continue; + + return MI_RATE(group, idx); } - i = 0; - if (n_rates > 1) { - random = prandom_u32(); - i = random % n_rates; + return -1; +} + +/* + * Jump rates: + * Sample random rates, use those that are faster than the highest + * currently selected rate. Rates between the fastest and the slowest + * get sorted into the slow sample bucket, but only if it has room + */ +static u16 +minstrel_ht_next_jump_rate(struct minstrel_ht_sta *mi, u32 fast_rate_dur, + u32 slow_rate_dur, int *slow_rate_ofs) +{ + struct minstrel_mcs_group_data *mg; + struct minstrel_rate_stats *mrs; + u32 max_duration = slow_rate_dur; + int i, index, offset; + u16 *slow_rates; + u16 supported; + u32 duration; + u8 group; + + if (*slow_rate_ofs >= MINSTREL_SAMPLE_RATES) + max_duration = fast_rate_dur; + + slow_rates = mi->sample[MINSTREL_SAMPLE_TYPE_SLOW].sample_rates; + group = mi->sample[MINSTREL_SAMPLE_TYPE_JUMP].sample_group; + for (i = 0; i < ARRAY_SIZE(minstrel_mcs_groups); i++) { + u8 type; + + group = (group + 1) % ARRAY_SIZE(minstrel_mcs_groups); + mg = &mi->groups[group]; + + supported = mi->supported[group]; + if (!supported) + continue; + + offset = minstrel_ht_group_min_rate_offset(mi, group, + max_duration); + if (offset < 0) + continue; + + index = minstrel_ht_next_group_sample_rate(mi, group, supported, + offset); + if (index < 0) + continue; + + 
duration = minstrel_get_duration(index); + if (duration < fast_rate_dur) + type = MINSTREL_SAMPLE_TYPE_JUMP; + else + type = MINSTREL_SAMPLE_TYPE_SLOW; + + if (minstrel_ht_find_sample_rate(mi, type, index)) + continue; + + if (type == MINSTREL_SAMPLE_TYPE_JUMP) + goto found; + + if (*slow_rate_ofs >= MINSTREL_SAMPLE_RATES) + continue; + + if (duration >= slow_rate_dur) + continue; + + /* skip slow rates with high success probability */ + mrs = minstrel_get_ratestats(mi, index); + if (mrs->prob_avg > MINSTREL_FRAC(95, 100)) + continue; + + slow_rates[(*slow_rate_ofs)++] = index; + if (*slow_rate_ofs >= MINSTREL_SAMPLE_RATES) + max_duration = fast_rate_dur; } - probe_rate = rates[i]; + index = 0; -out: - mi->sample_rate = probe_rate; - mi->sample_mode = MINSTREL_SAMPLE_ACTIVE; +found: + mi->sample[MINSTREL_SAMPLE_TYPE_JUMP].sample_group = group; + + return index; +} + +static void +minstrel_ht_refill_sample_rates(struct minstrel_ht_sta *mi) +{ + u32 prob_dur = minstrel_get_duration(mi->max_prob_rate); + u32 tp_dur = minstrel_get_duration(mi->max_tp_rate[0]); + u32 tp2_dur = minstrel_get_duration(mi->max_tp_rate[1]); + u32 fast_rate_dur = min(min(tp_dur, tp2_dur), prob_dur); + u32 slow_rate_dur = max(max(tp_dur, tp2_dur), prob_dur); + u16 *rates; + int i, j; + + rates = mi->sample[MINSTREL_SAMPLE_TYPE_INC].sample_rates; + i = minstrel_ht_move_sample_rates(mi, MINSTREL_SAMPLE_TYPE_INC, + fast_rate_dur, slow_rate_dur); + while (i < MINSTREL_SAMPLE_RATES) { + rates[i] = minstrel_ht_next_inc_rate(mi, tp_dur); + if (!rates[i]) + break; + + i++; + } + + rates = mi->sample[MINSTREL_SAMPLE_TYPE_JUMP].sample_rates; + i = minstrel_ht_move_sample_rates(mi, MINSTREL_SAMPLE_TYPE_JUMP, + fast_rate_dur, slow_rate_dur); + j = minstrel_ht_move_sample_rates(mi, MINSTREL_SAMPLE_TYPE_SLOW, + fast_rate_dur, slow_rate_dur); + while (i < MINSTREL_SAMPLE_RATES) { + rates[i] = minstrel_ht_next_jump_rate(mi, fast_rate_dur, + slow_rate_dur, &j); + if (!rates[i]) + break; + + i++; + } + + for (i 
= 0; i < ARRAY_SIZE(mi->sample); i++) + memcpy(mi->sample[i].cur_sample_rates, mi->sample[i].sample_rates, + sizeof(mi->sample[i].cur_sample_rates)); } + /* * Update rate statistics and select new primary rates * @@ -668,26 +991,15 @@ out: * higher throughput rates, even if the probablity is a bit lower */ static void -minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, - bool sample) +minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { struct minstrel_mcs_group_data *mg; struct minstrel_rate_stats *mrs; int group, i, j, cur_prob; u16 tmp_mcs_tp_rate[MAX_THR_RATES], tmp_group_tp_rate[MAX_THR_RATES]; - u16 tmp_cck_tp_rate[MAX_THR_RATES], index; - - mi->sample_mode = MINSTREL_SAMPLE_IDLE; - - if (sample) { - mi->total_packets_cur = mi->total_packets - - mi->total_packets_last; - mi->total_packets_last = mi->total_packets; - } - if (!mp->sample_switch) - sample = false; - if (mi->total_packets_cur < SAMPLE_SWITCH_THR && mp->sample_switch != 1) - sample = false; + u16 tmp_legacy_tp_rate[MAX_THR_RATES], tmp_max_prob_rate; + u16 index; + bool ht_supported = mi->sta->ht_cap.ht_supported; if (mi->ampdu_packets > 0) { if (!ieee80211_hw_check(mp->hw, TX_STATUS_NO_AMPDU_LEN)) @@ -700,65 +1012,72 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, mi->ampdu_packets = 0; } - mi->sample_slow = 0; - mi->sample_count = 0; - - memset(tmp_mcs_tp_rate, 0, sizeof(tmp_mcs_tp_rate)); - memset(tmp_cck_tp_rate, 0, sizeof(tmp_cck_tp_rate)); if (mi->supported[MINSTREL_CCK_GROUP]) - for (j = 0; j < ARRAY_SIZE(tmp_cck_tp_rate); j++) - tmp_cck_tp_rate[j] = MINSTREL_CCK_GROUP * MCS_GROUP_RATES; + group = MINSTREL_CCK_GROUP; + else if (mi->supported[MINSTREL_OFDM_GROUP]) + group = MINSTREL_OFDM_GROUP; + else + group = 0; + + index = MI_RATE(group, 0); + for (j = 0; j < ARRAY_SIZE(tmp_legacy_tp_rate); j++) + tmp_legacy_tp_rate[j] = index; if (mi->supported[MINSTREL_VHT_GROUP_0]) - index = MINSTREL_VHT_GROUP_0 * 
MCS_GROUP_RATES; + group = MINSTREL_VHT_GROUP_0; + else if (ht_supported) + group = MINSTREL_HT_GROUP_0; + else if (mi->supported[MINSTREL_CCK_GROUP]) + group = MINSTREL_CCK_GROUP; else - index = MINSTREL_HT_GROUP_0 * MCS_GROUP_RATES; + group = MINSTREL_OFDM_GROUP; + index = MI_RATE(group, 0); + tmp_max_prob_rate = index; for (j = 0; j < ARRAY_SIZE(tmp_mcs_tp_rate); j++) tmp_mcs_tp_rate[j] = index; /* Find best rate sets within all MCS groups*/ for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { + u16 *tp_rate = tmp_mcs_tp_rate; + u16 last_prob = 0; mg = &mi->groups[group]; if (!mi->supported[group]) continue; - mi->sample_count++; - /* (re)Initialize group rate indexes */ for(j = 0; j < MAX_THR_RATES; j++) - tmp_group_tp_rate[j] = MCS_GROUP_RATES * group; + tmp_group_tp_rate[j] = MI_RATE(group, 0); - for (i = 0; i < MCS_GROUP_RATES; i++) { + if (group == MINSTREL_CCK_GROUP && ht_supported) + tp_rate = tmp_legacy_tp_rate; + + for (i = MCS_GROUP_RATES - 1; i >= 0; i--) { if (!(mi->supported[group] & BIT(i))) continue; - index = MCS_GROUP_RATES * group + i; + index = MI_RATE(group, i); mrs = &mg->rates[i]; mrs->retry_updated = false; - minstrel_calc_rate_stats(mp, mrs); + minstrel_ht_calc_rate_stats(mp, mrs); + + if (mrs->att_hist) + last_prob = max(last_prob, mrs->prob_avg); + else + mrs->prob_avg = max(last_prob, mrs->prob_avg); cur_prob = mrs->prob_avg; if (minstrel_ht_get_tp_avg(mi, group, i, cur_prob) == 0) continue; /* Find max throughput rate set */ - if (group != MINSTREL_CCK_GROUP) { - minstrel_ht_sort_best_tp_rates(mi, index, - tmp_mcs_tp_rate); - } else if (group == MINSTREL_CCK_GROUP) { - minstrel_ht_sort_best_tp_rates(mi, index, - tmp_cck_tp_rate); - } + minstrel_ht_sort_best_tp_rates(mi, index, tp_rate); /* Find max throughput rate set within a group */ minstrel_ht_sort_best_tp_rates(mi, index, tmp_group_tp_rate); - - /* Find max probability rate per group and global */ - minstrel_ht_set_best_prob_rate(mi, index); } 
memcpy(mg->max_group_tp_rate, tmp_group_tp_rate, @@ -766,19 +1085,34 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, } /* Assign new rate set per sta */ - minstrel_ht_assign_best_tp_rates(mi, tmp_mcs_tp_rate, tmp_cck_tp_rate); + minstrel_ht_assign_best_tp_rates(mi, tmp_mcs_tp_rate, + tmp_legacy_tp_rate); memcpy(mi->max_tp_rate, tmp_mcs_tp_rate, sizeof(mi->max_tp_rate)); - /* Try to increase robustness of max_prob_rate*/ - minstrel_ht_prob_rate_reduce_streams(mi); + for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { + if (!mi->supported[group]) + continue; + + mg = &mi->groups[group]; + mg->max_group_prob_rate = MI_RATE(group, 0); + + for (i = 0; i < MCS_GROUP_RATES; i++) { + if (!(mi->supported[group] & BIT(i))) + continue; + + index = MI_RATE(group, i); + + /* Find max probability rate per group and global */ + minstrel_ht_set_best_prob_rate(mi, &tmp_max_prob_rate, + index); + } + } - /* try to sample all available rates during each interval */ - mi->sample_count *= 8; - if (mp->new_avg) - mi->sample_count /= 2; + mi->max_prob_rate = tmp_max_prob_rate; - if (sample) - minstrel_ht_rate_sample_switch(mp, mi); + /* Try to increase robustness of max_prob_rate*/ + minstrel_ht_prob_rate_reduce_streams(mi); + minstrel_ht_refill_sample_rates(mi); #ifdef CONFIG_MAC80211_DEBUGFS /* use fixed index if set */ @@ -786,17 +1120,20 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, for (i = 0; i < 4; i++) mi->max_tp_rate[i] = mp->fixed_rate_idx; mi->max_prob_rate = mp->fixed_rate_idx; - mi->sample_mode = MINSTREL_SAMPLE_IDLE; } #endif /* Reset update timer */ mi->last_stats_update = jiffies; + mi->sample_time = jiffies; } static bool -minstrel_ht_txstat_valid(struct minstrel_priv *mp, struct ieee80211_tx_rate *rate) +minstrel_ht_txstat_valid(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, + struct ieee80211_tx_rate *rate) { + int i; + if (rate->idx < 0) return false; @@ -807,32 +1144,15 @@ 
minstrel_ht_txstat_valid(struct minstrel_priv *mp, struct ieee80211_tx_rate *rat rate->flags & IEEE80211_TX_RC_VHT_MCS) return true; - return rate->idx == mp->cck_rates[0] || - rate->idx == mp->cck_rates[1] || - rate->idx == mp->cck_rates[2] || - rate->idx == mp->cck_rates[3]; -} - -static void -minstrel_set_next_sample_idx(struct minstrel_ht_sta *mi) -{ - struct minstrel_mcs_group_data *mg; - - for (;;) { - mi->sample_group++; - mi->sample_group %= ARRAY_SIZE(minstrel_mcs_groups); - mg = &mi->groups[mi->sample_group]; + for (i = 0; i < ARRAY_SIZE(mp->cck_rates); i++) + if (rate->idx == mp->cck_rates[i]) + return true; - if (!mi->supported[mi->sample_group]) - continue; + for (i = 0; i < ARRAY_SIZE(mp->ofdm_rates[0]); i++) + if (rate->idx == mp->ofdm_rates[mi->band][i]) + return true; - if (++mg->index >= MCS_GROUP_RATES) { - mg->index = 0; - if (++mg->column >= ARRAY_SIZE(sample_table)) - mg->column = 0; - } - break; - } + return false; } static void @@ -840,7 +1160,7 @@ minstrel_downgrade_rate(struct minstrel_ht_sta *mi, u16 *idx, bool primary) { int group, orig_group; - orig_group = group = *idx / MCS_GROUP_RATES; + orig_group = group = MI_RATE_GROUP(*idx); while (group > 0) { group--; @@ -887,21 +1207,14 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, void *priv_sta, struct ieee80211_tx_status *st) { struct ieee80211_tx_info *info = st->info; - struct minstrel_ht_sta_priv *msp = priv_sta; - struct minstrel_ht_sta *mi = &msp->ht; + struct minstrel_ht_sta *mi = priv_sta; struct ieee80211_tx_rate *ar = info->status.rates; - struct minstrel_rate_stats *rate, *rate2, *rate_sample = NULL; + struct minstrel_rate_stats *rate, *rate2; struct minstrel_priv *mp = priv; - u32 update_interval = mp->update_interval / 2; + u32 update_interval = mp->update_interval; bool last, update = false; - bool sample_status = false; int i; - if (!msp->is_ht) - return mac80211_minstrel.tx_status_ext(priv, sband, - &msp->legacy, st); - - /* This packet was 
aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) @@ -913,64 +1226,31 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, info->status.ampdu_len = 1; } - mi->ampdu_packets++; - mi->ampdu_len += info->status.ampdu_len; - - if (!mi->sample_wait && !mi->sample_tries && mi->sample_count > 0) { - int avg_ampdu_len = minstrel_ht_avg_ampdu_len(mi); - - mi->sample_wait = 16 + 2 * avg_ampdu_len; - mi->sample_tries = 1; - mi->sample_count--; + /* wraparound */ + if (mi->total_packets >= ~0 - info->status.ampdu_len) { + mi->total_packets = 0; + mi->sample_packets = 0; } + mi->total_packets += info->status.ampdu_len; if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) mi->sample_packets += info->status.ampdu_len; - if (mi->sample_mode != MINSTREL_SAMPLE_IDLE) - rate_sample = minstrel_get_ratestats(mi, mi->sample_rate); + mi->ampdu_packets++; + mi->ampdu_len += info->status.ampdu_len; - last = !minstrel_ht_txstat_valid(mp, &ar[0]); + last = !minstrel_ht_txstat_valid(mp, mi, &ar[0]); for (i = 0; !last; i++) { last = (i == IEEE80211_TX_MAX_RATES - 1) || - !minstrel_ht_txstat_valid(mp, &ar[i + 1]); + !minstrel_ht_txstat_valid(mp, mi, &ar[i + 1]); rate = minstrel_ht_get_stats(mp, mi, &ar[i]); - if (rate == rate_sample) - sample_status = true; - if (last) rate->success += info->status.ampdu_ack_len; rate->attempts += ar[i].count * info->status.ampdu_len; } - switch (mi->sample_mode) { - case MINSTREL_SAMPLE_IDLE: - if (mp->new_avg && - (mp->hw->max_rates > 1 || - mi->total_packets_cur < SAMPLE_SWITCH_THR)) - update_interval /= 2; - break; - - case MINSTREL_SAMPLE_ACTIVE: - if (!sample_status) - break; - - mi->sample_mode = MINSTREL_SAMPLE_PENDING; - update = true; - break; - - case MINSTREL_SAMPLE_PENDING: - if (sample_status) - break; - - update = true; - minstrel_ht_update_stats(mp, mi, false); - break; - } - - if (mp->hw->max_rates > 1) { /* * check for sudden death of 
spatial multiplexing, @@ -993,7 +1273,7 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, if (time_after(jiffies, mi->last_stats_update + update_interval)) { update = true; - minstrel_ht_update_stats(mp, mi, true); + minstrel_ht_update_stats(mp, mi); } if (update) @@ -1031,7 +1311,10 @@ minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, ctime += (t_slot * cw) >> 1; cw = min((cw << 1) | 1, mp->cw_max); - if (index / MCS_GROUP_RATES != MINSTREL_CCK_GROUP) { + if (minstrel_ht_is_legacy_group(MI_RATE_GROUP(index))) { + overhead = mi->overhead_legacy; + overhead_rtscts = mi->overhead_legacy_rtscts; + } else { overhead = mi->overhead; overhead_rtscts = mi->overhead_rtscts; } @@ -1061,7 +1344,8 @@ static void minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_sta_rates *ratetbl, int offset, int index) { - const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES]; + int group_idx = MI_RATE_GROUP(index); + const struct mcs_group *group = &minstrel_mcs_groups[group_idx]; struct minstrel_rate_stats *mrs; u8 idx; u16 flags = group->flags; @@ -1080,13 +1364,17 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, ratetbl->rate[offset].count_rts = mrs->retry_count_rtscts; } - if (index / MCS_GROUP_RATES == MINSTREL_CCK_GROUP) + index = MI_RATE_IDX(index); + if (group_idx == MINSTREL_CCK_GROUP) idx = mp->cck_rates[index % ARRAY_SIZE(mp->cck_rates)]; + else if (group_idx == MINSTREL_OFDM_GROUP) + idx = mp->ofdm_rates[mi->band][index % + ARRAY_SIZE(mp->ofdm_rates[0])]; else if (flags & IEEE80211_TX_RC_VHT_MCS) idx = ((group->streams - 1) << 4) | - ((index % MCS_GROUP_RATES) & 0xF); + (index & 0xF); else - idx = index % MCS_GROUP_RATES + (group->streams - 1) * 8; + idx = index + (group->streams - 1) * 8; /* enable RTS/CTS if needed: * - if station is in dynamic SMPS (and streams > 1) @@ -1106,17 +1394,17 @@ minstrel_ht_set_rate(struct 
minstrel_priv *mp, struct minstrel_ht_sta *mi, static inline int minstrel_ht_get_prob_avg(struct minstrel_ht_sta *mi, int rate) { - int group = rate / MCS_GROUP_RATES; - rate %= MCS_GROUP_RATES; + int group = MI_RATE_GROUP(rate); + rate = MI_RATE_IDX(rate); return mi->groups[group].rates[rate].prob_avg; } static int minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi) { - int group = mi->max_prob_rate / MCS_GROUP_RATES; + int group = MI_RATE_GROUP(mi->max_prob_rate); const struct mcs_group *g = &minstrel_mcs_groups[group]; - int rate = mi->max_prob_rate % MCS_GROUP_RATES; + int rate = MI_RATE_IDX(mi->max_prob_rate); unsigned int duration; /* Disable A-MSDU if max_prob_rate is bad */ @@ -1164,18 +1452,14 @@ static void minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { struct ieee80211_sta_rates *rates; - u16 first_rate = mi->max_tp_rate[0]; int i = 0; - if (mi->sample_mode == MINSTREL_SAMPLE_ACTIVE) - first_rate = mi->sample_rate; - rates = kzalloc(sizeof(*rates), GFP_ATOMIC); if (!rates) return; /* Start with max_tp_rate[0] */ - minstrel_ht_set_rate(mp, mi, rates, i++, first_rate); + minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate[0]); if (mp->hw->max_rates >= 3) { /* At least 3 tx rates supported, use max_tp_rate[1] next */ @@ -1191,102 +1475,20 @@ minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) rate_control_set_rates(mp->hw, mi->sta, rates); } -static int -minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) +static u16 +minstrel_ht_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { - struct minstrel_rate_stats *mrs; - struct minstrel_mcs_group_data *mg; - unsigned int sample_dur, sample_group, cur_max_tp_streams; - int tp_rate1, tp_rate2; - int sample_idx = 0; - - if (mp->hw->max_rates == 1 && mp->sample_switch && - (mi->total_packets_cur >= SAMPLE_SWITCH_THR || - mp->sample_switch == 1)) - return -1; - - if (mi->sample_wait > 0) { - 
mi->sample_wait--; - return -1; - } - - if (!mi->sample_tries) - return -1; - - sample_group = mi->sample_group; - mg = &mi->groups[sample_group]; - sample_idx = sample_table[mg->column][mg->index]; - minstrel_set_next_sample_idx(mi); + u8 seq; - if (!(mi->supported[sample_group] & BIT(sample_idx))) - return -1; - - mrs = &mg->rates[sample_idx]; - sample_idx += sample_group * MCS_GROUP_RATES; - - /* Set tp_rate1, tp_rate2 to the highest / second highest max_tp_rate */ - if (minstrel_get_duration(mi->max_tp_rate[0]) > - minstrel_get_duration(mi->max_tp_rate[1])) { - tp_rate1 = mi->max_tp_rate[1]; - tp_rate2 = mi->max_tp_rate[0]; + if (mp->hw->max_rates > 1) { + seq = mi->sample_seq; + mi->sample_seq = (seq + 1) % ARRAY_SIZE(minstrel_sample_seq); + seq = minstrel_sample_seq[seq]; } else { - tp_rate1 = mi->max_tp_rate[0]; - tp_rate2 = mi->max_tp_rate[1]; + seq = MINSTREL_SAMPLE_TYPE_INC; } - /* - * Sampling might add some overhead (RTS, no aggregation) - * to the frame. Hence, don't use sampling for the highest currently - * used highest throughput or probability rate. - */ - if (sample_idx == mi->max_tp_rate[0] || sample_idx == mi->max_prob_rate) - return -1; - - /* - * Do not sample if the probability is already higher than 95%, - * or if the rate is 3 times slower than the current max probability - * rate, to avoid wasting airtime. 
- */ - sample_dur = minstrel_get_duration(sample_idx); - if (mrs->prob_avg > MINSTREL_FRAC(95, 100) || - minstrel_get_duration(mi->max_prob_rate) * 3 < sample_dur) - return -1; - - - /* - * For devices with no configurable multi-rate retry, skip sampling - * below the per-group max throughput rate, and only use one sampling - * attempt per rate - */ - if (mp->hw->max_rates == 1 && - (minstrel_get_duration(mg->max_group_tp_rate[0]) < sample_dur || - mrs->attempts)) - return -1; - - /* Skip already sampled slow rates */ - if (sample_dur >= minstrel_get_duration(tp_rate1) && mrs->attempts) - return -1; - - /* - * Make sure that lower rates get sampled only occasionally, - * if the link is working perfectly. - */ - - cur_max_tp_streams = minstrel_mcs_groups[tp_rate1 / - MCS_GROUP_RATES].streams; - if (sample_dur >= minstrel_get_duration(tp_rate2) && - (cur_max_tp_streams - 1 < - minstrel_mcs_groups[sample_group].streams || - sample_dur >= minstrel_get_duration(mi->max_prob_rate))) { - if (mrs->sample_skipped < 20) - return -1; - - if (mi->sample_slow++ > 2) - return -1; - } - mi->sample_tries--; - - return sample_idx; + return __minstrel_ht_get_sample_rate(mi, seq); } static void @@ -1296,16 +1498,12 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, const struct mcs_group *sample_group; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); struct ieee80211_tx_rate *rate = &info->status.rates[0]; - struct minstrel_ht_sta_priv *msp = priv_sta; - struct minstrel_ht_sta *mi = &msp->ht; + struct minstrel_ht_sta *mi = priv_sta; struct minstrel_priv *mp = priv; - int sample_idx; - - if (!msp->is_ht) - return mac80211_minstrel.get_rate(priv, sta, &msp->legacy, txrc); + u16 sample_idx; if (!(info->flags & IEEE80211_TX_CTL_AMPDU) && - mi->max_prob_rate / MCS_GROUP_RATES != MINSTREL_CCK_GROUP) + !minstrel_ht_is_legacy_group(MI_RATE_GROUP(mi->max_prob_rate))) minstrel_aggr_check(sta, txrc->skb); info->flags |= mi->tx_flags; @@ -1318,23 +1516,18 
@@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, /* Don't use EAPOL frames for sampling on non-mrr hw */ if (mp->hw->max_rates == 1 && (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO)) - sample_idx = -1; - else - sample_idx = minstrel_get_sample_rate(mp, mi); - - mi->total_packets++; + return; - /* wraparound */ - if (mi->total_packets == ~0) { - mi->total_packets = 0; - mi->sample_packets = 0; - } + if (time_is_before_jiffies(mi->sample_time)) + return; - if (sample_idx < 0) + mi->sample_time = jiffies + MINSTREL_SAMPLE_INTERVAL; + sample_idx = minstrel_ht_get_sample_rate(mp, mi); + if (!sample_idx) return; - sample_group = &minstrel_mcs_groups[sample_idx / MCS_GROUP_RATES]; - sample_idx %= MCS_GROUP_RATES; + sample_group = &minstrel_mcs_groups[MI_RATE_GROUP(sample_idx)]; + sample_idx = MI_RATE_IDX(sample_idx); if (sample_group == &minstrel_mcs_groups[MINSTREL_CCK_GROUP] && (sample_idx >= 4) != txrc->short_preamble) @@ -1346,8 +1539,11 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, if (sample_group == &minstrel_mcs_groups[MINSTREL_CCK_GROUP]) { int idx = sample_idx % ARRAY_SIZE(mp->cck_rates); rate->idx = mp->cck_rates[idx]; + } else if (sample_group == &minstrel_mcs_groups[MINSTREL_OFDM_GROUP]) { + int idx = sample_idx % ARRAY_SIZE(mp->ofdm_rates[0]); + rate->idx = mp->ofdm_rates[mi->band][idx]; } else if (sample_group->flags & IEEE80211_TX_RC_VHT_MCS) { - ieee80211_rate_set_vht(rate, sample_idx % MCS_GROUP_RATES, + ieee80211_rate_set_vht(rate, MI_RATE_IDX(sample_idx), sample_group->streams); } else { rate->idx = sample_idx + (sample_group->streams - 1) * 8; @@ -1366,44 +1562,59 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, if (sband->band != NL80211_BAND_2GHZ) return; - if (!ieee80211_hw_check(mp->hw, SUPPORTS_HT_CCK_RATES)) + if (sta->ht_cap.ht_supported && + !ieee80211_hw_check(mp->hw, SUPPORTS_HT_CCK_RATES)) return; - mi->cck_supported = 0; - 
mi->cck_supported_short = 0; for (i = 0; i < 4; i++) { - if (!rate_supported(sta, sband->band, mp->cck_rates[i])) + if (mp->cck_rates[i] == 0xff || + !rate_supported(sta, sband->band, mp->cck_rates[i])) continue; - mi->cck_supported |= BIT(i); + mi->supported[MINSTREL_CCK_GROUP] |= BIT(i); if (sband->bitrates[i].flags & IEEE80211_RATE_SHORT_PREAMBLE) - mi->cck_supported_short |= BIT(i); + mi->supported[MINSTREL_CCK_GROUP] |= BIT(i + 4); } +} + +static void +minstrel_ht_update_ofdm(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, + struct ieee80211_supported_band *sband, + struct ieee80211_sta *sta) +{ + const u8 *rates; + int i; - mi->supported[MINSTREL_CCK_GROUP] = mi->cck_supported; + if (sta->ht_cap.ht_supported) + return; + + rates = mp->ofdm_rates[sband->band]; + for (i = 0; i < ARRAY_SIZE(mp->ofdm_rates[0]); i++) { + if (rates[i] == 0xff || + !rate_supported(sta, sband->band, rates[i])) + continue; + + mi->supported[MINSTREL_OFDM_GROUP] |= BIT(i); + } } static void minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, struct cfg80211_chan_def *chandef, - struct ieee80211_sta *sta, void *priv_sta) + struct ieee80211_sta *sta, void *priv_sta) { struct minstrel_priv *mp = priv; - struct minstrel_ht_sta_priv *msp = priv_sta; - struct minstrel_ht_sta *mi = &msp->ht; + struct minstrel_ht_sta *mi = priv_sta; struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs; u16 ht_cap = sta->ht_cap.cap; struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; + const struct ieee80211_rate *ctl_rate; + bool ldpc, erp; int use_vht; int n_supported = 0; int ack_dur; int stbc; int i; - bool ldpc; - - /* fall back to the old minstrel for legacy stations */ - if (!sta->ht_cap.ht_supported) - goto use_legacy; BUILD_BUG_ON(ARRAY_SIZE(minstrel_mcs_groups) != MINSTREL_GROUPS_NB); @@ -1412,10 +1623,10 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, else use_vht = 0; - msp->is_ht = true; memset(mi, 0, sizeof(*mi)); mi->sta = sta; + 
mi->band = sband->band; mi->last_stats_update = jiffies; ack_dur = ieee80211_frame_duration(sband->band, 10, 60, 1, 1, 0); @@ -1423,17 +1634,15 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, mi->overhead += ack_dur; mi->overhead_rtscts = mi->overhead + 2 * ack_dur; - mi->avg_ampdu_len = MINSTREL_FRAC(1, 1); + ctl_rate = &sband->bitrates[rate_lowest_index(sband, sta)]; + erp = ctl_rate->flags & IEEE80211_RATE_ERP_G; + ack_dur = ieee80211_frame_duration(sband->band, 10, + ctl_rate->bitrate, erp, 1, + ieee80211_chandef_get_shift(chandef)); + mi->overhead_legacy = ack_dur; + mi->overhead_legacy_rtscts = mi->overhead_legacy + 2 * ack_dur; - /* When using MRR, sample more on the first attempt, without delay */ - if (mp->has_mrr) { - mi->sample_count = 16; - mi->sample_wait = 0; - } else { - mi->sample_count = 8; - mi->sample_wait = 8; - } - mi->sample_tries = 4; + mi->avg_ampdu_len = MINSTREL_FRAC(1, 1); if (!use_vht) { stbc = (ht_cap & IEEE80211_HT_CAP_RX_STBC) >> @@ -1456,10 +1665,8 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, int bw, nss; mi->supported[i] = 0; - if (i == MINSTREL_CCK_GROUP) { - minstrel_ht_update_cck(mp, mi, sband, sta); + if (minstrel_ht_is_legacy_group(i)) continue; - } if (gflags & IEEE80211_TX_RC_SHORT_GI) { if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH) { @@ -1520,24 +1727,12 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, n_supported++; } - if (!n_supported) - goto use_legacy; - - mi->supported[MINSTREL_CCK_GROUP] |= mi->cck_supported_short << 4; + minstrel_ht_update_cck(mp, mi, sband, sta); + minstrel_ht_update_ofdm(mp, mi, sband, sta); /* create an initial rate table with the lowest supported rates */ - minstrel_ht_update_stats(mp, mi, true); + minstrel_ht_update_stats(mp, mi); minstrel_ht_update_rates(mp, mi); - - return; - -use_legacy: - msp->is_ht = false; - memset(&msp->legacy, 0, sizeof(msp->legacy)); - msp->legacy.r = msp->ratelist; - 
msp->legacy.sample_table = msp->sample_table; - return mac80211_minstrel.rate_init(priv, sband, chandef, sta, - &msp->legacy); } static void @@ -1561,7 +1756,7 @@ static void * minstrel_ht_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp) { struct ieee80211_supported_band *sband; - struct minstrel_ht_sta_priv *msp; + struct minstrel_ht_sta *mi; struct minstrel_priv *mp = priv; struct ieee80211_hw *hw = mp->hw; int max_rates = 0; @@ -1573,91 +1768,91 @@ minstrel_ht_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp) max_rates = sband->n_bitrates; } - msp = kzalloc(sizeof(*msp), gfp); - if (!msp) - return NULL; - - msp->ratelist = kcalloc(max_rates, sizeof(struct minstrel_rate), gfp); - if (!msp->ratelist) - goto error; - - msp->sample_table = kmalloc_array(max_rates, SAMPLE_COLUMNS, gfp); - if (!msp->sample_table) - goto error1; - - return msp; - -error1: - kfree(msp->ratelist); -error: - kfree(msp); - return NULL; + return kzalloc(sizeof(*mi), gfp); } static void minstrel_ht_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta) { - struct minstrel_ht_sta_priv *msp = priv_sta; - - kfree(msp->sample_table); - kfree(msp->ratelist); - kfree(msp); + kfree(priv_sta); } static void -minstrel_ht_init_cck_rates(struct minstrel_priv *mp) +minstrel_ht_fill_rate_array(u8 *dest, struct ieee80211_supported_band *sband, + const s16 *bitrates, int n_rates, u32 rate_flags) { - static const int bitrates[4] = { 10, 20, 55, 110 }; - struct ieee80211_supported_band *sband; - u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); int i, j; - sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ]; - if (!sband) - return; - for (i = 0; i < sband->n_bitrates; i++) { struct ieee80211_rate *rate = &sband->bitrates[i]; - if (rate->flags & IEEE80211_RATE_ERP_G) - continue; - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; - for (j = 0; j < ARRAY_SIZE(bitrates); j++) { + for (j = 0; j < n_rates; j++) { if (rate->bitrate != bitrates[j]) 
continue; - mp->cck_rates[j] = i; + dest[j] = i; break; } } } +static void +minstrel_ht_init_cck_rates(struct minstrel_priv *mp) +{ + static const s16 bitrates[4] = { 10, 20, 55, 110 }; + struct ieee80211_supported_band *sband; + u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); + + memset(mp->cck_rates, 0xff, sizeof(mp->cck_rates)); + sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ]; + if (!sband) + return; + + BUILD_BUG_ON(ARRAY_SIZE(mp->cck_rates) != ARRAY_SIZE(bitrates)); + minstrel_ht_fill_rate_array(mp->cck_rates, sband, + minstrel_cck_bitrates, + ARRAY_SIZE(minstrel_cck_bitrates), + rate_flags); +} + +static void +minstrel_ht_init_ofdm_rates(struct minstrel_priv *mp, enum nl80211_band band) +{ + static const s16 bitrates[8] = { 60, 90, 120, 180, 240, 360, 480, 540 }; + struct ieee80211_supported_band *sband; + u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); + + memset(mp->ofdm_rates[band], 0xff, sizeof(mp->ofdm_rates[band])); + sband = mp->hw->wiphy->bands[band]; + if (!sband) + return; + + BUILD_BUG_ON(ARRAY_SIZE(mp->ofdm_rates[band]) != ARRAY_SIZE(bitrates)); + minstrel_ht_fill_rate_array(mp->ofdm_rates[band], sband, + minstrel_ofdm_bitrates, + ARRAY_SIZE(minstrel_ofdm_bitrates), + rate_flags); +} + static void * minstrel_ht_alloc(struct ieee80211_hw *hw) { struct minstrel_priv *mp; + int i; mp = kzalloc(sizeof(struct minstrel_priv), GFP_ATOMIC); if (!mp) return NULL; - mp->sample_switch = -1; - /* contention window settings * Just an approximation. 
Using the per-queue values would complicate * the calculations and is probably unnecessary */ mp->cw_min = 15; mp->cw_max = 1023; - /* number of packets (in %) to use for sampling other rates - * sample less often for non-mrr packets, because the overhead - * is much higher than with mrr */ - mp->lookaround_rate = 5; - mp->lookaround_rate_mrr = 10; - /* maximum time that the hw is allowed to stay in one MRR segment */ mp->segment_size = 6000; @@ -1671,10 +1866,11 @@ minstrel_ht_alloc(struct ieee80211_hw *hw) mp->has_mrr = true; mp->hw = hw; - mp->update_interval = HZ / 10; - mp->new_avg = true; + mp->update_interval = HZ / 20; minstrel_ht_init_cck_rates(mp); + for (i = 0; i < ARRAY_SIZE(mp->hw->wiphy->bands); i++) + minstrel_ht_init_ofdm_rates(mp, i); return mp; } @@ -1688,10 +1884,6 @@ static void minstrel_ht_add_debugfs(struct ieee80211_hw *hw, void *priv, mp->fixed_rate_idx = (u32) -1; debugfs_create_u32("fixed_rate_idx", S_IRUGO | S_IWUGO, debugfsdir, &mp->fixed_rate_idx); - debugfs_create_u32("sample_switch", S_IRUGO | S_IWUSR, debugfsdir, - &mp->sample_switch); - debugfs_create_bool("new_avg", S_IRUGO | S_IWUSR, debugfsdir, - &mp->new_avg); } #endif @@ -1703,15 +1895,11 @@ minstrel_ht_free(void *priv) static u32 minstrel_ht_get_expected_throughput(void *priv_sta) { - struct minstrel_ht_sta_priv *msp = priv_sta; - struct minstrel_ht_sta *mi = &msp->ht; + struct minstrel_ht_sta *mi = priv_sta; int i, j, prob, tp_avg; - if (!msp->is_ht) - return mac80211_minstrel.get_expected_throughput(priv_sta); - - i = mi->max_tp_rate[0] / MCS_GROUP_RATES; - j = mi->max_tp_rate[0] % MCS_GROUP_RATES; + i = MI_RATE_GROUP(mi->max_tp_rate[0]); + j = MI_RATE_IDX(mi->max_tp_rate[0]); prob = mi->groups[i].rates[j].prob_avg; /* convert tp_avg from pkt per second in kbps */ diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h index 53ea3c29debf..06e7126727ad 100644 --- a/net/mac80211/rc80211_minstrel_ht.h +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -6,6 
+6,35 @@ #ifndef __RC_MINSTREL_HT_H #define __RC_MINSTREL_HT_H +#include <linux/bitfield.h> + +/* number of highest throughput rates to consider*/ +#define MAX_THR_RATES 4 +#define SAMPLE_COLUMNS 10 /* number of columns in sample table */ + +/* scaled fraction values */ +#define MINSTREL_SCALE 12 +#define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div) +#define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE) + +#define EWMA_LEVEL 96 /* ewma weighting factor [/EWMA_DIV] */ +#define EWMA_DIV 128 + +/* + * Coefficients for moving average with noise filter (period=16), + * scaled by 10 bits + * + * a1 = exp(-pi * sqrt(2) / period) + * coeff2 = 2 * a1 * cos(sqrt(2) * 2 * pi / period) + * coeff3 = -sqr(a1) + * coeff1 = 1 - coeff2 - coeff3 + */ +#define MINSTREL_AVG_COEFF1 (MINSTREL_FRAC(1, 1) - \ + MINSTREL_AVG_COEFF2 - \ + MINSTREL_AVG_COEFF3) +#define MINSTREL_AVG_COEFF2 0x00001499 +#define MINSTREL_AVG_COEFF3 -0x0000092e + /* * The number of streams can be changed to 2 to reduce code * size and memory footprint. 
@@ -18,17 +47,55 @@ MINSTREL_HT_STREAM_GROUPS) #define MINSTREL_VHT_GROUPS_NB (MINSTREL_MAX_STREAMS * \ MINSTREL_VHT_STREAM_GROUPS) -#define MINSTREL_CCK_GROUPS_NB 1 +#define MINSTREL_LEGACY_GROUPS_NB 2 #define MINSTREL_GROUPS_NB (MINSTREL_HT_GROUPS_NB + \ MINSTREL_VHT_GROUPS_NB + \ - MINSTREL_CCK_GROUPS_NB) + MINSTREL_LEGACY_GROUPS_NB) #define MINSTREL_HT_GROUP_0 0 #define MINSTREL_CCK_GROUP (MINSTREL_HT_GROUP_0 + MINSTREL_HT_GROUPS_NB) -#define MINSTREL_VHT_GROUP_0 (MINSTREL_CCK_GROUP + 1) +#define MINSTREL_OFDM_GROUP (MINSTREL_CCK_GROUP + 1) +#define MINSTREL_VHT_GROUP_0 (MINSTREL_OFDM_GROUP + 1) #define MCS_GROUP_RATES 10 +#define MI_RATE_IDX_MASK GENMASK(3, 0) +#define MI_RATE_GROUP_MASK GENMASK(15, 4) + +#define MI_RATE(_group, _idx) \ + (FIELD_PREP(MI_RATE_GROUP_MASK, _group) | \ + FIELD_PREP(MI_RATE_IDX_MASK, _idx)) + +#define MI_RATE_IDX(_rate) FIELD_GET(MI_RATE_IDX_MASK, _rate) +#define MI_RATE_GROUP(_rate) FIELD_GET(MI_RATE_GROUP_MASK, _rate) + +#define MINSTREL_SAMPLE_RATES 5 /* rates per sample type */ +#define MINSTREL_SAMPLE_INTERVAL (HZ / 50) + +struct minstrel_priv { + struct ieee80211_hw *hw; + bool has_mrr; + unsigned int cw_min; + unsigned int cw_max; + unsigned int max_retry; + unsigned int segment_size; + unsigned int update_interval; + + u8 cck_rates[4]; + u8 ofdm_rates[NUM_NL80211_BANDS][8]; + +#ifdef CONFIG_MAC80211_DEBUGFS + /* + * enable fixed rate processing per RC + * - write static index to debugfs:ieee80211/phyX/rc/fixed_rate_idx + * - write -1 to enable RC processing again + * - setting will be applied on next update + */ + u32 fixed_rate_idx; +#endif +}; + + struct mcs_group { u16 flags; u8 streams; @@ -37,8 +104,36 @@ struct mcs_group { u16 duration[MCS_GROUP_RATES]; }; +extern const s16 minstrel_cck_bitrates[4]; +extern const s16 minstrel_ofdm_bitrates[8]; extern const struct mcs_group minstrel_mcs_groups[]; +struct minstrel_rate_stats { + /* current / last sampling period attempts/success counters */ + u16 attempts, 
last_attempts; + u16 success, last_success; + + /* total attempts/success counters */ + u32 att_hist, succ_hist; + + /* prob_avg - moving average of prob */ + u16 prob_avg; + u16 prob_avg_1; + + /* maximum retry counts */ + u8 retry_count; + u8 retry_count_rtscts; + + bool retry_updated; +}; + +enum minstrel_sample_type { + MINSTREL_SAMPLE_TYPE_INC, + MINSTREL_SAMPLE_TYPE_JUMP, + MINSTREL_SAMPLE_TYPE_SLOW, + __MINSTREL_SAMPLE_TYPE_MAX +}; + struct minstrel_mcs_group_data { u8 index; u8 column; @@ -51,10 +146,10 @@ struct minstrel_mcs_group_data { struct minstrel_rate_stats rates[MCS_GROUP_RATES]; }; -enum minstrel_sample_mode { - MINSTREL_SAMPLE_IDLE, - MINSTREL_SAMPLE_ACTIVE, - MINSTREL_SAMPLE_PENDING, +struct minstrel_sample_category { + u8 sample_group; + u16 sample_rates[MINSTREL_SAMPLE_RATES]; + u16 cur_sample_rates[MINSTREL_SAMPLE_RATES]; }; struct minstrel_ht_sta { @@ -77,28 +172,22 @@ struct minstrel_ht_sta { /* overhead time in usec for each frame */ unsigned int overhead; unsigned int overhead_rtscts; + unsigned int overhead_legacy; + unsigned int overhead_legacy_rtscts; - unsigned int total_packets_last; - unsigned int total_packets_cur; unsigned int total_packets; unsigned int sample_packets; /* tx flags to add for frames for this sta */ u32 tx_flags; - u8 sample_wait; - u8 sample_tries; - u8 sample_count; - u8 sample_slow; + u8 band; - enum minstrel_sample_mode sample_mode; + u8 sample_seq; u16 sample_rate; - /* current MCS group to be sampled */ - u8 sample_group; - - u8 cck_supported; - u8 cck_supported_short; + unsigned long sample_time; + struct minstrel_sample_category sample[__MINSTREL_SAMPLE_TYPE_MAX]; /* Bitfield of supported MCS rates of all groups */ u16 supported[MINSTREL_GROUPS_NB]; @@ -107,16 +196,6 @@ struct minstrel_ht_sta { struct minstrel_mcs_group_data groups[MINSTREL_GROUPS_NB]; }; -struct minstrel_ht_sta_priv { - union { - struct minstrel_ht_sta ht; - struct minstrel_sta_info legacy; - }; - void *ratelist; - void *sample_table; - 
bool is_ht; -}; - void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, int prob_avg); diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c index bebb71917742..25b8a67a63a4 100644 --- a/net/mac80211/rc80211_minstrel_ht_debugfs.c +++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -9,9 +9,13 @@ #include <linux/ieee80211.h> #include <linux/export.h> #include <net/mac80211.h> -#include "rc80211_minstrel.h" #include "rc80211_minstrel_ht.h" +struct minstrel_debugfs_info { + size_t len; + char buf[]; +}; + static ssize_t minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { @@ -28,6 +32,18 @@ minstrel_stats_release(struct inode *inode, struct file *file) return 0; } +static bool +minstrel_ht_is_sample_rate(struct minstrel_ht_sta *mi, int idx) +{ + int type, i; + + for (type = 0; type < ARRAY_SIZE(mi->sample); type++) + for (i = 0; i < MINSTREL_SAMPLE_RATES; i++) + if (mi->sample[type].cur_sample_rates[i] == idx) + return true; + return false; +} + static char * minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) { @@ -52,8 +68,7 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) for (j = 0; j < MCS_GROUP_RATES; j++) { struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j]; - static const int bitrates[4] = { 10, 20, 55, 110 }; - int idx = i * MCS_GROUP_RATES + j; + int idx = MI_RATE(i, j); unsigned int duration; if (!(mi->supported[i] & BIT(j))) @@ -67,6 +82,9 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) p += sprintf(p, "VHT%c0 ", htmode); p += sprintf(p, "%cGI ", gimode); p += sprintf(p, "%d ", mg->streams); + } else if (i == MINSTREL_OFDM_GROUP) { + p += sprintf(p, "OFDM "); + p += sprintf(p, "1 "); } else { p += sprintf(p, "CCK "); p += sprintf(p, "%cP ", j < 4 ? 
'L' : 'S'); @@ -78,13 +96,19 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) *(p++) = (idx == mi->max_tp_rate[2]) ? 'C' : ' '; *(p++) = (idx == mi->max_tp_rate[3]) ? 'D' : ' '; *(p++) = (idx == mi->max_prob_rate) ? 'P' : ' '; + *(p++) = minstrel_ht_is_sample_rate(mi, idx) ? 'S' : ' '; if (gflags & IEEE80211_TX_RC_MCS) { p += sprintf(p, " MCS%-2u", (mg->streams - 1) * 8 + j); } else if (gflags & IEEE80211_TX_RC_VHT_MCS) { p += sprintf(p, " MCS%-1u/%1u", j, mg->streams); } else { - int r = bitrates[j % 4]; + int r; + + if (i == MINSTREL_OFDM_GROUP) + r = minstrel_ofdm_bitrates[j % 8]; + else + r = minstrel_cck_bitrates[j % 4]; p += sprintf(p, " %2u.%1uM", r / 10, r % 10); } @@ -120,20 +144,11 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) static int minstrel_ht_stats_open(struct inode *inode, struct file *file) { - struct minstrel_ht_sta_priv *msp = inode->i_private; - struct minstrel_ht_sta *mi = &msp->ht; + struct minstrel_ht_sta *mi = inode->i_private; struct minstrel_debugfs_info *ms; unsigned int i; - int ret; char *p; - if (!msp->is_ht) { - inode->i_private = &msp->legacy; - ret = minstrel_stats_open(inode, file); - inode->i_private = msp; - return ret; - } - ms = kmalloc(32768, GFP_KERNEL); if (!ms) return -ENOMEM; @@ -143,9 +158,9 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file) p += sprintf(p, "\n"); p += sprintf(p, - " best ____________rate__________ ____statistics___ _____last____ ______sum-of________\n"); + " best ____________rate__________ ____statistics___ _____last____ ______sum-of________\n"); p += sprintf(p, - "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob)] [retry|suc|att] [#success | #attempts]\n"); + "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob)] [retry|suc|att] [#success | #attempts]\n"); p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p); for (i = 0; i < MINSTREL_CCK_GROUP; i++) @@ -199,8 +214,7 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta 
*mi, int i, char *p) for (j = 0; j < MCS_GROUP_RATES; j++) { struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j]; - static const int bitrates[4] = { 10, 20, 55, 110 }; - int idx = i * MCS_GROUP_RATES + j; + int idx = MI_RATE(i, j); unsigned int duration; if (!(mi->supported[i] & BIT(j))) @@ -214,6 +228,8 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) p += sprintf(p, "VHT%c0,", htmode); p += sprintf(p, "%cGI,", gimode); p += sprintf(p, "%d,", mg->streams); + } else if (i == MINSTREL_OFDM_GROUP) { + p += sprintf(p, "OFDM,,1,"); } else { p += sprintf(p, "CCK,"); p += sprintf(p, "%cP,", j < 4 ? 'L' : 'S'); @@ -225,13 +241,20 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) p += sprintf(p, "%s" ,((idx == mi->max_tp_rate[2]) ? "C" : "")); p += sprintf(p, "%s" ,((idx == mi->max_tp_rate[3]) ? "D" : "")); p += sprintf(p, "%s" ,((idx == mi->max_prob_rate) ? "P" : "")); + p += sprintf(p, "%s", (minstrel_ht_is_sample_rate(mi, idx) ? "S" : "")); if (gflags & IEEE80211_TX_RC_MCS) { p += sprintf(p, ",MCS%-2u,", (mg->streams - 1) * 8 + j); } else if (gflags & IEEE80211_TX_RC_VHT_MCS) { p += sprintf(p, ",MCS%-1u/%1u,", j, mg->streams); } else { - int r = bitrates[j % 4]; + int r; + + if (i == MINSTREL_OFDM_GROUP) + r = minstrel_ofdm_bitrates[j % 8]; + else + r = minstrel_cck_bitrates[j % 4]; + p += sprintf(p, ",%2u.%1uM,", r / 10, r % 10); } @@ -270,22 +293,12 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) static int minstrel_ht_stats_csv_open(struct inode *inode, struct file *file) { - struct minstrel_ht_sta_priv *msp = inode->i_private; - struct minstrel_ht_sta *mi = &msp->ht; + struct minstrel_ht_sta *mi = inode->i_private; struct minstrel_debugfs_info *ms; unsigned int i; - int ret; char *p; - if (!msp->is_ht) { - inode->i_private = &msp->legacy; - ret = minstrel_stats_csv_open(inode, file); - inode->i_private = msp; - return ret; - } - ms = kmalloc(32768, GFP_KERNEL); - if (!ms) return 
-ENOMEM; @@ -316,10 +329,8 @@ static const struct file_operations minstrel_ht_stat_csv_fops = { void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir) { - struct minstrel_ht_sta_priv *msp = priv_sta; - - debugfs_create_file("rc_stats", 0444, dir, msp, + debugfs_create_file("rc_stats", 0444, dir, priv_sta, &minstrel_ht_stat_fops); - debugfs_create_file("rc_stats_csv", 0444, dir, msp, + debugfs_create_file("rc_stats_csv", 0444, dir, priv_sta, &minstrel_ht_stat_csv_fops); } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 1e2e5a406d58..c1343c028b76 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -32,16 +32,6 @@ #include "wme.h" #include "rate.h" -static inline void ieee80211_rx_stats(struct net_device *dev, u32 len) -{ - struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); - - u64_stats_update_begin(&tstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += len; - u64_stats_update_end(&tstats->syncp); -} - /* * monitor mode reception * @@ -842,7 +832,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, if (skb) { skb->dev = sdata->dev; - ieee80211_rx_stats(skb->dev, skb->len); + dev_sw_netstats_rx_add(skb->dev, skb->len); netif_receive_skb(skb); } } @@ -1477,7 +1467,6 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) if (unlikely((ieee80211_is_data(hdr->frame_control) || ieee80211_is_pspoll(hdr->frame_control)) && rx->sdata->vif.type != NL80211_IFTYPE_ADHOC && - rx->sdata->vif.type != NL80211_IFTYPE_WDS && rx->sdata->vif.type != NL80211_IFTYPE_OCB && (!rx->sta || !test_sta_flag(rx->sta, WLAN_STA_ASSOC)))) { /* @@ -1758,7 +1747,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) } else if (rx->sdata->vif.type == NL80211_IFTYPE_OCB) { sta->rx_stats.last_rx = jiffies; } else if (!ieee80211_is_s1g_beacon(hdr->frame_control) && - is_multicast_ether_addr(hdr->addr1)) { + !is_multicast_ether_addr(hdr->addr1)) { /* * Mesh beacons will update last_rx when if they are found to * 
match the current local configuration when processed. @@ -2560,7 +2549,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) skb = rx->skb; xmit_skb = NULL; - ieee80211_rx_stats(dev, skb->len); + dev_sw_netstats_rx_add(dev, skb->len); if (rx->sta) { /* The seqno index has the same property as needed @@ -3699,7 +3688,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, } prev_dev = sdata->dev; - ieee80211_rx_stats(sdata->dev, skb->len); + dev_sw_netstats_rx_add(sdata->dev, skb->len); } if (prev_dev) { @@ -4080,10 +4069,6 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) return false; return true; - case NL80211_IFTYPE_WDS: - if (bssid || !ieee80211_is_data(hdr->frame_control)) - return false; - return ether_addr_equal(sdata->u.wds.remote_addr, hdr->addr2); case NL80211_IFTYPE_P2P_DEVICE: return ieee80211_is_public_action(hdr, skb->len) || ieee80211_is_probe_req(hdr->frame_control) || @@ -4110,7 +4095,9 @@ void ieee80211_check_fast_rx(struct sta_info *sta) .vif_type = sdata->vif.type, .control_port_protocol = sdata->control_port_protocol, }, *old, *new = NULL; + bool set_offload = false; bool assign = false; + bool offload; /* use sparse to check that we don't return without updating */ __acquire(check_fast_rx); @@ -4191,6 +4178,8 @@ void ieee80211_check_fast_rx(struct sta_info *sta) rcu_read_lock(); key = rcu_dereference(sta->ptk[sta->ptk_idx]); + if (!key) + key = rcu_dereference(sdata->default_unicast_key); if (key) { switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_TKIP: @@ -4221,6 +4210,17 @@ void ieee80211_check_fast_rx(struct sta_info *sta) if (assign) new = kmemdup(&fastrx, sizeof(fastrx), GFP_KERNEL); + offload = assign && + (sdata->vif.offload_flags & IEEE80211_OFFLOAD_DECAP_ENABLED); + + if (offload) + set_offload = !test_and_set_sta_flag(sta, WLAN_STA_DECAP_OFFLOAD); + else + set_offload = test_and_clear_sta_flag(sta, WLAN_STA_DECAP_OFFLOAD); + + if (set_offload) + drv_sta_set_decap_offload(local, sdata, 
&sta->sta, assign); + spin_lock_bh(&sta->lock); old = rcu_dereference_protected(sta->fast_rx, true); rcu_assign_pointer(sta->fast_rx, new); @@ -4267,6 +4267,104 @@ void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata) mutex_unlock(&local->sta_mtx); } +static void ieee80211_rx_8023(struct ieee80211_rx_data *rx, + struct ieee80211_fast_rx *fast_rx, + int orig_len) +{ + struct ieee80211_sta_rx_stats *stats; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + struct sta_info *sta = rx->sta; + struct sk_buff *skb = rx->skb; + void *sa = skb->data + ETH_ALEN; + void *da = skb->data; + + stats = &sta->rx_stats; + if (fast_rx->uses_rss) + stats = this_cpu_ptr(sta->pcpu_rx_stats); + + /* statistics part of ieee80211_rx_h_sta_process() */ + if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { + stats->last_signal = status->signal; + if (!fast_rx->uses_rss) + ewma_signal_add(&sta->rx_stats_avg.signal, + -status->signal); + } + + if (status->chains) { + int i; + + stats->chains = status->chains; + for (i = 0; i < ARRAY_SIZE(status->chain_signal); i++) { + int signal = status->chain_signal[i]; + + if (!(status->chains & BIT(i))) + continue; + + stats->chain_signal_last[i] = signal; + if (!fast_rx->uses_rss) + ewma_signal_add(&sta->rx_stats_avg.chain_signal[i], + -signal); + } + } + /* end of statistics */ + + stats->last_rx = jiffies; + stats->last_rate = sta_stats_encode_rate(status); + + stats->fragments++; + stats->packets++; + + skb->dev = fast_rx->dev; + + dev_sw_netstats_rx_add(fast_rx->dev, skb->len); + + /* The seqno index has the same property as needed + * for the rx_msdu field, i.e. it is IEEE80211_NUM_TIDS + * for non-QoS-data frames. Here we know it's a data + * frame, so count MSDUs. 
+ */ + u64_stats_update_begin(&stats->syncp); + stats->msdu[rx->seqno_idx]++; + stats->bytes += orig_len; + u64_stats_update_end(&stats->syncp); + + if (fast_rx->internal_forward) { + struct sk_buff *xmit_skb = NULL; + if (is_multicast_ether_addr(da)) { + xmit_skb = skb_copy(skb, GFP_ATOMIC); + } else if (!ether_addr_equal(da, sa) && + sta_info_get(rx->sdata, da)) { + xmit_skb = skb; + skb = NULL; + } + + if (xmit_skb) { + /* + * Send to wireless media and increase priority by 256 + * to keep the received priority instead of + * reclassifying the frame (see cfg80211_classify8021d). + */ + xmit_skb->priority += 256; + xmit_skb->protocol = htons(ETH_P_802_3); + skb_reset_network_header(xmit_skb); + skb_reset_mac_header(xmit_skb); + dev_queue_xmit(xmit_skb); + } + + if (!skb) + return; + } + + /* deliver to local stack */ + skb->protocol = eth_type_trans(skb, fast_rx->dev); + memset(skb->cb, 0, sizeof(skb->cb)); + if (rx->list) + list_add_tail(&skb->list, rx->list); + else + netif_receive_skb(skb); + +} + static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, struct ieee80211_fast_rx *fast_rx) { @@ -4287,9 +4385,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, } addrs __aligned(2); struct ieee80211_sta_rx_stats *stats = &sta->rx_stats; - if (fast_rx->uses_rss) - stats = this_cpu_ptr(sta->pcpu_rx_stats); - /* for parallel-rx, we need to have DUP_VALIDATED, otherwise we write * to a common data structure; drivers can implement that per queue * but we don't have that information in mac80211 @@ -4363,32 +4458,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, pskb_trim(skb, skb->len - fast_rx->icv_len)) goto drop; - /* statistics part of ieee80211_rx_h_sta_process() */ - if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { - stats->last_signal = status->signal; - if (!fast_rx->uses_rss) - ewma_signal_add(&sta->rx_stats_avg.signal, - -status->signal); - } - - if (status->chains) { - int i; - - stats->chains = 
status->chains; - for (i = 0; i < ARRAY_SIZE(status->chain_signal); i++) { - int signal = status->chain_signal[i]; - - if (!(status->chains & BIT(i))) - continue; - - stats->chain_signal_last[i] = signal; - if (!fast_rx->uses_rss) - ewma_signal_add(&sta->rx_stats_avg.chain_signal[i], - -signal); - } - } - /* end of statistics */ - if (rx->key && !ieee80211_has_protected(hdr->frame_control)) goto drop; @@ -4400,12 +4469,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, return true; } - stats->last_rx = jiffies; - stats->last_rate = sta_stats_encode_rate(status); - - stats->fragments++; - stats->packets++; - /* do the header conversion - first grab the addresses */ ether_addr_copy(addrs.da, skb->data + fast_rx->da_offs); ether_addr_copy(addrs.sa, skb->data + fast_rx->sa_offs); @@ -4414,58 +4477,14 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, /* push the addresses in front */ memcpy(skb_push(skb, sizeof(addrs)), &addrs, sizeof(addrs)); - skb->dev = fast_rx->dev; - - ieee80211_rx_stats(fast_rx->dev, skb->len); - - /* The seqno index has the same property as needed - * for the rx_msdu field, i.e. it is IEEE80211_NUM_TIDS - * for non-QoS-data frames. Here we know it's a data - * frame, so count MSDUs. - */ - u64_stats_update_begin(&stats->syncp); - stats->msdu[rx->seqno_idx]++; - stats->bytes += orig_len; - u64_stats_update_end(&stats->syncp); - - if (fast_rx->internal_forward) { - struct sk_buff *xmit_skb = NULL; - if (is_multicast_ether_addr(addrs.da)) { - xmit_skb = skb_copy(skb, GFP_ATOMIC); - } else if (!ether_addr_equal(addrs.da, addrs.sa) && - sta_info_get(rx->sdata, addrs.da)) { - xmit_skb = skb; - skb = NULL; - } - - if (xmit_skb) { - /* - * Send to wireless media and increase priority by 256 - * to keep the received priority instead of - * reclassifying the frame (see cfg80211_classify8021d). 
- */ - xmit_skb->priority += 256; - xmit_skb->protocol = htons(ETH_P_802_3); - skb_reset_network_header(xmit_skb); - skb_reset_mac_header(xmit_skb); - dev_queue_xmit(xmit_skb); - } - - if (!skb) - return true; - } - - /* deliver to local stack */ - skb->protocol = eth_type_trans(skb, fast_rx->dev); - memset(skb->cb, 0, sizeof(skb->cb)); - if (rx->list) - list_add_tail(&skb->list, rx->list); - else - netif_receive_skb(skb); + ieee80211_rx_8023(rx, fast_rx, orig_len); return true; drop: dev_kfree_skb(skb); + if (fast_rx->uses_rss) + stats = this_cpu_ptr(sta->pcpu_rx_stats); + stats->dropped++; return true; } @@ -4519,6 +4538,43 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, return true; } +static void __ieee80211_rx_handle_8023(struct ieee80211_hw *hw, + struct ieee80211_sta *pubsta, + struct sk_buff *skb, + struct list_head *list) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_fast_rx *fast_rx; + struct ieee80211_rx_data rx; + + memset(&rx, 0, sizeof(rx)); + rx.skb = skb; + rx.local = local; + rx.list = list; + + I802_DEBUG_INC(local->dot11ReceivedFragmentCount); + + /* drop frame if too short for header */ + if (skb->len < sizeof(struct ethhdr)) + goto drop; + + if (!pubsta) + goto drop; + + rx.sta = container_of(pubsta, struct sta_info, sta); + rx.sdata = rx.sta->sdata; + + fast_rx = rcu_dereference(rx.sta->fast_rx); + if (!fast_rx) + goto drop; + + ieee80211_rx_8023(&rx, fast_rx, skb->len); + return; + +drop: + dev_kfree_skb(skb); +} + /* * This is the actual Rx frames handler. as it belongs to Rx path it must * be called with rcu_read_lock protection. @@ -4742,22 +4798,28 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, status->rx_flags = 0; + kcov_remote_start_common(skb_get_kcov_handle(skb)); + /* * Frames with failed FCS/PLCP checksum are not returned, * all other frames are returned without radiotap header * if it was previously present. 
* Also, frames with less than 16 bytes are dropped. */ - skb = ieee80211_rx_monitor(local, skb, rate); - if (!skb) - return; - - ieee80211_tpt_led_trig_rx(local, - ((struct ieee80211_hdr *)skb->data)->frame_control, - skb->len); + if (!(status->flag & RX_FLAG_8023)) + skb = ieee80211_rx_monitor(local, skb, rate); + if (skb) { + ieee80211_tpt_led_trig_rx(local, + ((struct ieee80211_hdr *)skb->data)->frame_control, + skb->len); - __ieee80211_rx_handle_packet(hw, pubsta, skb, list); + if (status->flag & RX_FLAG_8023) + __ieee80211_rx_handle_8023(hw, pubsta, skb, list); + else + __ieee80211_rx_handle_packet(hw, pubsta, skb, list); + } + kcov_remote_stop(); return; drop: kfree_skb(skb); diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index ae1cb2c68722..76747bfdaddd 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -133,16 +133,20 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, } if (wide_bw_chansw_ie) { + u8 new_seg1 = wide_bw_chansw_ie->new_center_freq_seg1; struct ieee80211_vht_operation vht_oper = { .chan_width = wide_bw_chansw_ie->new_channel_width, .center_freq_seg0_idx = wide_bw_chansw_ie->new_center_freq_seg0, - .center_freq_seg1_idx = - wide_bw_chansw_ie->new_center_freq_seg1, + .center_freq_seg1_idx = new_seg1, /* .basic_mcs_set doesn't matter */ }; - struct ieee80211_ht_operation ht_oper = {}; + struct ieee80211_ht_operation ht_oper = { + .operation_mode = + cpu_to_le16(new_seg1 << + IEEE80211_HT_OP_MODE_CCFS2_SHIFT), + }; /* default, for the case of IEEE80211_VHT_CHANWIDTH_USE_HT, * to the previously parsed chandef diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 4fe284ff1ea3..ec6973ee88ef 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -705,7 +705,7 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) out_drop_sta: local->num_sta--; synchronize_net(); - __cleanup_single_sta(sta); + cleanup_single_sta(sta); out_err: 
mutex_unlock(&local->sta_mtx); kfree(sinfo); @@ -724,19 +724,13 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) err = sta_info_insert_check(sta); if (err) { + sta_info_free(local, sta); mutex_unlock(&local->sta_mtx); rcu_read_lock(); - goto out_free; + return err; } - err = sta_info_insert_finish(sta); - if (err) - goto out_free; - - return 0; - out_free: - sta_info_free(local, sta); - return err; + return sta_info_insert_finish(sta); } int sta_info_insert(struct sta_info *sta) diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 7afd07636b81..78b9d0c7cc58 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -71,6 +71,7 @@ * until pending frames are delivered * @WLAN_STA_USES_ENCRYPTION: This station was configured for encryption, * so drop all packets without a key later. + * @WLAN_STA_DECAP_OFFLOAD: This station uses rx decap offload * * @NUM_WLAN_STA_FLAGS: number of defined flags */ @@ -102,6 +103,7 @@ enum ieee80211_sta_info_flags { WLAN_STA_MPSP_RECIPIENT, WLAN_STA_PS_DELIVER, WLAN_STA_USES_ENCRYPTION, + WLAN_STA_DECAP_OFFLOAD, NUM_WLAN_STA_FLAGS, }; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 6feb45135020..9baf185ee4c7 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -49,7 +49,8 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, int ac; if (info->flags & (IEEE80211_TX_CTL_NO_PS_BUFFER | - IEEE80211_TX_CTL_AMPDU)) { + IEEE80211_TX_CTL_AMPDU | + IEEE80211_TX_CTL_HW_80211_ENCAP)) { ieee80211_free_txskb(&local->hw, skb); return; } @@ -627,16 +628,12 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie; struct ieee80211_sub_if_data *sdata; struct ieee80211_hdr *hdr = (void *)skb->data; - __be16 ethertype = 0; - - if (skb->len >= ETH_HLEN && skb->protocol == cpu_to_be16(ETH_P_802_3)) - skb_copy_bits(skb, 2 * ETH_ALEN, ðertype, ETH_TLEN); rcu_read_lock(); sdata = 
ieee80211_sdata_from_skb(local, skb); if (sdata) { - if (ethertype == sdata->control_port_protocol || - ethertype == cpu_to_be16(ETH_P_PREAUTH)) + if (skb->protocol == sdata->control_port_protocol || + skb->protocol == cpu_to_be16(ETH_P_PREAUTH)) cfg80211_control_port_tx_status(&sdata->wdev, cookie, skb->data, @@ -915,15 +912,6 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, ieee80211_mpsp_trigger_process( ieee80211_get_qos_ctl(hdr), sta, true, acked); - if (!acked && test_sta_flag(sta, WLAN_STA_PS_STA)) { - /* - * The STA is in power save mode, so assume - * that this TX packet failed because of that. - */ - ieee80211_handle_filtered_frame(local, sta, skb); - return; - } - if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL) && (ieee80211_is_data(hdr->frame_control)) && (rates_idx != -1)) @@ -1150,6 +1138,12 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, -info->status.ack_signal); } } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) { + /* + * The STA is in power save mode, so assume + * that this TX packet failed because of that. 
+ */ + if (skb) + ieee80211_handle_filtered_frame(local, sta, skb); return; } else if (noack_success) { /* nothing to do here, do not account as lost */ diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index e01e4daeb8cd..f91d02b81b92 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1927,7 +1927,7 @@ ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, struct ieee80211_tdls_data *tf = (void *)skb->data; struct wiphy *wiphy = sdata->local->hw.wiphy; - ASSERT_RTNL(); + lockdep_assert_wiphy(wiphy); /* make sure the driver supports it */ if (!(wiphy->features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH)) @@ -1979,7 +1979,7 @@ void ieee80211_tdls_chsw_work(struct work_struct *wk) struct sk_buff *skb; struct ieee80211_tdls_data *tf; - rtnl_lock(); + wiphy_lock(local->hw.wiphy); while ((skb = skb_dequeue(&local->skb_queue_tdls_chsw))) { tf = (struct ieee80211_tdls_data *)skb->data; list_for_each_entry(sdata, &local->interfaces, list) { @@ -1994,7 +1994,7 @@ void ieee80211_tdls_chsw_work(struct work_struct *wk) kfree_skb(skb); } - rtnl_unlock(); + wiphy_unlock(local->hw.wiphy); } void ieee80211_tdls_handle_disconnect(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 89723907a094..8fcc39056402 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH -* Copyright (C) 2018 - 2019 Intel Corporation +* Copyright (C) 2018 - 2020 Intel Corporation */ #if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) @@ -2086,6 +2086,27 @@ TRACE_EVENT(api_connection_loss, ) ); +TRACE_EVENT(api_disconnect, + TP_PROTO(struct ieee80211_sub_if_data *sdata, bool reconnect), + + TP_ARGS(sdata, reconnect), + + TP_STRUCT__entry( + VIF_ENTRY + __field(int, reconnect) + ), + + TP_fast_assign( + VIF_ASSIGN; + __entry->reconnect = reconnect; + ), + + TP_printk( + VIF_PR_FMT " reconnect:%d", + VIF_PR_ARG, 
__entry->reconnect + ) +); + TRACE_EVENT(api_cqm_rssi_notify, TP_PROTO(struct ieee80211_sub_if_data *sdata, enum nl80211_cqm_rssi_threshold_event rssi_event, @@ -2740,7 +2761,7 @@ DEFINE_EVENT(local_sdata_addr_evt, drv_update_vif_offload, TP_ARGS(local, sdata) ); -TRACE_EVENT(drv_sta_set_4addr, +DECLARE_EVENT_CLASS(sta_flag_evt, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, bool enabled), @@ -2767,6 +2788,22 @@ TRACE_EVENT(drv_sta_set_4addr, ) ); +DEFINE_EVENT(sta_flag_evt, drv_sta_set_4addr, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, bool enabled), + + TP_ARGS(local, sdata, sta, enabled) +); + +DEFINE_EVENT(sta_flag_evt, drv_sta_set_decap_offload, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, bool enabled), + + TP_ARGS(local, sdata, sta, enabled) +); + #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 56a4d0d20a26..5d06de61047a 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -38,16 +38,6 @@ /* misc utils */ -static inline void ieee80211_tx_stats(struct net_device *dev, u32 len) -{ - struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); - - u64_stats_update_begin(&tstats->syncp); - tstats->tx_packets++; - tstats->tx_bytes += len; - u64_stats_update_end(&tstats->syncp); -} - static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, struct sk_buff *skb, int group_addr, int next_frag_len) @@ -319,9 +309,6 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx) if (tx->sdata->vif.type == NL80211_IFTYPE_OCB) return TX_CONTINUE; - if (tx->sdata->vif.type == NL80211_IFTYPE_WDS) - return TX_CONTINUE; - if (tx->flags & IEEE80211_TX_PS_BUFFERED) return TX_CONTINUE; @@ -662,7 +649,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) if (!skip_hw && tx->key && 
tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) info->control.hw_key = &tx->key->conf; - } else if (!ieee80211_is_mgmt(hdr->frame_control) && tx->sta && + } else if (ieee80211_is_data_present(hdr->frame_control) && tx->sta && test_sta_flag(tx->sta, WLAN_STA_USES_ENCRYPTION)) { return TX_DROP; } @@ -1195,9 +1182,7 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, tx->sta = rcu_dereference(sdata->u.vlan.sta); if (!tx->sta && sdata->wdev.use_4addr) return TX_DROP; - } else if (info->flags & (IEEE80211_TX_INTFL_NL80211_FRAME_TX | - IEEE80211_TX_CTL_INJECTED) || - tx->sdata->control_port_protocol == tx->skb->protocol) { + } else if (tx->sdata->control_port_protocol == tx->skb->protocol) { tx->sta = sta_info_get_bss(sdata, hdr->addr1); } if (!tx->sta && !is_multicast_ether_addr(hdr->addr1)) @@ -1322,7 +1307,7 @@ static struct sk_buff *codel_dequeue_func(struct codel_vars *cvars, fq = &local->fq; if (cvars == &txqi->def_cvars) - flow = &txqi->def_flow; + flow = &txqi->tin.default_flow; else flow = &fq->flows[cvars - local->cvars]; @@ -1365,7 +1350,7 @@ static struct sk_buff *fq_tin_dequeue_func(struct fq *fq, cparams = &local->cparams; } - if (flow == &txqi->def_flow) + if (flow == &tin->default_flow) cvars = &txqi->def_cvars; else cvars = &local->cvars[flow - fq->flows]; @@ -1392,17 +1377,6 @@ static void fq_skb_free_func(struct fq *fq, ieee80211_free_txskb(&local->hw, skb); } -static struct fq_flow *fq_flow_get_default_func(struct fq *fq, - struct fq_tin *tin, - int idx, - struct sk_buff *skb) -{ - struct txq_info *txqi; - - txqi = container_of(tin, struct txq_info, tin); - return &txqi->def_flow; -} - static void ieee80211_txq_enqueue(struct ieee80211_local *local, struct txq_info *txqi, struct sk_buff *skb) @@ -1415,8 +1389,7 @@ static void ieee80211_txq_enqueue(struct ieee80211_local *local, spin_lock_bh(&fq->lock); fq_tin_enqueue(fq, tin, flow_idx, skb, - fq_skb_free_func, - fq_flow_get_default_func); + fq_skb_free_func); spin_unlock_bh(&fq->lock); } @@ 
-1459,7 +1432,6 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, struct txq_info *txqi, int tid) { fq_tin_init(&txqi->tin); - fq_flow_init(&txqi->def_flow); codel_vars_init(&txqi->def_cvars); codel_stats_init(&txqi->cstats); __skb_queue_head_init(&txqi->frags); @@ -2113,6 +2085,9 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, info->flags |= IEEE80211_TX_CTL_NO_ACK; if (txflags & IEEE80211_RADIOTAP_F_TX_NOSEQNO) info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; + if (txflags & IEEE80211_RADIOTAP_F_TX_ORDER) + info->control.flags |= + IEEE80211_TX_CTRL_DONT_REORDER; break; case IEEE80211_RADIOTAP_RATE: @@ -2143,6 +2118,19 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_BW && mcs_bw == IEEE80211_RADIOTAP_MCS_BW_40) rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; + + if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_FEC && + mcs_flags & IEEE80211_RADIOTAP_MCS_FEC_LDPC) + info->flags |= IEEE80211_TX_CTL_LDPC; + + if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_STBC) { + u8 stbc = u8_get_bits(mcs_flags, + IEEE80211_RADIOTAP_MCS_STBC_MASK); + + info->flags |= + u32_encode_bits(stbc, + IEEE80211_TX_CTL_STBC); + } break; case IEEE80211_RADIOTAP_VHT: @@ -2279,11 +2267,13 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, payload[7]); } - /* - * Initialize skb->priority for QoS frames. This is put in the TID field - * of the frame before passing it to the driver. + /* Initialize skb->priority for QoS frames. If the DONT_REORDER flag + * is set, stick to the default value for skb->priority to assure + * frames injected with this flag are not reordered relative to each + * other. 
*/ - if (ieee80211_is_data_qos(hdr->frame_control)) { + if (ieee80211_is_data_qos(hdr->frame_control) && + !(info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER)) { u8 *p = ieee80211_get_qos_ctl(hdr); skb->priority = *p & IEEE80211_QOS_CTL_TAG1D_MASK; } @@ -2295,8 +2285,7 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, * we handle as though they are non-injected frames. * This code here isn't entirely correct, the local MAC address * isn't always enough to find the interface to use; for proper - * VLAN/WDS support we will need a different mechanism (which - * likely isn't going to be monitor interfaces). + * VLAN support we have an nl80211-based mechanism. * * This is necessary, for example, for old hostapd versions that * don't use nl80211-based management TX/RX. @@ -2307,8 +2296,7 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, if (!ieee80211_sdata_running(tmp_sdata)) continue; if (tmp_sdata->vif.type == NL80211_IFTYPE_MONITOR || - tmp_sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - tmp_sdata->vif.type == NL80211_IFTYPE_WDS) + tmp_sdata->vif.type == NL80211_IFTYPE_AP_VLAN) continue; if (ether_addr_equal(tmp_sdata->vif.addr, hdr->addr2)) { sdata = tmp_sdata; @@ -2402,9 +2390,6 @@ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, } sta = sta_info_get_bss(sdata, skb->data); break; - case NL80211_IFTYPE_WDS: - sta = sta_info_get(sdata, sdata->u.wds.remote_addr); - break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: /* determined much later */ @@ -2580,20 +2565,6 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, hdrlen = 24; band = chanctx_conf->def.chan->band; break; - case NL80211_IFTYPE_WDS: - fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); - /* RA TA DA SA */ - memcpy(hdr.addr1, sdata->u.wds.remote_addr, ETH_ALEN); - memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); - memcpy(hdr.addr3, skb->data, ETH_ALEN); - memcpy(hdr.addr4, skb->data + ETH_ALEN, 
ETH_ALEN); - hdrlen = 30; - /* - * This is the exception! WDS style interfaces are prohibited - * when channel contexts are in used so this must be valid - */ - band = local->hw.conf.chandef.chan->band; - break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: if (!is_multicast_ether_addr(skb->data)) { @@ -3310,8 +3281,7 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, */ tin = &txqi->tin; - flow = fq_flow_classify(fq, tin, flow_idx, skb, - fq_flow_get_default_func); + flow = fq_flow_classify(fq, tin, flow_idx, skb); head = skb_peek_tail(&flow->queue); if (!head || skb_is_gso(head)) goto out; @@ -3378,8 +3348,6 @@ out_recalc: if (head->len != orig_len) { flow->backlog += head->len - orig_len; tin->backlog_bytes += head->len - orig_len; - - fq_recalc_backlog(fq, tin, flow); } out: spin_unlock_bh(&fq->lock); @@ -3403,7 +3371,7 @@ static void ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata, if (key) info->control.hw_key = &key->conf; - ieee80211_tx_stats(skb->dev, skb->len); + dev_sw_netstats_tx_add(skb->dev, 1, skb->len); if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; @@ -3836,7 +3804,7 @@ void __ieee80211_schedule_txq(struct ieee80211_hw *hw, * get immediately moved to the back of the list on the next * call to ieee80211_next_txq(). 
*/ - if (txqi->txq.sta && + if (txqi->txq.sta && local->airtime_flags && wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) list_add(&txqi->schedule_order, @@ -3850,6 +3818,8 @@ void __ieee80211_schedule_txq(struct ieee80211_hw *hw, } EXPORT_SYMBOL(__ieee80211_schedule_txq); +DEFINE_STATIC_KEY_FALSE(aql_disable); + bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { @@ -3859,6 +3829,9 @@ bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) return true; + if (static_branch_unlikely(&aql_disable)) + return true; + if (!txq->sta) return true; @@ -4021,7 +3994,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, goto out; } - ieee80211_tx_stats(dev, skb->len); + dev_sw_netstats_tx_add(dev, 1, skb->len); ieee80211_xmit(sdata, sta, skb); } @@ -4248,7 +4221,7 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; - ieee80211_tx_stats(dev, skb->len); + dev_sw_netstats_tx_add(dev, 1, skb->len); sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; sta->tx_stats.packets[skb_get_queue_mapping(skb)]++; @@ -4278,7 +4251,6 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, struct ethhdr *ehdr = (struct ethhdr *)skb->data; struct ieee80211_key *key; struct sta_info *sta; - bool offload = true; if (unlikely(skb->len < ETH_HLEN)) { kfree_skb(skb); @@ -4294,18 +4266,22 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, if (unlikely(IS_ERR_OR_NULL(sta) || !sta->uploaded || !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || - sdata->control_port_protocol == ehdr->h_proto)) - offload = false; - else if ((key = rcu_dereference(sta->ptk[sta->ptk_idx])) && - (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) || - key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)) - offload = false; - - if (offload) - ieee80211_8023_xmit(sdata, dev, sta, 
key, skb); - else - ieee80211_subif_start_xmit(skb, dev); + sdata->control_port_protocol == ehdr->h_proto)) + goto skip_offload; + + key = rcu_dereference(sta->ptk[sta->ptk_idx]); + if (!key) + key = rcu_dereference(sdata->default_unicast_key); + if (key && (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) || + key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)) + goto skip_offload; + + ieee80211_8023_xmit(sdata, dev, sta, key, skb); + goto out; + +skip_offload: + ieee80211_subif_start_xmit(skb, dev); out: rcu_read_unlock(); @@ -4418,9 +4394,10 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local, /* * Transmit all pending packets. Called from tasklet. */ -void ieee80211_tx_pending(unsigned long data) +void ieee80211_tx_pending(struct tasklet_struct *t) { - struct ieee80211_local *local = (struct ieee80211_local *)data; + struct ieee80211_local *local = from_tasklet(local, t, + tx_pending_tasklet); unsigned long flags; int i; bool txok; @@ -5434,6 +5411,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; + struct sta_info *sta; struct sk_buff *skb; struct ethhdr *ehdr; u32 ctrl_flags = 0; @@ -5456,8 +5434,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, if (cookie) ctrl_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; - flags |= IEEE80211_TX_INTFL_NL80211_FRAME_TX | - IEEE80211_TX_CTL_INJECTED; + flags |= IEEE80211_TX_INTFL_NL80211_FRAME_TX; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(struct ethhdr) + len); @@ -5474,10 +5451,25 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, ehdr->h_proto = proto; skb->dev = dev; - skb->protocol = htons(ETH_P_802_3); + skb->protocol = proto; skb_reset_network_header(skb); skb_reset_mac_header(skb); + /* update QoS header to prioritize control port frames if possible, + * priorization also happens for control port 
frames send over + * AF_PACKET + */ + rcu_read_lock(); + + if (ieee80211_lookup_ra_sta(sdata, skb, &sta) == 0 && !IS_ERR(sta)) { + u16 queue = __ieee80211_select_queue(sdata, sta, skb); + + skb_set_queue_mapping(skb, queue); + skb_get_hash(skb); + } + + rcu_read_unlock(); + /* mutex lock is only needed for incrementing the cookie counter */ mutex_lock(&local->mtx); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 49342060490f..f080fcf60e45 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -386,9 +386,10 @@ _ieee80211_wake_txqs(struct ieee80211_local *local, unsigned long *flags) rcu_read_unlock(); } -void ieee80211_wake_txqs(unsigned long data) +void ieee80211_wake_txqs(struct tasklet_struct *t) { - struct ieee80211_local *local = (struct ieee80211_local *)data; + struct ieee80211_local *local = from_tasklet(local, t, + wake_txqs_tasklet); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); @@ -831,7 +832,7 @@ void ieee80211_iterate_active_interfaces_atomic( } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic); -void ieee80211_iterate_active_interfaces_rtnl( +void ieee80211_iterate_active_interfaces_mtx( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), @@ -839,12 +840,12 @@ void ieee80211_iterate_active_interfaces_rtnl( { struct ieee80211_local *local = hw_to_local(hw); - ASSERT_RTNL(); + lockdep_assert_wiphy(hw->wiphy); __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, iterator, data); } -EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_rtnl); +EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_mtx); static void __iterate_stations(struct ieee80211_local *local, void (*iterator)(void *data, @@ -2513,7 +2514,6 @@ int ieee80211_reconfig(struct ieee80211_local *local) return res; } break; - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_DEVICE: @@ -2523,6 +2523,7 
@@ int ieee80211_reconfig(struct ieee80211_local *local) case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_WDS: WARN_ON(1); break; } @@ -2594,7 +2595,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) mutex_unlock(&local->mtx); if (sched_scan_stopped) - cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy, 0); + cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0); wake_up: @@ -3455,7 +3456,7 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, *chandef = he_chandef; - return false; + return true; } bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, @@ -3665,6 +3666,7 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, u64 ts = status->mactime; struct rate_info ri; u16 rate; + u8 n_ltf; if (WARN_ON(!ieee80211_have_rx_timestamp(status))) return 0; @@ -3675,11 +3677,58 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, /* Fill cfg80211 rate info */ switch (status->encoding) { + case RX_ENC_HE: + ri.flags |= RATE_INFO_FLAGS_HE_MCS; + ri.mcs = status->rate_idx; + ri.nss = status->nss; + ri.he_ru_alloc = status->he_ru; + if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) + ri.flags |= RATE_INFO_FLAGS_SHORT_GI; + + /* + * See P802.11ax_D6.0, section 27.3.4 for + * VHT PPDU format. + */ + if (status->flag & RX_FLAG_MACTIME_PLCP_START) { + mpdu_offset += 2; + ts += 36; + + /* + * TODO: + * For HE MU PPDU, add the HE-SIG-B. + * For HE ER PPDU, add 8us for the HE-SIG-A. + * For HE TB PPDU, add 4us for the HE-STF. + * Add the HE-LTF durations - variable. + */ + } + + break; case RX_ENC_HT: ri.mcs = status->rate_idx; ri.flags |= RATE_INFO_FLAGS_MCS; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; + + /* + * See P802.11REVmd_D3.0, section 19.3.2 for + * HT PPDU format. 
+ */ + if (status->flag & RX_FLAG_MACTIME_PLCP_START) { + mpdu_offset += 2; + if (status->enc_flags & RX_ENC_FLAG_HT_GF) + ts += 24; + else + ts += 32; + + /* + * Add Data HT-LTFs per streams + * TODO: add Extension HT-LTFs, 4us per LTF + */ + n_ltf = ((ri.mcs >> 3) & 3) + 1; + n_ltf = n_ltf == 3 ? 4 : n_ltf; + ts += n_ltf * 4; + } + break; case RX_ENC_VHT: ri.flags |= RATE_INFO_FLAGS_VHT_MCS; @@ -3687,6 +3736,23 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, ri.nss = status->nss; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; + + /* + * See P802.11REVmd_D3.0, section 21.3.2 for + * VHT PPDU format. + */ + if (status->flag & RX_FLAG_MACTIME_PLCP_START) { + mpdu_offset += 2; + ts += 36; + + /* + * Add VHT-LTFs per streams + */ + n_ltf = (ri.nss != 1) && (ri.nss % 2) ? + ri.nss + 1 : ri.nss; + ts += 4 * n_ltf; + } + break; default: WARN_ON(1); @@ -3710,7 +3776,6 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, ri.legacy = DIV_ROUND_UP(bitrate, (1 << shift)); if (status->flag & RX_FLAG_MACTIME_PLCP_START) { - /* TODO: handle HT/VHT preambles */ if (status->band == NL80211_BAND_5GHZ) { ts += 20 << shift; mpdu_offset += 2; @@ -3746,7 +3811,7 @@ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local) struct cfg80211_chan_def chandef; /* for interface list, to avoid linking iflist_mtx and chanctx_mtx */ - ASSERT_RTNL(); + lockdep_assert_wiphy(local->hw.wiphy); mutex_lock(&local->mtx); list_for_each_entry(sdata, &local->interfaces, list) { @@ -3786,9 +3851,9 @@ void ieee80211_dfs_radar_detected_work(struct work_struct *work) } mutex_unlock(&local->chanctx_mtx); - rtnl_lock(); + wiphy_lock(local->hw.wiphy); ieee80211_dfs_cac_cancel(local); - rtnl_unlock(); + wiphy_unlock(local->hw.wiphy); if (num_chanctx > 1) /* XXX: multi-channel is not supported yet */ diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index fb0e3a657d2d..e856f9092137 100644 --- a/net/mac80211/vht.c +++ 
b/net/mac80211/vht.c @@ -465,12 +465,18 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta) * IEEE80211-2016 specification makes higher bandwidth operation * possible on the TDLS link if the peers have wider bandwidth * capability. + * + * However, in this case, and only if the TDLS peer is authorized, + * limit to the tdls_chandef so that the configuration here isn't + * wider than what's actually requested on the channel context. */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && - test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW)) - return bw; - - bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width)); + test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW) && + test_sta_flag(sta, WLAN_STA_AUTHORIZED) && + sta->tdls_chandef.chan) + bw = min(bw, ieee80211_chan_width_to_rx_bw(sta->tdls_chandef.width)); + else + bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width)); return bw; } @@ -478,6 +484,7 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta) void ieee80211_sta_set_rx_nss(struct sta_info *sta) { u8 ht_rx_nss = 0, vht_rx_nss = 0, he_rx_nss = 0, rx_nss; + bool support_160; /* if we received a notification already don't overwrite it */ if (sta->sta.rx_nss) @@ -508,7 +515,13 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta) } } - he_rx_nss = min(rx_mcs_80, rx_mcs_160); + support_160 = he_cap->he_cap_elem.phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; + + if (support_160) + he_rx_nss = min(rx_mcs_80, rx_mcs_160); + else + he_rx_nss = rx_mcs_80; } if (sta->sta.ht_cap.ht_supported) { diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index 2fb99325135a..9ea6004abe1b 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -118,9 +118,11 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr) { struct ieee80211_local *local = sdata->local; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u8 *p; - if (local->hw.queues < IEEE80211_NUM_ACS) + 
if ((info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER) || + local->hw.queues < IEEE80211_NUM_ACS) return 0; if (!ieee80211_is_data(hdr->frame_control)) { @@ -141,6 +143,7 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct mac80211_qos_map *qos_map; bool qos; @@ -153,7 +156,7 @@ u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, else qos = false; - if (!qos) { + if (!qos || (info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER)) { skb->priority = 0; /* required for correct WPA/11i MIC */ return IEEE80211_AC_BE; } @@ -202,9 +205,6 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_AP: ra = skb->data; break; - case NL80211_IFTYPE_WDS: - ra = sdata->u.wds.remote_addr; - break; case NL80211_IFTYPE_STATION: /* might be a TDLS station */ sta = sta_info_get(sdata, skb->data); @@ -249,6 +249,14 @@ void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata, p = ieee80211_get_qos_ctl(hdr); + /* don't overwrite the QoS field of injected frames */ + if (info->flags & IEEE80211_TX_CTL_INJECTED) { + /* do take into account Ack policy of injected frames */ + if (*p & IEEE80211_QOS_CTL_ACK_POLICY_NOACK) + info->flags |= IEEE80211_TX_CTL_NO_ACK; + return; + } + /* set up the first byte */ /* diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 06ea0f8bfd5c..520cedc594e1 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -20,9 +20,9 @@ #include "ieee802154_i.h" #include "cfg.h" -static void ieee802154_tasklet_handler(unsigned long data) +static void ieee802154_tasklet_handler(struct tasklet_struct *t) { - struct ieee802154_local *local = (struct ieee802154_local *)data; + struct ieee802154_local *local = from_tasklet(local, t, tasklet); struct sk_buff *skb; while ((skb = skb_dequeue(&local->skb_queue))) { @@ 
-91,9 +91,7 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) INIT_LIST_HEAD(&local->interfaces); mutex_init(&local->iflist_mtx); - tasklet_init(&local->tasklet, - ieee802154_tasklet_handler, - (unsigned long)local); + tasklet_setup(&local->tasklet, ieee802154_tasklet_handler); skb_queue_head_init(&local->skb_queue); diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index f2868a8a50c3..47bab701555f 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -377,6 +377,8 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, if (!pskb_may_pull(skb, sizeof(*hdr))) goto err; + skb_dst_drop(skb); + /* Read and decode the label */ hdr = mpls_hdr(skb); dec = mpls_entry_decode(hdr); diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index 05d398d3fde4..b472dc149856 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -21,7 +21,7 @@ */ #include <linux/kernel.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <asm/unaligned.h> #include "protocol.h" diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 54b888f94009..96ba616f59bf 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -18,6 +18,7 @@ struct mptcp_pernet { struct ctl_table_header *ctl_table_hdr; int mptcp_enabled; + unsigned int add_addr_timeout; }; static struct mptcp_pernet *mptcp_get_pernet(struct net *net) @@ -30,6 +31,11 @@ int mptcp_is_enabled(struct net *net) return mptcp_get_pernet(net)->mptcp_enabled; } +unsigned int mptcp_get_add_addr_timeout(struct net *net) +{ + return mptcp_get_pernet(net)->add_addr_timeout; +} + static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", @@ -40,12 +46,19 @@ static struct ctl_table mptcp_sysctl_table[] = { */ .proc_handler = proc_dointvec, }, + { + .procname = "add_addr_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, {} }; static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; + 
pernet->add_addr_timeout = TCP_RTO_MAX; } static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) @@ -61,6 +74,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) } table[0].data = &pernet->mptcp_enabled; + table[1].data = &pernet->add_addr_timeout; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 84d119436b22..3780c29c321d 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -29,8 +29,16 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("DuplicateData", MPTCP_MIB_DUPDATA), SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR), SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD), + SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD), + SNMP_MIB_ITEM("MPJoinPortSynRx", MPTCP_MIB_JOINPORTSYNRX), + SNMP_MIB_ITEM("MPJoinPortSynAckRx", MPTCP_MIB_JOINPORTSYNACKRX), + SNMP_MIB_ITEM("MPJoinPortAckRx", MPTCP_MIB_JOINPORTACKRX), + SNMP_MIB_ITEM("MismatchPortSynRx", MPTCP_MIB_MISMATCHPORTSYNRX), + SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX), SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR), SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW), + SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX), + SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX), SNMP_MIB_SENTINEL }; @@ -67,6 +75,7 @@ void mptcp_seq_show(struct seq_file *seq) for (i = 0; mptcp_snmp_list[i].name; i++) seq_puts(seq, " 0"); + seq_putc(seq, '\n'); return; } diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 47bcecce1106..72afbc135f8e 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -22,8 +22,16 @@ enum linux_mptcp_mib_field { MPTCP_MIB_DUPDATA, /* Segments discarded due to duplicate DSS */ MPTCP_MIB_ADDADDR, /* Received ADD_ADDR with echo-flag=0 */ MPTCP_MIB_ECHOADD, /* Received ADD_ADDR with echo-flag=1 */ + MPTCP_MIB_PORTADD, /* Received ADD_ADDR with a port-number */ + MPTCP_MIB_JOINPORTSYNRX, /* Received a SYN MP_JOIN with a different port-number */ + MPTCP_MIB_JOINPORTSYNACKRX, /* 
Received a SYNACK MP_JOIN with a different port-number */ + MPTCP_MIB_JOINPORTACKRX, /* Received an ACK MP_JOIN with a different port-number */ + MPTCP_MIB_MISMATCHPORTSYNRX, /* Received a SYN MP_JOIN with a mismatched port-number */ + MPTCP_MIB_MISMATCHPORTACKRX, /* Received an ACK MP_JOIN with a mismatched port-number */ MPTCP_MIB_RMADDR, /* Received RM_ADDR */ MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */ + MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */ + MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */ __MPTCP_MIB_MAX }; diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index 5f390a97f556..f16d9b5ee978 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -128,11 +128,13 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); - info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max); - val = READ_ONCE(msk->pm.add_addr_signal_max); + info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); + info->mptcpi_subflows_max = mptcp_pm_get_subflows_max(msk); + val = mptcp_pm_get_add_addr_signal_max(msk); info->mptcpi_add_addr_signal_max = val; - val = READ_ONCE(msk->pm.add_addr_accept_max); + val = mptcp_pm_get_add_addr_accept_max(msk); info->mptcpi_add_addr_accepted_max = val; + info->mptcpi_local_addr_max = mptcp_pm_get_local_addr_max(msk); if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) flags |= MPTCP_INFO_FLAG_FALLBACK; if (READ_ONCE(msk->can_ack)) @@ -140,7 +142,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, info->mptcpi_flags = flags; info->mptcpi_token = READ_ONCE(msk->token); info->mptcpi_write_seq = READ_ONCE(msk->write_seq); - info->mptcpi_snd_una = atomic64_read(&msk->snd_una); + info->mptcpi_snd_una = READ_ONCE(msk->snd_una); info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); 
unlock_sock_fast(sk, slow); } diff --git a/net/mptcp/options.c b/net/mptcp/options.c index a044dd43411d..444a38681e93 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -7,7 +7,7 @@ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <net/tcp.h> #include <net/mptcp.h> #include "protocol.h" @@ -242,7 +242,6 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->add_addr = 1; mp_opt->addr_id = *ptr++; - pr_debug("ADD_ADDR: id=%d, echo=%d", mp_opt->addr_id, mp_opt->echo); if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4); ptr += 4; @@ -267,6 +266,9 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->ahmac = get_unaligned_be64(ptr); ptr += 8; } + pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d", + (mp_opt->family == MPTCP_ADDR_IPVERSION_6) ? "6" : "", + mp_opt->addr_id, mp_opt->ahmac, mp_opt->echo, mp_opt->port); break; case MPTCPOPT_RM_ADDR: @@ -280,6 +282,25 @@ static void mptcp_parse_option(const struct sk_buff *skb, pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); break; + case MPTCPOPT_MP_PRIO: + if (opsize != TCPOLEN_MPTCP_PRIO) + break; + + mp_opt->mp_prio = 1; + mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP; + pr_debug("MP_PRIO: prio=%d", mp_opt->backup); + break; + + case MPTCPOPT_MP_FASTCLOSE: + if (opsize != TCPOLEN_MPTCP_FASTCLOSE) + break; + + ptr += 2; + mp_opt->rcvr_key = get_unaligned_be64(ptr); + ptr += 8; + mp_opt->fastclose = 1; + break; + default: break; } @@ -297,9 +318,11 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->mp_join = 0; mp_opt->add_addr = 0; mp_opt->ahmac = 0; + mp_opt->fastclose = 0; mp_opt->port = 0; mp_opt->rm_addr = 0; mp_opt->dss = 0; + mp_opt->mp_prio = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); @@ -388,6 +411,7 @@ static void clear_3rdack_retransmission(struct sock *sk) } static bool 
mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, + bool snd_data_fin_enable, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@ -405,9 +429,10 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, if (!skb) return false; - /* MPC/MPJ needed only on 3rd ack packet */ - if (subflow->fully_established || - subflow->snd_isn != TCP_SKB_CB(skb)->seq) + /* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */ + if (subflow->fully_established || snd_data_fin_enable || + subflow->snd_isn != TCP_SKB_CB(skb)->seq || + sk->sk_state != TCP_ESTABLISHED) return false; if (subflow->mp_capable) { @@ -479,6 +504,7 @@ static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, } static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, + bool snd_data_fin_enable, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@ -486,13 +512,12 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int dss_size = 0; - u64 snd_data_fin_enable; struct mptcp_ext *mpext; unsigned int ack_size; bool ret = false; + u64 ack_seq; mpext = skb ? 
mptcp_get_ext(skb) : NULL; - snd_data_fin_enable = READ_ONCE(msk->snd_data_fin_enable); if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { unsigned int map_size; @@ -518,16 +543,18 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, return ret; } + ack_seq = READ_ONCE(msk->ack_seq); if (READ_ONCE(msk->use_64bit_ack)) { ack_size = TCPOLEN_MPTCP_DSS_ACK64; - opts->ext_copy.data_ack = READ_ONCE(msk->ack_seq); + opts->ext_copy.data_ack = ack_seq; opts->ext_copy.ack64 = 1; } else { ack_size = TCPOLEN_MPTCP_DSS_ACK32; - opts->ext_copy.data_ack32 = (uint32_t)READ_ONCE(msk->ack_seq); + opts->ext_copy.data_ack32 = (uint32_t)ack_seq; opts->ext_copy.ack64 = 0; } opts->ext_copy.use_ack = 1; + WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk)); /* Add kind/length/subtype/flag overhead if mapping is not populated */ if (dss_size == 0) @@ -573,27 +600,45 @@ static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, } #endif -static bool mptcp_established_options_add_addr(struct sock *sk, +static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); + bool drop_other_suboptions = false; + unsigned int opt_size = *size; struct mptcp_addr_info saddr; bool echo; + bool port; int len; + if ((mptcp_pm_should_add_signal_ipv6(msk) || + mptcp_pm_should_add_signal_port(msk)) && + skb && skb_is_tcp_pure_ack(skb)) { + pr_debug("drop other suboptions"); + opts->suboptions = 0; + opts->ext_copy.use_ack = 0; + opts->ext_copy.use_map = 0; + remaining += opt_size; + drop_other_suboptions = true; + } + if (!mptcp_pm_should_add_signal(msk) || - !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo))) + !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo, &port))) return false; - len = mptcp_add_addr_len(saddr.family, 
echo); + len = mptcp_add_addr_len(saddr.family, echo, port); if (remaining < len) return false; *size = len; + if (drop_other_suboptions) + *size -= opt_size; opts->addr_id = saddr.id; + if (port) + opts->port = ntohs(saddr.port); if (saddr.family == AF_INET) { opts->suboptions |= OPTION_MPTCP_ADD_ADDR; opts->addr = saddr.addr; @@ -616,7 +661,8 @@ static bool mptcp_established_options_add_addr(struct sock *sk, } } #endif - pr_debug("addr_id=%d, ahmac=%llu, echo=%d", opts->addr_id, opts->ahmac, echo); + pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d", + opts->addr_id, opts->ahmac, echo, opts->port); return true; } @@ -646,16 +692,42 @@ static bool mptcp_established_options_rm_addr(struct sock *sk, return true; } +static bool mptcp_established_options_mp_prio(struct sock *sk, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + if (!subflow->send_mp_prio) + return false; + + /* account for the trailing 'nop' option */ + if (remaining < TCPOLEN_MPTCP_PRIO_ALIGN) + return false; + + *size = TCPOLEN_MPTCP_PRIO_ALIGN; + opts->suboptions |= OPTION_MPTCP_PRIO; + opts->backup = subflow->request_bkup; + + pr_debug("prio=%d", opts->backup); + + return true; +} + bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int opt_size = 0; + bool snd_data_fin; bool ret = false; opts->suboptions = 0; - if (unlikely(mptcp_check_fallback(sk))) + if (unlikely(__mptcp_check_fallback(msk))) return false; /* prevent adding of any MPTCP related options on reset packet @@ -664,10 +736,10 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) return false; - if (mptcp_established_options_mp(sk, 
skb, &opt_size, remaining, opts)) + snd_data_fin = mptcp_data_fin_enabled(msk); + if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts)) ret = true; - else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, - opts)) + else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts)) ret = true; /* we reserved enough space for the above options, and exceeding the @@ -678,7 +750,7 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, *size += opt_size; remaining -= opt_size; - if (mptcp_established_options_add_addr(sk, &opt_size, remaining, opts)) { + if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; @@ -688,6 +760,12 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, ret = true; } + if (mptcp_established_options_mp_prio(sk, &opt_size, remaining, opts)) { + *size += opt_size; + remaining -= opt_size; + ret = true; + } + return ret; } @@ -759,6 +837,11 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, goto fully_established; } + if (mp_opt->add_addr) { + WRITE_ONCE(msk->fully_established, true); + return true; + } + /* If the first established packet does not contain MP_CAPABLE + data * then fallback to TCP. Fallback scenarios requires a reset for * MP_JOIN subflows. 
@@ -777,7 +860,12 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, mptcp_subflow_fully_established(subflow, mp_opt); fully_established: - if (likely(subflow->pm_notified)) + /* if the subflow is not already linked into the conn_list, we can't + * notify the PM: this subflow is still on the listener queue + * and the PM possibly acquiring the subflow lock could race with + * the listener close + */ + if (likely(subflow->pm_notified) || list_empty(&subflow->node)) return true; subflow->pm_notified = 1; @@ -785,7 +873,7 @@ fully_established: clear_3rdack_retransmission(ssk); mptcp_pm_subflow_established(msk, subflow); } else { - mptcp_pm_fully_established(msk); + mptcp_pm_fully_established(msk, ssk, GFP_ATOMIC); } return true; @@ -809,31 +897,41 @@ static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) return cur_ack; } -static void update_una(struct mptcp_sock *msk, - struct mptcp_options_received *mp_opt) +static void ack_update_msk(struct mptcp_sock *msk, + struct sock *ssk, + struct mptcp_options_received *mp_opt) { - u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una); - u64 write_seq = READ_ONCE(msk->write_seq); + u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt); + struct sock *sk = (struct sock *)msk; + u64 old_snd_una; + + mptcp_data_lock(sk); /* avoid ack expansion on update conflict, to reduce the risk of * wrongly expanding to a future ack sequence number, which is way * more dangerous than missing an ack */ + old_snd_una = msk->snd_una; new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); /* ACK for data not even sent yet? Ignore. 
*/ - if (after64(new_snd_una, write_seq)) + if (after64(new_snd_una, snd_nxt)) new_snd_una = old_snd_una; - while (after64(new_snd_una, old_snd_una)) { - snd_una = old_snd_una; - old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una, - new_snd_una); - if (old_snd_una == snd_una) { - mptcp_data_acked((struct sock *)msk); - break; - } + new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; + + if (after64(new_wnd_end, msk->wnd_end)) + msk->wnd_end = new_wnd_end; + + /* this assumes mptcp_incoming_options() is invoked after tcp_ack() */ + if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt))) + __mptcp_check_push(sk, ssk); + + if (after64(new_snd_una, old_snd_una)) { + msk->snd_una = new_snd_una; + __mptcp_data_acked(sk); } + mptcp_data_unlock(sk); } bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit) @@ -886,13 +984,30 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) struct mptcp_options_received mp_opt; struct mptcp_ext *mpext; - if (__mptcp_check_fallback(msk)) + if (__mptcp_check_fallback(msk)) { + /* Keep it simple and unconditionally trigger send data cleanup and + * pending queue spooling. We will need to acquire the data lock + * for more accurate checks, and once the lock is acquired, such + * helpers are cheap. 
+ */ + mptcp_data_lock(subflow->conn); + if (sk_stream_memory_free(sk)) + __mptcp_check_push(subflow->conn, sk); + __mptcp_data_acked(subflow->conn); + mptcp_data_unlock(subflow->conn); return; + } mptcp_get_options(skb, &mp_opt); if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return; + if (mp_opt.fastclose && + msk->local_key == mp_opt.rcvr_key) { + WRITE_ONCE(msk->rcv_fastclose, true); + mptcp_schedule_work((struct sock *)msk); + } + if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) { struct mptcp_addr_info addr; @@ -915,6 +1030,10 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) mptcp_pm_del_add_timer(msk, &addr); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD); } + + if (mp_opt.port) + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD); + mp_opt.add_addr = 0; } @@ -923,6 +1042,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) mp_opt.rm_addr = 0; } + if (mp_opt.mp_prio) { + mptcp_pm_mp_prio_received(sk, mp_opt.backup); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX); + mp_opt.mp_prio = 0; + } + if (!mp_opt.dss) return; @@ -930,7 +1055,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) * monodirectional flows will stuck */ if (mp_opt.use_ack) - update_una(msk, &mp_opt); + ack_update_msk(msk, sk, &mp_opt); /* Zero-data-length packets are dropped by the caller and not * propagated to the MPTCP layer, so the skb extension does not @@ -975,7 +1100,24 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) } } -void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) +static void mptcp_set_rwin(const struct tcp_sock *tp) +{ + const struct sock *ssk = (const struct sock *)tp; + const struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + u64 ack_seq; + + subflow = mptcp_subflow_ctx(ssk); + msk = mptcp_sk(subflow->conn); + + ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd; + + if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent))) + 
WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); +} + +void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, + struct mptcp_out_options *opts) { if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions) { @@ -1014,50 +1156,84 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) } mp_capable_done: - if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { - if (opts->ahmac) - *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, - TCPOLEN_MPTCP_ADD_ADDR, 0, - opts->addr_id); - else - *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, - TCPOLEN_MPTCP_ADD_ADDR_BASE, - MPTCP_ADDR_ECHO, - opts->addr_id); - memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); - ptr += 1; + if ((OPTION_MPTCP_ADD_ADDR +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + | OPTION_MPTCP_ADD_ADDR6 +#endif + ) & opts->suboptions) { + u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; + u8 echo = MPTCP_ADDR_ECHO; + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) + len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; +#endif + + if (opts->port) + len += TCPOLEN_MPTCP_PORT_LEN; + if (opts->ahmac) { - put_unaligned_be64(opts->ahmac, ptr); - ptr += 2; + len += sizeof(opts->ahmac); + echo = 0; } - } + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + len, echo, opts->addr_id); + if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { + memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); + ptr += 1; + } #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { - if (opts->ahmac) - *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, - TCPOLEN_MPTCP_ADD_ADDR6, 0, - opts->addr_id); - else - *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, - TCPOLEN_MPTCP_ADD_ADDR6_BASE, - MPTCP_ADDR_ECHO, - opts->addr_id); - memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); - ptr += 4; - if (opts->ahmac) { - put_unaligned_be64(opts->ahmac, ptr); - ptr += 2; + else if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { + memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); + ptr += 4; } - } #endif + if (!opts->port) { + if (opts->ahmac) { 
+ put_unaligned_be64(opts->ahmac, ptr); + ptr += 2; + } + } else { + if (opts->ahmac) { + u8 *bptr = (u8 *)ptr; + + put_unaligned_be16(opts->port, bptr); + bptr += 2; + put_unaligned_be64(opts->ahmac, bptr); + bptr += 8; + put_unaligned_be16(TCPOPT_NOP << 8 | + TCPOPT_NOP, bptr); + + ptr += 3; + } else { + put_unaligned_be32(opts->port << 16 | + TCPOPT_NOP << 8 | + TCPOPT_NOP, ptr); + ptr += 1; + } + } + } + if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, TCPOLEN_MPTCP_RM_ADDR_BASE, 0, opts->rm_id); } + if (OPTION_MPTCP_PRIO & opts->suboptions) { + const struct sock *ssk = (const struct sock *)tp; + struct mptcp_subflow_context *subflow; + + subflow = mptcp_subflow_ctx(ssk); + subflow->send_mp_prio = 0; + + *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO, + TCPOLEN_MPTCP_PRIO, + opts->backup, TCPOPT_NOP); + } + if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_SYN, @@ -1132,4 +1308,7 @@ mp_capable_done: TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } } + + if (tp) + mptcp_set_rwin(tp); } diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index e19e1525ecbb..6fd4b2c1b076 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -14,22 +14,45 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, - bool echo) + bool echo, bool port) { + u8 add_addr = READ_ONCE(msk->pm.addr_signal); + pr_debug("msk=%p, local_id=%d", msk, addr->id); + lockdep_assert_held(&msk->pm.lock); + + if (add_addr) { + pr_warn("addr_signal error, add_addr=%d", add_addr); + return -EINVAL; + } + msk->pm.local = *addr; - WRITE_ONCE(msk->pm.add_addr_echo, echo); - WRITE_ONCE(msk->pm.add_addr_signal, true); + add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL); + if (echo) + add_addr |= BIT(MPTCP_ADD_ADDR_ECHO); + if (addr->family == AF_INET6) + add_addr |= BIT(MPTCP_ADD_ADDR_IPV6); + if (port) + add_addr |= BIT(MPTCP_ADD_ADDR_PORT); + WRITE_ONCE(msk->pm.addr_signal, add_addr); return 0; } int 
mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id) { + u8 rm_addr = READ_ONCE(msk->pm.addr_signal); + pr_debug("msk=%p, local_id=%d", msk, local_id); + if (rm_addr) { + pr_warn("addr_signal error, rm_addr=%d", rm_addr); + return -EINVAL; + } + msk->pm.rm_id = local_id; - WRITE_ONCE(msk->pm.rm_addr_signal, true); + rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); + WRITE_ONCE(msk->pm.addr_signal, rm_addr); return 0; } @@ -45,22 +68,26 @@ int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id) /* path manager event handlers */ -void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side) +void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side) { struct mptcp_pm_data *pm = &msk->pm; pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side); WRITE_ONCE(pm->server_side, server_side); + mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC); } bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; + unsigned int subflows_max; int ret = 0; + subflows_max = mptcp_pm_get_subflows_max(msk); + pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows, - pm->subflows_max, READ_ONCE(pm->accept_subflow)); + subflows_max, READ_ONCE(pm->accept_subflow)); /* try to avoid acquiring the lock below */ if (!READ_ONCE(pm->accept_subflow)) @@ -68,8 +95,8 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) spin_lock_bh(&pm->lock); if (READ_ONCE(pm->accept_subflow)) { - ret = pm->subflows < pm->subflows_max; - if (ret && ++pm->subflows == pm->subflows_max) + ret = pm->subflows < subflows_max; + if (ret && ++pm->subflows == subflows_max) WRITE_ONCE(pm->accept_subflow, false); } spin_unlock_bh(&pm->lock); @@ -89,27 +116,35 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, return false; msk->pm.status |= BIT(new_status); - if (schedule_work(&msk->work)) - sock_hold((struct sock *)msk); + mptcp_schedule_work((struct sock *)msk); return true; } -void 
mptcp_pm_fully_established(struct mptcp_sock *msk) +void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp) { struct mptcp_pm_data *pm = &msk->pm; + bool announce = false; pr_debug("msk=%p", msk); - /* try to avoid acquiring the lock below */ - if (!READ_ONCE(pm->work_pending)) - return; - spin_lock_bh(&pm->lock); - if (READ_ONCE(pm->work_pending)) + /* mptcp_pm_fully_established() can be invoked by multiple + * racing paths - accept() and check_fully_established() + * be sure to serve this event only once. + */ + if (READ_ONCE(pm->work_pending) && + !(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED))) mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED); + if ((msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0) + announce = true; + + msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED); spin_unlock_bh(&pm->lock); + + if (announce) + mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, gfp); } void mptcp_pm_connection_closed(struct mptcp_sock *msk) @@ -148,32 +183,56 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk, pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id, READ_ONCE(pm->accept_addr)); + mptcp_event_addr_announced(msk, addr); + spin_lock_bh(&pm->lock); - if (!READ_ONCE(pm->accept_addr)) - mptcp_pm_announce_addr(msk, addr, true); - else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) + if (!READ_ONCE(pm->accept_addr)) { + mptcp_pm_announce_addr(msk, addr, true, addr->port); + mptcp_pm_add_addr_send_ack(msk); + } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->remote = *addr; + } spin_unlock_bh(&pm->lock); } +void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk) +{ + if (!mptcp_pm_should_add_signal(msk)) + return; + + mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK); +} + void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id) { struct mptcp_pm_data *pm = &msk->pm; pr_debug("msk=%p remote_id=%d", msk, rm_id); + mptcp_event_addr_removed(msk, rm_id); + 
spin_lock_bh(&pm->lock); mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED); pm->rm_id = rm_id; spin_unlock_bh(&pm->lock); } +void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup); + subflow->backup = bkup; + + mptcp_event(MPTCP_EVENT_SUB_PRIORITY, mptcp_sk(subflow->conn), sk, GFP_ATOMIC); +} + /* path manager helpers */ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, - struct mptcp_addr_info *saddr, bool *echo) + struct mptcp_addr_info *saddr, bool *echo, bool *port) { int ret = false; @@ -183,13 +242,14 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, if (!mptcp_pm_should_add_signal(msk)) goto out_unlock; - *echo = READ_ONCE(msk->pm.add_addr_echo); + *echo = mptcp_pm_should_add_signal_echo(msk); + *port = mptcp_pm_should_add_signal_port(msk); - if (remaining < mptcp_add_addr_len(msk->pm.local.family, *echo)) + if (remaining < mptcp_add_addr_len(msk->pm.local.family, *echo, *port)) goto out_unlock; *saddr = msk->pm.local; - WRITE_ONCE(msk->pm.add_addr_signal, false); + WRITE_ONCE(msk->pm.addr_signal, 0); ret = true; out_unlock: @@ -212,7 +272,7 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, goto out_unlock; *rm_id = msk->pm.rm_id; - WRITE_ONCE(msk->pm.rm_addr_signal, false); + WRITE_ONCE(msk->pm.addr_signal, 0); ret = true; out_unlock: @@ -233,11 +293,9 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) msk->pm.subflows = 0; msk->pm.rm_id = 0; WRITE_ONCE(msk->pm.work_pending, false); - WRITE_ONCE(msk->pm.add_addr_signal, false); - WRITE_ONCE(msk->pm.rm_addr_signal, false); + WRITE_ONCE(msk->pm.addr_signal, 0); WRITE_ONCE(msk->pm.accept_addr, false); WRITE_ONCE(msk->pm.accept_subflow, false); - WRITE_ONCE(msk->pm.add_addr_echo, false); msk->pm.status = 0; spin_lock_init(&msk->pm.lock); diff --git a/net/mptcp/pm_netlink.c 
b/net/mptcp/pm_netlink.c index 0d6f3d912891..8e8e35fa4002 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -26,6 +26,7 @@ struct mptcp_pm_addr_entry { struct list_head list; struct mptcp_addr_info addr; struct rcu_head rcu; + struct socket *lsk; }; struct mptcp_pm_add_entry { @@ -36,6 +37,9 @@ struct mptcp_pm_add_entry { u8 retrans_times; }; +#define MAX_ADDR_ID 255 +#define BITMAP_SZ DIV_ROUND_UP(MAX_ADDR_ID + 1, BITS_PER_LONG) + struct pm_nl_pernet { /* protects pernet updates */ spinlock_t lock; @@ -46,25 +50,33 @@ struct pm_nl_pernet { unsigned int local_addr_max; unsigned int subflows_max; unsigned int next_id; + unsigned long id_bitmap[BITMAP_SZ]; }; #define MPTCP_PM_ADDR_MAX 8 #define ADD_ADDR_RETRANS_MAX 3 +static void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk); + static bool addresses_equal(const struct mptcp_addr_info *a, struct mptcp_addr_info *b, bool use_port) { bool addr_equals = false; - if (a->family != b->family) - return false; - - if (a->family == AF_INET) - addr_equals = a->addr.s_addr == b->addr.s_addr; + if (a->family == b->family) { + if (a->family == AF_INET) + addr_equals = a->addr.s_addr == b->addr.s_addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) - else - addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6); + else + addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6); + } else if (a->family == AF_INET) { + if (ipv6_addr_v4mapped(&b->addr6)) + addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3]; + } else if (b->family == AF_INET) { + if (ipv6_addr_v4mapped(&a->addr6)) + addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr; #endif + } if (!addr_equals) return false; @@ -81,14 +93,14 @@ static bool address_zero(const struct mptcp_addr_info *addr) memset(&zero, 0, sizeof(zero)); zero.family = addr->family; - return addresses_equal(addr, &zero, false); + return addresses_equal(addr, &zero, true); } static void local_address(const struct sock_common *skc, struct mptcp_addr_info *addr) { - addr->port = 0; addr->family = 
skc->skc_family; + addr->port = htons(skc->skc_num); if (addr->family == AF_INET) addr->addr.s_addr = skc->skc_rcv_saddr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -121,7 +133,7 @@ static bool lookup_subflow_by_saddr(const struct list_head *list, skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); local_address(skc, &cur); - if (addresses_equal(&cur, saddr, false)) + if (addresses_equal(&cur, saddr, saddr->port)) return true; } @@ -133,24 +145,34 @@ select_local_address(const struct pm_nl_pernet *pernet, struct mptcp_sock *msk) { struct mptcp_pm_addr_entry *entry, *ret = NULL; + struct sock *sk = (struct sock *)msk; + + msk_owned_by_me(msk); rcu_read_lock(); - spin_lock_bh(&msk->join_list_lock); + __mptcp_flush_join_list(msk); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) continue; + if (entry->addr.family != sk->sk_family) { +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if ((entry->addr.family == AF_INET && + !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) || + (sk->sk_family == AF_INET && + !ipv6_addr_v4mapped(&entry->addr.addr6))) +#endif + continue; + } + /* avoid any address already in use by subflows and * pending join */ - if (entry->addr.family == ((struct sock *)msk)->sk_family && - !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && - !lookup_subflow_by_saddr(&msk->join_list, &entry->addr)) { + if (!lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) { ret = entry; break; } } - spin_unlock_bh(&msk->join_list_lock); rcu_read_unlock(); return ret; } @@ -179,11 +201,47 @@ select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos) return ret; } +unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet; + + pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + return READ_ONCE(pernet->add_addr_signal_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); + +unsigned int mptcp_pm_get_add_addr_accept_max(struct 
mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet; + + pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + return READ_ONCE(pernet->add_addr_accept_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); + +unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet; + + pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + return READ_ONCE(pernet->subflows_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); + +unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet; + + pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + return READ_ONCE(pernet->local_addr_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); + static void check_work_pending(struct mptcp_sock *msk) { - if (msk->pm.add_addr_signaled == msk->pm.add_addr_signal_max && - (msk->pm.local_addr_used == msk->pm.local_addr_max || - msk->pm.subflows == msk->pm.subflows_max)) + if (msk->pm.add_addr_signaled == mptcp_pm_get_add_addr_signal_max(msk) && + (msk->pm.local_addr_used == mptcp_pm_get_local_addr_max(msk) || + msk->pm.subflows == mptcp_pm_get_subflows_max(msk))) WRITE_ONCE(msk->pm.work_pending, false); } @@ -193,14 +251,37 @@ lookup_anno_list_by_saddr(struct mptcp_sock *msk, { struct mptcp_pm_add_entry *entry; + lockdep_assert_held(&msk->pm.lock); + list_for_each_entry(entry, &msk->pm.anno_list, list) { - if (addresses_equal(&entry->addr, addr, false)) + if (addresses_equal(&entry->addr, addr, true)) return entry; } return NULL; } +bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) +{ + struct mptcp_pm_add_entry *entry; + struct mptcp_addr_info saddr; + bool ret = false; + + local_address((struct sock_common *)sk, &saddr); + + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(entry, &msk->pm.anno_list, list) { + if (addresses_equal(&entry->addr, &saddr, true)) { + ret = true; + goto out; + } + } + +out: + 
spin_unlock_bh(&msk->pm.lock); + return ret; +} + static void mptcp_pm_add_timer(struct timer_list *timer) { struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer); @@ -227,12 +308,14 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (!mptcp_pm_should_add_signal(msk)) { pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id); - mptcp_pm_announce_addr(msk, &entry->addr, false); + mptcp_pm_announce_addr(msk, &entry->addr, false, entry->addr.port); + mptcp_pm_add_addr_send_ack(msk); entry->retrans_times++; } if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) - sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX); + sk_reset_timer(sk, timer, + jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); spin_unlock_bh(&msk->pm.lock); @@ -264,6 +347,9 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; + struct net *net = sock_net(sk); + + lockdep_assert_held(&msk->pm.lock); if (lookup_anno_list_by_saddr(msk, &entry->addr)) return false; @@ -279,7 +365,8 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, add_entry->retrans_times = 0; timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); - sk_reset_timer(sk, &add_entry->add_timer, jiffies + TCP_RTO_MAX); + sk_reset_timer(sk, &add_entry->add_timer, + jiffies + mptcp_get_add_addr_timeout(net)); return true; } @@ -304,46 +391,54 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk) static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { - struct mptcp_addr_info remote = { 0 }; struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *local; + unsigned int add_addr_signal_max; + unsigned int local_addr_max; struct pm_nl_pernet *pernet; + unsigned int subflows_max; - pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + pernet = net_generic(sock_net(sk), pm_nl_pernet_id); + + add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); + local_addr_max 
= mptcp_pm_get_local_addr_max(msk); + subflows_max = mptcp_pm_get_subflows_max(msk); pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", - msk->pm.local_addr_used, msk->pm.local_addr_max, - msk->pm.add_addr_signaled, msk->pm.add_addr_signal_max, - msk->pm.subflows, msk->pm.subflows_max); + msk->pm.local_addr_used, local_addr_max, + msk->pm.add_addr_signaled, add_addr_signal_max, + msk->pm.subflows, subflows_max); /* check first for announce */ - if (msk->pm.add_addr_signaled < msk->pm.add_addr_signal_max) { + if (msk->pm.add_addr_signaled < add_addr_signal_max) { local = select_signal_address(pernet, msk->pm.add_addr_signaled); if (local) { if (mptcp_pm_alloc_anno_list(msk, local)) { msk->pm.add_addr_signaled++; - mptcp_pm_announce_addr(msk, &local->addr, false); + mptcp_pm_announce_addr(msk, &local->addr, false, local->addr.port); + mptcp_pm_nl_add_addr_send_ack(msk); } } else { /* pick failed, avoid fourther attempts later */ - msk->pm.local_addr_used = msk->pm.add_addr_signal_max; + msk->pm.local_addr_used = add_addr_signal_max; } check_work_pending(msk); } /* check if should create a new subflow */ - if (msk->pm.local_addr_used < msk->pm.local_addr_max && - msk->pm.subflows < msk->pm.subflows_max) { - remote_address((struct sock_common *)sk, &remote); - + if (msk->pm.local_addr_used < local_addr_max && + msk->pm.subflows < subflows_max) { local = select_local_address(pernet, msk); if (local) { + struct mptcp_addr_info remote = { 0 }; + msk->pm.local_addr_used++; msk->pm.subflows++; check_work_pending(msk); + remote_address((struct sock_common *)sk, &remote); spin_unlock_bh(&msk->pm.lock); __mptcp_subflow_connect(sk, &local->addr, &remote); spin_lock_bh(&msk->pm.lock); @@ -351,34 +446,40 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) } /* lookup failed, avoid fourther attempts later */ - msk->pm.local_addr_used = msk->pm.local_addr_max; + msk->pm.local_addr_used = local_addr_max; check_work_pending(msk); } } -void 
mptcp_pm_nl_fully_established(struct mptcp_sock *msk) +static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk) { mptcp_pm_create_subflow_or_signal_addr(msk); } -void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) +static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) { mptcp_pm_create_subflow_or_signal_addr(msk); } -void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) +static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; + unsigned int add_addr_accept_max; struct mptcp_addr_info remote; struct mptcp_addr_info local; + unsigned int subflows_max; + bool use_port = false; + + add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); + subflows_max = mptcp_pm_get_subflows_max(msk); pr_debug("accepted %d:%d remote family %d", - msk->pm.add_addr_accepted, msk->pm.add_addr_accept_max, + msk->pm.add_addr_accepted, add_addr_accept_max, msk->pm.remote.family); msk->pm.add_addr_accepted++; msk->pm.subflows++; - if (msk->pm.add_addr_accepted >= msk->pm.add_addr_accept_max || - msk->pm.subflows >= msk->pm.subflows_max) + if (msk->pm.add_addr_accepted >= add_addr_accept_max || + msk->pm.subflows >= subflows_max) WRITE_ONCE(msk->pm.accept_addr, false); /* connect to the specified remote address, using whatever @@ -387,23 +488,98 @@ void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) remote = msk->pm.remote; if (!remote.port) remote.port = sk->sk_dport; + else + use_port = true; memset(&local, 0, sizeof(local)); local.family = remote.family; spin_unlock_bh(&msk->pm.lock); - __mptcp_subflow_connect((struct sock *)msk, &local, &remote); + __mptcp_subflow_connect(sk, &local, &remote); spin_lock_bh(&msk->pm.lock); - mptcp_pm_announce_addr(msk, &remote, true); + mptcp_pm_announce_addr(msk, &remote, true, use_port); + mptcp_pm_nl_add_addr_send_ack(msk); +} + +static void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + 
msk_owned_by_me(msk); + lockdep_assert_held(&msk->pm.lock); + + if (!mptcp_pm_should_add_signal(msk)) + return; + + __mptcp_flush_join_list(msk); + subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node); + if (subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + u8 add_addr; + + spin_unlock_bh(&msk->pm.lock); + pr_debug("send ack for add_addr%s%s", + mptcp_pm_should_add_signal_ipv6(msk) ? " [ipv6]" : "", + mptcp_pm_should_add_signal_port(msk) ? " [port]" : ""); + + lock_sock(ssk); + tcp_send_ack(ssk); + release_sock(ssk); + spin_lock_bh(&msk->pm.lock); + + add_addr = READ_ONCE(msk->pm.addr_signal); + if (mptcp_pm_should_add_signal_ipv6(msk)) + add_addr &= ~BIT(MPTCP_ADD_ADDR_IPV6); + if (mptcp_pm_should_add_signal_port(msk)) + add_addr &= ~BIT(MPTCP_ADD_ADDR_PORT); + WRITE_ONCE(msk->pm.addr_signal, add_addr); + } +} + +int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + u8 bkup) +{ + struct mptcp_subflow_context *subflow; + + pr_debug("bkup=%d", bkup); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + struct sock *sk = (struct sock *)msk; + struct mptcp_addr_info local; + + local_address((struct sock_common *)ssk, &local); + if (!addresses_equal(&local, addr, addr->port)) + continue; + + subflow->backup = bkup; + subflow->send_mp_prio = 1; + subflow->request_bkup = bkup; + __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIOTX); + + spin_unlock_bh(&msk->pm.lock); + pr_debug("send ack for mp_prio"); + lock_sock(ssk); + tcp_send_ack(ssk); + release_sock(ssk); + spin_lock_bh(&msk->pm.lock); + + return 0; + } + + return -EINVAL; } -void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) +static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; pr_debug("address rm_id %d", msk->pm.rm_id); + msk_owned_by_me(msk); + if (!msk->pm.rm_id) return; @@ -413,14 +589,13 
@@ void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - long timeout = 0; if (msk->pm.rm_id != subflow->remote_id) continue; spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); - __mptcp_close_ssk(sk, ssk, subflow, timeout); + mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); msk->pm.add_addr_accepted--; @@ -433,6 +608,39 @@ void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) } } +void mptcp_pm_nl_work(struct mptcp_sock *msk) +{ + struct mptcp_pm_data *pm = &msk->pm; + + msk_owned_by_me(msk); + + spin_lock_bh(&msk->pm.lock); + + pr_debug("msk=%p status=%x", msk, pm->status); + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); + mptcp_pm_nl_add_addr_received(msk); + } + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); + mptcp_pm_nl_add_addr_send_ack(msk); + } + if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { + pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); + mptcp_pm_nl_rm_addr_received(msk); + } + if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); + mptcp_pm_nl_fully_established(msk); + } + if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); + mptcp_pm_nl_subflow_established(msk); + } + + spin_unlock_bh(&msk->pm.lock); +} + void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id) { struct mptcp_subflow_context *subflow, *tmp; @@ -440,6 +648,8 @@ void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id) pr_debug("subflow rm_id %d", rm_id); + msk_owned_by_me(msk); + if (!rm_id) return; @@ -449,14 +659,13 @@ void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id) list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { struct 
sock *ssk = mptcp_subflow_tcp_sock(subflow); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - long timeout = 0; if (rm_id != subflow->local_id) continue; spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); - __mptcp_close_ssk(sk, ssk, subflow, timeout); + mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); msk->pm.local_addr_used--; @@ -479,16 +688,19 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, struct mptcp_pm_addr_entry *entry) { struct mptcp_pm_addr_entry *cur; + unsigned int addr_max; int ret = -EINVAL; spin_lock_bh(&pernet->lock); /* to keep the code simple, don't do IDR-like allocation for address ID, * just bail when we exceed limits */ - if (pernet->next_id > 255) - goto out; + if (pernet->next_id == MAX_ADDR_ID) + pernet->next_id = 1; if (pernet->addrs >= MPTCP_PM_ADDR_MAX) goto out; + if (test_bit(entry->addr.id, pernet->id_bitmap)) + goto out; /* do not insert duplicate address, differentiate on port only * singled addresses @@ -500,12 +712,34 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, goto out; } - if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) - pernet->add_addr_signal_max++; - if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) - pernet->local_addr_max++; + if (!entry->addr.id) { +find_next: + entry->addr.id = find_next_zero_bit(pernet->id_bitmap, + MAX_ADDR_ID + 1, + pernet->next_id); + if ((!entry->addr.id || entry->addr.id > MAX_ADDR_ID) && + pernet->next_id != 1) { + pernet->next_id = 1; + goto find_next; + } + } + + if (!entry->addr.id || entry->addr.id > MAX_ADDR_ID) + goto out; + + __set_bit(entry->addr.id, pernet->id_bitmap); + if (entry->addr.id > pernet->next_id) + pernet->next_id = entry->addr.id; + + if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { + addr_max = pernet->add_addr_signal_max; + WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); + } + if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + addr_max = pernet->local_addr_max; + 
WRITE_ONCE(pernet->local_addr_max, addr_max + 1); + } - entry->addr.id = pernet->next_id++; pernet->addrs++; list_add_tail_rcu(&entry->list, &pernet->local_addr_list); ret = entry->addr.id; @@ -515,6 +749,53 @@ out: return ret; } +static int mptcp_pm_nl_create_listen_socket(struct sock *sk, + struct mptcp_pm_addr_entry *entry) +{ + struct sockaddr_storage addr; + struct mptcp_sock *msk; + struct socket *ssock; + int backlog = 1024; + int err; + + err = sock_create_kern(sock_net(sk), entry->addr.family, + SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk); + if (err) + return err; + + msk = mptcp_sk(entry->lsk->sk); + if (!msk) { + err = -EINVAL; + goto out; + } + + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + err = -EINVAL; + goto out; + } + + mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); + err = kernel_bind(ssock, (struct sockaddr *)&addr, + sizeof(struct sockaddr_in)); + if (err) { + pr_warn("kernel_bind error, err=%d", err); + goto out; + } + + err = kernel_listen(ssock, backlog); + if (err) { + pr_warn("kernel_listen error, err=%d", err); + goto out; + } + + return 0; + +out: + sock_release(entry->lsk); + return err; +} + int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) { struct mptcp_pm_addr_entry *entry; @@ -541,7 +822,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (addresses_equal(&entry->addr, &skc_local, false)) { + if (addresses_equal(&entry->addr, &skc_local, entry->addr.port)) { ret = entry->addr.id; break; } @@ -558,6 +839,9 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) entry->addr = skc_local; entry->addr.ifindex = 0; entry->addr.flags = 0; + entry->addr.id = 0; + entry->addr.port = 0; + entry->lsk = NULL; ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); if (ret < 0) kfree(entry); @@ -568,26 +852,23 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock 
*msk, struct sock_common *skc) void mptcp_pm_nl_data_init(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; - struct pm_nl_pernet *pernet; bool subflows; - pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); - - pm->add_addr_signal_max = READ_ONCE(pernet->add_addr_signal_max); - pm->add_addr_accept_max = READ_ONCE(pernet->add_addr_accept_max); - pm->local_addr_max = READ_ONCE(pernet->local_addr_max); - pm->subflows_max = READ_ONCE(pernet->subflows_max); - subflows = !!pm->subflows_max; - WRITE_ONCE(pm->work_pending, (!!pm->local_addr_max && subflows) || - !!pm->add_addr_signal_max); - WRITE_ONCE(pm->accept_addr, !!pm->add_addr_accept_max && subflows); + subflows = !!mptcp_pm_get_subflows_max(msk); + WRITE_ONCE(pm->work_pending, (!!mptcp_pm_get_local_addr_max(msk) && subflows) || + !!mptcp_pm_get_add_addr_signal_max(msk)); + WRITE_ONCE(pm->accept_addr, !!mptcp_pm_get_add_addr_accept_max(msk) && subflows); WRITE_ONCE(pm->accept_subflow, subflows); } -#define MPTCP_PM_CMD_GRP_OFFSET 0 +#define MPTCP_PM_CMD_GRP_OFFSET 0 +#define MPTCP_PM_EV_GRP_OFFSET 1 static const struct genl_multicast_group mptcp_pm_mcgrps[] = { [MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, }, + [MPTCP_PM_EV_GRP_OFFSET] = { .name = MPTCP_PM_EV_GRP_NAME, + .flags = GENL_UNS_ADMIN_PERM, + }, }; static const struct nla_policy @@ -683,6 +964,9 @@ skip_family: if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) entry->addr.flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); + if (tb[MPTCP_PM_ADDR_ATTR_PORT]) + entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); + return 0; } @@ -691,6 +975,31 @@ static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) return net_generic(genl_info_net(info), pm_nl_pernet_id); } +static int mptcp_nl_add_subflow_or_signal_addr(struct net *net) +{ + struct mptcp_sock *msk; + long s_slot = 0, s_num = 0; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + + if 
(!READ_ONCE(msk->fully_established)) + goto next; + + lock_sock(sk); + spin_lock_bh(&msk->pm.lock); + mptcp_pm_create_subflow_or_signal_addr(msk); + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } + + return 0; +} + static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; @@ -709,13 +1018,25 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) } *entry = addr; + if (entry->addr.port) { + ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry); + if (ret) { + GENL_SET_ERR_MSG(info, "create listen socket error"); + kfree(entry); + return ret; + } + } ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); if (ret < 0) { GENL_SET_ERR_MSG(info, "too many addresses or duplicate one"); + if (entry->lsk) + sock_release(entry->lsk); kfree(entry); return ret; } + mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk)); + return 0; } @@ -793,11 +1114,44 @@ next: return 0; } +struct addr_entry_release_work { + struct rcu_work rwork; + struct mptcp_pm_addr_entry *entry; +}; + +static void mptcp_pm_release_addr_entry(struct work_struct *work) +{ + struct addr_entry_release_work *w; + struct mptcp_pm_addr_entry *entry; + + w = container_of(to_rcu_work(work), struct addr_entry_release_work, rwork); + entry = w->entry; + if (entry) { + if (entry->lsk) + sock_release(entry->lsk); + kfree(entry); + } + kfree(w); +} + +static void mptcp_pm_free_addr_entry(struct mptcp_pm_addr_entry *entry) +{ + struct addr_entry_release_work *w; + + w = kmalloc(sizeof(*w), GFP_ATOMIC); + if (w) { + INIT_RCU_WORK(&w->rwork, mptcp_pm_release_addr_entry); + w->entry = entry; + queue_rcu_work(system_wq, &w->rwork); + } +} + static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; + 
unsigned int addr_max; int ret; ret = mptcp_pm_parse_addr(attr, info, false, &addr); @@ -811,49 +1165,59 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) spin_unlock_bh(&pernet->lock); return -EINVAL; } - if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) - pernet->add_addr_signal_max--; - if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) - pernet->local_addr_max--; + if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { + addr_max = pernet->add_addr_signal_max; + WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); + } + if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + addr_max = pernet->local_addr_max; + WRITE_ONCE(pernet->local_addr_max, addr_max - 1); + } pernet->addrs--; list_del_rcu(&entry->list); + __clear_bit(entry->addr.id, pernet->id_bitmap); spin_unlock_bh(&pernet->lock); mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), &entry->addr); - kfree_rcu(entry, rcu); + mptcp_pm_free_addr_entry(entry); return ret; } -static void __flush_addrs(struct pm_nl_pernet *pernet) +static void __flush_addrs(struct net *net, struct list_head *list) { - while (!list_empty(&pernet->local_addr_list)) { + while (!list_empty(list)) { struct mptcp_pm_addr_entry *cur; - cur = list_entry(pernet->local_addr_list.next, + cur = list_entry(list->next, struct mptcp_pm_addr_entry, list); + mptcp_nl_remove_subflow_and_signal_addr(net, &cur->addr); list_del_rcu(&cur->list); - kfree_rcu(cur, rcu); + mptcp_pm_free_addr_entry(cur); } } static void __reset_counters(struct pm_nl_pernet *pernet) { - pernet->add_addr_signal_max = 0; - pernet->add_addr_accept_max = 0; - pernet->local_addr_max = 0; + WRITE_ONCE(pernet->add_addr_signal_max, 0); + WRITE_ONCE(pernet->add_addr_accept_max, 0); + WRITE_ONCE(pernet->local_addr_max, 0); pernet->addrs = 0; } static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + LIST_HEAD(free_list); spin_lock_bh(&pernet->lock); - 
__flush_addrs(pernet); + list_splice_init(&pernet->local_addr_list, &free_list); __reset_counters(pernet); + pernet->next_id = 1; + bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1); spin_unlock_bh(&pernet->lock); + __flush_addrs(sock_net(skb->sk), &free_list); return 0; } @@ -869,6 +1233,8 @@ static int mptcp_nl_fill_addr(struct sk_buff *skb, if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family)) goto nla_put_failure; + if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_PORT, ntohs(addr->port))) + goto nla_put_failure; if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id)) goto nla_put_failure; if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->addr.flags)) @@ -952,27 +1318,34 @@ static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg, struct pm_nl_pernet *pernet; int id = cb->args[0]; void *hdr; + int i; pernet = net_generic(net, pm_nl_pernet_id); spin_lock_bh(&pernet->lock); - list_for_each_entry(entry, &pernet->local_addr_list, list) { - if (entry->addr.id <= id) - continue; - - hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, &mptcp_genl_family, - NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); - if (!hdr) - break; + for (i = id; i < MAX_ADDR_ID + 1; i++) { + if (test_bit(i, pernet->id_bitmap)) { + entry = __lookup_addr_by_id(pernet, i); + if (!entry) + break; + + if (entry->addr.id <= id) + continue; + + hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &mptcp_genl_family, + NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); + if (!hdr) + break; + + if (mptcp_nl_fill_addr(msg, entry) < 0) { + genlmsg_cancel(msg, hdr); + break; + } - if (mptcp_nl_fill_addr(msg, entry) < 0) { - genlmsg_cancel(msg, hdr); - break; + id = entry->addr.id; + genlmsg_end(msg, hdr); } - - id = entry->addr.id; - genlmsg_end(msg, hdr); } spin_unlock_bh(&pernet->lock); @@ -1054,6 +1427,321 @@ fail: return -EMSGSIZE; } +static int mptcp_nl_addr_backup(struct net *net, + struct mptcp_addr_info *addr, + u8 bkup) +{ + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + 
int ret = -EINVAL; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + + if (list_empty(&msk->conn_list)) + goto next; + + lock_sock(sk); + spin_lock_bh(&msk->pm.lock); + ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, bkup); + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } + + return ret; +} + +static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + struct net *net = sock_net(skb->sk); + u8 bkup = 0; + int ret; + + ret = mptcp_pm_parse_addr(attr, info, true, &addr); + if (ret < 0) + return ret; + + if (addr.addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) + bkup = 1; + + list_for_each_entry(entry, &pernet->local_addr_list, list) { + if (addresses_equal(&entry->addr, &addr.addr, true)) { + ret = mptcp_nl_addr_backup(net, &entry->addr, bkup); + if (ret) + return ret; + + if (bkup) + entry->addr.flags |= MPTCP_PM_ADDR_FLAG_BACKUP; + else + entry->addr.flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; + } + } + + return 0; +} + +static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp) +{ + genlmsg_multicast_netns(&mptcp_genl_family, net, + nlskb, 0, MPTCP_PM_EV_GRP_OFFSET, gfp); +} + +static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk) +{ + const struct inet_sock *issk = inet_sk(ssk); + const struct mptcp_subflow_context *sf; + + if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family)) + return -EMSGSIZE; + + switch (ssk->sk_family) { + case AF_INET: + if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr)) + return -EMSGSIZE; + if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, issk->inet_daddr)) + return -EMSGSIZE; + break; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + case AF_INET6: { + const struct ipv6_pinfo *np = inet6_sk(ssk); + + if 
(nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) + return -EMSGSIZE; + if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &ssk->sk_v6_daddr)) + return -EMSGSIZE; + break; + } +#endif + default: + WARN_ON_ONCE(1); + return -EMSGSIZE; + } + + if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport)) + return -EMSGSIZE; + if (nla_put_be16(skb, MPTCP_ATTR_DPORT, issk->inet_dport)) + return -EMSGSIZE; + + sf = mptcp_subflow_ctx(ssk); + if (WARN_ON_ONCE(!sf)) + return -EINVAL; + + if (nla_put_u8(skb, MPTCP_ATTR_LOC_ID, sf->local_id)) + return -EMSGSIZE; + + if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, sf->remote_id)) + return -EMSGSIZE; + + return 0; +} + +static int mptcp_event_put_token_and_ssk(struct sk_buff *skb, + const struct mptcp_sock *msk, + const struct sock *ssk) +{ + const struct sock *sk = (const struct sock *)msk; + const struct mptcp_subflow_context *sf; + u8 sk_err; + + if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token)) + return -EMSGSIZE; + + if (mptcp_event_add_subflow(skb, ssk)) + return -EMSGSIZE; + + sf = mptcp_subflow_ctx(ssk); + if (WARN_ON_ONCE(!sf)) + return -EINVAL; + + if (nla_put_u8(skb, MPTCP_ATTR_BACKUP, sf->backup)) + return -EMSGSIZE; + + if (ssk->sk_bound_dev_if && + nla_put_s32(skb, MPTCP_ATTR_IF_IDX, ssk->sk_bound_dev_if)) + return -EMSGSIZE; + + sk_err = ssk->sk_err; + if (sk_err && sk->sk_state == TCP_ESTABLISHED && + nla_put_u8(skb, MPTCP_ATTR_ERROR, sk_err)) + return -EMSGSIZE; + + return 0; +} + +static int mptcp_event_sub_established(struct sk_buff *skb, + const struct mptcp_sock *msk, + const struct sock *ssk) +{ + return mptcp_event_put_token_and_ssk(skb, msk, ssk); +} + +static int mptcp_event_sub_closed(struct sk_buff *skb, + const struct mptcp_sock *msk, + const struct sock *ssk) +{ + if (mptcp_event_put_token_and_ssk(skb, msk, ssk)) + return -EMSGSIZE; + + return 0; +} + +static int mptcp_event_created(struct sk_buff *skb, + const struct mptcp_sock *msk, + const struct sock *ssk) +{ + int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, 
msk->token); + + if (err) + return err; + + return mptcp_event_add_subflow(skb, ssk); +} + +void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id) +{ + struct net *net = sock_net((const struct sock *)msk); + struct nlmsghdr *nlh; + struct sk_buff *skb; + + if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) + return; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return; + + nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, MPTCP_EVENT_REMOVED); + if (!nlh) + goto nla_put_failure; + + if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token)) + goto nla_put_failure; + + if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, id)) + goto nla_put_failure; + + genlmsg_end(skb, nlh); + mptcp_nl_mcast_send(net, skb, GFP_ATOMIC); + return; + +nla_put_failure: + kfree_skb(skb); +} + +void mptcp_event_addr_announced(const struct mptcp_sock *msk, + const struct mptcp_addr_info *info) +{ + struct net *net = sock_net((const struct sock *)msk); + struct nlmsghdr *nlh; + struct sk_buff *skb; + + if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) + return; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return; + + nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, + MPTCP_EVENT_ANNOUNCED); + if (!nlh) + goto nla_put_failure; + + if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token)) + goto nla_put_failure; + + if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id)) + goto nla_put_failure; + + if (nla_put_be16(skb, MPTCP_ATTR_DPORT, info->port)) + goto nla_put_failure; + + switch (info->family) { + case AF_INET: + if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, info->addr.s_addr)) + goto nla_put_failure; + break; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + case AF_INET6: + if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &info->addr6)) + goto nla_put_failure; + break; +#endif + default: + WARN_ON_ONCE(1); + goto nla_put_failure; + } + + genlmsg_end(skb, nlh); + mptcp_nl_mcast_send(net, skb, GFP_ATOMIC); + return; + 
+nla_put_failure: + kfree_skb(skb); +} + +void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, + const struct sock *ssk, gfp_t gfp) +{ + struct net *net = sock_net((const struct sock *)msk); + struct nlmsghdr *nlh; + struct sk_buff *skb; + + if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) + return; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!skb) + return; + + nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, type); + if (!nlh) + goto nla_put_failure; + + switch (type) { + case MPTCP_EVENT_UNSPEC: + WARN_ON_ONCE(1); + break; + case MPTCP_EVENT_CREATED: + case MPTCP_EVENT_ESTABLISHED: + if (mptcp_event_created(skb, msk, ssk) < 0) + goto nla_put_failure; + break; + case MPTCP_EVENT_CLOSED: + if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token) < 0) + goto nla_put_failure; + break; + case MPTCP_EVENT_ANNOUNCED: + case MPTCP_EVENT_REMOVED: + /* call mptcp_event_addr_announced()/removed instead */ + WARN_ON_ONCE(1); + break; + case MPTCP_EVENT_SUB_ESTABLISHED: + case MPTCP_EVENT_SUB_PRIORITY: + if (mptcp_event_sub_established(skb, msk, ssk) < 0) + goto nla_put_failure; + break; + case MPTCP_EVENT_SUB_CLOSED: + if (mptcp_event_sub_closed(skb, msk, ssk) < 0) + goto nla_put_failure; + break; + } + + genlmsg_end(skb, nlh); + mptcp_nl_mcast_send(net, skb, gfp); + return; + +nla_put_failure: + kfree_skb(skb); +} + static const struct genl_small_ops mptcp_pm_ops[] = { { .cmd = MPTCP_PM_CMD_ADD_ADDR, @@ -1084,6 +1772,11 @@ static const struct genl_small_ops mptcp_pm_ops[] = { .cmd = MPTCP_PM_CMD_GET_LIMITS, .doit = mptcp_nl_cmd_get_limits, }, + { + .cmd = MPTCP_PM_CMD_SET_FLAGS, + .doit = mptcp_nl_cmd_set_flags, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family mptcp_genl_family __ro_after_init = { @@ -1106,6 +1799,7 @@ static int __net_init pm_nl_init_net(struct net *net) INIT_LIST_HEAD_RCU(&pernet->local_addr_list); __reset_counters(pernet); pernet->next_id = 1; + bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 
1); spin_lock_init(&pernet->lock); return 0; } @@ -1115,10 +1809,12 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list) struct net *net; list_for_each_entry(net, net_list, exit_list) { + struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); + /* net is removed from namespace list, can't race with * other modifiers */ - __flush_addrs(net_generic(net, pm_nl_pernet_id)); + __flush_addrs(net, &pernet->local_addr_list); } } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e7419fd15d84..c5d5e68940ea 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -11,6 +11,7 @@ #include <linux/netdevice.h> #include <linux/sched/signal.h> #include <linux/atomic.h> +#include <linux/igmp.h> #include <net/sock.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> @@ -19,8 +20,10 @@ #include <net/tcp_states.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include <net/transp_v6.h> +#include <net/addrconf.h> #endif #include <net/mptcp.h> +#include <net/xfrm.h> #include "protocol.h" #include "mib.h" @@ -41,11 +44,17 @@ struct mptcp_skb_cb { static struct percpu_counter mptcp_sockets_allocated; +static void __mptcp_destroy_sock(struct sock *sk); +static void __mptcp_check_send_data_fin(struct sock *sk); + +DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); +static struct net_device mptcp_napi_dev; + /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not * completed yet or has failed, return the subflow socket. * Otherwise return NULL. 
*/ -static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) +struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) { if (!msk->subflow || READ_ONCE(msk->can_ack)) return NULL; @@ -53,6 +62,12 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) return msk->subflow; } +/* Returns end sequence number of the receiver's advertised window */ +static u64 mptcp_wnd_end(const struct mptcp_sock *msk) +{ + return READ_ONCE(msk->wnd_end); +} + static bool mptcp_is_tcpsk(struct sock *sk) { struct socket *sock = sk->sk_socket; @@ -102,12 +117,9 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) msk->subflow = ssock; subflow = mptcp_subflow_ctx(ssock->sk); list_add(&subflow->node, &msk->conn_list); + sock_hold(ssock->sk); subflow->request_mptcp = 1; - - /* accept() will wait on first subflow sk_wq, and we always wakes up - * via msk->sk_socket - */ - RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq); + mptcp_sock_graft(msk->first, sk->sk_socket); return 0; } @@ -157,18 +169,19 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) struct rb_node **p, *parent; u64 seq, end_seq, max_seq; struct sk_buff *skb1; - int space; seq = MPTCP_SKB_CB(skb)->map_seq; end_seq = MPTCP_SKB_CB(skb)->end_seq; - space = tcp_space(sk); - max_seq = space > 0 ? 
space + msk->ack_seq : msk->ack_seq; + max_seq = READ_ONCE(msk->rcv_wnd_sent); pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq, RB_EMPTY_ROOT(&msk->out_of_order_queue)); - if (after64(seq, max_seq)) { + if (after64(end_seq, max_seq)) { /* out of window */ mptcp_drop(sk, skb); + pr_debug("oow by %lld, rcv_wnd_sent %llu\n", + (unsigned long long)end_seq - (unsigned long)max_seq, + (unsigned long long)msk->rcv_wnd_sent); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); return; } @@ -323,38 +336,48 @@ static void mptcp_stop_timer(struct sock *sk) mptcp_sk(sk)->timer_ival = 0; } -static void mptcp_check_data_fin_ack(struct sock *sk) +static void mptcp_close_wake_up(struct sock *sk) +{ + if (sock_flag(sk, SOCK_DEAD)) + return; + + sk->sk_state_change(sk); + if (sk->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); +} + +static bool mptcp_pending_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (__mptcp_check_fallback(msk)) - return; + return !__mptcp_check_fallback(msk) && + ((1 << sk->sk_state) & + (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && + msk->write_seq == READ_ONCE(msk->snd_una); +} - /* Look for an acknowledged DATA_FIN */ - if (((1 << sk->sk_state) & - (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && - msk->write_seq == atomic64_read(&msk->snd_una)) { - mptcp_stop_timer(sk); +static void mptcp_check_data_fin_ack(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + /* Look for an acknowledged DATA_FIN */ + if (mptcp_pending_data_fin_ack(sk)) { WRITE_ONCE(msk->snd_data_fin_enable, 0); switch (sk->sk_state) { case TCP_FIN_WAIT1: inet_sk_state_store(sk, TCP_FIN_WAIT2); - sk->sk_state_change(sk); break; case TCP_CLOSING: case TCP_LAST_ACK: inet_sk_state_store(sk, TCP_CLOSE); - sk->sk_state_change(sk); break; } - if (sk->sk_shutdown == SHUTDOWN_MASK || - sk->sk_state == TCP_CLOSE) - 
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - else - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + mptcp_close_wake_up(sk); } } @@ -388,13 +411,90 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; } -static void mptcp_check_data_fin(struct sock *sk) +static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) +{ + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ + if (subflow->request_join && !subflow->fully_established) + return false; + + /* only send if our side has not closed yet */ + return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); +} + +static bool tcp_can_send_ack(const struct sock *ssk) +{ + return !((1 << inet_sk_state_load(ssk)) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); +} + +static void mptcp_send_ack(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + if (tcp_can_send_ack(ssk)) + tcp_send_ack(ssk); + release_sock(ssk); + } +} + +static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk) +{ + int ret; + + lock_sock(ssk); + ret = tcp_can_send_ack(ssk); + if (ret) + tcp_cleanup_rbuf(ssk, 1); + release_sock(ssk); + return ret; +} + +static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) +{ + struct sock *ack_hint = READ_ONCE(msk->ack_hint); + int old_space = READ_ONCE(msk->old_wspace); + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + bool cleanup; + + /* this is a simple superset of what tcp_cleanup_rbuf() implements + * so that we don't have to acquire the ssk socket lock most of the time + * to do actually nothing + */ + cleanup = __mptcp_space(sk) - old_space >= max(0, old_space); + if (!cleanup) + return; + + /* if the hinted ssk is still active, try 
to use it */ + if (likely(ack_hint)) { + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (ack_hint == ssk && mptcp_subflow_cleanup_rbuf(ssk)) + return; + } + } + + /* otherwise pick the first active subflow */ + mptcp_for_each_subflow(msk, subflow) + if (mptcp_subflow_cleanup_rbuf(mptcp_subflow_tcp_sock(subflow))) + return; +} + +static bool mptcp_check_data_fin(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); u64 rcv_data_fin_seq; + bool ret = false; if (__mptcp_check_fallback(msk) || !msk->first) - return; + return ret; /* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. @@ -410,8 +510,6 @@ static void mptcp_check_data_fin(struct sock *sk) */ if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { - struct mptcp_subflow_context *subflow; - WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); WRITE_ONCE(msk->rcv_data_fin, 0); @@ -428,7 +526,6 @@ static void mptcp_check_data_fin(struct sock *sk) break; case TCP_FIN_WAIT2: inet_sk_state_store(sk, TCP_CLOSE); - // @@ Close subflows now? 
break; default: /* Other states not expected */ @@ -436,23 +533,12 @@ static void mptcp_check_data_fin(struct sock *sk) break; } + ret = true; mptcp_set_timeout(sk, NULL); - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - lock_sock(ssk); - tcp_send_ack(ssk); - release_sock(ssk); - } - - sk->sk_state_change(sk); - - if (sk->sk_shutdown == SHUTDOWN_MASK || - sk->sk_state == TCP_CLOSE) - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - else - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + mptcp_send_ack(msk); + mptcp_close_wake_up(sk); } + return ret; } static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, @@ -464,12 +550,22 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, unsigned int moved = 0; bool more_data_avail; struct tcp_sock *tp; - u32 old_copied_seq; bool done = false; + int sk_rbuf; + + sk_rbuf = READ_ONCE(sk->sk_rcvbuf); + + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); + + if (unlikely(ssk_rbuf > sk_rbuf)) { + WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); + sk_rbuf = ssk_rbuf; + } + } pr_debug("msk=%p ssk=%p", msk, ssk); tp = tcp_sk(ssk); - old_copied_seq = tp->copied_seq; do { u32 map_remaining, offset; u32 seq = tp->copied_seq; @@ -528,20 +624,18 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, WRITE_ONCE(tp->copied_seq, seq); more_data_avail = mptcp_subflow_data_available(ssk); - if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) { + if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) { done = true; break; } } while (more_data_avail); + WRITE_ONCE(msk->ack_hint, ssk); *bytes += moved; - if (tp->copied_seq != old_copied_seq) - tcp_cleanup_rbuf(ssk, 1); - return done; } -static bool mptcp_ofo_queue(struct mptcp_sock *msk) +static bool __mptcp_ofo_queue(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; struct sk_buff *skb, *tail; @@ -587,43 +681,43 @@ static bool mptcp_ofo_queue(struct mptcp_sock 
*msk) /* In most cases we will be able to lock the mptcp socket. If its already * owned, we need to defer to the work queue to avoid ABBA deadlock. */ -static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) +static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; unsigned int moved = 0; - if (READ_ONCE(sk->sk_lock.owned)) - return false; - - if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock))) - return false; + if (inet_sk_state_load(sk) == TCP_CLOSE) + return; - /* must re-check after taking the lock */ - if (!READ_ONCE(sk->sk_lock.owned)) { - __mptcp_move_skbs_from_subflow(msk, ssk, &moved); - mptcp_ofo_queue(msk); + mptcp_data_lock(sk); - /* If the moves have caught up with the DATA_FIN sequence number - * it's time to ack the DATA_FIN and change socket state, but - * this is not a good place to change state. Let the workqueue - * do it. - */ - if (mptcp_pending_data_fin(sk, NULL) && - schedule_work(&msk->work)) - sock_hold(sk); - } + __mptcp_move_skbs_from_subflow(msk, ssk, &moved); + __mptcp_ofo_queue(msk); - spin_unlock_bh(&sk->sk_lock.slock); - - return moved > 0; + /* If the moves have caught up with the DATA_FIN sequence number + * it's time to ack the DATA_FIN and change socket state, but + * this is not a good place to change state. Let the workqueue + * do it. 
+ */ + if (mptcp_pending_data_fin(sk, NULL)) + mptcp_schedule_work(sk); + mptcp_data_unlock(sk); } void mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(sk); + int sk_rbuf, ssk_rbuf; bool wake; + /* The peer can send data while we are shutting down this + * subflow at msk destruction time, but we must avoid enqueuing + * more data to the msk receive queue + */ + if (unlikely(subflow->disposable)) + return; + /* move_skbs_to_msk below can legitly clear the data_avail flag, * but we will need later to properly woke the reader, cache its * value @@ -632,35 +726,32 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) if (wake) set_bit(MPTCP_DATA_READY, &msk->flags); - if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) && - move_skbs_to_msk(msk, ssk)) - goto wake; + ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); + sk_rbuf = READ_ONCE(sk->sk_rcvbuf); + if (unlikely(ssk_rbuf > sk_rbuf)) + sk_rbuf = ssk_rbuf; - /* don't schedule if mptcp sk is (still) over limit */ - if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) + /* over limit? can't append more skbs to msk */ + if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) goto wake; - /* mptcp socket is owned, release_cb should retry */ - if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, - &sk->sk_tsq_flags)) { - sock_hold(sk); + move_skbs_to_msk(msk, ssk); - /* need to try again, its possible release_cb() has already - * been called after the test_and_set_bit() above. 
- */ - move_skbs_to_msk(msk, ssk); - } wake: if (wake) sk->sk_data_ready(sk); } -static void __mptcp_flush_join_list(struct mptcp_sock *msk) +void __mptcp_flush_join_list(struct mptcp_sock *msk) { + struct mptcp_subflow_context *subflow; + if (likely(list_empty(&msk->join_list))) return; spin_lock_bh(&msk->join_list_lock); + list_for_each_entry(subflow, &msk->join_list, node) + mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow)); list_splice_tail_init(&msk->join_list, &msk->conn_list); spin_unlock_bh(&msk->join_list_lock); } @@ -675,6 +766,10 @@ static void mptcp_reset_timer(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); unsigned long tout; + /* prevent rescheduling on close */ + if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) + return; + /* should never be called with mptcp level timer cleared */ tout = READ_ONCE(mptcp_sk(sk)->timer_ival); if (WARN_ON_ONCE(!tout)) @@ -682,23 +777,23 @@ static void mptcp_reset_timer(struct sock *sk) sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); } -void mptcp_data_acked(struct sock *sk) +bool mptcp_schedule_work(struct sock *sk) { - mptcp_reset_timer(sk); - - if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) || - (inet_sk_state_load(sk) != TCP_ESTABLISHED)) && - schedule_work(&mptcp_sk(sk)->work)) + if (inet_sk_state_load(sk) != TCP_CLOSE && + schedule_work(&mptcp_sk(sk)->work)) { + /* each subflow already holds a reference to the sk, and the + * workqueue is invoked by a subflow, so sk can't go away here. 
+ */ sock_hold(sk); + return true; + } + return false; } void mptcp_subflow_eof(struct sock *sk) { - struct mptcp_sock *msk = mptcp_sk(sk); - - if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) && - schedule_work(&msk->work)) - sock_hold(sk); + if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags)) + mptcp_schedule_work(sk); } static void mptcp_check_for_eof(struct mptcp_sock *msk) @@ -709,8 +804,10 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) mptcp_for_each_subflow(msk, subflow) receivers += !subflow->rx_eof; + if (receivers) + return; - if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + if (!(sk->sk_shutdown & RCV_SHUTDOWN)) { /* hopefully temporary hack: propagate shutdown status * to msk, when all subflows agree on it */ @@ -720,16 +817,21 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) set_bit(MPTCP_DATA_READY, &msk->flags); sk->sk_data_ready(sk); } -} - -static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) -{ - const struct sock *sk = (const struct sock *)msk; - if (!msk->cached_ext) - msk->cached_ext = __skb_ext_alloc(sk->sk_allocation); - - return !!msk->cached_ext; + switch (sk->sk_state) { + case TCP_ESTABLISHED: + inet_sk_state_store(sk, TCP_CLOSE_WAIT); + break; + case TCP_FIN_WAIT1: + inet_sk_state_store(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + inet_sk_state_store(sk, TCP_CLOSE); + break; + default: + return; + } + mptcp_close_wake_up(sk); } static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) @@ -754,8 +856,11 @@ static bool mptcp_skb_can_collapse_to(u64 write_seq, if (!tcp_skb_can_collapse_to(skb)) return false; - /* can collapse only if MPTCP level sequence is in order */ - return mpext && mpext->data_seq + mpext->data_len == write_seq; + /* can collapse only if MPTCP level sequence is in order and this + * mapping has not been xmitted yet + */ + return mpext && mpext->data_seq + mpext->data_len == write_seq && + !mpext->frozen; } static bool 
mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, @@ -763,9 +868,128 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, const struct mptcp_data_frag *df) { return df && pfrag->page == df->page && + pfrag->size - pfrag->offset > 0 && df->data_seq + df->data_len == msk->write_seq; } +static int mptcp_wmem_with_overhead(struct sock *sk, int size) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + int ret, skbs; + + ret = size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT); + skbs = (msk->tx_pending_data + size) / msk->size_goal_cache; + if (skbs < msk->skb_tx_cache.qlen) + return ret; + + return ret + (skbs - msk->skb_tx_cache.qlen) * SKB_TRUESIZE(MAX_TCP_HEADER); +} + +static void __mptcp_wmem_reserve(struct sock *sk, int size) +{ + int amount = mptcp_wmem_with_overhead(sk, size); + struct mptcp_sock *msk = mptcp_sk(sk); + + WARN_ON_ONCE(msk->wmem_reserved); + if (WARN_ON_ONCE(amount < 0)) + amount = 0; + + if (amount <= sk->sk_forward_alloc) + goto reserve; + + /* under memory pressure try to reserve at most a single page + * otherwise try to reserve the full estimate and fallback + * to a single page before entering the error path + */ + if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) || + !sk_wmem_schedule(sk, amount)) { + if (amount <= PAGE_SIZE) + goto nomem; + + amount = PAGE_SIZE; + if (!sk_wmem_schedule(sk, amount)) + goto nomem; + } + +reserve: + msk->wmem_reserved = amount; + sk->sk_forward_alloc -= amount; + return; + +nomem: + /* we will wait for memory on next allocation */ + msk->wmem_reserved = -1; +} + +static void __mptcp_update_wmem(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (!msk->wmem_reserved) + return; + + if (msk->wmem_reserved < 0) + msk->wmem_reserved = 0; + if (msk->wmem_reserved > 0) { + sk->sk_forward_alloc += msk->wmem_reserved; + msk->wmem_reserved = 0; + } +} + +static bool mptcp_wmem_alloc(struct sock *sk, int size) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + /* 
check for pre-existing error condition */ + if (msk->wmem_reserved < 0) + return false; + + if (msk->wmem_reserved >= size) + goto account; + + mptcp_data_lock(sk); + if (!sk_wmem_schedule(sk, size)) { + mptcp_data_unlock(sk); + return false; + } + + sk->sk_forward_alloc -= size; + msk->wmem_reserved += size; + mptcp_data_unlock(sk); + +account: + msk->wmem_reserved -= size; + return true; +} + +static void mptcp_wmem_uncharge(struct sock *sk, int size) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (msk->wmem_reserved < 0) + msk->wmem_reserved = 0; + msk->wmem_reserved += size; +} + +static void mptcp_mem_reclaim_partial(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + /* if we are experiencing a transint allocation error, + * the forward allocation memory has been already + * released + */ + if (msk->wmem_reserved < 0) + return; + + mptcp_data_lock(sk); + sk->sk_forward_alloc += msk->wmem_reserved; + sk_mem_reclaim_partial(sk); + msk->wmem_reserved = sk->sk_forward_alloc; + sk->sk_forward_alloc = 0; + mptcp_data_unlock(sk); +} + static void dfrag_uncharge(struct sock *sk, int len) { sk_mem_uncharge(sk, len); @@ -781,21 +1005,7 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) put_page(dfrag->page); } -static bool mptcp_is_writeable(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - - if (!sk_stream_is_writeable((struct sock *)msk)) - return false; - - mptcp_for_each_subflow(msk, subflow) { - if (sk_stream_is_writeable(subflow->tcp_sock)) - return true; - } - return false; -} - -static void mptcp_clean_una(struct sock *sk) +static void __mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; @@ -806,13 +1016,15 @@ static void mptcp_clean_una(struct sock *sk) * plain TCP */ if (__mptcp_check_fallback(msk)) - atomic64_set(&msk->snd_una, msk->write_seq); - snd_una = atomic64_read(&msk->snd_una); + msk->snd_una = READ_ONCE(msk->snd_nxt); + snd_una = 
msk->snd_una; list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) break; + if (WARN_ON_ONCE(dfrag == msk->first_pending)) + break; dfrag_clear(sk, dfrag); cleaned = true; } @@ -821,12 +1033,13 @@ static void mptcp_clean_una(struct sock *sk) if (dfrag && after64(snd_una, dfrag->data_seq)) { u64 delta = snd_una - dfrag->data_seq; - if (WARN_ON_ONCE(delta > dfrag->data_len)) + if (WARN_ON_ONCE(delta > dfrag->already_sent)) goto out; dfrag->data_seq += delta; dfrag->offset += delta; dfrag->data_len -= delta; + dfrag->already_sent -= delta; dfrag_uncharge(sk, delta); cleaned = true; @@ -834,18 +1047,34 @@ static void mptcp_clean_una(struct sock *sk) out: if (cleaned) { - sk_mem_reclaim_partial(sk); + if (tcp_under_memory_pressure(sk)) { + __mptcp_update_wmem(sk); + sk_mem_reclaim_partial(sk); + } + } - /* Only wake up writers if a subflow is ready */ - if (mptcp_is_writeable(msk)) { - set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags); - smp_mb__after_atomic(); + if (snd_una == READ_ONCE(msk->snd_nxt)) { + if (msk->timer_ival) + mptcp_stop_timer(sk); + } else { + mptcp_reset_timer(sk); + } +} - /* set SEND_SPACE before sk_stream_write_space clears - * NOSPACE - */ - sk_stream_write_space(sk); - } +static void mptcp_enter_memory_pressure(struct sock *sk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + bool first = true; + + sk_stream_moderate_sndbuf(sk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (first) + tcp_enter_memory_pressure(ssk); + sk_stream_moderate_sndbuf(ssk); + first = false; } } @@ -858,8 +1087,7 @@ static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) pfrag, sk->sk_allocation))) return true; - sk->sk_prot->enter_memory_pressure(sk); - sk_stream_moderate_sndbuf(sk); + mptcp_enter_memory_pressure(sk); return false; } @@ -875,149 +1103,241 @@ mptcp_carve_data_frag(const struct 
mptcp_sock *msk, struct page_frag *pfrag, dfrag->data_seq = msk->write_seq; dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag); dfrag->offset = offset + sizeof(struct mptcp_data_frag); + dfrag->already_sent = 0; dfrag->page = pfrag->page; return dfrag; } -static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, - struct msghdr *msg, struct mptcp_data_frag *dfrag, - long *timeo, int *pmss_now, - int *ps_goal) +struct mptcp_sendmsg_info { + int mss_now; + int size_goal; + u16 limit; + u16 sent; + unsigned int flags; +}; + +static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq, + int avail_size) +{ + u64 window_end = mptcp_wnd_end(msk); + + if (__mptcp_check_fallback(msk)) + return avail_size; + + if (!before64(data_seq + avail_size, window_end)) { + u64 allowed_size = window_end - data_seq; + + return min_t(unsigned int, allowed_size, avail_size); + } + + return avail_size; +} + +static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp) +{ + struct skb_ext *mpext = __skb_ext_alloc(gfp); + + if (!mpext) + return false; + __skb_ext_set(skb, SKB_EXT_MPTCP, mpext); + return true; +} + +static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) +{ + struct sk_buff *skb; + + skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); + if (likely(skb)) { + if (likely(__mptcp_add_ext(skb, gfp))) { + skb_reserve(skb, MAX_TCP_HEADER); + skb->reserved_tailroom = skb->end - skb->tail; + return skb; + } + __kfree_skb(skb); + } else { + mptcp_enter_memory_pressure(sk); + } + return NULL; +} + +static bool mptcp_tx_cache_refill(struct sock *sk, int size, + struct sk_buff_head *skbs, int *total_ts) { - int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0; - bool dfrag_collapsed, can_collapse = false; struct mptcp_sock *msk = mptcp_sk(sk); - struct mptcp_ext *mpext = NULL; - bool retransmission = !!dfrag; - struct sk_buff *skb, *tail; - struct page_frag *pfrag; - struct page *page; - u64 *write_seq; - size_t psize; - - 
/* use the mptcp page cache so that we can easily move the data - * from one substream to another, but do per subflow memory accounting - * Note: pfrag is used only !retransmission, but the compiler if - * fooled into a warning if we don't init here - */ - pfrag = sk_page_frag(sk); - if (!retransmission) { - write_seq = &msk->write_seq; - page = pfrag->page; + struct sk_buff *skb; + int space_needed; + + if (unlikely(tcp_under_memory_pressure(sk))) { + mptcp_mem_reclaim_partial(sk); + + /* under pressure pre-allocate at most a single skb */ + if (msk->skb_tx_cache.qlen) + return true; + space_needed = msk->size_goal_cache; } else { - write_seq = &dfrag->data_seq; - page = dfrag->page; + space_needed = msk->tx_pending_data + size - + msk->skb_tx_cache.qlen * msk->size_goal_cache; } - /* compute copy limit */ - mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags); - *pmss_now = mss_now; - *ps_goal = size_goal; - avail_size = size_goal; - skb = tcp_write_queue_tail(ssk); + while (space_needed > 0) { + skb = __mptcp_do_alloc_tx_skb(sk, sk->sk_allocation); + if (unlikely(!skb)) { + /* under memory pressure, try to pass the caller a + * single skb to allow forward progress + */ + while (skbs->qlen > 1) { + skb = __skb_dequeue_tail(skbs); + __kfree_skb(skb); + } + return skbs->qlen > 0; + } + + *total_ts += skb->truesize; + __skb_queue_tail(skbs, skb); + space_needed -= msk->size_goal_cache; + } + return true; +} + +static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *skb; + + if (ssk->sk_tx_skb_cache) { + skb = ssk->sk_tx_skb_cache; + if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) && + !__mptcp_add_ext(skb, gfp))) + return false; + return true; + } + + skb = skb_peek(&msk->skb_tx_cache); if (skb) { - mpext = skb_ext_find(skb, SKB_EXT_MPTCP); + if (likely(sk_wmem_schedule(ssk, skb->truesize))) { + skb = __skb_dequeue(&msk->skb_tx_cache); + if (WARN_ON_ONCE(!skb)) + return false; + 
mptcp_wmem_uncharge(sk, skb->truesize); + ssk->sk_tx_skb_cache = skb; + return true; + } + + /* over memory limit, no point to try to allocate a new skb */ + return false; + } + + skb = __mptcp_do_alloc_tx_skb(sk, gfp); + if (!skb) + return false; + + if (likely(sk_wmem_schedule(ssk, skb->truesize))) { + ssk->sk_tx_skb_cache = skb; + return true; + } + kfree_skb(skb); + return false; +} + +static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk) +{ + return !ssk->sk_tx_skb_cache && + !skb_peek(&mptcp_sk(sk)->skb_tx_cache) && + tcp_under_memory_pressure(sk); +} + +static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk) +{ + if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) + mptcp_mem_reclaim_partial(sk); + return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation); +} + +static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, + struct mptcp_data_frag *dfrag, + struct mptcp_sendmsg_info *info) +{ + u64 data_seq = dfrag->data_seq + info->sent; + struct mptcp_sock *msk = mptcp_sk(sk); + bool zero_window_probe = false; + struct mptcp_ext *mpext = NULL; + struct sk_buff *skb, *tail; + bool can_collapse = false; + int size_bias = 0; + int avail_size; + size_t ret = 0; + + pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d", + msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); + + /* compute send limit */ + info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); + avail_size = info->size_goal; + msk->size_goal_cache = info->size_goal; + skb = tcp_write_queue_tail(ssk); + if (skb) { /* Limit the write to the size available in the * current skb, if any, so that we create at most a new skb. 
* Explicitly tells TCP internals to avoid collapsing on later * queue management operation, to avoid breaking the ext <-> * SSN association set here */ - can_collapse = (size_goal - skb->len > 0) && - mptcp_skb_can_collapse_to(*write_seq, skb, mpext); - if (!can_collapse) + mpext = skb_ext_find(skb, SKB_EXT_MPTCP); + can_collapse = (info->size_goal - skb->len > 0) && + mptcp_skb_can_collapse_to(data_seq, skb, mpext); + if (!can_collapse) { TCP_SKB_CB(skb)->eor = 1; - else - avail_size = size_goal - skb->len; - } - - if (!retransmission) { - /* reuse tail pfrag, if possible, or carve a new one from the - * page allocator - */ - dfrag = mptcp_rtx_tail(sk); - offset = pfrag->offset; - dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); - if (!dfrag_collapsed) { - dfrag = mptcp_carve_data_frag(msk, pfrag, offset); - offset = dfrag->offset; - frag_truesize = dfrag->overhead; - } - psize = min_t(size_t, pfrag->size - offset, avail_size); - - /* Copy to page */ - pr_debug("left=%zu", msg_data_left(msg)); - psize = copy_page_from_iter(pfrag->page, offset, - min_t(size_t, msg_data_left(msg), - psize), - &msg->msg_iter); - pr_debug("left=%zu", msg_data_left(msg)); - if (!psize) - return -EINVAL; - - if (!sk_wmem_schedule(sk, psize + dfrag->overhead)) { - iov_iter_revert(&msg->msg_iter, psize); - return -ENOMEM; + } else { + size_bias = skb->len; + avail_size = info->size_goal - skb->len; } - } else { - offset = dfrag->offset; - psize = min_t(size_t, dfrag->data_len, avail_size); } - /* tell the TCP stack to delay the push so that we can safely - * access the skb after the sendpages call - */ - ret = do_tcp_sendpages(ssk, page, offset, psize, - msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT); - if (ret <= 0) { - if (!retransmission) - iov_iter_revert(&msg->msg_iter, psize); - return ret; - } + /* Zero window and all data acked? Probe. 
*/ + avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size); + if (avail_size == 0) { + u64 snd_una = READ_ONCE(msk->snd_una); - frag_truesize += ret; - if (!retransmission) { - if (unlikely(ret < psize)) - iov_iter_revert(&msg->msg_iter, psize - ret); + if (skb || snd_una != msk->snd_nxt) + return 0; + zero_window_probe = true; + data_seq = snd_una - 1; + avail_size = 1; + } - /* send successful, keep track of sent data for mptcp-level - * retransmission - */ - dfrag->data_len += ret; - if (!dfrag_collapsed) { - get_page(dfrag->page); - list_add_tail(&dfrag->list, &msk->rtx_queue); - sk_wmem_queued_add(sk, frag_truesize); - } else { - sk_wmem_queued_add(sk, ret); - } + if (WARN_ON_ONCE(info->sent > info->limit || + info->limit > dfrag->data_len)) + return 0; - /* charge data on mptcp rtx queue to the master socket - * Note: we charge such data both to sk and ssk - */ - sk->sk_forward_alloc -= frag_truesize; + ret = info->limit - info->sent; + tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags, + dfrag->page, dfrag->offset + info->sent, &ret); + if (!tail) { + tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk)); + return -ENOMEM; } - /* if the tail skb extension is still the cached one, collapsing - * really happened. Note: we can't check for 'same skb' as the sk_buff - * hdr on tail can be transmitted, freed and re-allocated by the - * do_tcp_sendpages() call + /* if the tail skb is still the cached one, collapsing really happened. 
*/ - tail = tcp_write_queue_tail(ssk); - if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) { - WARN_ON_ONCE(!can_collapse); + if (skb == tail) { + TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH; mpext->data_len += ret; + WARN_ON_ONCE(!can_collapse); + WARN_ON_ONCE(zero_window_probe); goto out; } - skb = tcp_write_queue_tail(ssk); - mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext); - msk->cached_ext = NULL; + mpext = skb_ext_find(tail, SKB_EXT_MPTCP); + if (WARN_ON_ONCE(!mpext)) { + /* should never reach here, stream corrupted */ + return -EINVAL; + } memset(mpext, 0, sizeof(*mpext)); - mpext->data_seq = *write_seq; + mpext->data_seq = data_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; mpext->data_len = ret; mpext->use_map = 1; @@ -1027,44 +1347,17 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext->data_seq, mpext->subflow_seq, mpext->data_len, mpext->dsn64); + if (zero_window_probe) { + mptcp_subflow_ctx(ssk)->rel_write_seq += ret; + mpext->frozen = 1; + ret = 0; + tcp_push_pending_frames(ssk); + } out: - if (!retransmission) - pfrag->offset += frag_truesize; - WRITE_ONCE(*write_seq, *write_seq + ret); mptcp_subflow_ctx(ssk)->rel_write_seq += ret; - return ret; } -static void mptcp_nospace(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - - clear_bit(MPTCP_SEND_SPACE, &msk->flags); - smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ - - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - struct socket *sock = READ_ONCE(ssk->sk_socket); - - /* enables ssk->write_space() callbacks */ - if (sock) - set_bit(SOCK_NOSPACE, &sock->flags); - } -} - -static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) -{ - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - /* can't send if JOIN hasn't completed yet (i.e. 
is usable for mptcp) */ - if (subflow->request_join && !subflow->fully_established) - return false; - - /* only send if our side has not closed yet */ - return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); -} - #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ sizeof(struct tcphdr) - \ MAX_TCP_OPTION_SPACE - \ @@ -1076,8 +1369,7 @@ struct subflow_send_info { u64 ratio; }; -static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, - u32 *sndbuf) +static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct subflow_send_info send_info[2]; struct mptcp_subflow_context *subflow; @@ -1088,27 +1380,17 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, sock_owned_by_me((struct sock *)msk); - *sndbuf = 0; - if (!mptcp_ext_cache_refill(msk)) - return NULL; - if (__mptcp_check_fallback(msk)) { if (!msk->first) return NULL; - *sndbuf = msk->first->sk_sndbuf; return sk_stream_memory_free(msk->first) ? msk->first : NULL; } /* re-use last subflow, if the burst allow that */ if (msk->last_snd && msk->snd_burst > 0 && sk_stream_memory_free(msk->last_snd) && - mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) { - mptcp_for_each_subflow(msk, subflow) { - ssk = mptcp_subflow_tcp_sock(subflow); - *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf); - } + mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) return msk->last_snd; - } /* pick the subflow with the lower wmem/wspace ratio */ for (i = 0; i < 2; ++i) { @@ -1121,8 +1403,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, continue; nr_active += !subflow->backup; - *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf); - if (!sk_stream_memory_free(subflow->tcp_sock)) + if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd) continue; pace = READ_ONCE(ssk->sk_pacing_rate); @@ -1148,33 +1429,183 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, if (send_info[0].ssk) { msk->last_snd = send_info[0].ssk; msk->snd_burst = 
min_t(int, MPTCP_SEND_BURST_SIZE, - sk_stream_wspace(msk->last_snd)); + tcp_sk(msk->last_snd)->snd_wnd); return msk->last_snd; } + return NULL; } -static void ssk_check_wmem(struct mptcp_sock *msk) +static void mptcp_push_release(struct sock *sk, struct sock *ssk, + struct mptcp_sendmsg_info *info) +{ + mptcp_set_timeout(sk, ssk); + tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal); + release_sock(ssk); +} + +static void mptcp_push_pending(struct sock *sk, unsigned int flags) +{ + struct sock *prev_ssk = NULL, *ssk = NULL; + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_sendmsg_info info = { + .flags = flags, + }; + struct mptcp_data_frag *dfrag; + int len, copied = 0; + + while ((dfrag = mptcp_send_head(sk))) { + info.sent = dfrag->already_sent; + info.limit = dfrag->data_len; + len = dfrag->data_len - dfrag->already_sent; + while (len > 0) { + int ret = 0; + + prev_ssk = ssk; + __mptcp_flush_join_list(msk); + ssk = mptcp_subflow_get_send(msk); + + /* try to keep the subflow socket lock across + * consecutive xmit on the same socket + */ + if (ssk != prev_ssk && prev_ssk) + mptcp_push_release(sk, prev_ssk, &info); + if (!ssk) + goto out; + + if (ssk != prev_ssk || !prev_ssk) + lock_sock(ssk); + + /* keep it simple and always provide a new skb for the + * subflow, even if we will not use it when collapsing + * on the pending one + */ + if (!mptcp_alloc_tx_skb(sk, ssk)) { + mptcp_push_release(sk, ssk, &info); + goto out; + } + + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) { + mptcp_push_release(sk, ssk, &info); + goto out; + } + + info.sent += ret; + dfrag->already_sent += ret; + msk->snd_nxt += ret; + msk->snd_burst -= ret; + msk->tx_pending_data -= ret; + copied += ret; + len -= ret; + } + WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + } + + /* at this point we held the socket lock for the last subflow we used */ + if (ssk) + mptcp_push_release(sk, ssk, &info); + +out: + if (copied) { + /* start the timer, 
if it's not pending */ + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + __mptcp_check_send_data_fin(sk); + } +} + +static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) { - if (unlikely(!mptcp_is_writeable(msk))) - mptcp_nospace(msk); + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_sendmsg_info info; + struct mptcp_data_frag *dfrag; + struct sock *xmit_ssk; + int len, copied = 0; + bool first = true; + + info.flags = 0; + while ((dfrag = mptcp_send_head(sk))) { + info.sent = dfrag->already_sent; + info.limit = dfrag->data_len; + len = dfrag->data_len - dfrag->already_sent; + while (len > 0) { + int ret = 0; + + /* the caller already invoked the packet scheduler, + * check for a different subflow usage only after + * spooling the first chunk of data + */ + xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk)); + if (!xmit_ssk) + goto out; + if (xmit_ssk != ssk) { + mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk)); + goto out; + } + + if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) { + __mptcp_update_wmem(sk); + sk_mem_reclaim_partial(sk); + } + if (!__mptcp_alloc_tx_skb(sk, ssk, GFP_ATOMIC)) + goto out; + + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) + goto out; + + info.sent += ret; + dfrag->already_sent += ret; + msk->snd_nxt += ret; + msk->snd_burst -= ret; + msk->tx_pending_data -= ret; + copied += ret; + len -= ret; + first = false; + } + WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + } + +out: + /* __mptcp_alloc_tx_skb could have released some wmem and we are + * not going to flush it via release_sock() + */ + __mptcp_update_wmem(sk); + if (copied) { + mptcp_set_timeout(sk, ssk); + tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, + info.size_goal); + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + + if (msk->snd_data_fin_enable && + msk->snd_nxt + 1 == msk->write_seq) + mptcp_schedule_work(sk); + } +} + +static void mptcp_set_nospace(struct sock *sk) +{ + /* enable 
autotune */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + + /* will be cleared on avail space */ + set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags); } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { - int mss_now = 0, size_goal = 0, ret = 0; struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; size_t copied = 0; - struct sock *ssk; - u32 sndbuf; - bool tx_ok; + int ret = 0; long timeo; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -EOPNOTSUPP; - lock_sock(sk); + mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len))); timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); @@ -1185,130 +1616,96 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } pfrag = sk_page_frag(sk); -restart: - mptcp_clean_una(sk); - - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { - ret = -EPIPE; - goto out; - } - __mptcp_flush_join_list(msk); - ssk = mptcp_subflow_get_send(msk, &sndbuf); - while (!sk_stream_memory_free(sk) || - !ssk || - !mptcp_page_frag_refill(ssk, pfrag)) { - if (ssk) { - /* make sure retransmit timer is - * running before we wait for memory. - * - * The retransmit timer might be needed - * to make the peer send an up-to-date - * MPTCP Ack. 
- */ - mptcp_set_timeout(sk, ssk); - if (!mptcp_timer_pending(sk)) - mptcp_reset_timer(sk); - } + while (msg_data_left(msg)) { + int total_ts, frag_truesize = 0; + struct mptcp_data_frag *dfrag; + struct sk_buff_head skbs; + bool dfrag_collapsed; + size_t psize, offset; - mptcp_nospace(msk); - ret = sk_stream_wait_memory(sk, &timeo); - if (ret) - goto out; - - mptcp_clean_una(sk); - - ssk = mptcp_subflow_get_send(msk, &sndbuf); - if (list_empty(&msk->conn_list)) { - ret = -ENOTCONN; + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { + ret = -EPIPE; goto out; } - } - /* do auto tuning */ - if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && - sndbuf > READ_ONCE(sk->sk_sndbuf)) - WRITE_ONCE(sk->sk_sndbuf, sndbuf); + /* reuse tail pfrag, if possible, or carve a new one from the + * page allocator + */ + dfrag = mptcp_pending_tail(sk); + dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); + if (!dfrag_collapsed) { + if (!sk_stream_memory_free(sk)) + goto wait_for_memory; - pr_debug("conn_list->subflow=%p", ssk); + if (!mptcp_page_frag_refill(sk, pfrag)) + goto wait_for_memory; - lock_sock(ssk); - tx_ok = msg_data_left(msg); - while (tx_ok) { - ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now, - &size_goal); - if (ret < 0) { - if (ret == -EAGAIN && timeo > 0) { - mptcp_set_timeout(sk, ssk); - release_sock(ssk); - goto restart; - } - break; + dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset); + frag_truesize = dfrag->overhead; } - /* burst can be negative, we will try move to the next subflow - * at selection time, if possible. + /* we do not bound vs wspace, to allow a single packet. 
+ * memory accounting will prevent execessive memory usage + * anyway */ - msk->snd_burst -= ret; - copied += ret; - - tx_ok = msg_data_left(msg); - if (!tx_ok) - break; + offset = dfrag->offset + dfrag->data_len; + psize = pfrag->size - offset; + psize = min_t(size_t, psize, msg_data_left(msg)); + total_ts = psize + frag_truesize; + __skb_queue_head_init(&skbs); + if (!mptcp_tx_cache_refill(sk, psize, &skbs, &total_ts)) + goto wait_for_memory; + + if (!mptcp_wmem_alloc(sk, total_ts)) { + __skb_queue_purge(&skbs); + goto wait_for_memory; + } - if (!sk_stream_memory_free(ssk) || - !mptcp_page_frag_refill(ssk, pfrag) || - !mptcp_ext_cache_refill(msk)) { - tcp_push(ssk, msg->msg_flags, mss_now, - tcp_sk(ssk)->nonagle, size_goal); - mptcp_set_timeout(sk, ssk); - release_sock(ssk); - goto restart; + skb_queue_splice_tail(&skbs, &msk->skb_tx_cache); + if (copy_page_from_iter(dfrag->page, offset, psize, + &msg->msg_iter) != psize) { + mptcp_wmem_uncharge(sk, psize + frag_truesize); + ret = -EFAULT; + goto out; } - /* memory is charged to mptcp level socket as well, i.e. - * if msg is very large, mptcp socket may run out of buffer - * space. mptcp_clean_una() will release data that has - * been acked at mptcp level in the mean time, so there is - * a good chance we can continue sending data right away. - * - * Normally, when the tcp subflow can accept more data, then - * so can the MPTCP socket. However, we need to cope with - * peers that might lag behind in their MPTCP-level - * acknowledgements, i.e. data might have been acked at - * tcp level only. So, we must also check the MPTCP socket - * limits before we send more data. 
+ /* data successfully copied into the write queue */ + copied += psize; + dfrag->data_len += psize; + frag_truesize += psize; + pfrag->offset += frag_truesize; + WRITE_ONCE(msk->write_seq, msk->write_seq + psize); + msk->tx_pending_data += psize; + + /* charge data on mptcp pending queue to the msk socket + * Note: we charge such data both to sk and ssk */ - if (unlikely(!sk_stream_memory_free(sk))) { - tcp_push(ssk, msg->msg_flags, mss_now, - tcp_sk(ssk)->nonagle, size_goal); - mptcp_clean_una(sk); - if (!sk_stream_memory_free(sk)) { - /* can't send more for now, need to wait for - * MPTCP-level ACKs from peer. - * - * Wakeup will happen via mptcp_clean_una(). - */ - mptcp_set_timeout(sk, ssk); - release_sock(ssk); - goto restart; - } + sk_wmem_queued_add(sk, frag_truesize); + if (!dfrag_collapsed) { + get_page(dfrag->page); + list_add_tail(&dfrag->list, &msk->rtx_queue); + if (!msk->first_pending) + WRITE_ONCE(msk->first_pending, dfrag); } - } + pr_debug("msk=%p dfrag at seq=%lld len=%d sent=%d new=%d", msk, + dfrag->data_seq, dfrag->data_len, dfrag->already_sent, + !dfrag_collapsed); - mptcp_set_timeout(sk, ssk); - if (copied) { - tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, - size_goal); + continue; - /* start the timer, if it's not pending */ - if (!mptcp_timer_pending(sk)) - mptcp_reset_timer(sk); +wait_for_memory: + mptcp_set_nospace(sk); + mptcp_push_pending(sk, msg->msg_flags); + ret = sk_stream_wait_memory(sk, &timeo); + if (ret) + goto out; } - release_sock(ssk); + if (copied) + mptcp_push_pending(sk, msg->msg_flags); + out: - ssk_check_wmem(msk); release_sock(sk); return copied ? 
: ret; } @@ -1332,11 +1729,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len) { - struct sock *sk = (struct sock *)msk; struct sk_buff *skb; int copied = 0; - while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { + while ((skb = skb_peek(&msk->receive_queue)) != NULL) { u32 offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; u32 count = min_t(size_t, len - copied, data_len); @@ -1356,7 +1752,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, break; } - __skb_unlink(skb, &sk->sk_receive_queue); + /* we will bulk release the skb memory later */ + skb->destructor = NULL; + msk->rmem_released += skb->truesize; + __skb_unlink(skb, &msk->receive_queue); __kfree_skb(skb); if (copied >= len) @@ -1464,32 +1863,66 @@ new_measure: msk->rcvq_space.time = mstamp; } +static void __mptcp_update_rmem(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (!msk->rmem_released) + return; + + atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, msk->rmem_released); + msk->rmem_released = 0; +} + +static void __mptcp_splice_receive_queue(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); +} + static bool __mptcp_move_skbs(struct mptcp_sock *msk) { + struct sock *sk = (struct sock *)msk; unsigned int moved = 0; - bool done; - - /* avoid looping forever below on racing close */ - if (((struct sock *)msk)->sk_state == TCP_CLOSE) - return false; + bool ret, done; __mptcp_flush_join_list(msk); do { struct sock *ssk = mptcp_subflow_recv_lookup(msk); + bool slowpath; - if (!ssk) + /* we can have data pending in the subflows only if the msk + * receive buffer was full at subflow_data_ready() time, + * that is an unlikely slow path. 
+ */ + if (likely(!ssk)) break; - lock_sock(ssk); + slowpath = lock_sock_fast(ssk); + mptcp_data_lock(sk); + __mptcp_update_rmem(sk); done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); - release_sock(ssk); + mptcp_data_unlock(sk); + tcp_cleanup_rbuf(ssk, moved); + unlock_sock_fast(ssk, slowpath); } while (!done); - if (mptcp_ofo_queue(msk) || moved > 0) { - mptcp_check_data_fin((struct sock *)msk); - return true; + /* acquire the data lock only if some input data is pending */ + ret = moved > 0; + if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) || + !skb_queue_empty_lockless(&sk->sk_receive_queue)) { + mptcp_data_lock(sk); + __mptcp_update_rmem(sk); + ret |= __mptcp_ofo_queue(msk); + __mptcp_splice_receive_queue(sk); + mptcp_data_unlock(sk); + mptcp_cleanup_rbuf(msk); } - return false; + if (ret) + mptcp_check_data_fin((struct sock *)msk); + return !skb_queue_empty(&msk->receive_queue); } static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, @@ -1503,14 +1936,18 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT)) return -EOPNOTSUPP; - lock_sock(sk); + mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk)); + if (unlikely(sk->sk_state == TCP_LISTEN)) { + copied = -ENOTCONN; + goto out_err; + } + timeo = sock_rcvtimeo(sk, nonblock); len = min_t(size_t, len, INT_MAX); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); - __mptcp_flush_join_list(msk); - while (len > (size_t)copied) { + while (copied < len) { int bytes_read; bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied); @@ -1522,8 +1959,10 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += bytes_read; - if (skb_queue_empty(&sk->sk_receive_queue) && - __mptcp_move_skbs(msk)) + /* be sure to advertise window change */ + mptcp_cleanup_rbuf(msk); + + if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) continue; /* only the master socket status is relevant here. 
The exit @@ -1548,8 +1987,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); - if (sk->sk_shutdown & RCV_SHUTDOWN) + if (sk->sk_shutdown & RCV_SHUTDOWN) { + /* race breaker: the shutdown could be after the + * previous receive queue check + */ + if (__mptcp_move_skbs(msk)) + continue; break; + } if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; @@ -1571,7 +2016,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, mptcp_wait_data(sk, &timeo); } - if (skb_queue_empty(&sk->sk_receive_queue)) { + if (skb_queue_empty_lockless(&sk->sk_receive_queue) && + skb_queue_empty(&msk->receive_queue)) { /* entire backlog drained, clear DATA_READY. */ clear_bit(MPTCP_DATA_READY, &msk->flags); @@ -1587,7 +2033,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, out_err: pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", msk, test_bit(MPTCP_DATA_READY, &msk->flags), - skb_queue_empty(&sk->sk_receive_queue), copied); + skb_queue_empty_lockless(&sk->sk_receive_queue), copied); mptcp_rcv_space_adjust(msk, copied); release_sock(sk); @@ -1598,13 +2044,8 @@ static void mptcp_retransmit_handler(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->write_seq)) { - mptcp_stop_timer(sk); - } else { - set_bit(MPTCP_WORK_RTX, &msk->flags); - if (schedule_work(&msk->work)) - sock_hold(sk); - } + set_bit(MPTCP_WORK_RTX, &msk->flags); + mptcp_schedule_work(sk); } static void mptcp_retransmit_timer(struct timer_list *t) @@ -1626,6 +2067,14 @@ static void mptcp_retransmit_timer(struct timer_list *t) sock_put(sk); } +static void mptcp_timeout_timer(struct timer_list *t) +{ + struct sock *sk = from_timer(sk, t, sk_timer); + + mptcp_schedule_work(sk); + sock_put(sk); +} + /* Find an idle subflow. Return NULL if there is unacked data at tcp * level. 
* @@ -1639,7 +2088,7 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) sock_owned_by_me((const struct sock *)msk); if (__mptcp_check_fallback(msk)) - return msk->first; + return NULL; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -1648,8 +2097,11 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) continue; /* still data outstanding at TCP level? Don't retransmit. */ - if (!tcp_write_queue_empty(ssk)) + if (!tcp_write_queue_empty(ssk)) { + if (inet_csk(ssk)->icsk_ca_state >= TCP_CA_Loss) + continue; return NULL; + } if (subflow->backup) { if (!backup) @@ -1671,21 +2123,45 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) * so we need to use tcp_close() after detaching them from the mptcp * parent socket. */ -void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, - struct mptcp_subflow_context *subflow, - long timeout) +static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, + struct mptcp_subflow_context *subflow) { - struct socket *sock = READ_ONCE(ssk->sk_socket); - list_del(&subflow->node); - if (sock && sock != sk->sk_socket) { - /* outgoing subflow */ - sock_release(sock); + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + + /* if we are invoked by the msk cleanup code, the subflow is + * already orphaned + */ + if (ssk->sk_socket) + sock_orphan(ssk); + + subflow->disposable = 1; + + /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops + * the ssk has been already destroyed, we just need to release the + * reference owned by msk; + */ + if (!inet_csk(ssk)->icsk_ulp_ops) { + kfree_rcu(subflow, rcu); } else { - /* incoming subflow */ - tcp_close(ssk, timeout); + /* otherwise tcp will dispose of the ssk and subflow ctx */ + __tcp_close(ssk, 0); + + /* close acquired an extra ref */ + __sock_put(ssk); } + release_sock(ssk); + + sock_put(ssk); +} + +void mptcp_close_ssk(struct sock *sk, struct sock *ssk, + 
struct mptcp_subflow_context *subflow) +{ + if (sk->sk_state == TCP_ESTABLISHED) + mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); + __mptcp_close_ssk(sk, ssk, subflow); } static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) @@ -1693,117 +2169,149 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) return 0; } -static void pm_work(struct mptcp_sock *msk) +static void __mptcp_close_subflow(struct mptcp_sock *msk) { - struct mptcp_pm_data *pm = &msk->pm; + struct mptcp_subflow_context *subflow, *tmp; - spin_lock_bh(&msk->pm.lock); + might_sleep(); - pr_debug("msk=%p status=%x", msk, pm->status); - if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { - pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); - mptcp_pm_nl_add_addr_received(msk); - } - if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { - pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); - mptcp_pm_nl_rm_addr_received(msk); - } - if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); - mptcp_pm_nl_fully_established(msk); - } - if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); - mptcp_pm_nl_subflow_established(msk); + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (inet_sk_state_load(ssk) != TCP_CLOSE) + continue; + + /* 'subflow_data_ready' will re-sched once rx queue is empty */ + if (!skb_queue_empty_lockless(&ssk->sk_receive_queue)) + continue; + + mptcp_close_ssk((struct sock *)msk, ssk, subflow); } +} + +static bool mptcp_check_close_timeout(const struct sock *sk) +{ + s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp; + struct mptcp_subflow_context *subflow; + + if (delta >= TCP_TIMEWAIT_LEN) + return true; - spin_unlock_bh(&msk->pm.lock); + /* if all subflows are in closed status don't bother with additional + * timeout + */ + mptcp_for_each_subflow(mptcp_sk(sk), subflow) { + if 
(inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) != + TCP_CLOSE) + return false; + } + return true; } -static void __mptcp_close_subflow(struct mptcp_sock *msk) +static void mptcp_check_fastclose(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *tmp; + struct sock *sk = &msk->sk.icsk_inet.sk; - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + if (likely(!READ_ONCE(msk->rcv_fastclose))) + return; - if (inet_sk_state_load(ssk) != TCP_CLOSE) - continue; + mptcp_token_destroy(msk); - __mptcp_close_ssk((struct sock *)msk, ssk, subflow, 0); + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(tcp_sk); + if (tcp_sk->sk_state != TCP_CLOSE) { + tcp_send_active_reset(tcp_sk, GFP_ATOMIC); + tcp_set_state(tcp_sk, TCP_CLOSE); + } + release_sock(tcp_sk); } + + inet_sk_state_store(sk, TCP_CLOSE); + sk->sk_shutdown = SHUTDOWN_MASK; + smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ + set_bit(MPTCP_DATA_READY, &msk->flags); + set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); + + mptcp_close_wake_up(sk); } static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *ssk, *sk = &msk->sk.icsk_inet.sk; - int orig_len, orig_offset, mss_now = 0, size_goal = 0; + struct mptcp_sendmsg_info info = {}; struct mptcp_data_frag *dfrag; - u64 orig_write_seq; size_t copied = 0; - struct msghdr msg = { - .msg_flags = MSG_DONTWAIT, - }; - long timeo = 0; + int state, ret; lock_sock(sk); - mptcp_clean_una(sk); + state = sk->sk_state; + if (unlikely(state == TCP_CLOSE)) + goto unlock; + mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); - if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) - __mptcp_close_subflow(msk); - __mptcp_move_skbs(msk); + mptcp_check_fastclose(msk); if (msk->pm.status) - pm_work(msk); + 
mptcp_pm_nl_work(msk); if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); + __mptcp_check_send_data_fin(sk); mptcp_check_data_fin(sk); + /* There is no point in keeping around an orphaned sk timedout or + * closed, but we need the msk around to reply to incoming DATA_FIN, + * even if it is orphaned and in FIN_WAIT2 state + */ + if (sock_flag(sk, SOCK_DEAD) && + (mptcp_check_close_timeout(sk) || sk->sk_state == TCP_CLOSE)) { + inet_sk_state_store(sk, TCP_CLOSE); + __mptcp_destroy_sock(sk); + goto unlock; + } + + if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + __mptcp_close_subflow(msk); + if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) goto unlock; + __mptcp_clean_una(sk); dfrag = mptcp_rtx_head(sk); if (!dfrag) goto unlock; - if (!mptcp_ext_cache_refill(msk)) - goto reset_unlock; - ssk = mptcp_subflow_get_retrans(msk); if (!ssk) goto reset_unlock; lock_sock(ssk); - orig_len = dfrag->data_len; - orig_offset = dfrag->offset; - orig_write_seq = dfrag->data_seq; - while (dfrag->data_len > 0) { - int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, - &mss_now, &size_goal); - if (ret < 0) + /* limit retransmission to the bytes already sent on some subflows */ + info.sent = 0; + info.limit = dfrag->already_sent; + while (info.sent < dfrag->already_sent) { + if (!mptcp_alloc_tx_skb(sk, ssk)) + break; + + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) break; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); copied += ret; - dfrag->data_len -= ret; - dfrag->offset += ret; - - if (!mptcp_ext_cache_refill(msk)) - break; + info.sent += ret; } if (copied) - tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle, - size_goal); - - dfrag->data_seq = orig_write_seq; - dfrag->offset = orig_offset; - dfrag->data_len = orig_len; + tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, + info.size_goal); mptcp_set_timeout(sk, ssk); release_sock(ssk); @@ -1826,10 +2334,17 @@ static int 
__mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); - __set_bit(MPTCP_SEND_SPACE, &msk->flags); INIT_WORK(&msk->work, mptcp_worker); + __skb_queue_head_init(&msk->receive_queue); + __skb_queue_head_init(&msk->skb_tx_cache); msk->out_of_order_queue = RB_ROOT; + msk->first_pending = NULL; + msk->wmem_reserved = 0; + msk->rmem_released = 0; + msk->tx_pending_data = 0; + msk->size_goal_cache = TCP_BASE_MSS; + msk->ack_hint = NULL; msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; @@ -1837,7 +2352,7 @@ static int __mptcp_init_sock(struct sock *sk) /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); - + timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0); return 0; } @@ -1871,11 +2386,15 @@ static void __mptcp_clear_xmit(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; + struct sk_buff *skb; - sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); - + WRITE_ONCE(msk->first_pending, NULL); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); + while ((skb = __skb_dequeue(&msk->skb_tx_cache)) != NULL) { + sk->sk_forward_alloc += skb->truesize; + kfree_skb(skb); + } } static void mptcp_cancel_work(struct sock *sk) @@ -1883,7 +2402,7 @@ static void mptcp_cancel_work(struct sock *sk) struct mptcp_sock *msk = mptcp_sk(sk); if (cancel_work_sync(&msk->work)) - sock_put(sk); + __sock_put(sk); } void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) @@ -1941,42 +2460,75 @@ static int mptcp_close_state(struct sock *sk) return next & TCP_ACTION_FIN; } -static void mptcp_close(struct sock *sk, long timeout) +static void __mptcp_check_send_data_fin(struct sock *sk) { - struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - LIST_HEAD(conn_list); - 
lock_sock(sk); - sk->sk_shutdown = SHUTDOWN_MASK; + pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu", + msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk), + msk->snd_nxt, msk->write_seq); - if (sk->sk_state == TCP_LISTEN) { - inet_sk_state_store(sk, TCP_CLOSE); - goto cleanup; - } else if (sk->sk_state == TCP_CLOSE) { - goto cleanup; - } + /* we still need to enqueue subflows or not really shutting down, + * skip this + */ + if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq || + mptcp_send_head(sk)) + return; + WRITE_ONCE(msk->snd_nxt, msk->write_seq); + + /* fallback socket will not get data_fin/ack, can move to the next + * state now + */ if (__mptcp_check_fallback(msk)) { - goto update_state; - } else if (mptcp_close_state(sk)) { - pr_debug("Sending DATA_FIN sk=%p", sk); - WRITE_ONCE(msk->write_seq, msk->write_seq + 1); - WRITE_ONCE(msk->snd_data_fin_enable, 1); + if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_close_wake_up(sk); + } else if (sk->sk_state == TCP_FIN_WAIT1) { + inet_sk_state_store(sk, TCP_FIN_WAIT2); + } + } - mptcp_for_each_subflow(msk, subflow) { - struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + __mptcp_flush_join_list(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); - mptcp_subflow_shutdown(sk, tcp_sk, SHUTDOWN_MASK); - } + mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN); } +} - sk_stream_wait_close(sk, timeout); +static void __mptcp_wr_shutdown(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); -update_state: - inet_sk_state_store(sk, TCP_CLOSE); + pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d", + msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state, + !!mptcp_send_head(sk)); + + /* will be ignored by fallback sockets */ + WRITE_ONCE(msk->write_seq, msk->write_seq + 1); + WRITE_ONCE(msk->snd_data_fin_enable, 1); + + 
__mptcp_check_send_data_fin(sk); +} + +static void __mptcp_destroy_sock(struct sock *sk) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_sock *msk = mptcp_sk(sk); + LIST_HEAD(conn_list); + + pr_debug("msk=%p", msk); + + might_sleep(); + + /* dispose the ancillatory tcp socket, if any */ + if (msk->subflow) { + iput(SOCK_INODE(msk->subflow)); + msk->subflow = NULL; + } -cleanup: /* be sure to always acquire the join list lock, to sync vs * mptcp_finish_join(). */ @@ -1985,20 +2537,71 @@ cleanup: spin_unlock_bh(&msk->join_list_lock); list_splice_init(&msk->conn_list, &conn_list); - __mptcp_clear_xmit(sk); - - release_sock(sk); + sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); + sk_stop_timer(sk, &sk->sk_timer); + msk->pm.status = 0; list_for_each_entry_safe(subflow, tmp, &conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - __mptcp_close_ssk(sk, ssk, subflow, timeout); + __mptcp_close_ssk(sk, ssk, subflow); } - mptcp_cancel_work(sk); + sk->sk_prot->destroy(sk); - __skb_queue_purge(&sk->sk_receive_queue); + WARN_ON_ONCE(msk->wmem_reserved); + WARN_ON_ONCE(msk->rmem_released); + sk_stream_kill_queues(sk); + xfrm_sk_free_policy(sk); + sk_refcnt_debug_release(sk); + sock_put(sk); +} - sk_common_release(sk); +static void mptcp_close(struct sock *sk, long timeout) +{ + struct mptcp_subflow_context *subflow; + bool do_cancel_work = false; + + lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { + inet_sk_state_store(sk, TCP_CLOSE); + goto cleanup; + } + + if (mptcp_close_state(sk)) + __mptcp_wr_shutdown(sk); + + sk_stream_wait_close(sk, timeout); + +cleanup: + /* orphan all the subflows */ + inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; + list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow = lock_sock_fast(ssk); + + sock_orphan(ssk); + unlock_sock_fast(ssk, slow); + } + sock_orphan(sk); + + 
sock_hold(sk); + pr_debug("msk=%p state=%d", sk, sk->sk_state); + if (sk->sk_state == TCP_CLOSE) { + __mptcp_destroy_sock(sk); + do_cancel_work = true; + } else { + sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN); + } + release_sock(sk); + if (do_cancel_work) + mptcp_cancel_work(sk); + + if (mptcp_sk(sk)->token) + mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); + + sock_put(sk); } static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) @@ -2026,11 +2629,17 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) static int mptcp_disconnect(struct sock *sk, int flags) { - /* Should never be called. - * inet_stream_connect() calls ->disconnect, but that - * refers to the subflow socket, not the mptcp one. - */ - WARN_ON_ONCE(1); + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + + __mptcp_flush_join_list(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + tcp_disconnect(ssk, flags); + release_sock(ssk); + } return 0; } @@ -2069,18 +2678,24 @@ struct sock *mptcp_sk_clone(const struct sock *sk, WRITE_ONCE(msk->fully_established, false); msk->write_seq = subflow_req->idsn + 1; - atomic64_set(&msk->snd_una, msk->write_seq); + msk->snd_nxt = msk->write_seq; + msk->snd_una = msk->write_seq; + msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; + if (mp_opt->mp_capable) { msk->can_ack = true; msk->remote_key = mp_opt->sndr_key; mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); ack_seq++; WRITE_ONCE(msk->ack_seq, ack_seq); + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); } sock_reset_flag(nsk, SOCK_RCU_FREE); /* will be fully established after successful MPC subflow creation */ inet_sk_state_store(nsk, TCP_SYN_RECV); + + security_inet_csk_clone(nsk, req); bh_unlock_sock(nsk); /* keep a single reference */ @@ -2102,6 +2717,8 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) TCP_INIT_CWND 
* tp->advmss); if (msk->rcvq_space.space == 0) msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; + + WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); } static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, @@ -2126,7 +2743,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; - struct sock *ssk = newsk; subflow = mptcp_subflow_ctx(newsk); new_mptcp_sock = subflow->conn; @@ -2141,21 +2757,8 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, /* acquire the 2nd reference for the owning socket */ sock_hold(new_mptcp_sock); - - local_bh_disable(); - bh_lock_sock(new_mptcp_sock); - msk = mptcp_sk(new_mptcp_sock); - msk->first = newsk; - newsk = new_mptcp_sock; - mptcp_copy_inaddrs(newsk, ssk); - list_add(&subflow->node, &msk->conn_list); - - mptcp_rcv_space_init(msk, ssk); - bh_unlock_sock(new_mptcp_sock); - - __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); - local_bh_enable(); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); } else { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); @@ -2166,6 +2769,13 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, void mptcp_destroy_common(struct mptcp_sock *msk) { + struct sock *sk = (struct sock *)msk; + + __mptcp_clear_xmit(sk); + + /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ + skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); + skb_rbtree_purge(&msk->out_of_order_queue); mptcp_token_destroy(msk); mptcp_pm_free_anno_list(msk); @@ -2175,9 +2785,6 @@ static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (msk->cached_ext) - __skb_ext_put(msk->cached_ext); - mptcp_destroy_common(msk); sk_sockets_allocated_dec(sk); } @@ -2292,16 +2899,66 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, 
return -EOPNOTSUPP; } -#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \ - TCPF_WRITE_TIMER_DEFERRED) +void __mptcp_data_acked(struct sock *sk) +{ + if (!sock_owned_by_user(sk)) + __mptcp_clean_una(sk); + else + set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags); + + if (mptcp_pending_data_fin_ack(sk)) + mptcp_schedule_work(sk); +} + +void __mptcp_check_push(struct sock *sk, struct sock *ssk) +{ + if (!mptcp_send_head(sk)) + return; + + if (!sock_owned_by_user(sk)) { + struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk)); -/* this is very alike tcp_release_cb() but we must handle differently a - * different set of events - */ + if (xmit_ssk == ssk) + __mptcp_subflow_push_pending(sk, ssk); + else if (xmit_ssk) + mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk)); + } else { + set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); + } +} + +#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED) + +/* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) { unsigned long flags, nflags; + /* push_pending may touch wmem_reserved, do it before the later + * cleanup + */ + if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags)) + __mptcp_clean_una(sk); + if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) { + /* mptcp_push_pending() acquires the subflow socket lock + * + * 1) can't be invoked in atomic scope + * 2) must avoid ABBA deadlock with msk socket spinlock: the RX + * datapath acquires the msk socket spinlock while helding + * the subflow socket lock + */ + + spin_unlock_bh(&sk->sk_lock.slock); + mptcp_push_pending(sk, 0); + spin_lock_bh(&sk->sk_lock.slock); + } + if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags)) + __mptcp_error_report(sk); + + /* clear any wmem reservation and errors */ + __mptcp_update_wmem(sk); + __mptcp_update_rmem(sk); + do { flags = sk->sk_tsq_flags; if (!(flags & MPTCP_DEFERRED_ALL)) @@ -2311,21 +2968,26 @@ static void mptcp_release_cb(struct sock *sk) 
sock_release_ownership(sk); - if (flags & TCPF_DELACK_TIMER_DEFERRED) { - struct mptcp_sock *msk = mptcp_sk(sk); - struct sock *ssk; - - ssk = mptcp_subflow_recv_lookup(msk); - if (!ssk || !schedule_work(&msk->work)) - __sock_put(sk); - } - if (flags & TCPF_WRITE_TIMER_DEFERRED) { mptcp_retransmit_handler(sk); __sock_put(sk); } } +void mptcp_subflow_process_delegated(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = subflow->conn; + + mptcp_data_lock(sk); + if (!sock_owned_by_user(sk)) + __mptcp_subflow_push_pending(sk, ssk); + else + set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); + mptcp_data_unlock(sk); + mptcp_subflow_delegated_done(subflow); +} + static int mptcp_hash(struct sock *sk) { /* should never be called, @@ -2377,16 +3039,18 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->local_key, subflow->local_key); WRITE_ONCE(msk->write_seq, subflow->idsn + 1); + WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->ack_seq, ack_seq); + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); WRITE_ONCE(msk->can_ack, 1); - atomic64_set(&msk->snd_una, msk->write_seq); + WRITE_ONCE(msk->snd_una, msk->write_seq); - mptcp_pm_new_connection(msk, 0); + mptcp_pm_new_connection(msk, ssk, 0); mptcp_rcv_space_init(msk, ssk); } -static void mptcp_sock_graft(struct sock *sk, struct socket *parent) +void mptcp_sock_graft(struct sock *sk, struct socket *parent) { write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); @@ -2395,9 +3059,9 @@ static void mptcp_sock_graft(struct sock *sk, struct socket *parent) write_unlock_bh(&sk->sk_callback_lock); } -bool mptcp_finish_join(struct sock *sk) +bool mptcp_finish_join(struct sock *ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct sock *parent = (void 
*)msk; struct socket *parent_sock; @@ -2410,7 +3074,7 @@ bool mptcp_finish_join(struct sock *sk) return false; if (!msk->pm.server_side) - return true; + goto out; if (!mptcp_pm_allow_new_subflow(msk)) return false; @@ -2418,12 +3082,14 @@ bool mptcp_finish_join(struct sock *sk) /* active connections are already on conn_list, and we can't acquire * msk lock here. * use the join list lock as synchronization point and double-check - * msk status to avoid racing with mptcp_close() + * msk status to avoid racing with __mptcp_destroy_sock() */ spin_lock_bh(&msk->join_list_lock); ret = inet_sk_state_load(parent) == TCP_ESTABLISHED; - if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) + if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) { list_add_tail(&subflow->node, &msk->join_list); + sock_hold(ssk); + } spin_unlock_bh(&msk->join_list_lock); if (!ret) return false; @@ -2432,17 +3098,20 @@ bool mptcp_finish_join(struct sock *sk) * at close time */ parent_sock = READ_ONCE(parent->sk_socket); - if (parent_sock && !sk->sk_socket) - mptcp_sock_graft(sk, parent_sock); + if (parent_sock && !ssk->sk_socket) + mptcp_sock_graft(ssk, parent_sock); subflow->map_seq = READ_ONCE(msk->ack_seq); +out: + mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); return true; } -static bool mptcp_memory_free(const struct sock *sk, int wake) +static void mptcp_shutdown(struct sock *sk, int how) { - struct mptcp_sock *msk = mptcp_sk(sk); + pr_debug("sk=%p, how=%d", sk, how); - return wake ? 
test_bit(MPTCP_SEND_SPACE, &msk->flags) : true; + if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) + __mptcp_wr_shutdown(sk); } static struct proto mptcp_prot = { @@ -2454,7 +3123,7 @@ static struct proto mptcp_prot = { .accept = mptcp_accept, .setsockopt = mptcp_setsockopt, .getsockopt = mptcp_getsockopt, - .shutdown = tcp_shutdown, + .shutdown = mptcp_shutdown, .destroy = mptcp_destroy, .sendmsg = mptcp_sendmsg, .recvmsg = mptcp_recvmsg, @@ -2465,8 +3134,8 @@ static struct proto mptcp_prot = { .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, - .stream_memory_free = mptcp_memory_free, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .sysctl_mem = sysctl_tcp_mem, .obj_size = sizeof(struct mptcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, @@ -2609,6 +3278,23 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) { struct mptcp_sock *msk = mptcp_sk(newsock->sk); struct mptcp_subflow_context *subflow; + struct sock *newsk = newsock->sk; + + lock_sock(newsk); + + /* PM/worker can now acquire the first subflow socket + * lock without racing with listener queue cleanup, + * we can notify it, if needed. + */ + subflow = mptcp_subflow_ctx(msk->first); + list_add(&subflow->node, &msk->conn_list); + sock_hold(msk->first); + if (mptcp_is_fully_established(newsk)) + mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL); + + mptcp_copy_inaddrs(newsk, msk->first); + mptcp_rcv_space_init(msk, msk->first); + mptcp_propagate_sndbuf(newsk, msk->first); /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. 
@@ -2620,6 +3306,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } + release_sock(newsk); } if (inet_csk_listen_poll(ssock->sk)) @@ -2638,6 +3325,24 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk) 0; } +static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + + if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) + return EPOLLOUT | EPOLLWRNORM; + + if (sk_stream_is_writeable(sk)) + return EPOLLOUT | EPOLLWRNORM; + + mptcp_set_nospace(sk); + smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ + if (sk_stream_is_writeable(sk)) + return EPOLLOUT | EPOLLWRNORM; + + return 0; +} + static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { @@ -2656,79 +3361,49 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); - if (test_bit(MPTCP_SEND_SPACE, &msk->flags)) - mask |= EPOLLOUT | EPOLLWRNORM; + mask |= mptcp_check_writeable(msk); } + if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) + mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + /* This barrier is coupled with smp_wmb() in tcp_reset() */ + smp_rmb(); + if (sk->sk_err) + mask |= EPOLLERR; + return mask; } -static int mptcp_shutdown(struct socket *sock, int how) +static int mptcp_release(struct socket *sock) { - struct mptcp_sock *msk = mptcp_sk(sock->sk); struct mptcp_subflow_context *subflow; - int ret = 0; - - pr_debug("sk=%p, how=%d", msk, how); - - lock_sock(sock->sk); - - how++; - if ((how & ~SHUTDOWN_MASK) || !how) { - ret = -EINVAL; - goto out_unlock; - } - - if (sock->state == SS_CONNECTING) { - if ((1 << sock->sk->sk_state) & - (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) - sock->state = SS_DISCONNECTING; - else - sock->state = SS_CONNECTED; 
- } - - /* If we've already sent a FIN, or it's a closed state, skip this. */ - if (__mptcp_check_fallback(msk)) { - if (how == SHUT_WR || how == SHUT_RDWR) - inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); + struct sock *sk = sock->sk; + struct mptcp_sock *msk; - mptcp_for_each_subflow(msk, subflow) { - struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + if (!sk) + return 0; - mptcp_subflow_shutdown(sock->sk, tcp_sk, how); - } - } else if ((how & SEND_SHUTDOWN) && - ((1 << sock->sk->sk_state) & - (TCPF_ESTABLISHED | TCPF_SYN_SENT | - TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) && - mptcp_close_state(sock->sk)) { - __mptcp_flush_join_list(msk); + lock_sock(sk); - WRITE_ONCE(msk->write_seq, msk->write_seq + 1); - WRITE_ONCE(msk->snd_data_fin_enable, 1); + msk = mptcp_sk(sk); - mptcp_for_each_subflow(msk, subflow) { - struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - mptcp_subflow_shutdown(sock->sk, tcp_sk, how); - } + ip_mc_drop_socket(ssk); } - /* Wake up anyone sleeping in poll. 
*/ - sock->sk->sk_state_change(sock->sk); - -out_unlock: - release_sock(sock->sk); + release_sock(sk); - return ret; + return inet_release(sock); } static const struct proto_ops mptcp_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, - .release = inet_release, + .release = mptcp_release, .bind = mptcp_bind, .connect = mptcp_stream_connect, .socketpair = sock_no_socketpair, @@ -2738,7 +3413,7 @@ static const struct proto_ops mptcp_stream_ops = { .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = mptcp_listen, - .shutdown = mptcp_shutdown, + .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, @@ -2755,13 +3430,58 @@ static struct inet_protosw mptcp_protosw = { .flags = INET_PROTOSW_ICSK, }; +static int mptcp_napi_poll(struct napi_struct *napi, int budget) +{ + struct mptcp_delegated_action *delegated; + struct mptcp_subflow_context *subflow; + int work_done = 0; + + delegated = container_of(napi, struct mptcp_delegated_action, napi); + while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + bh_lock_sock_nested(ssk); + if (!sock_owned_by_user(ssk) && + mptcp_subflow_has_delegated_action(subflow)) + mptcp_subflow_process_delegated(ssk); + /* ... elsewhere tcp_release_cb_override already processed + * the action or will do at next release_sock(). + * In both case must dequeue the subflow here - on the same + * CPU that scheduled it. 
+ */ + bh_unlock_sock(ssk); + sock_put(ssk); + + if (++work_done == budget) + return budget; + } + + /* always provide a 0 'work_done' argument, so that napi_complete_done + * will not try accessing the NULL napi->dev ptr + */ + napi_complete_done(napi, 0); + return work_done; +} + void __init mptcp_proto_init(void) { + struct mptcp_delegated_action *delegated; + int cpu; + mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL)) panic("Failed to allocate MPTCP pcpu counter\n"); + init_dummy_netdev(&mptcp_napi_dev); + for_each_possible_cpu(cpu) { + delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); + INIT_LIST_HEAD(&delegated->head); + netif_tx_napi_add(&mptcp_napi_dev, &delegated->napi, mptcp_napi_poll, + NAPI_POLL_WEIGHT); + napi_enable(&delegated->napi); + } + mptcp_subflow_init(); mptcp_pm_init(); mptcp_token_init(); @@ -2775,10 +3495,35 @@ void __init mptcp_proto_init(void) } #if IS_ENABLED(CONFIG_MPTCP_IPV6) +static int mptcp6_release(struct socket *sock) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + struct sock *sk = sock->sk; + + if (!sk) + return 0; + + lock_sock(sk); + + msk = mptcp_sk(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + ip_mc_drop_socket(ssk); + ipv6_sock_mc_close(ssk); + ipv6_sock_ac_close(ssk); + } + + release_sock(sk); + return inet6_release(sock); +} + static const struct proto_ops mptcp_v6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, - .release = inet6_release, + .release = mptcp6_release, .bind = mptcp_bind, .connect = mptcp_stream_connect, .socketpair = sock_no_socketpair, @@ -2788,7 +3533,7 @@ static const struct proto_ops mptcp_v6_stream_ops = { .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, .listen = mptcp_listen, - .shutdown = mptcp_shutdown, + .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet6_sendmsg, diff 
--git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 13ab89dc1914..91827d949766 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -10,6 +10,7 @@ #include <linux/random.h> #include <net/tcp.h> #include <net/inet_connection_sock.h> +#include <uapi/linux/mptcp.h> #define MPTCP_SUPPORTED_VERSION 1 @@ -23,6 +24,8 @@ #define OPTION_MPTCP_ADD_ADDR BIT(6) #define OPTION_MPTCP_ADD_ADDR6 BIT(7) #define OPTION_MPTCP_RM_ADDR BIT(8) +#define OPTION_MPTCP_FASTCLOSE BIT(9) +#define OPTION_MPTCP_PRIO BIT(10) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -49,15 +52,18 @@ #define TCPOLEN_MPTCP_DSS_MAP64 14 #define TCPOLEN_MPTCP_DSS_CHECKSUM 2 #define TCPOLEN_MPTCP_ADD_ADDR 16 -#define TCPOLEN_MPTCP_ADD_ADDR_PORT 18 +#define TCPOLEN_MPTCP_ADD_ADDR_PORT 20 #define TCPOLEN_MPTCP_ADD_ADDR_BASE 8 -#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10 +#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 12 #define TCPOLEN_MPTCP_ADD_ADDR6 28 -#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30 +#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 32 #define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20 -#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22 -#define TCPOLEN_MPTCP_PORT_LEN 2 +#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 24 +#define TCPOLEN_MPTCP_PORT_LEN 4 #define TCPOLEN_MPTCP_RM_ADDR_BASE 4 +#define TCPOLEN_MPTCP_PRIO 3 +#define TCPOLEN_MPTCP_PRIO_ALIGN 4 +#define TCPOLEN_MPTCP_FASTCLOSE 12 /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) @@ -84,13 +90,26 @@ #define MPTCP_ADDR_IPVERSION_4 4 #define MPTCP_ADDR_IPVERSION_6 6 +/* MPTCP MP_PRIO flags */ +#define MPTCP_PRIO_BKUP BIT(0) + /* MPTCP socket flags */ #define MPTCP_DATA_READY 0 -#define MPTCP_SEND_SPACE 1 +#define MPTCP_NOSPACE 1 #define MPTCP_WORK_RTX 2 #define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 #define MPTCP_WORK_CLOSE_SUBFLOW 5 +#define MPTCP_PUSH_PENDING 6 +#define MPTCP_CLEAN_UNA 7 +#define MPTCP_ERROR_REPORT 8 + +static inline bool before64(__u64 seq1, __u64 seq2) +{ + return (__s64)(seq1 - seq2) < 0; +} + +#define after64(seq2, 
seq1) before64(seq1, seq2) struct mptcp_options_received { u64 sndr_key; @@ -101,16 +120,18 @@ struct mptcp_options_received { u16 data_len; u16 mp_capable : 1, mp_join : 1, + fastclose : 1, dss : 1, add_addr : 1, rm_addr : 1, + mp_prio : 1, family : 4, echo : 1, backup : 1; u32 token; u32 nonce; u64 thmac; - u8 hmac[20]; + u8 hmac[MPTCPOPT_HMAC_LEN]; u8 join_id; u8 use_map:1, dsn64:1, @@ -153,11 +174,21 @@ struct mptcp_addr_info { enum mptcp_pm_status { MPTCP_PM_ADD_ADDR_RECEIVED, + MPTCP_PM_ADD_ADDR_SEND_ACK, MPTCP_PM_RM_ADDR_RECEIVED, MPTCP_PM_ESTABLISHED, + MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */ MPTCP_PM_SUBFLOW_ESTABLISHED, }; +enum mptcp_addr_signal_status { + MPTCP_ADD_ADDR_SIGNAL, + MPTCP_ADD_ADDR_ECHO, + MPTCP_ADD_ADDR_IPV6, + MPTCP_ADD_ADDR_PORT, + MPTCP_RM_ADDR_SIGNAL, +}; + struct mptcp_pm_data { struct mptcp_addr_info local; struct mptcp_addr_info remote; @@ -165,21 +196,15 @@ struct mptcp_pm_data { spinlock_t lock; /*protects the whole PM data */ - bool add_addr_signal; - bool rm_addr_signal; + u8 addr_signal; bool server_side; bool work_pending; bool accept_addr; bool accept_subflow; - bool add_addr_echo; u8 add_addr_signaled; u8 add_addr_accepted; u8 local_addr_used; u8 subflows; - u8 add_addr_signal_max; - u8 add_addr_accept_max; - u8 local_addr_max; - u8 subflows_max; u8 status; u8 rm_id; }; @@ -187,9 +212,10 @@ struct mptcp_pm_data { struct mptcp_data_frag { struct list_head list; u64 data_seq; - int data_len; - int offset; - int overhead; + u16 data_len; + u16 offset; + u16 overhead; + u16 already_sent; struct page *page; }; @@ -200,27 +226,39 @@ struct mptcp_sock { u64 local_key; u64 remote_key; u64 write_seq; + u64 snd_nxt; u64 ack_seq; + u64 rcv_wnd_sent; u64 rcv_data_fin_seq; + int wmem_reserved; struct sock *last_snd; int snd_burst; - atomic64_t snd_una; + int old_wspace; + u64 snd_una; + u64 wnd_end; unsigned long timer_ival; u32 token; + int rmem_released; unsigned long flags; bool can_ack; bool 
fully_established; bool rcv_data_fin; bool snd_data_fin_enable; + bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ spinlock_t join_list_lock; + struct sock *ack_hint; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; + struct sk_buff_head receive_queue; + struct sk_buff_head skb_tx_cache; /* this is wmem accounted */ + int tx_pending_data; + int size_goal_cache; struct list_head conn_list; struct list_head rtx_queue; + struct mptcp_data_frag *first_pending; struct list_head join_list; - struct skb_ext *cached_ext; /* for the next sendmsg */ struct socket *subflow; /* outgoing connect/listener/!mp_capable */ struct sock *first; struct mptcp_pm_data pm; @@ -232,19 +270,65 @@ struct mptcp_sock { } rcvq_space; }; +#define mptcp_lock_sock(___sk, cb) do { \ + struct sock *__sk = (___sk); /* silence macro reuse warning */ \ + might_sleep(); \ + spin_lock_bh(&__sk->sk_lock.slock); \ + if (__sk->sk_lock.owned) \ + __lock_sock(__sk); \ + cb; \ + __sk->sk_lock.owned = 1; \ + spin_unlock(&__sk->sk_lock.slock); \ + mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_); \ + local_bh_enable(); \ +} while (0) + +#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) +#define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock) + #define mptcp_for_each_subflow(__msk, __subflow) \ list_for_each_entry(__subflow, &((__msk)->conn_list), node) +static inline void msk_owned_by_me(const struct mptcp_sock *msk) +{ + sock_owned_by_me((const struct sock *)msk); +} + static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) { return (struct mptcp_sock *)sk; } -static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk) +static inline int __mptcp_space(const struct sock *sk) +{ + return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_released); +} + +static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) +{ + const struct mptcp_sock *msk = mptcp_sk(sk); + + return 
READ_ONCE(msk->first_pending); +} + +static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_data_frag *cur; - if (list_empty(&msk->rtx_queue)) + cur = msk->first_pending; + return list_is_last(&cur->list, &msk->rtx_queue) ? NULL : + list_next_entry(cur, list); +} + +static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (!msk->first_pending) + return NULL; + + if (WARN_ON_ONCE(list_empty(&msk->rtx_queue))) return NULL; return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); @@ -254,6 +338,9 @@ static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); + if (msk->snd_una == READ_ONCE(msk->snd_nxt)) + return NULL; + return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); } @@ -287,6 +374,15 @@ enum mptcp_data_avail { MPTCP_SUBFLOW_OOO_DATA }; +struct mptcp_delegated_action { + struct napi_struct napi; + struct list_head head; +}; + +DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); + +#define MPTCP_DELEGATE_SEND 0 + /* MPTCP subflow context */ struct mptcp_subflow_context { struct list_head node;/* conn_list of subflows */ @@ -311,8 +407,10 @@ struct mptcp_subflow_context { map_valid : 1, mpc_map : 1, backup : 1, + send_mp_prio : 1, rx_eof : 1, - can_ack : 1; /* only after processing the remote a key */ + can_ack : 1, /* only after processing the remote a key */ + disposable : 1; /* ctx can be free at ulp release time */ enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; @@ -322,12 +420,16 @@ struct mptcp_subflow_context { u8 local_id; u8 remote_id; + long delegated_status; + struct list_head delegated_node; /* link into delegated_action, protected by local BH */ + struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ const struct 
inet_connection_sock_af_ops *icsk_af_ops; void (*tcp_data_ready)(struct sock *sk); void (*tcp_state_change)(struct sock *sk); void (*tcp_write_space)(struct sock *sk); + void (*tcp_error_report)(struct sock *sk); struct rcu_head rcu; }; @@ -361,21 +463,90 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); } +static inline void mptcp_add_pending_subflow(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) +{ + sock_hold(mptcp_subflow_tcp_sock(subflow)); + spin_lock_bh(&msk->join_list_lock); + list_add_tail(&subflow->node, &msk->join_list); + spin_unlock_bh(&msk->join_list_lock); +} + +void mptcp_subflow_process_delegated(struct sock *ssk); + +static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow) +{ + struct mptcp_delegated_action *delegated; + bool schedule; + + /* The implied barrier pairs with mptcp_subflow_delegated_done(), and + * ensures the below list check sees list updates done prior to status + * bit changes + */ + if (!test_and_set_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status)) { + /* still on delegated list from previous scheduling */ + if (!list_empty(&subflow->delegated_node)) + return; + + /* the caller held the subflow bh socket lock */ + lockdep_assert_in_softirq(); + + delegated = this_cpu_ptr(&mptcp_delegated_actions); + schedule = list_empty(&delegated->head); + list_add_tail(&subflow->delegated_node, &delegated->head); + sock_hold(mptcp_subflow_tcp_sock(subflow)); + if (schedule) + napi_schedule(&delegated->napi); + } +} + +static inline struct mptcp_subflow_context * +mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated) +{ + struct mptcp_subflow_context *ret; + + if (list_empty(&delegated->head)) + return NULL; + + ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node); + list_del_init(&ret->delegated_node); + return ret; +} + +static inline bool 
mptcp_subflow_has_delegated_action(const struct mptcp_subflow_context *subflow) +{ + return test_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status); +} + +static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *subflow) +{ + /* pairs with mptcp_subflow_delegate, ensures delegate_node is updated before + * touching the status bit + */ + smp_wmb(); + clear_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status); +} + int mptcp_is_enabled(struct net *net); +unsigned int mptcp_get_add_addr_timeout(struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); bool mptcp_subflow_data_available(struct sock *sk); void __init mptcp_subflow_init(void); void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); -void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, - struct mptcp_subflow_context *subflow, - long timeout); +void mptcp_close_ssk(struct sock *sk, struct sock *ssk, + struct mptcp_subflow_context *subflow); void mptcp_subflow_reset(struct sock *ssk); +void mptcp_sock_graft(struct sock *sk, struct socket *parent); +struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *remote); int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); +void mptcp_info2sockaddr(const struct mptcp_addr_info *info, + struct sockaddr_storage *addr, + unsigned short family); static inline void mptcp_subflow_tcp_fallback(struct sock *sk, struct mptcp_subflow_context *ctx) @@ -383,6 +554,7 @@ static inline void mptcp_subflow_tcp_fallback(struct sock *sk, sk->sk_data_ready = ctx->tcp_data_ready; sk->sk_state_change = ctx->tcp_state_change; sk->sk_write_space = ctx->tcp_write_space; + sk->sk_error_report = ctx->tcp_error_report; inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } @@ -407,9 +579,38 @@ static 
inline bool mptcp_is_fully_established(struct sock *sk) void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); -void mptcp_data_acked(struct sock *sk); +bool mptcp_schedule_work(struct sock *sk); +void __mptcp_check_push(struct sock *sk, struct sock *ssk); +void __mptcp_data_acked(struct sock *sk); +void __mptcp_error_report(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); +void __mptcp_flush_join_list(struct mptcp_sock *msk); +static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk) +{ + return READ_ONCE(msk->snd_data_fin_enable) && + READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt); +} + +static inline bool mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk) +{ + if ((sk->sk_userlocks & SOCK_SNDBUF_LOCK) || ssk->sk_sndbuf <= READ_ONCE(sk->sk_sndbuf)) + return false; + + WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf); + return true; +} + +static inline void mptcp_write_space(struct sock *sk) +{ + if (sk_stream_is_writeable(sk)) { + /* pairs with memory barrier in mptcp_poll */ + smp_mb(); + if (test_and_clear_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags)) + sk_stream_write_space(sk); + } +} + void mptcp_destroy_common(struct mptcp_sock *msk); void __init mptcp_token_init(void); @@ -435,8 +636,8 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); -void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side); -void mptcp_pm_fully_established(struct mptcp_sock *msk); +void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); +void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); void 
mptcp_pm_connection_closed(struct mptcp_sock *msk); void mptcp_pm_subflow_established(struct mptcp_sock *msk, @@ -444,63 +645,89 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk, void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id); void mptcp_pm_add_addr_received(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); +void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id); +void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); +int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + u8 bkup); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); +bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, struct mptcp_addr_info *addr); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, - bool echo); + bool echo, bool port); int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id); int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id); +void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, + const struct sock *ssk, gfp_t gfp); +void mptcp_event_addr_announced(const struct mptcp_sock *msk, const struct mptcp_addr_info *info); +void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id); + static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.add_addr_signal); + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); +} + +static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO); +} + +static inline bool mptcp_pm_should_add_signal_ipv6(struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_IPV6); +} + +static inline bool mptcp_pm_should_add_signal_port(struct mptcp_sock *msk) +{ + return 
READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_PORT); } static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.rm_addr_signal); + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL); } -static inline unsigned int mptcp_add_addr_len(int family, bool echo) +static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port) { - if (family == AF_INET) - return echo ? TCPOLEN_MPTCP_ADD_ADDR_BASE - : TCPOLEN_MPTCP_ADD_ADDR; - return echo ? TCPOLEN_MPTCP_ADD_ADDR6_BASE : TCPOLEN_MPTCP_ADD_ADDR6; + u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; + + if (family == AF_INET6) + len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; + if (!echo) + len += MPTCPOPT_THMAC_LEN; + if (port) + len += TCPOLEN_MPTCP_PORT_LEN; + + return len; } bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, - struct mptcp_addr_info *saddr, bool *echo); + struct mptcp_addr_info *saddr, bool *echo, bool *port); bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, u8 *rm_id); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_data_init(struct mptcp_sock *msk); -void mptcp_pm_nl_fully_established(struct mptcp_sock *msk); -void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk); -void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk); -void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk); +void mptcp_pm_nl_work(struct mptcp_sock *msk); void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); +unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk); +unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk); +unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); +unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk); static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff 
*skb) { return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); } -static inline bool before64(__u64 seq1, __u64 seq2) -{ - return (__s64)(seq1 - seq2) < 0; -} - -#define after64(seq2, seq1) before64(seq1, seq2) - void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ac4a1fe3550b..e1fbcab257e6 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -10,7 +10,7 @@ #include <linux/module.h> #include <linux/netdevice.h> #include <crypto/algapi.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <net/sock.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> @@ -18,12 +18,15 @@ #include <net/tcp.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include <net/ip6_route.h> +#include <net/transp_v6.h> #endif #include <net/mptcp.h> #include <uapi/linux/mptcp.h> #include "protocol.h" #include "mib.h" +static void mptcp_subflow_ops_undo_override(struct sock *ssk); + static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, enum linux_mptcp_mib_field field) { @@ -61,11 +64,23 @@ static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk) } /* validate received token and create truncated hmac and nonce for SYN-ACK */ -static struct mptcp_sock *subflow_token_join_request(struct request_sock *req, - const struct sk_buff *skb) +static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req) { - struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_sock *msk = subflow_req->msk; u8 hmac[SHA256_DIGEST_SIZE]; + + get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); + + subflow_generate_hmac(msk->local_key, msk->remote_key, + subflow_req->local_nonce, + subflow_req->remote_nonce, hmac); + + subflow_req->thmac = get_unaligned_be64(hmac); +} + +static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) +{ + struct mptcp_subflow_request_sock 
*subflow_req = mptcp_subflow_rsk(req); struct mptcp_sock *msk; int local_id; @@ -82,17 +97,10 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req, } subflow_req->local_id = local_id; - get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); - - subflow_generate_hmac(msk->local_key, msk->remote_key, - subflow_req->local_nonce, - subflow_req->remote_nonce, hmac); - - subflow_req->thmac = get_unaligned_be64(hmac); return msk; } -static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener) +static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); @@ -100,32 +108,35 @@ static int __subflow_init_req(struct request_sock *req, const struct sock *sk_li subflow_req->mp_join = 0; subflow_req->msk = NULL; mptcp_token_init_request(req); +} -#ifdef CONFIG_TCP_MD5SIG - /* no MPTCP if MD5SIG is enabled on this socket or we may run out of - * TCP option space. - */ - if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) - return -EINVAL; -#endif - - return 0; +static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk) +{ + return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport; } -static void subflow_init_req(struct request_sock *req, +/* Init mptcp request socket. + * + * Returns an error code if a JOIN has failed and a TCP reset + * should be sent. 
+ */ +static int subflow_check_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; - int ret; pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); - ret = __subflow_init_req(req, sk_listener); - if (ret) - return; +#ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. + */ + if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) + return -EINVAL; +#endif mptcp_get_options(skb, &mp_opt); @@ -133,7 +144,7 @@ static void subflow_init_req(struct request_sock *req, SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); if (mp_opt.mp_join) - return; + return 0; } else if (mp_opt.mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); } @@ -157,7 +168,7 @@ again: } else { subflow_req->mp_capable = 1; } - return; + return 0; } err = mptcp_token_new_request(req); @@ -173,9 +184,31 @@ again: subflow_req->remote_id = mp_opt.join_id; subflow_req->token = mp_opt.token; subflow_req->remote_nonce = mp_opt.nonce; - subflow_req->msk = subflow_token_join_request(req, skb); + subflow_req->msk = subflow_token_join_request(req); + + /* Can't fall back to TCP in this case. 
*/ + if (!subflow_req->msk) + return -EPERM; + + if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { + pr_debug("syn inet_sport=%d %d", + ntohs(inet_sk(sk_listener)->inet_sport), + ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport)); + if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) { + sock_put((struct sock *)subflow_req->msk); + mptcp_token_destroy_request(req); + tcp_request_sock_ops.destructor(req); + subflow_req->msk = NULL; + subflow_req->mp_join = 0; + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX); + return -EPERM; + } + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX); + } + + subflow_req_create_thmac(subflow_req); - if (unlikely(req->syncookie) && subflow_req->msk) { + if (unlikely(req->syncookie)) { if (mptcp_can_accept_new_subflow(subflow_req->msk)) subflow_init_req_cookie_join_save(subflow_req, skb); } @@ -183,6 +216,8 @@ again: pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, subflow_req->remote_nonce, subflow_req->msk); } + + return 0; } int mptcp_subflow_init_cookie_req(struct request_sock *req, @@ -194,10 +229,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, struct mptcp_options_received mp_opt; int err; - err = __subflow_init_req(req, sk_listener); - if (err) - return err; - + subflow_init_req(req, sk_listener); mptcp_get_options(skb, &mp_opt); if (mp_opt.mp_capable && mp_opt.mp_join) @@ -228,27 +260,55 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, } EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); -static void subflow_v4_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +static struct dst_entry *subflow_v4_route_req(const struct sock *sk, + struct sk_buff *skb, + struct flowi *fl, + struct request_sock *req) { + struct dst_entry *dst; + int err; + tcp_rsk(req)->is_mptcp = 1; + subflow_init_req(req, sk); - tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb); + dst = 
tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req); + if (!dst) + return NULL; - subflow_init_req(req, sk_listener, skb); + err = subflow_check_req(req, sk, skb); + if (err == 0) + return dst; + + dst_release(dst); + if (!req->syncookie) + tcp_request_sock_ops.send_reset(sk, skb); + return NULL; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) -static void subflow_v6_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +static struct dst_entry *subflow_v6_route_req(const struct sock *sk, + struct sk_buff *skb, + struct flowi *fl, + struct request_sock *req) { + struct dst_entry *dst; + int err; + tcp_rsk(req)->is_mptcp = 1; + subflow_init_req(req, sk); + + dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req); + if (!dst) + return NULL; - tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb); + err = subflow_check_req(req, sk, skb); + if (err == 0) + return dst; - subflow_init_req(req, sk_listener, skb); + dst_release(dst); + if (!req->syncookie) + tcp6_request_sock_ops.send_reset(sk, skb); + return NULL; } #endif @@ -276,12 +336,22 @@ void mptcp_subflow_reset(struct sock *ssk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; + /* must hold: tcp_done() could drop last reference on parent */ + sock_hold(sk); + tcp_set_state(ssk, TCP_CLOSE); tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && schedule_work(&mptcp_sk(sk)->work)) - sock_hold(sk); + return; /* worker will put sk for us */ + + sock_put(sk); +} + +static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk) +{ + return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport; } static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) @@ -301,6 +371,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) if (subflow->conn_finished) return; + 
mptcp_propagate_sndbuf(parent, sk); subflow->rel_write_seq = 1; subflow->conn_finished = 1; subflow->ssn_offset = TCP_SKB_CB(skb)->seq; @@ -349,6 +420,13 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); + + if (subflow_use_different_dport(mptcp_sk(parent), sk)) { + pr_debug("synack inet_dport=%d %d", + ntohs(inet_sk(sk)->inet_dport), + ntohs(inet_sk(parent)->inet_dport)); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX); + } } else if (mptcp_check_fallback(sk)) { fallback: mptcp_rcv_space_init(mptcp_sk(parent), sk); @@ -385,6 +463,7 @@ drop: static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops; static struct inet_connection_sock_af_ops subflow_v6_specific; static struct inet_connection_sock_af_ops subflow_v6m_specific; +static struct proto tcpv6_prot_override; static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) { @@ -466,6 +545,8 @@ static void subflow_ulp_fallback(struct sock *sk, icsk->icsk_ulp_ops = NULL; rcu_assign_pointer(icsk->icsk_ulp_data, NULL); tcp_sk(sk)->is_mptcp = 0; + + mptcp_subflow_ops_undo_override(sk); } static void subflow_drop_ctx(struct sock *ssk) @@ -543,9 +624,8 @@ create_msk: fallback = true; } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); - if (!mp_opt.mp_join || - !mptcp_can_accept_new_subflow(subflow_req->msk) || - !subflow_hmac_valid(req, &mp_opt)) { + if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) || + !mptcp_can_accept_new_subflow(subflow_req->msk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); fallback = true; } @@ -578,11 +658,16 @@ create_child: */ inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED); + /* record the newly created socket as the first msk + * subflow, but don't link it yet into conn_list + */ + WRITE_ONCE(mptcp_sk(new_msk)->first, child); + /* new mpc subflow takes ownership of the newly * created mptcp socket */ 
new_msk->sk_destruct = mptcp_sock_destruct; - mptcp_pm_new_connection(mptcp_sk(new_msk), 1); + mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1); mptcp_token_accept(subflow_req, mptcp_sk(new_msk)); ctx->conn = new_msk; new_msk = NULL; @@ -607,6 +692,17 @@ create_child: SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); tcp_rsk(req)->drop_req = true; + + if (subflow_use_different_sport(owner, sk)) { + pr_debug("ack inet_sport=%d %d", + ntohs(inet_sk(sk)->inet_sport), + ntohs(inet_sk((struct sock *)owner)->inet_sport)); + if (!mptcp_pm_sport_in_anno_list(owner, sk)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX); + goto out; + } + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX); + } } } @@ -635,6 +731,7 @@ dispose_child: } static struct inet_connection_sock_af_ops subflow_specific; +static struct proto tcp_prot_override; enum mapping_status { MAPPING_OK, @@ -846,8 +943,22 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, sk_eat_skb(ssk, skb); if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) subflow->map_valid = 0; - if (incr) - tcp_cleanup_rbuf(ssk, incr); +} + +/* sched mptcp worker to remove the subflow if no more data is pending */ +static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) +{ + struct sock *sk = (struct sock *)msk; + + if (likely(ssk->sk_state != TCP_CLOSE)) + return; + + if (skb_queue_empty(&ssk->sk_receive_queue) && + !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) { + sock_hold(sk); + if (!schedule_work(&msk->work)) + sock_put(sk); + } } static bool subflow_check_data_avail(struct sock *ssk) @@ -888,11 +999,11 @@ static bool subflow_check_data_avail(struct sock *ssk) } if (status != MAPPING_OK) - return false; + goto no_data; skb = skb_peek(&ssk->sk_receive_queue); if (WARN_ON_ONCE(!skb)) - return false; + goto no_data; /* if msk lacks the remote key, this subflow must provide an * MP_CAPABLE-based mapping @@ -926,6 +1037,9 @@ static bool 
subflow_check_data_avail(struct sock *ssk) } return true; +no_data: + subflow_sched_work_if_closed(msk, ssk); + return false; fatal: /* fatal protocol error, close the socket */ /* This barrier is coupled with smp_rmb() in tcp_poll() */ @@ -969,7 +1083,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); const struct sock *sk = subflow->conn; - *space = tcp_space(sk); + *space = __mptcp_space(sk); *full_space = tcp_full_space(sk); } @@ -982,6 +1096,12 @@ static void subflow_data_ready(struct sock *sk) msk = mptcp_sk(parent); if (state & TCPF_LISTEN) { + /* MPJ subflow are removed from accept queue before reaching here, + * avoid stray wakeups + */ + if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) + return; + set_bit(MPTCP_DATA_READY, &msk->flags); parent->sk_data_ready(parent); return; @@ -994,22 +1114,54 @@ static void subflow_data_ready(struct sock *sk) mptcp_data_ready(parent, sk); } -static void subflow_write_space(struct sock *sk) +static void subflow_write_space(struct sock *ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct sock *parent = subflow->conn; + struct sock *sk = mptcp_subflow_ctx(ssk)->conn; - if (!sk_stream_is_writeable(sk)) - return; + mptcp_propagate_sndbuf(sk, ssk); + mptcp_write_space(sk); +} + +void __mptcp_error_report(struct sock *sk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + int err = sock_error(ssk); + + if (!err) + continue; - if (sk_stream_is_writeable(parent)) { - set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags); - smp_mb__after_atomic(); - /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */ - sk_stream_write_space(parent); + /* only propagate errors on fallen-back sockets or + * on MPC connect + */ + if (sk->sk_state != TCP_SYN_SENT && 
!__mptcp_check_fallback(msk)) + continue; + + inet_sk_state_store(sk, inet_sk_state_load(ssk)); + sk->sk_err = -err; + + /* This barrier is coupled with smp_rmb() in mptcp_poll() */ + smp_wmb(); + sk->sk_error_report(sk); + break; } } +static void subflow_error_report(struct sock *ssk) +{ + struct sock *sk = mptcp_subflow_ctx(ssk)->conn; + + mptcp_data_lock(sk); + if (!sock_owned_by_user(sk)) + __mptcp_error_report(sk); + else + set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags); + mptcp_data_unlock(sk); +} + static struct inet_connection_sock_af_ops * subflow_default_af_ops(struct sock *sk) { @@ -1040,22 +1192,32 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped) } #endif -static void mptcp_info2sockaddr(const struct mptcp_addr_info *info, - struct sockaddr_storage *addr) +void mptcp_info2sockaddr(const struct mptcp_addr_info *info, + struct sockaddr_storage *addr, + unsigned short family) { memset(addr, 0, sizeof(*addr)); - addr->ss_family = info->family; + addr->ss_family = family; if (addr->ss_family == AF_INET) { struct sockaddr_in *in_addr = (struct sockaddr_in *)addr; - in_addr->sin_addr = info->addr; + if (info->family == AF_INET) + in_addr->sin_addr = info->addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (ipv6_addr_v4mapped(&info->addr6)) + in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3]; +#endif in_addr->sin_port = info->port; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->ss_family == AF_INET6) { struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr; - in6_addr->sin6_addr = info->addr6; + if (info->family == AF_INET) + ipv6_addr_set_v4mapped(info->addr.s_addr, + &in6_addr->sin6_addr); + else + in6_addr->sin6_addr = info->addr6; in6_addr->sin6_port = info->port; } #endif @@ -1099,11 +1261,11 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, subflow->remote_key = msk->remote_key; subflow->local_key = msk->local_key; subflow->token = msk->token; - mptcp_info2sockaddr(loc, &addr); + 
mptcp_info2sockaddr(loc, &addr, ssk->sk_family); addrlen = sizeof(struct sockaddr_in); #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (loc->family == AF_INET6) + if (addr.ss_family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif ssk->sk_bound_dev_if = loc->ifindex; @@ -1119,23 +1281,72 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, subflow->remote_id = remote_id; subflow->request_join = 1; subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - mptcp_info2sockaddr(remote, &addr); + mptcp_info2sockaddr(remote, &addr, ssk->sk_family); + mptcp_add_pending_subflow(msk, subflow); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) - goto failed; + goto failed_unlink; + + /* discard the subflow socket */ + mptcp_sock_graft(ssk, sk->sk_socket); + iput(SOCK_INODE(sf)); + return err; +failed_unlink: spin_lock_bh(&msk->join_list_lock); - list_add_tail(&subflow->node, &msk->join_list); + list_del(&subflow->node); spin_unlock_bh(&msk->join_list_lock); - return err; - failed: + subflow->disposable = 1; sock_release(sf); return err; } +static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) +{ +#ifdef CONFIG_SOCK_CGROUP_DATA + struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, + *child_skcd = &child->sk_cgrp_data; + + /* only the additional subflows created by kworkers have to be modified */ + if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != + cgroup_id(sock_cgroup_ptr(child_skcd))) { +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg = parent->sk_memcg; + + mem_cgroup_sk_free(child); + if (memcg && css_tryget(&memcg->css)) + child->sk_memcg = memcg; +#endif /* CONFIG_MEMCG */ + + cgroup_sk_free(child_skcd); + *child_skcd = *parent_skcd; + cgroup_sk_clone(child_skcd); + } +#endif /* CONFIG_SOCK_CGROUP_DATA */ +} + +static void mptcp_subflow_ops_override(struct sock *ssk) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (ssk->sk_prot == &tcpv6_prot) + 
ssk->sk_prot = &tcpv6_prot_override; + else +#endif + ssk->sk_prot = &tcp_prot_override; +} + +static void mptcp_subflow_ops_undo_override(struct sock *ssk) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (ssk->sk_prot == &tcpv6_prot_override) + ssk->sk_prot = &tcpv6_prot; + else +#endif + ssk->sk_prot = &tcp_prot; +} int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) { struct mptcp_subflow_context *subflow; @@ -1156,6 +1367,9 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) lock_sock(sf->sk); + /* the newly created socket has to be in the same cgroup as its parent */ + mptcp_attach_cgroup(sk, sf->sk); + /* kernel sockets do not by default acquire net ref, but TCP timer * needs it. */ @@ -1188,6 +1402,7 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) *new_sock = sf; sock_hold(sk); subflow->conn = sk; + mptcp_subflow_ops_override(sf->sk); return 0; } @@ -1204,6 +1419,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, rcu_assign_pointer(icsk->icsk_ulp_data, ctx); INIT_LIST_HEAD(&ctx->node); + INIT_LIST_HEAD(&ctx->delegated_node); pr_debug("subflow=%p", ctx); @@ -1236,6 +1452,7 @@ static void subflow_state_change(struct sock *sk) __subflow_state_change(sk); if (subflow_simultaneous_connect(sk)) { + mptcp_propagate_sndbuf(parent, sk); mptcp_do_fallback(sk); mptcp_rcv_space_init(mptcp_sk(parent), sk); pr_fallback(mptcp_sk(parent)); @@ -1253,8 +1470,9 @@ static void subflow_state_change(struct sock *sk) if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); + subflow_sched_work_if_closed(mptcp_sk(parent), sk); + if (__mptcp_check_fallback(mptcp_sk(parent)) && - !(parent->sk_shutdown & RCV_SHUTDOWN) && !subflow->rx_eof && subflow_is_done(sk)) { subflow->rx_eof = 1; mptcp_subflow_eof(parent); @@ -1290,24 +1508,37 @@ static int subflow_ulp_init(struct sock *sk) ctx->tcp_data_ready = sk->sk_data_ready; ctx->tcp_state_change = sk->sk_state_change; 
ctx->tcp_write_space = sk->sk_write_space; + ctx->tcp_error_report = sk->sk_error_report; sk->sk_data_ready = subflow_data_ready; sk->sk_write_space = subflow_write_space; sk->sk_state_change = subflow_state_change; + sk->sk_error_report = subflow_error_report; out: return err; } -static void subflow_ulp_release(struct sock *sk) +static void subflow_ulp_release(struct sock *ssk) { - struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk); + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); + bool release = true; + struct sock *sk; if (!ctx) return; - if (ctx->conn) - sock_put(ctx->conn); + sk = ctx->conn; + if (sk) { + /* if the msk has been orphaned, keep the ctx + * alive, will be freed by __mptcp_close_ssk(), + * when the subflow is still unaccepted + */ + release = ctx->disposable || list_empty(&ctx->node); + sock_put(sk); + } - kfree_rcu(ctx, rcu); + mptcp_subflow_ops_undo_override(ssk); + if (release) + kfree_rcu(ctx, rcu); } static void subflow_ulp_clone(const struct request_sock *req, @@ -1335,6 +1566,7 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; new_ctx->tcp_state_change = old_ctx->tcp_state_change; new_ctx->tcp_write_space = old_ctx->tcp_write_space; + new_ctx->tcp_error_report = old_ctx->tcp_error_report; new_ctx->rel_write_seq = 1; new_ctx->tcp_sock = newsk; @@ -1359,6 +1591,16 @@ static void subflow_ulp_clone(const struct request_sock *req, } } +static void tcp_release_cb_override(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + + if (mptcp_subflow_has_delegated_action(subflow)) + mptcp_subflow_process_delegated(ssk); + + tcp_release_cb(ssk); +} + static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { .name = "mptcp", .owner = THIS_MODULE, @@ -1392,16 +1634,19 @@ void __init mptcp_subflow_init(void) panic("MPTCP: failed to init subflow request sock ops\n"); subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; - 
subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req; + subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; subflow_specific = ipv4_specific; subflow_specific.conn_request = subflow_v4_conn_request; subflow_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_specific.sk_rx_dst_set = subflow_finish_connect; + tcp_prot_override = tcp_prot; + tcp_prot_override.release_cb = tcp_release_cb_override; + #if IS_ENABLED(CONFIG_MPTCP_IPV6) subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; - subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req; + subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; subflow_v6_specific = ipv6_specific; subflow_v6_specific.conn_request = subflow_v6_conn_request; @@ -1414,6 +1659,9 @@ void __init mptcp_subflow_init(void) subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; subflow_v6m_specific.net_frag_header_len = 0; + + tcpv6_prot_override = tcpv6_prot; + tcpv6_prot_override.release_cb = tcp_release_cb_override; #endif mptcp_diag_subflow_init(&subflow_ulp_ops); diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index f1be3e3f6425..a9cb355324d1 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1726,9 +1726,6 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, ndp->ptype.dev = dev; dev_add_pack(&ndp->ptype); - /* Set up generic netlink interface */ - ncsi_init_netlink(dev); - pdev = to_platform_device(dev->dev.parent); if (pdev) { np = pdev->dev.of_node; @@ -1892,8 +1889,6 @@ void ncsi_unregister_dev(struct ncsi_dev *nd) list_del_rcu(&ndp->node); spin_unlock_irqrestore(&ncsi_dev_lock, flags); - ncsi_unregister_netlink(nd->dev); - kfree(ndp); } EXPORT_SYMBOL_GPL(ncsi_unregister_dev); diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c index adddc7707aa4..bb5f1650f11c 100644 --- a/net/ncsi/ncsi-netlink.c +++ b/net/ncsi/ncsi-netlink.c @@ -766,24 +766,8 @@ static struct 
genl_family ncsi_genl_family __ro_after_init = { .n_small_ops = ARRAY_SIZE(ncsi_ops), }; -int ncsi_init_netlink(struct net_device *dev) +static int __init ncsi_init_netlink(void) { - int rc; - - rc = genl_register_family(&ncsi_genl_family); - if (rc) - netdev_err(dev, "ncsi: failed to register netlink family\n"); - - return rc; -} - -int ncsi_unregister_netlink(struct net_device *dev) -{ - int rc; - - rc = genl_unregister_family(&ncsi_genl_family); - if (rc) - netdev_err(dev, "ncsi: failed to unregister netlink family\n"); - - return rc; + return genl_register_family(&ncsi_genl_family); } +subsys_initcall(ncsi_init_netlink); diff --git a/net/ncsi/ncsi-netlink.h b/net/ncsi/ncsi-netlink.h index 7502723fba83..39a1a9d7bf77 100644 --- a/net/ncsi/ncsi-netlink.h +++ b/net/ncsi/ncsi-netlink.h @@ -22,7 +22,4 @@ int ncsi_send_netlink_err(struct net_device *dev, struct nlmsghdr *nlhdr, int err); -int ncsi_init_netlink(struct net_device *dev); -int ncsi_unregister_netlink(struct net_device *dev); - #endif /* __NCSI_NETLINK_H__ */ diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 5b1f4ec66dd9..888ccc2d4e34 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -1120,7 +1120,7 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, int payload, i, ret; /* Find the NCSI device */ - nd = ncsi_find_dev(dev); + nd = ncsi_find_dev(orig_dev); ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; if (!ndp) return -ENODEV; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 52370211e46b..1a92063c73a4 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only menu "Core Netfilter Configuration" - depends on NET && INET && NETFILTER + depends on INET && NETFILTER config NETFILTER_INGRESS bool "Netfilter ingress support" @@ -682,6 +682,16 @@ config NFT_FIB_NETDEV The lookup will be delegated to the IPv4 or IPv6 FIB depending on the protocol of the packet. 
+config NFT_REJECT_NETDEV + depends on NFT_REJECT_IPV4 + depends on NFT_REJECT_IPV6 + tristate "Netfilter nf_tables netdev REJECT support" + help + This option enables the REJECT support from the netdev table. + The return packet generation will be delegated to the IPv4 + or IPv6 ICMP or TCP RST implementation depending on the + protocol of the packet. + endif # NF_TABLES_NETDEV endif # NF_TABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 0e0ded87e27b..33da7bf1b68e 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -101,6 +101,7 @@ obj-$(CONFIG_NFT_QUEUE) += nft_queue.o obj-$(CONFIG_NFT_QUOTA) += nft_quota.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o +obj-$(CONFIG_NFT_REJECT_NETDEV) += nft_reject_netdev.o obj-$(CONFIG_NFT_TUNNEL) += nft_tunnel.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 7cff6e5e7445..89009c82a6b2 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -271,8 +271,7 @@ flag_nested(const struct nlattr *nla) static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = { [IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 }, - [IPSET_ATTR_IPADDR_IPV6] = { .type = NLA_BINARY, - .len = sizeof(struct in6_addr) }, + [IPSET_ATTR_IPADDR_IPV6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), }; int @@ -1110,6 +1109,8 @@ static int ip_set_create(struct net *net, struct sock *ctnl, ret = -IPSET_ERR_PROTOCOL; goto put_out; } + /* Set create flags depending on the type revision */ + set->flags |= set->type->create_flags[revision]; ret = set->type->create(net, set, tb, flags); if (ret != 0) @@ -1240,10 +1241,12 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, /* Modified by ip_set_destroy() only, which is serialized */ inst->is_destroyed = false; } else { + u32 flags = flag_exist(nlh); s = 
find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); if (!s) { - ret = -ENOENT; + if (!(flags & IPSET_FLAG_EXIST)) + ret = -ENOENT; goto out; } else if (s->ref || s->ref_netlink) { ret = -IPSET_ERR_BUSY; diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 521e970be402..6186358eac7c 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -37,18 +37,18 @@ */ /* Number of elements to store in an initial array block */ -#define AHASH_INIT_SIZE 4 +#define AHASH_INIT_SIZE 2 /* Max number of elements to store in an array block */ -#define AHASH_MAX_SIZE (3 * AHASH_INIT_SIZE) +#define AHASH_MAX_SIZE (6 * AHASH_INIT_SIZE) /* Max muber of elements in the array block when tuned */ #define AHASH_MAX_TUNED 64 +#define AHASH_MAX(h) ((h)->bucketsize) + /* Max number of elements can be tuned */ #ifdef IP_SET_HASH_WITH_MULTI -#define AHASH_MAX(h) ((h)->ahash_max) - static u8 -tune_ahash_max(u8 curr, u32 multi) +tune_bucketsize(u8 curr, u32 multi) { u32 n; @@ -61,12 +61,10 @@ tune_ahash_max(u8 curr, u32 multi) */ return n > curr && n <= AHASH_MAX_TUNED ? 
n : curr; } - -#define TUNE_AHASH_MAX(h, multi) \ - ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi)) +#define TUNE_BUCKETSIZE(h, multi) \ + ((h)->bucketsize = tune_bucketsize((h)->bucketsize, multi)) #else -#define AHASH_MAX(h) AHASH_MAX_SIZE -#define TUNE_AHASH_MAX(h, multi) +#define TUNE_BUCKETSIZE(h, multi) #endif /* A hash bucket */ @@ -143,20 +141,6 @@ htable_size(u8 hbits) return hsize * sizeof(struct hbucket *) + sizeof(struct htable); } -/* Compute htable_bits from the user input parameter hashsize */ -static u8 -htable_bits(u32 hashsize) -{ - /* Assume that hashsize == 2^htable_bits */ - u8 bits = fls(hashsize - 1); - - if (jhash_size(bits) != hashsize) - /* Round up to the first 2^n value */ - bits = fls(hashsize); - - return bits; -} - #ifdef IP_SET_HASH_WITH_NETS #if IPSET_NET_COUNT > 1 #define __CIDR(cidr, i) (cidr[i]) @@ -321,9 +305,7 @@ struct htype { #ifdef IP_SET_HASH_WITH_MARKMASK u32 markmask; /* markmask value for mark mask to store */ #endif -#ifdef IP_SET_HASH_WITH_MULTI - u8 ahash_max; /* max elements in an array block */ -#endif + u8 bucketsize; /* max elements in an array block */ #ifdef IP_SET_HASH_WITH_NETMASK u8 netmask; /* netmask value for subnets to store */ #endif @@ -644,7 +626,7 @@ mtype_resize(struct ip_set *set, bool retried) struct htype *h = set->data; struct htable *t, *orig; u8 htable_bits; - size_t dsize = set->dsize; + size_t hsize, dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; struct mtype_elem *tmp; @@ -668,14 +650,12 @@ mtype_resize(struct ip_set *set, bool retried) retry: ret = 0; htable_bits++; - if (!htable_bits) { - /* In case we have plenty of memory :-) */ - pr_warn("Cannot increase the hashsize of set %s further\n", - set->name); - ret = -IPSET_ERR_HASH_FULL; - goto out; - } - t = ip_set_alloc(htable_size(htable_bits)); + if (!htable_bits) + goto hbwarn; + hsize = htable_size(htable_bits); + if (!hsize) + goto hbwarn; + t = ip_set_alloc(hsize); if (!t) { ret = -ENOMEM; goto out; @@ -817,6 
+797,12 @@ cleanup: if (ret == -EAGAIN) goto retry; goto out; + +hbwarn: + /* In case we have plenty of memory :-) */ + pr_warn("Cannot increase the hashsize of set %s further\n", set->name); + ret = -IPSET_ERR_HASH_FULL; + goto out; } /* Get the current number of elements and ext_size in the set */ @@ -950,7 +936,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, goto set_full; /* Create a new slot */ if (n->pos >= n->size) { - TUNE_AHASH_MAX(h, multi); + TUNE_BUCKETSIZE(h, multi); if (n->size >= AHASH_MAX(h)) { /* Trigger rehashing */ mtype_data_next(&h->next, d); @@ -1305,6 +1291,11 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask)) goto nla_put_failure; #endif + if (set->flags & IPSET_CREATE_FLAG_BUCKETSIZE) { + if (nla_put_u8(skb, IPSET_ATTR_BUCKETSIZE, h->bucketsize) || + nla_put_net32(skb, IPSET_ATTR_INITVAL, htonl(h->initval))) + goto nla_put_failure; + } if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements))) @@ -1520,7 +1511,11 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, if (!h) return -ENOMEM; - hbits = htable_bits(hashsize); + /* Compute htable_bits from the user input parameter hashsize. + * Assume that hashsize == 2^htable_bits, + * otherwise round up to the first 2^n value. 
+ */ + hbits = fls(hashsize - 1); hsize = htable_size(hbits); if (hsize == 0) { kfree(h); @@ -1547,8 +1542,20 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #ifdef IP_SET_HASH_WITH_MARKMASK h->markmask = markmask; #endif - get_random_bytes(&h->initval, sizeof(h->initval)); - + if (tb[IPSET_ATTR_INITVAL]) + h->initval = ntohl(nla_get_be32(tb[IPSET_ATTR_INITVAL])); + else + get_random_bytes(&h->initval, sizeof(h->initval)); + h->bucketsize = AHASH_MAX_SIZE; + if (tb[IPSET_ATTR_BUCKETSIZE]) { + h->bucketsize = nla_get_u8(tb[IPSET_ATTR_BUCKETSIZE]); + if (h->bucketsize < AHASH_INIT_SIZE) + h->bucketsize = AHASH_INIT_SIZE; + else if (h->bucketsize > AHASH_MAX_SIZE) + h->bucketsize = AHASH_MAX_SIZE; + else if (h->bucketsize % 2) + h->bucketsize += 1; + } t->htable_bits = hbits; t->maxelem = h->maxelem / ahash_numof_locks(hbits); RCU_INIT_POINTER(h->table, t); diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 5d6d68eaf6a9..d1bef23fd4f5 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -23,7 +23,8 @@ /* 1 Counters support */ /* 2 Comments support */ /* 3 Forceadd support */ -#define IPSET_TYPE_REV_MAX 4 /* skbinfo support */ +/* 4 skbinfo support */ +#define IPSET_TYPE_REV_MAX 5 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -277,11 +278,13 @@ static struct ip_set_type hash_ip_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = 
{ .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c index eceb7bc4a93a..467c59a83c0a 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmac.c +++ b/net/netfilter/ipset/ip_set_hash_ipmac.c @@ -23,7 +23,7 @@ #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 0 +#define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Tomasz Chilinski <tomasz.chilinski@chilan.com>"); @@ -268,11 +268,13 @@ static struct ip_set_type hash_ipmac_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipmac_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index aba1df617d6e..18346d18aa16 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -21,7 +21,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support */ -#define IPSET_TYPE_REV_MAX 2 /* skbinfo support */ +/* 2 skbinfo support */ +#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>"); @@ -274,12 +275,14 @@ static struct ip_set_type hash_ipmark_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, 
.create = hash_ipmark_create, .create_policy = { [IPSET_ATTR_MARKMASK] = { .type = NLA_U32 }, [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 1ff228717e29..e1ca11196515 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -25,7 +25,8 @@ /* 2 Counters support added */ /* 3 Comments support added */ /* 4 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ +/* 5 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -341,11 +342,13 @@ static struct ip_set_type hash_ipport_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipport_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index fa88afd812fa..ab179e064597 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -25,7 +25,8 @@ /* 2 Counters support added */ /* 3 Comments support 
added */ /* 4 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ +/* 5 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -356,11 +357,13 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index eef6ecfcb409..8f075b44cf64 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -27,7 +27,8 @@ /* 4 Counters support added */ /* 5 Comments support added */ /* 6 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ +/* 7 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -513,11 +514,13 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { 
.type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 0b61593165ef..718814730acf 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -16,7 +16,7 @@ #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 0 +#define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -125,11 +125,13 @@ static struct ip_set_type hash_mac_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_mac_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 136cf0781d3a..c1a11f041ac6 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -24,7 +24,8 @@ /* 3 Counters support added */ /* 4 Comments support added */ /* 5 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */ +/* 6 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 7 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -354,11 +355,13 @@ static struct ip_set_type hash_net_type __read_mostly = { 
.family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_net_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index be5e95a0d876..ddd51c2e1cb3 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -26,7 +26,8 @@ /* 4 Comments support added */ /* 5 Forceadd support added */ /* 6 skbinfo support added */ -#define IPSET_TYPE_REV_MAX 7 /* interface wildcard support added */ +/* 7 interface wildcard support added */ +#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -225,7 +226,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } - nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); + nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); @@ -442,7 +443,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip, e.cidr); - nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); + nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); @@ -470,11 +471,13 @@ static struct ip_set_type hash_netiface_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, 
.revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netiface_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index da4ef910b12d..6532f0505e66 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -22,7 +22,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */ +/* 2 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); @@ -459,11 +460,13 @@ static struct ip_set_type hash_netnet_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 34448df80fb9..ec1564a1cb5a 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -26,7 +26,8 @@ /* 4 Counters 
support added */ /* 5 Comments support added */ /* 6 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ +/* 7 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -460,11 +461,13 @@ static struct ip_set_type hash_netport_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netport_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 934c1712cba8..0e91d1e82f1c 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -23,7 +23,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 0 Comments support added */ /* 1 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */ +/* 2 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); @@ -558,11 +559,13 @@ static struct ip_set_type hash_netportnet_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netportnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - 
[IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index eb0e329f9b8d..d61886874940 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -4,7 +4,7 @@ # menuconfig IP_VS tristate "IP virtual server support" - depends on NET && INET && NETFILTER + depends on INET && NETFILTER depends on (NF_CONNTRACK || NF_CONNTRACK=n) help IP Virtual Server support will let you build a high-performance @@ -271,6 +271,17 @@ config IP_VS_NQ If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_TWOS + tristate "weighted random twos choice least-connection scheduling" + help + The weighted random twos choice least-connection scheduling + algorithm picks two random real servers and directs network + connections to the server with the least active connections + normalized by the server weight. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. 
+ comment 'IPVS SH scheduler' config IP_VS_SH_TAB_BITS diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index bfce2677fda2..bb5d8125c82a 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o +obj-$(CONFIG_IP_VS_TWOS) += ip_vs_twos.o # IPVS application helpers obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index c0b8215ab3d4..0c132ff9b446 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -68,18 +68,6 @@ EXPORT_SYMBOL(ip_vs_get_debug_level); #endif EXPORT_SYMBOL(ip_vs_new_conn_out); -#ifdef CONFIG_IP_VS_PROTO_TCP -INDIRECT_CALLABLE_DECLARE(int - tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); -#endif - -#ifdef CONFIG_IP_VS_PROTO_UDP -INDIRECT_CALLABLE_DECLARE(int - udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); -#endif - #if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP) #define SNAT_CALL(f, ...) 
\ INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__) @@ -2137,7 +2125,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int if (cp->flags & IP_VS_CONN_F_ONE_PACKET) pkts = sysctl_sync_threshold(ipvs); else - pkts = atomic_add_return(1, &cp->in_pkts); + pkts = atomic_inc_return(&cp->in_pkts); if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(ipvs, cp, pkts); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index e279ded4e306..d45dbcba8b49 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -4167,12 +4167,18 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) spin_lock_init(&ipvs->tot_stats.lock); - proc_create_net("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_seq_ops, - sizeof(struct ip_vs_iter)); - proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net, - ip_vs_stats_show, NULL); - proc_create_net_single("ip_vs_stats_percpu", 0, ipvs->net->proc_net, - ip_vs_stats_percpu_show, NULL); +#ifdef CONFIG_PROC_FS + if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net, + &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter))) + goto err_vs; + if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net, + ip_vs_stats_show, NULL)) + goto err_stats; + if (!proc_create_net_single("ip_vs_stats_percpu", 0, + ipvs->net->proc_net, + ip_vs_stats_percpu_show, NULL)) + goto err_percpu; +#endif if (ip_vs_control_net_init_sysctl(ipvs)) goto err; @@ -4180,6 +4186,17 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) return 0; err: +#ifdef CONFIG_PROC_FS + remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); + +err_percpu: + remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); + +err_stats: + remove_proc_entry("ip_vs", ipvs->net->proc_net); + +err_vs: +#endif free_percpu(ipvs->tot_stats.cpustats); return -ENOMEM; } @@ -4188,9 +4205,11 @@ void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { 
ip_vs_trash_cleanup(ipvs); ip_vs_control_net_cleanup_sysctl(ipvs); +#ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); remove_proc_entry("ip_vs", ipvs->net->proc_net); +#endif free_percpu(ipvs->tot_stats.cpustats); } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 16b48064f715..9d43277b8b4f 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -615,7 +615,7 @@ static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, cp = cp->control; if (cp) { if (cp->flags & IP_VS_CONN_F_TEMPLATE) - pkts = atomic_add_return(1, &cp->in_pkts); + pkts = atomic_inc_return(&cp->in_pkts); else pkts = sysctl_sync_threshold(ipvs); ip_vs_sync_conn(ipvs, cp, pkts); @@ -776,7 +776,7 @@ control: if (!cp) return; if (cp->flags & IP_VS_CONN_F_TEMPLATE) - pkts = atomic_add_return(1, &cp->in_pkts); + pkts = atomic_inc_return(&cp->in_pkts); else pkts = sysctl_sync_threshold(ipvs); goto sloop; diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c new file mode 100644 index 000000000000..acb55d8393ef --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_twos.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* IPVS: Power of Twos Choice Scheduling module + * + * Authors: Darby Payne <darby.payne@applovin.com> + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/random.h> + +#include <net/ip_vs.h> + +/* Power of Twos Choice scheduling, algorithm originally described by + * Michael Mitzenmacher. 
+ * + * Randomly picks two destinations and picks the one with the least + * amount of connections + * + * The algorithm calculates a few variables + * - total_weight = sum of all weights + * - rweight1 = random number between [0,total_weight] + * - rweight2 = random number between [0,total_weight] + * + * For each destination + * decrement rweight1 and rweight2 by the destination weight + * pick choice1 when rweight1 is <= 0 + * pick choice2 when rweight2 is <= 0 + * + * Return choice2 if choice2 has less connections than choice 1 normalized + * by weight + * + * References + * ---------- + * + * [Mitzenmacher 2016] + * The Power of Two Random Choices: A Survey of Techniques and Results + * Michael Mitzenmacher, Andrea W. Richa y, Ramesh Sitaraman + * http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/twosurvey.pdf + * + */ +static struct ip_vs_dest *ip_vs_twos_schedule(struct ip_vs_service *svc, + const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *choice1 = NULL, *choice2 = NULL; + int rweight1, rweight2, weight1 = -1, weight2 = -1, overhead1 = 0; + int overhead2, total_weight = 0, weight; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* Generate a random weight between [0,sum of all weights) */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD)) { + weight = atomic_read(&dest->weight); + if (weight > 0) { + total_weight += weight; + choice1 = dest; + } + } + } + + if (!choice1) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + /* Add 1 to total_weight so that the random weights are inclusive + * from 0 to total_weight + */ + total_weight += 1; + rweight1 = prandom_u32() % total_weight; + rweight2 = prandom_u32() % total_weight; + + /* Pick two weighted servers */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + weight = atomic_read(&dest->weight); + if 
(weight <= 0) + continue; + + rweight1 -= weight; + rweight2 -= weight; + + if (rweight1 <= 0 && weight1 == -1) { + choice1 = dest; + weight1 = weight; + overhead1 = ip_vs_dest_conn_overhead(dest); + } + + if (rweight2 <= 0 && weight2 == -1) { + choice2 = dest; + weight2 = weight; + overhead2 = ip_vs_dest_conn_overhead(dest); + } + + if (weight1 != -1 && weight2 != -1) + goto nextstage; + } + +nextstage: + if (choice2 && (weight2 * overhead1) > (weight1 * overhead2)) + choice1 = choice2; + + IP_VS_DBG_BUF(6, "twos: server %s:%u conns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(choice1->af, &choice1->addr), + ntohs(choice1->port), atomic_read(&choice1->activeconns), + refcount_read(&choice1->refcnt), + atomic_read(&choice1->weight)); + + return choice1; +} + +static struct ip_vs_scheduler ip_vs_twos_scheduler = { + .name = "twos", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_twos_scheduler.n_list), + .schedule = ip_vs_twos_schedule, +}; + +static int __init ip_vs_twos_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_twos_scheduler); +} + +static void __exit ip_vs_twos_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_twos_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_twos_init); +module_exit(ip_vs_twos_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 234b7cab37c3..ff0168736f6e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1229,7 +1229,8 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, * Let nf_ct_resolve_clash() deal with this later. 
*/ if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && + nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) continue; NF_CT_STAT_INC_ATOMIC(net, found); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 3d0fd33be018..1469365bac7e 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -167,10 +167,14 @@ nla_put_failure: return -1; } -static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct, + bool skip_zero) { long timeout = nf_ct_expires(ct) / HZ; + if (skip_zero && timeout == 0) + return 0; + if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout))) goto nla_put_failure; return 0; @@ -179,7 +183,8 @@ nla_put_failure: return -1; } -static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) +static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct, + bool destroy) { const struct nf_conntrack_l4proto *l4proto; struct nlattr *nest_proto; @@ -193,7 +198,7 @@ static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) if (!nest_proto) goto nla_put_failure; - ret = l4proto->to_nlattr(skb, nest_proto, ct); + ret = l4proto->to_nlattr(skb, nest_proto, ct, destroy); nla_nest_end(skb, nest_proto); @@ -537,8 +542,8 @@ static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) return -1; if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) && - (ctnetlink_dump_timeout(skb, ct) < 0 || - ctnetlink_dump_protoinfo(skb, ct) < 0)) + (ctnetlink_dump_timeout(skb, ct, false) < 0 || + ctnetlink_dump_protoinfo(skb, ct, false) < 0)) return -1; return 0; @@ -780,15 +785,19 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) goto nla_put_failure; if (events & (1 << IPCT_DESTROY)) { + if (ctnetlink_dump_timeout(skb, 
ct, true) < 0) + goto nla_put_failure; + if (ctnetlink_dump_acct(skb, ct, type) < 0 || - ctnetlink_dump_timestamp(skb, ct) < 0) + ctnetlink_dump_timestamp(skb, ct) < 0 || + ctnetlink_dump_protoinfo(skb, ct, true) < 0) goto nla_put_failure; } else { - if (ctnetlink_dump_timeout(skb, ct) < 0) + if (ctnetlink_dump_timeout(skb, ct, false) < 0) goto nla_put_failure; - if (events & (1 << IPCT_PROTOINFO) - && ctnetlink_dump_protoinfo(skb, ct) < 0) + if (events & (1 << IPCT_PROTOINFO) && + ctnetlink_dump_protoinfo(skb, ct, false) < 0) goto nla_put_failure; if ((events & (1 << IPCT_HELPER) || nfct_help(ct)) @@ -2677,12 +2686,6 @@ ctnetlink_glue_build_size(const struct nf_conn *ct) ; } -static struct nf_conn *ctnetlink_glue_get_ct(const struct sk_buff *skb, - enum ip_conntrack_info *ctinfo) -{ - return nf_ct_get(skb, ctinfo); -} - static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) { const struct nf_conntrack_zone *zone; @@ -2720,10 +2723,10 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) if (ctnetlink_dump_status(skb, ct) < 0) goto nla_put_failure; - if (ctnetlink_dump_timeout(skb, ct) < 0) + if (ctnetlink_dump_timeout(skb, ct, false) < 0) goto nla_put_failure; - if (ctnetlink_dump_protoinfo(skb, ct) < 0) + if (ctnetlink_dump_protoinfo(skb, ct, false) < 0) goto nla_put_failure; if (ctnetlink_dump_helpinfo(skb, ct) < 0) @@ -2916,7 +2919,6 @@ static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct, } static struct nfnl_ct_hook ctnetlink_glue_hook = { - .get_ct = ctnetlink_glue_get_ct, .build_size = ctnetlink_glue_build_size, .build = ctnetlink_glue_build, .parse = ctnetlink_glue_parse, diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index b3f4a334f9d7..db7479db8512 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -589,7 +589,7 @@ static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct) #if 
IS_ENABLED(CONFIG_NF_CT_NETLINK) static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - struct nf_conn *ct) + struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; @@ -597,15 +597,22 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP); if (!nest_parms) goto nla_put_failure; - if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state) || - nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE, + if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state)) + goto nla_put_failure; + + if (destroy) + goto skip_state; + + if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE, ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) || nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, cpu_to_be64(ct->proto.dccp.handshake_seq), CTA_PROTOINFO_DCCP_PAD)) goto nla_put_failure; +skip_state: nla_nest_end(skb, nest_parms); spin_unlock_bh(&ct->lock); + return 0; nla_put_failure: diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 810cca24b399..fb8dc02e502f 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -543,7 +543,7 @@ static bool sctp_can_early_drop(const struct nf_conn *ct) #include <linux/netfilter/nfnetlink_conntrack.h> static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - struct nf_conn *ct) + struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; @@ -552,15 +552,20 @@ static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, if (!nest_parms) goto nla_put_failure; - if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state) || - nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, + if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state)) + goto nla_put_failure; + + if (destroy) + goto skip_state; + + if (nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) || nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY, 
ct->proto.sctp.vtag[IP_CT_DIR_REPLY])) goto nla_put_failure; +skip_state: spin_unlock_bh(&ct->lock); - nla_nest_end(skb, nest_parms); return 0; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index c8fb2187ad4b..1d7e1c595546 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -834,12 +834,6 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, return true; } -static bool nf_conntrack_tcp_established(const struct nf_conn *ct) -{ - return ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED && - test_bit(IPS_ASSURED_BIT, &ct->status); -} - /* Returns verdict for packet, or -1 for invalid. */ int nf_conntrack_tcp_packet(struct nf_conn *ct, struct sk_buff *skb, @@ -1192,7 +1186,7 @@ static bool tcp_can_early_drop(const struct nf_conn *ct) #include <linux/netfilter/nfnetlink_conntrack.h> static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - struct nf_conn *ct) + struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; struct nf_ct_tcp_flags tmp = {}; @@ -1202,8 +1196,13 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, if (!nest_parms) goto nla_put_failure; - if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state) || - nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, + if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state)) + goto nla_put_failure; + + if (destroy) + goto skip_state; + + if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, ct->proto.tcp.seen[0].td_scale) || nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, ct->proto.tcp.seen[1].td_scale)) @@ -1218,8 +1217,8 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, sizeof(struct nf_ct_tcp_flags), &tmp)) goto nla_put_failure; +skip_state: spin_unlock_bh(&ct->lock); - nla_nest_end(skb, nest_parms); return 0; diff --git a/net/netfilter/nf_conntrack_standalone.c 
b/net/netfilter/nf_conntrack_standalone.c index 46c5557c1fec..0ee702d374b0 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -523,6 +523,9 @@ nf_conntrack_hash_sysctl(struct ctl_table *table, int write, { int ret; + /* module_param hashsize could have changed value */ + nf_conntrack_htable_size_user = nf_conntrack_htable_size; + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret < 0 || !write) return ret; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 513f78db3cb2..5fa657b8e03d 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -191,14 +191,14 @@ static u32 flow_offload_hash(const void *data, u32 len, u32 seed) { const struct flow_offload_tuple *tuple = data; - return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed); + return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed); } static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) { const struct flow_offload_tuple_rhash *tuplehash = data; - return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed); + return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed); } static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, @@ -207,7 +207,7 @@ static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, const struct flow_offload_tuple *tuple = arg->key; const struct flow_offload_tuple_rhash *x = ptr; - if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir))) + if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash))) return 1; return 0; @@ -399,7 +399,7 @@ static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, return -1; tcph = (void *)(skb_network_header(skb) + thoff); - inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true); + inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false); return 0; } @@ 
-415,7 +415,7 @@ static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, udph = (void *)(skb_network_header(skb) + thoff); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace2(&udph->check, skb, port, - new_port, true); + new_port, false); if (!udph->check) udph->check = CSUM_MANGLED_0; } diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index ea923f8cf9c4..b7c3c902290f 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -1174,6 +1174,7 @@ static int __init nf_nat_init(void) ret = register_pernet_subsys(&nat_net_ops); if (ret < 0) { nf_ct_extend_unregister(&nat_extend); + kvfree(nf_nat_bysource); return ret; } diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index d7d34a62d3bf..b100c04a0e43 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -849,7 +849,7 @@ synproxy_send_tcp_ipv6(struct net *net, fl6.fl6_sport = nth->source; fl6.fl6_dport = nth->dest; security_skb_classify_flow((struct sk_buff *)skb, - flowi6_to_flowi(&fl6)); + flowi6_to_flowi_common(&fl6)); err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); if (err) { goto free_nskb; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0f58e98542be..c1eb5cdb3033 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -508,7 +508,7 @@ static int nft_delflowtable(struct nft_ctx *ctx, static struct nft_table *nft_table_lookup(const struct net *net, const struct nlattr *nla, - u8 family, u8 genmask) + u8 family, u8 genmask, u32 nlpid) { struct nft_table *table; @@ -519,8 +519,13 @@ static struct nft_table *nft_table_lookup(const struct net *net, lockdep_is_held(&net->nft.commit_mutex)) { if (!nla_strcmp(nla, table->name) && table->family == family && - nft_active_genmask(table, genmask)) + nft_active_genmask(table, genmask)) { + if (nft_table_has_owner(table) && + table->nlpid != nlpid) + return 
ERR_PTR(-EPERM); + return table; + } } return ERR_PTR(-ENOENT); @@ -581,7 +586,8 @@ struct nft_module_request { }; #ifdef CONFIG_MODULES -static int nft_request_module(struct net *net, const char *fmt, ...) +static __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, + ...) { char module_name[MODULE_NAME_LEN]; struct nft_module_request *req; @@ -619,7 +625,8 @@ static int nft_request_module(struct net *net, const char *fmt, ...) static void lockdep_nfnl_nft_mutex_not_held(void) { #ifdef CONFIG_PROVE_LOCKING - WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + if (debug_locks) + WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); #endif } @@ -677,6 +684,9 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle), NFTA_TABLE_PAD)) goto nla_put_failure; + if (nft_table_has_owner(table) && + nla_put_be32(skb, NFTA_TABLE_OWNER, htonl(table->nlpid))) + goto nla_put_failure; if (table->udata) { if (nla_put(skb, NFTA_TABLE_USERDATA, table->udlen, table->udata)) @@ -819,7 +829,7 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } - table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask); + table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_TABLE_NAME]); return PTR_ERR(table); @@ -900,8 +910,8 @@ static int nf_tables_updtable(struct nft_ctx *ctx) return 0; flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS])); - if (flags & ~NFT_TABLE_F_DORMANT) - return -EINVAL; + if (flags & ~NFT_TABLE_F_MASK) + return -EOPNOTSUPP; if (flags == ctx->table->flags) return 0; @@ -1001,7 +1011,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, lockdep_assert_held(&net->nft.commit_mutex); attr = nla[NFTA_TABLE_NAME]; - table = nft_table_lookup(net, attr, family, genmask); + table = 
nft_table_lookup(net, attr, family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); @@ -1019,8 +1030,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (nla[NFTA_TABLE_FLAGS]) { flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS])); - if (flags & ~NFT_TABLE_F_DORMANT) - return -EINVAL; + if (flags & ~NFT_TABLE_F_MASK) + return -EOPNOTSUPP; } err = -ENOMEM; @@ -1051,6 +1062,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, table->family = family; table->flags = flags; table->handle = ++table_handle; + if (table->flags & NFT_TABLE_F_OWNER) + table->nlpid = NETLINK_CB(skb).portid; nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); @@ -1158,6 +1171,9 @@ static int nft_flush(struct nft_ctx *ctx, int family) if (!nft_is_active_next(ctx->net, table)) continue; + if (nft_table_has_owner(table) && table->nlpid != ctx->portid) + continue; + if (nla[NFTA_TABLE_NAME] && nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0) continue; @@ -1194,7 +1210,8 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, table = nft_table_lookup_byhandle(net, attr, genmask); } else { attr = nla[NFTA_TABLE_NAME]; - table = nft_table_lookup(net, attr, family, genmask); + table = nft_table_lookup(net, attr, family, genmask, + NETLINK_CB(skb).portid); } if (IS_ERR(table)) { @@ -1281,7 +1298,7 @@ static struct nft_chain *nft_chain_lookup(struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); - nla_strlcpy(search, nla, sizeof(search)); + nla_strscpy(search, nla, sizeof(search)); WARN_ON(!rcu_read_lock_held() && !lockdep_commit_lock_is_held(net)); @@ -1577,7 +1594,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } - table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], 
family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); @@ -1721,7 +1738,11 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, goto err_hook_alloc; } - nla_strlcpy(ifname, attr, IFNAMSIZ); + nla_strscpy(ifname, attr, IFNAMSIZ); + /* nf_tables_netdev_event() is called under rtnl_mutex, this is + * indirectly serializing all the other holders of the commit_mutex with + * the rtnl_mutex. + */ dev = __dev_get_by_name(net, ifname); if (!dev) { err = -ENOENT; @@ -2293,7 +2314,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, lockdep_assert_held(&net->nft.commit_mutex); - table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); @@ -2389,7 +2411,8 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, u32 use; int err; - table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); @@ -3035,7 +3058,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } - table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); @@ -3173,7 +3196,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, lockdep_assert_held(&net->nft.commit_mutex); - table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) 
{ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); @@ -3397,7 +3421,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, int family = nfmsg->nfgen_family, err = 0; struct nft_ctx ctx; - table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); @@ -3565,6 +3590,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_SET_HANDLE] = { .type = NLA_U64 }, [NFTA_SET_EXPR] = { .type = NLA_NESTED }, + [NFTA_SET_EXPRESSIONS] = { .type = NLA_NESTED }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { @@ -3577,7 +3603,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, const struct nlmsghdr *nlh, const struct nlattr * const nla[], struct netlink_ext_ack *extack, - u8 genmask) + u8 genmask, u32 nlpid) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); int family = nfmsg->nfgen_family; @@ -3585,7 +3611,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, if (nla[NFTA_SET_TABLE] != NULL) { table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, - genmask); + genmask, nlpid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); return PTR_ERR(table); @@ -3718,7 +3744,7 @@ cont: return 0; } -static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) +int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) { u64 ms = be64_to_cpu(nla_get_be64(nla)); u64 max = (u64)(~((u64)0)); @@ -3732,7 +3758,7 @@ static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) return 0; } -static __be64 nf_jiffies64_to_msecs(u64 input) +__be64 nf_jiffies64_to_msecs(u64 input) { return cpu_to_be64(jiffies64_to_msecs(input)); } @@ -3772,6 +3798,7 @@ static int 
nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, u32 portid = ctx->portid; struct nlattr *nest; u32 seq = ctx->seq; + int i; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), @@ -3840,11 +3867,22 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nla_nest_end(skb, nest); - if (set->expr) { + if (set->num_exprs == 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); - if (nf_tables_fill_expr_info(skb, set->expr) < 0) + if (nf_tables_fill_expr_info(skb, set->exprs[0]) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + } else if (set->num_exprs > 1) { + nest = nla_nest_start_noflag(skb, NFTA_SET_EXPRESSIONS); + if (nest == NULL) goto nla_put_failure; + for (i = 0; i < set->num_exprs; i++) { + if (nft_expr_dump(skb, NFTA_LIST_ELEM, + set->exprs[i]) < 0) + goto nla_put_failure; + } nla_nest_end(skb, nest); } @@ -3988,7 +4026,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, /* Verify existence before starting dump */ err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack, - genmask); + genmask, 0); if (err < 0) return err; @@ -4143,7 +4181,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | NFT_SET_INTERVAL | NFT_SET_TIMEOUT | NFT_SET_MAP | NFT_SET_EVAL | - NFT_SET_OBJECT | NFT_SET_CONCAT)) + NFT_SET_OBJECT | NFT_SET_CONCAT | NFT_SET_EXPR)) return -EOPNOTSUPP; /* Only one of these operations is supported */ if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) == @@ -4214,10 +4252,11 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, return err; } - if (nla[NFTA_SET_EXPR]) + if (nla[NFTA_SET_EXPR] || nla[NFTA_SET_EXPRESSIONS]) desc.expr = true; - table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { 
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); return PTR_ERR(table); @@ -4278,6 +4317,35 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, err = PTR_ERR(expr); goto err_set_alloc_name; } + set->exprs[0] = expr; + set->num_exprs++; + } else if (nla[NFTA_SET_EXPRESSIONS]) { + struct nft_expr *expr; + struct nlattr *tmp; + int left; + + if (!(flags & NFT_SET_EXPR)) { + err = -EINVAL; + goto err_set_alloc_name; + } + i = 0; + nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) { + if (i == NFT_SET_EXPR_MAX) { + err = -E2BIG; + goto err_set_init; + } + if (nla_type(tmp) != NFTA_LIST_ELEM) { + err = -EINVAL; + goto err_set_init; + } + expr = nft_set_elem_expr_alloc(&ctx, set, tmp); + if (IS_ERR(expr)) { + err = PTR_ERR(expr); + goto err_set_init; + } + set->exprs[i++] = expr; + set->num_exprs++; + } } udata = NULL; @@ -4295,7 +4363,6 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, set->dtype = dtype; set->objtype = objtype; set->dlen = desc.dlen; - set->expr = expr; set->flags = flags; set->size = desc.size; set->policy = policy; @@ -4324,8 +4391,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, err_set_trans: ops->destroy(set); err_set_init: - if (expr) - nft_expr_destroy(&ctx, expr); + for (i = 0; i < set->num_exprs; i++) + nft_expr_destroy(&ctx, set->exprs[i]); err_set_alloc_name: kfree(set->name); err_set_name: @@ -4335,11 +4402,13 @@ err_set_name: static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { + int i; + if (WARN_ON(set->use > 0)) return; - if (set->expr) - nft_expr_destroy(ctx, set->expr); + for (i = 0; i < set->num_exprs; i++) + nft_expr_destroy(ctx, set->exprs[i]); set->ops->destroy(set); kfree(set->name); @@ -4364,7 +4433,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, return -EINVAL; err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack, - genmask); + genmask, NETLINK_CB(skb).portid); if (err < 0) return err; @@ -4389,6 +4458,12 @@ static 
int nf_tables_delset(struct net *net, struct sock *nlsk, return nft_delset(&ctx, set); } +static int nft_validate_register_store(const struct nft_ctx *ctx, + enum nft_registers reg, + const struct nft_data *data, + enum nft_data_types type, + unsigned int len); + static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, @@ -4492,8 +4567,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = { [NFT_SET_EXT_DATA] = { .align = __alignof__(u32), }, - [NFT_SET_EXT_EXPR] = { - .align = __alignof__(struct nft_expr), + [NFT_SET_EXT_EXPRESSIONS] = { + .align = __alignof__(struct nft_set_elem_expr), }, [NFT_SET_EXT_OBJREF] = { .len = sizeof(struct nft_object *), @@ -4536,6 +4611,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING, .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED }, + [NFTA_SET_ELEM_EXPRESSIONS] = { .type = NLA_NESTED }, }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { @@ -4552,14 +4628,14 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, const struct nlmsghdr *nlh, const struct nlattr * const nla[], struct netlink_ext_ack *extack, - u8 genmask) + u8 genmask, u32 nlpid) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); int family = nfmsg->nfgen_family; struct nft_table *table; table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, - genmask); + genmask, nlpid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); return PTR_ERR(table); @@ -4569,6 +4645,43 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, return 0; } +static int nft_set_elem_expr_dump(struct sk_buff *skb, + const struct nft_set *set, + const struct nft_set_ext *ext) +{ + struct nft_set_elem_expr *elem_expr; + u32 size, num_exprs = 0; + struct nft_expr *expr; + struct nlattr *nest; + + 
elem_expr = nft_set_ext_expr(ext); + nft_setelem_expr_foreach(expr, elem_expr, size) + num_exprs++; + + if (num_exprs == 1) { + expr = nft_setelem_expr_at(elem_expr, 0); + if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr) < 0) + return -1; + + return 0; + } else if (num_exprs > 1) { + nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_EXPRESSIONS); + if (nest == NULL) + goto nla_put_failure; + + nft_setelem_expr_foreach(expr, elem_expr, size) { + expr = nft_setelem_expr_at(elem_expr, size); + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } + return 0; + +nla_put_failure: + return -1; +} + static int nf_tables_fill_setelem(struct sk_buff *skb, const struct nft_set *set, const struct nft_set_elem *elem) @@ -4596,8 +4709,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, set->dlen) < 0) goto nla_put_failure; - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR) && - nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0) + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) && + nft_set_elem_expr_dump(skb, set, ext)) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && @@ -4939,7 +5052,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, int rem, err = 0; err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, - genmask); + genmask, NETLINK_CB(skb).portid); if (err < 0) return err; @@ -5092,8 +5205,8 @@ void *nft_set_elem_init(const struct nft_set *set, return elem; } -static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, - struct nft_expr *expr) +static void __nft_set_elem_expr_destroy(const struct nft_ctx *ctx, + struct nft_expr *expr) { if (expr->ops->destroy_clone) { expr->ops->destroy_clone(ctx, expr); @@ -5103,6 +5216,16 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, } } +static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, + struct nft_set_elem_expr *elem_expr) +{ + struct nft_expr *expr; + u32 
size; + + nft_setelem_expr_foreach(expr, elem_expr, size) + __nft_set_elem_expr_destroy(ctx, expr); +} + void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr) { @@ -5115,7 +5238,7 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); - if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext)); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) @@ -5132,32 +5255,72 @@ static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, { struct nft_set_ext *ext = nft_set_elem_ext(set, elem); - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext)); kfree(elem); } +int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_expr *expr_array[]) +{ + struct nft_expr *expr; + int err, i, k; + + for (i = 0; i < set->num_exprs; i++) { + expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL); + if (!expr) + goto err_expr; + + err = nft_expr_clone(expr, set->exprs[i]); + if (err < 0) { + nft_expr_destroy(ctx, expr); + goto err_expr; + } + expr_array[i] = expr; + } + + return 0; + +err_expr: + for (k = i - 1; k >= 0; k--) + nft_expr_destroy(ctx, expr_array[k]); + + return -ENOMEM; +} + +static void nft_set_elem_expr_setup(const struct nft_set_ext *ext, int i, + struct nft_expr *expr_array[]) +{ + struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext); + struct nft_expr *expr = nft_setelem_expr_at(elem_expr, elem_expr->size); + + memcpy(expr, expr_array[i], expr_array[i]->ops->size); + elem_expr->size += expr_array[i]->ops->size; + kfree(expr_array[i]); + expr_array[i] = NULL; +} + static int nft_add_set_elem(struct nft_ctx 
*ctx, struct nft_set *set, const struct nlattr *attr, u32 nlmsg_flags) { + struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {}; struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; u8 genmask = nft_genmask_next(ctx->net); + u32 flags = 0, size = 0, num_exprs = 0; struct nft_set_ext_tmpl tmpl; struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; struct nft_set_binding *binding; struct nft_object *obj = NULL; - struct nft_expr *expr = NULL; struct nft_userdata *udata; struct nft_data_desc desc; enum nft_registers dreg; struct nft_trans *trans; - u32 flags = 0; u64 timeout; u64 expiration; + int err, i; u8 ulen; - int err; err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy, NULL); @@ -5190,7 +5353,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nla[NFTA_SET_ELEM_TIMEOUT] || nla[NFTA_SET_ELEM_EXPIRATION] || nla[NFTA_SET_ELEM_USERDATA] || - nla[NFTA_SET_ELEM_EXPR])) + nla[NFTA_SET_ELEM_EXPR] || + nla[NFTA_SET_ELEM_EXPRESSIONS])) return -EINVAL; timeout = 0; @@ -5215,23 +5379,64 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } - if (nla[NFTA_SET_ELEM_EXPR] != NULL) { + if (nla[NFTA_SET_ELEM_EXPR]) { + struct nft_expr *expr; + + if (set->num_exprs && set->num_exprs != 1) + return -EOPNOTSUPP; + expr = nft_set_elem_expr_alloc(ctx, set, nla[NFTA_SET_ELEM_EXPR]); if (IS_ERR(expr)) return PTR_ERR(expr); - err = -EOPNOTSUPP; - if (set->expr && set->expr->ops != expr->ops) + expr_array[0] = expr; + num_exprs = 1; + + if (set->num_exprs && set->exprs[0]->ops != expr->ops) { + err = -EOPNOTSUPP; goto err_set_elem_expr; - } else if (set->expr) { - expr = kzalloc(set->expr->ops->size, GFP_KERNEL); - if (!expr) - return -ENOMEM; + } + } else if (nla[NFTA_SET_ELEM_EXPRESSIONS]) { + struct nft_expr *expr; + struct nlattr *tmp; + int left; + + i = 0; + nla_for_each_nested(tmp, nla[NFTA_SET_ELEM_EXPRESSIONS], left) { + if (i == NFT_SET_EXPR_MAX || + (set->num_exprs && set->num_exprs == i)) { + 
err = -E2BIG; + goto err_set_elem_expr; + } + if (nla_type(tmp) != NFTA_LIST_ELEM) { + err = -EINVAL; + goto err_set_elem_expr; + } + expr = nft_set_elem_expr_alloc(ctx, set, tmp); + if (IS_ERR(expr)) { + err = PTR_ERR(expr); + goto err_set_elem_expr; + } + expr_array[i] = expr; + num_exprs++; - err = nft_expr_clone(expr, set->expr); - if (err < 0) + if (set->num_exprs && expr->ops != set->exprs[i]->ops) { + err = -EOPNOTSUPP; + goto err_set_elem_expr; + } + i++; + } + if (set->num_exprs && set->num_exprs != i) { + err = -EOPNOTSUPP; goto err_set_elem_expr; + } + } else if (set->num_exprs > 0) { + err = nft_set_elem_expr_clone(ctx, set, expr_array); + if (err < 0) + goto err_set_elem_expr_clone; + + num_exprs = set->num_exprs; } err = nft_setelem_parse_key(ctx, set, &elem.key.val, @@ -5256,9 +5461,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); } - if (expr) - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPR, - expr->ops->size); + if (num_exprs) { + for (i = 0; i < num_exprs; i++) + size += expr_array[i]->ops->size; + + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS, + sizeof(struct nft_set_elem_expr) + + size); + } if (nla[NFTA_SET_ELEM_OBJREF] != NULL) { if (!(set->flags & NFT_SET_OBJECT)) { @@ -5340,11 +5550,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, *nft_set_ext_obj(ext) = obj; obj->use++; } - if (expr) { - memcpy(nft_set_ext_expr(ext), expr, expr->ops->size); - kfree(expr); - expr = NULL; - } + for (i = 0; i < num_exprs; i++) + nft_set_elem_expr_setup(ext, i, expr_array); trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) @@ -5405,9 +5612,9 @@ err_parse_key_end: err_parse_key: nft_data_release(&elem.key.val, NFT_DATA_VALUE); err_set_elem_expr: - if (expr != NULL) - nft_expr_destroy(ctx, expr); - + for (i = 0; i < num_exprs && expr_array[i]; i++) + nft_expr_destroy(ctx, expr_array[i]); +err_set_elem_expr_clone: return err; } 
@@ -5426,7 +5633,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, return -EINVAL; err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, - genmask); + genmask, NETLINK_CB(skb).portid); if (err < 0) return err; @@ -5634,7 +5841,7 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, int rem, err = 0; err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, - genmask); + genmask, NETLINK_CB(skb).portid); if (err < 0) return err; @@ -5734,7 +5941,7 @@ struct nft_object *nft_obj_lookup(const struct net *net, struct rhlist_head *tmp, *list; struct nft_object *obj; - nla_strlcpy(search, nla, sizeof(search)); + nla_strscpy(search, nla, sizeof(search)); k.name = search; WARN_ON_ONCE(!rcu_read_lock_held() && @@ -5937,7 +6144,8 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_DATA]) return -EINVAL; - table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); @@ -6207,7 +6415,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_TYPE]) return -EINVAL; - table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); @@ -6281,7 +6489,8 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE])) return -EINVAL; - table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); @@ -6698,7 +6907,7 @@ static int nf_tables_newflowtable(struct net *net, struct 
sock *nlsk, return -EINVAL; table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, - genmask); + genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); @@ -6882,7 +7091,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, return -EINVAL; table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, - genmask); + genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); @@ -7090,7 +7299,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, return -EINVAL; table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, - genmask); + genmask, 0); if (IS_ERR(table)) return PTR_ERR(table); @@ -8409,7 +8618,7 @@ EXPORT_SYMBOL_GPL(nft_parse_u32_check); * Registers used to be 128 bit wide, these register numbers will be * mapped to the corresponding 32 bit register numbers. */ -unsigned int nft_parse_register(const struct nlattr *attr) +static unsigned int nft_parse_register(const struct nlattr *attr) { unsigned int reg; @@ -8421,7 +8630,6 @@ unsigned int nft_parse_register(const struct nlattr *attr) return reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00; } } -EXPORT_SYMBOL_GPL(nft_parse_register); /** * nft_dump_register - dump a register value to a netlink attribute @@ -8454,7 +8662,7 @@ EXPORT_SYMBOL_GPL(nft_dump_register); * Validate that the input register is one of the general purpose * registers and that the length of the load is within the bounds. 
*/ -int nft_validate_register_load(enum nft_registers reg, unsigned int len) +static int nft_validate_register_load(enum nft_registers reg, unsigned int len) { if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) return -EINVAL; @@ -8465,7 +8673,21 @@ int nft_validate_register_load(enum nft_registers reg, unsigned int len) return 0; } -EXPORT_SYMBOL_GPL(nft_validate_register_load); + +int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len) +{ + u32 reg; + int err; + + reg = nft_parse_register(attr); + err = nft_validate_register_load(reg, len); + if (err < 0) + return err; + + *sreg = reg; + return 0; +} +EXPORT_SYMBOL_GPL(nft_parse_register_load); /** * nft_validate_register_store - validate an expressions' register store @@ -8481,10 +8703,11 @@ EXPORT_SYMBOL_GPL(nft_validate_register_load); * A value of NULL for the data means that its runtime gathered * data. */ -int nft_validate_register_store(const struct nft_ctx *ctx, - enum nft_registers reg, - const struct nft_data *data, - enum nft_data_types type, unsigned int len) +static int nft_validate_register_store(const struct nft_ctx *ctx, + enum nft_registers reg, + const struct nft_data *data, + enum nft_data_types type, + unsigned int len) { int err; @@ -8516,7 +8739,24 @@ int nft_validate_register_store(const struct nft_ctx *ctx, return 0; } } -EXPORT_SYMBOL_GPL(nft_validate_register_store); + +int nft_parse_register_store(const struct nft_ctx *ctx, + const struct nlattr *attr, u8 *dreg, + const struct nft_data *data, + enum nft_data_types type, unsigned int len) +{ + int err; + u32 reg; + + reg = nft_parse_register(attr); + err = nft_validate_register_store(ctx, reg, data, type, len); + if (err < 0) + return err; + + *dreg = reg; + return 0; +} +EXPORT_SYMBOL_GPL(nft_parse_register_store); static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { [NFTA_VERDICT_CODE] = { .type = NLA_U32 }, @@ -8770,10 +9010,25 @@ int __nft_release_basechain(struct nft_ctx *ctx) } 
EXPORT_SYMBOL_GPL(__nft_release_basechain); -static void __nft_release_tables(struct net *net) +static void __nft_release_hook(struct net *net, struct nft_table *table) +{ + struct nft_chain *chain; + + list_for_each_entry(chain, &table->chains, list) + nf_tables_unregister_hook(net, table, chain); +} + +static void __nft_release_hooks(struct net *net) +{ + struct nft_table *table; + + list_for_each_entry(table, &net->nft.tables, list) + __nft_release_hook(net, table); +} + +static void __nft_release_table(struct net *net, struct nft_table *table) { struct nft_flowtable *flowtable, *nf; - struct nft_table *table, *nt; struct nft_chain *chain, *nc; struct nft_object *obj, *ne; struct nft_rule *rule, *nr; @@ -8783,47 +9038,90 @@ static void __nft_release_tables(struct net *net) .family = NFPROTO_NETDEV, }; + ctx.family = table->family; + ctx.table = table; + list_for_each_entry(chain, &table->chains, list) { + ctx.chain = chain; + list_for_each_entry_safe(rule, nr, &chain->rules, list) { + list_del(&rule->list); + chain->use--; + nf_tables_rule_release(&ctx, rule); + } + } + list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) { + list_del(&flowtable->list); + table->use--; + nf_tables_flowtable_destroy(flowtable); + } + list_for_each_entry_safe(set, ns, &table->sets, list) { + list_del(&set->list); + table->use--; + nft_set_destroy(&ctx, set); + } + list_for_each_entry_safe(obj, ne, &table->objects, list) { + nft_obj_del(obj); + table->use--; + nft_obj_destroy(&ctx, obj); + } + list_for_each_entry_safe(chain, nc, &table->chains, list) { + ctx.chain = chain; + nft_chain_del(chain); + table->use--; + nf_tables_chain_destroy(&ctx); + } + list_del(&table->list); + nf_tables_table_destroy(&ctx); +} + +static void __nft_release_tables(struct net *net, u32 nlpid) +{ + struct nft_table *table, *nt; + list_for_each_entry_safe(table, nt, &net->nft.tables, list) { - ctx.family = table->family; + if (nft_table_has_owner(table) && + nlpid != table->nlpid) + 
continue; - list_for_each_entry(chain, &table->chains, list) - nf_tables_unregister_hook(net, table, chain); - /* No packets are walking on these chains anymore. */ - ctx.table = table; - list_for_each_entry(chain, &table->chains, list) { - ctx.chain = chain; - list_for_each_entry_safe(rule, nr, &chain->rules, list) { - list_del(&rule->list); - chain->use--; - nf_tables_rule_release(&ctx, rule); - } - } - list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) { - list_del(&flowtable->list); - table->use--; - nf_tables_flowtable_destroy(flowtable); - } - list_for_each_entry_safe(set, ns, &table->sets, list) { - list_del(&set->list); - table->use--; - nft_set_destroy(&ctx, set); - } - list_for_each_entry_safe(obj, ne, &table->objects, list) { - nft_obj_del(obj); - table->use--; - nft_obj_destroy(&ctx, obj); + __nft_release_table(net, table); + } +} + +static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct netlink_notify *n = ptr; + struct nft_table *table, *nt; + struct net *net = n->net; + bool release = false; + + if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) + return NOTIFY_DONE; + + mutex_lock(&net->nft.commit_mutex); + list_for_each_entry(table, &net->nft.tables, list) { + if (nft_table_has_owner(table) && + n->portid == table->nlpid) { + __nft_release_hook(net, table); + release = true; } - list_for_each_entry_safe(chain, nc, &table->chains, list) { - ctx.chain = chain; - nft_chain_del(chain); - table->use--; - nf_tables_chain_destroy(&ctx); + } + if (release) { + synchronize_rcu(); + list_for_each_entry_safe(table, nt, &net->nft.tables, list) { + if (nft_table_has_owner(table) && + n->portid == table->nlpid) + __nft_release_table(net, table); } - list_del(&table->list); - nf_tables_table_destroy(&ctx); } + mutex_unlock(&net->nft.commit_mutex); + + return NOTIFY_DONE; } +static struct notifier_block nft_nl_notifier = { + .notifier_call = nft_rcv_nl_event, +}; + static int __net_init 
nf_tables_init_net(struct net *net) { INIT_LIST_HEAD(&net->nft.tables); @@ -8837,12 +9135,17 @@ static int __net_init nf_tables_init_net(struct net *net) return 0; } +static void __net_exit nf_tables_pre_exit_net(struct net *net) +{ + __nft_release_hooks(net); +} + static void __net_exit nf_tables_exit_net(struct net *net) { mutex_lock(&net->nft.commit_mutex); if (!list_empty(&net->nft.commit_list)) __nf_tables_abort(net, NFNL_ABORT_NONE); - __nft_release_tables(net); + __nft_release_tables(net, 0); mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); WARN_ON_ONCE(!list_empty(&net->nft.module_list)); @@ -8850,8 +9153,9 @@ static void __net_exit nf_tables_exit_net(struct net *net) } static struct pernet_operations nf_tables_net_ops = { - .init = nf_tables_init_net, - .exit = nf_tables_exit_net, + .init = nf_tables_init_net, + .pre_exit = nf_tables_pre_exit_net, + .exit = nf_tables_exit_net, }; static int __init nf_tables_module_init(void) @@ -8865,43 +9169,50 @@ static int __init nf_tables_module_init(void) err = nft_chain_filter_init(); if (err < 0) - goto err1; + goto err_chain_filter; err = nf_tables_core_module_init(); if (err < 0) - goto err2; + goto err_core_module; err = register_netdevice_notifier(&nf_tables_flowtable_notifier); if (err < 0) - goto err3; + goto err_netdev_notifier; err = rhltable_init(&nft_objname_ht, &nft_objname_ht_params); if (err < 0) - goto err4; + goto err_rht_objname; err = nft_offload_init(); if (err < 0) - goto err5; + goto err_offload; + + err = netlink_register_notifier(&nft_nl_notifier); + if (err < 0) + goto err_netlink_notifier; /* must be last */ err = nfnetlink_subsys_register(&nf_tables_subsys); if (err < 0) - goto err6; + goto err_nfnl_subsys; nft_chain_route_init(); return err; -err6: + +err_nfnl_subsys: + netlink_unregister_notifier(&nft_nl_notifier); +err_netlink_notifier: nft_offload_exit(); -err5: +err_offload: rhltable_destroy(&nft_objname_ht); -err4: +err_rht_objname: 
unregister_netdevice_notifier(&nf_tables_flowtable_notifier); -err3: +err_netdev_notifier: nf_tables_core_module_exit(); -err2: +err_core_module: nft_chain_filter_fini(); -err1: +err_chain_filter: unregister_pernet_subsys(&nf_tables_net_ops); return err; } @@ -8909,6 +9220,7 @@ err1: static void __exit nf_tables_module_exit(void) { nfnetlink_subsys_unregister(&nf_tables_subsys); + netlink_unregister_notifier(&nft_nl_notifier); nft_offload_exit(); unregister_netdevice_notifier(&nf_tables_flowtable_notifier); nft_chain_filter_fini(); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 9f625724a20f..9ae14270c543 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -28,6 +28,23 @@ static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) return flow; } +void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, + enum flow_dissector_key_id addr_type) +{ + struct nft_flow_match *match = &flow->match; + struct nft_flow_key *mask = &match->mask; + struct nft_flow_key *key = &match->key; + + if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL)) + return; + + key->control.addr_type = addr_type; + mask->control.addr_type = 0xffff; + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL); + match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] = + offsetof(struct nft_flow_key, control); +} + struct nft_flow_rule *nft_flow_rule_create(struct net *net, const struct nft_rule *rule) { diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 5bfec829c12f..0fa1653b5f19 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -16,6 +16,7 @@ #include <linux/errno.h> #include <net/netlink.h> #include <net/sock.h> +#include <net/netns/generic.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> @@ -41,6 +42,17 @@ struct nfacct_filter { u32 mask; }; +struct nfnl_acct_net { + struct list_head nfnl_acct_list; +}; + 
+static unsigned int nfnl_acct_net_id __read_mostly; + +static inline struct nfnl_acct_net *nfnl_acct_pernet(struct net *net) +{ + return net_generic(net, nfnl_acct_net_id); +} + #define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) #define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */ @@ -49,6 +61,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, const struct nlattr * const tb[], struct netlink_ext_ack *extack) { + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *nfacct, *matching = NULL; char *acct_name; unsigned int size = 0; @@ -61,7 +74,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, if (strlen(acct_name) == 0) return -EINVAL; - list_for_each_entry(nfacct, &net->nfnl_acct_list, head) { + list_for_each_entry(nfacct, &nfnl_acct_net->nfnl_acct_list, head) { if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) continue; @@ -112,7 +125,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, nfacct->flags = flags; } - nla_strlcpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX); + nla_strscpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX); if (tb[NFACCT_BYTES]) { atomic64_set(&nfacct->bytes, @@ -123,7 +136,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS]))); } refcount_set(&nfacct->refcnt, 1); - list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list); + list_add_tail_rcu(&nfacct->head, &nfnl_acct_net->nfnl_acct_list); return 0; } @@ -188,6 +201,7 @@ static int nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *last; const struct nfacct_filter *filter = cb->data; @@ -199,7 +213,7 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->args[1] = 0; rcu_read_lock(); - list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) { + list_for_each_entry_rcu(cur, 
&nfnl_acct_net->nfnl_acct_list, head) { if (last) { if (cur != last) continue; @@ -269,6 +283,7 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl, const struct nlattr * const tb[], struct netlink_ext_ack *extack) { + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); int ret = -ENOENT; struct nf_acct *cur; char *acct_name; @@ -288,7 +303,7 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl, return -EINVAL; acct_name = nla_data(tb[NFACCT_NAME]); - list_for_each_entry(cur, &net->nfnl_acct_list, head) { + list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) { struct sk_buff *skb2; if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) @@ -342,19 +357,20 @@ static int nfnl_acct_del(struct net *net, struct sock *nfnl, const struct nlattr * const tb[], struct netlink_ext_ack *extack) { + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *tmp; int ret = -ENOENT; char *acct_name; if (!tb[NFACCT_NAME]) { - list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) + list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head) nfnl_acct_try_del(cur); return 0; } acct_name = nla_data(tb[NFACCT_NAME]); - list_for_each_entry(cur, &net->nfnl_acct_list, head) { + list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0) continue; @@ -402,10 +418,11 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT); struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name) { + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *acct = NULL; rcu_read_lock(); - list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) { + list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) continue; @@ -488,16 +505,17 @@ EXPORT_SYMBOL_GPL(nfnl_acct_overquota); static int __net_init nfnl_acct_net_init(struct net *net) { - 
INIT_LIST_HEAD(&net->nfnl_acct_list); + INIT_LIST_HEAD(&nfnl_acct_pernet(net)->nfnl_acct_list); return 0; } static void __net_exit nfnl_acct_net_exit(struct net *net) { + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *tmp; - list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) { + list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head) { list_del_rcu(&cur->head); if (refcount_dec_and_test(&cur->refcnt)) @@ -508,6 +526,8 @@ static void __net_exit nfnl_acct_net_exit(struct net *net) static struct pernet_operations nfnl_acct_ops = { .init = nfnl_acct_net_init, .exit = nfnl_acct_net_exit, + .id = &nfnl_acct_net_id, + .size = sizeof(struct nfnl_acct_net), }; static int __init nfnl_acct_init(void) diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 5b0d0a77379c..0f94fce1d3ed 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -146,7 +146,7 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy, !tb[NFCTH_POLICY_EXPECT_TIMEOUT]) return -EINVAL; - nla_strlcpy(expect_policy->name, + nla_strscpy(expect_policy->name, tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN); expect_policy->max_expected = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); @@ -233,7 +233,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[], if (ret < 0) goto err1; - nla_strlcpy(helper->name, + nla_strscpy(helper->name, tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN); size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); if (size > sizeof_field(struct nf_conn_help, data)) { diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index b35e8d9a5b37..26776b88a539 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -43,6 +43,10 @@ #include "../bridge/br_private.h" #endif +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack.h> +#endif + #define NFULNL_COPY_DISABLED 0xff #define 
NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE #define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ @@ -733,14 +737,16 @@ nfulnl_log_packet(struct net *net, size += nla_total_size(sizeof(u_int32_t)); if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) size += nla_total_size(sizeof(u_int32_t)); +#if IS_ENABLED(CONFIG_NF_CONNTRACK) if (inst->flags & NFULNL_CFG_F_CONNTRACK) { nfnl_ct = rcu_dereference(nfnl_ct_hook); if (nfnl_ct != NULL) { - ct = nfnl_ct->get_ct(skb, &ctinfo); + ct = nf_ct_get(skb, &ctinfo); if (ct != NULL) size += nfnl_ct->build_size(ct); } } +#endif if (pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) size += nfulnl_get_bridge_size(skb); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index d1d8bca03b4f..48a07914fd94 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -444,13 +444,15 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, nfnl_ct = rcu_dereference(nfnl_ct_hook); +#if IS_ENABLED(CONFIG_NF_CONNTRACK) if (queue->flags & NFQA_CFG_F_CONNTRACK) { if (nfnl_ct != NULL) { - ct = nfnl_ct->get_ct(entskb, &ctinfo); + ct = nf_ct_get(entskb, &ctinfo); if (ct != NULL) size += nfnl_ct->build_size(ct); } } +#endif if (queue->flags & NFQA_CFG_F_UID_GID) { size += (nla_total_size(sizeof(u_int32_t)) /* uid */ @@ -1104,9 +1106,10 @@ static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct, struct nf_queue_entry *entry, enum ip_conntrack_info *ctinfo) { +#if IS_ENABLED(CONFIG_NF_CONNTRACK) struct nf_conn *ct; - ct = nfnl_ct->get_ct(entry->skb, ctinfo); + ct = nf_ct_get(entry->skb, ctinfo); if (ct == NULL) return NULL; @@ -1118,6 +1121,9 @@ static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct, NETLINK_CB(entry->skb).portid, nlmsg_report(nlh)); return ct; +#else + return NULL; +#endif } static int nfqa_parse_bridge(struct nf_queue_entry *entry, diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index bbd773d74377..47b0dba95054 100644 --- 
a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -16,8 +16,8 @@ #include <net/netfilter/nf_tables_offload.h> struct nft_bitwise { - enum nft_registers sreg:8; - enum nft_registers dreg:8; + u8 sreg; + u8 dreg; enum nft_bitwise_ops op:8; u8 len; struct nft_data mask; @@ -169,14 +169,14 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, priv->len = len; - priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]); - err = nft_validate_register_load(priv->sreg, priv->len); + err = nft_parse_register_load(tb[NFTA_BITWISE_SREG], &priv->sreg, + priv->len); if (err < 0) return err; - priv->dreg = nft_parse_register(tb[NFTA_BITWISE_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); + err = nft_parse_register_store(ctx, tb[NFTA_BITWISE_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); if (err < 0) return err; @@ -315,14 +315,13 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx, struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr); int err; - priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]); - err = nft_validate_register_load(priv->sreg, sizeof(u32)); + err = nft_parse_register_load(tb[NFTA_BITWISE_SREG], &priv->sreg, + sizeof(u32)); if (err < 0) return err; - priv->dreg = nft_parse_register(tb[NFTA_BITWISE_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + err = nft_parse_register_store(ctx, tb[NFTA_BITWISE_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, sizeof(u32)); if (err < 0) return err; diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index 12bed3f7bbc6..9d5947ab8d4e 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -16,8 +16,8 @@ #include <net/netfilter/nf_tables.h> struct nft_byteorder { - enum nft_registers sreg:8; - enum nft_registers dreg:8; + u8 sreg; + u8 dreg; enum nft_byteorder_ops op:8; u8 len; u8 size; @@ -131,20 +131,20 @@ static int nft_byteorder_init(const 
struct nft_ctx *ctx, return -EINVAL; } - priv->sreg = nft_parse_register(tb[NFTA_BYTEORDER_SREG]); err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len); if (err < 0) return err; priv->len = len; - err = nft_validate_register_load(priv->sreg, priv->len); + err = nft_parse_register_load(tb[NFTA_BYTEORDER_SREG], &priv->sreg, + priv->len); if (err < 0) return err; - priv->dreg = nft_parse_register(tb[NFTA_BYTEORDER_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); + return nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); } static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index bc079d68a536..eb6a43a180bb 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -18,7 +18,7 @@ struct nft_cmp_expr { struct nft_data data; - enum nft_registers sreg:8; + u8 sreg; u8 len; enum nft_cmp_ops op:8; }; @@ -87,8 +87,7 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return err; } - priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); - err = nft_validate_register_load(priv->sreg, desc.len); + err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; @@ -123,11 +122,11 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx, u8 *mask = (u8 *)&flow->match.mask; u8 *key = (u8 *)&flow->match.key; - if (priv->op != NFT_CMP_EQ || reg->len != priv->len) + if (priv->op != NFT_CMP_EQ || priv->len > reg->len) return -EOPNOTSUPP; - memcpy(key + reg->offset, &priv->data, priv->len); - memcpy(mask + reg->offset, &reg->mask, priv->len); + memcpy(key + reg->offset, &priv->data, reg->len); + memcpy(mask + reg->offset, &reg->mask, reg->len); flow->match.dissector.used_keys |= BIT(reg->key); flow->match.dissector.offset[reg->key] = reg->base_offset; @@ -137,7 +136,7 @@ static int __nft_cmp_offload(struct
nft_offload_ctx *ctx, nft_reg_load16(priv->data.data) != ARPHRD_ETHER) return -EOPNOTSUPP; - nft_offload_update_dependency(ctx, &priv->data, priv->len); + nft_offload_update_dependency(ctx, &priv->data, reg->len); return 0; } @@ -174,8 +173,7 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, if (err < 0) return err; - priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); - err = nft_validate_register_load(priv->sreg, desc.len); + err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; @@ -268,10 +266,8 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) if (err < 0) return ERR_PTR(err); - if (desc.type != NFT_DATA_VALUE) { - err = -EINVAL; + if (desc.type != NFT_DATA_VALUE) goto err1; - } if (desc.len <= sizeof(u32) && (op == NFT_CMP_EQ || op == NFT_CMP_NEQ)) return &nft_cmp_fast_ops; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 322bd674963e..882fe8648653 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -27,8 +27,8 @@ struct nft_ct { enum nft_ct_keys key:8; enum ip_conntrack_dir dir:8; union { - enum nft_registers dreg:8; - enum nft_registers sreg:8; + u8 dreg; + u8 sreg; }; }; @@ -177,8 +177,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr, } #endif case NFT_CT_ID: - if (!nf_ct_is_confirmed(ct)) - goto err; *dest = nf_ct_get_id(ct); return; default: @@ -500,9 +498,8 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, } } - priv->dreg = nft_parse_register(tb[NFTA_CT_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + err = nft_parse_register_store(ctx, tb[NFTA_CT_DREG], &priv->dreg, NULL, + NFT_DATA_VALUE, len); if (err < 0) return err; @@ -602,8 +599,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, } } - priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]); - err = nft_validate_register_load(priv->sreg, len); + err = nft_parse_register_load(tb[NFTA_CT_SREG], &priv->sreg, len); if 
(err < 0) goto err1; @@ -990,7 +986,7 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, if (!priv->l4proto) return -ENOENT; - nla_strlcpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name)); + nla_strscpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name)); if (tb[NFTA_CT_HELPER_L3PROTO]) family = ntohs(nla_get_be16(tb[NFTA_CT_HELPER_L3PROTO])); diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index 40788b3f1071..bbf3fcba3df4 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -14,7 +14,7 @@ #include <net/netfilter/nf_dup_netdev.h> struct nft_dup_netdev { - enum nft_registers sreg_dev:8; + u8 sreg_dev; }; static void nft_dup_netdev_eval(const struct nft_expr *expr, @@ -40,8 +40,8 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_DEV] == NULL) return -EINVAL; - priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); - return nft_validate_register_load(priv->sreg_dev, sizeof(int)); + return nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, + sizeof(int)); } static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 64ca13a1885b..d44a70c11b3f 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -16,14 +16,34 @@ struct nft_dynset { struct nft_set *set; struct nft_set_ext_tmpl tmpl; enum nft_dynset_ops op:8; - enum nft_registers sreg_key:8; - enum nft_registers sreg_data:8; + u8 sreg_key; + u8 sreg_data; bool invert; + bool expr; + u8 num_exprs; u64 timeout; - struct nft_expr *expr; + struct nft_expr *expr_array[NFT_SET_EXPR_MAX]; struct nft_set_binding binding; }; +static int nft_dynset_expr_setup(const struct nft_dynset *priv, + const struct nft_set_ext *ext) +{ + struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext); + struct nft_expr *expr; + int i; + + for (i = 0; i < priv->num_exprs; i++) { + expr = nft_setelem_expr_at(elem_expr, 
elem_expr->size); + if (nft_expr_clone(expr, priv->expr_array[i]) < 0) + return -1; + + elem_expr->size += priv->expr_array[i]->ops->size; + } + + return 0; +} + static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, struct nft_regs *regs) { @@ -44,8 +64,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, goto err1; ext = nft_set_elem_ext(set, elem); - if (priv->expr != NULL && - nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0) + if (priv->num_exprs && nft_dynset_expr_setup(priv, ext) < 0) goto err2; return elem; @@ -90,6 +109,41 @@ void nft_dynset_eval(const struct nft_expr *expr, regs->verdict.code = NFT_BREAK; } +static void nft_dynset_ext_add_expr(struct nft_dynset *priv) +{ + u8 size = 0; + int i; + + for (i = 0; i < priv->num_exprs; i++) + size += priv->expr_array[i]->ops->size; + + nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPRESSIONS, + sizeof(struct nft_set_elem_expr) + size); +} + +static struct nft_expr * +nft_dynset_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set, + const struct nlattr *attr, int pos) +{ + struct nft_expr *expr; + int err; + + expr = nft_set_elem_expr_alloc(ctx, set, attr); + if (IS_ERR(expr)) + return expr; + + if (set->exprs[pos] && set->exprs[pos]->ops != expr->ops) { + err = -EOPNOTSUPP; + goto err_dynset_expr; + } + + return expr; + +err_dynset_expr: + nft_expr_destroy(ctx, expr); + return ERR_PTR(err); +} + static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, @@ -100,6 +154,7 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 }, [NFTA_DYNSET_EXPR] = { .type = NLA_NESTED }, [NFTA_DYNSET_FLAGS] = { .type = NLA_U32 }, + [NFTA_DYNSET_EXPRESSIONS] = { .type = NLA_NESTED }, }; static int nft_dynset_init(const struct nft_ctx *ctx, @@ -110,7 +165,7 @@ static int nft_dynset_init(const struct 
nft_ctx *ctx, u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; u64 timeout; - int err; + int err, i; lockdep_assert_held(&ctx->net->nft.commit_mutex); @@ -121,11 +176,12 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (tb[NFTA_DYNSET_FLAGS]) { u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS])); - - if (flags & ~NFT_DYNSET_F_INV) - return -EINVAL; + if (flags & ~(NFT_DYNSET_F_INV | NFT_DYNSET_F_EXPR)) + return -EOPNOTSUPP; if (flags & NFT_DYNSET_F_INV) priv->invert = true; + if (flags & NFT_DYNSET_F_EXPR) + priv->expr = true; } set = nft_set_lookup_global(ctx->net, ctx->table, @@ -156,54 +212,110 @@ static int nft_dynset_init(const struct nft_ctx *ctx, timeout = 0; if (tb[NFTA_DYNSET_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) - return -EINVAL; - timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64( - tb[NFTA_DYNSET_TIMEOUT]))); + return -EOPNOTSUPP; + + err = nf_msecs_to_jiffies64(tb[NFTA_DYNSET_TIMEOUT], &timeout); + if (err) + return err; } - priv->sreg_key = nft_parse_register(tb[NFTA_DYNSET_SREG_KEY]); - err = nft_validate_register_load(priv->sreg_key, set->klen); + err = nft_parse_register_load(tb[NFTA_DYNSET_SREG_KEY], &priv->sreg_key, + set->klen); if (err < 0) return err; if (tb[NFTA_DYNSET_SREG_DATA] != NULL) { if (!(set->flags & NFT_SET_MAP)) - return -EINVAL; + return -EOPNOTSUPP; if (set->dtype == NFT_DATA_VERDICT) return -EOPNOTSUPP; - priv->sreg_data = nft_parse_register(tb[NFTA_DYNSET_SREG_DATA]); - err = nft_validate_register_load(priv->sreg_data, set->dlen); + err = nft_parse_register_load(tb[NFTA_DYNSET_SREG_DATA], + &priv->sreg_data, set->dlen); if (err < 0) return err; } else if (set->flags & NFT_SET_MAP) return -EINVAL; - if (tb[NFTA_DYNSET_EXPR] != NULL) { - if (!(set->flags & NFT_SET_EVAL)) - return -EINVAL; + if ((tb[NFTA_DYNSET_EXPR] || tb[NFTA_DYNSET_EXPRESSIONS]) && + !(set->flags & NFT_SET_EVAL)) + return -EINVAL; + + if (tb[NFTA_DYNSET_EXPR]) { + struct nft_expr *dynset_expr; + + dynset_expr = 
nft_dynset_expr_alloc(ctx, set, + tb[NFTA_DYNSET_EXPR], 0); + if (IS_ERR(dynset_expr)) + return PTR_ERR(dynset_expr); + + priv->num_exprs++; + priv->expr_array[0] = dynset_expr; + + if (set->num_exprs > 1 || + (set->num_exprs == 1 && + dynset_expr->ops != set->exprs[0]->ops)) { + err = -EOPNOTSUPP; + goto err_expr_free; + } + } else if (tb[NFTA_DYNSET_EXPRESSIONS]) { + struct nft_expr *dynset_expr; + struct nlattr *tmp; + int left; - priv->expr = nft_set_elem_expr_alloc(ctx, set, - tb[NFTA_DYNSET_EXPR]); - if (IS_ERR(priv->expr)) - return PTR_ERR(priv->expr); + if (!priv->expr) + return -EINVAL; - if (set->expr && set->expr->ops != priv->expr->ops) { + i = 0; + nla_for_each_nested(tmp, tb[NFTA_DYNSET_EXPRESSIONS], left) { + if (i == NFT_SET_EXPR_MAX) { + err = -E2BIG; + goto err_expr_free; + } + if (nla_type(tmp) != NFTA_LIST_ELEM) { + err = -EINVAL; + goto err_expr_free; + } + dynset_expr = nft_dynset_expr_alloc(ctx, set, tmp, i); + if (IS_ERR(dynset_expr)) { + err = PTR_ERR(dynset_expr); + goto err_expr_free; + } + priv->expr_array[i] = dynset_expr; + priv->num_exprs++; + + if (set->num_exprs && + dynset_expr->ops != set->exprs[i]->ops) { + err = -EOPNOTSUPP; + goto err_expr_free; + } + i++; + } + if (set->num_exprs && set->num_exprs != i) { err = -EOPNOTSUPP; goto err_expr_free; } + } else if (set->num_exprs > 0) { + err = nft_set_elem_expr_clone(ctx, set, priv->expr_array); + if (err < 0) + return err; + + priv->num_exprs = set->num_exprs; } nft_set_ext_prepare(&priv->tmpl); nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen); if (set->flags & NFT_SET_MAP) nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_DATA, set->dlen); - if (priv->expr != NULL) - nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPR, - priv->expr->ops->size); + + if (priv->num_exprs) + nft_dynset_ext_add_expr(priv); + if (set->flags & NFT_SET_TIMEOUT) { - if (timeout || set->timeout) + if (timeout || set->timeout) { + nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_TIMEOUT); 
nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION); + } } priv->timeout = timeout; @@ -219,8 +331,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx, return 0; err_expr_free: - if (priv->expr != NULL) - nft_expr_destroy(ctx, priv->expr); + for (i = 0; i < priv->num_exprs; i++) + nft_expr_destroy(ctx, priv->expr_array[i]); return err; } @@ -245,9 +357,10 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_dynset *priv = nft_expr_priv(expr); + int i; - if (priv->expr != NULL) - nft_expr_destroy(ctx, priv->expr); + for (i = 0; i < priv->num_exprs; i++) + nft_expr_destroy(ctx, priv->expr_array[i]); nf_tables_destroy_set(ctx, priv->set); } @@ -256,6 +369,7 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_dynset *priv = nft_expr_priv(expr); u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0; + int i; if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key)) goto nla_put_failure; @@ -267,11 +381,29 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name)) goto nla_put_failure; if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, - cpu_to_be64(jiffies_to_msecs(priv->timeout)), + nf_jiffies64_to_msecs(priv->timeout), NFTA_DYNSET_PAD)) goto nla_put_failure; - if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr)) - goto nla_put_failure; + if (priv->set->num_exprs == 0) { + if (priv->num_exprs == 1) { + if (nft_expr_dump(skb, NFTA_DYNSET_EXPR, + priv->expr_array[0])) + goto nla_put_failure; + } else if (priv->num_exprs > 1) { + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, NFTA_DYNSET_EXPRESSIONS); + if (!nest) + goto nla_put_failure; + + for (i = 0; i < priv->num_exprs; i++) { + if (nft_expr_dump(skb, NFTA_LIST_ELEM, + priv->expr_array[i])) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } + } if (nla_put_be32(skb, NFTA_DYNSET_FLAGS, htonl(flags))) goto 
nla_put_failure; return 0; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 3c48cdc8935d..f64f0017e9a5 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -19,8 +19,8 @@ struct nft_exthdr { u8 offset; u8 len; u8 op; - enum nft_registers dreg:8; - enum nft_registers sreg:8; + u8 dreg; + u8 sreg; u8 flags; }; @@ -350,12 +350,12 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; - priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]); priv->flags = flags; priv->op = op; - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); + return nft_parse_register_store(ctx, tb[NFTA_EXTHDR_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); } static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, @@ -400,11 +400,11 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; - priv->sreg = nft_parse_register(tb[NFTA_EXTHDR_SREG]); priv->flags = flags; priv->op = op; - return nft_validate_register_load(priv->sreg, priv->len); + return nft_parse_register_load(tb[NFTA_EXTHDR_SREG], &priv->sreg, + priv->len); } static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 4dfdaeaf09a5..b10ce732b337 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -86,7 +86,6 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; priv->result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT])); - priv->dreg = nft_parse_register(tb[NFTA_FIB_DREG]); switch (priv->result) { case NFT_FIB_RESULT_OIF: @@ -106,8 +105,8 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; } - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + err = 
nft_parse_register_store(ctx, tb[NFTA_FIB_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); if (err < 0) return err; diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index b77985986b24..cd59afde5b2f 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -18,7 +18,7 @@ #include <net/ip.h> struct nft_fwd_netdev { - enum nft_registers sreg_dev:8; + u8 sreg_dev; }; static void nft_fwd_netdev_eval(const struct nft_expr *expr, @@ -50,8 +50,8 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx, if (tb[NFTA_FWD_SREG_DEV] == NULL) return -EINVAL; - priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]); - return nft_validate_register_load(priv->sreg_dev, sizeof(int)); + return nft_parse_register_load(tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, + sizeof(int)); } static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -78,8 +78,8 @@ static int nft_fwd_netdev_offload(struct nft_offload_ctx *ctx, } struct nft_fwd_neigh { - enum nft_registers sreg_dev:8; - enum nft_registers sreg_addr:8; + u8 sreg_dev; + u8 sreg_addr; u8 nfproto; }; @@ -157,8 +157,6 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx, !tb[NFTA_FWD_NFPROTO]) return -EINVAL; - priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]); - priv->sreg_addr = nft_parse_register(tb[NFTA_FWD_SREG_ADDR]); priv->nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO])); switch (priv->nfproto) { @@ -172,11 +170,13 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - err = nft_validate_register_load(priv->sreg_dev, sizeof(int)); + err = nft_parse_register_load(tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, + sizeof(int)); if (err < 0) return err; - return nft_validate_register_load(priv->sreg_addr, addr_len); + return nft_parse_register_load(tb[NFTA_FWD_SREG_ADDR], &priv->sreg_addr, + addr_len); } static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git 
a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 96371d878e7e..f829f5289e16 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -14,8 +14,8 @@ #include <linux/jhash.h> struct nft_jhash { - enum nft_registers sreg:8; - enum nft_registers dreg:8; + u8 sreg; + u8 dreg; u8 len; bool autogen_seed:1; u32 modulus; @@ -38,7 +38,7 @@ static void nft_jhash_eval(const struct nft_expr *expr, } struct nft_symhash { - enum nft_registers dreg:8; + u8 dreg; u32 modulus; u32 offset; }; @@ -83,9 +83,6 @@ static int nft_jhash_init(const struct nft_ctx *ctx, if (tb[NFTA_HASH_OFFSET]) priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET])); - priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]); - priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); - err = nft_parse_u32_check(tb[NFTA_HASH_LEN], U8_MAX, &len); if (err < 0) return err; @@ -94,6 +91,10 @@ static int nft_jhash_init(const struct nft_ctx *ctx, priv->len = len; + err = nft_parse_register_load(tb[NFTA_HASH_SREG], &priv->sreg, len); + if (err < 0) + return err; + priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); if (priv->modulus < 1) return -ERANGE; @@ -108,9 +109,8 @@ static int nft_jhash_init(const struct nft_ctx *ctx, get_random_bytes(&priv->seed, sizeof(priv->seed)); } - return nft_validate_register_load(priv->sreg, len) && - nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + return nft_parse_register_store(ctx, tb[NFTA_HASH_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, sizeof(u32)); } static int nft_symhash_init(const struct nft_ctx *ctx, @@ -126,8 +126,6 @@ static int nft_symhash_init(const struct nft_ctx *ctx, if (tb[NFTA_HASH_OFFSET]) priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET])); - priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); - priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); if (priv->modulus < 1) return -ERANGE; @@ -135,8 +133,9 @@ static int nft_symhash_init(const struct nft_ctx *ctx, if (priv->offset + 
priv->modulus - 1 < priv->offset) return -EOVERFLOW; - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + return nft_parse_register_store(ctx, tb[NFTA_HASH_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + sizeof(u32)); } static int nft_jhash_dump(struct sk_buff *skb, diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index c63eb3b17178..90c64d27ae53 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -48,9 +48,9 @@ static int nft_immediate_init(const struct nft_ctx *ctx, priv->dlen = desc.len; - priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, &priv->data, - desc.type, desc.len); + err = nft_parse_register_store(ctx, tb[NFTA_IMMEDIATE_DREG], + &priv->dreg, &priv->data, desc.type, + desc.len); if (err < 0) goto err1; diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 57899454a530..a06a46b039c5 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -152,7 +152,7 @@ static int nft_log_init(const struct nft_ctx *ctx, priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL); if (priv->prefix == NULL) return -ENOMEM; - nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1); + nla_strscpy(priv->prefix, nla, nla_len(nla) + 1); } else { priv->prefix = (char *)nft_log_null_prefix; } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index f1363b8aabba..b0f558b4fea5 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -17,8 +17,8 @@ struct nft_lookup { struct nft_set *set; - enum nft_registers sreg:8; - enum nft_registers dreg:8; + u8 sreg; + u8 dreg; bool invert; struct nft_set_binding binding; }; @@ -76,8 +76,8 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (IS_ERR(set)) return PTR_ERR(set); - priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]); - err = nft_validate_register_load(priv->sreg, set->klen); + err = 
nft_parse_register_load(tb[NFTA_LOOKUP_SREG], &priv->sreg, + set->klen); if (err < 0) return err; @@ -100,9 +100,9 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (!(set->flags & NFT_SET_MAP)) return -EINVAL; - priv->dreg = nft_parse_register(tb[NFTA_LOOKUP_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - set->dtype, set->dlen); + err = nft_parse_register_store(ctx, tb[NFTA_LOOKUP_DREG], + &priv->dreg, NULL, set->dtype, + set->dlen); if (err < 0) return err; } else if (set->flags & NFT_SET_MAP) diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 71390b727040..9953e8053753 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -15,8 +15,8 @@ struct nft_masq { u32 flags; - enum nft_registers sreg_proto_min:8; - enum nft_registers sreg_proto_max:8; + u8 sreg_proto_min; + u8 sreg_proto_max; }; static const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { @@ -54,19 +54,15 @@ static int nft_masq_init(const struct nft_ctx *ctx, } if (tb[NFTA_MASQ_REG_PROTO_MIN]) { - priv->sreg_proto_min = - nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MIN]); - - err = nft_validate_register_load(priv->sreg_proto_min, plen); + err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MIN], + &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_MASQ_REG_PROTO_MAX]) { - priv->sreg_proto_max = - nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MAX]); - - err = nft_validate_register_load(priv->sreg_proto_max, - plen); + err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MAX], + &priv->sreg_proto_max, + plen); if (err < 0) return err; } else { diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index b37bd02448d8..a7e01e9952f1 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -535,9 +535,8 @@ int nft_meta_get_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->dreg = nft_parse_register(tb[NFTA_META_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - 
NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } EXPORT_SYMBOL_GPL(nft_meta_get_init); @@ -661,8 +660,7 @@ int nft_meta_set_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->sreg = nft_parse_register(tb[NFTA_META_SREG]); - err = nft_validate_register_load(priv->sreg, len); + err = nft_parse_register_load(tb[NFTA_META_SREG], &priv->sreg, len); if (err < 0) return err; @@ -724,22 +722,22 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx, switch (priv->key) { case NFT_META_PROTOCOL: - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto, - sizeof(__u16), reg); + NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto, + sizeof(__u16), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); break; case NFT_META_L4PROTO: - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, - sizeof(__u8), reg); + NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, + sizeof(__u8), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); break; case NFT_META_IIF: - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta, - ingress_ifindex, sizeof(__u32), reg); + NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_META, meta, + ingress_ifindex, sizeof(__u32), reg); break; case NFT_META_IIFTYPE: - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta, - ingress_iftype, sizeof(__u16), reg); + NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_META, meta, + ingress_iftype, sizeof(__u16), reg); break; default: return -EOPNOTSUPP; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 4bcf33b049c4..0840c635b752 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -21,10 +21,10 @@ #include <net/ip.h> struct nft_nat { - enum nft_registers sreg_addr_min:8; - enum nft_registers sreg_addr_max:8; - enum nft_registers sreg_proto_min:8; - enum nft_registers sreg_proto_max:8; + u8 sreg_addr_min; + u8 sreg_addr_max; + u8 sreg_proto_min; + u8 
sreg_proto_max; enum nf_nat_manip_type type:8; u8 family; u16 flags; @@ -206,18 +206,15 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->family = family; if (tb[NFTA_NAT_REG_ADDR_MIN]) { - priv->sreg_addr_min = - nft_parse_register(tb[NFTA_NAT_REG_ADDR_MIN]); - err = nft_validate_register_load(priv->sreg_addr_min, alen); + err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MIN], + &priv->sreg_addr_min, alen); if (err < 0) return err; if (tb[NFTA_NAT_REG_ADDR_MAX]) { - priv->sreg_addr_max = - nft_parse_register(tb[NFTA_NAT_REG_ADDR_MAX]); - - err = nft_validate_register_load(priv->sreg_addr_max, - alen); + err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MAX], + &priv->sreg_addr_max, + alen); if (err < 0) return err; } else { @@ -229,19 +226,15 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, plen = sizeof_field(struct nf_nat_range, min_addr.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { - priv->sreg_proto_min = - nft_parse_register(tb[NFTA_NAT_REG_PROTO_MIN]); - - err = nft_validate_register_load(priv->sreg_proto_min, plen); + err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MIN], + &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_NAT_REG_PROTO_MAX]) { - priv->sreg_proto_max = - nft_parse_register(tb[NFTA_NAT_REG_PROTO_MAX]); - - err = nft_validate_register_load(priv->sreg_proto_max, - plen); + err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MAX], + &priv->sreg_proto_max, + plen); if (err < 0) return err; } else { diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index f1fc824f9737..722cac1e90e0 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -16,7 +16,7 @@ static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state); struct nft_ng_inc { - enum nft_registers dreg:8; + u8 dreg; u32 modulus; atomic_t counter; u32 offset; @@ -66,11 +66,10 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, if (priv->offset + 
priv->modulus - 1 < priv->offset) return -EOVERFLOW; - priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); atomic_set(&priv->counter, priv->modulus - 1); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, sizeof(u32)); } static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, @@ -100,7 +99,7 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) } struct nft_ng_random { - enum nft_registers dreg:8; + u8 dreg; u32 modulus; u32 offset; }; @@ -140,10 +139,8 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, prandom_init_once(&nft_numgen_prandom_state); - priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); - - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, sizeof(u32)); } static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 5f9207a9f485..bc104d36d3bb 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -95,7 +95,7 @@ static const struct nft_expr_ops nft_objref_ops = { struct nft_objref_map { struct nft_set *set; - enum nft_registers sreg:8; + u8 sreg; struct nft_set_binding binding; }; @@ -137,8 +137,8 @@ static int nft_objref_map_init(const struct nft_ctx *ctx, if (!(set->flags & NFT_SET_OBJECT)) return -EINVAL; - priv->sreg = nft_parse_register(tb[NFTA_OBJREF_SET_SREG]); - err = nft_validate_register_load(priv->sreg, set->klen); + err = nft_parse_register_load(tb[NFTA_OBJREF_SET_SREG], &priv->sreg, + set->klen); if (err < 0) return err; diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index c261d57a666a..ac61f708b82d 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -6,7 +6,7 @@ #include 
<linux/netfilter/nfnetlink_osf.h> struct nft_osf { - enum nft_registers dreg:8; + u8 dreg; u8 ttl; u32 flags; }; @@ -78,9 +78,9 @@ static int nft_osf_init(const struct nft_ctx *ctx, priv->flags = flags; } - priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN); + err = nft_parse_register_store(ctx, tb[NFTA_OSF_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, + NFT_OSF_MAXGENRELEN); if (err < 0) return err; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index dcd3c7b8a367..cb1c8c231880 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -144,10 +144,10 @@ static int nft_payload_init(const struct nft_ctx *ctx, priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); - priv->dreg = nft_parse_register(tb[NFTA_PAYLOAD_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); + return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); } static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -165,6 +165,34 @@ nla_put_failure: return -1; } +static bool nft_payload_offload_mask(struct nft_offload_reg *reg, + u32 priv_len, u32 field_len) +{ + unsigned int remainder, delta, k; + struct nft_data mask = {}; + __be32 remainder_mask; + + if (priv_len == field_len) { + memset(®->mask, 0xff, priv_len); + return true; + } else if (priv_len > field_len) { + return false; + } + + memset(&mask, 0xff, field_len); + remainder = priv_len % sizeof(u32); + if (remainder) { + k = priv_len / sizeof(u32); + delta = field_len - priv_len; + remainder_mask = htonl(~((1 << (delta * BITS_PER_BYTE)) - 1)); + mask.data[k] = (__force u32)remainder_mask; + } + + memcpy(®->mask, &mask, field_len); + + return true; +} + static int 
nft_payload_offload_ll(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_payload *priv) @@ -173,21 +201,21 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct ethhdr, h_source): - if (priv->len != ETH_ALEN) + if (!nft_payload_offload_mask(reg, priv->len, ETH_ALEN)) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, src, ETH_ALEN, reg); break; case offsetof(struct ethhdr, h_dest): - if (priv->len != ETH_ALEN) + if (!nft_payload_offload_mask(reg, priv->len, ETH_ALEN)) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, dst, ETH_ALEN, reg); break; case offsetof(struct ethhdr, h_proto): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, @@ -195,14 +223,14 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); break; case offsetof(struct vlan_ethhdr, h_vlan_TCI): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, vlan_tci, sizeof(__be16), reg); break; case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, @@ -210,7 +238,7 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); break; case offsetof(struct vlan_ethhdr, h_vlan_TCI) + sizeof(struct vlan_hdr): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, @@ -218,7 +246,7 @@ static int nft_payload_offload_ll(struct 
nft_offload_ctx *ctx, break; case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) + sizeof(struct vlan_hdr): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, @@ -239,21 +267,25 @@ static int nft_payload_offload_ip(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct iphdr, saddr): - if (priv->len != sizeof(struct in_addr)) + if (!nft_payload_offload_mask(reg, priv->len, + sizeof(struct in_addr))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src, sizeof(struct in_addr), reg); + nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV4_ADDRS); break; case offsetof(struct iphdr, daddr): - if (priv->len != sizeof(struct in_addr)) + if (!nft_payload_offload_mask(reg, priv->len, + sizeof(struct in_addr))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst, sizeof(struct in_addr), reg); + nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV4_ADDRS); break; case offsetof(struct iphdr, protocol): - if (priv->len != sizeof(__u8)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__u8))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, @@ -275,21 +307,25 @@ static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct ipv6hdr, saddr): - if (priv->len != sizeof(struct in6_addr)) + if (!nft_payload_offload_mask(reg, priv->len, + sizeof(struct in6_addr))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src, sizeof(struct in6_addr), reg); + nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV6_ADDRS); break; case offsetof(struct ipv6hdr, daddr): - if (priv->len != sizeof(struct in6_addr)) + if (!nft_payload_offload_mask(reg, priv->len, + sizeof(struct in6_addr))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst, sizeof(struct 
in6_addr), reg); + nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV6_ADDRS); break; case offsetof(struct ipv6hdr, nexthdr): - if (priv->len != sizeof(__u8)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__u8))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, @@ -331,14 +367,14 @@ static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct tcphdr, source): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, sizeof(__be16), reg); break; case offsetof(struct tcphdr, dest): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, @@ -359,14 +395,14 @@ static int nft_payload_offload_udp(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct udphdr, source): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, sizeof(__be16), reg); break; case offsetof(struct udphdr, dest): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, @@ -622,7 +658,6 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); - priv->sreg = nft_parse_register(tb[NFTA_PAYLOAD_SREG]); if (tb[NFTA_PAYLOAD_CSUM_TYPE]) priv->csum_type = @@ -655,7 +690,8 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - return nft_validate_register_load(priv->sreg, priv->len); + return 
nft_parse_register_load(tb[NFTA_PAYLOAD_SREG], &priv->sreg, + priv->len); } static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index 23265d757acb..9ba1de51ac07 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -19,10 +19,10 @@ static u32 jhash_initval __read_mostly; struct nft_queue { - enum nft_registers sreg_qnum:8; - u16 queuenum; - u16 queues_total; - u16 flags; + u8 sreg_qnum; + u16 queuenum; + u16 queues_total; + u16 flags; }; static void nft_queue_eval(const struct nft_expr *expr, @@ -111,8 +111,8 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx, struct nft_queue *priv = nft_expr_priv(expr); int err; - priv->sreg_qnum = nft_parse_register(tb[NFTA_QUEUE_SREG_QNUM]); - err = nft_validate_register_load(priv->sreg_qnum, sizeof(u32)); + err = nft_parse_register_load(tb[NFTA_QUEUE_SREG_QNUM], + &priv->sreg_qnum, sizeof(u32)); if (err < 0) return err; diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 89efcc5a533d..e4a1c44d7f51 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -15,7 +15,7 @@ struct nft_range_expr { struct nft_data data_from; struct nft_data data_to; - enum nft_registers sreg:8; + u8 sreg; u8 len; enum nft_range_ops op:8; }; @@ -86,8 +86,8 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr goto err2; } - priv->sreg = nft_parse_register(tb[NFTA_RANGE_SREG]); - err = nft_validate_register_load(priv->sreg, desc_from.len); + err = nft_parse_register_load(tb[NFTA_RANGE_SREG], &priv->sreg, + desc_from.len); if (err < 0) goto err2; diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 2056051c0af0..ba09890dddb5 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -14,8 +14,8 @@ #include <net/netfilter/nf_tables.h> struct nft_redir { - enum nft_registers sreg_proto_min:8; - enum nft_registers sreg_proto_max:8; 
+ u8 sreg_proto_min; + u8 sreg_proto_max; u16 flags; }; @@ -50,19 +50,15 @@ static int nft_redir_init(const struct nft_ctx *ctx, plen = sizeof_field(struct nf_nat_range, min_addr.all); if (tb[NFTA_REDIR_REG_PROTO_MIN]) { - priv->sreg_proto_min = - nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MIN]); - - err = nft_validate_register_load(priv->sreg_proto_min, plen); + err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MIN], + &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_REDIR_REG_PROTO_MAX]) { - priv->sreg_proto_max = - nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MAX]); - - err = nft_validate_register_load(priv->sreg_proto_max, - plen); + err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MAX], + &priv->sreg_proto_max, + plen); if (err < 0) return err; } else { diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 61fb7e8afbf0..927ff8459bd9 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -40,6 +40,7 @@ int nft_reject_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); + int icmp_code; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; @@ -47,9 +48,17 @@ int nft_reject_init(const struct nft_ctx *ctx, priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: if (tb[NFTA_REJECT_ICMP_CODE] == NULL) return -EINVAL; - priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + + icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + if (priv->type == NFT_REJECT_ICMPX_UNREACH && + icmp_code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + priv->icmp_code = icmp_code; + break; case NFT_REJECT_TCP_RST: break; default: @@ -69,6 +78,7 @@ int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto 
nla_put_failure; break; diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index cf8f2646e93c..95090186ee90 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -28,7 +28,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt)); + nf_send_reset(nft_net(pkt), pkt->xt.state->sk, + pkt->skb, nft_hook(pkt)); break; case NFT_REJECT_ICMPX_UNREACH: nf_send_unreach(pkt->skb, @@ -44,7 +45,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt)); + nf_send_reset6(nft_net(pkt), pkt->xt.state->sk, + pkt->skb, nft_hook(pkt)); break; case NFT_REJECT_ICMPX_UNREACH: nf_send_unreach6(nft_net(pkt), pkt->skb, @@ -58,60 +60,16 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, regs->verdict.code = NF_DROP; } -static int nft_reject_inet_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_reject_inet_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) { - struct nft_reject *priv = nft_expr_priv(expr); - int icmp_code; - - if (tb[NFTA_REJECT_TYPE] == NULL) - return -EINVAL; - - priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - case NFT_REJECT_ICMPX_UNREACH: - if (tb[NFTA_REJECT_ICMP_CODE] == NULL) - return -EINVAL; - - icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); - if (priv->type == NFT_REJECT_ICMPX_UNREACH && - icmp_code > NFT_REJECT_ICMPX_MAX) - return -EINVAL; - - priv->icmp_code = icmp_code; - break; - case NFT_REJECT_TCP_RST: - break; - default: - return -EINVAL; - } - return 0; -} - -static int nft_reject_inet_dump(struct sk_buff *skb, - const struct nft_expr *expr) -{ - const struct 
nft_reject *priv = nft_expr_priv(expr); - - if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) - goto nla_put_failure; - - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - case NFT_REJECT_ICMPX_UNREACH: - if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) - goto nla_put_failure; - break; - default: - break; - } - - return 0; - -nla_put_failure: - return -1; + return nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_INGRESS)); } static struct nft_expr_type nft_reject_inet_type; @@ -119,9 +77,9 @@ static const struct nft_expr_ops nft_reject_inet_ops = { .type = &nft_reject_inet_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), .eval = nft_reject_inet_eval, - .init = nft_reject_inet_init, - .dump = nft_reject_inet_dump, - .validate = nft_reject_validate, + .init = nft_reject_init, + .dump = nft_reject_dump, + .validate = nft_reject_inet_validate, }; static struct nft_expr_type nft_reject_inet_type __read_mostly = { diff --git a/net/netfilter/nft_reject_netdev.c b/net/netfilter/nft_reject_netdev.c new file mode 100644 index 000000000000..d89f68754f42 --- /dev/null +++ b/net/netfilter/nft_reject_netdev.c @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020 Laura Garcia Liebana <nevola@gmail.com> + * Copyright (c) 2020 Jose M. 
Guisado <guigom@riseup.net> + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> +#include <net/netfilter/ipv4/nf_reject.h> +#include <net/netfilter/ipv6/nf_reject.h> + +static void nft_reject_queue_xmit(struct sk_buff *nskb, struct sk_buff *oldskb) +{ + dev_hard_header(nskb, nskb->dev, ntohs(oldskb->protocol), + eth_hdr(oldskb)->h_source, eth_hdr(oldskb)->h_dest, + nskb->len); + dev_queue_xmit(nskb); +} + +static void nft_reject_netdev_send_v4_tcp_reset(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook) +{ + struct sk_buff *nskb; + + nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, dev, hook); + if (!nskb) + return; + + nft_reject_queue_xmit(nskb, oldskb); +} + +static void nft_reject_netdev_send_v4_unreach(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook, u8 code) +{ + struct sk_buff *nskb; + + nskb = nf_reject_skb_v4_unreach(net, oldskb, dev, hook, code); + if (!nskb) + return; + + nft_reject_queue_xmit(nskb, oldskb); +} + +static void nft_reject_netdev_send_v6_tcp_reset(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook) +{ + struct sk_buff *nskb; + + nskb = nf_reject_skb_v6_tcp_reset(net, oldskb, dev, hook); + if (!nskb) + return; + + nft_reject_queue_xmit(nskb, oldskb); +} + + +static void nft_reject_netdev_send_v6_unreach(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook, u8 code) +{ + struct sk_buff *nskb; + + nskb = nf_reject_skb_v6_unreach(net, oldskb, dev, hook, code); + if (!nskb) + return; + + nft_reject_queue_xmit(nskb, oldskb); +} + +static void nft_reject_netdev_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct ethhdr *eth = eth_hdr(pkt->skb); + struct 
nft_reject *priv = nft_expr_priv(expr); + const unsigned char *dest = eth->h_dest; + + if (is_broadcast_ether_addr(dest) || + is_multicast_ether_addr(dest)) + goto out; + + switch (eth->h_proto) { + case htons(ETH_P_IP): + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nft_reject_netdev_send_v4_unreach(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt), + priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nft_reject_netdev_send_v4_tcp_reset(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt)); + break; + case NFT_REJECT_ICMPX_UNREACH: + nft_reject_netdev_send_v4_unreach(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt), + nft_reject_icmp_code(priv->icmp_code)); + break; + } + break; + case htons(ETH_P_IPV6): + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nft_reject_netdev_send_v6_unreach(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt), + priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nft_reject_netdev_send_v6_tcp_reset(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt)); + break; + case NFT_REJECT_ICMPX_UNREACH: + nft_reject_netdev_send_v6_unreach(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt), + nft_reject_icmpv6_code(priv->icmp_code)); + break; + } + break; + default: + /* No explicit way to reject this protocol, drop it. 
*/ + break; + } +out: + regs->verdict.code = NF_DROP; +} + +static int nft_reject_netdev_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS)); +} + +static struct nft_expr_type nft_reject_netdev_type; +static const struct nft_expr_ops nft_reject_netdev_ops = { + .type = &nft_reject_netdev_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_netdev_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, + .validate = nft_reject_netdev_validate, +}; + +static struct nft_expr_type nft_reject_netdev_type __read_mostly = { + .family = NFPROTO_NETDEV, + .name = "reject", + .ops = &nft_reject_netdev_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_netdev_module_init(void) +{ + return nft_register_expr(&nft_reject_netdev_type); +} + +static void __exit nft_reject_netdev_module_exit(void) +{ + nft_unregister_expr(&nft_reject_netdev_type); +} + +module_init(nft_reject_netdev_module_init); +module_exit(nft_reject_netdev_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Laura Garcia Liebana <nevola@gmail.com>"); +MODULE_AUTHOR("Jose M. 
Guisado <guigom@riseup.net>"); +MODULE_DESCRIPTION("Reject packets from netdev via nftables"); +MODULE_ALIAS_NFT_AF_EXPR(5, "reject"); diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 7cfcb0e2f7ee..bcd01a63e38f 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -15,7 +15,7 @@ struct nft_rt { enum nft_rt_keys key:8; - enum nft_registers dreg:8; + u8 dreg; }; static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skbdst) @@ -141,9 +141,8 @@ static int nft_rt_get_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->dreg = nft_parse_register(tb[NFTA_RT_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_RT_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } static int nft_rt_get_dump(struct sk_buff *skb, diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 4d3f147e8d8d..bf618b7ec1ae 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -293,6 +293,22 @@ cont: rhashtable_walk_exit(&hti); } +static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, + struct nft_set_ext *ext) +{ + struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext); + struct nft_expr *expr; + u32 size; + + nft_setelem_expr_foreach(expr, elem_expr, size) { + if (expr->ops->gc && + expr->ops->gc(read_pnet(&set->net), expr)) + return true; + } + + return false; +} + static void nft_rhash_gc(struct work_struct *work) { struct nft_set *set; @@ -314,16 +330,13 @@ static void nft_rhash_gc(struct work_struct *work) continue; } - if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) { - struct nft_expr *expr = nft_set_ext_expr(&he->ext); + if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) && + nft_rhash_expr_needs_gc_run(set, &he->ext)) + goto needs_gc_run; - if (expr->ops->gc && - expr->ops->gc(read_pnet(&set->net), expr)) - goto gc; - } if (!nft_set_elem_expired(&he->ext)) continue; -gc: 
+needs_gc_run: if (nft_set_elem_mark_busy(&he->ext)) continue; diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index a28aca5124ce..c9b8a2b03b71 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -10,7 +10,7 @@ struct nft_socket { enum nft_socket_keys key:8; union { - enum nft_registers dreg:8; + u8 dreg; }; }; @@ -133,9 +133,8 @@ static int nft_socket_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->dreg = nft_parse_register(tb[NFTA_SOCKET_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_SOCKET_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } static int nft_socket_dump(struct sk_buff *skb, diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index d67f83a0958d..43a5a780a6d3 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -13,9 +13,9 @@ #endif struct nft_tproxy { - enum nft_registers sreg_addr:8; - enum nft_registers sreg_port:8; - u8 family; + u8 sreg_addr; + u8 sreg_port; + u8 family; }; static void nft_tproxy_eval_v4(const struct nft_expr *expr, @@ -247,15 +247,15 @@ static int nft_tproxy_init(const struct nft_ctx *ctx, } if (tb[NFTA_TPROXY_REG_ADDR]) { - priv->sreg_addr = nft_parse_register(tb[NFTA_TPROXY_REG_ADDR]); - err = nft_validate_register_load(priv->sreg_addr, alen); + err = nft_parse_register_load(tb[NFTA_TPROXY_REG_ADDR], + &priv->sreg_addr, alen); if (err < 0) return err; } if (tb[NFTA_TPROXY_REG_PORT]) { - priv->sreg_port = nft_parse_register(tb[NFTA_TPROXY_REG_PORT]); - err = nft_validate_register_load(priv->sreg_port, sizeof(u16)); + err = nft_parse_register_load(tb[NFTA_TPROXY_REG_PORT], + &priv->sreg_port, sizeof(u16)); if (err < 0) return err; } diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index d3eb953d0333..3b27926d5382 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -15,7 +15,7 @@ struct nft_tunnel 
{ enum nft_tunnel_keys key:8; - enum nft_registers dreg:8; + u8 dreg; enum nft_tunnel_mode mode:8; }; @@ -93,8 +93,6 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->dreg = nft_parse_register(tb[NFTA_TUNNEL_DREG]); - if (tb[NFTA_TUNNEL_MODE]) { priv->mode = ntohl(nla_get_be32(tb[NFTA_TUNNEL_MODE])); if (priv->mode > NFT_TUNNEL_MODE_MAX) @@ -103,8 +101,8 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx, priv->mode = NFT_TUNNEL_MODE_NONE; } - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_TUNNEL_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } static int nft_tunnel_get_dump(struct sk_buff *skb, diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index 06d5cabf1d7c..cbbbc4ecad3a 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -24,7 +24,7 @@ static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = { struct nft_xfrm { enum nft_xfrm_keys key:8; - enum nft_registers dreg:8; + u8 dreg; u8 dir; u8 spnum; }; @@ -86,9 +86,8 @@ static int nft_xfrm_get_init(const struct nft_ctx *ctx, priv->spnum = spnum; - priv->dreg = nft_parse_register(tb[NFTA_XFRM_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_XFRM_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } /* Return true if key asks for daddr/saddr and current diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index af22dbe85e2c..acce622582e3 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1349,6 +1349,14 @@ struct xt_counters *xt_counters_alloc(unsigned int counters) } EXPORT_SYMBOL(xt_counters_alloc); +struct xt_table_info +*xt_table_get_private_protected(const struct xt_table *table) +{ + return rcu_dereference_protected(table->private, + mutex_is_locked(&xt[table->af].mutex)); +} 
+EXPORT_SYMBOL(xt_table_get_private_protected); + struct xt_table_info * xt_replace_table(struct xt_table *table, unsigned int num_counters, @@ -1356,7 +1364,6 @@ xt_replace_table(struct xt_table *table, int *error) { struct xt_table_info *private; - unsigned int cpu; int ret; ret = xt_jumpstack_alloc(newinfo); @@ -1366,47 +1373,20 @@ xt_replace_table(struct xt_table *table, } /* Do the substitution. */ - local_bh_disable(); - private = table->private; + private = xt_table_get_private_protected(table); /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { pr_debug("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); - local_bh_enable(); *error = -EAGAIN; return NULL; } newinfo->initial_entries = private->initial_entries; - /* - * Ensure contents of newinfo are visible before assigning to - * private. - */ - smp_wmb(); - table->private = newinfo; - - /* make sure all cpus see new ->private value */ - smp_wmb(); - /* - * Even though table entries have now been swapped, other CPU's - * may still be using the old entries... - */ - local_bh_enable(); - - /* ... so wait for even xt_recseq on all cpus */ - for_each_possible_cpu(cpu) { - seqcount_t *s = &per_cpu(xt_recseq, cpu); - u32 seq = raw_read_seqcount(s); - - if (seq & 1) { - do { - cond_resched(); - cpu_relax(); - } while (seq == raw_read_seqcount(s)); - } - } + rcu_assign_pointer(table->private, newinfo); + synchronize_rcu(); audit_log_nfcfg(table->name, table->af, private->number, !private->number ? AUDIT_XT_OP_REGISTER : @@ -1442,12 +1422,12 @@ struct xt_table *xt_register_table(struct net *net, } /* Simplifies replace_table code. 
*/ - table->private = bootstrap; + rcu_assign_pointer(table->private, bootstrap); if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; - private = table->private; + private = xt_table_get_private_protected(table); pr_debug("table->private->number = %u\n", private->number); /* save number of initial entries */ @@ -1470,7 +1450,8 @@ void *xt_unregister_table(struct xt_table *table) struct xt_table_info *private; mutex_lock(&xt[table->af].mutex); - private = table->private; + private = xt_table_get_private_protected(table); + RCU_INIT_POINTER(table->private, NULL); list_del(&table->list); mutex_unlock(&xt[table->af].mutex); audit_log_nfcfg(table->name, table->af, private->number, diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 37253d399c6b..0d5c422f8745 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -115,6 +115,9 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) } cfg; int ret; + if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name)) + return -ENAMETOOLONG; + net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); mutex_lock(&xn->hash_lock); diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index a97c2259bbc8..7c6bf1c16813 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -27,7 +27,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par) overquota = nfnl_acct_overquota(xt_net(par), info->nfacct); - return overquota == NFACCT_UNDERQUOTA ? false : true; + return overquota != NFACCT_UNDERQUOTA; } static int diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 606411869698..0446307516cd 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -152,7 +152,8 @@ static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) /* * Drop entries with timestamps older then 'time'. 
*/ -static void recent_entry_reap(struct recent_table *t, unsigned long time) +static void recent_entry_reap(struct recent_table *t, unsigned long time, + struct recent_entry *working, bool update) { struct recent_entry *e; @@ -162,6 +163,12 @@ static void recent_entry_reap(struct recent_table *t, unsigned long time) e = list_entry(t->lru_list.next, struct recent_entry, lru_list); /* + * Do not reap the entry which are going to be updated. + */ + if (e == working && update) + return; + + /* * The last time stamp is the most recent. */ if (time_after(time, e->stamps[e->index-1])) @@ -303,7 +310,8 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par) /* info->seconds must be non-zero */ if (info->check_set & XT_RECENT_REAP) - recent_entry_reap(t, time); + recent_entry_reap(t, time, e, + info->check_set & XT_RECENT_UPDATE && ret); } if (info->check_set & XT_RECENT_SET || diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 4e62f2ad3575..f28c8947c730 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -366,6 +366,7 @@ static const struct netlbl_calipso_ops *calipso_ops; /** * netlbl_calipso_ops_register - Register the CALIPSO operations + * @ops: ops to register * * Description: * Register the CALIPSO packet engine operations. 
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index eb1d66d20afb..df1b41ed73fd 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -95,7 +95,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = -ENOMEM; goto add_free_entry; } - nla_strlcpy(entry->domain, + nla_strscpy(entry->domain, info->attrs[NLBL_MGMT_A_DOMAIN], tmp_size); } diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 2e8e3f7b2111..ccb491642811 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1166,12 +1166,13 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, struct netlbl_unlhsh_walk_arg cb_arg; u32 skip_bkt = cb->args[0]; u32 skip_chain = cb->args[1]; - u32 iter_bkt; - u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; + u32 skip_addr4 = cb->args[2]; + u32 iter_bkt, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; struct netlbl_unlhsh_iface *iface; struct list_head *iter_list; struct netlbl_af4list *addr4; #if IS_ENABLED(CONFIG_IPV6) + u32 skip_addr6 = cb->args[3]; struct netlbl_af6list *addr6; #endif @@ -1182,7 +1183,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, rcu_read_lock(); for (iter_bkt = skip_bkt; iter_bkt < rcu_dereference(netlbl_unlhsh)->size; - iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) { + iter_bkt++) { iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt]; list_for_each_entry_rcu(iface, iter_list, list) { if (!iface->valid || @@ -1190,7 +1191,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, continue; netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { - if (iter_addr4++ < cb->args[2]) + if (iter_addr4++ < skip_addr4) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, @@ -1203,10 +1204,12 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, goto unlabel_staticlist_return; } } + iter_addr4 = 0; + skip_addr4 = 0; #if IS_ENABLED(CONFIG_IPV6) 
netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { - if (iter_addr6++ < cb->args[3]) + if (iter_addr6++ < skip_addr6) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, @@ -1219,8 +1222,12 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, goto unlabel_staticlist_return; } } + iter_addr6 = 0; + skip_addr6 = 0; #endif /* IPv6 */ } + iter_chain = 0; + skip_chain = 0; } unlabel_staticlist_return: diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index daca50d6bb12..dd488938447f 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -67,6 +67,8 @@ #include <net/sock.h> #include <net/scm.h> #include <net/netlink.h> +#define CREATE_TRACE_POINTS +#include <trace/events/netlink.h> #include "af_netlink.h" @@ -147,6 +149,12 @@ static BLOCKING_NOTIFIER_HEAD(netlink_chain); static const struct rhashtable_params netlink_rhashtable_params; +void do_trace_netlink_extack(const char *msg) +{ + trace_netlink_extack(msg); +} +EXPORT_SYMBOL(do_trace_netlink_extack); + static inline u32 netlink_group_mask(u32 group) { return group ? 
1 << (group - 1) : 0; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index c992424e4d63..2d6fdf40df66 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1360,11 +1360,43 @@ static struct genl_family genl_ctrl __ro_after_init = { .netnsok = true, }; +static int genl_bind(struct net *net, int group) +{ + const struct genl_family *family; + unsigned int id; + int ret = 0; + + genl_lock_all(); + + idr_for_each_entry(&genl_fam_idr, family, id) { + const struct genl_multicast_group *grp; + int i; + + if (family->n_mcgrps == 0) + continue; + + i = group - family->mcgrp_offset; + if (i < 0 || i >= family->n_mcgrps) + continue; + + grp = &family->mcgrps[i]; + if ((grp->flags & GENL_UNS_ADMIN_PERM) && + !ns_capable(net->user_ns, CAP_NET_ADMIN)) + ret = -EPERM; + + break; + } + + genl_unlock_all(); + return ret; +} + static int __net_init genl_pernet_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = genl_rcv, .flags = NL_CFG_F_NONROOT_RECV, + .bind = genl_bind, }; /* we'll bump the group number right afterwards */ diff --git a/net/nfc/Kconfig b/net/nfc/Kconfig index 9b27599870e3..466a0279b93e 100644 --- a/net/nfc/Kconfig +++ b/net/nfc/Kconfig @@ -1,10 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only # -# NFC sybsystem configuration +# NFC subsystem configuration # menuconfig NFC - depends on NET depends on RFKILL || !RFKILL tristate "NFC subsystem support" default n diff --git a/net/nfc/core.c b/net/nfc/core.c index eb377f87bcae..573c80c6ff7a 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -189,7 +189,8 @@ static const struct rfkill_ops nfc_rfkill_ops = { * nfc_start_poll - start polling for nfc targets * * @dev: The nfc device that must start polling - * @protocols: bitset of nfc protocols that must be used for polling + * @im_protocols: bitset of nfc initiator protocols to be used for polling + * @tm_protocols: bitset of nfc transport protocols to be used for polling * * The device remains polling for targets until a 
target is found or * the nfc_stop_poll function is called. @@ -436,6 +437,7 @@ error: * * @dev: The nfc device that found the target * @target_idx: index of the target that must be deactivated + * @mode: idle or sleep? */ int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx, u8 mode) { @@ -703,7 +705,11 @@ EXPORT_SYMBOL(nfc_tm_deactivated); /** * nfc_alloc_send_skb - allocate a skb for data exchange responses * + * @dev: device sending the response + * @sk: socket sending the response + * @flags: MSG_DONTWAIT flag * @size: size to allocate + * @err: pointer to memory to store the error code */ struct sk_buff *nfc_alloc_send_skb(struct nfc_dev *dev, struct sock *sk, unsigned int flags, unsigned int size, @@ -1039,6 +1045,8 @@ struct nfc_dev *nfc_get_device(unsigned int idx) * * @ops: device operations * @supported_protocols: NFC protocols supported by the device + * @tx_headroom: reserved space at beginning of skb + * @tx_tailroom: reserved space at end of skb */ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, u32 supported_protocols, diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index e3599ed4a7a8..da7e2112771f 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -458,6 +458,9 @@ static void digital_add_poll_tech(struct nfc_digital_dev *ddev, u8 rf_tech, /** * start_poll operation + * @nfc_dev: device to be polled + * @im_protocols: bitset of nfc initiator protocols to be used for polling + * @tm_protocols: bitset of nfc transport protocols to be used for polling * * For every supported protocol, the corresponding polling function is added * to the table of polling technologies (ddev->poll_techs[]) using diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index 0eb4ddc056e7..c0c8fea3a186 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -236,7 +236,7 @@ static void llc_shdlc_rcv_i_frame(struct llc_shdlc *shdlc, goto exit; } - if (shdlc->t1_active == false) { + if (!shdlc->t1_active) { 
shdlc->t1_active = true; mod_timer(&shdlc->t1_timer, jiffies + msecs_to_jiffies(SHDLC_T1_VALUE_MS(shdlc->w))); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 741da8f81c2b..59257400697d 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -165,7 +165,12 @@ static void nci_reset_req(struct nci_dev *ndev, unsigned long opt) static void nci_init_req(struct nci_dev *ndev, unsigned long opt) { - nci_send_cmd(ndev, NCI_OP_CORE_INIT_CMD, 0, NULL); + u8 plen = 0; + + if (opt) + plen = sizeof(struct nci_core_init_v2_cmd); + + nci_send_cmd(ndev, NCI_OP_CORE_INIT_CMD, plen, (void *)opt); } static void nci_init_complete_req(struct nci_dev *ndev, unsigned long opt) @@ -497,7 +502,16 @@ static int nci_open_device(struct nci_dev *ndev) } if (!rc) { - rc = __nci_request(ndev, nci_init_req, 0, + struct nci_core_init_v2_cmd nci_init_v2_cmd = { + .feature1 = NCI_FEATURE_DISABLE, + .feature2 = NCI_FEATURE_DISABLE + }; + unsigned long opt = 0; + + if (ndev->nci_ver & NCI_VER_2_MASK) + opt = (unsigned long)&nci_init_v2_cmd; + + rc = __nci_request(ndev, nci_init_req, opt, msecs_to_jiffies(NCI_INIT_TIMEOUT)); } @@ -565,11 +579,11 @@ static int nci_close_device(struct nci_dev *ndev) clear_bit(NCI_INIT, &ndev->flags); - del_timer_sync(&ndev->cmd_timer); - /* Flush cmd wq */ flush_workqueue(ndev->cmd_wq); + del_timer_sync(&ndev->cmd_timer); + /* Clear flags */ ndev->flags = 0; @@ -1112,6 +1126,8 @@ static struct nfc_ops nci_nfc_ops = { * * @ops: device operations * @supported_protocols: NFC protocols supported by the device + * @tx_headroom: Reserved space at beginning of skb + * @tx_tailroom: Reserved space at end of skb */ struct nci_dev *nci_allocate_device(struct nci_ops *ops, __u32 supported_protocols, diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index c18e76d6d8ba..6b275a387a92 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -363,16 +363,13 @@ exit: } static void nci_hci_resp_received(struct nci_dev *ndev, u8 pipe, - u8 result, struct sk_buff *skb) + 
struct sk_buff *skb) { struct nci_conn_info *conn_info; - u8 status = result; conn_info = ndev->hci_dev->conn_info; - if (!conn_info) { - status = NCI_STATUS_REJECTED; + if (!conn_info) goto exit; - } conn_info->rx_skb = skb; @@ -388,7 +385,7 @@ static void nci_hci_hcp_message_rx(struct nci_dev *ndev, u8 pipe, { switch (type) { case NCI_HCI_HCP_RESPONSE: - nci_hci_resp_received(ndev, pipe, instruction, skb); + nci_hci_resp_received(ndev, pipe, skb); break; case NCI_HCI_HCP_COMMAND: nci_hci_cmd_received(ndev, pipe, instruction, skb); diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 33e1170817f0..98af04c86b2c 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -27,6 +27,23 @@ /* Handle NCI Notification packets */ +static void nci_core_reset_ntf_packet(struct nci_dev *ndev, + struct sk_buff *skb) +{ + /* Handle NCI 2.x core reset notification */ + struct nci_core_reset_ntf *ntf = (void *)skb->data; + + ndev->nci_ver = ntf->nci_ver; + pr_debug("nci_ver 0x%x, config_status 0x%x\n", + ntf->nci_ver, ntf->config_status); + + ndev->manufact_id = ntf->manufact_id; + ndev->manufact_specific_info = + __le32_to_cpu(ntf->manufact_specific_info); + + nci_req_complete(ndev, NCI_STATUS_OK); +} + static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) { @@ -756,6 +773,10 @@ void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) } switch (ntf_opcode) { + case NCI_OP_CORE_RESET_NTF: + nci_core_reset_ntf_packet(ndev, skb); + break; + case NCI_OP_CORE_CONN_CREDITS_NTF: nci_core_conn_credits_ntf_packet(ndev, skb); break; diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c index a48297b79f34..e9605922a322 100644 --- a/net/nfc/nci/rsp.c +++ b/net/nfc/nci/rsp.c @@ -31,16 +31,19 @@ static void nci_core_reset_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) pr_debug("status 0x%x\n", rsp->status); - if (rsp->status == NCI_STATUS_OK) { - ndev->nci_ver = rsp->nci_ver; - pr_debug("nci_ver 0x%x, config_status 0x%x\n", - rsp->nci_ver, 
rsp->config_status); - } + /* Handle NCI 1.x ver */ + if (skb->len != 1) { + if (rsp->status == NCI_STATUS_OK) { + ndev->nci_ver = rsp->nci_ver; + pr_debug("nci_ver 0x%x, config_status 0x%x\n", + rsp->nci_ver, rsp->config_status); + } - nci_req_complete(ndev, rsp->status); + nci_req_complete(ndev, rsp->status); + } } -static void nci_core_init_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) +static u8 nci_core_init_rsp_packet_v1(struct nci_dev *ndev, struct sk_buff *skb) { struct nci_core_init_rsp_1 *rsp_1 = (void *) skb->data; struct nci_core_init_rsp_2 *rsp_2; @@ -48,16 +51,14 @@ static void nci_core_init_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) pr_debug("status 0x%x\n", rsp_1->status); if (rsp_1->status != NCI_STATUS_OK) - goto exit; + return rsp_1->status; ndev->nfcc_features = __le32_to_cpu(rsp_1->nfcc_features); ndev->num_supported_rf_interfaces = rsp_1->num_supported_rf_interfaces; - if (ndev->num_supported_rf_interfaces > - NCI_MAX_SUPPORTED_RF_INTERFACES) { - ndev->num_supported_rf_interfaces = - NCI_MAX_SUPPORTED_RF_INTERFACES; - } + ndev->num_supported_rf_interfaces = + min((int)ndev->num_supported_rf_interfaces, + NCI_MAX_SUPPORTED_RF_INTERFACES); memcpy(ndev->supported_rf_interfaces, rsp_1->supported_rf_interfaces, @@ -77,6 +78,58 @@ static void nci_core_init_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) ndev->manufact_specific_info = __le32_to_cpu(rsp_2->manufact_specific_info); + return NCI_STATUS_OK; +} + +static u8 nci_core_init_rsp_packet_v2(struct nci_dev *ndev, struct sk_buff *skb) +{ + struct nci_core_init_rsp_nci_ver2 *rsp = (void *)skb->data; + u8 *supported_rf_interface = rsp->supported_rf_interfaces; + u8 rf_interface_idx = 0; + u8 rf_extension_cnt = 0; + + pr_debug("status %x\n", rsp->status); + + if (rsp->status != NCI_STATUS_OK) + return rsp->status; + + ndev->nfcc_features = __le32_to_cpu(rsp->nfcc_features); + ndev->num_supported_rf_interfaces = rsp->num_supported_rf_interfaces; + + 
ndev->num_supported_rf_interfaces = + min((int)ndev->num_supported_rf_interfaces, + NCI_MAX_SUPPORTED_RF_INTERFACES); + + while (rf_interface_idx < ndev->num_supported_rf_interfaces) { + ndev->supported_rf_interfaces[rf_interface_idx++] = *supported_rf_interface++; + + /* skip rf extension parameters */ + rf_extension_cnt = *supported_rf_interface++; + supported_rf_interface += rf_extension_cnt; + } + + ndev->max_logical_connections = rsp->max_logical_connections; + ndev->max_routing_table_size = + __le16_to_cpu(rsp->max_routing_table_size); + ndev->max_ctrl_pkt_payload_len = + rsp->max_ctrl_pkt_payload_len; + ndev->max_size_for_large_params = NCI_MAX_LARGE_PARAMS_NCI_v2; + + return NCI_STATUS_OK; +} + +static void nci_core_init_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) +{ + u8 status = 0; + + if (!(ndev->nci_ver & NCI_VER_2_MASK)) + status = nci_core_init_rsp_packet_v1(ndev, skb); + else + status = nci_core_init_rsp_packet_v2(ndev, skb); + + if (status != NCI_STATUS_OK) + goto exit; + pr_debug("nfcc_features 0x%x\n", ndev->nfcc_features); pr_debug("num_supported_rf_interfaces %d\n", @@ -103,7 +156,7 @@ static void nci_core_init_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) ndev->manufact_specific_info); exit: - nci_req_complete(ndev, rsp_1->status); + nci_req_complete(ndev, status); } static void nci_core_set_config_rsp_packet(struct nci_dev *ndev, diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index 11b554ce07ff..1204c438e87d 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -292,7 +292,8 @@ static int nci_uart_tty_ioctl(struct tty_struct *tty, struct file *file, /* We don't provide read/write/poll interface for user space. 
*/ static ssize_t nci_uart_tty_read(struct tty_struct *tty, struct file *file, - unsigned char __user *buf, size_t nr) + unsigned char *buf, size_t nr, + void **cookie, unsigned long offset) { return 0; } diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 8709f3d4e7c4..722f7ef891e1 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -852,6 +852,7 @@ static int nfc_genl_stop_poll(struct sk_buff *skb, struct genl_info *info) if (!dev->polling) { device_unlock(&dev->dev); + nfc_put_device(dev); return -EINVAL; } @@ -1226,7 +1227,7 @@ static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info) if (!dev) return -ENODEV; - nla_strlcpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME], + nla_strscpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME], sizeof(firmware_name)); rc = nfc_fw_download(dev, firmware_name); @@ -1819,9 +1820,9 @@ static int nfc_genl_rcv_nl_event(struct notifier_block *this, w = kmalloc(sizeof(*w), GFP_ATOMIC); if (w) { - INIT_WORK((struct work_struct *) w, nfc_urelease_event_work); + INIT_WORK(&w->w, nfc_urelease_event_work); w->portid = n->portid; - schedule_work((struct work_struct *) w); + schedule_work(&w->w); } out: diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 955c195ae14b..9c7eb8455ba8 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -105,7 +105,7 @@ static int rawsock_connect(struct socket *sock, struct sockaddr *_addr, if (addr->target_idx > dev->target_next_idx - 1 || addr->target_idx < dev->target_next_idx - dev->n_targets) { rc = -EINVAL; - goto error; + goto put_dev; } rc = nfc_activate_target(dev, addr->target_idx, addr->nfc_protocol); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index b87bfc82f44f..92a0b67b2728 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -199,6 +199,9 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, __be32 lse; int err; + if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) + return 
-ENOMEM; + stack = mpls_hdr(skb); lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask); err = skb_mpls_update_lse(skb, lse); @@ -954,19 +957,15 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, static int dec_ttl_exception_handler(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, - const struct nlattr *attr, bool last) + const struct nlattr *attr) { - /* The first action is always 'OVS_DEC_TTL_ATTR_ARG'. */ - struct nlattr *dec_ttl_arg = nla_data(attr); - int rem = nla_len(attr); + /* The first attribute is always 'OVS_DEC_TTL_ATTR_ACTION'. */ + struct nlattr *actions = nla_data(attr); - if (nla_len(dec_ttl_arg)) { - struct nlattr *actions = nla_next(dec_ttl_arg, &rem); + if (nla_len(actions)) + return clone_execute(dp, skb, key, 0, nla_data(actions), + nla_len(actions), true, false); - if (actions) - return clone_execute(dp, skb, key, 0, actions, rem, - last, false); - } consume_skb(skb); return 0; } @@ -1210,7 +1209,7 @@ static int execute_dec_ttl(struct sk_buff *skb, struct sw_flow_key *key) return -EHOSTUNREACH; key->ip.ttl = --nh->hop_limit; - } else { + } else if (skb->protocol == htons(ETH_P_IP)) { struct iphdr *nh; u8 old_ttl; @@ -1419,11 +1418,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_DEC_TTL: err = execute_dec_ttl(skb, key); - if (err == -EHOSTUNREACH) { - err = dec_ttl_exception_handler(dp, skb, key, - a, true); - return err; - } + if (err == -EHOSTUNREACH) + return dec_ttl_exception_handler(dp, skb, + key, a); break; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 4beb96139d77..5eddfe7bd391 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1037,6 +1037,14 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, ovs_ct_helper(skb, info->family) != NF_ACCEPT) { return -EINVAL; } + + if (nf_ct_protonum(ct) == IPPROTO_TCP && + nf_ct_is_confirmed(ct) && 
nf_conntrack_tcp_established(ct)) { + /* Be liberal for tcp packets so that out-of-window + * packets are not marked invalid. + */ + nf_ct_set_tcp_be_liberal(ct); + } } return 0; @@ -2025,15 +2033,11 @@ static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info, struct sk_buff *reply) { struct ovs_zone_limit zone_limit; - int err; zone_limit.zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE; zone_limit.limit = info->default_limit; - err = nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); - if (err) - return err; - return 0; + return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); } static int __ovs_ct_limit_get_zone_limit(struct net *net, diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index b03d142ec82e..c7f34d6a9934 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -294,6 +294,10 @@ static bool icmp6hdr_ok(struct sk_buff *skb) /** * Parse vlan tag from vlan header. + * @skb: skb containing frame to parse + * @key_vh: pointer to parsed vlan tag + * @untag_vlan: should the vlan header be removed from the frame + * * Returns ERROR on memory error. * Returns 0 if it encounters a non-vlan or incomplete packet. * Returns 1 after successfully parsing vlan tag. diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 9d3e50c4d29f..fd1f809e9bc1 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2503,28 +2503,52 @@ static int validate_and_copy_dec_ttl(struct net *net, __be16 eth_type, __be16 vlan_tci, u32 mpls_label_count, bool log) { - int start, err; - u32 nested = true; + const struct nlattr *attrs[OVS_DEC_TTL_ATTR_MAX + 1]; + int start, action_start, err, rem; + const struct nlattr *a, *actions; - if (!nla_len(attr)) - return ovs_nla_add_action(sfa, OVS_ACTION_ATTR_DEC_TTL, - NULL, 0, log); + memset(attrs, 0, sizeof(attrs)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + + /* Ignore unknown attributes to be future proof. 
*/ + if (type > OVS_DEC_TTL_ATTR_MAX) + continue; + + if (!type || attrs[type]) { + OVS_NLERR(log, "Duplicate or invalid key (type %d).", + type); + return -EINVAL; + } + + attrs[type] = a; + } + + if (rem) { + OVS_NLERR(log, "Message has %d unknown bytes.", rem); + return -EINVAL; + } + + actions = attrs[OVS_DEC_TTL_ATTR_ACTION]; + if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) { + OVS_NLERR(log, "Missing valid actions attribute."); + return -EINVAL; + } start = add_nested_action_start(sfa, OVS_ACTION_ATTR_DEC_TTL, log); if (start < 0) return start; - err = ovs_nla_add_action(sfa, OVS_DEC_TTL_ATTR_ACTION, &nested, - sizeof(nested), log); - - if (err) - return err; + action_start = add_nested_action_start(sfa, OVS_DEC_TTL_ATTR_ACTION, log); + if (action_start < 0) + return action_start; - err = __ovs_nla_copy_actions(net, attr, key, sfa, eth_type, + err = __ovs_nla_copy_actions(net, actions, key, sfa, eth_type, vlan_tci, mpls_label_count, log); if (err) return err; + add_nested_action_end(*sfa, action_start); add_nested_action_end(*sfa, start); return 0; } @@ -3487,20 +3511,42 @@ out: static int dec_ttl_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) { - int err = 0, rem = nla_len(attr); - struct nlattr *start; + struct nlattr *start, *action_start; + const struct nlattr *a; + int err = 0, rem; start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_DEC_TTL); - if (!start) return -EMSGSIZE; - err = ovs_nla_put_actions(nla_data(attr), rem, skb); - if (err) - nla_nest_cancel(skb, start); - else - nla_nest_end(skb, start); + nla_for_each_attr(a, nla_data(attr), nla_len(attr), rem) { + switch (nla_type(a)) { + case OVS_DEC_TTL_ATTR_ACTION: + + action_start = nla_nest_start_noflag(skb, OVS_DEC_TTL_ATTR_ACTION); + if (!action_start) { + err = -EMSGSIZE; + goto out; + } + + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) + goto out; + nla_nest_end(skb, action_start); + break; + + default: + /* Ignore all other option to 
be future compatible */ + break; + } + } + + nla_nest_end(skb, start); + return 0; + +out: + nla_nest_cancel(skb, start); return err; } diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 8fbefd52af7f..15424d26e85d 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -423,7 +423,7 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) return -EINVAL; meter = dp_meter_create(a); - if (IS_ERR_OR_NULL(meter)) + if (IS_ERR(meter)) return PTR_ERR(meter); reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET, diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 1e30d8df3ba5..5b2ee9c1c00b 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -35,21 +35,18 @@ internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) { int len, err; + /* store len value because skb can be freed inside ovs_vport_receive() */ len = skb->len; + rcu_read_lock(); err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); rcu_read_unlock(); - if (likely(!err)) { - struct pcpu_sw_netstats *tstats = this_cpu_ptr(netdev->tstats); - - u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += len; - tstats->tx_packets++; - u64_stats_update_end(&tstats->syncp); - } else { + if (likely(!err)) + dev_sw_netstats_tx_add(netdev, 1, len); + else netdev->stats.tx_errors++; - } + return NETDEV_TX_OK; } @@ -83,24 +80,12 @@ static void internal_dev_destructor(struct net_device *dev) ovs_vport_free(vport); } -static void -internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) -{ - memset(stats, 0, sizeof(*stats)); - stats->rx_errors = dev->stats.rx_errors; - stats->tx_errors = dev->stats.tx_errors; - stats->tx_dropped = dev->stats.tx_dropped; - stats->rx_dropped = dev->stats.rx_dropped; - - dev_fetch_sw_netstats(stats, dev->tstats); -} - static const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = 
internal_dev_open, .ndo_stop = internal_dev_stop, .ndo_start_xmit = internal_dev_xmit, .ndo_set_mac_address = eth_mac_addr, - .ndo_get_stats64 = internal_get_stats, + .ndo_get_stats64 = dev_get_tstats64, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 82d801f063b7..4ed7e52c7012 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -111,10 +111,12 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name) * * @priv_size: Size of private data area to allocate. * @ops: vport device ops + * @parms: information about new vport. * * Allocate and initialize a new vport defined by @ops. The vport will contain * a private data area of size @priv_size that can be accessed using - * vport_priv(). vports that are no longer needed should be released with + * vport_priv(). Some parameters of the vport will be initialized from @parms. + * @vports that are no longer needed should be released with * vport_free(). */ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index cefbd50c1090..e24b2841c643 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -46,6 +46,7 @@ * Copyright (C) 2011, <lokec@ccs.neu.edu> */ +#include <linux/ethtool.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/capability.h> @@ -93,8 +94,8 @@ /* Assumptions: - - If the device has no dev->header_ops, there is no LL header visible - above the device. In this case, its hard_header_len should be 0. + - If the device has no dev->header_ops->create, there is no LL header + visible above the device. In this case, its hard_header_len should be 0. The device may prepend its own header internally. In this case, its needed_headroom should be set to the space needed for it to add its internal header. 
@@ -108,37 +109,37 @@ On receive: ----------- -Incoming, dev->header_ops != NULL +Incoming, dev_has_header(dev) == true mac_header -> ll header data -> data -Outgoing, dev->header_ops != NULL +Outgoing, dev_has_header(dev) == true mac_header -> ll header data -> ll header -Incoming, dev->header_ops == NULL +Incoming, dev_has_header(dev) == false mac_header -> data However drivers often make it point to the ll header. This is incorrect because the ll header should be invisible to us. data -> data -Outgoing, dev->header_ops == NULL +Outgoing, dev_has_header(dev) == false mac_header -> data. ll header is invisible to us. data -> data Resume - If dev->header_ops == NULL we are unable to restore the ll header, + If dev_has_header(dev) == false we are unable to restore the ll header, because it is invisible to us. On transmit: ------------ -dev->header_ops != NULL +dev_has_header(dev) == true mac_header -> ll header data -> ll header -dev->header_ops == NULL (ll header is invisible to us) +dev_has_header(dev) == false (ll header is invisible to us) mac_header -> data data -> data @@ -1636,13 +1637,15 @@ static bool fanout_find_new_id(struct sock *sk, u16 *new_id) return false; } -static int fanout_add(struct sock *sk, u16 id, u16 type_flags) +static int fanout_add(struct sock *sk, struct fanout_args *args) { struct packet_rollover *rollover = NULL; struct packet_sock *po = pkt_sk(sk); + u16 type_flags = args->type_flags; struct packet_fanout *f, *match; u8 type = type_flags & 0xff; u8 flags = type_flags >> 8; + u16 id = args->id; int err; switch (type) { @@ -1700,11 +1703,21 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) } } err = -EINVAL; - if (match && match->flags != flags) - goto out; - if (!match) { + if (match) { + if (match->flags != flags) + goto out; + if (args->max_num_members && + args->max_num_members != match->max_num_members) + goto out; + } else { + if (args->max_num_members > PACKET_FANOUT_MAX) + goto out; + if (!args->max_num_members) 
+ /* legacy PACKET_FANOUT_MAX */ + args->max_num_members = 256; err = -ENOMEM; - match = kzalloc(sizeof(*match), GFP_KERNEL); + match = kvzalloc(struct_size(match, arr, args->max_num_members), + GFP_KERNEL); if (!match) goto out; write_pnet(&match->net, sock_net(sk)); @@ -1720,6 +1733,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; match->prot_hook.id_match = match_fanout_group; + match->max_num_members = args->max_num_members; list_add(&match->list, &fanout_list); } err = -EINVAL; @@ -1730,7 +1744,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; - if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) { + if (refcount_read(&match->sk_ref) < match->max_num_members) { __dev_remove_pack(&po->prot_hook); po->fanout = match; po->rollover = rollover; @@ -1744,7 +1758,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) if (err && !refcount_read(&match->sk_ref)) { list_del(&match->list); - kfree(match); + kvfree(match); } out: @@ -2069,7 +2083,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; - if (dev->header_ops) { + if (dev_has_header(dev)) { /* The device has an explicit notion of ll header, * exported to higher levels. * @@ -2198,7 +2212,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; - if (dev->header_ops) { + if (dev_has_header(dev)) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { @@ -3075,7 +3089,7 @@ static int packet_release(struct socket *sock) kfree(po->rollover); if (f) { fanout_release_data(f); - kfree(f); + kvfree(f); } /* * Now the socket is dead. No more input will appear. 
@@ -3866,14 +3880,14 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, } case PACKET_FANOUT: { - int val; + struct fanout_args args = { 0 }; - if (optlen != sizeof(val)) + if (optlen != sizeof(int) && optlen != sizeof(args)) return -EINVAL; - if (copy_from_sockptr(&val, optval, sizeof(val))) + if (copy_from_sockptr(&args, optval, optlen)) return -EFAULT; - return fanout_add(sk, val & 0xffff, val >> 16); + return fanout_add(sk, &args); } case PACKET_FANOUT_DATA: { @@ -4581,7 +4595,9 @@ static void packet_seq_stop(struct seq_file *seq, void *v) static int packet_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) - seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n"); + seq_printf(seq, + "%*sRefCnt Type Proto Iface R Rmem User Inode\n", + IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk"); else { struct sock *s = sk_entry(v); const struct packet_sock *po = pkt_sk(s); @@ -4615,9 +4631,11 @@ static int __net_init packet_net_init(struct net *net) mutex_init(&net->packet.sklist_lock); INIT_HLIST_HEAD(&net->packet.sklist); +#ifdef CONFIG_PROC_FS if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; +#endif /* CONFIG_PROC_FS */ return 0; } diff --git a/net/packet/internal.h b/net/packet/internal.h index fd41ecb7f605..5f61e59ebbff 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -77,11 +77,12 @@ struct packet_ring_buffer { }; extern struct mutex fanout_mutex; -#define PACKET_FANOUT_MAX 256 +#define PACKET_FANOUT_MAX (1 << 16) struct packet_fanout { possible_net_t net; unsigned int num_members; + u32 max_num_members; u16 id; u8 type; u8 flags; @@ -90,10 +91,10 @@ struct packet_fanout { struct bpf_prog __rcu *bpf_prog; }; struct list_head list; - struct sock *arr[PACKET_FANOUT_MAX]; spinlock_t lock; refcount_t sk_ref; struct packet_type prot_hook ____cacheline_aligned_in_smp; + struct sock *arr[]; }; struct packet_rollover { @@ -138,7 +139,7 @@ 
struct packet_sock { atomic_t tp_drops ____cacheline_aligned_in_smp; }; -static struct packet_sock *pkt_sk(struct sock *sk) +static inline struct packet_sock *pkt_sk(struct sock *sk) { return (struct packet_sock *)sk; } diff --git a/net/psample/Kconfig b/net/psample/Kconfig index 028f514a9c60..be0b839209ba 100644 --- a/net/psample/Kconfig +++ b/net/psample/Kconfig @@ -4,7 +4,6 @@ # menuconfig PSAMPLE - depends on NET tristate "Packet-sampling netlink channel" default n help diff --git a/net/qrtr/mhi.c b/net/qrtr/mhi.c index ff0c41467fc1..2bf2b1943e61 100644 --- a/net/qrtr/mhi.c +++ b/net/qrtr/mhi.c @@ -76,6 +76,11 @@ static int qcom_mhi_qrtr_probe(struct mhi_device *mhi_dev, struct qrtr_mhi_dev *qdev; int rc; + /* start channels */ + rc = mhi_prepare_for_transfer(mhi_dev); + if (rc) + return rc; + qdev = devm_kzalloc(&mhi_dev->dev, sizeof(*qdev), GFP_KERNEL); if (!qdev) return -ENOMEM; @@ -99,6 +104,7 @@ static void qcom_mhi_qrtr_remove(struct mhi_device *mhi_dev) struct qrtr_mhi_dev *qdev = dev_get_drvdata(&mhi_dev->dev); qrtr_endpoint_unregister(&qdev->ep); + mhi_unprepare_from_transfer(mhi_dev); dev_set_drvdata(&mhi_dev->dev, NULL); } diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index b8559c882431..8d00dfe8139e 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -517,10 +517,6 @@ static int ctrl_cmd_new_server(struct sockaddr_qrtr *from, port = from->sq_port; } - /* Don't accept spoofed messages */ - if (from->sq_node != node_id) - return -EINVAL; - srv = server_add(service, instance, node_id, port); if (!srv) return -EINVAL; @@ -559,10 +555,6 @@ static int ctrl_cmd_del_server(struct sockaddr_qrtr *from, port = from->sq_port; } - /* Don't accept spoofed messages */ - if (from->sq_node != node_id) - return -EINVAL; - /* Local servers may only unregister themselves */ if (from->sq_node == qrtr_ns.local_node && from->sq_port != port) return -EINVAL; @@ -763,7 +755,7 @@ static void qrtr_ns_data_ready(struct sock *sk) queue_work(qrtr_ns.workqueue, &qrtr_ns.work); } -void 
qrtr_ns_init(void) +int qrtr_ns_init(void) { struct sockaddr_qrtr sq; int ret; @@ -774,7 +766,7 @@ void qrtr_ns_init(void) ret = sock_create_kern(&init_net, AF_QIPCRTR, SOCK_DGRAM, PF_QIPCRTR, &qrtr_ns.sock); if (ret < 0) - return; + return ret; ret = kernel_getsockname(qrtr_ns.sock, (struct sockaddr *)&sq); if (ret < 0) { @@ -805,12 +797,13 @@ void qrtr_ns_init(void) if (ret < 0) goto err_wq; - return; + return 0; err_wq: destroy_workqueue(qrtr_ns.workqueue); err_sock: sock_release(qrtr_ns.sock); + return ret; } EXPORT_SYMBOL_GPL(qrtr_ns_init); diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 957aa9263ba4..b34358282f37 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -171,8 +171,13 @@ static void __qrtr_node_release(struct kref *kref) void __rcu **slot; spin_lock_irqsave(&qrtr_nodes_lock, flags); - if (node->nid != QRTR_EP_NID_AUTO) - radix_tree_delete(&qrtr_nodes, node->nid); + /* If the node is a bridge for other nodes, there are possibly + * multiple entries pointing to our released node, delete them all. 
+ */ + radix_tree_for_each_slot(slot, &qrtr_nodes, &iter, 0) { + if (*slot == node) + radix_tree_iter_delete(&qrtr_nodes, &iter, slot); + } spin_unlock_irqrestore(&qrtr_nodes_lock, flags); list_del(&node->item); @@ -347,7 +352,7 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, hdr->src_port_id = cpu_to_le32(from->sq_port); if (to->sq_port == QRTR_PORT_CTRL) { hdr->dst_node_id = cpu_to_le32(node->nid); - hdr->dst_port_id = cpu_to_le32(QRTR_NODE_BCAST); + hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL); } else { hdr->dst_node_id = cpu_to_le32(to->sq_node); hdr->dst_port_id = cpu_to_le32(to->sq_port); @@ -401,12 +406,13 @@ static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid) { unsigned long flags; - if (node->nid != QRTR_EP_NID_AUTO || nid == QRTR_EP_NID_AUTO) + if (nid == QRTR_EP_NID_AUTO) return; spin_lock_irqsave(&qrtr_nodes_lock, flags); radix_tree_insert(&qrtr_nodes, nid, node); - node->nid = nid; + if (node->nid == QRTR_EP_NID_AUTO) + node->nid = nid; spin_unlock_irqrestore(&qrtr_nodes_lock, flags); } @@ -494,6 +500,13 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) qrtr_node_assign(node, cb->src_node); + if (cb->type == QRTR_TYPE_NEW_SERVER) { + /* Remote node endpoint can bridge other distant nodes */ + const struct qrtr_ctrl_pkt *pkt = data + hdrlen; + + qrtr_node_assign(node, le32_to_cpu(pkt->server.node)); + } + if (cb->type == QRTR_TYPE_RESUME_TX) { qrtr_tx_resume(node, skb); } else { @@ -519,18 +532,20 @@ EXPORT_SYMBOL_GPL(qrtr_endpoint_post); /** * qrtr_alloc_ctrl_packet() - allocate control packet skb * @pkt: reference to qrtr_ctrl_pkt pointer + * @flags: the type of memory to allocate * * Returns newly allocated sk_buff, or NULL on failure * * This function allocates a sk_buff large enough to carry a qrtr_ctrl_pkt and * on success returns a reference to the control packet in @pkt. 
*/ -static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt) +static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt, + gfp_t flags) { const int pkt_len = sizeof(struct qrtr_ctrl_pkt); struct sk_buff *skb; - skb = alloc_skb(QRTR_HDR_MAX_SIZE + pkt_len, GFP_KERNEL); + skb = alloc_skb(QRTR_HDR_MAX_SIZE + pkt_len, flags); if (!skb) return NULL; @@ -592,6 +607,7 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) struct qrtr_ctrl_pkt *pkt; struct qrtr_tx_flow *flow; struct sk_buff *skb; + unsigned long flags; void __rcu **slot; mutex_lock(&node->ep_lock); @@ -599,11 +615,18 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) mutex_unlock(&node->ep_lock); /* Notify the local controller about the event */ - skb = qrtr_alloc_ctrl_packet(&pkt); - if (skb) { - pkt->cmd = cpu_to_le32(QRTR_TYPE_BYE); - qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst); + spin_lock_irqsave(&qrtr_nodes_lock, flags); + radix_tree_for_each_slot(slot, &qrtr_nodes, &iter, 0) { + if (*slot != node) + continue; + src.sq_node = iter.index; + skb = qrtr_alloc_ctrl_packet(&pkt, GFP_ATOMIC); + if (skb) { + pkt->cmd = cpu_to_le32(QRTR_TYPE_BYE); + qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst); + } } + spin_unlock_irqrestore(&qrtr_nodes_lock, flags); /* Wake up any transmitters waiting for resume-tx from the node */ mutex_lock(&node->qrtr_tx_lock); @@ -656,7 +679,7 @@ static void qrtr_port_remove(struct qrtr_sock *ipc) to.sq_node = QRTR_NODE_BCAST; to.sq_port = QRTR_PORT_CTRL; - skb = qrtr_alloc_ctrl_packet(&pkt); + skb = qrtr_alloc_ctrl_packet(&pkt, GFP_KERNEL); if (skb) { pkt->cmd = cpu_to_le32(QRTR_TYPE_DEL_CLIENT); pkt->client.node = cpu_to_le32(ipc->us.sq_node); @@ -982,7 +1005,7 @@ static int qrtr_send_resume_tx(struct qrtr_cb *cb) if (!node) return -EINVAL; - skb = qrtr_alloc_ctrl_packet(&pkt); + skb = qrtr_alloc_ctrl_packet(&pkt, GFP_KERNEL); if (!skb) return -ENOMEM; @@ -1264,13 +1287,19 @@ static int __init qrtr_proto_init(void) 
return rc; rc = sock_register(&qrtr_family); - if (rc) { - proto_unregister(&qrtr_proto); - return rc; - } + if (rc) + goto err_proto; - qrtr_ns_init(); + rc = qrtr_ns_init(); + if (rc) + goto err_sock; + return 0; + +err_sock: + sock_unregister(qrtr_family.family); +err_proto: + proto_unregister(&qrtr_proto); return rc; } postcore_initcall(qrtr_proto_init); diff --git a/net/qrtr/qrtr.h b/net/qrtr/qrtr.h index dc2b67f17927..3f2d28696062 100644 --- a/net/qrtr/qrtr.h +++ b/net/qrtr/qrtr.h @@ -29,7 +29,7 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep); int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len); -void qrtr_ns_init(void); +int qrtr_ns_init(void); void qrtr_ns_remove(void); diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c index 15ce9b642b25..304b41fea5ab 100644 --- a/net/qrtr/tun.c +++ b/net/qrtr/tun.c @@ -31,6 +31,7 @@ static int qrtr_tun_send(struct qrtr_endpoint *ep, struct sk_buff *skb) static int qrtr_tun_open(struct inode *inode, struct file *filp) { struct qrtr_tun *tun; + int ret; tun = kzalloc(sizeof(*tun), GFP_KERNEL); if (!tun) @@ -43,7 +44,16 @@ static int qrtr_tun_open(struct inode *inode, struct file *filp) filp->private_data = tun; - return qrtr_endpoint_register(&tun->ep, QRTR_EP_NID_AUTO); + ret = qrtr_endpoint_register(&tun->ep, QRTR_EP_NID_AUTO); + if (ret) + goto out; + + return 0; + +out: + filp->private_data = NULL; + kfree(tun); + return ret; } static ssize_t qrtr_tun_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -80,6 +90,12 @@ static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t ret; void *kbuf; + if (!len) + return -EINVAL; + + if (len > KMALLOC_MAX_SIZE) + return -ENOMEM; + kbuf = kzalloc(len, GFP_KERNEL); if (!kbuf) return -ENOMEM; diff --git a/net/rds/ib.c b/net/rds/ib.c index deecbdcdae84..24c9a9005a6f 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -30,7 +30,6 @@ * SOFTWARE. 
* */ -#include <linux/dmapool.h> #include <linux/kernel.h> #include <linux/in.h> #include <linux/if.h> @@ -108,7 +107,6 @@ static void rds_ib_dev_free(struct work_struct *work) rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool); if (rds_ibdev->pd) ib_dealloc_pd(rds_ibdev->pd); - dma_pool_destroy(rds_ibdev->rid_hdrs_pool); list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { list_del(&i_ipaddr->list); @@ -191,14 +189,6 @@ static int rds_ib_add_one(struct ib_device *device) rds_ibdev->pd = NULL; goto put_dev; } - rds_ibdev->rid_hdrs_pool = dma_pool_create(device->name, - device->dma_device, - sizeof(struct rds_header), - L1_CACHE_BYTES, 0); - if (!rds_ibdev->rid_hdrs_pool) { - ret = -ENOMEM; - goto put_dev; - } rds_ibdev->mr_1m_pool = rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL); diff --git a/net/rds/ib.h b/net/rds/ib.h index 8dfff43cf07f..2ba71102b1f1 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -246,7 +246,6 @@ struct rds_ib_device { struct list_head conn_list; struct ib_device *dev; struct ib_pd *pd; - struct dma_pool *rid_hdrs_pool; /* RDS headers DMA pool */ u8 odp_capable:1; unsigned int max_mrs; @@ -264,13 +263,6 @@ struct rds_ib_device { int *vector_load; }; -static inline int ibdev_to_node(struct ib_device *ibdev) -{ - struct device *parent; - - parent = ibdev->dev.parent; - return parent ? dev_to_node(parent) : NUMA_NO_NODE; -} #define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev) /* bits for i_ack_flags */ @@ -387,11 +379,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6); void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event); -struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev, - struct dma_pool *pool, - dma_addr_t **dma_addrs, u32 num_hdrs); -void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs, - dma_addr_t *dma_addrs, u32 num_hdrs); #define rds_ib_conn_error(conn, fmt...) 
\ __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index b36b60668b1d..f5cbe963cd8f 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -30,7 +30,6 @@ * SOFTWARE. * */ -#include <linux/dmapool.h> #include <linux/kernel.h> #include <linux/in.h> #include <linux/slab.h> @@ -441,42 +440,87 @@ static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index) rds_ibdev->vector_load[index]--; } +static void rds_dma_hdr_free(struct ib_device *dev, struct rds_header *hdr, + dma_addr_t dma_addr, enum dma_data_direction dir) +{ + ib_dma_unmap_single(dev, dma_addr, sizeof(*hdr), dir); + kfree(hdr); +} + +static struct rds_header *rds_dma_hdr_alloc(struct ib_device *dev, + dma_addr_t *dma_addr, enum dma_data_direction dir) +{ + struct rds_header *hdr; + + hdr = kzalloc_node(sizeof(*hdr), GFP_KERNEL, ibdev_to_node(dev)); + if (!hdr) + return NULL; + + *dma_addr = ib_dma_map_single(dev, hdr, sizeof(*hdr), + DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(dev, *dma_addr)) { + kfree(hdr); + return NULL; + } + + return hdr; +} + +/* Free the DMA memory used to store struct rds_header. + * + * @dev: the RDS IB device + * @hdrs: pointer to the array storing DMA memory pointers + * @dma_addrs: pointer to the array storing DMA addresses + * @num_hdars: number of headers to free. + */ +static void rds_dma_hdrs_free(struct rds_ib_device *dev, + struct rds_header **hdrs, dma_addr_t *dma_addrs, u32 num_hdrs, + enum dma_data_direction dir) +{ + u32 i; + + for (i = 0; i < num_hdrs; i++) + rds_dma_hdr_free(dev->dev, hdrs[i], dma_addrs[i], dir); + kvfree(hdrs); + kvfree(dma_addrs); +} + + /* Allocate DMA coherent memory to be used to store struct rds_header for * sending/receiving packets. The pointers to the DMA memory and the * associated DMA addresses are stored in two arrays. 
* - * @ibdev: the IB device - * @pool: the DMA memory pool + * @dev: the RDS IB device * @dma_addrs: pointer to the array for storing DMA addresses * @num_hdrs: number of headers to allocate * * It returns the pointer to the array storing the DMA memory pointers. On * error, NULL pointer is returned. */ -struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev, - struct dma_pool *pool, - dma_addr_t **dma_addrs, u32 num_hdrs) +static struct rds_header **rds_dma_hdrs_alloc(struct rds_ib_device *dev, + dma_addr_t **dma_addrs, u32 num_hdrs, + enum dma_data_direction dir) { struct rds_header **hdrs; dma_addr_t *hdr_daddrs; u32 i; hdrs = kvmalloc_node(sizeof(*hdrs) * num_hdrs, GFP_KERNEL, - ibdev_to_node(ibdev)); + ibdev_to_node(dev->dev)); if (!hdrs) return NULL; hdr_daddrs = kvmalloc_node(sizeof(*hdr_daddrs) * num_hdrs, GFP_KERNEL, - ibdev_to_node(ibdev)); + ibdev_to_node(dev->dev)); if (!hdr_daddrs) { kvfree(hdrs); return NULL; } for (i = 0; i < num_hdrs; i++) { - hdrs[i] = dma_pool_zalloc(pool, GFP_KERNEL, &hdr_daddrs[i]); + hdrs[i] = rds_dma_hdr_alloc(dev->dev, &hdr_daddrs[i], dir); if (!hdrs[i]) { - rds_dma_hdrs_free(pool, hdrs, hdr_daddrs, i); + rds_dma_hdrs_free(dev, hdrs, hdr_daddrs, i, dir); return NULL; } } @@ -485,24 +529,6 @@ struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev, return hdrs; } -/* Free the DMA memory used to store struct rds_header. - * - * @pool: the DMA memory pool - * @hdrs: pointer to the array storing DMA memory pointers - * @dma_addrs: pointer to the array storing DMA addresses - * @num_hdars: number of headers to free. - */ -void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs, - dma_addr_t *dma_addrs, u32 num_hdrs) -{ - u32 i; - - for (i = 0; i < num_hdrs; i++) - dma_pool_free(pool, hdrs[i], dma_addrs[i]); - kvfree(hdrs); - kvfree(dma_addrs); -} - /* * This needs to be very careful to not leave IS_ERR pointers around for * cleanup to trip over. 
@@ -516,7 +542,6 @@ static int rds_ib_setup_qp(struct rds_connection *conn) struct rds_ib_device *rds_ibdev; unsigned long max_wrs; int ret, fr_queue_space; - struct dma_pool *pool; /* * It's normal to see a null device if an incoming connection races @@ -612,25 +637,26 @@ static int rds_ib_setup_qp(struct rds_connection *conn) goto recv_cq_out; } - pool = rds_ibdev->rid_hdrs_pool; - ic->i_send_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_send_hdrs_dma, - ic->i_send_ring.w_nr); + ic->i_send_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_send_hdrs_dma, + ic->i_send_ring.w_nr, + DMA_TO_DEVICE); if (!ic->i_send_hdrs) { ret = -ENOMEM; rdsdebug("DMA send hdrs alloc failed\n"); goto qp_out; } - ic->i_recv_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_recv_hdrs_dma, - ic->i_recv_ring.w_nr); + ic->i_recv_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_recv_hdrs_dma, + ic->i_recv_ring.w_nr, + DMA_FROM_DEVICE); if (!ic->i_recv_hdrs) { ret = -ENOMEM; rdsdebug("DMA recv hdrs alloc failed\n"); goto send_hdrs_dma_out; } - ic->i_ack = dma_pool_zalloc(pool, GFP_KERNEL, - &ic->i_ack_dma); + ic->i_ack = rds_dma_hdr_alloc(rds_ibdev->dev, &ic->i_ack_dma, + DMA_TO_DEVICE); if (!ic->i_ack) { ret = -ENOMEM; rdsdebug("DMA ack header alloc failed\n"); @@ -666,18 +692,19 @@ sends_out: vfree(ic->i_sends); ack_dma_out: - dma_pool_free(pool, ic->i_ack, ic->i_ack_dma); + rds_dma_hdr_free(rds_ibdev->dev, ic->i_ack, ic->i_ack_dma, + DMA_TO_DEVICE); ic->i_ack = NULL; recv_hdrs_dma_out: - rds_dma_hdrs_free(pool, ic->i_recv_hdrs, ic->i_recv_hdrs_dma, - ic->i_recv_ring.w_nr); + rds_dma_hdrs_free(rds_ibdev, ic->i_recv_hdrs, ic->i_recv_hdrs_dma, + ic->i_recv_ring.w_nr, DMA_FROM_DEVICE); ic->i_recv_hdrs = NULL; ic->i_recv_hdrs_dma = NULL; send_hdrs_dma_out: - rds_dma_hdrs_free(pool, ic->i_send_hdrs, ic->i_send_hdrs_dma, - ic->i_send_ring.w_nr); + rds_dma_hdrs_free(rds_ibdev, ic->i_send_hdrs, ic->i_send_hdrs_dma, + ic->i_send_ring.w_nr, DMA_TO_DEVICE); ic->i_send_hdrs = NULL; ic->i_send_hdrs_dma = NULL; @@ 
-1110,29 +1137,30 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) } if (ic->rds_ibdev) { - struct dma_pool *pool; - - pool = ic->rds_ibdev->rid_hdrs_pool; - /* then free the resources that ib callbacks use */ if (ic->i_send_hdrs) { - rds_dma_hdrs_free(pool, ic->i_send_hdrs, + rds_dma_hdrs_free(ic->rds_ibdev, + ic->i_send_hdrs, ic->i_send_hdrs_dma, - ic->i_send_ring.w_nr); + ic->i_send_ring.w_nr, + DMA_TO_DEVICE); ic->i_send_hdrs = NULL; ic->i_send_hdrs_dma = NULL; } if (ic->i_recv_hdrs) { - rds_dma_hdrs_free(pool, ic->i_recv_hdrs, + rds_dma_hdrs_free(ic->rds_ibdev, + ic->i_recv_hdrs, ic->i_recv_hdrs_dma, - ic->i_recv_ring.w_nr); + ic->i_recv_ring.w_nr, + DMA_FROM_DEVICE); ic->i_recv_hdrs = NULL; ic->i_recv_hdrs_dma = NULL; } if (ic->i_ack) { - dma_pool_free(pool, ic->i_ack, ic->i_ack_dma); + rds_dma_hdr_free(ic->rds_ibdev->dev, ic->i_ack, + ic->i_ack_dma, DMA_TO_DEVICE); ic->i_ack = NULL; } } else { diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 3cffcec5fb37..6fdedd9dbbc2 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -662,10 +662,16 @@ static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credi seq = rds_ib_get_ack(ic); rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); + + ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, ic->i_ack_dma, + sizeof(*hdr), DMA_TO_DEVICE); rds_message_populate_header(hdr, 0, 0, 0); hdr->h_ack = cpu_to_be64(seq); hdr->h_credit = adv_credits; rds_message_make_checksum(hdr); + ib_dma_sync_single_for_device(ic->rds_ibdev->dev, ic->i_ack_dma, + sizeof(*hdr), DMA_TO_DEVICE); + ic->i_ack_queued = jiffies; ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, NULL); @@ -845,6 +851,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_incoming *ibinc = ic->i_ibinc; struct rds_header *ihdr, *hdr; + dma_addr_t dma_addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs]; /* XXX shut down the connection if port 
0,0 are seen? */ @@ -863,6 +870,8 @@ static void rds_ib_process_recv(struct rds_connection *conn, ihdr = ic->i_recv_hdrs[recv - ic->i_recvs]; + ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, dma_addr, + sizeof(*ihdr), DMA_FROM_DEVICE); /* Validate the checksum. */ if (!rds_message_verify_checksum(ihdr)) { rds_ib_conn_error(conn, "incoming message " @@ -870,7 +879,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, "forcing a reconnect\n", &conn->c_faddr); rds_stats_inc(s_recv_drop_bad_checksum); - return; + goto done; } /* Process the ACK sequence which comes with every packet */ @@ -899,7 +908,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, */ rds_ib_frag_free(ic, recv->r_frag); recv->r_frag = NULL; - return; + goto done; } /* @@ -933,7 +942,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, hdr->h_dport != ihdr->h_dport) { rds_ib_conn_error(conn, "fragment header mismatch; forcing reconnect\n"); - return; + goto done; } } @@ -965,6 +974,9 @@ static void rds_ib_process_recv(struct rds_connection *conn, rds_inc_put(&ibinc->ii_inc); } +done: + ib_dma_sync_single_for_device(ic->rds_ibdev->dev, dma_addr, + sizeof(*ihdr), DMA_FROM_DEVICE); } void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index dfe778220657..92b4a8689aae 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -638,6 +638,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send->s_sge[0].length = sizeof(struct rds_header); send->s_sge[0].lkey = ic->i_pd->local_dma_lkey; + ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, + ic->i_send_hdrs_dma[pos], + sizeof(struct rds_header), + DMA_TO_DEVICE); memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); @@ -688,6 +692,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, adv_credits = 0; rds_ib_stats_inc(s_ib_tx_credit_updates); } + ib_dma_sync_single_for_device(ic->rds_ibdev->dev, + 
ic->i_send_hdrs_dma[pos], + sizeof(struct rds_header), + DMA_TO_DEVICE); if (prev) prev->s_wr.next = &send->s_wr; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 1d0afb1dd77b..6f1a50d50d06 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -565,6 +565,9 @@ int rds_rdma_extra_size(struct rds_rdma_args *args, if (args->nr_local == 0) return -EINVAL; + if (args->nr_local > UIO_MAXIOV) + return -EMSGSIZE; + iov->iov = kcalloc(args->nr_local, sizeof(struct rds_iovec), GFP_KERNEL); diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 971c73c7d34c..68d6ef9e59fc 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -40,6 +40,7 @@ struct rfkill { enum rfkill_type type; unsigned long state; + unsigned long hard_block_reasons; u32 idx; @@ -265,6 +266,7 @@ static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill, ev->hard = !!(rfkill->state & RFKILL_BLOCK_HW); ev->soft = !!(rfkill->state & (RFKILL_BLOCK_SW | RFKILL_BLOCK_SW_PREV)); + ev->hard_block_reasons = rfkill->hard_block_reasons; spin_unlock_irqrestore(&rfkill->lock, flags); } @@ -522,19 +524,29 @@ bool rfkill_get_global_sw_state(const enum rfkill_type type) } #endif -bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked) +bool rfkill_set_hw_state_reason(struct rfkill *rfkill, + bool blocked, unsigned long reason) { unsigned long flags; bool ret, prev; BUG_ON(!rfkill); + if (WARN(reason & + ~(RFKILL_HARD_BLOCK_SIGNAL | RFKILL_HARD_BLOCK_NOT_OWNER), + "hw_state reason not supported: 0x%lx", reason)) + return blocked; + spin_lock_irqsave(&rfkill->lock, flags); - prev = !!(rfkill->state & RFKILL_BLOCK_HW); - if (blocked) + prev = !!(rfkill->hard_block_reasons & reason); + if (blocked) { rfkill->state |= RFKILL_BLOCK_HW; - else - rfkill->state &= ~RFKILL_BLOCK_HW; + rfkill->hard_block_reasons |= reason; + } else { + rfkill->hard_block_reasons &= ~reason; + if (!rfkill->hard_block_reasons) + rfkill->state &= ~RFKILL_BLOCK_HW; + } ret = !!(rfkill->state & RFKILL_BLOCK_ANY); 
spin_unlock_irqrestore(&rfkill->lock, flags); @@ -546,7 +558,7 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked) return ret; } -EXPORT_SYMBOL(rfkill_set_hw_state); +EXPORT_SYMBOL(rfkill_set_hw_state_reason); static void __rfkill_set_sw_state(struct rfkill *rfkill, bool blocked) { @@ -744,6 +756,16 @@ static ssize_t soft_store(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RW(soft); +static ssize_t hard_block_reasons_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rfkill *rfkill = to_rfkill(dev); + + return sprintf(buf, "0x%lx\n", rfkill->hard_block_reasons); +} +static DEVICE_ATTR_RO(hard_block_reasons); + static u8 user_state_from_blocked(unsigned long state) { if (state & RFKILL_BLOCK_HW) @@ -796,6 +818,7 @@ static struct attribute *rfkill_dev_attrs[] = { &dev_attr_state.attr, &dev_attr_soft.attr, &dev_attr_hard.attr, + &dev_attr_hard_block_reasons.attr, NULL, }; ATTRIBUTE_GROUPS(rfkill_dev); @@ -811,6 +834,7 @@ static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env) { struct rfkill *rfkill = to_rfkill(dev); unsigned long flags; + unsigned long reasons; u32 state; int error; @@ -823,10 +847,13 @@ static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env) return error; spin_lock_irqsave(&rfkill->lock, flags); state = rfkill->state; + reasons = rfkill->hard_block_reasons; spin_unlock_irqrestore(&rfkill->lock, flags); error = add_uevent_var(env, "RFKILL_STATE=%d", user_state_from_blocked(state)); - return error; + if (error) + return error; + return add_uevent_var(env, "RFKILL_HW_BLOCK_REASON=0x%lx", reasons); } void rfkill_pause_polling(struct rfkill *rfkill) @@ -876,6 +903,9 @@ static int rfkill_resume(struct device *dev) rfkill->suspended = false; + if (!rfkill->registered) + return 0; + if (!rfkill->persistent) { cur = !!(rfkill->state & RFKILL_BLOCK_SW); rfkill_set_block(rfkill, cur); diff --git a/net/rose/rose_loopback.c 
b/net/rose/rose_loopback.c index 7b094275ea8b..11c45c8c6c16 100644 --- a/net/rose/rose_loopback.c +++ b/net/rose/rose_loopback.c @@ -96,10 +96,19 @@ static void rose_loopback_timer(struct timer_list *unused) } if (frametype == ROSE_CALL_REQUEST) { - if ((dev = rose_dev_get(dest)) != NULL) { - if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0) - kfree_skb(skb); - } else { + if (!rose_loopback_neigh->dev) { + kfree_skb(skb); + continue; + } + + dev = rose_dev_get(dest); + if (!dev) { + kfree_skb(skb); + continue; + } + + if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0) { + dev_put(dev); kfree_skb(skb); } } else { diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index d706bb408365..0885b22e5c0e 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -8,6 +8,7 @@ config AF_RXRPC depends on INET select CRYPTO select KEYS + select NET_UDP_TUNNEL help Say Y or M here to include support for RxRPC session sockets (just the transport part, not the presentation part: (un)marshalling is diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index ddd0f95713a9..b11281bed2a4 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -28,6 +28,7 @@ rxrpc-y := \ rtt.o \ security.o \ sendmsg.o \ + server_key.o \ skbuff.o \ utils.o diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 0a2f4817ec6c..41671af6b33f 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -990,7 +990,7 @@ static int __init af_rxrpc_init(void) goto error_security; } - ret = register_pernet_subsys(&rxrpc_net_ops); + ret = register_pernet_device(&rxrpc_net_ops); if (ret) goto error_pernet; @@ -1035,7 +1035,7 @@ error_key_type: error_sock: proto_unregister(&rxrpc_proto); error_proto: - unregister_pernet_subsys(&rxrpc_net_ops); + unregister_pernet_device(&rxrpc_net_ops); error_pernet: rxrpc_exit_security(); error_security: @@ -1057,7 +1057,7 @@ static void __exit af_rxrpc_exit(void) unregister_key_type(&key_type_rxrpc); sock_unregister(PF_RXRPC); 
proto_unregister(&rxrpc_proto); - unregister_pernet_subsys(&rxrpc_net_ops); + unregister_pernet_device(&rxrpc_net_ops); ASSERTCMP(atomic_read(&rxrpc_n_tx_skbs), ==, 0); ASSERTCMP(atomic_read(&rxrpc_n_rx_skbs), ==, 0); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index dce48162f6c2..7bd6f8a66a3e 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -12,6 +12,7 @@ #include <net/netns/generic.h> #include <net/sock.h> #include <net/af_rxrpc.h> +#include <keys/rxrpc-type.h> #include "protocol.h" #if 0 @@ -34,6 +35,7 @@ struct rxrpc_crypt { #define rxrpc_queue_delayed_work(WS,D) \ queue_delayed_work(rxrpc_workqueue, (WS), (D)) +struct key_preparsed_payload; struct rxrpc_connection; /* @@ -216,17 +218,30 @@ struct rxrpc_security { /* Clean up a security service */ void (*exit)(void); + /* Parse the information from a server key */ + int (*preparse_server_key)(struct key_preparsed_payload *); + + /* Clean up the preparse buffer after parsing a server key */ + void (*free_preparse_server_key)(struct key_preparsed_payload *); + + /* Destroy the payload of a server key */ + void (*destroy_server_key)(struct key *); + + /* Describe a server key */ + void (*describe_server_key)(const struct key *, struct seq_file *); + /* initialise a connection's security */ - int (*init_connection_security)(struct rxrpc_connection *); + int (*init_connection_security)(struct rxrpc_connection *, + struct rxrpc_key_token *); - /* prime a connection's packet security */ - int (*prime_packet_security)(struct rxrpc_connection *); + /* Work out how much data we can store in a packet, given an estimate + * of the amount of data remaining. 
+ */ + int (*how_much_data)(struct rxrpc_call *, size_t, + size_t *, size_t *, size_t *); /* impose security on a packet */ - int (*secure_packet)(struct rxrpc_call *, - struct sk_buff *, - size_t, - void *); + int (*secure_packet)(struct rxrpc_call *, struct sk_buff *, size_t); /* verify the security on a received packet */ int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, @@ -438,10 +453,15 @@ struct rxrpc_connection { struct list_head proc_link; /* link in procfs list */ struct list_head link; /* link in master connection list */ struct sk_buff_head rx_queue; /* received conn-level packets */ + const struct rxrpc_security *security; /* applied security module */ - struct key *server_key; /* security for this service */ - struct crypto_sync_skcipher *cipher; /* encryption handle */ - struct rxrpc_crypt csum_iv; /* packet checksum base */ + union { + struct { + struct crypto_sync_skcipher *cipher; /* encryption handle */ + struct rxrpc_crypt csum_iv; /* packet checksum base */ + u32 nonce; /* response re-use preventer */ + } rxkad; + }; unsigned long flags; unsigned long events; unsigned long idle_timestamp; /* Time at which last became idle */ @@ -451,10 +471,7 @@ struct rxrpc_connection { int debug_id; /* debug ID for printks */ atomic_t serial; /* packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ - u32 security_nonce; /* response re-use preventer */ u32 service_id; /* Service ID, possibly upgraded */ - u8 size_align; /* data size alignment (for security) */ - u8 security_size; /* security header size */ u8 security_ix; /* security type */ u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ u8 bundle_shift; /* Index into bundle->avail_chans */ @@ -888,8 +905,7 @@ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct sk_buff *); struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *, gfp_t); void rxrpc_new_incoming_connection(struct rxrpc_sock *, 
struct rxrpc_connection *, - const struct rxrpc_security *, struct key *, - struct sk_buff *); + const struct rxrpc_security *, struct sk_buff *); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* @@ -906,10 +922,8 @@ extern const struct rxrpc_security rxrpc_no_security; * key.c */ extern struct key_type key_type_rxrpc; -extern struct key_type key_type_rxrpc_s; int rxrpc_request_key(struct rxrpc_sock *, sockptr_t , int); -int rxrpc_server_keyring(struct rxrpc_sock *, sockptr_t, int); int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time64_t, u32); @@ -1052,11 +1066,13 @@ extern const struct rxrpc_security rxkad; * security.c */ int __init rxrpc_init_security(void); +const struct rxrpc_security *rxrpc_security_lookup(u8); void rxrpc_exit_security(void); int rxrpc_init_client_conn_security(struct rxrpc_connection *); -bool rxrpc_look_up_server_security(struct rxrpc_local *, struct rxrpc_sock *, - const struct rxrpc_security **, struct key **, - struct sk_buff *); +const struct rxrpc_security *rxrpc_get_incoming_security(struct rxrpc_sock *, + struct sk_buff *); +struct key *rxrpc_look_up_server_security(struct rxrpc_connection *, + struct sk_buff *, u32, u32); /* * sendmsg.c @@ -1064,6 +1080,13 @@ bool rxrpc_look_up_server_security(struct rxrpc_local *, struct rxrpc_sock *, int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); /* + * server_key.c + */ +extern struct key_type key_type_rxrpc_s; + +int rxrpc_server_keyring(struct rxrpc_sock *, sockptr_t, int); + +/* * skbuff.c */ void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 8df1964db333..1ae90fb97936 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -197,6 +197,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) tail = b->peer_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_peer *peer = b->peer_backlog[tail]; + 
rxrpc_put_local(peer->local); kfree(peer); tail = (tail + 1) & (size - 1); } @@ -261,7 +262,6 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_peer *peer, struct rxrpc_connection *conn, const struct rxrpc_security *sec, - struct key *key, struct sk_buff *skb) { struct rxrpc_backlog *b = rx->backlog; @@ -309,7 +309,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, conn->params.local = rxrpc_get_local(local); conn->params.peer = peer; rxrpc_see_connection(conn); - rxrpc_new_incoming_connection(rx, conn, sec, key, skb); + rxrpc_new_incoming_connection(rx, conn, sec, skb); } else { rxrpc_get_connection(conn); } @@ -353,7 +353,6 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, struct rxrpc_connection *conn; struct rxrpc_peer *peer = NULL; struct rxrpc_call *call = NULL; - struct key *key = NULL; _enter(""); @@ -374,11 +373,13 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, */ conn = rxrpc_find_connection_rcu(local, skb, &peer); - if (!conn && !rxrpc_look_up_server_security(local, rx, &sec, &key, skb)) - goto no_call; + if (!conn) { + sec = rxrpc_get_incoming_security(rx, skb); + if (!sec) + goto no_call; + } - call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, key, skb); - key_put(key); + call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, skb); if (!call) { skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; goto no_call; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index c845594b663f..4eb91d958a48 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -548,8 +548,6 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) rxrpc_disconnect_call(call); if (call->security) call->security->free_call_crypto(call); - - rxrpc_cleanup_ring(call); _leave(""); } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 7e574c75be8e..dbea0bfee48e 100644 --- a/net/rxrpc/conn_client.c +++ 
b/net/rxrpc/conn_client.c @@ -180,10 +180,6 @@ rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp) if (ret < 0) goto error_1; - ret = conn->security->prime_packet_security(conn); - if (ret < 0) - goto error_2; - atomic_inc(&rxnet->nr_conns); write_lock(&rxnet->conn_lock); list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); @@ -203,8 +199,6 @@ rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp) _leave(" = %p", conn); return conn; -error_2: - conn->security->clear(conn); error_1: rxrpc_put_client_connection_id(conn); error_0: diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index aff184145ffa..aab069701398 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -333,11 +333,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (ret < 0) return ret; - ret = conn->security->init_connection_security(conn); - if (ret < 0) - return ret; - - ret = conn->security->prime_packet_security(conn); + ret = conn->security->init_connection_security( + conn, conn->params.key->payload.data[0]); if (ret < 0) return ret; @@ -377,7 +374,6 @@ static void rxrpc_secure_connection(struct rxrpc_connection *conn) _enter("{%d}", conn->debug_id); ASSERT(conn->security_ix != 0); - ASSERT(conn->server_key); if (conn->security->issue_challenge(conn) < 0) { abort_code = RX_CALL_DEAD; diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 3bcbe0665f91..b2159dbf5412 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -49,7 +49,6 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) conn->security = &rxrpc_no_security; spin_lock_init(&conn->state_lock); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); - conn->size_align = 4; conn->idle_timestamp = jiffies; } @@ -363,7 +362,6 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) conn->security->clear(conn); key_put(conn->params.key); - key_put(conn->server_key); rxrpc_put_bundle(conn->bundle); 
rxrpc_put_peer(conn->params.peer); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 6c847720494f..e1966dfc9152 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -156,7 +156,6 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn void rxrpc_new_incoming_connection(struct rxrpc_sock *rx, struct rxrpc_connection *conn, const struct rxrpc_security *sec, - struct key *key, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -170,7 +169,6 @@ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx, conn->security_ix = sp->hdr.securityIndex; conn->out_clientflag = 0; conn->security = sec; - conn->server_key = key_get(key); if (conn->security_ix) conn->state = RXRPC_CONN_SERVICE_UNSECURED; else diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 667c44aa5a63..dc201363f2c4 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -430,7 +430,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) return; } - if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST) { + if (state == RXRPC_CALL_SERVER_RECV_REQUEST) { unsigned long timo = READ_ONCE(call->next_req_timo); unsigned long now, expect_req_by; diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index f6c59f5fae9d..9aae99d67833 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -8,20 +8,25 @@ #include <net/af_rxrpc.h> #include "ar-internal.h" -static int none_init_connection_security(struct rxrpc_connection *conn) +static int none_init_connection_security(struct rxrpc_connection *conn, + struct rxrpc_key_token *token) { return 0; } -static int none_prime_packet_security(struct rxrpc_connection *conn) +/* + * Work out how much data we can put in an unsecured packet. 
+ */ +static int none_how_much_data(struct rxrpc_call *call, size_t remain, + size_t *_buf_size, size_t *_data_size, size_t *_offset) { + *_buf_size = *_data_size = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); + *_offset = 0; return 0; } -static int none_secure_packet(struct rxrpc_call *call, - struct sk_buff *skb, - size_t data_size, - void *sechdr) +static int none_secure_packet(struct rxrpc_call *call, struct sk_buff *skb, + size_t data_size) { return 0; } @@ -86,8 +91,8 @@ const struct rxrpc_security rxrpc_no_security = { .init = none_init, .exit = none_exit, .init_connection_security = none_init_connection_security, - .prime_packet_security = none_prime_packet_security, .free_call_crypto = none_free_call_crypto, + .how_much_data = none_how_much_data, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, .locate_data = none_locate_data, diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 2e8bd3b97301..8d2073e0e3da 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -5,7 +5,7 @@ * Written by David Howells (dhowells@redhat.com) * * RxRPC keys should have a description of describing their purpose: - * "afs@CAMBRIDGE.REDHAT.COM> + * "afs@example.com" */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -23,13 +23,9 @@ #include <keys/user-type.h> #include "ar-internal.h" -static int rxrpc_vet_description_s(const char *); static int rxrpc_preparse(struct key_preparsed_payload *); -static int rxrpc_preparse_s(struct key_preparsed_payload *); static void rxrpc_free_preparse(struct key_preparsed_payload *); -static void rxrpc_free_preparse_s(struct key_preparsed_payload *); static void rxrpc_destroy(struct key *); -static void rxrpc_destroy_s(struct key *); static void rxrpc_describe(const struct key *, struct seq_file *); static long rxrpc_read(const struct key *, char *, size_t); @@ -50,38 +46,6 @@ struct key_type key_type_rxrpc = { EXPORT_SYMBOL(key_type_rxrpc); /* - * rxrpc server defined keys take "<serviceId>:<securityIndex>" as the - * 
description and an 8-byte decryption key as the payload - */ -struct key_type key_type_rxrpc_s = { - .name = "rxrpc_s", - .flags = KEY_TYPE_NET_DOMAIN, - .vet_description = rxrpc_vet_description_s, - .preparse = rxrpc_preparse_s, - .free_preparse = rxrpc_free_preparse_s, - .instantiate = generic_key_instantiate, - .destroy = rxrpc_destroy_s, - .describe = rxrpc_describe, -}; - -/* - * Vet the description for an RxRPC server key - */ -static int rxrpc_vet_description_s(const char *desc) -{ - unsigned long num; - char *p; - - num = simple_strtoul(desc, &p, 10); - if (*p != ':' || num > 65535) - return -EINVAL; - num = simple_strtoul(p + 1, &p, 10); - if (*p || num < 1 || num > 255) - return -EINVAL; - return 0; -} - -/* * parse an RxKAD type XDR format token * - the caller guarantees we have at least 4 words */ @@ -165,402 +129,17 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, return 0; } -static void rxrpc_free_krb5_principal(struct krb5_principal *princ) -{ - int loop; - - if (princ->name_parts) { - for (loop = princ->n_name_parts - 1; loop >= 0; loop--) - kfree(princ->name_parts[loop]); - kfree(princ->name_parts); - } - kfree(princ->realm); -} - -static void rxrpc_free_krb5_tagged(struct krb5_tagged_data *td) -{ - kfree(td->data); -} - -/* - * free up an RxK5 token - */ -static void rxrpc_rxk5_free(struct rxk5_key *rxk5) -{ - int loop; - - rxrpc_free_krb5_principal(&rxk5->client); - rxrpc_free_krb5_principal(&rxk5->server); - rxrpc_free_krb5_tagged(&rxk5->session); - - if (rxk5->addresses) { - for (loop = rxk5->n_addresses - 1; loop >= 0; loop--) - rxrpc_free_krb5_tagged(&rxk5->addresses[loop]); - kfree(rxk5->addresses); - } - if (rxk5->authdata) { - for (loop = rxk5->n_authdata - 1; loop >= 0; loop--) - rxrpc_free_krb5_tagged(&rxk5->authdata[loop]); - kfree(rxk5->authdata); - } - - kfree(rxk5->ticket); - kfree(rxk5->ticket2); - kfree(rxk5); -} - -/* - * extract a krb5 principal - */ -static int rxrpc_krb5_decode_principal(struct 
krb5_principal *princ, - const __be32 **_xdr, - unsigned int *_toklen) -{ - const __be32 *xdr = *_xdr; - unsigned int toklen = *_toklen, n_parts, loop, tmp, paddedlen; - - /* there must be at least one name, and at least #names+1 length - * words */ - if (toklen <= 12) - return -EINVAL; - - _enter(",{%x,%x,%x},%u", - ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), toklen); - - n_parts = ntohl(*xdr++); - toklen -= 4; - if (n_parts <= 0 || n_parts > AFSTOKEN_K5_COMPONENTS_MAX) - return -EINVAL; - princ->n_name_parts = n_parts; - - if (toklen <= (n_parts + 1) * 4) - return -EINVAL; - - princ->name_parts = kcalloc(n_parts, sizeof(char *), GFP_KERNEL); - if (!princ->name_parts) - return -ENOMEM; - - for (loop = 0; loop < n_parts; loop++) { - if (toklen < 4) - return -EINVAL; - tmp = ntohl(*xdr++); - toklen -= 4; - if (tmp <= 0 || tmp > AFSTOKEN_STRING_MAX) - return -EINVAL; - paddedlen = (tmp + 3) & ~3; - if (paddedlen > toklen) - return -EINVAL; - princ->name_parts[loop] = kmalloc(tmp + 1, GFP_KERNEL); - if (!princ->name_parts[loop]) - return -ENOMEM; - memcpy(princ->name_parts[loop], xdr, tmp); - princ->name_parts[loop][tmp] = 0; - toklen -= paddedlen; - xdr += paddedlen >> 2; - } - - if (toklen < 4) - return -EINVAL; - tmp = ntohl(*xdr++); - toklen -= 4; - if (tmp <= 0 || tmp > AFSTOKEN_K5_REALM_MAX) - return -EINVAL; - paddedlen = (tmp + 3) & ~3; - if (paddedlen > toklen) - return -EINVAL; - princ->realm = kmalloc(tmp + 1, GFP_KERNEL); - if (!princ->realm) - return -ENOMEM; - memcpy(princ->realm, xdr, tmp); - princ->realm[tmp] = 0; - toklen -= paddedlen; - xdr += paddedlen >> 2; - - _debug("%s/...@%s", princ->name_parts[0], princ->realm); - - *_xdr = xdr; - *_toklen = toklen; - _leave(" = 0 [toklen=%u]", toklen); - return 0; -} - -/* - * extract a piece of krb5 tagged data - */ -static int rxrpc_krb5_decode_tagged_data(struct krb5_tagged_data *td, - size_t max_data_size, - const __be32 **_xdr, - unsigned int *_toklen) -{ - const __be32 *xdr = *_xdr; - unsigned int 
toklen = *_toklen, len, paddedlen; - - /* there must be at least one tag and one length word */ - if (toklen <= 8) - return -EINVAL; - - _enter(",%zu,{%x,%x},%u", - max_data_size, ntohl(xdr[0]), ntohl(xdr[1]), toklen); - - td->tag = ntohl(*xdr++); - len = ntohl(*xdr++); - toklen -= 8; - if (len > max_data_size) - return -EINVAL; - paddedlen = (len + 3) & ~3; - if (paddedlen > toklen) - return -EINVAL; - td->data_len = len; - - if (len > 0) { - td->data = kmemdup(xdr, len, GFP_KERNEL); - if (!td->data) - return -ENOMEM; - toklen -= paddedlen; - xdr += paddedlen >> 2; - } - - _debug("tag %x len %x", td->tag, td->data_len); - - *_xdr = xdr; - *_toklen = toklen; - _leave(" = 0 [toklen=%u]", toklen); - return 0; -} - -/* - * extract an array of tagged data - */ -static int rxrpc_krb5_decode_tagged_array(struct krb5_tagged_data **_td, - u8 *_n_elem, - u8 max_n_elem, - size_t max_elem_size, - const __be32 **_xdr, - unsigned int *_toklen) -{ - struct krb5_tagged_data *td; - const __be32 *xdr = *_xdr; - unsigned int toklen = *_toklen, n_elem, loop; - int ret; - - /* there must be at least one count */ - if (toklen < 4) - return -EINVAL; - - _enter(",,%u,%zu,{%x},%u", - max_n_elem, max_elem_size, ntohl(xdr[0]), toklen); - - n_elem = ntohl(*xdr++); - toklen -= 4; - if (n_elem > max_n_elem) - return -EINVAL; - *_n_elem = n_elem; - if (n_elem > 0) { - if (toklen <= (n_elem + 1) * 4) - return -EINVAL; - - _debug("n_elem %d", n_elem); - - td = kcalloc(n_elem, sizeof(struct krb5_tagged_data), - GFP_KERNEL); - if (!td) - return -ENOMEM; - *_td = td; - - for (loop = 0; loop < n_elem; loop++) { - ret = rxrpc_krb5_decode_tagged_data(&td[loop], - max_elem_size, - &xdr, &toklen); - if (ret < 0) - return ret; - } - } - - *_xdr = xdr; - *_toklen = toklen; - _leave(" = 0 [toklen=%u]", toklen); - return 0; -} - -/* - * extract a krb5 ticket - */ -static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen, - const __be32 **_xdr, unsigned int *_toklen) -{ - const __be32 *xdr = *_xdr; - 
unsigned int toklen = *_toklen, len, paddedlen; - - /* there must be at least one length word */ - if (toklen <= 4) - return -EINVAL; - - _enter(",{%x},%u", ntohl(xdr[0]), toklen); - - len = ntohl(*xdr++); - toklen -= 4; - if (len > AFSTOKEN_K5_TIX_MAX) - return -EINVAL; - paddedlen = (len + 3) & ~3; - if (paddedlen > toklen) - return -EINVAL; - *_tktlen = len; - - _debug("ticket len %u", len); - - if (len > 0) { - *_ticket = kmemdup(xdr, len, GFP_KERNEL); - if (!*_ticket) - return -ENOMEM; - toklen -= paddedlen; - xdr += paddedlen >> 2; - } - - *_xdr = xdr; - *_toklen = toklen; - _leave(" = 0 [toklen=%u]", toklen); - return 0; -} - -/* - * parse an RxK5 type XDR format token - * - the caller guarantees we have at least 4 words - */ -static int rxrpc_preparse_xdr_rxk5(struct key_preparsed_payload *prep, - size_t datalen, - const __be32 *xdr, unsigned int toklen) -{ - struct rxrpc_key_token *token, **pptoken; - struct rxk5_key *rxk5; - const __be32 *end_xdr = xdr + (toklen >> 2); - time64_t expiry; - int ret; - - _enter(",{%x,%x,%x,%x},%u", - ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), - toklen); - - /* reserve some payload space for this subkey - the length of the token - * is a reasonable approximation */ - prep->quotalen = datalen + toklen; - - token = kzalloc(sizeof(*token), GFP_KERNEL); - if (!token) - return -ENOMEM; - - rxk5 = kzalloc(sizeof(*rxk5), GFP_KERNEL); - if (!rxk5) { - kfree(token); - return -ENOMEM; - } - - token->security_index = RXRPC_SECURITY_RXK5; - token->k5 = rxk5; - - /* extract the principals */ - ret = rxrpc_krb5_decode_principal(&rxk5->client, &xdr, &toklen); - if (ret < 0) - goto error; - ret = rxrpc_krb5_decode_principal(&rxk5->server, &xdr, &toklen); - if (ret < 0) - goto error; - - /* extract the session key and the encoding type (the tag field -> - * ENCTYPE_xxx) */ - ret = rxrpc_krb5_decode_tagged_data(&rxk5->session, AFSTOKEN_DATA_MAX, - &xdr, &toklen); - if (ret < 0) - goto error; - - if (toklen < 4 * 8 + 2 * 4) - 
goto inval; - rxk5->authtime = be64_to_cpup((const __be64 *) xdr); - xdr += 2; - rxk5->starttime = be64_to_cpup((const __be64 *) xdr); - xdr += 2; - rxk5->endtime = be64_to_cpup((const __be64 *) xdr); - xdr += 2; - rxk5->renew_till = be64_to_cpup((const __be64 *) xdr); - xdr += 2; - rxk5->is_skey = ntohl(*xdr++); - rxk5->flags = ntohl(*xdr++); - toklen -= 4 * 8 + 2 * 4; - - _debug("times: a=%llx s=%llx e=%llx rt=%llx", - rxk5->authtime, rxk5->starttime, rxk5->endtime, - rxk5->renew_till); - _debug("is_skey=%x flags=%x", rxk5->is_skey, rxk5->flags); - - /* extract the permitted client addresses */ - ret = rxrpc_krb5_decode_tagged_array(&rxk5->addresses, - &rxk5->n_addresses, - AFSTOKEN_K5_ADDRESSES_MAX, - AFSTOKEN_DATA_MAX, - &xdr, &toklen); - if (ret < 0) - goto error; - - ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); - - /* extract the tickets */ - ret = rxrpc_krb5_decode_ticket(&rxk5->ticket, &rxk5->ticket_len, - &xdr, &toklen); - if (ret < 0) - goto error; - ret = rxrpc_krb5_decode_ticket(&rxk5->ticket2, &rxk5->ticket2_len, - &xdr, &toklen); - if (ret < 0) - goto error; - - ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); - - /* extract the typed auth data */ - ret = rxrpc_krb5_decode_tagged_array(&rxk5->authdata, - &rxk5->n_authdata, - AFSTOKEN_K5_AUTHDATA_MAX, - AFSTOKEN_BDATALN_MAX, - &xdr, &toklen); - if (ret < 0) - goto error; - - ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); - - if (toklen != 0) - goto inval; - - /* attach the payload */ - for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; - *pptoken; - pptoken = &(*pptoken)->next) - continue; - *pptoken = token; - expiry = rxrpc_u32_to_time64(token->k5->endtime); - if (expiry < prep->expiry) - prep->expiry = expiry; - - _leave(" = 0"); - return 0; - -inval: - ret = -EINVAL; -error: - rxrpc_rxk5_free(rxk5); - kfree(token); - _leave(" = %d", ret); - return ret; -} - /* * attempt to parse the data as the XDR format * - the caller guarantees we have more than 7 words */ static int 
rxrpc_preparse_xdr(struct key_preparsed_payload *prep) { - const __be32 *xdr = prep->data, *token; + const __be32 *xdr = prep->data, *token, *p; const char *cp; unsigned int len, paddedlen, loop, ntoken, toklen, sec_ix; size_t datalen = prep->datalen; - int ret; + int ret, ret2; _enter(",{%x,%x,%x,%x},%zu", ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), @@ -610,20 +189,20 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) goto not_xdr; /* check each token wrapper */ - token = xdr; + p = xdr; loop = ntoken; do { if (datalen < 8) goto not_xdr; - toklen = ntohl(*xdr++); - sec_ix = ntohl(*xdr); + toklen = ntohl(*p++); + sec_ix = ntohl(*p); datalen -= 4; _debug("token: [%x/%zx] %x", toklen, datalen, sec_ix); paddedlen = (toklen + 3) & ~3; if (toklen < 20 || toklen > datalen || paddedlen > datalen) goto not_xdr; datalen -= paddedlen; - xdr += paddedlen >> 2; + p += paddedlen >> 2; } while (--loop > 0); @@ -634,44 +213,50 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) /* okay: we're going to assume it's valid XDR format * - we ignore the cellname, relying on the key to be correctly named */ + ret = -EPROTONOSUPPORT; do { - xdr = token; toklen = ntohl(*xdr++); - token = xdr + ((toklen + 3) >> 2); - sec_ix = ntohl(*xdr++); + token = xdr; + xdr += (toklen + 3) / 4; + + sec_ix = ntohl(*token++); toklen -= 4; - _debug("TOKEN type=%u [%p-%p]", sec_ix, xdr, token); + _debug("TOKEN type=%x len=%x", sec_ix, toklen); switch (sec_ix) { case RXRPC_SECURITY_RXKAD: - ret = rxrpc_preparse_xdr_rxkad(prep, datalen, xdr, toklen); - if (ret != 0) - goto error; + ret2 = rxrpc_preparse_xdr_rxkad(prep, datalen, token, toklen); + break; + default: + ret2 = -EPROTONOSUPPORT; break; + } - case RXRPC_SECURITY_RXK5: - ret = rxrpc_preparse_xdr_rxk5(prep, datalen, xdr, toklen); + switch (ret2) { + case 0: + ret = 0; + break; + case -EPROTONOSUPPORT: + break; + case -ENOPKG: if (ret != 0) - goto error; + ret = -ENOPKG; break; - default: - ret = 
-EPROTONOSUPPORT; + ret = ret2; goto error; } } while (--ntoken > 0); - _leave(" = 0"); - return 0; +error: + _leave(" = %d", ret); + return ret; not_xdr: _leave(" = -EPROTO"); return -EPROTO; -error: - _leave(" = %d", ret); - return ret; } /* @@ -805,10 +390,6 @@ static void rxrpc_free_token_list(struct rxrpc_key_token *token) case RXRPC_SECURITY_RXKAD: kfree(token->kad); break; - case RXRPC_SECURITY_RXK5: - if (token->k5) - rxrpc_rxk5_free(token->k5); - break; default: pr_err("Unknown token type %x on rxrpc key\n", token->security_index); @@ -828,45 +409,6 @@ static void rxrpc_free_preparse(struct key_preparsed_payload *prep) } /* - * Preparse a server secret key. - * - * The data should be the 8-byte secret key. - */ -static int rxrpc_preparse_s(struct key_preparsed_payload *prep) -{ - struct crypto_skcipher *ci; - - _enter("%zu", prep->datalen); - - if (prep->datalen != 8) - return -EINVAL; - - memcpy(&prep->payload.data[2], prep->data, 8); - - ci = crypto_alloc_skcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(ci)) { - _leave(" = %ld", PTR_ERR(ci)); - return PTR_ERR(ci); - } - - if (crypto_skcipher_setkey(ci, prep->data, 8) < 0) - BUG(); - - prep->payload.data[0] = ci; - _leave(" = 0"); - return 0; -} - -/* - * Clean up preparse data. 
- */ -static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep) -{ - if (prep->payload.data[0]) - crypto_free_skcipher(prep->payload.data[0]); -} - -/* * dispose of the data dangling from the corpse of a rxrpc key */ static void rxrpc_destroy(struct key *key) @@ -875,22 +417,29 @@ static void rxrpc_destroy(struct key *key) } /* - * dispose of the data dangling from the corpse of a rxrpc key - */ -static void rxrpc_destroy_s(struct key *key) -{ - if (key->payload.data[0]) { - crypto_free_skcipher(key->payload.data[0]); - key->payload.data[0] = NULL; - } -} - -/* * describe the rxrpc key */ static void rxrpc_describe(const struct key *key, struct seq_file *m) { + const struct rxrpc_key_token *token; + const char *sep = ": "; + seq_puts(m, key->description); + + for (token = key->payload.data[0]; token; token = token->next) { + seq_puts(m, sep); + + switch (token->security_index) { + case RXRPC_SECURITY_RXKAD: + seq_puts(m, "ka"); + break; + default: /* we have a ticket we can't encode */ + seq_printf(m, "%u", token->security_index); + break; + } + + sep = " "; + } } /* @@ -924,36 +473,6 @@ int rxrpc_request_key(struct rxrpc_sock *rx, sockptr_t optval, int optlen) } /* - * grab the security keyring for a server socket - */ -int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) -{ - struct key *key; - char *description; - - _enter(""); - - if (optlen <= 0 || optlen > PAGE_SIZE - 1) - return -EINVAL; - - description = memdup_sockptr_nul(optval, optlen); - if (IS_ERR(description)) - return PTR_ERR(description); - - key = request_key(&key_type_keyring, description, NULL); - if (IS_ERR(key)) { - kfree(description); - _leave(" = %ld", PTR_ERR(key)); - return PTR_ERR(key); - } - - rx->securities = key; - kfree(description); - _leave(" = 0 [key %x]", key->serial); - return 0; -} - -/* * generate a server data key */ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, @@ -1044,12 +563,10 @@ static long rxrpc_read(const struct key 
*key, char *buffer, size_t buflen) { const struct rxrpc_key_token *token; - const struct krb5_principal *princ; size_t size; __be32 *xdr, *oldxdr; u32 cnlen, toksize, ntoks, tok, zero; u16 toksizes[AFSTOKEN_MAX]; - int loop; _enter(""); @@ -1074,42 +591,14 @@ static long rxrpc_read(const struct key *key, case RXRPC_SECURITY_RXKAD: toksize += 8 * 4; /* viceid, kvno, key*2, begin, * end, primary, tktlen */ - toksize += RND(token->kad->ticket_len); - break; - - case RXRPC_SECURITY_RXK5: - princ = &token->k5->client; - toksize += 4 + princ->n_name_parts * 4; - for (loop = 0; loop < princ->n_name_parts; loop++) - toksize += RND(strlen(princ->name_parts[loop])); - toksize += 4 + RND(strlen(princ->realm)); - - princ = &token->k5->server; - toksize += 4 + princ->n_name_parts * 4; - for (loop = 0; loop < princ->n_name_parts; loop++) - toksize += RND(strlen(princ->name_parts[loop])); - toksize += 4 + RND(strlen(princ->realm)); - - toksize += 8 + RND(token->k5->session.data_len); - - toksize += 4 * 8 + 2 * 4; - - toksize += 4 + token->k5->n_addresses * 8; - for (loop = 0; loop < token->k5->n_addresses; loop++) - toksize += RND(token->k5->addresses[loop].data_len); - - toksize += 4 + RND(token->k5->ticket_len); - toksize += 4 + RND(token->k5->ticket2_len); - - toksize += 4 + token->k5->n_authdata * 8; - for (loop = 0; loop < token->k5->n_authdata; loop++) - toksize += RND(token->k5->authdata[loop].data_len); + if (!token->no_leak_key) + toksize += RND(token->kad->ticket_len); break; default: /* we have a ticket we can't encode */ pr_err("Unsupported key token type (%u)\n", token->security_index); - continue; + return -ENOPKG; } _debug("token[%u]: toksize=%u", ntoks, toksize); @@ -1178,53 +667,16 @@ static long rxrpc_read(const struct key *key, ENCODE(token->kad->start); ENCODE(token->kad->expiry); ENCODE(token->kad->primary_flag); - ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); - break; - - case RXRPC_SECURITY_RXK5: - princ = &token->k5->client; - 
ENCODE(princ->n_name_parts); - for (loop = 0; loop < princ->n_name_parts; loop++) - ENCODE_STR(princ->name_parts[loop]); - ENCODE_STR(princ->realm); - - princ = &token->k5->server; - ENCODE(princ->n_name_parts); - for (loop = 0; loop < princ->n_name_parts; loop++) - ENCODE_STR(princ->name_parts[loop]); - ENCODE_STR(princ->realm); - - ENCODE(token->k5->session.tag); - ENCODE_DATA(token->k5->session.data_len, - token->k5->session.data); - - ENCODE64(token->k5->authtime); - ENCODE64(token->k5->starttime); - ENCODE64(token->k5->endtime); - ENCODE64(token->k5->renew_till); - ENCODE(token->k5->is_skey); - ENCODE(token->k5->flags); - - ENCODE(token->k5->n_addresses); - for (loop = 0; loop < token->k5->n_addresses; loop++) { - ENCODE(token->k5->addresses[loop].tag); - ENCODE_DATA(token->k5->addresses[loop].data_len, - token->k5->addresses[loop].data); - } - - ENCODE_DATA(token->k5->ticket_len, token->k5->ticket); - ENCODE_DATA(token->k5->ticket2_len, token->k5->ticket2); - - ENCODE(token->k5->n_authdata); - for (loop = 0; loop < token->k5->n_authdata; loop++) { - ENCODE(token->k5->authdata[loop].tag); - ENCODE_DATA(token->k5->authdata[loop].data_len, - token->k5->authdata[loop].data); - } + if (token->no_leak_key) + ENCODE(0); + else + ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); break; default: - break; + pr_err("Unsupported key token type (%u)\n", + token->security_index); + return -ENOPKG; } ASSERTCMP((unsigned long)xdr - (unsigned long)oldxdr, ==, diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 8c2881054266..a4111408ffd0 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -16,6 +16,7 @@ #include <linux/hashtable.h> #include <net/sock.h> #include <net/udp.h> +#include <net/udp_tunnel.h> #include <net/af_rxrpc.h> #include "ar-internal.h" @@ -106,58 +107,44 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, */ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) { + struct 
udp_tunnel_sock_cfg tuncfg = {NULL}; + struct sockaddr_rxrpc *srx = &local->srx; + struct udp_port_cfg udp_conf = {0}; struct sock *usk; int ret; _enter("%p{%d,%d}", - local, local->srx.transport_type, local->srx.transport.family); + local, srx->transport_type, srx->transport.family); - /* create a socket to represent the local endpoint */ - ret = sock_create_kern(net, local->srx.transport.family, - local->srx.transport_type, 0, &local->socket); + udp_conf.family = srx->transport.family; + if (udp_conf.family == AF_INET) { + udp_conf.local_ip = srx->transport.sin.sin_addr; + udp_conf.local_udp_port = srx->transport.sin.sin_port; +#if IS_ENABLED(CONFIG_AF_RXRPC_IPV6) + } else { + udp_conf.local_ip6 = srx->transport.sin6.sin6_addr; + udp_conf.local_udp_port = srx->transport.sin6.sin6_port; +#endif + } + ret = udp_sock_create(net, &udp_conf, &local->socket); if (ret < 0) { _leave(" = %d [socket]", ret); return ret; } + tuncfg.encap_type = UDP_ENCAP_RXRPC; + tuncfg.encap_rcv = rxrpc_input_packet; + tuncfg.sk_user_data = local; + setup_udp_tunnel_sock(net, local->socket, &tuncfg); + /* set the socket up */ usk = local->socket->sk; - inet_sk(usk)->mc_loop = 0; - - /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ - inet_inc_convert_csum(usk); - - rcu_assign_sk_user_data(usk, local); - - udp_sk(usk)->encap_type = UDP_ENCAP_RXRPC; - udp_sk(usk)->encap_rcv = rxrpc_input_packet; - udp_sk(usk)->encap_destroy = NULL; - udp_sk(usk)->gro_receive = NULL; - udp_sk(usk)->gro_complete = NULL; - - udp_encap_enable(); -#if IS_ENABLED(CONFIG_AF_RXRPC_IPV6) - if (local->srx.transport.family == AF_INET6) - udpv6_encap_enable(); -#endif usk->sk_error_report = rxrpc_error_report; - /* if a local address was supplied then bind it */ - if (local->srx.transport_len > sizeof(sa_family_t)) { - _debug("bind"); - ret = kernel_bind(local->socket, - (struct sockaddr *)&local->srx.transport, - local->srx.transport_len); - if (ret < 0) { - _debug("bind failed %d", ret); - goto error; 
- } - } - - switch (local->srx.transport.family) { + switch (srx->transport.family) { case AF_INET6: /* we want to receive ICMPv6 errors */ - ip6_sock_set_recverr(local->socket->sk); + ip6_sock_set_recverr(usk); /* Fall through and set IPv4 options too otherwise we don't get * errors from IPv4 packets sent through the IPv6 socket. @@ -165,13 +152,13 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) fallthrough; case AF_INET: /* we want to receive ICMP errors */ - ip_sock_set_recverr(local->socket->sk); + ip_sock_set_recverr(usk); /* we want to set the don't fragment bit */ - ip_sock_set_mtu_discover(local->socket->sk, IP_PMTUDISC_DO); + ip_sock_set_mtu_discover(usk, IP_PMTUDISC_DO); /* We want receive timestamps. */ - sock_enable_timestamps(local->socket->sk); + sock_enable_timestamps(usk); break; default: @@ -180,15 +167,6 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) _leave(" = 0"); return 0; - -error: - kernel_sock_shutdown(local->socket, SHUT_RDWR); - local->socket->sk->sk_user_data = NULL; - sock_release(local->socket); - local->socket = NULL; - - _leave(" = %d", ret); - return ret; } /* diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 2c842851d72e..fef3573fdc8b 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -69,7 +69,7 @@ bool __rxrpc_set_call_completion(struct rxrpc_call *call, if (call->state < RXRPC_CALL_COMPLETE) { call->abort_code = abort_code; call->error = error; - call->completion = compl, + call->completion = compl; call->state = RXRPC_CALL_COMPLETE; trace_rxrpc_call_complete(call); wake_up(&call->waitq); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index f114dc2af5cf..e2e9e9b0a6d7 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -15,6 +15,7 @@ #include <linux/scatterlist.h> #include <linux/ctype.h> #include <linux/slab.h> +#include <linux/key-type.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <keys/rxrpc-type.h> @@ -27,6 +28,7 @@ #define 
INST_SZ 40 /* size of principal's instance */ #define REALM_SZ 40 /* size of principal's auth domain */ #define SNAME_SZ 40 /* size of service name */ +#define RXKAD_ALIGN 8 struct rxkad_level1_hdr { __be32 data_size; /* true data size (excluding padding) */ @@ -37,6 +39,9 @@ struct rxkad_level2_hdr { __be32 checksum; /* decrypted data checksum */ }; +static int rxkad_prime_packet_security(struct rxrpc_connection *conn, + struct crypto_sync_skcipher *ci); + /* * this holds a pinned cipher so that keventd doesn't get called by the cipher * alloc routine, but since we have it to hand, we use it to decrypt RESPONSE @@ -47,17 +52,59 @@ static struct skcipher_request *rxkad_ci_req; static DEFINE_MUTEX(rxkad_ci_mutex); /* + * Parse the information from a server key + * + * The data should be the 8-byte secret key. + */ +static int rxkad_preparse_server_key(struct key_preparsed_payload *prep) +{ + struct crypto_skcipher *ci; + + if (prep->datalen != 8) + return -EINVAL; + + memcpy(&prep->payload.data[2], prep->data, 8); + + ci = crypto_alloc_skcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(ci)) { + _leave(" = %ld", PTR_ERR(ci)); + return PTR_ERR(ci); + } + + if (crypto_skcipher_setkey(ci, prep->data, 8) < 0) + BUG(); + + prep->payload.data[0] = ci; + _leave(" = 0"); + return 0; +} + +static void rxkad_free_preparse_server_key(struct key_preparsed_payload *prep) +{ + + if (prep->payload.data[0]) + crypto_free_skcipher(prep->payload.data[0]); +} + +static void rxkad_destroy_server_key(struct key *key) +{ + if (key->payload.data[0]) { + crypto_free_skcipher(key->payload.data[0]); + key->payload.data[0] = NULL; + } +} + +/* * initialise connection security */ -static int rxkad_init_connection_security(struct rxrpc_connection *conn) +static int rxkad_init_connection_security(struct rxrpc_connection *conn, + struct rxrpc_key_token *token) { struct crypto_sync_skcipher *ci; - struct rxrpc_key_token *token; int ret; _enter("{%d},{%x}", conn->debug_id, 
key_serial(conn->params.key)); - token = conn->params.key->payload.data[0]; conn->security_ix = token->security_index; ci = crypto_alloc_sync_skcipher("pcbc(fcrypt)", 0, 0); @@ -73,32 +120,68 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn) switch (conn->params.security_level) { case RXRPC_SECURITY_PLAIN: - break; case RXRPC_SECURITY_AUTH: - conn->size_align = 8; - conn->security_size = sizeof(struct rxkad_level1_hdr); - break; case RXRPC_SECURITY_ENCRYPT: - conn->size_align = 8; - conn->security_size = sizeof(struct rxkad_level2_hdr); break; default: ret = -EKEYREJECTED; goto error; } - conn->cipher = ci; - ret = 0; + ret = rxkad_prime_packet_security(conn, ci); + if (ret < 0) + goto error_ci; + + conn->rxkad.cipher = ci; + return 0; + +error_ci: + crypto_free_sync_skcipher(ci); error: _leave(" = %d", ret); return ret; } /* + * Work out how much data we can put in a packet. + */ +static int rxkad_how_much_data(struct rxrpc_call *call, size_t remain, + size_t *_buf_size, size_t *_data_size, size_t *_offset) +{ + size_t shdr, buf_size, chunk; + + switch (call->conn->params.security_level) { + default: + buf_size = chunk = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); + shdr = 0; + goto out; + case RXRPC_SECURITY_AUTH: + shdr = sizeof(struct rxkad_level1_hdr); + break; + case RXRPC_SECURITY_ENCRYPT: + shdr = sizeof(struct rxkad_level2_hdr); + break; + } + + buf_size = round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN); + + chunk = buf_size - shdr; + if (remain < chunk) + buf_size = round_up(shdr + remain, RXKAD_ALIGN); + +out: + *_buf_size = buf_size; + *_data_size = chunk; + *_offset = shdr; + return 0; +} + +/* * prime the encryption state with the invariant parts of a connection's * description */ -static int rxkad_prime_packet_security(struct rxrpc_connection *conn) +static int rxkad_prime_packet_security(struct rxrpc_connection *conn, + struct crypto_sync_skcipher *ci) { struct skcipher_request *req; struct rxrpc_key_token *token; @@ -116,7 
+199,7 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn) if (!tmpbuf) return -ENOMEM; - req = skcipher_request_alloc(&conn->cipher->base, GFP_NOFS); + req = skcipher_request_alloc(&ci->base, GFP_NOFS); if (!req) { kfree(tmpbuf); return -ENOMEM; @@ -131,13 +214,13 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn) tmpbuf[3] = htonl(conn->security_ix); sg_init_one(&sg, tmpbuf, tmpsize); - skcipher_request_set_sync_tfm(req, conn->cipher); + skcipher_request_set_sync_tfm(req, ci); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, tmpsize, iv.x); crypto_skcipher_encrypt(req); skcipher_request_free(req); - memcpy(&conn->csum_iv, tmpbuf + 2, sizeof(conn->csum_iv)); + memcpy(&conn->rxkad.csum_iv, tmpbuf + 2, sizeof(conn->rxkad.csum_iv)); kfree(tmpbuf); _leave(" = 0"); return 0; @@ -149,7 +232,7 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn) */ static struct skcipher_request *rxkad_get_call_crypto(struct rxrpc_call *call) { - struct crypto_skcipher *tfm = &call->conn->cipher->base; + struct crypto_skcipher *tfm = &call->conn->rxkad.cipher->base; struct skcipher_request *cipher_req = call->cipher_req; if (!cipher_req) { @@ -176,15 +259,14 @@ static void rxkad_free_call_crypto(struct rxrpc_call *call) * partially encrypt a packet (level 1 security) */ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, - struct sk_buff *skb, - u32 data_size, - void *sechdr, + struct sk_buff *skb, u32 data_size, struct skcipher_request *req) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxkad_level1_hdr hdr; struct rxrpc_crypt iv; struct scatterlist sg; + size_t pad; u16 check; _enter(""); @@ -193,13 +275,19 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, data_size |= (u32)check << 16; hdr.data_size = htonl(data_size); - memcpy(sechdr, &hdr, sizeof(hdr)); + memcpy(skb->head, &hdr, sizeof(hdr)); + + pad = sizeof(struct rxkad_level1_hdr) + 
data_size; + pad = RXKAD_ALIGN - pad; + pad &= RXKAD_ALIGN - 1; + if (pad) + skb_put_zero(skb, pad); /* start the encryption afresh */ memset(&iv, 0, sizeof(iv)); - sg_init_one(&sg, sechdr, 8); - skcipher_request_set_sync_tfm(req, call->conn->cipher); + sg_init_one(&sg, skb->head, 8); + skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); @@ -215,7 +303,6 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, struct sk_buff *skb, u32 data_size, - void *sechdr, struct skcipher_request *req) { const struct rxrpc_key_token *token; @@ -224,6 +311,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, struct rxrpc_crypt iv; struct scatterlist sg[16]; unsigned int len; + size_t pad; u16 check; int err; @@ -235,14 +323,20 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, rxkhdr.data_size = htonl(data_size | (u32)check << 16); rxkhdr.checksum = 0; - memcpy(sechdr, &rxkhdr, sizeof(rxkhdr)); + memcpy(skb->head, &rxkhdr, sizeof(rxkhdr)); + + pad = sizeof(struct rxkad_level2_hdr) + data_size; + pad = RXKAD_ALIGN - pad; + pad &= RXKAD_ALIGN - 1; + if (pad) + skb_put_zero(skb, pad); /* encrypt from the session key */ token = call->conn->params.key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - sg_init_one(&sg[0], sechdr, sizeof(rxkhdr)); - skcipher_request_set_sync_tfm(req, call->conn->cipher); + sg_init_one(&sg[0], skb->head, sizeof(rxkhdr)); + skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg[0], &sg[0], sizeof(rxkhdr), iv.x); crypto_skcipher_encrypt(req); @@ -252,11 +346,10 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, if (skb_shinfo(skb)->nr_frags > 16) goto out; 
- len = data_size + call->conn->size_align - 1; - len &= ~(call->conn->size_align - 1); + len = round_up(data_size, RXKAD_ALIGN); sg_init_table(sg, ARRAY_SIZE(sg)); - err = skb_to_sgvec(skb, sg, 0, len); + err = skb_to_sgvec(skb, sg, 8, len); if (unlikely(err < 0)) goto out; skcipher_request_set_crypt(req, sg, sg, len, iv.x); @@ -275,8 +368,7 @@ out: */ static int rxkad_secure_packet(struct rxrpc_call *call, struct sk_buff *skb, - size_t data_size, - void *sechdr) + size_t data_size) { struct rxrpc_skb_priv *sp; struct skcipher_request *req; @@ -291,7 +383,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, call->debug_id, key_serial(call->conn->params.key), sp->hdr.seq, data_size); - if (!call->conn->cipher) + if (!call->conn->rxkad.cipher) return 0; ret = key_validate(call->conn->params.key); @@ -303,7 +395,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, return -ENOMEM; /* continue encrypting from where we left off */ - memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); + memcpy(&iv, call->conn->rxkad.csum_iv.x, sizeof(iv)); /* calculate the security checksum */ x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); @@ -312,7 +404,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, call->crypto_buf[1] = htonl(x); sg_init_one(&sg, call->crypto_buf, 8); - skcipher_request_set_sync_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); @@ -329,12 +421,10 @@ static int rxkad_secure_packet(struct rxrpc_call *call, ret = 0; break; case RXRPC_SECURITY_AUTH: - ret = rxkad_secure_packet_auth(call, skb, data_size, sechdr, - req); + ret = rxkad_secure_packet_auth(call, skb, data_size, req); break; case RXRPC_SECURITY_ENCRYPT: - ret = rxkad_secure_packet_encrypt(call, skb, data_size, - sechdr, req); + ret = rxkad_secure_packet_encrypt(call, skb, data_size, req); break; 
default: ret = -EPERM; @@ -380,7 +470,7 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, /* start the decryption afresh */ memset(&iv, 0, sizeof(iv)); - skcipher_request_set_sync_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, 8, iv.x); crypto_skcipher_decrypt(req); @@ -472,7 +562,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, token = call->conn->params.key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - skcipher_request_set_sync_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, len, iv.x); crypto_skcipher_decrypt(req); @@ -538,7 +628,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, _enter("{%d{%x}},{#%u}", call->debug_id, key_serial(call->conn->params.key), seq); - if (!call->conn->cipher) + if (!call->conn->rxkad.cipher) return 0; req = rxkad_get_call_crypto(call); @@ -546,7 +636,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, return -ENOMEM; /* continue encrypting from where we left off */ - memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); + memcpy(&iv, call->conn->rxkad.csum_iv.x, sizeof(iv)); /* validate the security checksum */ x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); @@ -555,7 +645,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, call->crypto_buf[1] = htonl(x); sg_init_one(&sg, call->crypto_buf, 8); - skcipher_request_set_sync_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); @@ -648,16 +738,12 @@ static int 
rxkad_issue_challenge(struct rxrpc_connection *conn) u32 serial; int ret; - _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); + _enter("{%d}", conn->debug_id); - ret = key_validate(conn->server_key); - if (ret < 0) - return ret; - - get_random_bytes(&conn->security_nonce, sizeof(conn->security_nonce)); + get_random_bytes(&conn->rxkad.nonce, sizeof(conn->rxkad.nonce)); challenge.version = htonl(2); - challenge.nonce = htonl(conn->security_nonce); + challenge.nonce = htonl(conn->rxkad.nonce); challenge.min_level = htonl(0); challenge.__padding = 0; @@ -785,7 +871,7 @@ static int rxkad_encrypt_response(struct rxrpc_connection *conn, struct rxrpc_crypt iv; struct scatterlist sg[1]; - req = skcipher_request_alloc(&conn->cipher->base, GFP_NOFS); + req = skcipher_request_alloc(&conn->rxkad.cipher->base, GFP_NOFS); if (!req) return -ENOMEM; @@ -794,7 +880,7 @@ static int rxkad_encrypt_response(struct rxrpc_connection *conn, sg_init_table(sg, 1); sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); - skcipher_request_set_sync_tfm(req, conn->cipher); + skcipher_request_set_sync_tfm(req, conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); crypto_skcipher_encrypt(req); @@ -892,6 +978,7 @@ other_error: * decrypt the kerberos IV ticket in the response */ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, + struct key *server_key, struct sk_buff *skb, void *ticket, size_t ticket_len, struct rxrpc_crypt *_session_key, @@ -911,30 +998,17 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, u32 abort_code; u8 *p, *q, *name, *end; - _enter("{%d},{%x}", conn->debug_id, key_serial(conn->server_key)); + _enter("{%d},{%x}", conn->debug_id, key_serial(server_key)); *_expiry = 0; - ret = key_validate(conn->server_key); - if (ret < 0) { - switch (ret) { - case -EKEYEXPIRED: - abort_code = RXKADEXPIRED; - goto other_error; - default: - abort_code = 
RXKADNOAUTH; - goto other_error; - } - } - - ASSERT(conn->server_key->payload.data[0] != NULL); + ASSERT(server_key->payload.data[0] != NULL); ASSERTCMP((unsigned long) ticket & 7UL, ==, 0); - memcpy(&iv, &conn->server_key->payload.data[2], sizeof(iv)); + memcpy(&iv, &server_key->payload.data[2], sizeof(iv)); ret = -ENOMEM; - req = skcipher_request_alloc(conn->server_key->payload.data[0], - GFP_NOFS); + req = skcipher_request_alloc(server_key->payload.data[0], GFP_NOFS); if (!req) goto temporary_error; @@ -1090,6 +1164,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, struct rxkad_response *response; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt session_key; + struct key *server_key; const char *eproto; time64_t expiry; void *ticket; @@ -1097,7 +1172,27 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, __be32 csum; int ret, i; - _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); + _enter("{%d}", conn->debug_id); + + server_key = rxrpc_look_up_server_security(conn, skb, 0, 0); + if (IS_ERR(server_key)) { + switch (PTR_ERR(server_key)) { + case -ENOKEY: + abort_code = RXKADUNKNOWNKEY; + break; + case -EKEYEXPIRED: + abort_code = RXKADEXPIRED; + break; + default: + abort_code = RXKADNOAUTH; + break; + } + trace_rxrpc_abort(0, "SVK", + sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + abort_code, PTR_ERR(server_key)); + *_abort_code = abort_code; + return -EPROTO; + } ret = -ENOMEM; response = kzalloc(sizeof(struct rxkad_response), GFP_NOFS); @@ -1109,8 +1204,6 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), response, sizeof(*response)) < 0) goto protocol_error; - if (!pskb_pull(skb, sizeof(*response))) - BUG(); version = ntohl(response->version); ticket_len = ntohl(response->ticket_len); @@ -1141,12 +1234,12 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, eproto = tracepoint_string("rxkad_tkt_short"); 
abort_code = RXKADPACKETSHORT; - if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response), ticket, ticket_len) < 0) goto protocol_error_free; - ret = rxkad_decrypt_ticket(conn, skb, ticket, ticket_len, &session_key, - &expiry, _abort_code); + ret = rxkad_decrypt_ticket(conn, server_key, skb, ticket, ticket_len, + &session_key, &expiry, _abort_code); if (ret < 0) goto temporary_error_free_ticket; @@ -1196,7 +1289,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, eproto = tracepoint_string("rxkad_rsp_seq"); abort_code = RXKADOUTOFSEQUENCE; - if (ntohl(response->encrypted.inc_nonce) != conn->security_nonce + 1) + if (ntohl(response->encrypted.inc_nonce) != conn->rxkad.nonce + 1) goto protocol_error_free; eproto = tracepoint_string("rxkad_rsp_level"); @@ -1225,6 +1318,7 @@ protocol_error_free: protocol_error: kfree(response); trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto); + key_put(server_key); *_abort_code = abort_code; return -EPROTO; @@ -1237,6 +1331,7 @@ temporary_error: * ENOMEM. We just want to send the challenge again. Note that we * also come out this way if the ticket decryption fails. 
*/ + key_put(server_key); return ret; } @@ -1247,8 +1342,8 @@ static void rxkad_clear(struct rxrpc_connection *conn) { _enter(""); - if (conn->cipher) - crypto_free_sync_skcipher(conn->cipher); + if (conn->rxkad.cipher) + crypto_free_sync_skcipher(conn->rxkad.cipher); } /* @@ -1296,8 +1391,11 @@ const struct rxrpc_security rxkad = { .no_key_abort = RXKADUNKNOWNKEY, .init = rxkad_init, .exit = rxkad_exit, + .preparse_server_key = rxkad_preparse_server_key, + .free_preparse_server_key = rxkad_free_preparse_server_key, + .destroy_server_key = rxkad_destroy_server_key, .init_connection_security = rxkad_init_connection_security, - .prime_packet_security = rxkad_prime_packet_security, + .how_much_data = rxkad_how_much_data, .secure_packet = rxkad_secure_packet, .verify_packet = rxkad_verify_packet, .free_call_crypto = rxkad_free_call_crypto, diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 9b1fb9ed0717..50cb5f1ee0c0 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -55,7 +55,7 @@ void rxrpc_exit_security(void) /* * look up an rxrpc security module */ -static const struct rxrpc_security *rxrpc_security_lookup(u8 security_index) +const struct rxrpc_security *rxrpc_security_lookup(u8 security_index) { if (security_index >= ARRAY_SIZE(rxrpc_security_types)) return NULL; @@ -81,16 +81,17 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) if (ret < 0) return ret; - token = key->payload.data[0]; - if (!token) - return -EKEYREJECTED; + for (token = key->payload.data[0]; token; token = token->next) { + sec = rxrpc_security_lookup(token->security_index); + if (sec) + goto found; + } + return -EKEYREJECTED; - sec = rxrpc_security_lookup(token->security_index); - if (!sec) - return -EKEYREJECTED; +found: conn->security = sec; - ret = conn->security->init_connection_security(conn); + ret = conn->security->init_connection_security(conn, token); if (ret < 0) { conn->security = &rxrpc_no_security; return ret; @@ -101,22 +102,16 @@ int 
rxrpc_init_client_conn_security(struct rxrpc_connection *conn) } /* - * Find the security key for a server connection. + * Set the ops a server connection. */ -bool rxrpc_look_up_server_security(struct rxrpc_local *local, struct rxrpc_sock *rx, - const struct rxrpc_security **_sec, - struct key **_key, - struct sk_buff *skb) +const struct rxrpc_security *rxrpc_get_incoming_security(struct rxrpc_sock *rx, + struct sk_buff *skb) { const struct rxrpc_security *sec; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - key_ref_t kref = NULL; - char kdesc[5 + 1 + 3 + 1]; _enter(""); - sprintf(kdesc, "%u:%u", sp->hdr.serviceId, sp->hdr.securityIndex); - sec = rxrpc_security_lookup(sp->hdr.securityIndex); if (!sec) { trace_rxrpc_abort(0, "SVS", @@ -124,35 +119,72 @@ bool rxrpc_look_up_server_security(struct rxrpc_local *local, struct rxrpc_sock RX_INVALID_OPERATION, EKEYREJECTED); skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; skb->priority = RX_INVALID_OPERATION; - return false; + return NULL; } - if (sp->hdr.securityIndex == RXRPC_SECURITY_NONE) - goto out; - - if (!rx->securities) { + if (sp->hdr.securityIndex != RXRPC_SECURITY_NONE && + !rx->securities) { trace_rxrpc_abort(0, "SVR", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_INVALID_OPERATION, EKEYREJECTED); skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; - skb->priority = RX_INVALID_OPERATION; - return false; + skb->priority = sec->no_key_abort; + return NULL; } + return sec; +} + +/* + * Find the security key for a server connection. 
+ */ +struct key *rxrpc_look_up_server_security(struct rxrpc_connection *conn, + struct sk_buff *skb, + u32 kvno, u32 enctype) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_sock *rx; + struct key *key = ERR_PTR(-EKEYREJECTED); + key_ref_t kref = NULL; + char kdesc[5 + 1 + 3 + 1 + 12 + 1 + 12 + 1]; + int ret; + + _enter(""); + + if (enctype) + sprintf(kdesc, "%u:%u:%u:%u", + sp->hdr.serviceId, sp->hdr.securityIndex, kvno, enctype); + else if (kvno) + sprintf(kdesc, "%u:%u:%u", + sp->hdr.serviceId, sp->hdr.securityIndex, kvno); + else + sprintf(kdesc, "%u:%u", + sp->hdr.serviceId, sp->hdr.securityIndex); + + rcu_read_lock(); + + rx = rcu_dereference(conn->params.local->service); + if (!rx) + goto out; + /* look through the service's keyring */ kref = keyring_search(make_key_ref(rx->securities, 1UL), &key_type_rxrpc_s, kdesc, true); if (IS_ERR(kref)) { - trace_rxrpc_abort(0, "SVK", - sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - sec->no_key_abort, EKEYREJECTED); - skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; - skb->priority = sec->no_key_abort; - return false; + key = ERR_CAST(kref); + goto out; + } + + key = key_ref_to_ptr(kref); + + ret = key_validate(key); + if (ret < 0) { + key_put(key); + key = ERR_PTR(ret); + goto out; } out: - *_sec = sec; - *_key = key_ref_to_ptr(kref); - return true; + rcu_read_unlock(); + return key; } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index d27140c836cc..af8ad6c30b9f 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -327,7 +327,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, rxrpc_send_ack_packet(call, false, NULL); if (!skb) { - size_t size, chunk, max, space; + size_t remain, bufsize, chunk, offset; _debug("alloc"); @@ -342,24 +342,21 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, goto maybe_error; } - max = RXRPC_JUMBO_DATALEN; - max -= call->conn->security_size; - max &= ~(call->conn->size_align - 1UL); - - chunk = max; - if (chunk > msg_data_left(msg) && !more) - chunk = 
msg_data_left(msg); - - space = chunk + call->conn->size_align; - space &= ~(call->conn->size_align - 1UL); - - size = space + call->conn->security_size; + /* Work out the maximum size of a packet. Assume that + * the security header is going to be in the padded + * region (enc blocksize), but the trailer is not. + */ + remain = more ? INT_MAX : msg_data_left(msg); + ret = call->conn->security->how_much_data(call, remain, + &bufsize, &chunk, &offset); + if (ret < 0) + goto maybe_error; - _debug("SIZE: %zu/%zu/%zu", chunk, space, size); + _debug("SIZE: %zu/%zu @%zu", chunk, bufsize, offset); /* create a buffer that we can retain until it's ACK'd */ skb = sock_alloc_send_skb( - sk, size, msg->msg_flags & MSG_DONTWAIT, &ret); + sk, bufsize, msg->msg_flags & MSG_DONTWAIT, &ret); if (!skb) goto maybe_error; @@ -371,9 +368,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, ASSERTCMP(skb->mark, ==, 0); - _debug("HS: %u", call->conn->security_size); - skb_reserve(skb, call->conn->security_size); - skb->len += call->conn->security_size; + __skb_put(skb, offset); sp->remain = chunk; if (sp->remain > skb_tailroom(skb)) @@ -422,17 +417,6 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, (msg_data_left(msg) == 0 && !more)) { struct rxrpc_connection *conn = call->conn; uint32_t seq; - size_t pad; - - /* pad out if we're using security */ - if (conn->security_ix) { - pad = conn->security_size + skb->mark; - pad = conn->size_align - pad; - pad &= conn->size_align - 1; - _debug("pad %zu", pad); - if (pad) - skb_put_zero(skb, pad); - } seq = call->tx_top + 1; @@ -446,8 +430,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, call->tx_winsize) sp->hdr.flags |= RXRPC_MORE_PACKETS; - ret = call->security->secure_packet( - call, skb, skb->mark, skb->head); + ret = call->security->secure_packet(call, skb, skb->mark); if (ret < 0) goto out; diff --git a/net/rxrpc/server_key.c b/net/rxrpc/server_key.c new file mode 100644 index 000000000000..ead3471307ee --- /dev/null +++ 
b/net/rxrpc/server_key.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* RxRPC key management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * RxRPC keys should have a description of describing their purpose: + * "afs@CAMBRIDGE.REDHAT.COM> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <crypto/skcipher.h> +#include <linux/module.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/key-type.h> +#include <linux/ctype.h> +#include <linux/slab.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include <keys/rxrpc-type.h> +#include <keys/user-type.h> +#include "ar-internal.h" + +static int rxrpc_vet_description_s(const char *); +static int rxrpc_preparse_s(struct key_preparsed_payload *); +static void rxrpc_free_preparse_s(struct key_preparsed_payload *); +static void rxrpc_destroy_s(struct key *); +static void rxrpc_describe_s(const struct key *, struct seq_file *); + +/* + * rxrpc server keys take "<serviceId>:<securityIndex>[:<sec-specific>]" as the + * description and the key material as the payload. + */ +struct key_type key_type_rxrpc_s = { + .name = "rxrpc_s", + .flags = KEY_TYPE_NET_DOMAIN, + .vet_description = rxrpc_vet_description_s, + .preparse = rxrpc_preparse_s, + .free_preparse = rxrpc_free_preparse_s, + .instantiate = generic_key_instantiate, + .destroy = rxrpc_destroy_s, + .describe = rxrpc_describe_s, +}; + +/* + * Vet the description for an RxRPC server key. + */ +static int rxrpc_vet_description_s(const char *desc) +{ + unsigned long service, sec_class; + char *p; + + service = simple_strtoul(desc, &p, 10); + if (*p != ':' || service > 65535) + return -EINVAL; + sec_class = simple_strtoul(p + 1, &p, 10); + if ((*p && *p != ':') || sec_class < 1 || sec_class > 255) + return -EINVAL; + return 0; +} + +/* + * Preparse a server secret key. 
+ */ +static int rxrpc_preparse_s(struct key_preparsed_payload *prep) +{ + const struct rxrpc_security *sec; + unsigned int service, sec_class; + int n; + + _enter("%zu", prep->datalen); + + if (!prep->orig_description) + return -EINVAL; + + if (sscanf(prep->orig_description, "%u:%u%n", &service, &sec_class, &n) != 2) + return -EINVAL; + + sec = rxrpc_security_lookup(sec_class); + if (!sec) + return -ENOPKG; + + prep->payload.data[1] = (struct rxrpc_security *)sec; + + return sec->preparse_server_key(prep); +} + +static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep) +{ + const struct rxrpc_security *sec = prep->payload.data[1]; + + if (sec) + sec->free_preparse_server_key(prep); +} + +static void rxrpc_destroy_s(struct key *key) +{ + const struct rxrpc_security *sec = key->payload.data[1]; + + if (sec) + sec->destroy_server_key(key); +} + +static void rxrpc_describe_s(const struct key *key, struct seq_file *m) +{ + const struct rxrpc_security *sec = key->payload.data[1]; + + seq_puts(m, key->description); + if (sec && sec->describe_server_key) + sec->describe_server_key(key, m); +} + +/* + * grab the security keyring for a server socket + */ +int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) +{ + struct key *key; + char *description; + + _enter(""); + + if (optlen <= 0 || optlen > PAGE_SIZE - 1) + return -EINVAL; + + description = memdup_sockptr_nul(optval, optlen); + if (IS_ERR(description)) + return PTR_ERR(description); + + key = request_key(&key_type_keyring, description, NULL); + if (IS_ERR(key)) { + kfree(description); + _leave(" = %ld", PTR_ERR(key)); + return PTR_ERR(key); + } + + rx->securities = key; + kfree(description); + _leave(" = 0 [key %x]", key->serial); + return 0; +} diff --git a/net/sched/Kconfig b/net/sched/Kconfig index a3b37d88800e..1e8ab4749c6c 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -281,7 +281,7 @@ config NET_SCH_CHOKE help Say Y here if you want to use the CHOKe packet 
scheduler (CHOose and Keep for responsive flows, CHOose and Kill for unresponsive - flows). This is a variation of RED which trys to penalize flows + flows). This is a variation of RED which tries to penalize flows that monopolize the queue. To compile this code as a module, choose M here: the @@ -813,7 +813,7 @@ config NET_ACT_SAMPLE config NET_ACT_IPT tristate "IPtables targets" - depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + depends on NET_CLS_ACT && NETFILTER && NETFILTER_XTABLES help Say Y here to be able to invoke iptables targets after successful classification. @@ -912,7 +912,7 @@ config NET_ACT_BPF config NET_ACT_CONNMARK tristate "Netfilter Connection Mark Retriever" - depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + depends on NET_CLS_ACT && NETFILTER depends on NF_CONNTRACK && NF_CONNTRACK_MARK help Say Y here to allow retrieving of conn mark @@ -924,7 +924,7 @@ config NET_ACT_CONNMARK config NET_ACT_CTINFO tristate "Netfilter Connection Mark Actions" - depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + depends on NET_CLS_ACT && NETFILTER depends on NF_CONNTRACK && NF_CONNTRACK_MARK help Say Y here to allow transfer of a connmark stored information. 
diff --git a/net/sched/Makefile b/net/sched/Makefile index 66bbf9a98f9e..dd14ef413fda 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -5,6 +5,7 @@ obj-y := sch_generic.o sch_mq.o +obj-$(CONFIG_INET) += sch_frag.o obj-$(CONFIG_NET_SCHED) += sch_api.o sch_blackhole.o obj-$(CONFIG_NET_CLS) += cls_api.o obj-$(CONFIG_NET_CLS_ACT) += act_api.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index f66417d5d2c3..b919826939e0 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -22,6 +22,22 @@ #include <net/act_api.h> #include <net/netlink.h> +#ifdef CONFIG_INET +DEFINE_STATIC_KEY_FALSE(tcf_frag_xmit_count); +EXPORT_SYMBOL_GPL(tcf_frag_xmit_count); +#endif + +int tcf_dev_queue_xmit(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb)) +{ +#ifdef CONFIG_INET + if (static_branch_unlikely(&tcf_frag_xmit_count)) + return sch_frag_xmit_hook(skb, xmit); +#endif + + return xmit(skb); +} +EXPORT_SYMBOL_GPL(tcf_dev_queue_xmit); + static void tcf_action_goto_chain_exec(const struct tc_action *a, struct tcf_result *res) { @@ -215,6 +231,36 @@ static size_t tcf_action_fill_size(const struct tc_action *act) return sz; } +static int +tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a, bool from_act) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tc_cookie *cookie; + + if (nla_put_string(skb, TCA_KIND, a->ops->kind)) + goto nla_put_failure; + if (tcf_action_copy_stats(skb, a, 0)) + goto nla_put_failure; + if (from_act && nla_put_u32(skb, TCA_ACT_INDEX, a->tcfa_index)) + goto nla_put_failure; + + rcu_read_lock(); + cookie = rcu_dereference(a->act_cookie); + if (cookie) { + if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) { + rcu_read_unlock(); + goto nla_put_failure; + } + } + rcu_read_unlock(); + + return 0; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct netlink_callback *cb) { @@ -248,7 +294,9 @@ static int tcf_dump_walker(struct 
tcf_idrinfo *idrinfo, struct sk_buff *skb, index--; goto nla_put_failure; } - err = tcf_action_dump_1(skb, p, 0, 0); + err = (act_flags & TCA_ACT_FLAG_TERSE_DUMP) ? + tcf_action_dump_terse(skb, p, true) : + tcf_action_dump_1(skb, p, 0, 0); if (err < 0) { index--; nlmsg_trim(skb, nest); @@ -256,7 +304,7 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, } nla_nest_end(skb, nest); n_i++; - if (!(act_flags & TCA_FLAG_LARGE_DUMP_ON) && + if (!(act_flags & TCA_ACT_FLAG_LARGE_DUMP_ON) && n_i >= TCA_ACT_MAX_PRIO) goto done; } @@ -266,7 +314,7 @@ done: mutex_unlock(&idrinfo->lock); if (n_i) { - if (act_flags & TCA_FLAG_LARGE_DUMP_ON) + if (act_flags & TCA_ACT_FLAG_LARGE_DUMP_ON) cb->args[1] = n_i; } return n_i; @@ -651,7 +699,7 @@ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) return res; } -/*TCA_ACT_MAX_PRIO is 32, there count upto 32 */ +/*TCA_ACT_MAX_PRIO is 32, there count up to 32 */ #define TCA_ACT_MAX_PRIO_MASK 0x1FF int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res) @@ -752,34 +800,6 @@ tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) return a->ops->dump(skb, a, bind, ref); } -static int -tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tc_cookie *cookie; - - if (nla_put_string(skb, TCA_KIND, a->ops->kind)) - goto nla_put_failure; - if (tcf_action_copy_stats(skb, a, 0)) - goto nla_put_failure; - - rcu_read_lock(); - cookie = rcu_dereference(a->act_cookie); - if (cookie) { - if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) { - rcu_read_unlock(); - goto nla_put_failure; - } - } - rcu_read_unlock(); - - return 0; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} - int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -787,7 +807,7 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int 
ref) unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; - if (tcf_action_dump_terse(skb, a)) + if (tcf_action_dump_terse(skb, a, false)) goto nla_put_failure; if (a->hw_stats != TCA_ACT_HW_STATS_ANY && @@ -832,7 +852,7 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; - err = terse ? tcf_action_dump_terse(skb, a) : + err = terse ? tcf_action_dump_terse(skb, a, false) : tcf_action_dump_1(skb, a, bind, ref); if (err < 0) goto errout; @@ -888,7 +908,7 @@ static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY), }; -static void tcf_idr_insert_many(struct tc_action *actions[]) +void tcf_idr_insert_many(struct tc_action *actions[]) { int i; @@ -908,19 +928,13 @@ static void tcf_idr_insert_many(struct tc_action *actions[]) } } -struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, - struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind, - bool rtnl_held, - struct netlink_ext_ack *extack) +struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, + bool rtnl_held, + struct netlink_ext_ack *extack) { - struct nla_bitfield32 flags = { 0, 0 }; - u8 hw_stats = TCA_ACT_HW_STATS_ANY; - struct tc_action *a; + struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action_ops *a_o; - struct tc_cookie *cookie = NULL; char act_name[IFNAMSIZ]; - struct nlattr *tb[TCA_ACT_MAX + 1]; struct nlattr *kind; int err; @@ -928,33 +942,21 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) - goto err_out; + return ERR_PTR(err); err = -EINVAL; kind = tb[TCA_ACT_KIND]; if (!kind) { NL_SET_ERR_MSG(extack, "TC action kind must be specified"); - goto err_out; + return ERR_PTR(err); } - if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) { + 
if (nla_strscpy(act_name, kind, IFNAMSIZ) < 0) { NL_SET_ERR_MSG(extack, "TC action name too long"); - goto err_out; + return ERR_PTR(err); } - if (tb[TCA_ACT_COOKIE]) { - cookie = nla_memdup_cookie(tb); - if (!cookie) { - NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); - err = -ENOMEM; - goto err_out; - } - } - hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]); - if (tb[TCA_ACT_FLAGS]) - flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); } else { if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) { NL_SET_ERR_MSG(extack, "TC action name too long"); - err = -EINVAL; - goto err_out; + return ERR_PTR(-EINVAL); } } @@ -976,24 +978,56 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, * indicate this using -EAGAIN. */ if (a_o != NULL) { - err = -EAGAIN; - goto err_mod; + module_put(a_o->owner); + return ERR_PTR(-EAGAIN); } #endif NL_SET_ERR_MSG(extack, "Failed to load TC action module"); - err = -ENOENT; - goto err_free; + return ERR_PTR(-ENOENT); } + return a_o; +} + +struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, + struct nlattr *nla, struct nlattr *est, + char *name, int ovr, int bind, + struct tc_action_ops *a_o, bool rtnl_held, + struct netlink_ext_ack *extack) +{ + struct nla_bitfield32 flags = { 0, 0 }; + u8 hw_stats = TCA_ACT_HW_STATS_ANY; + struct nlattr *tb[TCA_ACT_MAX + 1]; + struct tc_cookie *cookie = NULL; + struct tc_action *a; + int err; + /* backward compatibility for policer */ - if (name == NULL) + if (name == NULL) { + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, + tcf_action_policy, extack); + if (err < 0) + return ERR_PTR(err); + if (tb[TCA_ACT_COOKIE]) { + cookie = nla_memdup_cookie(tb); + if (!cookie) { + NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); + err = -ENOMEM; + goto err_out; + } + } + hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]); + if (tb[TCA_ACT_FLAGS]) + flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); + err = a_o->init(net, 
tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, rtnl_held, tp, flags.value, extack); - else + } else { err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, tp, flags.value, extack); + } if (err < 0) - goto err_mod; + goto err_out; if (!name && tb[TCA_ACT_COOKIE]) tcf_set_action_cookie(&a->act_cookie, cookie); @@ -1010,14 +1044,11 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, return a; -err_mod: - module_put(a_o->owner); -err_free: +err_out: if (cookie) { kfree(cookie->data); kfree(cookie); } -err_out: return ERR_PTR(err); } @@ -1028,6 +1059,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct tc_action *actions[], size_t *attr_size, bool rtnl_held, struct netlink_ext_ack *extack) { + struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {}; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; size_t sz = 0; @@ -1040,8 +1072,19 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, return err; for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { + struct tc_action_ops *a_o; + + a_o = tc_action_load_ops(name, tb[i], rtnl_held, extack); + if (IS_ERR(a_o)) { + err = PTR_ERR(a_o); + goto err_mod; + } + ops[i - 1] = a_o; + } + + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, - rtnl_held, extack); + ops[i - 1], rtnl_held, extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; @@ -1061,6 +1104,11 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, err: tcf_action_destroy(actions, bind); +err_mod: + for (i = 0; i < TCA_ACT_MAX_PRIO; i++) { + if (ops[i]) + module_put(ops[i]->owner); + } return err; } @@ -1469,7 +1517,8 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, } static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = { - [TCA_ROOT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_FLAG_LARGE_DUMP_ON), + [TCA_ROOT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAG_LARGE_DUMP_ON 
| + TCA_ACT_FLAG_TERSE_DUMP), [TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 }, }; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index a4c7ba35a343..e48e980c3b93 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -65,7 +65,7 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, * In case a different well-known TC_ACT opcode has been * returned, it will overwrite the default one. * - * For everything else that is unkown, TC_ACT_UNSPEC is + * For everything else that is unknown, TC_ACT_UNSPEC is * returned. */ switch (filter_res) { diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index aba3cd85f284..f0a0aa125b00 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -183,6 +183,7 @@ static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct, IP_CT_ESTABLISHED_REPLY; /* aligns with the CT reference on the SKB nf_ct_set */ entry->ct_metadata.cookie = (unsigned long)ct | ctinfo; + entry->ct_metadata.orig_dir = dir == IP_CT_DIR_ORIGINAL; act_ct_labels = entry->ct_metadata.labels; ct_labels = nf_ct_labels_find(ct); @@ -296,7 +297,8 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params) goto err_insert; ct_ft->nf_ft.type = &flowtable_ct; - ct_ft->nf_ft.flags |= NF_FLOWTABLE_HW_OFFLOAD; + ct_ft->nf_ft.flags |= NF_FLOWTABLE_HW_OFFLOAD | + NF_FLOWTABLE_COUNTER; err = nf_flow_table_init(&ct_ft->nf_ft); if (err) goto err_init; @@ -540,7 +542,8 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, flow_offload_refresh(nf_ft, flow); nf_conntrack_get(&ct->ct_general); nf_ct_set(skb, ct, ctinfo); - nf_ct_acct_update(ct, dir, skb->len); + if (nf_ft->flags & NF_FLOWTABLE_COUNTER) + nf_ct_acct_update(ct, dir, skb->len); return true; } @@ -1028,6 +1031,7 @@ out_push: out: tcf_action_update_bstats(&c->common, skb); + qdisc_skb_cb(skb)->post_ct = true; if (defrag) qdisc_skb_cb(skb)->pkt_len = skb->len; return retval; @@ -1541,6 +1545,8 @@ static int __init ct_init_module(void) if (err) goto err_register; + 
static_branch_inc(&tcf_frag_xmit_count); + return 0; err_register: @@ -1552,6 +1558,7 @@ err_tbl_init: static void __exit ct_cleanup_module(void) { + static_branch_dec(&tcf_frag_xmit_count); tcf_unregister_action(&act_ct_ops, &ct_net_ops); tcf_ct_flow_tables_uninit(); destroy_workqueue(act_ct_wq); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 8dc3bec0d325..ac7297f42355 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -166,7 +166,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (unlikely(!tname)) goto err1; if (tb[TCA_IPT_TABLE] == NULL || - nla_strlcpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ) + nla_strscpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ) strcpy(tname, "mangle"); t = kmemdup(td, td->u.target_size, GFP_KERNEL); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index e24b7e2331cd..7153c67f641e 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -205,6 +205,18 @@ release_idr: return err; } +static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb) +{ + int err; + + if (!want_ingress) + err = tcf_dev_queue_xmit(skb, dev_queue_xmit); + else + err = netif_receive_skb(skb); + + return err; +} + static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -287,18 +299,15 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, /* let's the caller reinsert the packet, if possible */ if (use_reinsert) { res->ingress = want_ingress; - if (skb_tc_reinsert(skb, res)) + err = tcf_mirred_forward(res->ingress, skb); + if (err) tcf_action_inc_overlimit_qstats(&m->common); __this_cpu_dec(mirred_rec_level); return TC_ACT_CONSUMED; } } - if (!want_ingress) - err = dev_queue_xmit(skb2); - else - err = netif_receive_skb(skb2); - + err = tcf_mirred_forward(want_ingress, skb2); if (err) { out: tcf_action_inc_overlimit_qstats(&m->common); diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 
5c7456e5b5cf..d1486ea496a2 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -105,6 +105,9 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, goto drop; break; case TCA_MPLS_ACT_MODIFY: + if (!pskb_may_pull(skb, + skb_network_offset(skb) + MPLS_HLEN)) + goto drop; new_lse = tcf_mpls_get_lse(mpls_hdr(skb), p, false); if (skb_mpls_update_lse(skb, new_lse)) goto drop; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index a4f3d0f0daa9..726cc956d06f 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -52,7 +52,7 @@ static int alloc_defdata(struct tcf_defact *d, const struct nlattr *defdata) d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL); if (unlikely(!d->tcfd_defdata)) return -ENOMEM; - nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); return 0; } @@ -71,7 +71,7 @@ static int reset_policy(struct tc_action *a, const struct nlattr *defdata, spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch); memset(d->tcfd_defdata, 0, SIMP_MAX_DATA); - nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); spin_unlock_bh(&d->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 838b3fd94d77..e37556cc37ab 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -223,7 +223,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) static bool tcf_proto_check_kind(struct nlattr *kind, char *name) { if (kind) - return nla_strlcpy(name, kind, IFNAMSIZ) >= IFNAMSIZ; + return nla_strscpy(name, kind, IFNAMSIZ) < 0; memset(name, 0, IFNAMSIZ); return false; } @@ -991,13 +991,12 @@ __tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp) */ struct tcf_proto * -tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp, - bool rtnl_held) +tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto 
*tp) { struct tcf_proto *tp_next = __tcf_get_next_proto(chain, tp); if (tp) - tcf_proto_put(tp, rtnl_held, NULL); + tcf_proto_put(tp, true, NULL); return tp_next; } @@ -1924,15 +1923,14 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, struct tcf_block *block, struct Qdisc *q, u32 parent, struct nlmsghdr *n, - struct tcf_chain *chain, int event, - bool rtnl_held) + struct tcf_chain *chain, int event) { struct tcf_proto *tp; - for (tp = tcf_get_next_proto(chain, NULL, rtnl_held); - tp; tp = tcf_get_next_proto(chain, tp, rtnl_held)) + for (tp = tcf_get_next_proto(chain, NULL); + tp; tp = tcf_get_next_proto(chain, tp)) tfilter_notify(net, oskb, n, tp, block, - q, parent, NULL, event, false, rtnl_held); + q, parent, NULL, event, false, true); } static void tfilter_put(struct tcf_proto *tp, void *fh) @@ -2262,7 +2260,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (prio == 0) { tfilter_notify_chain(net, skb, block, q, parent, n, - chain, RTM_DELTFILTER, rtnl_held); + chain, RTM_DELTFILTER); tcf_chain_flush(chain, rtnl_held); err = 0; goto errout; @@ -2895,7 +2893,7 @@ replay: break; case RTM_DELCHAIN: tfilter_notify_chain(net, skb, block, q, parent, n, - chain, RTM_DELTFILTER, true); + chain, RTM_DELTFILTER); /* Flush the chain first as the user requested chain removal. 
*/ tcf_chain_flush(chain, true); /* In case the chain was successfully deleted, put a reference @@ -2940,7 +2938,6 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) struct tcf_chain *chain; long index_start; long index; - u32 parent; int err; if (nlmsg_len(cb->nlh) < sizeof(*tcm)) @@ -2955,13 +2952,6 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) block = tcf_block_refcnt_get(net, tcm->tcm_block_index); if (!block) goto out; - /* If we work with block index, q is NULL and parent value - * will never be used in the following code. The check - * in tcf_fill_node prevents it. However, compiler does not - * see that far, so set parent to zero to silence the warning - * about parent being uninitialized. - */ - parent = 0; } else { const struct Qdisc_class_ops *cops; struct net_device *dev; @@ -2971,13 +2961,11 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) if (!dev) return skb->len; - parent = tcm->tcm_parent; - if (!parent) { + if (!tcm->tcm_parent) q = dev->qdisc; - parent = q->handle; - } else { + else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); - } + if (!q) goto out; cops = q->ops->cl_ops; @@ -3055,16 +3043,24 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, size_t attr_size = 0; if (exts->police && tb[exts->police]) { + struct tc_action_ops *a_o; + + a_o = tc_action_load_ops("police", tb[exts->police], rtnl_held, extack); + if (IS_ERR(a_o)) + return PTR_ERR(a_o); act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, "police", ovr, - TCA_ACT_BIND, rtnl_held, + TCA_ACT_BIND, a_o, rtnl_held, extack); - if (IS_ERR(act)) + if (IS_ERR(act)) { + module_put(a_o->owner); return PTR_ERR(act); + } act->type = exts->type = TCA_OLD_COMPAT; exts->actions[0] = act; exts->nr_actions = 1; + tcf_idr_insert_many(exts->actions); } else if (exts->action && tb[exts->action]) { int err; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 
fed18fd2c50b..d097b5c15faa 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -30,6 +30,11 @@ #include <uapi/linux/netfilter/nf_conntrack_common.h> +#define TCA_FLOWER_KEY_CT_FLAGS_MAX \ + ((__TCA_FLOWER_KEY_CT_FLAGS_MAX - 1) << 1) +#define TCA_FLOWER_KEY_CT_FLAGS_MASK \ + (TCA_FLOWER_KEY_CT_FLAGS_MAX - 1) + struct fl_flow_key { struct flow_dissector_key_meta meta; struct flow_dissector_key_control control; @@ -291,9 +296,11 @@ static u16 fl_ct_info_to_flower_map[] = { [IP_CT_RELATED] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_RELATED, [IP_CT_ESTABLISHED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | - TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED, + TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED | + TCA_FLOWER_KEY_CT_FLAGS_REPLY, [IP_CT_RELATED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | - TCA_FLOWER_KEY_CT_FLAGS_RELATED, + TCA_FLOWER_KEY_CT_FLAGS_RELATED | + TCA_FLOWER_KEY_CT_FLAGS_REPLY, [IP_CT_NEW] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_NEW, }; @@ -302,6 +309,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); + bool post_ct = qdisc_skb_cb(skb)->post_ct; struct fl_flow_key skb_key; struct fl_flow_mask *mask; struct cls_fl_filter *f; @@ -318,7 +326,8 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key); skb_flow_dissect_ct(skb, &mask->dissector, &skb_key, fl_ct_info_to_flower_map, - ARRAY_SIZE(fl_ct_info_to_flower_map)); + ARRAY_SIZE(fl_ct_info_to_flower_map), + post_ct); skb_flow_dissect_hash(skb, &mask->dissector, &skb_key); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); @@ -686,8 +695,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED }, - 
[TCA_FLOWER_KEY_CT_STATE] = { .type = NLA_U16 }, - [TCA_FLOWER_KEY_CT_STATE_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_CT_STATE] = + NLA_POLICY_MASK(NLA_U16, TCA_FLOWER_KEY_CT_FLAGS_MASK), + [TCA_FLOWER_KEY_CT_STATE_MASK] = + NLA_POLICY_MASK(NLA_U16, TCA_FLOWER_KEY_CT_FLAGS_MASK), [TCA_FLOWER_KEY_CT_ZONE] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_CT_ZONE_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_CT_MARK] = { .type = NLA_U32 }, @@ -1272,6 +1283,10 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); + if (!nla_ok(nla_opt_msk, msk_depth)) { + NL_SET_ERR_MSG(extack, "Invalid nested attribute for masks"); + return -EINVAL; + } } nla_for_each_attr(nla_opt_key, nla_enc_key, @@ -1307,9 +1322,6 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); return -EINVAL; } - - if (msk_depth) - nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; case TCA_FLOWER_KEY_ENC_OPTS_VXLAN: if (key->enc_opts.dst_opt_type) { @@ -1340,9 +1352,6 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); return -EINVAL; } - - if (msk_depth) - nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; case TCA_FLOWER_KEY_ENC_OPTS_ERSPAN: if (key->enc_opts.dst_opt_type) { @@ -1373,14 +1382,54 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); return -EINVAL; } - - if (msk_depth) - nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; default: NL_SET_ERR_MSG(extack, "Unknown tunnel option type"); return -EINVAL; } + + if (!msk_depth) + continue; + + if (!nla_ok(nla_opt_msk, msk_depth)) { + NL_SET_ERR_MSG(extack, "A mask attribute is invalid"); + return -EINVAL; + } + nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); + } + + return 0; +} + +static 
int fl_validate_ct_state(u16 state, struct nlattr *tb, + struct netlink_ext_ack *extack) +{ + if (state && !(state & TCA_FLOWER_KEY_CT_FLAGS_TRACKED)) { + NL_SET_ERR_MSG_ATTR(extack, tb, + "no trk, so no other flag can be set"); + return -EINVAL; + } + + if (state & TCA_FLOWER_KEY_CT_FLAGS_NEW && + state & TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED) { + NL_SET_ERR_MSG_ATTR(extack, tb, + "new and est are mutually exclusive"); + return -EINVAL; + } + + if (state & TCA_FLOWER_KEY_CT_FLAGS_INVALID && + state & ~(TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_INVALID)) { + NL_SET_ERR_MSG_ATTR(extack, tb, + "when inv is set, only trk may be set"); + return -EINVAL; + } + + if (state & TCA_FLOWER_KEY_CT_FLAGS_NEW && + state & TCA_FLOWER_KEY_CT_FLAGS_REPLY) { + NL_SET_ERR_MSG_ATTR(extack, tb, + "new and rpl are mutually exclusive"); + return -EINVAL; } return 0; @@ -1392,6 +1441,8 @@ static int fl_set_key_ct(struct nlattr **tb, struct netlink_ext_ack *extack) { if (tb[TCA_FLOWER_KEY_CT_STATE]) { + int err; + if (!IS_ENABLED(CONFIG_NF_CONNTRACK)) { NL_SET_ERR_MSG(extack, "Conntrack isn't enabled"); return -EOPNOTSUPP; @@ -1399,6 +1450,13 @@ static int fl_set_key_ct(struct nlattr **tb, fl_set_key_val(tb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE, &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK, sizeof(key->ct_state)); + + err = fl_validate_ct_state(mask->ct_state, + tb[TCA_FLOWER_KEY_CT_STATE_MASK], + extack); + if (err) + return err; + } if (tb[TCA_FLOWER_KEY_CT_ZONE]) { if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) { @@ -2424,8 +2482,8 @@ static int fl_dump_key_mpls_opt_lse(struct sk_buff *skb, return err; } if (lse_mask->mpls_label) { - err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL, - lse_key->mpls_label); + err = nla_put_u32(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL, + lse_key->mpls_label); if (err) return err; } diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index d36949d9382c..2e288f88ff02 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ 
-238,7 +238,7 @@ static void rsvp_replace(struct tcf_proto *tp, struct rsvp_filter *n, u32 h) } } - /* Something went wrong if we are trying to replace a non-existant + /* Something went wrong if we are trying to replace a non-existent * node. Mind as well halt instead of silently failing. */ BUG_ON(1); diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 78bec347b8b6..c4007b9cd16d 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -366,9 +366,13 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (tb[TCA_TCINDEX_MASK]) cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); - if (tb[TCA_TCINDEX_SHIFT]) + if (tb[TCA_TCINDEX_SHIFT]) { cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); - + if (cp->shift > 16) { + err = -EINVAL; + goto errout; + } + } if (!cp->hash) { /* Hash not specified, use perfect hash if the upper limit * of the hashing index is below the threshold. diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 54209a18d7fe..6e1abe805448 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -1171,7 +1171,6 @@ static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n, struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; - int err; tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack); cls_u32.command = add ? 
@@ -1194,13 +1193,9 @@ static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n, cls_u32.knode.link_handle = ht->handle; } - err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSU32, - &cls_u32, cb_priv, &n->flags, - &n->in_hw_count); - if (err) - return err; - - return 0; + return tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSU32, + &cls_u32, cb_priv, &n->flags, + &n->in_hw_count); } static int u32_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c index a4d09b1fb66a..f17b049ea530 100644 --- a/net/sched/em_cmp.c +++ b/net/sched/em_cmp.c @@ -41,7 +41,7 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em, break; case TCF_EM_ALIGN_U32: - /* Worth checking boundries? The branching seems + /* Worth checking boundaries? The branching seems * to get worse. Visit again. */ val = get_unaligned_be32(ptr); diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index 2c1192a2ee5e..a83b237cbeb0 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -31,7 +31,7 @@ static int em_nbyte_change(struct net *net, void *data, int data_len, em->datalen = sizeof(*nbyte) + nbyte->len; em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL); if (em->data == 0UL) - return -ENOBUFS; + return -ENOMEM; return 0; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 2a76a2f5ed88..e2e4353db8a7 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -412,7 +412,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, { struct qdisc_rate_table *rtab; - if (tab == NULL || r->rate == 0 || r->cell_log == 0 || + if (tab == NULL || r->rate == 0 || + r->cell_log == 0 || r->cell_log >= 32 || nla_len(tab) != TC_RTAB_SIZE) { NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching"); return NULL; @@ -1170,7 +1171,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, #ifdef CONFIG_MODULES if (ops == NULL && kind != NULL) { char 
name[IFNAMSIZ]; - if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { + if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) { /* We dropped the RTNL semaphore in order to * perform the module load. So, even if we * succeeded in loading the module we have to @@ -1865,7 +1866,8 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, static int tclass_del_notify(struct net *net, const struct Qdisc_class_ops *cops, struct sk_buff *oskb, struct nlmsghdr *n, - struct Qdisc *q, unsigned long cl) + struct Qdisc *q, unsigned long cl, + struct netlink_ext_ack *extack) { u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; struct sk_buff *skb; @@ -1884,7 +1886,7 @@ static int tclass_del_notify(struct net *net, return -EINVAL; } - err = cops->delete(q, cl); + err = cops->delete(q, cl, extack); if (err) { kfree_skb(skb); return err; @@ -1943,8 +1945,8 @@ static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl, chain = tcf_get_next_chain(block, chain)) { struct tcf_proto *tp; - for (tp = tcf_get_next_proto(chain, NULL, true); - tp; tp = tcf_get_next_proto(chain, tp, true)) { + for (tp = tcf_get_next_proto(chain, NULL); + tp; tp = tcf_get_next_proto(chain, tp)) { struct tcf_bind_args arg = {}; arg.w.fn = tcf_node_bind; @@ -2087,7 +2089,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, goto out; break; case RTM_DELTCLASS: - err = tclass_del_notify(net, cops, skb, n, q, cl); + err = tclass_del_notify(net, cops, skb, n, q, cl, extack); /* Unbind the class with flilters with 0 */ tc_bind_tclass(q, portid, clid, 0); goto out; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 1c281cc81f57..d0c9a57398fc 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -320,7 +320,8 @@ err_out: return error; } -static int atm_tc_delete(struct Qdisc *sch, unsigned long arg) +static int atm_tc_delete(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = (struct 
atm_flow_data *)arg; @@ -466,10 +467,10 @@ drop: __maybe_unused * non-ATM interfaces. */ -static void sch_atm_dequeue(unsigned long data) +static void sch_atm_dequeue(struct tasklet_struct *t) { - struct Qdisc *sch = (struct Qdisc *)data; - struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_qdisc_data *p = from_tasklet(p, t, task); + struct Qdisc *sch = qdisc_from_priv(p); struct atm_flow_data *flow; struct sk_buff *skb; @@ -563,7 +564,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt, if (err) return err; - tasklet_init(&p->task, sch_atm_dequeue, (unsigned long)sch); + tasklet_setup(&p->task, sch_atm_dequeue); return 0; } diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 53d45e029c36..320b3d31fa97 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1675,7 +1675,8 @@ failure: return err; } -static int cbq_delete(struct Qdisc *sch, unsigned long arg) +static int cbq_delete(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class *)arg; diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 2eaac2ff380f..459cc240eda9 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -50,6 +50,7 @@ * locredit = max_frame_size * (sendslope / port_transmit_rate) */ +#include <linux/ethtool.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index bd618b00d319..50f680f03a54 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -362,7 +362,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, ctl = nla_data(tb[TCA_CHOKE_PARMS]); - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) return -EINVAL; if (ctl->limit > CHOKE_MAX_QUEUE) diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index dde564670ad8..fc1e47069593 100644 --- 
a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -146,7 +146,8 @@ static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl) kfree(cl); } -static int drr_delete_class(struct Qdisc *sch, unsigned long arg) +static int drr_delete_class(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl = (struct drr_class *)arg; diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 2b88710994d7..cd2748e2d4a2 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -150,7 +150,8 @@ errout: return err; } -static int dsmark_delete(struct Qdisc *sch, unsigned long arg) +static int dsmark_delete(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct dsmark_qdisc_data *p = qdisc_priv(sch); diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index 4dda15588cf4..949163fe68af 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -401,6 +401,7 @@ static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt, INIT_LIST_HEAD(&q->new_flows); INIT_LIST_HEAD(&q->old_flows); + timer_setup(&q->adapt_timer, fq_pie_timer, 0); if (opt) { err = fq_pie_change(sch, opt, extack); @@ -426,7 +427,6 @@ static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt, pie_vars_init(&flow->vars); } - timer_setup(&q->adapt_timer, fq_pie_timer, 0); mod_timer(&q->adapt_timer, jiffies + HZ / 2); return 0; diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c new file mode 100644 index 000000000000..e1e77d3fb6c0 --- /dev/null +++ b/net/sched/sch_frag.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +#include <net/netlink.h> +#include <net/sch_generic.h> +#include <net/dst.h> +#include <net/ip.h> +#include <net/ip6_fib.h> + +struct sch_frag_data { + unsigned long dst; + struct qdisc_skb_cb cb; + __be16 inner_protocol; + u16 vlan_tci; + __be16 vlan_proto; + unsigned int l2_len; + u8 l2_data[VLAN_ETH_HLEN]; + int (*xmit)(struct 
sk_buff *skb); +}; + +static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage); + +static int sch_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct sch_frag_data *data = this_cpu_ptr(&sch_frag_data_storage); + + if (skb_cow_head(skb, data->l2_len) < 0) { + kfree_skb(skb); + return -ENOMEM; + } + + __skb_dst_copy(skb, data->dst); + *qdisc_skb_cb(skb) = data->cb; + skb->inner_protocol = data->inner_protocol; + if (data->vlan_tci & VLAN_CFI_MASK) + __vlan_hwaccel_put_tag(skb, data->vlan_proto, + data->vlan_tci & ~VLAN_CFI_MASK); + else + __vlan_hwaccel_clear_tag(skb); + + /* Reconstruct the MAC header. */ + skb_push(skb, data->l2_len); + memcpy(skb->data, &data->l2_data, data->l2_len); + skb_postpush_rcsum(skb, skb->data, data->l2_len); + skb_reset_mac_header(skb); + + return data->xmit(skb); +} + +static void sch_frag_prepare_frag(struct sk_buff *skb, + int (*xmit)(struct sk_buff *skb)) +{ + unsigned int hlen = skb_network_offset(skb); + struct sch_frag_data *data; + + data = this_cpu_ptr(&sch_frag_data_storage); + data->dst = skb->_skb_refdst; + data->cb = *qdisc_skb_cb(skb); + data->xmit = xmit; + data->inner_protocol = skb->inner_protocol; + if (skb_vlan_tag_present(skb)) + data->vlan_tci = skb_vlan_tag_get(skb) | VLAN_CFI_MASK; + else + data->vlan_tci = 0; + data->vlan_proto = skb->vlan_proto; + data->l2_len = hlen; + memcpy(&data->l2_data, skb->data, hlen); + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + skb_pull(skb, hlen); +} + +static unsigned int +sch_frag_dst_get_mtu(const struct dst_entry *dst) +{ + return dst->dev->mtu; +} + +static struct dst_ops sch_frag_dst_ops = { + .family = AF_UNSPEC, + .mtu = sch_frag_dst_get_mtu, +}; + +static int sch_fragment(struct net *net, struct sk_buff *skb, + u16 mru, int (*xmit)(struct sk_buff *skb)) +{ + int ret = -1; + + if (skb_network_offset(skb) > VLAN_ETH_HLEN) { + net_warn_ratelimited("L2 header too long to fragment\n"); + goto err; + } + + if (skb_protocol(skb, true) 
== htons(ETH_P_IP)) { + struct dst_entry sch_frag_dst; + unsigned long orig_dst; + + sch_frag_prepare_frag(skb, xmit); + dst_init(&sch_frag_dst, &sch_frag_dst_ops, NULL, 1, + DST_OBSOLETE_NONE, DST_NOCOUNT); + sch_frag_dst.dev = skb->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &sch_frag_dst); + IPCB(skb)->frag_max_size = mru; + + ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit); + refdst_drop(orig_dst); + } else if (skb_protocol(skb, true) == htons(ETH_P_IPV6)) { + unsigned long orig_dst; + struct rt6_info sch_frag_rt; + + sch_frag_prepare_frag(skb, xmit); + memset(&sch_frag_rt, 0, sizeof(sch_frag_rt)); + dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, 1, + DST_OBSOLETE_NONE, DST_NOCOUNT); + sch_frag_rt.dst.dev = skb->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &sch_frag_rt.dst); + IP6CB(skb)->frag_max_size = mru; + + ret = ipv6_stub->ipv6_fragment(net, skb->sk, skb, + sch_frag_xmit); + refdst_drop(orig_dst); + } else { + net_warn_ratelimited("Fail frag %s: eth=%x, MRU=%d, MTU=%d\n", + netdev_name(skb->dev), + ntohs(skb_protocol(skb, true)), mru, + skb->dev->mtu); + goto err; + } + + return ret; +err: + kfree_skb(skb); + return ret; +} + +int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb)) +{ + u16 mru = qdisc_skb_cb(skb)->mru; + int err; + + if (mru && skb->len > mru + skb->dev->hard_header_len) + err = sch_fragment(dev_net(skb->dev), skb, mru, xmit); + else + err = xmit(skb); + + return err; +} +EXPORT_SYMBOL_GPL(sch_frag_xmit_hook); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 8599c6f31b05..e0bc77533acc 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -480,7 +480,7 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, struct gred_sched *table = qdisc_priv(sch); struct gred_sched_data *q = table->tab[dp]; - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) { + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) 
{ NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters"); return -EINVAL; } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index d1902fca9844..bf0034c66e35 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1090,7 +1090,8 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) } static int -hfsc_delete_class(struct Qdisc *sch, unsigned long arg) +hfsc_delete_class(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl = (struct hfsc_class *)arg; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index cd70dbcbd72f..dff3adf5a915 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -114,6 +114,7 @@ struct htb_class { * Written often fields */ struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_packed bstats_bias; struct tc_htb_xstats xstats; /* our special stats */ /* token bucket parameters */ @@ -174,6 +175,11 @@ struct htb_sched { int row_mask[TC_HTB_MAXDEPTH]; struct htb_level hlevel[TC_HTB_MAXDEPTH]; + + struct Qdisc **direct_qdiscs; + unsigned int num_direct_qdiscs; + + bool offload; }; /* find class in global hash table using given handle */ @@ -957,7 +963,7 @@ static void htb_reset(struct Qdisc *sch) if (cl->level) memset(&cl->inner, 0, sizeof(cl->inner)); else { - if (cl->leaf.q) + if (cl->leaf.q && !q->offload) qdisc_reset(cl->leaf.q); } cl->prio_activity = 0; @@ -980,6 +986,7 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = { [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 }, [TCA_HTB_RATE64] = { .type = NLA_U64 }, [TCA_HTB_CEIL64] = { .type = NLA_U64 }, + [TCA_HTB_OFFLOAD] = { .type = NLA_FLAG }, }; static void htb_work_func(struct work_struct *work) @@ -992,12 +999,27 @@ static void htb_work_func(struct work_struct *work) rcu_read_unlock(); } +static void htb_set_lockdep_class_child(struct Qdisc *q) +{ + static struct lock_class_key child_key; + + lockdep_set_class(qdisc_lock(q), &child_key); +} + +static int 
htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt) +{ + return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt); +} + static int htb_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + struct net_device *dev = qdisc_dev(sch); + struct tc_htb_qopt_offload offload_opt; struct htb_sched *q = qdisc_priv(sch); struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_glob *gopt; + unsigned int ntx; int err; qdisc_watchdog_init(&q->watchdog, sch); @@ -1022,9 +1044,26 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, if (gopt->version != HTB_VER >> 16) return -EINVAL; + q->offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]); + + if (q->offload) { + if (sch->parent != TC_H_ROOT) + return -EOPNOTSUPP; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return -EOPNOTSUPP; + + q->num_direct_qdiscs = dev->real_num_tx_queues; + q->direct_qdiscs = kcalloc(q->num_direct_qdiscs, + sizeof(*q->direct_qdiscs), + GFP_KERNEL); + if (!q->direct_qdiscs) + return -ENOMEM; + } + err = qdisc_class_hash_init(&q->clhash); if (err < 0) - return err; + goto err_free_direct_qdiscs; qdisc_skb_head_init(&q->direct_queue); @@ -1037,7 +1076,107 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, q->rate2quantum = 1; q->defcls = gopt->defcls; + if (!q->offload) + return 0; + + for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) { + struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); + struct Qdisc *qdisc; + + qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, 0), extack); + if (!qdisc) { + err = -ENOMEM; + goto err_free_qdiscs; + } + + htb_set_lockdep_class_child(qdisc); + q->direct_qdiscs[ntx] = qdisc; + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; + } + + sch->flags |= TCQ_F_MQROOT; + + offload_opt = (struct tc_htb_qopt_offload) { + .command = TC_HTB_CREATE, + .parent_classid = TC_H_MAJ(sch->handle) >> 16, + .classid = TC_H_MIN(q->defcls), + .extack = extack, + }; + err = 
htb_offload(dev, &offload_opt); + if (err) + goto err_free_qdiscs; + return 0; + +err_free_qdiscs: + /* TC_HTB_CREATE call failed, avoid any further calls to the driver. */ + q->offload = false; + + for (ntx = 0; ntx < q->num_direct_qdiscs && q->direct_qdiscs[ntx]; + ntx++) + qdisc_put(q->direct_qdiscs[ntx]); + + qdisc_class_hash_destroy(&q->clhash); + /* Prevent use-after-free and double-free when htb_destroy gets called. + */ + q->clhash.hash = NULL; + q->clhash.hashsize = 0; + +err_free_direct_qdiscs: + kfree(q->direct_qdiscs); + q->direct_qdiscs = NULL; + return err; +} + +static void htb_attach_offload(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct htb_sched *q = qdisc_priv(sch); + unsigned int ntx; + + for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) { + struct Qdisc *old, *qdisc = q->direct_qdiscs[ntx]; + + old = dev_graft_qdisc(qdisc->dev_queue, qdisc); + qdisc_put(old); + qdisc_hash_add(qdisc, false); + } + for (ntx = q->num_direct_qdiscs; ntx < dev->num_tx_queues; ntx++) { + struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); + struct Qdisc *old = dev_graft_qdisc(dev_queue, NULL); + + qdisc_put(old); + } + + kfree(q->direct_qdiscs); + q->direct_qdiscs = NULL; +} + +static void htb_attach_software(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned int ntx; + + /* Resemble qdisc_graft behavior. 
*/ + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); + struct Qdisc *old = dev_graft_qdisc(dev_queue, sch); + + qdisc_refcount_inc(sch); + + qdisc_put(old); + } +} + +static void htb_attach(struct Qdisc *sch) +{ + struct htb_sched *q = qdisc_priv(sch); + + if (q->offload) + htb_attach_offload(sch); + else + htb_attach_software(sch); } static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -1046,6 +1185,11 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *nest; struct tc_htb_glob gopt; + if (q->offload) + sch->flags |= TCQ_F_OFFLOADED; + else + sch->flags &= ~TCQ_F_OFFLOADED; + sch->qstats.overlimits = q->overlimits; /* Its safe to not acquire qdisc lock. As we hold RTNL, * no change can happen on the qdisc parameters. @@ -1063,6 +1207,8 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) || nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen)) goto nla_put_failure; + if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD)) + goto nla_put_failure; return nla_nest_end(skb, nest); @@ -1075,6 +1221,7 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct htb_class *cl = (struct htb_class *)arg; + struct htb_sched *q = qdisc_priv(sch); struct nlattr *nest; struct tc_htb_opt opt; @@ -1101,6 +1248,8 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, opt.level = cl->level; if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt)) goto nla_put_failure; + if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD)) + goto nla_put_failure; if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) && nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps, TCA_HTB_PAD)) @@ -1117,10 +1266,39 @@ nla_put_failure: return -1; } +static void htb_offload_aggregate_stats(struct htb_sched *q, + struct htb_class *cl) +{ + struct htb_class *c; + unsigned int i; + + 
memset(&cl->bstats, 0, sizeof(cl->bstats)); + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) { + struct htb_class *p = c; + + while (p && p->level < cl->level) + p = p->parent; + + if (p != cl) + continue; + + cl->bstats.bytes += c->bstats_bias.bytes; + cl->bstats.packets += c->bstats_bias.packets; + if (c->level == 0) { + cl->bstats.bytes += c->leaf.q->bstats.bytes; + cl->bstats.packets += c->leaf.q->bstats.packets; + } + } + } +} + static int htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct htb_class *cl = (struct htb_class *)arg; + struct htb_sched *q = qdisc_priv(sch); struct gnet_stats_queue qs = { .drops = cl->drops, .overlimits = cl->overlimits, @@ -1135,6 +1313,19 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens), INT_MIN, INT_MAX); + if (q->offload) { + if (!cl->level) { + if (cl->leaf.q) + cl->bstats = cl->leaf.q->bstats; + else + memset(&cl->bstats, 0, sizeof(cl->bstats)); + cl->bstats.bytes += cl->bstats_bias.bytes; + cl->bstats.packets += cl->bstats_bias.packets; + } else { + htb_offload_aggregate_stats(q, cl); + } + } + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || @@ -1144,19 +1335,97 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); } +static struct netdev_queue * +htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_htb_qopt_offload offload_opt; + int err; + + offload_opt = (struct tc_htb_qopt_offload) { + .command = TC_HTB_LEAF_QUERY_QUEUE, + .classid = TC_H_MIN(tcm->tcm_parent), + }; + err = htb_offload(dev, &offload_opt); + if (err || offload_opt.qid >= dev->num_tx_queues) + return NULL; + return 
netdev_get_tx_queue(dev, offload_opt.qid); +} + +static struct Qdisc * +htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q) +{ + struct net_device *dev = dev_queue->dev; + struct Qdisc *old_q; + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + old_q = dev_graft_qdisc(dev_queue, new_q); + if (new_q) + new_q->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; + if (dev->flags & IFF_UP) + dev_activate(dev); + + return old_q; +} + +static void htb_offload_move_qdisc(struct Qdisc *sch, u16 qid_old, u16 qid_new) +{ + struct netdev_queue *queue_old, *queue_new; + struct net_device *dev = qdisc_dev(sch); + struct Qdisc *qdisc; + + queue_old = netdev_get_tx_queue(dev, qid_old); + queue_new = netdev_get_tx_queue(dev, qid_new); + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + qdisc = dev_graft_qdisc(queue_old, NULL); + qdisc->dev_queue = queue_new; + qdisc = dev_graft_qdisc(queue_new, qdisc); + if (dev->flags & IFF_UP) + dev_activate(dev); + + WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN)); +} + static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { + struct netdev_queue *dev_queue = sch->dev_queue; struct htb_class *cl = (struct htb_class *)arg; + struct htb_sched *q = qdisc_priv(sch); + struct Qdisc *old_q; if (cl->level) return -EINVAL; - if (new == NULL && - (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - cl->common.classid, extack)) == NULL) - return -ENOBUFS; + + if (q->offload) { + dev_queue = new->dev_queue; + WARN_ON(dev_queue != cl->leaf.q->dev_queue); + } + + if (!new) { + new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, + cl->common.classid, extack); + if (!new) + return -ENOBUFS; + } + + if (q->offload) { + htb_set_lockdep_class_child(new); + /* One ref for cl->leaf.q, the other for dev_queue->qdisc. 
*/ + qdisc_refcount_inc(new); + old_q = htb_graft_helper(dev_queue, new); + } *old = qdisc_replace(sch, new, &cl->leaf.q); + + if (q->offload) { + WARN_ON(old_q != *old); + qdisc_put(old_q); + } + return 0; } @@ -1184,9 +1453,10 @@ static inline int htb_parent_last_child(struct htb_class *cl) return 1; } -static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, +static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl, struct Qdisc *new_q) { + struct htb_sched *q = qdisc_priv(sch); struct htb_class *parent = cl->parent; WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity); @@ -1204,6 +1474,76 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, parent->cmode = HTB_CAN_SEND; } +static void htb_parent_to_leaf_offload(struct Qdisc *sch, + struct netdev_queue *dev_queue, + struct Qdisc *new_q) +{ + struct Qdisc *old_q; + + /* One ref for cl->leaf.q, the other for dev_queue->qdisc. */ + qdisc_refcount_inc(new_q); + old_q = htb_graft_helper(dev_queue, new_q); + WARN_ON(!(old_q->flags & TCQ_F_BUILTIN)); +} + +static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl, + bool last_child, bool destroying, + struct netlink_ext_ack *extack) +{ + struct tc_htb_qopt_offload offload_opt; + struct Qdisc *q = cl->leaf.q; + struct Qdisc *old = NULL; + int err; + + if (cl->level) + return -EINVAL; + + WARN_ON(!q); + if (!destroying) { + /* On destroy of HTB, two cases are possible: + * 1. q is a normal qdisc, but q->dev_queue has noop qdisc. + * 2. q is a noop qdisc (for nodes that were inner), + * q->dev_queue is noop_netdev_queue. + */ + old = htb_graft_helper(q->dev_queue, NULL); + WARN_ON(!old); + WARN_ON(old != q); + } + + if (cl->parent) { + cl->parent->bstats_bias.bytes += q->bstats.bytes; + cl->parent->bstats_bias.packets += q->bstats.packets; + } + + offload_opt = (struct tc_htb_qopt_offload) { + .command = !last_child ? TC_HTB_LEAF_DEL : + destroying ? 
TC_HTB_LEAF_DEL_LAST_FORCE : + TC_HTB_LEAF_DEL_LAST, + .classid = cl->common.classid, + .extack = extack, + }; + err = htb_offload(qdisc_dev(sch), &offload_opt); + + if (!err || destroying) + qdisc_put(old); + else + htb_graft_helper(q->dev_queue, old); + + if (last_child) + return err; + + if (!err && offload_opt.moved_qid != 0) { + if (destroying) + q->dev_queue = netdev_get_tx_queue(qdisc_dev(sch), + offload_opt.qid); + else + htb_offload_move_qdisc(sch, offload_opt.moved_qid, + offload_opt.qid); + } + + return err; +} + static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) { if (!cl->level) { @@ -1217,8 +1557,11 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) static void htb_destroy(struct Qdisc *sch) { + struct net_device *dev = qdisc_dev(sch); + struct tc_htb_qopt_offload offload_opt; struct htb_sched *q = qdisc_priv(sch); struct hlist_node *next; + bool nonempty, changed; struct htb_class *cl; unsigned int i; @@ -1237,21 +1580,68 @@ static void htb_destroy(struct Qdisc *sch) cl->block = NULL; } } - for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], - common.hnode) - htb_destroy_class(sch, cl); - } + + do { + nonempty = false; + changed = false; + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], + common.hnode) { + bool last_child; + + if (!q->offload) { + htb_destroy_class(sch, cl); + continue; + } + + nonempty = true; + + if (cl->level) + continue; + + changed = true; + + last_child = htb_parent_last_child(cl); + htb_destroy_class_offload(sch, cl, last_child, + true, NULL); + qdisc_class_hash_remove(&q->clhash, + &cl->common); + if (cl->parent) + cl->parent->children--; + if (last_child) + htb_parent_to_leaf(sch, cl, NULL); + htb_destroy_class(sch, cl); + } + } + } while (changed); + WARN_ON(nonempty); + qdisc_class_hash_destroy(&q->clhash); __qdisc_reset_queue(&q->direct_queue); + + if (!q->offload) + return; + + 
offload_opt = (struct tc_htb_qopt_offload) { + .command = TC_HTB_DESTROY, + }; + htb_offload(dev, &offload_opt); + + if (!q->direct_qdiscs) + return; + for (i = 0; i < q->num_direct_qdiscs && q->direct_qdiscs[i]; i++) + qdisc_put(q->direct_qdiscs[i]); + kfree(q->direct_qdiscs); } -static int htb_delete(struct Qdisc *sch, unsigned long arg) +static int htb_delete(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)arg; struct Qdisc *new_q = NULL; int last_child = 0; + int err; /* TODO: why don't allow to delete subtree ? references ? does * tc subsys guarantee us that in htb_destroy it holds no class @@ -1260,11 +1650,28 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) if (cl->children || cl->filter_cnt) return -EBUSY; - if (!cl->level && htb_parent_last_child(cl)) { - new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + if (!cl->level && htb_parent_last_child(cl)) + last_child = 1; + + if (q->offload) { + err = htb_destroy_class_offload(sch, cl, last_child, false, + extack); + if (err) + return err; + } + + if (last_child) { + struct netdev_queue *dev_queue; + + dev_queue = q->offload ? 
cl->leaf.q->dev_queue : sch->dev_queue; + new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, cl->parent->common.classid, NULL); - last_child = 1; + if (q->offload) { + if (new_q) + htb_set_lockdep_class_child(new_q); + htb_parent_to_leaf_offload(sch, dev_queue, new_q); + } } sch_tree_lock(sch); @@ -1285,7 +1692,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) &q->hlevel[cl->level].wait_pq); if (last_child) - htb_parent_to_leaf(q, cl, new_q); + htb_parent_to_leaf(sch, cl, new_q); sch_tree_unlock(sch); @@ -1300,9 +1707,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, int err = -EINVAL; struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)*arg, *parent; + struct tc_htb_qopt_offload offload_opt; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_HTB_MAX + 1]; struct Qdisc *parent_qdisc = NULL; + struct netdev_queue *dev_queue; struct tc_htb_opt *hopt; u64 rate64, ceil64; int warn = 0; @@ -1335,8 +1744,12 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB], NULL)); + rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; + ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; + if (!cl) { /* new class */ - struct Qdisc *new_q; + struct net_device *dev = qdisc_dev(sch); + struct Qdisc *new_q, *old_q; int prio; struct { struct nlattr nla; @@ -1379,11 +1792,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, NULL, qdisc_root_sleeping_running(sch), tca[TCA_RATE] ? : &est.nla); - if (err) { - tcf_block_put(cl->block); - kfree(cl); - goto failure; - } + if (err) + goto err_block_put; } cl->children = 0; @@ -1392,12 +1802,76 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, for (prio = 0; prio < TC_HTB_NUMPRIO; prio++) RB_CLEAR_NODE(&cl->node[prio]); + cl->common.classid = classid; + + /* Make sure nothing interrupts us in between of two + * ndo_setup_tc calls. 
+ */ + ASSERT_RTNL(); + /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL) * so that can't be used inside of sch_tree_lock * -- thanks to Karlis Peisenieks */ - new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + if (!q->offload) { + dev_queue = sch->dev_queue; + } else if (!(parent && !parent->level)) { + /* Assign a dev_queue to this classid. */ + offload_opt = (struct tc_htb_qopt_offload) { + .command = TC_HTB_LEAF_ALLOC_QUEUE, + .classid = cl->common.classid, + .parent_classid = parent ? + TC_H_MIN(parent->common.classid) : + TC_HTB_CLASSID_ROOT, + .rate = max_t(u64, hopt->rate.rate, rate64), + .ceil = max_t(u64, hopt->ceil.rate, ceil64), + .extack = extack, + }; + err = htb_offload(dev, &offload_opt); + if (err) { + pr_err("htb: TC_HTB_LEAF_ALLOC_QUEUE failed with err = %d\n", + err); + goto err_kill_estimator; + } + dev_queue = netdev_get_tx_queue(dev, offload_opt.qid); + } else { /* First child. */ + dev_queue = parent->leaf.q->dev_queue; + old_q = htb_graft_helper(dev_queue, NULL); + WARN_ON(old_q != parent->leaf.q); + offload_opt = (struct tc_htb_qopt_offload) { + .command = TC_HTB_LEAF_TO_INNER, + .classid = cl->common.classid, + .parent_classid = + TC_H_MIN(parent->common.classid), + .rate = max_t(u64, hopt->rate.rate, rate64), + .ceil = max_t(u64, hopt->ceil.rate, ceil64), + .extack = extack, + }; + err = htb_offload(dev, &offload_opt); + if (err) { + pr_err("htb: TC_HTB_LEAF_TO_INNER failed with err = %d\n", + err); + htb_graft_helper(dev_queue, old_q); + goto err_kill_estimator; + } + parent->bstats_bias.bytes += old_q->bstats.bytes; + parent->bstats_bias.packets += old_q->bstats.packets; + qdisc_put(old_q); + } + new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, classid, NULL); + if (q->offload) { + if (new_q) { + htb_set_lockdep_class_child(new_q); + /* One ref for cl->leaf.q, the other for + * dev_queue->qdisc. + */ + qdisc_refcount_inc(new_q); + } + old_q = htb_graft_helper(dev_queue, new_q); + /* No qdisc_put needed. 
*/ + WARN_ON(!(old_q->flags & TCQ_F_BUILTIN)); + } sch_tree_lock(sch); if (parent && !parent->level) { /* turn parent into inner node */ @@ -1415,10 +1889,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, : TC_HTB_MAXDEPTH) - 1; memset(&parent->inner, 0, sizeof(parent->inner)); } + /* leaf (we) needs elementary qdisc */ cl->leaf.q = new_q ? new_q : &noop_qdisc; - cl->common.classid = classid; cl->parent = parent; /* set class to be in HTB_CAN_SEND state */ @@ -1444,12 +1918,30 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (err) return err; } - sch_tree_lock(sch); - } - rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; + if (q->offload) { + struct net_device *dev = qdisc_dev(sch); + + offload_opt = (struct tc_htb_qopt_offload) { + .command = TC_HTB_NODE_MODIFY, + .classid = cl->common.classid, + .rate = max_t(u64, hopt->rate.rate, rate64), + .ceil = max_t(u64, hopt->ceil.rate, ceil64), + .extack = extack, + }; + err = htb_offload(dev, &offload_opt); + if (err) + /* Estimator was replaced, and rollback may fail + * as well, so we don't try to recover it, and + * the estimator won't work property with the + * offload anyway, because bstats are updated + * only when the stats are queried. + */ + return err; + } - ceil64 = tb[TCA_HTB_CEIL64] ? 
nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; + sch_tree_lock(sch); + } psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64); psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64); @@ -1492,6 +1984,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, *arg = (unsigned long)cl; return 0; +err_kill_estimator: + gen_kill_estimator(&cl->rate_est); +err_block_put: + tcf_block_put(cl->block); + kfree(cl); failure: return err; } @@ -1557,6 +2054,7 @@ static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) } static const struct Qdisc_class_ops htb_class_ops = { + .select_queue = htb_select_queue, .graft = htb_graft, .leaf = htb_leaf, .qlen_notify = htb_qlen_notify, @@ -1579,6 +2077,7 @@ static struct Qdisc_ops htb_qdisc_ops __read_mostly = { .dequeue = htb_dequeue, .peek = qdisc_peek_dequeued, .init = htb_init, + .attach = htb_attach, .reset = htb_reset, .destroy = htb_destroy, .dump = htb_dump, diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index c65077f0c0f3..5a457ff61acd 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -405,7 +405,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, /* We restart the measurement cycle if the following conditions are met * 1. If the delay has been low for 2 consecutive Tupdate periods * 2. Calculated drop probability is zero - * 3. If average dq_rate_estimator is enabled, we have atleast one + * 3. 
If average dq_rate_estimator is enabled, we have at least one * estimate for the avg_dq_rate ie., is a non-zero value */ if ((vars->qdelay < params->target / 2) && diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 6335230a971e..1db9d4a2ef5e 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -529,7 +529,8 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) kfree(cl); } -static int qfq_delete_class(struct Qdisc *sch, unsigned long arg) +static int qfq_delete_class(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)arg; diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index e89fab6ccb34..b4ae34d7aa96 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -250,7 +250,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0; ctl = nla_data(tb[TCA_RED_PARMS]); - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) return -EINVAL; err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS, diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index da047a37a3bf..dde829d4b9f8 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -649,7 +649,8 @@ static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return -ENOSYS; } -static int sfb_delete(struct Qdisc *sch, unsigned long cl) +static int sfb_delete(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) { return -ENOSYS; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index bca2be57d9fc..b25e51440623 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -647,7 +647,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) } if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, - ctl_v1->Wlog)) + ctl_v1->Wlog, ctl_v1->Scell_log)) 
return -EINVAL; if (ctl_v1 && ctl_v1->qth_min) { p = kmalloc(sizeof(*p), GFP_KERNEL); diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index b0ad7687ee2c..8287894541e3 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -6,6 +6,7 @@ * */ +#include <linux/ethtool.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/kernel.h> @@ -240,7 +241,7 @@ static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb, /* Here, we are just trying to find out the * first available interval in the next cycle. */ - entry_available = 1; + entry_available = true; entry_found = entry; *interval_start = ktime_add_ns(curr_intv_start, cycle); *interval_end = ktime_add_ns(curr_intv_end, cycle); @@ -371,7 +372,7 @@ static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) packet_transmit_time = length_to_duration(q, len); do { - sched_changed = 0; + sched_changed = false; entry = find_entry_to_transmit(skb, sch, sched, admin, minimum_time, @@ -389,7 +390,7 @@ static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) if (admin && admin != sched && ktime_after(txtime, admin->base_time)) { sched = admin; - sched_changed = 1; + sched_changed = true; continue; } @@ -1596,6 +1597,22 @@ free_sched: return err; } +static void taprio_reset(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int i; + + hrtimer_cancel(&q->advance_timer); + if (q->qdiscs) { + for (i = 0; i < dev->num_tx_queues; i++) + if (q->qdiscs[i]) + qdisc_reset(q->qdiscs[i]); + } + sch->qstats.backlog = 0; + sch->q.qlen = 0; +} + static void taprio_destroy(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); @@ -1606,12 +1623,11 @@ static void taprio_destroy(struct Qdisc *sch) list_del(&q->taprio_list); spin_unlock(&taprio_list_lock); - hrtimer_cancel(&q->advance_timer); taprio_disable_offload(dev, q, NULL); if (q->qdiscs) { - for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++) + for 
(i = 0; i < dev->num_tx_queues; i++) qdisc_put(q->qdiscs[i]); kfree(q->qdiscs); @@ -1953,6 +1969,7 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .init = taprio_init, .change = taprio_change, .destroy = taprio_destroy, + .reset = taprio_reset, .peek = taprio_peek, .dequeue = taprio_dequeue, .enqueue = taprio_enqueue, diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 39d7fa9569f8..5da599ff84a9 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -11,6 +11,7 @@ menuconfig IP_SCTP select CRYPTO_HMAC select CRYPTO_SHA1 select LIBCRC32C + select NET_UDP_TUNNEL help Stream Control Transmission Protocol diff --git a/net/sctp/associola.c b/net/sctp/associola.c index fdb69d46276d..336df4b36655 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -99,6 +99,8 @@ static struct sctp_association *sctp_association_init( */ asoc->hbinterval = msecs_to_jiffies(sp->hbinterval); + asoc->encap_port = sp->encap_port; + /* Initialize path max retrans value. */ asoc->pathmaxrxt = sp->pathmaxrxt; @@ -624,6 +626,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, */ peer->hbinterval = asoc->hbinterval; + peer->encap_port = asoc->encap_port; + /* Set the path max_retrans. 
*/ peer->pathmaxrxt = asoc->pathmaxrxt; diff --git a/net/sctp/input.c b/net/sctp/input.c index 55d4fc6f371d..d508f6f3dd08 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -449,7 +449,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk, else { if (!mod_timer(&t->proto_unreach_timer, jiffies + (HZ/20))) - sctp_association_hold(asoc); + sctp_transport_hold(t); } } else { struct net *net = sock_net(sk); @@ -458,7 +458,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk, "encountered!\n", __func__); if (del_timer(&t->proto_unreach_timer)) - sctp_association_put(asoc); + sctp_transport_put(t); sctp_do_sm(net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 8a58f42d6d19..c3e89c776e66 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -55,6 +55,7 @@ #include <net/inet_common.h> #include <net/inet_ecn.h> #include <net/sctp/sctp.h> +#include <net/udp_tunnel.h> #include <linux/uaccess.h> @@ -191,33 +192,53 @@ out: return ret; } -static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) +static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) { + struct dst_entry *dst = dst_clone(t->dst); + struct flowi6 *fl6 = &t->fl.u.ip6; struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); - struct flowi6 *fl6 = &transport->fl.u.ip6; __u8 tclass = np->tclass; - int res; + __be32 label; pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb, skb->len, &fl6->saddr, &fl6->daddr); - if (transport->dscp & SCTP_DSCP_SET_MASK) - tclass = transport->dscp & SCTP_DSCP_VAL_MASK; + if (t->dscp & SCTP_DSCP_SET_MASK) + tclass = t->dscp & SCTP_DSCP_VAL_MASK; if (INET_ECN_is_capable(tclass)) IP6_ECN_flow_xmit(sk, fl6->flowlabel); - if (!(transport->param_flags & SPP_PMTUD_ENABLE)) + if (!(t->param_flags & SPP_PMTUD_ENABLE)) skb->ignore_df = 1; SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); - rcu_read_lock(); - res = ip6_xmit(sk, skb, fl6, sk->sk_mark, 
rcu_dereference(np->opt), - tclass, sk->sk_priority); - rcu_read_unlock(); - return res; + if (!t->encap_port || !sctp_sk(sk)->udp_port) { + int res; + + skb_dst_set(skb, dst); + rcu_read_lock(); + res = ip6_xmit(sk, skb, fl6, sk->sk_mark, + rcu_dereference(np->opt), + tclass, sk->sk_priority); + rcu_read_unlock(); + return res; + } + + if (skb_is_gso(skb)) + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + + skb->encapsulation = 1; + skb_reset_inner_mac_header(skb); + skb_reset_inner_transport_header(skb); + skb_set_inner_ipproto(skb, IPPROTO_SCTP); + label = ip6_make_flowlabel(sock_net(sk), skb, fl6->flowlabel, true, fl6); + + return udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, + &fl6->daddr, tclass, ip6_dst_hoplimit(dst), + label, sctp_sk(sk)->udp_port, t->encap_port, false); } /* Returns the dst cache entry for the given source and destination ip @@ -1053,6 +1074,7 @@ static struct inet_protosw sctpv6_stream_protosw = { static int sctp6_rcv(struct sk_buff *skb) { + SCTP_INPUT_CB(skb)->encap_port = 0; return sctp_rcv(skb) ? -1 : 0; } diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 74847d613835..eb874e3c399a 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -27,7 +27,11 @@ static __le32 sctp_gso_make_checksum(struct sk_buff *skb) { skb->ip_summed = CHECKSUM_NONE; skb->csum_not_inet = 0; - gso_reset_checksum(skb, ~0); + /* csum and csum_start in GSO CB may be needed to do the UDP + * checksum when it's a UDP tunneling packet. 
+ */ + SKB_GSO_CB(skb)->csum = (__force __wsum)~0; + SKB_GSO_CB(skb)->csum_start = skb_headroom(skb) + skb->len; return sctp_compute_cksum(skb, skb_transport_offset(skb)); } @@ -64,7 +68,7 @@ static struct sk_buff *sctp_gso_segment(struct sk_buff *skb, goto out; } - segs = skb_segment(skb, features | NETIF_F_HW_CSUM | NETIF_F_SG); + segs = skb_segment(skb, (features | NETIF_F_HW_CSUM) & ~NETIF_F_SG); if (IS_ERR(segs)) goto out; diff --git a/net/sctp/output.c b/net/sctp/output.c index 1441eaf460bb..6614c9fdc51e 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -508,20 +508,14 @@ merge: sizeof(struct inet6_skb_parm))); skb_shinfo(head)->gso_segs = pkt_count; skb_shinfo(head)->gso_size = GSO_BY_FRAGS; - rcu_read_lock(); - if (skb_dst(head) != tp->dst) { - dst_hold(tp->dst); - sk_setup_caps(sk, tp->dst); - } - rcu_read_unlock(); goto chksum; } if (sctp_checksum_disable) return 1; - if (!(skb_dst(head)->dev->features & NETIF_F_SCTP_CRC) || - dst_xfrm(skb_dst(head)) || packet->ipfragok) { + if (!(tp->dst->dev->features & NETIF_F_SCTP_CRC) || + dst_xfrm(tp->dst) || packet->ipfragok || tp->encap_port) { struct sctphdr *sh = (struct sctphdr *)skb_transport_header(head); @@ -548,7 +542,6 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) struct sctp_association *asoc = tp->asoc; struct sctp_chunk *chunk, *tmp; int pkt_count, gso = 0; - struct dst_entry *dst; struct sk_buff *head; struct sctphdr *sh; struct sock *sk; @@ -585,13 +578,18 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) sh->checksum = 0; /* drop packet if no dst */ - dst = dst_clone(tp->dst); - if (!dst) { + if (!tp->dst) { IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(head); goto out; } - skb_dst_set(head, dst); + + rcu_read_lock(); + if (__sk_dst_get(sk) != tp->dst) { + dst_hold(tp->dst); + sk_setup_caps(sk, tp->dst); + } + rcu_read_unlock(); /* pack up chunks */ pkt_count = sctp_packet_pack(packet, head, gso, gfp); diff --git a/net/sctp/proc.c 
b/net/sctp/proc.c index f7da88ae20a5..982a87b3e11f 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -215,6 +215,12 @@ static void sctp_transport_seq_stop(struct seq_file *seq, void *v) { struct sctp_ht_iter *iter = seq->private; + if (v && v != SEQ_START_TOKEN) { + struct sctp_transport *transport = v; + + sctp_transport_put(transport); + } + sctp_transport_walk_stop(&iter->hti); } @@ -222,6 +228,12 @@ static void *sctp_transport_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sctp_ht_iter *iter = seq->private; + if (v && v != SEQ_START_TOKEN) { + struct sctp_transport *transport = v; + + sctp_transport_put(transport); + } + ++*pos; return sctp_transport_get_next(seq_file_net(seq), &iter->hti); @@ -277,8 +289,6 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) sk->sk_rcvbuf); seq_printf(seq, "\n"); - sctp_transport_put(transport); - return 0; } @@ -354,8 +364,6 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } - sctp_transport_put(transport); - return 0; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 25833238fe93..6f2bbfeec3a4 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -44,6 +44,7 @@ #include <net/addrconf.h> #include <net/inet_common.h> #include <net/inet_ecn.h> +#include <net/udp_tunnel.h> #define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024) @@ -840,6 +841,92 @@ static int sctp_ctl_sock_init(struct net *net) return 0; } +static int sctp_udp_rcv(struct sock *sk, struct sk_buff *skb) +{ + SCTP_INPUT_CB(skb)->encap_port = udp_hdr(skb)->source; + + skb_set_transport_header(skb, sizeof(struct udphdr)); + sctp_rcv(skb); + return 0; +} + +static int sctp_udp_err_lookup(struct sock *sk, struct sk_buff *skb) +{ + struct sctp_association *asoc; + struct sctp_transport *t; + int family; + + skb->transport_header += sizeof(struct udphdr); + family = (ip_hdr(skb)->version == 4) ? 
AF_INET : AF_INET6; + sk = sctp_err_lookup(dev_net(skb->dev), family, skb, sctp_hdr(skb), + &asoc, &t); + if (!sk) + return -ENOENT; + + sctp_err_finish(sk, t); + return 0; +} + +int sctp_udp_sock_start(struct net *net) +{ + struct udp_tunnel_sock_cfg tuncfg = {NULL}; + struct udp_port_cfg udp_conf = {0}; + struct socket *sock; + int err; + + udp_conf.family = AF_INET; + udp_conf.local_ip.s_addr = htonl(INADDR_ANY); + udp_conf.local_udp_port = htons(net->sctp.udp_port); + err = udp_sock_create(net, &udp_conf, &sock); + if (err) { + pr_err("Failed to create the SCTP UDP tunneling v4 sock\n"); + return err; + } + + tuncfg.encap_type = 1; + tuncfg.encap_rcv = sctp_udp_rcv; + tuncfg.encap_err_lookup = sctp_udp_err_lookup; + setup_udp_tunnel_sock(net, sock, &tuncfg); + net->sctp.udp4_sock = sock->sk; + +#if IS_ENABLED(CONFIG_IPV6) + memset(&udp_conf, 0, sizeof(udp_conf)); + + udp_conf.family = AF_INET6; + udp_conf.local_ip6 = in6addr_any; + udp_conf.local_udp_port = htons(net->sctp.udp_port); + udp_conf.use_udp6_rx_checksums = true; + udp_conf.ipv6_v6only = true; + err = udp_sock_create(net, &udp_conf, &sock); + if (err) { + pr_err("Failed to create the SCTP UDP tunneling v6 sock\n"); + udp_tunnel_sock_release(net->sctp.udp4_sock->sk_socket); + net->sctp.udp4_sock = NULL; + return err; + } + + tuncfg.encap_type = 1; + tuncfg.encap_rcv = sctp_udp_rcv; + tuncfg.encap_err_lookup = sctp_udp_err_lookup; + setup_udp_tunnel_sock(net, sock, &tuncfg); + net->sctp.udp6_sock = sock->sk; +#endif + + return 0; +} + +void sctp_udp_sock_stop(struct net *net) +{ + if (net->sctp.udp4_sock) { + udp_tunnel_sock_release(net->sctp.udp4_sock->sk_socket); + net->sctp.udp4_sock = NULL; + } + if (net->sctp.udp6_sock) { + udp_tunnel_sock_release(net->sctp.udp6_sock->sk_socket); + net->sctp.udp6_sock = NULL; + } +} + /* Register address family specific functions. 
*/ int sctp_register_af(struct sctp_af *af) { @@ -971,25 +1058,44 @@ static int sctp_inet_supported_addrs(const struct sctp_sock *opt, } /* Wrapper routine that calls the ip transmit routine. */ -static inline int sctp_v4_xmit(struct sk_buff *skb, - struct sctp_transport *transport) +static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t) { - struct inet_sock *inet = inet_sk(skb->sk); + struct dst_entry *dst = dst_clone(t->dst); + struct flowi4 *fl4 = &t->fl.u.ip4; + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(sk); __u8 dscp = inet->tos; + __be16 df = 0; pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb, - skb->len, &transport->fl.u.ip4.saddr, - &transport->fl.u.ip4.daddr); + skb->len, &fl4->saddr, &fl4->daddr); - if (transport->dscp & SCTP_DSCP_SET_MASK) - dscp = transport->dscp & SCTP_DSCP_VAL_MASK; + if (t->dscp & SCTP_DSCP_SET_MASK) + dscp = t->dscp & SCTP_DSCP_VAL_MASK; + + inet->pmtudisc = t->param_flags & SPP_PMTUD_ENABLE ? IP_PMTUDISC_DO + : IP_PMTUDISC_DONT; + SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); - inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ? 
- IP_PMTUDISC_DO : IP_PMTUDISC_DONT; + if (!t->encap_port || !sctp_sk(sk)->udp_port) { + skb_dst_set(skb, dst); + return __ip_queue_xmit(sk, skb, &t->fl, dscp); + } - SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS); + if (skb_is_gso(skb)) + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; - return __ip_queue_xmit(&inet->sk, skb, &transport->fl, dscp); + if (ip_dont_fragment(sk, dst) && !skb->ignore_df) + df = htons(IP_DF); + + skb->encapsulation = 1; + skb_reset_inner_mac_header(skb); + skb_reset_inner_transport_header(skb); + skb_set_inner_ipproto(skb, IPPROTO_SCTP); + udp_tunnel_xmit_skb((struct rtable *)dst, sk, skb, fl4->saddr, + fl4->daddr, dscp, ip4_dst_hoplimit(dst), df, + sctp_sk(sk)->udp_port, t->encap_port, false, false); + return 0; } static struct sctp_af sctp_af_inet; @@ -1054,9 +1160,15 @@ static struct inet_protosw sctp_stream_protosw = { .flags = SCTP_PROTOSW_FLAG }; +static int sctp4_rcv(struct sk_buff *skb) +{ + SCTP_INPUT_CB(skb)->encap_port = 0; + return sctp_rcv(skb); +} + /* Register with IP layer. */ static const struct net_protocol sctp_protocol = { - .handler = sctp_rcv, + .handler = sctp4_rcv, .err_handler = sctp_v4_err, .no_policy = 1, .netns_ok = 1, @@ -1271,6 +1383,12 @@ static int __net_init sctp_defaults_init(struct net *net) /* Enable ECN by default. 
*/ net->sctp.ecn_enable = 1; + /* Set UDP tunneling listening port to 0 by default */ + net->sctp.udp_port = 0; + + /* Set remote encap port to 0 by default */ + net->sctp.encap_port = 0; + /* Set SCOPE policy to enabled */ net->sctp.scope_policy = SCTP_SCOPE_POLICY_ENABLE; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 9a56ae2f3651..f77484df097b 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1142,6 +1142,26 @@ nodata: return retval; } +struct sctp_chunk *sctp_make_new_encap_port(const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + struct sctp_new_encap_port_hdr nep; + struct sctp_chunk *retval; + + retval = sctp_make_abort(asoc, chunk, + sizeof(struct sctp_errhdr) + sizeof(nep)); + if (!retval) + goto nodata; + + sctp_init_cause(retval, SCTP_ERROR_NEW_ENCAP_PORT, sizeof(nep)); + nep.cur_port = SCTP_INPUT_CB(chunk->skb)->encap_port; + nep.new_port = chunk->transport->encap_port; + sctp_addto_chunk(retval, sizeof(nep), &nep); + +nodata: + return retval; +} + /* Make a HEARTBEAT chunk. */ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, const struct sctp_transport *transport) @@ -2321,6 +2341,7 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, * added as the primary transport. The source address seems to * be a better choice than any of the embedded addresses. */ + asoc->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port; if (!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE)) goto nomem; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 813d30767204..0948f14ce221 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -419,7 +419,7 @@ void sctp_generate_proto_unreach_event(struct timer_list *t) /* Try again later. 
*/ if (!mod_timer(&transport->proto_unreach_timer, jiffies + (HZ/20))) - sctp_association_hold(asoc); + sctp_transport_hold(transport); goto out_unlock; } @@ -435,7 +435,7 @@ void sctp_generate_proto_unreach_event(struct timer_list *t) out_unlock: bh_unlock_sock(sk); - sctp_association_put(asoc); + sctp_transport_put(transport); } /* Handle the timeout of the RE-CONFIG timer. */ diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index c669f8bd1eab..af2b7041fa4e 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -87,6 +87,13 @@ static enum sctp_disposition sctp_sf_tabort_8_4_8( const union sctp_subtype type, void *arg, struct sctp_cmd_seq *commands); +static enum sctp_disposition sctp_sf_new_encap_port( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands); static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk); static enum sctp_disposition sctp_stop_t1_and_abort( @@ -1493,6 +1500,10 @@ static enum sctp_disposition sctp_sf_do_unexpected_init( if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); + + if (SCTP_INPUT_CB(chunk->skb)->encap_port != chunk->transport->encap_port) + return sctp_sf_new_encap_port(net, ep, asoc, type, arg, commands); + /* Grab the INIT header. */ chunk->subh.init_hdr = (struct sctp_inithdr *)chunk->skb->data; @@ -3392,6 +3403,45 @@ static enum sctp_disposition sctp_sf_tabort_8_4_8( sctp_packet_append_chunk(packet, abort); + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, SCTP_PACKET(packet)); + + SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); + + sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + return SCTP_DISPOSITION_CONSUME; +} + +/* Handling of SCTP Packets Containing an INIT Chunk Matching an + * Existing Associations when the UDP encap port is incorrect. 
+ * + * From Section 4 at draft-tuexen-tsvwg-sctp-udp-encaps-cons-03. + */ +static enum sctp_disposition sctp_sf_new_encap_port( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) +{ + struct sctp_packet *packet = NULL; + struct sctp_chunk *chunk = arg; + struct sctp_chunk *abort; + + packet = sctp_ootb_pkt_new(net, asoc, chunk); + if (!packet) + return SCTP_DISPOSITION_NOMEM; + + abort = sctp_make_new_encap_port(asoc, chunk); + if (!abort) { + sctp_ootb_pkt_free(packet); + return SCTP_DISPOSITION_NOMEM; + } + + abort->skb->sk = ep->base.sk; + + sctp_packet_append_chunk(packet, abort); + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, SCTP_PACKET(packet)); @@ -6268,6 +6318,8 @@ static struct sctp_packet *sctp_ootb_pkt_new( if (!transport) goto nomem; + transport->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port; + /* Cache a route for the transport with the chunk's destination as * the source address. */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 53d0a4161df3..a710917c5ac7 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4417,6 +4417,55 @@ out: return retval; } +static int sctp_setsockopt_encap_port(struct sock *sk, + struct sctp_udpencaps *encap, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_transport *t; + __be16 encap_port; + + if (optlen != sizeof(*encap)) + return -EINVAL; + + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. 
+ */ + encap_port = (__force __be16)encap->sue_port; + if (!sctp_is_any(sk, (union sctp_addr *)&encap->sue_address)) { + t = sctp_addr_id2transport(sk, &encap->sue_address, + encap->sue_assoc_id); + if (!t) + return -EINVAL; + + t->encap_port = encap_port; + return 0; + } + + /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the + * socket is a one to many style socket, and an association + * was not found, then the id was invalid. + */ + asoc = sctp_id2assoc(sk, encap->sue_assoc_id); + if (!asoc && encap->sue_assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) + return -EINVAL; + + /* If changes are for association, also apply encap_port to + * each transport. + */ + if (asoc) { + list_for_each_entry(t, &asoc->peer.transport_addr_list, + transports) + t->encap_port = encap_port; + + return 0; + } + + sctp_sk(sk)->encap_port = encap_port; + return 0; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4636,6 +4685,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: retval = sctp_setsockopt_pf_expose(sk, kopt, optlen); break; + case SCTP_REMOTE_UDP_ENCAPS_PORT: + retval = sctp_setsockopt_encap_port(sk, kopt, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -4876,6 +4928,8 @@ static int sctp_init_sock(struct sock *sk) * be modified via SCTP_PEER_ADDR_PARAMS */ sp->hbinterval = net->sctp.hb_interval; + sp->udp_port = htons(net->sctp.udp_port); + sp->encap_port = htons(net->sctp.encap_port); sp->pathmaxrxt = net->sctp.max_retrans_path; sp->pf_retrans = net->sctp.pf_retrans; sp->ps_retrans = net->sctp.ps_retrans; @@ -7790,6 +7844,65 @@ out: return retval; } +static int sctp_getsockopt_encap_port(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_association *asoc; + struct sctp_udpencaps encap; + struct sctp_transport *t; + __be16 encap_port; + + if (len < sizeof(encap)) + return 
-EINVAL; + + len = sizeof(encap); + if (copy_from_user(&encap, optval, len)) + return -EFAULT; + + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. + */ + if (!sctp_is_any(sk, (union sctp_addr *)&encap.sue_address)) { + t = sctp_addr_id2transport(sk, &encap.sue_address, + encap.sue_assoc_id); + if (!t) { + pr_debug("%s: failed no transport\n", __func__); + return -EINVAL; + } + + encap_port = t->encap_port; + goto out; + } + + /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the + * socket is a one to many style socket, and an association + * was not found, then the id was invalid. + */ + asoc = sctp_id2assoc(sk, encap.sue_assoc_id); + if (!asoc && encap.sue_assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) { + pr_debug("%s: failed no association\n", __func__); + return -EINVAL; + } + + if (asoc) { + encap_port = asoc->encap_port; + goto out; + } + + encap_port = sctp_sk(sk)->encap_port; + +out: + encap.sue_port = (__force uint16_t)encap_port; + if (copy_to_user(optval, &encap, len)) + return -EFAULT; + + if (put_user(len, optlen)) + return -EFAULT; + + return 0; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -8010,6 +8123,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: retval = sctp_getsockopt_pf_expose(sk, len, optval, optlen); break; + case SCTP_REMOTE_UDP_ENCAPS_PORT: + retval = sctp_getsockopt_encap_port(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index c16c80963e55..e92df779af73 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -36,6 +36,7 @@ static int rto_alpha_max = 1000; static int rto_beta_max = 1000; static int pf_expose_max = SCTP_PF_EXPOSE_MAX; static int ps_retrans_max = SCTP_PS_RETRANS_MAX; +static int udp_port_max = 65535; static unsigned long 
max_autoclose_min = 0; static unsigned long max_autoclose_max = @@ -48,6 +49,8 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); +static int proc_sctp_do_udp_port(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos); static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_auth(struct ctl_table *ctl, int write, @@ -291,6 +294,24 @@ static struct ctl_table sctp_net_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "udp_port", + .data = &init_net.sctp.udp_port, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_sctp_do_udp_port, + .extra1 = SYSCTL_ZERO, + .extra2 = &udp_port_max, + }, + { + .procname = "encap_port", + .data = &init_net.sctp.encap_port, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + .extra2 = &udp_port_max, + }, + { .procname = "addr_scope_policy", .data = &init_net.sctp.scope_policy, .maxlen = sizeof(int), @@ -477,6 +498,47 @@ static int proc_sctp_do_auth(struct ctl_table *ctl, int write, return ret; } +static int proc_sctp_do_udp_port(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = current->nsproxy->net_ns; + unsigned int min = *(unsigned int *)ctl->extra1; + unsigned int max = *(unsigned int *)ctl->extra2; + struct ctl_table tbl; + int ret, new_value; + + memset(&tbl, 0, sizeof(struct ctl_table)); + tbl.maxlen = sizeof(unsigned int); + + if (write) + tbl.data = &new_value; + else + tbl.data = &net->sctp.udp_port; + + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) { + struct sock *sk = net->sctp.ctl_sock; + + if (new_value > max || new_value < min) + return -EINVAL; + + net->sctp.udp_port = new_value; + 
sctp_udp_sock_stop(net); + if (new_value) { + ret = sctp_udp_sock_start(net); + if (ret) + net->sctp.udp_port = 0; + } + + /* Update the value in the control socket */ + lock_sock(sk); + sctp_sk(sk)->udp_port = htons(net->sctp.udp_port); + release_sock(sk); + } + + return ret; +} + int sctp_sysctl_net_register(struct net *net) { struct ctl_table *table; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 806af58f4375..bf0ac467e757 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -8,7 +8,7 @@ * * This file is part of the SCTP kernel implementation * - * This module provides the abstraction for an SCTP tranport representing + * This module provides the abstraction for an SCTP transport representing * a remote transport address. For local transport addresses, we just use * union sctp_addr. * @@ -123,7 +123,7 @@ void sctp_transport_free(struct sctp_transport *transport) /* Delete the T3_rtx timer if it's active. * There is no point in not doing this now and letting * structure hang around in memory since we know - * the tranport is going away. + * the transport is going away. */ if (del_timer(&transport->T3_rtx_timer)) sctp_transport_put(transport); @@ -133,7 +133,7 @@ void sctp_transport_free(struct sctp_transport *transport) /* Delete the ICMP proto unreachable timer if it's active. 
*/ if (del_timer(&transport->proto_unreach_timer)) - sctp_association_put(transport->asoc); + sctp_transport_put(transport); sctp_transport_put(transport); } diff --git a/net/smc/Makefile b/net/smc/Makefile index cb1254541f37..77e54fe42b1c 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o -smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o +smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e9f487c8c6d5..47340b3b514f 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -45,6 +45,7 @@ #include "smc_ib.h" #include "smc_ism.h" #include "smc_pnet.h" +#include "smc_netlink.h" #include "smc_tx.h" #include "smc_rx.h" #include "smc_close.h" @@ -552,8 +553,7 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, return smc_connect_fallback(smc, reason_code); } -/* abort connecting */ -static void smc_connect_abort(struct smc_sock *smc, int local_first) +static void smc_conn_abort(struct smc_sock *smc, int local_first) { if (local_first) smc_lgr_cleanup_early(&smc->conn); @@ -669,7 +669,7 @@ static int smc_find_proposal_devices(struct smc_sock *smc, ini->smc_type_v1 = SMC_TYPE_N; } /* else RDMA is supported for this connection */ } - if (smc_ism_v2_capable && smc_find_ism_v2_device_clnt(smc, ini)) + if (smc_ism_is_v2_capable() && smc_find_ism_v2_device_clnt(smc, ini)) ini->smc_type_v2 = SMC_TYPE_N; /* if neither ISM nor RDMA are supported, fallback */ @@ -814,7 +814,7 @@ static int smc_connect_rdma(struct smc_sock *smc, return 0; connect_abort: - smc_connect_abort(smc, ini->first_contact_local); + smc_conn_abort(smc, ini->first_contact_local); mutex_unlock(&smc_client_lgr_pending); smc->connect_nonblock = 0; @@ -893,7 +893,7 @@ static int smc_connect_ism(struct smc_sock *smc, return 0; connect_abort: - 
smc_connect_abort(smc, ini->first_contact_local); + smc_conn_abort(smc, ini->first_contact_local); mutex_unlock(&smc_server_lgr_pending); smc->connect_nonblock = 0; @@ -921,7 +921,7 @@ static int smc_connect_check_aclc(struct smc_init_info *ini, /* perform steps before actually connecting */ static int __smc_connect(struct smc_sock *smc) { - u8 version = smc_ism_v2_capable ? SMC_V2 : SMC_V1; + u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1; struct smc_clc_msg_accept_confirm_v2 *aclc2; struct smc_clc_msg_accept_confirm *aclc; struct smc_init_info *ini = NULL; @@ -946,9 +946,9 @@ static int __smc_connect(struct smc_sock *smc) version); ini->smcd_version = SMC_V1; - ini->smcd_version |= smc_ism_v2_capable ? SMC_V2 : 0; + ini->smcd_version |= smc_ism_is_v2_capable() ? SMC_V2 : 0; ini->smc_type_v1 = SMC_TYPE_B; - ini->smc_type_v2 = smc_ism_v2_capable ? SMC_TYPE_D : SMC_TYPE_N; + ini->smc_type_v2 = smc_ism_is_v2_capable() ? SMC_TYPE_D : SMC_TYPE_N; /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(smc->clcsock, ini)) { @@ -979,7 +979,8 @@ static int __smc_connect(struct smc_sock *smc) /* check if smc modes and versions of CLC proposal and accept match */ rc = smc_connect_check_aclc(ini, aclc); - version = aclc->hdr.version == SMC_V1 ? SMC_V1 : version; + version = aclc->hdr.version == SMC_V1 ? 
SMC_V1 : SMC_V2; + ini->smcd_version = version; if (rc) goto vlan_cleanup; @@ -1320,10 +1321,7 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, int local_first, u8 version) { /* RDMA setup failed, switch back to TCP */ - if (local_first) - smc_lgr_cleanup_early(&new_smc->conn); - else - smc_conn_free(&new_smc->conn); + smc_conn_abort(new_smc, local_first); if (reason_code < 0) { /* error, no fallback possible */ smc_listen_out_err(new_smc); return; @@ -1346,6 +1344,7 @@ static int smc_listen_v2_check(struct smc_sock *new_smc, { struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext; struct smc_clc_v2_extension *pclc_v2_ext; + int rc = SMC_CLC_DECL_PEERNOSMC; ini->smc_type_v1 = pclc->hdr.typev1; ini->smc_type_v2 = pclc->hdr.typev2; @@ -1353,29 +1352,30 @@ static int smc_listen_v2_check(struct smc_sock *new_smc, if (pclc->hdr.version > SMC_V1) ini->smcd_version |= ini->smc_type_v2 != SMC_TYPE_N ? SMC_V2 : 0; - if (!smc_ism_v2_capable) { + if (!(ini->smcd_version & SMC_V2)) { + rc = SMC_CLC_DECL_PEERNOSMC; + goto out; + } + if (!smc_ism_is_v2_capable()) { ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOISM2SUPP; goto out; } pclc_v2_ext = smc_get_clc_v2_ext(pclc); if (!pclc_v2_ext) { ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOV2EXT; goto out; } pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext); - if (!pclc_smcd_v2_ext) + if (!pclc_smcd_v2_ext) { ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOV2DEXT; + } out: - if (!ini->smcd_version) { - if (pclc->hdr.typev1 == SMC_TYPE_B || - pclc->hdr.typev2 == SMC_TYPE_B) - return SMC_CLC_DECL_NOSMCDEV; - if (pclc->hdr.typev1 == SMC_TYPE_D || - pclc->hdr.typev2 == SMC_TYPE_D) - return SMC_CLC_DECL_NOSMCDDEV; - return SMC_CLC_DECL_NOSMCRDEV; - } + if (!ini->smcd_version) + return rc; return 0; } @@ -1427,10 +1427,7 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, /* Create send and receive buffers */ rc = smc_buf_create(new_smc, true); if (rc) { - if 
(ini->first_contact_local) - smc_lgr_cleanup_early(&new_smc->conn); - else - smc_conn_free(&new_smc->conn); + smc_conn_abort(new_smc, ini->first_contact_local); return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM; } @@ -1473,6 +1470,12 @@ static void smc_check_ism_v2_match(struct smc_init_info *ini, } } +static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini) +{ + if (!ini->rc) + ini->rc = rc; +} + static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) @@ -1483,7 +1486,7 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, unsigned int matches = 0; u8 smcd_version; u8 *eid = NULL; - int i; + int i, rc; if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2)) goto not_found; @@ -1492,8 +1495,10 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, smc_v2_ext = smc_get_clc_v2_ext(pclc); smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); if (!smcd_v2_ext || - !smc_v2_ext->hdr.flag.seid) /* no system EID support for SMCD */ + !smc_v2_ext->hdr.flag.seid) { /* no system EID support for SMCD */ + smc_find_ism_store_rc(SMC_CLC_DECL_NOSEID, ini); goto not_found; + } mutex_lock(&smcd_dev_list.mutex); if (pclc_smcd->ism.chid) @@ -1525,9 +1530,12 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, ini->smcd_version = SMC_V2; ini->is_smcd = true; ini->ism_selected = i; - if (smc_listen_ism_init(new_smc, ini)) + rc = smc_listen_ism_init(new_smc, ini); + if (rc) { + smc_find_ism_store_rc(rc, ini); /* try next active ISM device */ continue; + } return; /* matching and usable V2 ISM device found */ } /* no V2 ISM device could be initialized */ @@ -1544,19 +1552,23 @@ static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, struct smc_init_info *ini) { struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc); + int rc = 0; /* check if ISM V1 is available */ if (!(ini->smcd_version & SMC_V1) || 
!smcd_indicated(ini->smc_type_v1)) goto not_found; ini->is_smcd = true; /* prepare ISM check */ ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid); - if (smc_find_ism_device(new_smc, ini)) + rc = smc_find_ism_device(new_smc, ini); + if (rc) goto not_found; ini->ism_selected = 0; - if (!smc_listen_ism_init(new_smc, ini)) + rc = smc_listen_ism_init(new_smc, ini); + if (!rc) return; /* V1 ISM device found */ not_found: + smc_find_ism_store_rc(rc, ini); ini->ism_dev[0] = NULL; ini->is_smcd = false; } @@ -1613,16 +1625,16 @@ static int smc_listen_find_device(struct smc_sock *new_smc, return 0; if (!(ini->smcd_version & SMC_V1)) - return SMC_CLC_DECL_NOSMCDEV; + return ini->rc ?: SMC_CLC_DECL_NOSMCD2DEV; /* check for matching IP prefix and subnet length */ rc = smc_listen_prfx_check(new_smc, pclc); if (rc) - return rc; + return ini->rc ?: rc; /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) - return SMC_CLC_DECL_GETVLANERR; + return ini->rc ?: SMC_CLC_DECL_GETVLANERR; /* check for ISM device matching V1 proposed device */ smc_find_ism_v1_device_serv(new_smc, pclc, ini); @@ -1630,10 +1642,14 @@ static int smc_listen_find_device(struct smc_sock *new_smc, return 0; if (pclc->hdr.typev1 == SMC_TYPE_D) - return SMC_CLC_DECL_NOSMCDDEV; /* skip RDMA and decline */ + /* skip RDMA and decline */ + return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV; /* check if RDMA is available */ - return smc_find_rdma_v1_device_serv(new_smc, pclc, ini); + rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini); + smc_find_ism_store_rc(rc, ini); + + return (!rc) ? 0 : ini->rc; } /* listen worker: finish RDMA setup */ @@ -1666,7 +1682,7 @@ static void smc_listen_work(struct work_struct *work) { struct smc_sock *new_smc = container_of(work, struct smc_sock, smc_listen_work); - u8 version = smc_ism_v2_capable ? SMC_V2 : SMC_V1; + u8 version = smc_ism_is_v2_capable() ? 
SMC_V2 : SMC_V1; struct socket *newclcsock = new_smc->clcsock; struct smc_clc_msg_accept_confirm *cclc; struct smc_clc_msg_proposal_area *buf; @@ -2480,10 +2496,14 @@ static int __init smc_init(void) smc_ism_init(); smc_clc_init(); - rc = smc_pnet_init(); + rc = smc_nl_init(); if (rc) goto out_pernet_subsys; + rc = smc_pnet_init(); + if (rc) + goto out_nl; + rc = -ENOMEM; smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); if (!smc_hs_wq) @@ -2554,6 +2574,8 @@ out_alloc_hs_wq: destroy_workqueue(smc_hs_wq); out_pnet: smc_pnet_exit(); +out_nl: + smc_nl_exit(); out_pernet_subsys: unregister_pernet_subsys(&smc_net_ops); @@ -2571,6 +2593,7 @@ static void __exit smc_exit(void) proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); + smc_nl_exit(); unregister_pernet_subsys(&smc_net_ops); rcu_barrier(); } diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index b1ce6ccbfaec..f23f558054a7 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -389,9 +389,9 @@ static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc) * Context: * - tasklet context */ -static void smcd_cdc_rx_tsklet(unsigned long data) +static void smcd_cdc_rx_tsklet(struct tasklet_struct *t) { - struct smc_connection *conn = (struct smc_connection *)data; + struct smc_connection *conn = from_tasklet(conn, t, rx_tsklet); struct smcd_cdc_msg *data_cdc; struct smcd_cdc_msg cdc; struct smc_sock *smc; @@ -411,7 +411,7 @@ static void smcd_cdc_rx_tsklet(unsigned long data) */ void smcd_cdc_rx_init(struct smc_connection *conn) { - tasklet_init(&conn->rx_tsklet, smcd_cdc_rx_tsklet, (unsigned long)conn); + tasklet_setup(&conn->rx_tsklet, smcd_cdc_rx_tsklet); } /***************************** init, exit, misc ******************************/ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 696d89c2dce4..e286dafd6e88 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -772,6 +772,11 @@ int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact, return len 
> 0 ? 0 : len; } +void smc_clc_get_hostname(u8 **host) +{ + *host = &smc_hostname[0]; +} + void __init smc_clc_init(void) { struct new_utsname *u; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index b3f46ab79e47..32d37f7b70f2 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -37,6 +37,11 @@ #define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */ #define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */ #define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */ +#define SMC_CLC_DECL_NOISM2SUPP 0x03030003 /* hardware has no ISMv2 support */ +#define SMC_CLC_DECL_NOV2EXT 0x03030004 /* peer sent no clc v2 extension */ +#define SMC_CLC_DECL_NOV2DEXT 0x03030005 /* peer sent no clc SMC-Dv2 ext. */ +#define SMC_CLC_DECL_NOSEID 0x03030006 /* peer sent no SEID */ +#define SMC_CLC_DECL_NOSMCD2DEV 0x03030007 /* no SMC-Dv2 device found */ #define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ #define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ #define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ @@ -124,7 +129,7 @@ struct smc_clc_v2_extension { struct smc_clnt_opts_area_hdr hdr; u8 roce[16]; /* RoCEv2 GID */ u8 reserved[16]; - u8 user_eids[0][SMC_MAX_EID_LEN]; + u8 user_eids[][SMC_MAX_EID_LEN]; }; struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ @@ -143,7 +148,7 @@ struct smc_clc_msg_smcd { /* SMC-D GID information */ struct smc_clc_smcd_v2_extension { u8 system_eid[SMC_MAX_EID_LEN]; u8 reserved[16]; - struct smc_clc_smcd_gid_chid gidchid[0]; + struct smc_clc_smcd_gid_chid gidchid[]; }; struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ @@ -329,5 +334,6 @@ int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact, u8 version); void smc_clc_init(void) __init; +void smc_clc_get_hostname(u8 **host); #endif diff --git 
a/net/smc/smc_core.c b/net/smc/smc_core.c index 2b19863f7171..0df85a12651e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -16,6 +16,8 @@ #include <linux/wait.h> #include <linux/reboot.h> #include <linux/mutex.h> +#include <linux/list.h> +#include <linux/smc.h> #include <net/tcp.h> #include <net/sock.h> #include <rdma/ib_verbs.h> @@ -30,12 +32,13 @@ #include "smc_cdc.h" #include "smc_close.h" #include "smc_ism.h" +#include "smc_netlink.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) #define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) -static struct smc_lgr_list smc_lgr_list = { /* established link groups */ +struct smc_lgr_list smc_lgr_list = { /* established link groups */ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), .list = LIST_HEAD_INIT(smc_lgr_list.list), .num = 0, @@ -63,6 +66,16 @@ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, return &smc_lgr_list.list; } +static void smc_ibdev_cnt_inc(struct smc_link *lnk) +{ + atomic_inc(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]); +} + +static void smc_ibdev_cnt_dec(struct smc_link *lnk) +{ + atomic_dec(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]); +} + static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) { /* client link group creation always follows the server link group @@ -139,6 +152,7 @@ static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) } if (!conn->lnk) return SMC_CLC_DECL_NOACTLINK; + atomic_inc(&conn->lnk->conn_cnt); return 0; } @@ -180,6 +194,8 @@ static void __smc_lgr_unregister_conn(struct smc_connection *conn) struct smc_link_group *lgr = conn->lgr; rb_erase(&conn->alert_node, &lgr->conns_all); + if (conn->lnk) + atomic_dec(&conn->lnk->conn_cnt); lgr->conns_num--; conn->alert_token_local = 0; sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */ @@ -201,6 +217,367 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) conn->lgr = NULL; } +int 
smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + char hostname[SMC_MAX_HOSTNAME_LEN + 1]; + char smc_seid[SMC_MAX_EID_LEN + 1]; + struct smcd_dev *smcd_dev; + struct nlattr *attrs; + u8 *seid = NULL; + u8 *host = NULL; + void *nlh; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_SYS_INFO); + if (!nlh) + goto errmsg; + if (cb_ctx->pos[0]) + goto errout; + attrs = nla_nest_start(skb, SMC_GEN_SYS_INFO); + if (!attrs) + goto errout; + if (nla_put_u8(skb, SMC_NLA_SYS_VER, SMC_V2)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_SYS_REL, SMC_RELEASE)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_SYS_IS_ISM_V2, smc_ism_is_v2_capable())) + goto errattr; + smc_clc_get_hostname(&host); + if (host) { + memcpy(hostname, host, SMC_MAX_HOSTNAME_LEN); + hostname[SMC_MAX_HOSTNAME_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_SYS_LOCAL_HOST, hostname)) + goto errattr; + } + mutex_lock(&smcd_dev_list.mutex); + smcd_dev = list_first_entry_or_null(&smcd_dev_list.list, + struct smcd_dev, list); + if (smcd_dev) + smc_ism_get_system_eid(smcd_dev, &seid); + mutex_unlock(&smcd_dev_list.mutex); + if (seid && smc_ism_is_v2_capable()) { + memcpy(smc_seid, seid, SMC_MAX_EID_LEN); + smc_seid[SMC_MAX_EID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_SYS_SEID, smc_seid)) + goto errattr; + } + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + cb_ctx->pos[0] = 1; + return skb->len; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return skb->len; +} + +static int smc_nl_fill_lgr(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_target[SMC_MAX_PNETID_LEN + 1]; + struct nlattr *attrs; + + attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCR); + if (!attrs) + goto errout; + + if (nla_put_u32(skb, SMC_NLA_LGR_R_ID, *((u32 *)&lgr->id))) + goto errattr; + if 
(nla_put_u32(skb, SMC_NLA_LGR_R_CONNS_NUM, lgr->conns_num)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_ROLE, lgr->role)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_TYPE, lgr->type)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id)) + goto errattr; + memcpy(smc_target, lgr->pnet_id, SMC_MAX_PNETID_LEN); + smc_target[SMC_MAX_PNETID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +static int smc_nl_fill_lgr_link(struct smc_link_group *lgr, + struct smc_link *link, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_ibname[IB_DEVICE_NAME_MAX]; + u8 smc_gid_target[41]; + struct nlattr *attrs; + u32 link_uid = 0; + void *nlh; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_LINK_SMCR); + if (!nlh) + goto errmsg; + + attrs = nla_nest_start(skb, SMC_GEN_LINK_SMCR); + if (!attrs) + goto errout; + + if (nla_put_u8(skb, SMC_NLA_LINK_ID, link->link_id)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LINK_STATE, link->state)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LINK_CONN_CNT, + atomic_read(&link->conn_cnt))) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LINK_IB_PORT, link->ibport)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LINK_NET_DEV, link->ndev_ifidx)) + goto errattr; + snprintf(smc_ibname, sizeof(smc_ibname), "%s", link->ibname); + if (nla_put_string(skb, SMC_NLA_LINK_IB_DEV, smc_ibname)) + goto errattr; + memcpy(&link_uid, link->link_uid, sizeof(link_uid)); + if (nla_put_u32(skb, SMC_NLA_LINK_UID, link_uid)) + goto errattr; + memcpy(&link_uid, link->peer_link_uid, sizeof(link_uid)); + if (nla_put_u32(skb, SMC_NLA_LINK_PEER_UID, link_uid)) + goto errattr; + memset(smc_gid_target, 0, sizeof(smc_gid_target)); + smc_gid_be16_convert(smc_gid_target, link->gid); + if 
(nla_put_string(skb, SMC_NLA_LINK_GID, smc_gid_target)) + goto errattr; + memset(smc_gid_target, 0, sizeof(smc_gid_target)); + smc_gid_be16_convert(smc_gid_target, link->peer_gid); + if (nla_put_string(skb, SMC_NLA_LINK_PEER_GID, smc_gid_target)) + goto errattr; + + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return 0; +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static int smc_nl_handle_lgr(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb, + bool list_links) +{ + void *nlh; + int i; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_LGR_SMCR); + if (!nlh) + goto errmsg; + if (smc_nl_fill_lgr(lgr, skb, cb)) + goto errout; + + genlmsg_end(skb, nlh); + if (!list_links) + goto out; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&lgr->lnk[i])) + continue; + if (smc_nl_fill_lgr_link(lgr, &lgr->lnk[i], skb, cb)) + goto errout; + } +out: + return 0; + +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static void smc_nl_fill_lgr_list(struct smc_lgr_list *smc_lgr, + struct sk_buff *skb, + struct netlink_callback *cb, + bool list_links) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_link_group *lgr; + int snum = cb_ctx->pos[0]; + int num = 0; + + spin_lock_bh(&smc_lgr->lock); + list_for_each_entry(lgr, &smc_lgr->list, list) { + if (num < snum) + goto next; + if (smc_nl_handle_lgr(lgr, skb, cb, list_links)) + goto errout; +next: + num++; + } +errout: + spin_unlock_bh(&smc_lgr->lock); + cb_ctx->pos[0] = num; +} + +static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_host[SMC_MAX_HOSTNAME_LEN + 1]; + char smc_pnet[SMC_MAX_PNETID_LEN + 1]; + char smc_eid[SMC_MAX_EID_LEN + 1]; + struct nlattr *v2_attrs; + struct nlattr *attrs; + void *nlh; + + nlh = 
genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_LGR_SMCD); + if (!nlh) + goto errmsg; + + attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCD); + if (!attrs) + goto errout; + + if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, lgr->smcd->local_gid, + SMC_NLA_LGR_D_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid, + SMC_NLA_LGR_D_PAD)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_D_VLAN_ID, lgr->vlan_id)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LGR_D_CONNS_NUM, lgr->conns_num)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LGR_D_CHID, smc_ism_get_chid(lgr->smcd))) + goto errattr; + memcpy(smc_pnet, lgr->smcd->pnetid, SMC_MAX_PNETID_LEN); + smc_pnet[SMC_MAX_PNETID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet)) + goto errattr; + + v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_V2); + if (!v2_attrs) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_V2_VER, lgr->smc_version)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_V2_REL, lgr->peer_smc_release)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os)) + goto errv2attr; + memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN); + smc_host[SMC_MAX_HOSTNAME_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host)) + goto errv2attr; + memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN); + smc_eid[SMC_MAX_EID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid)) + goto errv2attr; + + nla_nest_end(skb, v2_attrs); + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return 0; + +errv2attr: + nla_nest_cancel(skb, v2_attrs); +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static int smc_nl_handle_smcd_lgr(struct smcd_dev *dev, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct 
smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_link_group *lgr; + int snum = cb_ctx->pos[1]; + int rc = 0, num = 0; + + spin_lock_bh(&dev->lgr_lock); + list_for_each_entry(lgr, &dev->lgr_list, list) { + if (!lgr->is_smcd) + continue; + if (num < snum) + goto next; + rc = smc_nl_fill_smcd_lgr(lgr, skb, cb); + if (rc) + goto errout; +next: + num++; + } +errout: + spin_unlock_bh(&dev->lgr_lock); + cb_ctx->pos[1] = num; + return rc; +} + +static int smc_nl_fill_smcd_dev(struct smcd_dev_list *dev_list, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smcd_dev *smcd_dev; + int snum = cb_ctx->pos[0]; + int rc = 0, num = 0; + + mutex_lock(&dev_list->mutex); + list_for_each_entry(smcd_dev, &dev_list->list, list) { + if (list_empty(&smcd_dev->lgr_list)) + continue; + if (num < snum) + goto next; + rc = smc_nl_handle_smcd_lgr(smcd_dev, skb, cb); + if (rc) + goto errout; +next: + num++; + } +errout: + mutex_unlock(&dev_list->mutex); + cb_ctx->pos[0] = num; + return rc; +} + +int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) +{ + bool list_links = false; + + smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links); + return skb->len; +} + +int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb) +{ + bool list_links = true; + + smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links); + return skb->len; +} + +int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) +{ + smc_nl_fill_smcd_dev(&smcd_dev_list, skb, cb); + return skb->len; +} + void smc_lgr_cleanup_early(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; @@ -300,6 +677,15 @@ static u8 smcr_next_link_id(struct smc_link_group *lgr) return link_id; } +static void smcr_copy_dev_info_to_link(struct smc_link *link) +{ + struct smc_ib_device *smcibdev = link->smcibdev; + + snprintf(link->ibname, sizeof(link->ibname), "%s", + smcibdev->ibdev->name); + link->ndev_ifidx = 
smcibdev->ndev_ifidx[link->ibport - 1]; +} + int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini) { @@ -313,7 +699,10 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->link_idx = link_idx; lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; + smc_ibdev_cnt_inc(lnk); + smcr_copy_dev_info_to_link(lnk); lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; + atomic_set(&lnk->conn_cnt, 0); smc_llc_link_set_uid(lnk); INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); if (!ini->ib_dev->initialized) { @@ -355,6 +744,7 @@ free_link_mem: clear_llc_lnk: smc_llc_link_clear(lnk, false); out: + smc_ibdev_cnt_dec(lnk); put_device(&ini->ib_dev->ibdev->dev); memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; @@ -526,6 +916,14 @@ static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend, return rc; } +static void smc_switch_link_and_count(struct smc_connection *conn, + struct smc_link *to_lnk) +{ + atomic_dec(&conn->lnk->conn_cnt); + conn->lnk = to_lnk; + atomic_inc(&conn->lnk->conn_cnt); +} + struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err) { @@ -574,7 +972,7 @@ again: smc->sk.sk_state == SMC_PEERABORTWAIT || smc->sk.sk_state == SMC_PROCESSABORT) { spin_lock_bh(&conn->send_lock); - conn->lnk = to_lnk; + smc_switch_link_and_count(conn, to_lnk); spin_unlock_bh(&conn->send_lock); continue; } @@ -588,7 +986,7 @@ again: } /* avoid race with smcr_tx_sndbuf_nonempty() */ spin_lock_bh(&conn->send_lock); - conn->lnk = to_lnk; + smc_switch_link_and_count(conn, to_lnk); rc = smc_switch_cursor(smc, pend, wr_buf); spin_unlock_bh(&conn->send_lock); sock_put(&smc->sk); @@ -737,6 +1135,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log) smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); + smc_ibdev_cnt_dec(lnk); 
put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); @@ -1309,7 +1708,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) ini->ism_peer_gid[ini->ism_selected]) : smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) && !lgr->sync_err && - lgr->vlan_id == ini->vlan_id && + (ini->smcd_version == SMC_V2 || + lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { /* link group found */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index f1e867ce2e63..e8e448771f85 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -13,7 +13,10 @@ #define _SMC_CORE_H #include <linux/atomic.h> +#include <linux/smc.h> +#include <linux/pci.h> #include <rdma/ib_verbs.h> +#include <net/genetlink.h> #include "smc.h" #include "smc_ib.h" @@ -124,11 +127,14 @@ struct smc_link { u8 link_is_asym; /* is link asymmetric? */ struct smc_link_group *lgr; /* parent link group */ struct work_struct link_down_wrk; /* wrk to bring link down */ + char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */ + int ndev_ifidx; /* network device ifindex */ enum smc_link_state state; /* state of link */ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ + atomic_t conn_cnt; /* connections on this link */ }; /* For now we just allow one parallel link per link group. 
The SMC protocol @@ -301,6 +307,7 @@ struct smc_init_info { u8 first_contact_peer; u8 first_contact_local; unsigned short vlan_id; + u32 rc; /* SMC-R */ struct smc_clc_msg_local *ib_lcl; struct smc_ib_device *ib_dev; @@ -362,6 +369,45 @@ static inline bool smc_link_active(struct smc_link *lnk) return lnk->state == SMC_LNK_ACTIVE; } +static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) +{ + sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", + be16_to_cpu(((__be16 *)gid_raw)[0]), + be16_to_cpu(((__be16 *)gid_raw)[1]), + be16_to_cpu(((__be16 *)gid_raw)[2]), + be16_to_cpu(((__be16 *)gid_raw)[3]), + be16_to_cpu(((__be16 *)gid_raw)[4]), + be16_to_cpu(((__be16 *)gid_raw)[5]), + be16_to_cpu(((__be16 *)gid_raw)[6]), + be16_to_cpu(((__be16 *)gid_raw)[7])); +} + +struct smc_pci_dev { + __u32 pci_fid; + __u16 pci_pchid; + __u16 pci_vendor; + __u16 pci_device; + __u8 pci_id[SMC_PCI_ID_STR_LEN]; +}; + +static inline void smc_set_pci_values(struct pci_dev *pci_dev, + struct smc_pci_dev *smc_dev) +{ + smc_dev->pci_vendor = pci_dev->vendor; + smc_dev->pci_device = pci_dev->device; + snprintf(smc_dev->pci_id, sizeof(smc_dev->pci_id), "%s", + pci_name(pci_dev)); +#if IS_ENABLED(CONFIG_S390) + { /* Set s390 specific PCI information */ + struct zpci_dev *zdev; + + zdev = to_zpci(pci_dev); + smc_dev->pci_fid = zdev->fid; + smc_dev->pci_pchid = zdev->pchid; + } +#endif +} + struct smc_sock; struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; @@ -409,6 +455,10 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err); void smcr_link_down_cond(struct smc_link *lnk); void smcr_link_down_cond_sched(struct smc_link *lnk); +int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb); +int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); +int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb); +int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); static 
inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index f15fca59b4b2..c952986a6aca 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -31,19 +31,6 @@ static struct smc_diag_dump_ctx *smc_dump_context(struct netlink_callback *cb) return (struct smc_diag_dump_ctx *)cb->ctx; } -static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) -{ - sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", - be16_to_cpu(((__be16 *)gid_raw)[0]), - be16_to_cpu(((__be16 *)gid_raw)[1]), - be16_to_cpu(((__be16 *)gid_raw)[2]), - be16_to_cpu(((__be16 *)gid_raw)[3]), - be16_to_cpu(((__be16 *)gid_raw)[4]), - be16_to_cpu(((__be16 *)gid_raw)[5]), - be16_to_cpu(((__be16 *)gid_raw)[6]), - be16_to_cpu(((__be16 *)gid_raw)[7])); -} - static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) { struct smc_sock *smc = smc_sk(sk); @@ -160,17 +147,17 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, !list_empty(&smc->conn.lgr->list)) { struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, - .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport, - .lnk[0].link_id = smc->conn.lgr->lnk[0].link_id, + .lnk[0].ibport = smc->conn.lnk->ibport, + .lnk[0].link_id = smc->conn.lnk->link_id, }; memcpy(linfo.lnk[0].ibname, smc->conn.lgr->lnk[0].smcibdev->ibdev->name, - sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name)); + sizeof(smc->conn.lnk->smcibdev->ibdev->name)); smc_gid_be16_convert(linfo.lnk[0].gid, - smc->conn.lgr->lnk[0].gid); + smc->conn.lnk->gid); smc_gid_be16_convert(linfo.lnk[0].peer_gid, - smc->conn.lgr->lnk[0].peer_gid); + smc->conn.lnk->peer_gid); if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) goto errout; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 1c314dbdc7fa..7d7ba0320d5a 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -25,6 +25,7 @@ #include "smc_core.h" #include "smc_wr.h" #include "smc.h" +#include "smc_netlink.h" #define SMC_MAX_CQE 32766 
/* max. # of completion queue elements */ @@ -198,9 +199,9 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, rcu_read_lock(); ndev = rdma_read_gid_attr_ndev_rcu(attr); if (!IS_ERR(ndev) && - ((!vlan_id && !is_vlan_dev(attr->ndev)) || - (vlan_id && is_vlan_dev(attr->ndev) && - vlan_dev_vlan_id(attr->ndev) == vlan_id)) && + ((!vlan_id && !is_vlan_dev(ndev)) || + (vlan_id && is_vlan_dev(ndev) && + vlan_dev_vlan_id(ndev) == vlan_id)) && attr->gid_type == IB_GID_TYPE_ROCE) { rcu_read_unlock(); if (gid) @@ -326,6 +327,171 @@ int smc_ib_create_protection_domain(struct smc_link *lnk) return rc; } +static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr, + struct smc_ib_device *smcibdev) +{ + struct smc_link_group *lgr; + bool rc = false; + int i; + + spin_lock_bh(&smc_lgr->lock); + list_for_each_entry(lgr, &smc_lgr->list, list) { + if (lgr->is_smcd) + continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_UNUSED || + lgr->lnk[i].smcibdev != smcibdev) + continue; + if (lgr->type == SMC_LGR_SINGLE || + lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) { + rc = true; + goto out; + } + } + } +out: + spin_unlock_bh(&smc_lgr->lock); + return rc; +} + +static int smc_nl_handle_dev_port(struct sk_buff *skb, + struct ib_device *ibdev, + struct smc_ib_device *smcibdev, + int port) +{ + char smc_pnet[SMC_MAX_PNETID_LEN + 1]; + struct nlattr *port_attrs; + unsigned char port_state; + int lnk_count = 0; + + port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port); + if (!port_attrs) + goto errout; + + if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, + smcibdev->pnetid_by_user[port])) + goto errattr; + memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN); + smc_pnet[SMC_MAX_PNETID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV, + smcibdev->ndev_ifidx[port])) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1)) + goto 
errattr; + port_state = smc_ib_port_active(smcibdev, port + 1); + if (nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state)) + goto errattr; + lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]); + if (nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count)) + goto errattr; + nla_nest_end(skb, port_attrs); + return 0; +errattr: + nla_nest_cancel(skb, port_attrs); +errout: + return -EMSGSIZE; +} + +static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev, + struct sk_buff *skb) +{ + if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid)) + return false; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid)) + return false; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev->pci_vendor)) + return false; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev->pci_device)) + return false; + if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id)) + return false; + return true; +} + +static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_ibname[IB_DEVICE_NAME_MAX]; + struct smc_pci_dev smc_pci_dev; + struct pci_dev *pci_dev; + unsigned char is_crit; + struct nlattr *attrs; + void *nlh; + int i; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_DEV_SMCR); + if (!nlh) + goto errmsg; + attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR); + if (!attrs) + goto errout; + is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev); + if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit)) + goto errattr; + if (smcibdev->ibdev->dev.parent) { + memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); + pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent); + smc_set_pci_values(pci_dev, &smc_pci_dev); + if (!smc_nl_handle_pci_values(&smc_pci_dev, skb)) + goto errattr; + } + snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name); + if (nla_put_string(skb, 
SMC_NLA_DEV_IB_NAME, smc_ibname)) + goto errattr; + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!rdma_is_port_valid(smcibdev->ibdev, i)) + continue; + if (smc_nl_handle_dev_port(skb, smcibdev->ibdev, + smcibdev, i - 1)) + goto errattr; + } + + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_ib_device *smcibdev; + int snum = cb_ctx->pos[0]; + int num = 0; + + mutex_lock(&dev_list->mutex); + list_for_each_entry(smcibdev, &dev_list->list, list) { + if (num < snum) + goto next; + if (smc_nl_handle_smcr_dev(smcibdev, skb, cb)) + goto errout; +next: + num++; + } +errout: + mutex_unlock(&dev_list->mutex); + cb_ctx->pos[0] = num; +} + +int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) +{ + smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb); + return skb->len; +} + static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) { struct smc_link *lnk = (struct smc_link *)priv; @@ -557,6 +723,49 @@ out: static struct ib_client smc_ib_client; +static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port) +{ + struct ib_device *ibdev = smcibdev->ibdev; + struct net_device *ndev; + + if (!ibdev->ops.get_netdev) + return; + ndev = ibdev->ops.get_netdev(ibdev, port + 1); + if (ndev) { + smcibdev->ndev_ifidx[port] = ndev->ifindex; + dev_put(ndev); + } +} + +void smc_ib_ndev_change(struct net_device *ndev, unsigned long event) +{ + struct smc_ib_device *smcibdev; + struct ib_device *libdev; + struct net_device *lndev; + u8 port_cnt; + int i; + + mutex_lock(&smc_ib_devices.mutex); + list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { + port_cnt = smcibdev->ibdev->phys_port_cnt; + for (i = 0; i < min_t(size_t, 
port_cnt, SMC_MAX_PORTS); i++) { + libdev = smcibdev->ibdev; + if (!libdev->ops.get_netdev) + continue; + lndev = libdev->ops.get_netdev(libdev, i + 1); + if (lndev) + dev_put(lndev); + if (lndev != ndev) + continue; + if (event == NETDEV_REGISTER) + smcibdev->ndev_ifidx[i] = ndev->ifindex; + if (event == NETDEV_UNREGISTER) + smcibdev->ndev_ifidx[i] = 0; + } + } + mutex_unlock(&smc_ib_devices.mutex); +} + /* callback function for ib_register_client() */ static int smc_ib_add_dev(struct ib_device *ibdev) { @@ -596,6 +805,7 @@ static int smc_ib_add_dev(struct ib_device *ibdev) if (smc_pnetid_by_dev_port(ibdev->dev.parent, i, smcibdev->pnetid[i])) smc_pnetid_by_table_ib(smcibdev, i + 1); + smc_copy_netdev_ifindex(smcibdev, i); pr_warn_ratelimited("smc: ib device %s port %d has pnetid " "%.16s%s\n", smcibdev->ibdev->name, i + 1, diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 2ce481187dd0..3085f5180da7 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -30,6 +30,7 @@ struct smc_ib_devices { /* list of smc ib devices definition */ }; extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */ +extern struct smc_lgr_list smc_lgr_list; /* list of linkgroups */ struct smc_ib_device { /* ib-device infos for smc */ struct list_head list; @@ -53,11 +54,15 @@ struct smc_ib_device { /* ib-device infos for smc */ atomic_t lnk_cnt; /* number of links on ibdev */ wait_queue_head_t lnks_deleted; /* wait 4 removal of all links*/ struct mutex mutex; /* protect dev setup+cleanup */ + atomic_t lnk_cnt_by_port[SMC_MAX_PORTS]; + /* number of links per port */ + int ndev_ifidx[SMC_MAX_PORTS]; /* ndev if indexes */ }; struct smc_buf_desc; struct smc_link; +void smc_ib_ndev_change(struct net_device *ndev, unsigned long event); int smc_ib_register_client(void) __init; void smc_ib_unregister_client(void); bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); @@ -87,4 +92,5 @@ void smc_ib_sync_sg_for_device(struct smc_link *lnk, int 
smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, unsigned short vlan_id, u8 gid[], u8 *sgid_index); bool smc_ib_is_valid_local_systemid(void); +int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); #endif diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 6abbdd09a580..9c6e95882553 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -15,13 +15,14 @@ #include "smc_core.h" #include "smc_ism.h" #include "smc_pnet.h" +#include "smc_netlink.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), .mutex = __MUTEX_INITIALIZER(smcd_dev_list.mutex) }; -bool smc_ism_v2_capable; +static bool smc_ism_v2_capable; /* Test if an ISM communication is possible - same CPC */ int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) @@ -51,6 +52,12 @@ u16 smc_ism_get_chid(struct smcd_dev *smcd) return smcd->ops->get_chid(smcd); } +/* HW supports ISM V2 and thus System EID is defined */ +bool smc_ism_is_v2_capable(void) +{ + return smc_ism_v2_capable; +} + /* Set a connection using this DMBE. 
*/ void smc_ism_set_conn(struct smc_connection *conn) { @@ -201,6 +208,97 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, return rc; } +static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_pnet[SMC_MAX_PNETID_LEN + 1]; + struct smc_pci_dev smc_pci_dev; + struct nlattr *port_attrs; + struct nlattr *attrs; + int use_cnt = 0; + void *nlh; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_DEV_SMCD); + if (!nlh) + goto errmsg; + attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCD); + if (!attrs) + goto errout; + use_cnt = atomic_read(&smcd->lgr_cnt); + if (nla_put_u32(skb, SMC_NLA_DEV_USE_CNT, use_cnt)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) + goto errattr; + memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); + smc_set_pci_values(to_pci_dev(smcd->dev.parent), &smc_pci_dev); + if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev.pci_vendor)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev.pci_device)) + goto errattr; + if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev.pci_id)) + goto errattr; + + port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT); + if (!port_attrs) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, smcd->pnetid_by_user)) + goto errportattr; + memcpy(smc_pnet, smcd->pnetid, SMC_MAX_PNETID_LEN); + smc_pnet[SMC_MAX_PNETID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) + goto errportattr; + + nla_nest_end(skb, port_attrs); + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return 0; + +errportattr: + nla_nest_cancel(skb, port_attrs); +errattr: + nla_nest_cancel(skb, attrs); +errout: + nlmsg_cancel(skb, nlh); +errmsg: + return 
-EMSGSIZE; +} + +static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + int snum = cb_ctx->pos[0]; + struct smcd_dev *smcd; + int num = 0; + + mutex_lock(&dev_list->mutex); + list_for_each_entry(smcd, &dev_list->list, list) { + if (num < snum) + goto next; + if (smc_nl_handle_smcd_dev(smcd, skb, cb)) + goto errout; +next: + num++; + } +errout: + mutex_unlock(&dev_list->mutex); + cb_ctx->pos[0] = num; +} + +int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) +{ + smc_nl_prep_smcd_dev(&smcd_dev_list, skb, cb); + return skb->len; +} + struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 8048e09ddcf8..113efc7352ed 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -10,6 +10,7 @@ #define SMCD_ISM_H #include <linux/uio.h> +#include <linux/types.h> #include <linux/mutex.h> #include "smc.h" @@ -20,9 +21,6 @@ struct smcd_dev_list { /* List of SMCD devices */ }; extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */ -extern bool smc_ism_v2_capable; /* HW supports ISM V2 and thus - * System EID is defined - */ struct smc_ism_vlanid { /* VLAN id set on ISM device */ struct list_head list; @@ -52,5 +50,7 @@ int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos, int smc_ism_signal_shutdown(struct smc_link_group *lgr); void smc_ism_get_system_eid(struct smcd_dev *dev, u8 **eid); u16 smc_ism_get_chid(struct smcd_dev *dev); +bool smc_ism_is_v2_capable(void); void smc_ism_init(void); +int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); #endif diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c new file mode 100644 index 000000000000..140419a19dbf --- /dev/null +++ b/net/smc/smc_netlink.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA 
(SMC-R) and RoCE + * + * Generic netlink support functions to interact with SMC module + * + * Copyright IBM Corp. 2020 + * + * Author(s): Guvenc Gulce <guvenc@linux.ibm.com> + */ + +#include <linux/module.h> +#include <linux/list.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <linux/if.h> +#include <linux/smc.h> + +#include "smc_core.h" +#include "smc_ism.h" +#include "smc_ib.h" +#include "smc_netlink.h" + +#define SMC_CMD_MAX_ATTR 1 + +/* SMC_GENL generic netlink operation definition */ +static const struct genl_ops smc_gen_nl_ops[] = { + { + .cmd = SMC_NETLINK_GET_SYS_INFO, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_sys_info, + }, + { + .cmd = SMC_NETLINK_GET_LGR_SMCR, + /* can be retrieved by unprivileged users */ + .dumpit = smcr_nl_get_lgr, + }, + { + .cmd = SMC_NETLINK_GET_LINK_SMCR, + /* can be retrieved by unprivileged users */ + .dumpit = smcr_nl_get_link, + }, + { + .cmd = SMC_NETLINK_GET_LGR_SMCD, + /* can be retrieved by unprivileged users */ + .dumpit = smcd_nl_get_lgr, + }, + { + .cmd = SMC_NETLINK_GET_DEV_SMCD, + /* can be retrieved by unprivileged users */ + .dumpit = smcd_nl_get_device, + }, + { + .cmd = SMC_NETLINK_GET_DEV_SMCR, + /* can be retrieved by unprivileged users */ + .dumpit = smcr_nl_get_device, + }, +}; + +static const struct nla_policy smc_gen_nl_policy[2] = { + [SMC_CMD_MAX_ATTR] = { .type = NLA_REJECT, }, +}; + +/* SMC_GENL family definition */ +struct genl_family smc_gen_nl_family __ro_after_init = { + .hdrsize = 0, + .name = SMC_GENL_FAMILY_NAME, + .version = SMC_GENL_FAMILY_VERSION, + .maxattr = SMC_CMD_MAX_ATTR, + .policy = smc_gen_nl_policy, + .netnsok = true, + .module = THIS_MODULE, + .ops = smc_gen_nl_ops, + .n_ops = ARRAY_SIZE(smc_gen_nl_ops) +}; + +int __init smc_nl_init(void) +{ + return genl_register_family(&smc_gen_nl_family); +} + +void smc_nl_exit(void) +{ + genl_unregister_family(&smc_gen_nl_family); +} diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h new file 
mode 100644 index 000000000000..3477265cba6c --- /dev/null +++ b/net/smc/smc_netlink.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * SMC Generic netlink operations + * + * Copyright IBM Corp. 2020 + * + * Author(s): Guvenc Gulce <guvenc@linux.ibm.com> + */ + +#ifndef _SMC_NETLINK_H +#define _SMC_NETLINK_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +extern struct genl_family smc_gen_nl_family; + +struct smc_nl_dmp_ctx { + int pos[2]; +}; + +static inline struct smc_nl_dmp_ctx *smc_nl_dmp_ctx(struct netlink_callback *c) +{ + return (struct smc_nl_dmp_ctx *)c->ctx; +} + +int smc_nl_init(void) __init; +void smc_nl_exit(void); + +#endif diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index f3c18b991d35..6f6d33edb135 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -827,9 +827,11 @@ static int smc_pnet_netdev_event(struct notifier_block *this, case NETDEV_REBOOT: case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); + smc_ib_ndev_change(event_dev, event); return NOTIFY_OK; case NETDEV_REGISTER: smc_pnet_add_by_ndev(event_dev); + smc_ib_ndev_change(event_dev, event); return NOTIFY_OK; case NETDEV_UP: smc_pnet_add_base_pnetid(net, event_dev, ndev_pnetid); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 1e23cdd41eb1..cbc73a7e4d59 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -131,9 +131,9 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) wake_up(&link->wr_tx_wait); } -static void smc_wr_tx_tasklet_fn(unsigned long data) +static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_device *dev = (struct smc_ib_device *)data; + struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; int i = 0, rc; int polled = 0; @@ -435,9 +435,9 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) } } -static void smc_wr_rx_tasklet_fn(unsigned long data) +static 
void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_device *dev = (struct smc_ib_device *)data; + struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; int polled = 0; int rc; @@ -698,10 +698,8 @@ void smc_wr_remove_dev(struct smc_ib_device *smcibdev) void smc_wr_add_dev(struct smc_ib_device *smcibdev) { - tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn, - (unsigned long)smcibdev); - tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn, - (unsigned long)smcibdev); + tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn); + tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn); } int smc_wr_create_link(struct smc_link *lnk) diff --git a/net/socket.c b/net/socket.c index 6e6cccc2104f..7f0617ab5437 100644 --- a/net/socket.c +++ b/net/socket.c @@ -52,6 +52,7 @@ * Based upon Swansea University Computer Society NET3.039 */ +#include <linux/ethtool.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/file.h> @@ -64,7 +65,6 @@ #include <linux/seq_file.h> #include <linux/mutex.h> #include <linux/if_bridge.h> -#include <linux/if_frad.h> #include <linux/if_vlan.h> #include <linux/ptp_classify.h> #include <linux/init.h> @@ -445,17 +445,15 @@ static int sock_map_fd(struct socket *sock, int flags) /** * sock_from_file - Return the &socket bounded to @file. * @file: file - * @err: pointer to an error code return * - * On failure returns %NULL and assigns -ENOTSOCK to @err. + * On failure returns %NULL. 
*/ -struct socket *sock_from_file(struct file *file, int *err) +struct socket *sock_from_file(struct file *file) { if (file->f_op == &socket_file_ops) return file->private_data; /* set in sock_map_fd */ - *err = -ENOTSOCK; return NULL; } EXPORT_SYMBOL(sock_from_file); @@ -484,9 +482,11 @@ struct socket *sockfd_lookup(int fd, int *err) return NULL; } - sock = sock_from_file(file, err); - if (!sock) + sock = sock_from_file(file); + if (!sock) { + *err = -ENOTSOCK; fput(file); + } return sock; } EXPORT_SYMBOL(sockfd_lookup); @@ -498,11 +498,12 @@ static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) *err = -EBADF; if (f.file) { - sock = sock_from_file(f.file, err); + sock = sock_from_file(f.file); if (likely(sock)) { *fput_needed = f.flags & FDPUT_FPUT; return sock; } + *err = -ENOTSOCK; fdput(f); } return NULL; @@ -1027,17 +1028,6 @@ void vlan_ioctl_set(int (*hook) (struct net *, void __user *)) } EXPORT_SYMBOL(vlan_ioctl_set); -static DEFINE_MUTEX(dlci_ioctl_mutex); -static int (*dlci_ioctl_hook) (unsigned int, void __user *); - -void dlci_ioctl_set(int (*hook) (unsigned int, void __user *)) -{ - mutex_lock(&dlci_ioctl_mutex); - dlci_ioctl_hook = hook; - mutex_unlock(&dlci_ioctl_mutex); -} -EXPORT_SYMBOL(dlci_ioctl_set); - static long sock_do_ioctl(struct net *net, struct socket *sock, unsigned int cmd, unsigned long arg) { @@ -1156,17 +1146,6 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) err = vlan_ioctl_hook(net, argp); mutex_unlock(&vlan_ioctl_mutex); break; - case SIOCADDDLCI: - case SIOCDELDLCI: - err = -ENOPKG; - if (!dlci_ioctl_hook) - request_module("dlci"); - - mutex_lock(&dlci_ioctl_mutex); - if (dlci_ioctl_hook) - err = dlci_ioctl_hook(cmd, argp); - mutex_unlock(&dlci_ioctl_mutex); - break; case SIOCGSKNS: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -1715,9 +1694,11 @@ int __sys_accept4_file(struct file *file, unsigned file_flags, if (SOCK_NONBLOCK != O_NONBLOCK && (flags & 
SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - sock = sock_from_file(file, &err); - if (!sock) + sock = sock_from_file(file); + if (!sock) { + err = -ENOTSOCK; goto out; + } err = -ENFILE; newsock = sock_alloc(); @@ -1840,9 +1821,11 @@ int __sys_connect_file(struct file *file, struct sockaddr_storage *address, struct socket *sock; int err; - sock = sock_from_file(file, &err); - if (!sock) + sock = sock_from_file(file); + if (!sock) { + err = -ENOTSOCK; goto out; + } err = security_socket_connect(sock, (struct sockaddr *)address, addrlen); @@ -2143,6 +2126,9 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, return __sys_setsockopt(fd, level, optname, optval, optlen); } +INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level, + int optname)); + /* * Get a socket option. Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. @@ -2192,6 +2178,17 @@ SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, * Shutdown a socket. 
*/ +int __sys_shutdown_sock(struct socket *sock, int how) +{ + int err; + + err = security_socket_shutdown(sock, how); + if (!err) + err = sock->ops->shutdown(sock, how); + + return err; +} + int __sys_shutdown(int fd, int how) { int err, fput_needed; @@ -2199,9 +2196,7 @@ int __sys_shutdown(int fd, int how) sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { - err = security_socket_shutdown(sock, how); - if (!err) - err = sock->ops->shutdown(sock, how); + err = __sys_shutdown_sock(sock, how); fput_light(sock->file, fput_needed); } return err; @@ -3427,8 +3422,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCBRDELBR: case SIOCGIFVLAN: case SIOCSIFVLAN: - case SIOCADDDLCI: - case SIOCDELDLCI: case SIOCGSKNS: case SIOCGSTAMP_NEW: case SIOCGSTAMPNS_NEW: diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c index 010dcb876f9d..6e4dbd577a39 100644 --- a/net/sunrpc/addr.c +++ b/net/sunrpc/addr.c @@ -185,7 +185,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf, scope_id = dev->ifindex; dev_put(dev); } else { - if (kstrtou32(p, 10, &scope_id) == 0) { + if (kstrtou32(p, 10, &scope_id) != 0) { kfree(p); return 0; } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 4ecc2a959567..5f42aa5fc612 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -29,6 +29,7 @@ #include <linux/uaccess.h> #include <linux/hashtable.h> +#include "auth_gss_internal.h" #include "../netns.h" #include <trace/events/rpcgss.h> @@ -125,35 +126,6 @@ gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx) clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags); } -static const void * -simple_get_bytes(const void *p, const void *end, void *res, size_t len) -{ - const void *q = (const void *)((const char *)p + len); - if (unlikely(q > end || q < p)) - return ERR_PTR(-EFAULT); - memcpy(res, p, len); - return q; -} - -static inline const void * -simple_get_netobj(const void *p, 
const void *end, struct xdr_netobj *dest) -{ - const void *q; - unsigned int len; - - p = simple_get_bytes(p, end, &len, sizeof(len)); - if (IS_ERR(p)) - return p; - q = (const void *)((const char *)p + len); - if (unlikely(q > end || q < p)) - return ERR_PTR(-EFAULT); - dest->data = kmemdup(p, len, GFP_NOFS); - if (unlikely(dest->data == NULL)) - return ERR_PTR(-ENOMEM); - dest->len = len; - return q; -} - static struct gss_cl_ctx * gss_cred_get_ctx(struct rpc_cred *cred) { diff --git a/net/sunrpc/auth_gss/auth_gss_internal.h b/net/sunrpc/auth_gss/auth_gss_internal.h new file mode 100644 index 000000000000..f6d9631bd9d0 --- /dev/null +++ b/net/sunrpc/auth_gss/auth_gss_internal.h @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* + * linux/net/sunrpc/auth_gss/auth_gss_internal.h + * + * Internal definitions for RPCSEC_GSS client authentication + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + */ +#include <linux/err.h> +#include <linux/string.h> +#include <linux/sunrpc/xdr.h> + +static inline const void * +simple_get_bytes(const void *p, const void *end, void *res, size_t len) +{ + const void *q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + memcpy(res, p, len); + return q; +} + +static inline const void * +simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) +{ + const void *q; + unsigned int len; + + p = simple_get_bytes(p, end, &len, sizeof(len)); + if (IS_ERR(p)) + return p; + q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + if (len) { + dest->data = kmemdup(p, len, GFP_NOFS); + if (unlikely(dest->data == NULL)) + return ERR_PTR(-ENOMEM); + } else + dest->data = NULL; + dest->len = len; + return q; +} diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index ae9acf3a7389..1c092b05c2bb 100644 --- 
a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -21,6 +21,8 @@ #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/gss_krb5_enctypes.h> +#include "auth_gss_internal.h" + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif @@ -143,35 +145,6 @@ get_gss_krb5_enctype(int etype) return NULL; } -static const void * -simple_get_bytes(const void *p, const void *end, void *res, int len) -{ - const void *q = (const void *)((const char *)p + len); - if (unlikely(q > end || q < p)) - return ERR_PTR(-EFAULT); - memcpy(res, p, len); - return q; -} - -static const void * -simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res) -{ - const void *q; - unsigned int len; - - p = simple_get_bytes(p, end, &len, sizeof(len)); - if (IS_ERR(p)) - return p; - q = (const void *)((const char *)p + len); - if (unlikely(q > end || q < p)) - return ERR_PTR(-EFAULT); - res->data = kmemdup(p, len, GFP_NOFS); - if (unlikely(res->data == NULL)) - return ERR_PTR(-ENOMEM); - res->len = len; - return q; -} - static inline const void * get_key(const void *p, const void *end, struct krb5_ctx *ctx, struct crypto_sync_skcipher **res) diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index af9c7f43859c..d1c003a25b0f 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -200,7 +200,7 @@ static int gssp_call(struct net *net, struct rpc_message *msg) static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg) { - int i; + unsigned int i; for (i = 0; i < arg->npages && arg->pages[i]; i++) __free_page(arg->pages[i]); @@ -210,14 +210,19 @@ static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg) static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg) { + unsigned int i; + arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE); arg->pages = kcalloc(arg->npages, sizeof(struct page *), GFP_KERNEL); 
- /* - * XXX: actual pages are allocated by xdr layer in - * xdr_partial_copy_from_skb. - */ if (!arg->pages) return -ENOMEM; + for (i = 0; i < arg->npages; i++) { + arg->pages[i] = alloc_page(GFP_KERNEL); + if (!arg->pages[i]) { + gssp_free_receive_pages(arg); + return -ENOMEM; + } + } return 0; } diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index 2ff7b7083eba..d79f12c2550a 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -771,7 +771,6 @@ void gssx_enc_accept_sec_context(struct rpc_rqst *req, xdr_inline_pages(&req->rq_rcv_buf, PAGE_SIZE/2 /* pretty arbitrary */, arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE); - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; done: if (err) dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err); @@ -789,7 +788,7 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, scratch = alloc_page(GFP_KERNEL); if (!scratch) return -ENOMEM; - xdr_set_scratch_buffer(xdr, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(xdr, scratch); /* res->status */ err = gssx_dec_status(xdr, &res->status); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 20c93b68505e..1a2c1c44bb00 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -778,7 +778,6 @@ void cache_clean_deferred(void *owner) */ static DEFINE_SPINLOCK(queue_lock); -static DEFINE_MUTEX(queue_io_mutex); struct cache_queue { struct list_head list; @@ -906,44 +905,26 @@ static ssize_t cache_do_downcall(char *kaddr, const char __user *buf, return ret; } -static ssize_t cache_slow_downcall(const char __user *buf, - size_t count, struct cache_detail *cd) -{ - static char write_buf[32768]; /* protected by queue_io_mutex */ - ssize_t ret = -EINVAL; - - if (count >= sizeof(write_buf)) - goto out; - mutex_lock(&queue_io_mutex); - ret = cache_do_downcall(write_buf, buf, count, cd); - mutex_unlock(&queue_io_mutex); -out: - return ret; -} - static ssize_t cache_downcall(struct address_space 
*mapping, const char __user *buf, size_t count, struct cache_detail *cd) { - struct page *page; - char *kaddr; + char *write_buf; ssize_t ret = -ENOMEM; - if (count >= PAGE_SIZE) - goto out_slow; + if (count >= 32768) { /* 32k is max userland buffer, lets check anyway */ + ret = -EINVAL; + goto out; + } - page = find_or_create_page(mapping, 0, GFP_KERNEL); - if (!page) - goto out_slow; + write_buf = kvmalloc(count + 1, GFP_KERNEL); + if (!write_buf) + goto out; - kaddr = kmap(page); - ret = cache_do_downcall(kaddr, buf, count, cd); - kunmap(page); - unlock_page(page); - put_page(page); + ret = cache_do_downcall(write_buf, buf, count, cd); + kvfree(write_buf); +out: return ret; -out_slow: - return cache_slow_downcall(buf, count, cd); } static ssize_t cache_write(struct file *filp, const char __user *buf, diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 3259120462ed..612f0a641f4c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1251,10 +1251,7 @@ void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages, unsigned int base, unsigned int len, unsigned int hdrsize) { - /* Subtract one to force an extra word of buffer space for the - * payload's XDR pad to fall into the rcv_buf's tail iovec. 
- */ - hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign - 1; + hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign; xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len); trace_rpc_xdr_reply_pages(req->rq_task, &req->rq_rcv_buf); diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index fd9bca242724..56029e3af6ff 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -128,13 +128,13 @@ static int do_xprt_debugfs(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *n return 0; len = snprintf(name, sizeof(name), "../../rpc_xprt/%s", xprt->debugfs->d_name.name); - if (len > sizeof(name)) + if (len >= sizeof(name)) return -1; if (*nump == 0) strcpy(link, "xprt"); else { len = snprintf(link, sizeof(link), "xprt%d", *nump); - if (len > sizeof(link)) + if (len >= sizeof(link)) return -1; } debugfs_create_symlink(link, clnt->cl_debugfs, name); diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index eadc0ede928c..8241f5a4a01c 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -781,7 +781,8 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, } /** - * rpc_mkpipe - make an rpc_pipefs file for kernel<->userspace communication + * rpc_mkpipe_dentry - make an rpc_pipefs file for kernel<->userspace + * communication * @parent: dentry of directory to create new "pipe" in * @name: name of pipe * @private: private data to associate with the pipe, for the caller's use diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index f06d7c315017..cf702a5f7fe5 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -676,6 +676,23 @@ struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *queue) EXPORT_SYMBOL_GPL(rpc_wake_up_next); /** + * rpc_wake_up_locked - wake up all rpc_tasks + * @queue: rpc_wait_queue on which the tasks are sleeping + * + */ +static void rpc_wake_up_locked(struct rpc_wait_queue *queue) +{ + struct rpc_task *task; + + for (;;) { + task = __rpc_find_next_queued(queue); + if (task == NULL) + 
break; + rpc_wake_up_task_queue_locked(queue, task); + } +} + +/** * rpc_wake_up - wake up all rpc_tasks * @queue: rpc_wait_queue on which the tasks are sleeping * @@ -683,25 +700,28 @@ EXPORT_SYMBOL_GPL(rpc_wake_up_next); */ void rpc_wake_up(struct rpc_wait_queue *queue) { - struct list_head *head; - spin_lock(&queue->lock); - head = &queue->tasks[queue->maxpriority]; + rpc_wake_up_locked(queue); + spin_unlock(&queue->lock); +} +EXPORT_SYMBOL_GPL(rpc_wake_up); + +/** + * rpc_wake_up_status_locked - wake up all rpc_tasks and set their status value. + * @queue: rpc_wait_queue on which the tasks are sleeping + * @status: status value to set + */ +static void rpc_wake_up_status_locked(struct rpc_wait_queue *queue, int status) +{ + struct rpc_task *task; + for (;;) { - while (!list_empty(head)) { - struct rpc_task *task; - task = list_first_entry(head, - struct rpc_task, - u.tk_wait.list); - rpc_wake_up_task_queue_locked(queue, task); - } - if (head == &queue->tasks[0]) + task = __rpc_find_next_queued(queue); + if (task == NULL) break; - head--; + rpc_wake_up_task_queue_set_status_locked(queue, task, status); } - spin_unlock(&queue->lock); } -EXPORT_SYMBOL_GPL(rpc_wake_up); /** * rpc_wake_up_status - wake up all rpc_tasks and set their status value. 
@@ -712,23 +732,8 @@ EXPORT_SYMBOL_GPL(rpc_wake_up); */ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) { - struct list_head *head; - spin_lock(&queue->lock); - head = &queue->tasks[queue->maxpriority]; - for (;;) { - while (!list_empty(head)) { - struct rpc_task *task; - task = list_first_entry(head, - struct rpc_task, - u.tk_wait.list); - task->tk_status = status; - rpc_wake_up_task_queue_locked(queue, task); - } - if (head == &queue->tasks[0]) - break; - head--; - } + rpc_wake_up_status_locked(queue, status); spin_unlock(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up_status); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index c211b607239e..61fb8a18552c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -559,7 +559,7 @@ EXPORT_SYMBOL_GPL(svc_destroy); /* * Allocate an RPC server's buffer space. - * We allocate pages and place them in rq_argpages. + * We allocate pages and place them in rq_pages. */ static int svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) @@ -614,6 +614,10 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node) rqstp->rq_server = serv; rqstp->rq_pool = pool; + rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0); + if (!rqstp->rq_scratch_page) + goto out_enomem; + rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_argp) goto out_enomem; @@ -842,6 +846,7 @@ void svc_rqst_free(struct svc_rqst *rqstp) { svc_release_buffer(rqstp); + put_page(rqstp->rq_scratch_page); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); @@ -1622,7 +1627,7 @@ u32 svc_max_payload(const struct svc_rqst *rqstp) EXPORT_SYMBOL_GPL(svc_max_payload); /** - * svc_encode_read_payload - mark a range of bytes as a READ payload + * svc_encode_result_payload - mark a range of bytes as a result payload * @rqstp: svc_rqst to operate on * @offset: payload's byte offset in rqstp->rq_res * @length: size of payload, in bytes @@ -1630,12 +1635,13 @@ 
EXPORT_SYMBOL_GPL(svc_max_payload); * Returns zero on success, or a negative errno if a permanent * error occurred. */ -int svc_encode_read_payload(struct svc_rqst *rqstp, unsigned int offset, - unsigned int length) +int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) { - return rqstp->rq_xprt->xpt_ops->xpo_read_payload(rqstp, offset, length); + return rqstp->rq_xprt->xpt_ops->xpo_result_payload(rqstp, offset, + length); } -EXPORT_SYMBOL_GPL(svc_encode_read_payload); +EXPORT_SYMBOL_GPL(svc_encode_result_payload); /** * svc_fill_write_vector - Construct data argument for VFS write call diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 43cf8dbde898..dcc50ae54550 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -813,8 +813,6 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) len = svc_deferred_recv(rqstp); else len = xprt->xpt_ops->xpo_recvfrom(rqstp); - if (len > 0) - trace_svc_xdr_recvfrom(rqstp, &rqstp->rq_arg); rqstp->rq_stime = ktime_get(); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); @@ -859,6 +857,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) err = -EAGAIN; if (len <= 0) goto out_release; + trace_svc_xdr_recvfrom(&rqstp->rq_arg); clear_bit(XPT_OLD, &xprt->xpt_flags); @@ -868,7 +867,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (serv->sv_stats) serv->sv_stats->netcnt++; - trace_svc_recv(rqstp, len); return len; out_release: rqstp->rq_res.len = 0; @@ -906,7 +904,7 @@ int svc_send(struct svc_rqst *rqstp) xb->len = xb->head[0].iov_len + xb->page_len + xb->tail[0].iov_len; - trace_svc_xdr_sendto(rqstp, xb); + trace_svc_xdr_sendto(rqstp->rq_xid, xb); trace_svc_stats_latency(rqstp); len = xprt->xpt_ops->xpo_sendto(rqstp); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c2752e2b9ce3..5a809c64dc7b 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -181,8 +181,8 @@ static 
void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) } } -static int svc_sock_read_payload(struct svc_rqst *rqstp, unsigned int offset, - unsigned int length) +static int svc_sock_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) { return 0; } @@ -635,7 +635,7 @@ static const struct svc_xprt_ops svc_udp_ops = { .xpo_create = svc_udp_create, .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, - .xpo_read_payload = svc_sock_read_payload, + .xpo_result_payload = svc_sock_result_payload, .xpo_release_rqst = svc_udp_release_rqst, .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, @@ -1062,6 +1062,91 @@ err_noclose: return 0; /* record not complete */ } +static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec, + int flags) +{ + return kernel_sendpage(sock, virt_to_page(vec->iov_base), + offset_in_page(vec->iov_base), + vec->iov_len, flags); +} + +/* + * kernel_sendpage() is used exclusively to reduce the number of + * copy operations in this path. Therefore the caller must ensure + * that the pages backing @xdr are unchanging. + * + * In addition, the logic assumes that * .bv_len is never larger + * than PAGE_SIZE. + */ +static int svc_tcp_sendmsg(struct socket *sock, struct msghdr *msg, + struct xdr_buf *xdr, rpc_fraghdr marker, + unsigned int *sentp) +{ + const struct kvec *head = xdr->head; + const struct kvec *tail = xdr->tail; + struct kvec rm = { + .iov_base = &marker, + .iov_len = sizeof(marker), + }; + int flags, ret; + + *sentp = 0; + xdr_alloc_bvec(xdr, GFP_KERNEL); + + msg->msg_flags = MSG_MORE; + ret = kernel_sendmsg(sock, msg, &rm, 1, rm.iov_len); + if (ret < 0) + return ret; + *sentp += ret; + if (ret != rm.iov_len) + return -EAGAIN; + + flags = head->iov_len < xdr->len ? 
MSG_MORE | MSG_SENDPAGE_NOTLAST : 0; + ret = svc_tcp_send_kvec(sock, head, flags); + if (ret < 0) + return ret; + *sentp += ret; + if (ret != head->iov_len) + goto out; + + if (xdr->page_len) { + unsigned int offset, len, remaining; + struct bio_vec *bvec; + + bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT); + offset = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + flags = MSG_MORE | MSG_SENDPAGE_NOTLAST; + while (remaining > 0) { + if (remaining <= PAGE_SIZE && tail->iov_len == 0) + flags = 0; + + len = min(remaining, bvec->bv_len - offset); + ret = kernel_sendpage(sock, bvec->bv_page, + bvec->bv_offset + offset, + len, flags); + if (ret < 0) + return ret; + *sentp += ret; + if (ret != len) + goto out; + remaining -= len; + offset = 0; + bvec++; + } + } + + if (tail->iov_len) { + ret = svc_tcp_send_kvec(sock, tail, 0); + if (ret < 0) + return ret; + *sentp += ret; + } + +out: + return 0; +} + /** * svc_tcp_sendto - Send out a reply on a TCP socket * @rqstp: completed svc_rqst @@ -1089,7 +1174,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) mutex_lock(&xprt->xpt_mutex); if (svc_xprt_is_dead(xprt)) goto out_notconn; - err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent); + err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker, &sent); xdr_free_bvec(xdr); trace_svcsock_tcp_send(xprt, err < 0 ? 
err : sent); if (err < 0 || sent != (xdr->len + sizeof(marker))) @@ -1123,7 +1208,7 @@ static const struct svc_xprt_ops svc_tcp_ops = { .xpo_create = svc_tcp_create, .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, - .xpo_read_payload = svc_sock_read_payload, + .xpo_result_payload = svc_sock_result_payload, .xpo_release_rqst = svc_tcp_release_rqst, .xpo_detach = svc_tcp_sock_detach, .xpo_free = svc_sock_free, diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index a18b36b5422d..3aad6ef18504 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -63,19 +63,20 @@ static int proc_do_xprt(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; - size_t len; + ssize_t len; - if ((*ppos && !write) || !*lenp) { + if (write || *ppos) { *lenp = 0; return 0; } len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); - *lenp = memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); + len = memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); - if (*lenp < 0) { + if (len < 0) { *lenp = 0; return -EINVAL; } + *lenp = len; return 0; } diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 71e03b930b70..3964ff74ee51 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -123,8 +123,7 @@ EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); * @len: length of string, in bytes * */ -void -xdr_terminate_string(struct xdr_buf *buf, const u32 len) +void xdr_terminate_string(const struct xdr_buf *buf, const u32 len) { char *kaddr; @@ -134,8 +133,7 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len) } EXPORT_SYMBOL_GPL(xdr_terminate_string); -size_t -xdr_buf_pagecount(struct xdr_buf *buf) +size_t xdr_buf_pagecount(const struct xdr_buf *buf) { if (!buf->page_len) return 0; @@ -193,9 +191,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, tail->iov_base = buf + offset; tail->iov_len = buflen - offset; - if ((xdr->page_len & 3) == 0) - tail->iov_len -= sizeof(__be32); - xdr->buflen += len; } 
EXPORT_SYMBOL_GPL(xdr_inline_pages); @@ -228,6 +223,9 @@ _shift_data_left_pages(struct page **pages, size_t pgto_base, BUG_ON(pgfrom_base <= pgto_base); + if (!len) + return; + pgto = pages + (pgto_base >> PAGE_SHIFT); pgfrom = pages + (pgfrom_base >> PAGE_SHIFT); @@ -266,26 +264,6 @@ _shift_data_left_pages(struct page **pages, size_t pgto_base, } while ((len -= copy) != 0); } -static void -_shift_data_left_tail(struct xdr_buf *buf, unsigned int pgto, size_t len) -{ - struct kvec *tail = buf->tail; - - if (len > tail->iov_len) - len = tail->iov_len; - - _copy_to_pages(buf->pages, - buf->page_base + pgto, - (char *)tail->iov_base, - len); - tail->iov_len -= len; - - if (tail->iov_len > 0) - memmove((char *)tail->iov_base, - tail->iov_base + len, - tail->iov_len); -} - /** * _shift_data_right_pages * @pages: vector of pages containing both the source and dest memory area. @@ -310,6 +288,9 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base, BUG_ON(pgto_base <= pgfrom_base); + if (!len) + return; + pgto_base += len; pgfrom_base += len; @@ -351,46 +332,6 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base, } while ((len -= copy) != 0); } -static unsigned int -_shift_data_right_tail(struct xdr_buf *buf, unsigned int pgfrom, size_t len) -{ - struct kvec *tail = buf->tail; - unsigned int tailbuf_len; - unsigned int result = 0; - size_t copy; - - tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len; - - /* Shift the tail first */ - if (tailbuf_len != 0) { - unsigned int free_space = tailbuf_len - tail->iov_len; - - if (len < free_space) - free_space = len; - if (len > free_space) - len = free_space; - - tail->iov_len += free_space; - copy = len; - - if (tail->iov_len > len) { - char *p = (char *)tail->iov_base + len; - memmove(p, tail->iov_base, tail->iov_len - free_space); - result += tail->iov_len - free_space; - } else - copy = tail->iov_len; - - /* Copy from the inlined pages into the tail */ - _copy_from_pages((char 
*)tail->iov_base, - buf->pages, - buf->page_base + pgfrom, - copy); - result += copy; - } - - return result; -} - /** * _copy_to_pages * @pages: array of pages @@ -408,6 +349,9 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len) char *vto; size_t copy; + if (!len) + return; + pgto = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -452,6 +396,9 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) char *vfrom; size_t copy; + if (!len) + return; + pgfrom = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -475,18 +422,42 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) } EXPORT_SYMBOL_GPL(_copy_from_pages); +static void xdr_buf_iov_zero(const struct kvec *iov, unsigned int base, + unsigned int len) +{ + if (base >= iov->iov_len) + return; + if (len > iov->iov_len - base) + len = iov->iov_len - base; + memset(iov->iov_base + base, 0, len); +} + /** - * _zero_pages - * @pages: array of pages - * @pgbase: beginning page vector address + * xdr_buf_pages_zero + * @buf: xdr_buf + * @pgbase: beginning offset * @len: length */ -static void -_zero_pages(struct page **pages, size_t pgbase, size_t len) +static void xdr_buf_pages_zero(const struct xdr_buf *buf, unsigned int pgbase, + unsigned int len) { + struct page **pages = buf->pages; struct page **page; char *vpage; - size_t zero; + unsigned int zero; + + if (!len) + return; + if (pgbase >= buf->page_len) { + xdr_buf_iov_zero(buf->tail, pgbase - buf->page_len, len); + return; + } + if (pgbase + len > buf->page_len) { + xdr_buf_iov_zero(buf->tail, 0, pgbase + len - buf->page_len); + len = buf->page_len - pgbase; + } + + pgbase += buf->page_base; page = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -507,122 +478,367 @@ _zero_pages(struct page **pages, size_t pgbase, size_t len) } while ((len -= zero) != 0); } +static unsigned int xdr_buf_pages_fill_sparse(const struct xdr_buf *buf, + unsigned int buflen, gfp_t gfp) +{ + 
unsigned int i, npages, pagelen; + + if (!(buf->flags & XDRBUF_SPARSE_PAGES)) + return buflen; + if (buflen <= buf->head->iov_len) + return buflen; + pagelen = buflen - buf->head->iov_len; + if (pagelen > buf->page_len) + pagelen = buf->page_len; + npages = (pagelen + buf->page_base + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + if (!buf->pages[i]) + continue; + buf->pages[i] = alloc_page(gfp); + if (likely(buf->pages[i])) + continue; + buflen -= pagelen; + pagelen = i << PAGE_SHIFT; + if (pagelen > buf->page_base) + buflen += pagelen - buf->page_base; + break; + } + return buflen; +} + +static void xdr_buf_try_expand(struct xdr_buf *buf, unsigned int len) +{ + struct kvec *head = buf->head; + struct kvec *tail = buf->tail; + unsigned int sum = head->iov_len + buf->page_len + tail->iov_len; + unsigned int free_space, newlen; + + if (sum > buf->len) { + free_space = min_t(unsigned int, sum - buf->len, len); + newlen = xdr_buf_pages_fill_sparse(buf, buf->len + free_space, + GFP_KERNEL); + free_space = newlen - buf->len; + buf->len = newlen; + len -= free_space; + if (!len) + return; + } + + if (buf->buflen > sum) { + /* Expand the tail buffer */ + free_space = min_t(unsigned int, buf->buflen - sum, len); + tail->iov_len += free_space; + buf->len += free_space; + } +} + +static void xdr_buf_tail_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + + if (to >= tail->iov_len) + return; + if (len + to > tail->iov_len) + len = tail->iov_len - to; + memmove(tail->iov_base + to, tail->iov_base + base, len); +} + +static void xdr_buf_pages_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + unsigned int pglen = 0; + unsigned int talen = 0, tato = 0; + + if (base >= buf->page_len) + return; + if (len > 
buf->page_len - base) + len = buf->page_len - base; + if (to >= buf->page_len) { + tato = to - buf->page_len; + if (tail->iov_len >= len + tato) + talen = len; + else if (tail->iov_len > tato) + talen = tail->iov_len - tato; + } else if (len + to >= buf->page_len) { + pglen = buf->page_len - to; + talen = len - pglen; + if (talen > tail->iov_len) + talen = tail->iov_len; + } else + pglen = len; + + _copy_from_pages(tail->iov_base + tato, buf->pages, + buf->page_base + base + pglen, talen); + _shift_data_right_pages(buf->pages, buf->page_base + to, + buf->page_base + base, pglen); +} + +static void xdr_buf_head_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *head = buf->head; + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + unsigned int pglen = 0, pgto = 0; + unsigned int talen = 0, tato = 0; + + if (base >= head->iov_len) + return; + if (len > head->iov_len - base) + len = head->iov_len - base; + if (to >= buf->page_len + head->iov_len) { + tato = to - buf->page_len - head->iov_len; + talen = len; + } else if (to >= head->iov_len) { + pgto = to - head->iov_len; + pglen = len; + if (pgto + pglen > buf->page_len) { + talen = pgto + pglen - buf->page_len; + pglen -= talen; + } + } else { + pglen = len - to; + if (pglen > buf->page_len) { + talen = pglen - buf->page_len; + pglen = buf->page_len; + } + } + + len -= talen; + base += len; + if (talen + tato > tail->iov_len) + talen = tail->iov_len > tato ? 
tail->iov_len - tato : 0; + memcpy(tail->iov_base + tato, head->iov_base + base, talen); + + len -= pglen; + base -= pglen; + _copy_to_pages(buf->pages, buf->page_base + pgto, head->iov_base + base, + pglen); + + base -= len; + memmove(head->iov_base + to, head->iov_base + base, len); +} + +static void xdr_buf_tail_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + + if (base >= tail->iov_len || !shift || !len) + return; + xdr_buf_tail_copy_right(buf, base, len, shift); +} + +static void xdr_buf_pages_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + if (base >= buf->page_len) { + xdr_buf_tail_shift_right(buf, base - buf->page_len, len, shift); + return; + } + if (base + len > buf->page_len) + xdr_buf_tail_shift_right(buf, 0, base + len - buf->page_len, + shift); + xdr_buf_pages_copy_right(buf, base, len, shift); +} + +static void xdr_buf_head_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *head = buf->head; + + if (!shift) + return; + if (base >= head->iov_len) { + xdr_buf_pages_shift_right(buf, head->iov_len - base, len, + shift); + return; + } + if (base + len > head->iov_len) + xdr_buf_pages_shift_right(buf, 0, base + len - head->iov_len, + shift); + xdr_buf_head_copy_right(buf, base, len, shift); +} + +static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base, + unsigned int len, unsigned int shift) +{ + const struct kvec *tail = buf->tail; + + if (base >= tail->iov_len) + return; + if (len > tail->iov_len - base) + len = tail->iov_len - base; + /* Shift data into head */ + if (shift > buf->page_len + base) { + const struct kvec *head = buf->head; + unsigned int hdto = + head->iov_len + buf->page_len + base - shift; + unsigned int hdlen = len; + + if (WARN_ONCE(shift > head->iov_len + 
buf->page_len + base, + "SUNRPC: Misaligned data.\n")) + return; + if (hdto + hdlen > head->iov_len) + hdlen = head->iov_len - hdto; + memcpy(head->iov_base + hdto, tail->iov_base + base, hdlen); + base += hdlen; + len -= hdlen; + if (!len) + return; + } + /* Shift data into pages */ + if (shift > base) { + unsigned int pgto = buf->page_len + base - shift; + unsigned int pglen = len; + + if (pgto + pglen > buf->page_len) + pglen = buf->page_len - pgto; + _copy_to_pages(buf->pages, buf->page_base + pgto, + tail->iov_base + base, pglen); + base += pglen; + len -= pglen; + if (!len) + return; + } + memmove(tail->iov_base + base - shift, tail->iov_base + base, len); +} + +static void xdr_buf_pages_copy_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + unsigned int pgto; + + if (base >= buf->page_len) + return; + if (len > buf->page_len - base) + len = buf->page_len - base; + /* Shift data into head */ + if (shift > base) { + const struct kvec *head = buf->head; + unsigned int hdto = head->iov_len + base - shift; + unsigned int hdlen = len; + + if (WARN_ONCE(shift > head->iov_len + base, + "SUNRPC: Misaligned data.\n")) + return; + if (hdto + hdlen > head->iov_len) + hdlen = head->iov_len - hdto; + _copy_from_pages(head->iov_base + hdto, buf->pages, + buf->page_base + base, hdlen); + base += hdlen; + len -= hdlen; + if (!len) + return; + } + pgto = base - shift; + _shift_data_left_pages(buf->pages, buf->page_base + pgto, + buf->page_base + base, len); +} + +static void xdr_buf_tail_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + xdr_buf_tail_copy_left(buf, base, len, shift); +} + +static void xdr_buf_pages_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + if (base >= buf->page_len) { + xdr_buf_tail_shift_left(buf, base - buf->page_len, len, shift); + 
return; + } + xdr_buf_pages_copy_left(buf, base, len, shift); + len += base; + if (len <= buf->page_len) + return; + xdr_buf_tail_copy_left(buf, 0, len - buf->page_len, shift); +} + /** * xdr_shrink_bufhead * @buf: xdr_buf - * @len: bytes to remove from buf->head[0] + * @len: new length of buf->head[0] * - * Shrinks XDR buffer's header kvec buf->head[0] by + * Shrinks XDR buffer's header kvec buf->head[0], setting it to * 'len' bytes. The extra data is not lost, but is instead * moved into the inlined pages and/or the tail. */ -static unsigned int -xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) +static unsigned int xdr_shrink_bufhead(struct xdr_buf *buf, unsigned int len) { - struct kvec *head, *tail; - size_t copy, offs; - unsigned int pglen = buf->page_len; - unsigned int result; - - result = 0; - tail = buf->tail; - head = buf->head; + struct kvec *head = buf->head; + unsigned int shift, buflen = max(buf->len, len); WARN_ON_ONCE(len > head->iov_len); - if (len > head->iov_len) - len = head->iov_len; - - /* Shift the tail first */ - if (tail->iov_len != 0) { - if (tail->iov_len > len) { - copy = tail->iov_len - len; - memmove((char *)tail->iov_base + len, - tail->iov_base, copy); - result += copy; - } - /* Copy from the inlined pages into the tail */ - copy = len; - if (copy > pglen) - copy = pglen; - offs = len - copy; - if (offs >= tail->iov_len) - copy = 0; - else if (copy > tail->iov_len - offs) - copy = tail->iov_len - offs; - if (copy != 0) { - _copy_from_pages((char *)tail->iov_base + offs, - buf->pages, - buf->page_base + pglen + offs - len, - copy); - result += copy; - } - /* Do we also need to copy data from the head into the tail ? 
*/ - if (len > pglen) { - offs = copy = len - pglen; - if (copy > tail->iov_len) - copy = tail->iov_len; - memcpy(tail->iov_base, - (char *)head->iov_base + - head->iov_len - offs, - copy); - result += copy; - } + if (head->iov_len > buflen) { + buf->buflen -= head->iov_len - buflen; + head->iov_len = buflen; } - /* Now handle pages */ - if (pglen != 0) { - if (pglen > len) - _shift_data_right_pages(buf->pages, - buf->page_base + len, - buf->page_base, - pglen - len); - copy = len; - if (len > pglen) - copy = pglen; - _copy_to_pages(buf->pages, buf->page_base, - (char *)head->iov_base + head->iov_len - len, - copy); - result += copy; - } - head->iov_len -= len; - buf->buflen -= len; - /* Have we truncated the message? */ - if (buf->len > buf->buflen) - buf->len = buf->buflen; - - return result; + if (len >= head->iov_len) + return 0; + shift = head->iov_len - len; + xdr_buf_try_expand(buf, shift); + xdr_buf_head_shift_right(buf, len, buflen - len, shift); + head->iov_len = len; + buf->buflen -= shift; + buf->len -= shift; + return shift; } /** - * xdr_shrink_pagelen - shrinks buf->pages by up to @len bytes + * xdr_shrink_pagelen - shrinks buf->pages to @len bytes * @buf: xdr_buf - * @len: bytes to remove from buf->pages + * @len: new page buffer length * * The extra data is not lost, but is instead moved into buf->tail. * Returns the actual number of bytes moved. */ -static unsigned int -xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) +static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, unsigned int len) { - unsigned int pglen = buf->page_len; - unsigned int result; - - if (len > buf->page_len) - len = buf-> page_len; + unsigned int shift, buflen = buf->len - buf->head->iov_len; - result = _shift_data_right_tail(buf, pglen - len, len); - buf->page_len -= len; - buf->buflen -= len; - /* Have we truncated the message? 
*/ - if (buf->len > buf->buflen) - buf->len = buf->buflen; - - return result; + WARN_ON_ONCE(len > buf->page_len); + if (buf->head->iov_len >= buf->len || len > buflen) + buflen = len; + if (buf->page_len > buflen) { + buf->buflen -= buf->page_len - buflen; + buf->page_len = buflen; + } + if (len >= buf->page_len) + return 0; + shift = buf->page_len - len; + xdr_buf_try_expand(buf, shift); + xdr_buf_pages_shift_right(buf, len, buflen - len, shift); + buf->page_len = len; + buf->len -= shift; + buf->buflen -= shift; + return shift; } void xdr_shift_buf(struct xdr_buf *buf, size_t len) { - xdr_shrink_bufhead(buf, len); + xdr_shrink_bufhead(buf, buf->head->iov_len - len); } EXPORT_SYMBOL_GPL(xdr_shift_buf); @@ -636,6 +852,18 @@ unsigned int xdr_stream_pos(const struct xdr_stream *xdr) } EXPORT_SYMBOL_GPL(xdr_stream_pos); +static void xdr_stream_set_pos(struct xdr_stream *xdr, unsigned int pos) +{ + unsigned int blen = xdr->buf->len; + + xdr->nwords = blen > pos ? XDR_QUADLEN(blen) - XDR_QUADLEN(pos) : 0; +} + +static void xdr_stream_page_set_pos(struct xdr_stream *xdr, unsigned int pos) +{ + xdr_stream_set_pos(xdr, pos + xdr->buf->head[0].iov_len); +} + /** * xdr_page_pos - Return the current offset from the start of the xdr pages * @xdr: pointer to struct xdr_stream @@ -669,7 +897,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct kvec *iov = buf->head; int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len; - xdr_set_scratch_buffer(xdr, NULL, 0); + xdr_reset_scratch_buffer(xdr); BUG_ON(scratch_len < 0); xdr->buf = buf; xdr->iov = iov; @@ -713,7 +941,7 @@ inline void xdr_commit_encode(struct xdr_stream *xdr) page = page_address(*xdr->page_ptr); memcpy(xdr->scratch.iov_base, page, shift); memmove(page, page + shift, (void *)xdr->p - page); - xdr->scratch.iov_len = 0; + xdr_reset_scratch_buffer(xdr); } EXPORT_SYMBOL_GPL(xdr_commit_encode); @@ -743,8 +971,7 @@ static __be32 *xdr_get_next_encode_buffer(struct xdr_stream 
*xdr, * the "scratch" iov to track any temporarily unused fragment of * space at the end of the previous buffer: */ - xdr->scratch.iov_base = xdr->p; - xdr->scratch.iov_len = frag1bytes; + xdr_set_scratch_buffer(xdr, xdr->p, frag1bytes); p = page_address(*xdr->page_ptr); /* * Note this is where the next encode will start after we've @@ -970,19 +1197,31 @@ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int b } EXPORT_SYMBOL_GPL(xdr_write_pages); -static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov, - unsigned int len) +static unsigned int xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov, + unsigned int base, unsigned int len) { if (len > iov->iov_len) len = iov->iov_len; - xdr->p = (__be32*)iov->iov_base; + if (unlikely(base > len)) + base = len; + xdr->p = (__be32*)(iov->iov_base + base); xdr->end = (__be32*)(iov->iov_base + len); xdr->iov = iov; xdr->page_ptr = NULL; + return len - base; +} + +static unsigned int xdr_set_tail_base(struct xdr_stream *xdr, + unsigned int base, unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + + xdr_stream_set_pos(xdr, base + buf->page_len + buf->head->iov_len); + return xdr_set_iov(xdr, buf->tail, base, len); } -static int xdr_set_page_base(struct xdr_stream *xdr, - unsigned int base, unsigned int len) +static unsigned int xdr_set_page_base(struct xdr_stream *xdr, + unsigned int base, unsigned int len) { unsigned int pgnr; unsigned int maxlen; @@ -991,12 +1230,15 @@ static int xdr_set_page_base(struct xdr_stream *xdr, void *kaddr; maxlen = xdr->buf->page_len; - if (base >= maxlen) - return -EINVAL; - maxlen -= base; + if (base >= maxlen) { + base = maxlen; + maxlen = 0; + } else + maxlen -= base; if (len > maxlen) len = maxlen; + xdr_stream_page_set_pos(xdr, base); base += xdr->buf->page_base; pgnr = base >> PAGE_SHIFT; @@ -1011,14 +1253,16 @@ static int xdr_set_page_base(struct xdr_stream *xdr, pgend = PAGE_SIZE; xdr->end = (__be32*)(kaddr + pgend); xdr->iov = NULL; - return 0; + 
return len; } static void xdr_set_page(struct xdr_stream *xdr, unsigned int base, unsigned int len) { - if (xdr_set_page_base(xdr, base, len) < 0) - xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2); + if (xdr_set_page_base(xdr, base, len) == 0) { + base -= xdr->buf->page_len; + xdr_set_tail_base(xdr, base, len); + } } static void xdr_set_next_page(struct xdr_stream *xdr) @@ -1027,17 +1271,18 @@ static void xdr_set_next_page(struct xdr_stream *xdr) newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT; newbase -= xdr->buf->page_base; - - xdr_set_page(xdr, newbase, PAGE_SIZE); + if (newbase < xdr->buf->page_len) + xdr_set_page_base(xdr, newbase, xdr_stream_remaining(xdr)); + else + xdr_set_tail_base(xdr, 0, xdr_stream_remaining(xdr)); } static bool xdr_set_next_buffer(struct xdr_stream *xdr) { if (xdr->page_ptr != NULL) xdr_set_next_page(xdr); - else if (xdr->iov == xdr->buf->head) { - xdr_set_page(xdr, 0, PAGE_SIZE); - } + else if (xdr->iov == xdr->buf->head) + xdr_set_page(xdr, 0, xdr_stream_remaining(xdr)); return xdr->p != xdr->end; } @@ -1052,15 +1297,11 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst) { xdr->buf = buf; - xdr->scratch.iov_base = NULL; - xdr->scratch.iov_len = 0; + xdr_reset_scratch_buffer(xdr); xdr->nwords = XDR_QUADLEN(buf->len); - if (buf->head[0].iov_len != 0) - xdr_set_iov(xdr, buf->head, buf->len); - else if (buf->page_len != 0) - xdr_set_page_base(xdr, 0, buf->len); - else - xdr_set_iov(xdr, buf->head, buf->len); + if (xdr_set_iov(xdr, buf->head, 0, buf->len) == 0 && + xdr_set_page_base(xdr, 0, buf->len) == 0) + xdr_set_iov(xdr, buf->tail, 0, buf->len); if (p != NULL && p > xdr->p && xdr->end >= p) { xdr->nwords -= p - xdr->p; xdr->p = p; @@ -1101,24 +1342,6 @@ static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) return p; } -/** - * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data. 
- * @xdr: pointer to xdr_stream struct - * @buf: pointer to an empty buffer - * @buflen: size of 'buf' - * - * The scratch buffer is used when decoding from an array of pages. - * If an xdr_inline_decode() call spans across page boundaries, then - * we copy the data into the scratch buffer in order to allow linear - * access. - */ -void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) -{ - xdr->scratch.iov_base = buf; - xdr->scratch.iov_len = buflen; -} -EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer); - static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes) { __be32 *p; @@ -1178,14 +1401,13 @@ static void xdr_realign_pages(struct xdr_stream *xdr) struct xdr_buf *buf = xdr->buf; struct kvec *iov = buf->head; unsigned int cur = xdr_stream_pos(xdr); - unsigned int copied, offset; + unsigned int copied; /* Realign pages to current pointer position */ if (iov->iov_len > cur) { - offset = iov->iov_len - cur; - copied = xdr_shrink_bufhead(buf, offset); - trace_rpc_xdr_alignment(xdr, offset, copied); - xdr->nwords = XDR_QUADLEN(buf->len - cur); + copied = xdr_shrink_bufhead(buf, cur); + trace_rpc_xdr_alignment(xdr, cur, copied); + xdr_set_page(xdr, 0, buf->page_len); } } @@ -1193,8 +1415,7 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) { struct xdr_buf *buf = xdr->buf; unsigned int nwords = XDR_QUADLEN(len); - unsigned int cur = xdr_stream_pos(xdr); - unsigned int copied, offset; + unsigned int copied; if (xdr->nwords == 0) return 0; @@ -1208,125 +1429,103 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) len = buf->page_len; else if (nwords < xdr->nwords) { /* Truncate page data and move it into the tail */ - offset = buf->page_len - len; - copied = xdr_shrink_pagelen(buf, offset); - trace_rpc_xdr_alignment(xdr, offset, copied); - xdr->nwords = XDR_QUADLEN(buf->len - cur); + copied = xdr_shrink_pagelen(buf, len); + trace_rpc_xdr_alignment(xdr, len, copied); } return len; 
} /** - * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position + * xdr_read_pages - align page-based XDR data to current pointer position * @xdr: pointer to xdr_stream struct * @len: number of bytes of page data * * Moves data beyond the current pointer position from the XDR head[] buffer - * into the page list. Any data that lies beyond current position + "len" - * bytes is moved into the XDR tail[]. + * into the page list. Any data that lies beyond current position + @len + * bytes is moved into the XDR tail[]. The xdr_stream current position is + * then advanced past that data to align to the next XDR object in the tail. * * Returns the number of XDR encoded bytes now contained in the pages */ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len) { - struct xdr_buf *buf = xdr->buf; - struct kvec *iov; - unsigned int nwords; - unsigned int end; - unsigned int padding; + unsigned int nwords = XDR_QUADLEN(len); + unsigned int base, end, pglen; - len = xdr_align_pages(xdr, len); - if (len == 0) + pglen = xdr_align_pages(xdr, nwords << 2); + if (pglen == 0) return 0; - nwords = XDR_QUADLEN(len); - padding = (nwords << 2) - len; - xdr->iov = iov = buf->tail; - /* Compute remaining message length. */ - end = ((xdr->nwords - nwords) << 2) + padding; - if (end > iov->iov_len) - end = iov->iov_len; - /* - * Position current pointer at beginning of tail, and - * set remaining message length. - */ - xdr->p = (__be32 *)((char *)iov->iov_base + padding); - xdr->end = (__be32 *)((char *)iov->iov_base + end); - xdr->page_ptr = NULL; - xdr->nwords = XDR_QUADLEN(end - padding); - return len; + base = (nwords << 2) - pglen; + end = xdr_stream_remaining(xdr) - pglen; + + xdr_set_tail_base(xdr, base, end); + return len <= pglen ? 
len : pglen; } EXPORT_SYMBOL_GPL(xdr_read_pages); -uint64_t xdr_align_data(struct xdr_stream *xdr, uint64_t offset, uint32_t length) +unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset, + unsigned int length) { struct xdr_buf *buf = xdr->buf; - unsigned int from, bytes; - unsigned int shift = 0; - - if ((offset + length) < offset || - (offset + length) > buf->page_len) - length = buf->page_len - offset; + unsigned int from, bytes, len; + unsigned int shift; xdr_realign_pages(xdr); from = xdr_page_pos(xdr); - bytes = xdr->nwords << 2; - if (length < bytes) - bytes = length; + + if (from >= buf->page_len + buf->tail->iov_len) + return 0; + if (from + buf->head->iov_len >= buf->len) + return 0; + + len = buf->len - buf->head->iov_len; + + /* We only shift data left! */ + if (WARN_ONCE(from < offset, "SUNRPC: misaligned data src=%u dst=%u\n", + from, offset)) + return 0; + if (WARN_ONCE(offset > buf->page_len, + "SUNRPC: buffer overflow. offset=%u, page_len=%u\n", + offset, buf->page_len)) + return 0; /* Move page data to the left */ - if (from > offset) { - shift = min_t(unsigned int, bytes, buf->page_len - from); - _shift_data_left_pages(buf->pages, - buf->page_base + offset, - buf->page_base + from, - shift); - bytes -= shift; - - /* Move tail data into the pages, if necessary */ - if (bytes > 0) - _shift_data_left_tail(buf, offset + shift, bytes); - } + shift = from - offset; + xdr_buf_pages_shift_left(buf, from, len, shift); + + bytes = xdr_stream_remaining(xdr); + if (length > bytes) + length = bytes; + bytes -= length; - xdr->nwords -= XDR_QUADLEN(length); - xdr_set_page(xdr, from + length, PAGE_SIZE); + xdr->buf->len -= shift; + xdr_set_page(xdr, offset + length, bytes); return length; } EXPORT_SYMBOL_GPL(xdr_align_data); -uint64_t xdr_expand_hole(struct xdr_stream *xdr, uint64_t offset, uint64_t length) +unsigned int xdr_expand_hole(struct xdr_stream *xdr, unsigned int offset, + unsigned int length) { struct xdr_buf *buf = xdr->buf; - 
unsigned int bytes; - unsigned int from; - unsigned int truncated = 0; - - if ((offset + length) < offset || - (offset + length) > buf->page_len) - length = buf->page_len - offset; + unsigned int from, to, shift; xdr_realign_pages(xdr); from = xdr_page_pos(xdr); - bytes = xdr->nwords << 2; - - if (offset + length + bytes > buf->page_len) { - unsigned int shift = (offset + length + bytes) - buf->page_len; - unsigned int res = _shift_data_right_tail(buf, from + bytes - shift, shift); - truncated = shift - res; - xdr->nwords -= XDR_QUADLEN(truncated); - bytes -= shift; - } - - /* Now move the page data over and zero pages */ - if (bytes > 0) - _shift_data_right_pages(buf->pages, - buf->page_base + offset + length, - buf->page_base + from, - bytes); - _zero_pages(buf->pages, buf->page_base + offset, length); + to = xdr_align_size(offset + length); + + /* Could the hole be behind us? */ + if (to > from) { + unsigned int buflen = buf->len - buf->head->iov_len; + shift = to - from; + xdr_buf_try_expand(buf, shift); + xdr_buf_pages_shift_right(buf, from, buflen, shift); + xdr_set_page(xdr, to, xdr_stream_remaining(xdr)); + } else if (to != from) + xdr_align_data(xdr, to, 0); + xdr_buf_pages_zero(buf, offset, length); - buf->len += length - (from - offset) - truncated; - xdr_set_page(xdr, offset + length, PAGE_SIZE); return length; } EXPORT_SYMBOL_GPL(xdr_expand_hole); @@ -1355,8 +1554,7 @@ EXPORT_SYMBOL_GPL(xdr_enter_page); static const struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0}; -void -xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf) +void xdr_buf_from_iov(const struct kvec *iov, struct xdr_buf *buf) { buf->head[0] = *iov; buf->tail[0] = empty_iov; @@ -1379,9 +1577,8 @@ EXPORT_SYMBOL_GPL(xdr_buf_from_iov); * * Returns -1 if base of length are out of bounds. 
*/ -int -xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, - unsigned int base, unsigned int len) +int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf, + unsigned int base, unsigned int len) { subbuf->buflen = subbuf->len = len; if (base < buf->head[0].iov_len) { @@ -1429,6 +1626,51 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, EXPORT_SYMBOL_GPL(xdr_buf_subsegment); /** + * xdr_stream_subsegment - set @subbuf to a portion of @xdr + * @xdr: an xdr_stream set up for decoding + * @subbuf: the result buffer + * @nbytes: length of @xdr to extract, in bytes + * + * Sets up @subbuf to represent a portion of @xdr. The portion + * starts at the current offset in @xdr, and extends for a length + * of @nbytes. If this is successful, @xdr is advanced to the next + * position following that portion. + * + * Return values: + * %true: @subbuf has been initialized, and @xdr has been advanced. + * %false: a bounds error has occurred + */ +bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf, + unsigned int nbytes) +{ + unsigned int remaining, offset, len; + + if (xdr_buf_subsegment(xdr->buf, subbuf, xdr_stream_pos(xdr), nbytes)) + return false; + + if (subbuf->head[0].iov_len) + if (!__xdr_inline_decode(xdr, subbuf->head[0].iov_len)) + return false; + + remaining = subbuf->page_len; + offset = subbuf->page_base; + while (remaining) { + len = min_t(unsigned int, remaining, PAGE_SIZE) - offset; + + if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr)) + return false; + if (!__xdr_inline_decode(xdr, len)) + return false; + + remaining -= len; + offset = 0; + } + + return true; +} +EXPORT_SYMBOL_GPL(xdr_stream_subsegment); + +/** * xdr_buf_trim - lop at most "len" bytes off the end of "buf" * @buf: buf to be trimmed * @len: number of bytes to reduce "buf" by @@ -1469,7 +1711,8 @@ fix_len: } EXPORT_SYMBOL_GPL(xdr_buf_trim); -static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int 
len) +static void __read_bytes_from_xdr_buf(const struct xdr_buf *subbuf, + void *obj, unsigned int len) { unsigned int this_len; @@ -1478,8 +1721,7 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->page_len); - if (this_len) - _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len); + _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len); len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len); @@ -1487,7 +1729,8 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne } /* obj is assumed to point to allocated memory of size at least len: */ -int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +int read_bytes_from_xdr_buf(const struct xdr_buf *buf, unsigned int base, + void *obj, unsigned int len) { struct xdr_buf subbuf; int status; @@ -1500,7 +1743,8 @@ int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, u } EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf); -static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) +static void __write_bytes_to_xdr_buf(const struct xdr_buf *subbuf, + void *obj, unsigned int len) { unsigned int this_len; @@ -1509,8 +1753,7 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->page_len); - if (this_len) - _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len); + _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len); len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len); @@ -1518,7 +1761,8 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned } /* obj is assumed to point to allocated memory of size at least len: */ -int 
write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +int write_bytes_to_xdr_buf(const struct xdr_buf *buf, unsigned int base, + void *obj, unsigned int len) { struct xdr_buf subbuf; int status; @@ -1531,8 +1775,7 @@ int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, un } EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf); -int -xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) +int xdr_decode_word(const struct xdr_buf *buf, unsigned int base, u32 *obj) { __be32 raw; int status; @@ -1545,8 +1788,7 @@ xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) } EXPORT_SYMBOL_GPL(xdr_decode_word); -int -xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) +int xdr_encode_word(const struct xdr_buf *buf, unsigned int base, u32 obj) { __be32 raw = cpu_to_be32(obj); @@ -1555,9 +1797,8 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) EXPORT_SYMBOL_GPL(xdr_encode_word); /* Returns 0 on success, or else a negative error code. 
*/ -static int -xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc, int encode) +static int xdr_xcode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc, int encode) { char *elem = NULL, *c; unsigned int copied = 0, todo, avail_here; @@ -1749,9 +1990,8 @@ out: return err; } -int -xdr_decode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc) +int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) { if (base >= buf->len) return -EINVAL; @@ -1760,9 +2000,8 @@ xdr_decode_array2(struct xdr_buf *buf, unsigned int base, } EXPORT_SYMBOL_GPL(xdr_decode_array2); -int -xdr_encode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc) +int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) { if ((unsigned long) base + 4 + desc->array_len * desc->elem_size > buf->head->iov_len + buf->page_len + buf->tail->iov_len) @@ -1772,9 +2011,9 @@ xdr_encode_array2(struct xdr_buf *buf, unsigned int base, } EXPORT_SYMBOL_GPL(xdr_encode_array2); -int -xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, - int (*actor)(struct scatterlist *, void *), void *data) +int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, + unsigned int len, + int (*actor)(struct scatterlist *, void *), void *data) { int i, ret = 0; unsigned int page_len, thislen, page_offset; @@ -1942,10 +2181,8 @@ ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen); if (ret > 0) { - char *s = kmalloc(ret + 1, gfp_flags); + char *s = kmemdup_nul(p, ret, gfp_flags); if (s != NULL) { - memcpy(s, p, ret); - s[ret] = '\0'; *str = s; return strlen(s); } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index f6c17e75f20e..691ccf8049a4 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -151,33 +151,94 @@ out: } 
EXPORT_SYMBOL_GPL(xprt_unregister_transport); -/** - * xprt_load_transport - load a transport implementation - * @transport_name: transport to load - * - * Returns: - * 0: transport successfully loaded - * -ENOENT: transport module not available - */ -int xprt_load_transport(const char *transport_name) +static void +xprt_class_release(const struct xprt_class *t) { - struct xprt_class *t; - int result; + module_put(t->owner); +} + +static const struct xprt_class * +xprt_class_find_by_ident_locked(int ident) +{ + const struct xprt_class *t; + + list_for_each_entry(t, &xprt_list, list) { + if (t->ident != ident) + continue; + if (!try_module_get(t->owner)) + continue; + return t; + } + return NULL; +} + +static const struct xprt_class * +xprt_class_find_by_ident(int ident) +{ + const struct xprt_class *t; - result = 0; spin_lock(&xprt_list_lock); + t = xprt_class_find_by_ident_locked(ident); + spin_unlock(&xprt_list_lock); + return t; +} + +static const struct xprt_class * +xprt_class_find_by_netid_locked(const char *netid) +{ + const struct xprt_class *t; + unsigned int i; + list_for_each_entry(t, &xprt_list, list) { - if (strcmp(t->name, transport_name) == 0) { - spin_unlock(&xprt_list_lock); - goto out; + for (i = 0; t->netid[i][0] != '\0'; i++) { + if (strcmp(t->netid[i], netid) != 0) + continue; + if (!try_module_get(t->owner)) + continue; + return t; } } + return NULL; +} + +static const struct xprt_class * +xprt_class_find_by_netid(const char *netid) +{ + const struct xprt_class *t; + + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_netid_locked(netid); + if (!t) { + spin_unlock(&xprt_list_lock); + request_module("rpc%s", netid); + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_netid_locked(netid); + } spin_unlock(&xprt_list_lock); - result = request_module("xprt%s", transport_name); -out: - return result; + return t; +} + +/** + * xprt_find_transport_ident - convert a netid into a transport identifier + * @netid: transport to load + * + * Returns: 
+ * > 0: transport identifier + * -ENOENT: transport module not available + */ +int xprt_find_transport_ident(const char *netid) +{ + const struct xprt_class *t; + int ret; + + t = xprt_class_find_by_netid(netid); + if (!t) + return -ENOENT; + ret = t->ident; + xprt_class_release(t); + return ret; } -EXPORT_SYMBOL_GPL(xprt_load_transport); +EXPORT_SYMBOL_GPL(xprt_find_transport_ident); static void xprt_clear_locked(struct rpc_xprt *xprt) { @@ -1896,21 +1957,17 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) struct rpc_xprt *xprt_create_transport(struct xprt_create *args) { struct rpc_xprt *xprt; - struct xprt_class *t; + const struct xprt_class *t; - spin_lock(&xprt_list_lock); - list_for_each_entry(t, &xprt_list, list) { - if (t->ident == args->ident) { - spin_unlock(&xprt_list_lock); - goto found; - } + t = xprt_class_find_by_ident(args->ident); + if (!t) { + dprintk("RPC: transport (%d) not supported\n", args->ident); + return ERR_PTR(-EIO); } - spin_unlock(&xprt_list_lock); - dprintk("RPC: transport (%d) not supported\n", args->ident); - return ERR_PTR(-EIO); -found: xprt = t->setup(args); + xprt_class_release(t); + if (IS_ERR(xprt)) goto out; if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT) diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 8ed0377d7a18..55b21bae866d 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \ - module.o + svc_rdma_pcl.o module.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index c92c1aac270a..946edf2db646 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * 
Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015-2020, Oracle and/or its affiliates. * * Support for backward direction RPCs on RPC/RDMA. */ @@ -82,7 +82,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) &rqst->rq_snd_buf, rpcrdma_noch_pullup)) return -EIO; - trace_xprtrdma_cb_reply(rqst); + trace_xprtrdma_cb_reply(r_xprt, rqst); return 0; } @@ -260,7 +260,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, */ req = rpcr_to_rdmar(rqst); req->rl_reply = rep; - trace_xprtrdma_cb_call(rqst); + trace_xprtrdma_cb_call(r_xprt, rqst); /* Queue rqst for ULP's callback service */ bc_serv = xprt->bc_serv; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 44888f5badef..baca49fe83af 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -65,18 +65,23 @@ void frwr_release_mr(struct rpcrdma_mr *mr) kfree(mr); } +static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) +{ + if (mr->mr_device) { + trace_xprtrdma_mr_unmap(mr); + ib_dma_unmap_sg(mr->mr_device, mr->mr_sg, mr->mr_nents, + mr->mr_dir); + mr->mr_device = NULL; + } +} + static void frwr_mr_recycle(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; trace_xprtrdma_mr_recycle(mr); - if (mr->mr_dir != DMA_NONE) { - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - mr->mr_dir = DMA_NONE; - } + frwr_mr_unmap(r_xprt, mr); spin_lock(&r_xprt->rx_buf.rb_lock); list_del(&mr->mr_all); @@ -86,6 +91,16 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr) frwr_release_mr(mr); } +static void frwr_mr_put(struct rpcrdma_mr *mr) +{ + frwr_mr_unmap(mr->mr_xprt, mr); + + /* The MR is returned to the req's MR free list instead + * of to the xprt's MR free list. No spinlock is needed. 
+ */ + rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); +} + /* frwr_reset - Place MRs back on the free list * @req: request to reset * @@ -101,7 +116,7 @@ void frwr_reset(struct rpcrdma_req *req) struct rpcrdma_mr *mr; while ((mr = rpcrdma_mr_pop(&req->rl_registered))) - rpcrdma_mr_put(mr); + frwr_mr_put(mr); } /** @@ -130,7 +145,7 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) mr->mr_xprt = r_xprt; mr->frwr.fr_mr = frmr; - mr->mr_dir = DMA_NONE; + mr->mr_device = NULL; INIT_LIST_HEAD(&mr->mr_list); init_completion(&mr->frwr.fr_linv_done); @@ -315,6 +330,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, mr->mr_dir); if (!dma_nents) goto out_dmamap_err; + mr->mr_device = ep->re_id->device; ibmr = mr->frwr.fr_mr; n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE); @@ -341,7 +357,6 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, return seg; out_dmamap_err: - mr->mr_dir = DMA_NONE; trace_xprtrdma_frwr_sgerr(mr, i); return ERR_PTR(-EIO); @@ -363,12 +378,21 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) container_of(cqe, struct rpcrdma_frwr, fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_fastreg(wc, frwr); + trace_xprtrdma_wc_fastreg(wc, &frwr->fr_cid); /* The MR will get recycled when the associated req is retransmitted */ rpcrdma_flush_disconnect(cq->cq_context, wc); } +static void frwr_cid_init(struct rpcrdma_ep *ep, + struct rpcrdma_frwr *frwr) +{ + struct rpc_rdma_cid *cid = &frwr->fr_cid; + + cid->ci_queue_id = ep->re_attr.send_cq->res.id; + cid->ci_completion_id = frwr->fr_mr->res.id; +} + /** * frwr_send - post Send WRs containing the RPC Call message * @r_xprt: controlling transport instance @@ -385,6 +409,7 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) */ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_send_wr *post_wr; struct rpcrdma_mr 
*mr; @@ -395,6 +420,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_fastreg; + frwr_cid_init(ep, frwr); frwr->fr_regwr.wr.next = post_wr; frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe; frwr->fr_regwr.wr.num_sge = 0; @@ -404,7 +430,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) post_wr = &frwr->fr_regwr.wr; } - return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL); + return ib_post_send(ep->re_id->qp, post_wr, NULL); } /** @@ -420,18 +446,17 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); - trace_xprtrdma_mr_reminv(mr); - rpcrdma_mr_put(mr); + frwr_mr_put(mr); break; /* only one invalidated MR per RPC */ } } -static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr) +static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr) { if (wc->status != IB_WC_SUCCESS) frwr_mr_recycle(mr); else - rpcrdma_mr_put(mr); + frwr_mr_put(mr); } /** @@ -448,8 +473,8 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li(wc, frwr); - __frwr_release_mr(wc, mr); + trace_xprtrdma_wc_li(wc, &frwr->fr_cid); + frwr_mr_done(wc, mr); rpcrdma_flush_disconnect(cq->cq_context, wc); } @@ -469,8 +494,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_wake(wc, frwr); - __frwr_release_mr(wc, mr); + trace_xprtrdma_wc_li_wake(wc, &frwr->fr_cid); + frwr_mr_done(wc, mr); complete(&frwr->fr_linv_done); rpcrdma_flush_disconnect(cq->cq_context, wc); @@ -490,6 +515,7 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, 
struct ib_wc *wc) void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, **prev, *last; + struct rpcrdma_ep *ep = r_xprt->rx_ep; const struct ib_send_wr *bad_wr; struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; @@ -509,6 +535,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; + frwr_cid_init(ep, frwr); last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; @@ -534,7 +561,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); + rc = ib_post_send(ep->re_id->qp, first, &bad_wr); /* The final LOCAL_INV WR in the chain is supposed to * do the wake. If it was never posted, the wake will @@ -547,7 +574,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Recycle MRs in the LOCAL_INV chain that did not get posted. 
*/ - trace_xprtrdma_post_linv(req, rc); + trace_xprtrdma_post_linv_err(req, rc); while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); @@ -574,10 +601,10 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_rep *rep = mr->mr_req->rl_reply; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_done(wc, frwr); - __frwr_release_mr(wc, mr); + trace_xprtrdma_wc_li_done(wc, &frwr->fr_cid); + frwr_mr_done(wc, mr); - /* Ensure @rep is generated before __frwr_release_mr */ + /* Ensure @rep is generated before frwr_mr_done */ smp_rmb(); rpcrdma_complete_rqst(rep); @@ -597,6 +624,7 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, *last, **prev; + struct rpcrdma_ep *ep = r_xprt->rx_ep; const struct ib_send_wr *bad_wr; struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; @@ -614,6 +642,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; + frwr_cid_init(ep, frwr); last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; @@ -639,13 +668,13 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); + rc = ib_post_send(ep->re_id->qp, first, &bad_wr); if (!rc) return; /* Recycle MRs in the LOCAL_INV chain that did not get posted. 
*/ - trace_xprtrdma_post_linv(req, rc); + trace_xprtrdma_post_linv_err(req, rc); while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); mr = container_of(frwr, struct rpcrdma_mr, frwr); diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c index 620327c01302..45c5b41ac8dc 100644 --- a/net/sunrpc/xprtrdma/module.c +++ b/net/sunrpc/xprtrdma/module.c @@ -24,6 +24,7 @@ MODULE_DESCRIPTION("RPC/RDMA Transport"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("svcrdma"); MODULE_ALIAS("xprtrdma"); +MODULE_ALIAS("rpcrdma6"); static void __exit rpc_rdma_cleanup(void) { diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 0f5120c7668f..8f5d0cb68360 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (c) 2014-2017 Oracle. All rights reserved. + * Copyright (c) 2014-2020, Oracle and/or its affiliates. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -179,6 +179,31 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, r_xprt->rx_ep->re_max_inline_recv; } +/* ACL likes to be lazy in allocating pages. For TCP, these + * pages can be allocated during receive processing. Not true + * for RDMA, which must always provision receive buffers + * up front. + */ +static noinline int +rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) +{ + struct page **ppages; + int len; + + len = buf->page_len; + ppages = buf->pages + (buf->page_base >> PAGE_SHIFT); + while (len > 0) { + if (!*ppages) + *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + if (!*ppages) + return -ENOBUFS; + ppages++; + len -= PAGE_SIZE; + } + + return 0; +} + /* Split @vec on page boundaries into SGEs. FMR registers pages, not * a byte range. Other modes coalesce these SGEs into a single MR * when they can. 
@@ -233,15 +258,6 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); page_base = offset_in_page(xdrbuf->page_base); while (len) { - /* ACL likes to be lazy in allocating pages - ACLs - * are small by default but can get huge. - */ - if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) { - if (!*ppages) - *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); - if (!*ppages) - return -ENOBUFS; - } seg->mr_page = *ppages; seg->mr_offset = (char *)page_base; seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); @@ -315,7 +331,6 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, *mr = rpcrdma_mr_get(r_xprt); if (!*mr) goto out_getmr_err; - trace_xprtrdma_mr_get(req); (*mr)->mr_req = req; } @@ -323,7 +338,7 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); out_getmr_err: - trace_xprtrdma_nomrs(req); + trace_xprtrdma_nomrs_err(r_xprt, req); xprt_wait_for_buffer_space(&r_xprt->rx_xprt); rpcrdma_mrs_refresh(r_xprt); return ERR_PTR(-EAGAIN); @@ -867,6 +882,12 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) __be32 *p; int ret; + if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) { + ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf); + if (ret) + return ret; + } + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf), rqst); @@ -1322,20 +1343,13 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) break; - dprintk("RPC: %s: server reports " - "version error (%u-%u), xid %08x\n", __func__, - be32_to_cpup(p), be32_to_cpu(*(p + 1)), - be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_vers(rqst, p, p + 1); break; case err_chunk: - dprintk("RPC: %s: server reports " - "header decoding error, xid %08x\n", 
__func__, - be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_chunk(rqst); break; default: - dprintk("RPC: %s: server reports " - "unrecognized error %d, xid %08x\n", __func__, - be32_to_cpup(p), be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_unrecognized(rqst, p); } return -EIO; @@ -1376,7 +1390,7 @@ out: return; out_badheader: - trace_xprtrdma_reply_hdr(rep); + trace_xprtrdma_reply_hdr_err(rep); r_xprt->rx_stats.bad_reply_count++; rqst->rq_task->tk_status = status; status = 0; @@ -1450,14 +1464,12 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) rpcrdma_post_recvs(r_xprt, false); req = rpcr_to_rdmar(rqst); - if (req->rl_reply) { - trace_xprtrdma_leaked_rep(rqst, req->rl_reply); + if (unlikely(req->rl_reply)) rpcrdma_recv_buffer_put(req->rl_reply); - } req->rl_reply = rep; rep->rr_rqst = rqst; - trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); + trace_xprtrdma_reply(rqst->rq_task, rep, credits); if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) frwr_reminv(rep, &req->rl_registered); @@ -1469,16 +1481,16 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) return; out_badversion: - trace_xprtrdma_reply_vers(rep); + trace_xprtrdma_reply_vers_err(rep); goto out; out_norqst: spin_unlock(&xprt->queue_lock); - trace_xprtrdma_reply_rqst(rep); + trace_xprtrdma_reply_rqst_err(rep); goto out; out_shortreply: - trace_xprtrdma_reply_short(rep); + trace_xprtrdma_reply_short_err(rep); out: rpcrdma_recv_buffer_put(rep); diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 526da5d4710b..5bc20e9d09cd 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -62,51 +62,47 @@ static unsigned int max_max_requests = 16384; unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH; static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH; static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH; +static unsigned int svcrdma_stat_unused; +static unsigned int zero; -atomic_t rdma_stat_recv; -atomic_t 
rdma_stat_read; -atomic_t rdma_stat_write; -atomic_t rdma_stat_sq_starve; -atomic_t rdma_stat_rq_starve; -atomic_t rdma_stat_rq_poll; -atomic_t rdma_stat_rq_prod; -atomic_t rdma_stat_sq_poll; -atomic_t rdma_stat_sq_prod; +struct percpu_counter svcrdma_stat_read; +struct percpu_counter svcrdma_stat_recv; +struct percpu_counter svcrdma_stat_sq_starve; +struct percpu_counter svcrdma_stat_write; -/* - * This function implements reading and resetting an atomic_t stat - * variable through read/write to a proc file. Any write to the file - * resets the associated statistic to zero. Any read returns it's - * current value. - */ -static int read_reset_stat(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +enum { + SVCRDMA_COUNTER_BUFSIZ = sizeof(unsigned long long), +}; + +static int svcrdma_counter_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { - atomic_t *stat = (atomic_t *)table->data; - - if (!stat) - return -EINVAL; - - if (write) - atomic_set(stat, 0); - else { - char str_buf[32]; - int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat)); - if (len >= 32) - return -EFAULT; - len = strlen(str_buf); - if (*ppos > len) { - *lenp = 0; - return 0; - } - len -= *ppos; - if (len > *lenp) - len = *lenp; - if (len) - memcpy(buffer, str_buf, len); - *lenp = len; - *ppos += len; + struct percpu_counter *stat = (struct percpu_counter *)table->data; + char tmp[SVCRDMA_COUNTER_BUFSIZ + 1]; + int len; + + if (write) { + percpu_counter_set(stat, 0); + return 0; } + + len = snprintf(tmp, SVCRDMA_COUNTER_BUFSIZ, "%lld\n", + percpu_counter_sum_positive(stat)); + if (len >= SVCRDMA_COUNTER_BUFSIZ) + return -EFAULT; + len = strlen(tmp); + if (*ppos > len) { + *lenp = 0; + return 0; + } + len -= *ppos; + if (len > *lenp) + len = *lenp; + if (len) + memcpy(buffer, tmp, len); + *lenp = len; + *ppos += len; + return 0; } @@ -142,66 +138,76 @@ static struct ctl_table svcrdma_parm_table[] = { { .procname = 
"rdma_stat_read", - .data = &rdma_stat_read, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_read, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_recv", - .data = &rdma_stat_recv, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_recv, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_write", - .data = &rdma_stat_write, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_write, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_sq_starve", - .data = &rdma_stat_sq_starve, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_sq_starve, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_rq_starve", - .data = &rdma_stat_rq_starve, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_rq_poll", - .data = &rdma_stat_rq_poll, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_rq_prod", - .data = &rdma_stat_rq_prod, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_sq_poll", - .data = &rdma_stat_sq_poll, - .maxlen = sizeof(atomic_t), + .data = 
&svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_sq_prod", - .data = &rdma_stat_sq_prod, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { }, }; @@ -224,27 +230,69 @@ static struct ctl_table svcrdma_root_table[] = { { }, }; +static void svc_rdma_proc_cleanup(void) +{ + if (!svcrdma_table_header) + return; + unregister_sysctl_table(svcrdma_table_header); + svcrdma_table_header = NULL; + + percpu_counter_destroy(&svcrdma_stat_write); + percpu_counter_destroy(&svcrdma_stat_sq_starve); + percpu_counter_destroy(&svcrdma_stat_recv); + percpu_counter_destroy(&svcrdma_stat_read); +} + +static int svc_rdma_proc_init(void) +{ + int rc; + + if (svcrdma_table_header) + return 0; + + rc = percpu_counter_init(&svcrdma_stat_read, 0, GFP_KERNEL); + if (rc) + goto out_err; + rc = percpu_counter_init(&svcrdma_stat_recv, 0, GFP_KERNEL); + if (rc) + goto out_err; + rc = percpu_counter_init(&svcrdma_stat_sq_starve, 0, GFP_KERNEL); + if (rc) + goto out_err; + rc = percpu_counter_init(&svcrdma_stat_write, 0, GFP_KERNEL); + if (rc) + goto out_err; + + svcrdma_table_header = register_sysctl_table(svcrdma_root_table); + return 0; + +out_err: + percpu_counter_destroy(&svcrdma_stat_sq_starve); + percpu_counter_destroy(&svcrdma_stat_recv); + percpu_counter_destroy(&svcrdma_stat_read); + return rc; +} + void svc_rdma_cleanup(void) { dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); - if (svcrdma_table_header) { - unregister_sysctl_table(svcrdma_table_header); - svcrdma_table_header = NULL; - } svc_unreg_xprt_class(&svc_rdma_class); + svc_rdma_proc_cleanup(); } int svc_rdma_init(void) { + int rc; + dprintk("SVCRDMA Module Init, register RPC 
RDMA transport\n"); dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); dprintk("\tmax_requests : %u\n", svcrdma_max_requests); dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); - if (!svcrdma_table_header) - svcrdma_table_header = - register_sysctl_table(svcrdma_root_table); + rc = svc_rdma_proc_init(); + if (rc) + return rc; /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 5e7c4ba9e147..63f8be974df2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -74,11 +74,17 @@ out_unlock: */ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst, - struct svc_rdma_send_ctxt *ctxt) + struct svc_rdma_send_ctxt *sctxt) { + struct svc_rdma_recv_ctxt *rctxt; int ret; - ret = svc_rdma_map_reply_msg(rdma, ctxt, NULL, &rqst->rq_snd_buf); + rctxt = svc_rdma_recv_ctxt_get(rdma); + if (!rctxt) + return -EIO; + + ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqst->rq_snd_buf); + svc_rdma_recv_ctxt_put(rdma, rctxt); if (ret < 0) return -EIO; @@ -86,8 +92,8 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, * the rq_buffer before all retransmits are complete. */ get_page(virt_to_page(rqst->rq_buffer)); - ctxt->sc_send_wr.opcode = IB_WR_SEND; - return svc_rdma_send(rdma, ctxt); + sctxt->sc_send_wr.opcode = IB_WR_SEND; + return svc_rdma_send(rdma, sctxt); } /* Server-side transport endpoint wants a whole page for its send diff --git a/net/sunrpc/xprtrdma/svc_rdma_pcl.c b/net/sunrpc/xprtrdma/svc_rdma_pcl.c new file mode 100644 index 000000000000..b63cfeaa2923 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_pcl.c @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Oracle. All rights reserved. 
+ */ + +#include <linux/sunrpc/svc_rdma.h> +#include <linux/sunrpc/rpc_rdma.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + +/** + * pcl_free - Release all memory associated with a parsed chunk list + * @pcl: parsed chunk list + * + */ +void pcl_free(struct svc_rdma_pcl *pcl) +{ + while (!list_empty(&pcl->cl_chunks)) { + struct svc_rdma_chunk *chunk; + + chunk = pcl_first_chunk(pcl); + list_del(&chunk->ch_list); + kfree(chunk); + } +} + +static struct svc_rdma_chunk *pcl_alloc_chunk(u32 segcount, u32 position) +{ + struct svc_rdma_chunk *chunk; + + chunk = kmalloc(struct_size(chunk, ch_segments, segcount), GFP_KERNEL); + if (!chunk) + return NULL; + + chunk->ch_position = position; + chunk->ch_length = 0; + chunk->ch_payload_length = 0; + chunk->ch_segcount = 0; + return chunk; +} + +static struct svc_rdma_chunk * +pcl_lookup_position(struct svc_rdma_pcl *pcl, u32 position) +{ + struct svc_rdma_chunk *pos; + + pcl_for_each_chunk(pos, pcl) { + if (pos->ch_position == position) + return pos; + } + return NULL; +} + +static void pcl_insert_position(struct svc_rdma_pcl *pcl, + struct svc_rdma_chunk *chunk) +{ + struct svc_rdma_chunk *pos; + + pcl_for_each_chunk(pos, pcl) { + if (pos->ch_position > chunk->ch_position) + break; + } + __list_add(&chunk->ch_list, pos->ch_list.prev, &pos->ch_list); + pcl->cl_count++; +} + +static void pcl_set_read_segment(const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_chunk *chunk, + u32 handle, u32 length, u64 offset) +{ + struct svc_rdma_segment *segment; + + segment = &chunk->ch_segments[chunk->ch_segcount]; + segment->rs_handle = handle; + segment->rs_length = length; + segment->rs_offset = offset; + + trace_svcrdma_decode_rseg(&rctxt->rc_cid, chunk, segment); + + chunk->ch_length += length; + chunk->ch_segcount++; +} + +/** + * pcl_alloc_call - Construct a parsed chunk list for the Call body + * @rctxt: Ingress receive context + * @p: Start of an un-decoded Read list + * + * Assumptions: + * - The incoming 
Read list has already been sanity checked. + * - cl_count is already set to the number of segments in + * the un-decoded list. + * - The list might not be in order by position. + * + * Return values: + * %true: Parsed chunk list was successfully constructed, and + * cl_count is updated to be the number of chunks (ie. + * unique positions) in the Read list. + * %false: Memory allocation failed. + */ +bool pcl_alloc_call(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) +{ + struct svc_rdma_pcl *pcl = &rctxt->rc_call_pcl; + unsigned int i, segcount = pcl->cl_count; + + pcl->cl_count = 0; + for (i = 0; i < segcount; i++) { + struct svc_rdma_chunk *chunk; + u32 position, handle, length; + u64 offset; + + p++; /* skip the list discriminator */ + p = xdr_decode_read_segment(p, &position, &handle, + &length, &offset); + if (position != 0) + continue; + + if (pcl_is_empty(pcl)) { + chunk = pcl_alloc_chunk(segcount, position); + if (!chunk) + return false; + pcl_insert_position(pcl, chunk); + } else { + chunk = list_first_entry(&pcl->cl_chunks, + struct svc_rdma_chunk, + ch_list); + } + + pcl_set_read_segment(rctxt, chunk, handle, length, offset); + } + + return true; +} + +/** + * pcl_alloc_read - Construct a parsed chunk list for normal Read chunks + * @rctxt: Ingress receive context + * @p: Start of an un-decoded Read list + * + * Assumptions: + * - The incoming Read list has already been sanity checked. + * - cl_count is already set to the number of segments in + * the un-decoded list. + * - The list might not be in order by position. + * + * Return values: + * %true: Parsed chunk list was successfully constructed, and + * cl_count is updated to be the number of chunks (ie. + * unique position values) in the Read list. + * %false: Memory allocation failed. 
+ * + * TODO: + * - Check for chunk range overlaps + */ +bool pcl_alloc_read(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) +{ + struct svc_rdma_pcl *pcl = &rctxt->rc_read_pcl; + unsigned int i, segcount = pcl->cl_count; + + pcl->cl_count = 0; + for (i = 0; i < segcount; i++) { + struct svc_rdma_chunk *chunk; + u32 position, handle, length; + u64 offset; + + p++; /* skip the list discriminator */ + p = xdr_decode_read_segment(p, &position, &handle, + &length, &offset); + if (position == 0) + continue; + + chunk = pcl_lookup_position(pcl, position); + if (!chunk) { + chunk = pcl_alloc_chunk(segcount, position); + if (!chunk) + return false; + pcl_insert_position(pcl, chunk); + } + + pcl_set_read_segment(rctxt, chunk, handle, length, offset); + } + + return true; +} + +/** + * pcl_alloc_write - Construct a parsed chunk list from a Write list + * @rctxt: Ingress receive context + * @pcl: Parsed chunk list to populate + * @p: Start of an un-decoded Write list + * + * Assumptions: + * - The incoming Write list has already been sanity checked, and + * - cl_count is set to the number of chunks in the un-decoded list. + * + * Return values: + * %true: Parsed chunk list was successfully constructed. + * %false: Memory allocation failed. 
+ */ +bool pcl_alloc_write(struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_pcl *pcl, __be32 *p) +{ + struct svc_rdma_segment *segment; + struct svc_rdma_chunk *chunk; + unsigned int i, j; + u32 segcount; + + for (i = 0; i < pcl->cl_count; i++) { + p++; /* skip the list discriminator */ + segcount = be32_to_cpup(p++); + + chunk = pcl_alloc_chunk(segcount, 0); + if (!chunk) + return false; + list_add_tail(&chunk->ch_list, &pcl->cl_chunks); + + for (j = 0; j < segcount; j++) { + segment = &chunk->ch_segments[j]; + p = xdr_decode_rdma_segment(p, &segment->rs_handle, + &segment->rs_length, + &segment->rs_offset); + trace_svcrdma_decode_wseg(&rctxt->rc_cid, chunk, j); + + chunk->ch_length += segment->rs_length; + chunk->ch_segcount++; + } + } + return true; +} + +static int pcl_process_region(const struct xdr_buf *xdr, + unsigned int offset, unsigned int length, + int (*actor)(const struct xdr_buf *, void *), + void *data) +{ + struct xdr_buf subbuf; + + if (!length) + return 0; + if (xdr_buf_subsegment(xdr, &subbuf, offset, length)) + return -EMSGSIZE; + return actor(&subbuf, data); +} + +/** + * pcl_process_nonpayloads - Process non-payload regions inside @xdr + * @pcl: Chunk list to process + * @xdr: xdr_buf to process + * @actor: Function to invoke on each non-payload region + * @data: Arguments for @actor + * + * This mechanism must ignore not only result payloads that were already + * sent via RDMA Write, but also XDR padding for those payloads that + * the upper layer has added. + * + * Assumptions: + * The xdr->len and ch_position fields are aligned to 4-byte multiples. 
+ * + * Returns: + * On success, zero, + * %-EMSGSIZE on XDR buffer overflow, or + * The return value of @actor + */ +int pcl_process_nonpayloads(const struct svc_rdma_pcl *pcl, + const struct xdr_buf *xdr, + int (*actor)(const struct xdr_buf *, void *), + void *data) +{ + struct svc_rdma_chunk *chunk, *next; + unsigned int start; + int ret; + + chunk = pcl_first_chunk(pcl); + + /* No result payloads were generated */ + if (!chunk || !chunk->ch_payload_length) + return actor(xdr, data); + + /* Process the region before the first result payload */ + ret = pcl_process_region(xdr, 0, chunk->ch_position, actor, data); + if (ret < 0) + return ret; + + /* Process the regions between each middle result payload */ + while ((next = pcl_next_chunk(pcl, chunk))) { + if (!next->ch_payload_length) + break; + + start = pcl_chunk_end_offset(chunk); + ret = pcl_process_region(xdr, start, next->ch_position - start, + actor, data); + if (ret < 0) + return ret; + + chunk = next; + } + + /* Process the region after the last result payload */ + start = pcl_chunk_end_offset(chunk); + ret = pcl_process_region(xdr, start, xdr->len - start, actor, data); + if (ret < 0) + return ret; + + return 0; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index c6ea2903c21a..6d28f23ceb35 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -93,6 +93,7 @@ * (see rdma_read_complete() below). 
*/ +#include <linux/slab.h> #include <linux/spinlock.h> #include <asm/unaligned.h> #include <rdma/ib_verbs.h> @@ -143,6 +144,10 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) goto fail2; svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid); + pcl_init(&ctxt->rc_call_pcl); + pcl_init(&ctxt->rc_read_pcl); + pcl_init(&ctxt->rc_write_pcl); + pcl_init(&ctxt->rc_reply_pcl); ctxt->rc_recv_wr.next = NULL; ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe; @@ -189,8 +194,13 @@ void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma) } } -static struct svc_rdma_recv_ctxt * -svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) +/** + * svc_rdma_recv_ctxt_get - Allocate a recv_ctxt + * @rdma: controlling svcxprt_rdma + * + * Returns a recv_ctxt or (rarely) NULL if none are available. + */ +struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; struct llist_node *node; @@ -202,7 +212,6 @@ svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) out: ctxt->rc_page_count = 0; - ctxt->rc_read_payload_length = 0; return ctxt; out_empty: @@ -226,6 +235,11 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, for (i = 0; i < ctxt->rc_page_count; i++) put_page(ctxt->rc_pages[i]); + pcl_free(&ctxt->rc_call_pcl); + pcl_free(&ctxt->rc_read_pcl); + pcl_free(&ctxt->rc_write_pcl); + pcl_free(&ctxt->rc_reply_pcl); + if (!ctxt->rc_temp) llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); else @@ -252,33 +266,46 @@ void svc_rdma_release_rqst(struct svc_rqst *rqstp) svc_rdma_recv_ctxt_put(rdma, ctxt); } -static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma, - struct svc_rdma_recv_ctxt *ctxt) +static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma, + unsigned int wanted, bool temp) { + const struct ib_recv_wr *bad_wr = NULL; + struct svc_rdma_recv_ctxt *ctxt; + struct ib_recv_wr *recv_chain; int ret; - trace_svcrdma_post_recv(ctxt); - ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL); + recv_chain = NULL; + while (wanted--) { + ctxt = 
svc_rdma_recv_ctxt_get(rdma); + if (!ctxt) + break; + + trace_svcrdma_post_recv(ctxt); + ctxt->rc_temp = temp; + ctxt->rc_recv_wr.next = recv_chain; + recv_chain = &ctxt->rc_recv_wr; + rdma->sc_pending_recvs++; + } + if (!recv_chain) + return false; + + ret = ib_post_recv(rdma->sc_qp, recv_chain, &bad_wr); if (ret) goto err_post; - return 0; + return true; err_post: - trace_svcrdma_rq_post_err(rdma, ret); - svc_rdma_recv_ctxt_put(rdma, ctxt); - return ret; -} - -static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) -{ - struct svc_rdma_recv_ctxt *ctxt; + while (bad_wr) { + ctxt = container_of(bad_wr, struct svc_rdma_recv_ctxt, + rc_recv_wr); + bad_wr = bad_wr->next; + svc_rdma_recv_ctxt_put(rdma, ctxt); + } - if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) - return 0; - ctxt = svc_rdma_recv_ctxt_get(rdma); - if (!ctxt) - return -ENOMEM; - return __svc_rdma_post_recv(rdma, ctxt); + trace_svcrdma_rq_post_err(rdma, ret); + /* Since we're destroying the xprt, no need to reset + * sc_pending_recvs. */ + return false; } /** @@ -289,20 +316,7 @@ static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) */ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) { - struct svc_rdma_recv_ctxt *ctxt; - unsigned int i; - int ret; - - for (i = 0; i < rdma->sc_max_requests; i++) { - ctxt = svc_rdma_recv_ctxt_get(rdma); - if (!ctxt) - return false; - ctxt->rc_temp = true; - ret = __svc_rdma_post_recv(rdma, ctxt); - if (ret) - return false; - } - return true; + return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true); } /** @@ -310,8 +324,6 @@ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) * @cq: Completion Queue context * @wc: Work Completion object * - * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that - * the Receive completion handler could be running. 
*/ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) { @@ -319,6 +331,8 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_recv_ctxt *ctxt; + rdma->sc_pending_recvs--; + /* WARNING: Only wc->wr_cqe and wc->status are reliable */ ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe); @@ -326,14 +340,8 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) if (wc->status != IB_WC_SUCCESS) goto flushed; - if (svc_rdma_post_recv(rdma)) - goto post_err; - /* All wc fields are now known to be valid */ ctxt->rc_byte_len = wc->byte_len; - ib_dma_sync_single_for_cpu(rdma->sc_pd->device, - ctxt->rc_recv_sge.addr, - wc->byte_len, DMA_FROM_DEVICE); spin_lock(&rdma->sc_rq_dto_lock); list_add_tail(&ctxt->rc_list, &rdma->sc_rq_dto_q); @@ -342,11 +350,18 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) spin_unlock(&rdma->sc_rq_dto_lock); if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags)) svc_xprt_enqueue(&rdma->sc_xprt); + + if (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) && + rdma->sc_pending_recvs < rdma->sc_max_requests) + if (!svc_rdma_refresh_recvs(rdma, RPCRDMA_MAX_RECV_BATCH, + false)) + goto post_err; + return; flushed: -post_err: svc_rdma_recv_ctxt_put(rdma, ctxt); +post_err: set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); svc_xprt_enqueue(&rdma->sc_xprt); } @@ -385,100 +400,123 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, arg->len = ctxt->rc_byte_len; } -/* This accommodates the largest possible Write chunk. - */ -#define MAX_BYTES_WRITE_CHUNK ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT)) - -/* This accommodates the largest possible Position-Zero - * Read chunk or Reply chunk. - */ -#define MAX_BYTES_SPECIAL_CHUNK ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT)) - -/* Sanity check the Read list. - * - * Implementation limits: - * - This implementation supports only one Read chunk. 
+/** + * xdr_count_read_segments - Count number of Read segments in Read list + * @rctxt: Ingress receive context + * @p: Start of an un-decoded Read list * - * Sanity checks: - * - Read list does not overflow Receive buffer. - * - Segment size limited by largest NFS data payload. + * Before allocating anything, ensure the ingress Read list is safe + * to use. * - * The segment count is limited to how many segments can - * fit in the transport header without overflowing the - * buffer. That's about 40 Read segments for a 1KB inline - * threshold. + * The segment count is limited to how many segments can fit in the + * transport header without overflowing the buffer. That's about 40 + * Read segments for a 1KB inline threshold. * * Return values: - * %true: Read list is valid. @rctxt's xdr_stream is updated - * to point to the first byte past the Read list. - * %false: Read list is corrupt. @rctxt's xdr_stream is left - * in an unknown state. + * %true: Read list is valid. @rctxt's xdr_stream is updated to point + * to the first byte past the Read list. rc_read_pcl and + * rc_call_pcl cl_count fields are set to the number of + * Read segments in the list. + * %false: Read list is corrupt. @rctxt's xdr_stream is left in an + * unknown state. 
*/ -static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt) +static bool xdr_count_read_segments(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) { - u32 position, len; - bool first; - __be32 *p; - - p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); - if (!p) - return false; - - len = 0; - first = true; + rctxt->rc_call_pcl.cl_count = 0; + rctxt->rc_read_pcl.cl_count = 0; while (xdr_item_is_present(p)) { + u32 position, handle, length; + u64 offset; + p = xdr_inline_decode(&rctxt->rc_stream, rpcrdma_readseg_maxsz * sizeof(*p)); if (!p) return false; - if (first) { - position = be32_to_cpup(p); - first = false; - } else if (be32_to_cpup(p) != position) { - return false; + xdr_decode_read_segment(p, &position, &handle, + &length, &offset); + if (position) { + if (position & 3) + return false; + ++rctxt->rc_read_pcl.cl_count; + } else { + ++rctxt->rc_call_pcl.cl_count; } - p += 2; - len += be32_to_cpup(p); p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); if (!p) return false; } - return len <= MAX_BYTES_SPECIAL_CHUNK; + return true; } -/* The segment count is limited to how many segments can - * fit in the transport header without overflowing the - * buffer. That's about 60 Write segments for a 1KB inline - * threshold. +/* Sanity check the Read list. + * + * Sanity checks: + * - Read list does not overflow Receive buffer. + * - Chunk size limited by largest NFS data payload. + * + * Return values: + * %true: Read list is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Read list. + * %false: Read list is corrupt. @rctxt's xdr_stream is left + * in an unknown state. 
*/ -static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen) +static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt) { - u32 i, segcount, total; __be32 *p; p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); if (!p) return false; - segcount = be32_to_cpup(p); + if (!xdr_count_read_segments(rctxt, p)) + return false; + if (!pcl_alloc_call(rctxt, p)) + return false; + return pcl_alloc_read(rctxt, p); +} - total = 0; - for (i = 0; i < segcount; i++) { - u32 handle, length; - u64 offset; +static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt) +{ + u32 segcount; + __be32 *p; - p = xdr_inline_decode(&rctxt->rc_stream, - rpcrdma_segment_maxsz * sizeof(*p)); - if (!p) - return false; + if (xdr_stream_decode_u32(&rctxt->rc_stream, &segcount)) + return false; - xdr_decode_rdma_segment(p, &handle, &length, &offset); - trace_svcrdma_decode_wseg(handle, length, offset); + /* A bogus segcount causes this buffer overflow check to fail. */ + p = xdr_inline_decode(&rctxt->rc_stream, + segcount * rpcrdma_segment_maxsz * sizeof(*p)); + return p != NULL; +} - total += length; +/** + * xdr_count_write_chunks - Count number of Write chunks in Write list + * @rctxt: Received header and decoding state + * @p: start of an un-decoded Write list + * + * Before allocating anything, ensure the ingress Write list is + * safe to use. + * + * Return values: + * %true: Write list is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Write list, and + * the number of Write chunks is in rc_write_pcl.cl_count. + * %false: Write list is corrupt. @rctxt's xdr_stream is left + * in an indeterminate state. 
+ */ +static bool xdr_count_write_chunks(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) +{ + rctxt->rc_write_pcl.cl_count = 0; + while (xdr_item_is_present(p)) { + if (!xdr_check_write_chunk(rctxt)) + return false; + ++rctxt->rc_write_pcl.cl_count; + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; } - return total <= maxlen; + return true; } /* Sanity check the Write list. @@ -498,24 +536,18 @@ static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen) */ static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt) { - u32 chcount = 0; __be32 *p; p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); if (!p) return false; - rctxt->rc_write_list = p; - while (xdr_item_is_present(p)) { - if (!xdr_check_write_chunk(rctxt, MAX_BYTES_WRITE_CHUNK)) - return false; - ++chcount; - p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); - if (!p) - return false; - } - if (!chcount) - rctxt->rc_write_list = NULL; - return chcount < 2; + if (!xdr_count_write_chunks(rctxt, p)) + return false; + if (!pcl_alloc_write(rctxt, &rctxt->rc_write_pcl, p)) + return false; + + rctxt->rc_cur_result_payload = pcl_first_chunk(&rctxt->rc_write_pcl); + return true; } /* Sanity check the Reply chunk. @@ -537,13 +569,14 @@ static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt) p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); if (!p) return false; - rctxt->rc_reply_chunk = NULL; - if (xdr_item_is_present(p)) { - if (!xdr_check_write_chunk(rctxt, MAX_BYTES_SPECIAL_CHUNK)) - return false; - rctxt->rc_reply_chunk = p; - } - return true; + + if (!xdr_item_is_present(p)) + return true; + if (!xdr_check_write_chunk(rctxt)) + return false; + + rctxt->rc_reply_pcl.cl_count = 1; + return pcl_alloc_write(rctxt, &rctxt->rc_reply_pcl, p); } /* RPC-over-RDMA Version One private extension: Remote Invalidation. 
@@ -552,60 +585,53 @@ static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt) * * If there is exactly one distinct R_key in the received transport * header, set rc_inv_rkey to that R_key. Otherwise, set it to zero. - * - * Perform this operation while the received transport header is - * still in the CPU cache. */ static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *ctxt) { - __be32 inv_rkey, *p; - u32 i, segcount; + struct svc_rdma_segment *segment; + struct svc_rdma_chunk *chunk; + u32 inv_rkey; ctxt->rc_inv_rkey = 0; if (!rdma->sc_snd_w_inv) return; - inv_rkey = xdr_zero; - p = ctxt->rc_recv_buf; - p += rpcrdma_fixed_maxsz; - - /* Read list */ - while (xdr_item_is_present(p++)) { - p++; /* position */ - if (inv_rkey == xdr_zero) - inv_rkey = *p; - else if (inv_rkey != *p) - return; - p += 4; + inv_rkey = 0; + pcl_for_each_chunk(chunk, &ctxt->rc_call_pcl) { + pcl_for_each_segment(segment, chunk) { + if (inv_rkey == 0) + inv_rkey = segment->rs_handle; + else if (inv_rkey != segment->rs_handle) + return; + } } - - /* Write list */ - while (xdr_item_is_present(p++)) { - segcount = be32_to_cpup(p++); - for (i = 0; i < segcount; i++) { - if (inv_rkey == xdr_zero) - inv_rkey = *p; - else if (inv_rkey != *p) + pcl_for_each_chunk(chunk, &ctxt->rc_read_pcl) { + pcl_for_each_segment(segment, chunk) { + if (inv_rkey == 0) + inv_rkey = segment->rs_handle; + else if (inv_rkey != segment->rs_handle) return; - p += 4; } } - - /* Reply chunk */ - if (xdr_item_is_present(p++)) { - segcount = be32_to_cpup(p++); - for (i = 0; i < segcount; i++) { - if (inv_rkey == xdr_zero) - inv_rkey = *p; - else if (inv_rkey != *p) + pcl_for_each_chunk(chunk, &ctxt->rc_write_pcl) { + pcl_for_each_segment(segment, chunk) { + if (inv_rkey == 0) + inv_rkey = segment->rs_handle; + else if (inv_rkey != segment->rs_handle) return; - p += 4; } } - - ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey); + pcl_for_each_chunk(chunk, &ctxt->rc_reply_pcl) { + 
pcl_for_each_segment(segment, chunk) { + if (inv_rkey == 0) + inv_rkey = segment->rs_handle; + else if (inv_rkey != segment->rs_handle) + return; + } + } + ctxt->rc_inv_rkey = inv_rkey; } /** @@ -641,7 +667,8 @@ static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg, if (*p != rpcrdma_version) goto out_version; p += 2; - switch (*p) { + rctxt->rc_msgtype = *p; + switch (rctxt->rc_msgtype) { case rdma_msg: break; case rdma_nomsg: @@ -735,30 +762,28 @@ static void svc_rdma_send_error(struct svcxprt_rdma *rdma, * the RPC/RDMA header small and fixed in size, so it is * straightforward to check the RPC header's direction field. */ -static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, - __be32 *rdma_resp) +static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt, + struct svc_rdma_recv_ctxt *rctxt) { - __be32 *p; + __be32 *p = rctxt->rc_recv_buf; if (!xprt->xpt_bc_xprt) return false; - p = rdma_resp + 3; - if (*p++ != rdma_msg) + if (rctxt->rc_msgtype != rdma_msg) return false; - if (*p++ != xdr_zero) + if (!pcl_is_empty(&rctxt->rc_call_pcl)) return false; - if (*p++ != xdr_zero) + if (!pcl_is_empty(&rctxt->rc_read_pcl)) return false; - if (*p++ != xdr_zero) + if (!pcl_is_empty(&rctxt->rc_write_pcl)) return false; - - /* XID sanity */ - if (*p++ != *rdma_resp) + if (!pcl_is_empty(&rctxt->rc_reply_pcl)) return false; - /* call direction */ - if (*p == cpu_to_be32(RPC_CALL)) + + /* RPC call direction */ + if (*(p + 8) == cpu_to_be32(RPC_CALL)) return false; return true; @@ -800,7 +825,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svcxprt_rdma *rdma_xprt = container_of(xprt, struct svcxprt_rdma, sc_xprt); struct svc_rdma_recv_ctxt *ctxt; - __be32 *p; int ret; rqstp->rq_xprt_ctxt = NULL; @@ -822,9 +846,11 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) } list_del(&ctxt->rc_list); spin_unlock(&rdma_xprt->sc_rq_dto_lock); + percpu_counter_inc(&svcrdma_stat_recv); - atomic_inc(&rdma_stat_recv); - + 
ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device, + ctxt->rc_recv_sge.addr, ctxt->rc_byte_len, + DMA_FROM_DEVICE); svc_rdma_build_arg_xdr(rqstp, ctxt); /* Prevent svc_xprt_release from releasing pages in rq_pages @@ -833,7 +859,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) rqstp->rq_respages = rqstp->rq_pages; rqstp->rq_next_page = rqstp->rq_respages; - p = (__be32 *)rqstp->rq_arg.head[0].iov_base; ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt); if (ret < 0) goto out_err; @@ -841,14 +866,14 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto out_drop; rqstp->rq_xprt_hlen = ret; - if (svc_rdma_is_backchannel_reply(xprt, p)) + if (svc_rdma_is_reverse_direction_reply(xprt, ctxt)) goto out_backchannel; svc_rdma_get_inv_rkey(rdma_xprt, ctxt); - p += rpcrdma_fixed_maxsz; - if (*p != xdr_zero) - goto out_readchunk; + if (!pcl_is_empty(&ctxt->rc_read_pcl) || + !pcl_is_empty(&ctxt->rc_call_pcl)) + goto out_readlist; complete: rqstp->rq_xprt_ctxt = ctxt; @@ -856,10 +881,10 @@ complete: svc_xprt_copy_addrs(rqstp, xprt); return rqstp->rq_arg.len; -out_readchunk: - ret = svc_rdma_recv_read_chunk(rdma_xprt, rqstp, ctxt, p); +out_readlist: + ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt); if (ret < 0) - goto out_postfail; + goto out_readfail; return 0; out_err: @@ -867,7 +892,7 @@ out_err: svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; -out_postfail: +out_readfail: if (ret == -EINVAL) svc_rdma_send_error(rdma_xprt, ctxt, ret); svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 80a0c0e87590..693d139a8633 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -190,14 +190,14 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, * - Stores arguments for the SGL constructor functions */ struct svc_rdma_write_info { + const struct svc_rdma_chunk *wi_chunk; + /* write state of this chunk */ unsigned int wi_seg_off; unsigned int 
wi_seg_no; - unsigned int wi_nsegs; - __be32 *wi_segs; /* SGL constructor arguments */ - struct xdr_buf *wi_xdr; + const struct xdr_buf *wi_xdr; unsigned char *wi_base; unsigned int wi_next_off; @@ -205,7 +205,8 @@ struct svc_rdma_write_info { }; static struct svc_rdma_write_info * -svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) +svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, + const struct svc_rdma_chunk *chunk) { struct svc_rdma_write_info *info; @@ -213,10 +214,9 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) if (!info) return info; + info->wi_chunk = chunk; info->wi_seg_off = 0; info->wi_seg_no = 0; - info->wi_nsegs = be32_to_cpup(++chunk); - info->wi_segs = ++chunk; svc_rdma_cc_init(rdma, &info->wi_cc); info->wi_cc.cc_cqe.done = svc_rdma_write_done; return info; @@ -258,11 +258,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) /* State for pulling a Read chunk. */ struct svc_rdma_read_info { + struct svc_rqst *ri_rqst; struct svc_rdma_recv_ctxt *ri_readctxt; - unsigned int ri_position; unsigned int ri_pageno; unsigned int ri_pageoff; - unsigned int ri_chunklen; + unsigned int ri_totalbytes; struct svc_rdma_chunk_ctxt ri_cc; }; @@ -358,13 +358,13 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) do { if (atomic_sub_return(cc->cc_sqecount, &rdma->sc_sq_avail) > 0) { - trace_svcrdma_post_chunk(&cc->cc_cid, cc->cc_sqecount); ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); if (ret) break; return 0; } + percpu_counter_inc(&svcrdma_stat_sq_starve); trace_svcrdma_sq_full(rdma); atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); wait_event(rdma->sc_send_wait, @@ -405,7 +405,7 @@ static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, struct svc_rdma_rw_ctxt *ctxt) { unsigned int sge_no, sge_bytes, page_off, page_no; - struct xdr_buf *xdr = info->wi_xdr; + const struct xdr_buf *xdr = info->wi_xdr; struct scatterlist *sg; struct page **page; @@ -443,40 +443,37 
@@ svc_rdma_build_writes(struct svc_rdma_write_info *info, { struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; struct svcxprt_rdma *rdma = cc->cc_rdma; + const struct svc_rdma_segment *seg; struct svc_rdma_rw_ctxt *ctxt; - __be32 *seg; int ret; - seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz; do { unsigned int write_len; - u32 handle, length; u64 offset; - if (info->wi_seg_no >= info->wi_nsegs) + /* Bounds-check before indexing: &array[i] is never NULL, so a NULL test cannot detect running past the last segment */ + if (info->wi_seg_no >= info->wi_chunk->ch_segcount) goto out_overflow; - xdr_decode_rdma_segment(seg, &handle, &length, &offset); - offset += info->wi_seg_off; - - write_len = min(remaining, length - info->wi_seg_off); + seg = &info->wi_chunk->ch_segments[info->wi_seg_no]; + write_len = min(remaining, seg->rs_length - info->wi_seg_off); + if (!write_len) + goto out_overflow; ctxt = svc_rdma_get_rw_ctxt(rdma, (write_len >> PAGE_SHIFT) + 2); if (!ctxt) return -ENOMEM; constructor(info, write_len, ctxt); - ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, handle, + offset = seg->rs_offset + info->wi_seg_off; + ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle, DMA_TO_DEVICE); if (ret < 0) return -EIO; - - trace_svcrdma_send_wseg(handle, write_len, offset); + percpu_counter_inc(&svcrdma_stat_write); list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; - if (write_len == length - info->wi_seg_off) { - seg += 4; + if (write_len == seg->rs_length - info->wi_seg_off) { info->wi_seg_no++; info->wi_seg_off = 0; } else { @@ -489,31 +486,46 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, out_overflow: trace_svcrdma_small_wrch_err(rdma, remaining, info->wi_seg_no, - info->wi_nsegs); + info->wi_chunk->ch_segcount); return -E2BIG; } -/* Send one of an xdr_buf's kvecs by itself. To send a Reply - * chunk, the whole RPC Reply is written back to the client. - * This function writes either the head or tail of the xdr_buf - * containing the Reply.
+/** + * svc_rdma_iov_write - Construct RDMA Writes from an iov + * @info: pointer to write arguments + * @iov: kvec to write + * + * Returns: + * On success, returns zero + * %-E2BIG if the client-provided Write chunk is too small + * %-ENOMEM if a resource has been exhausted + * %-EIO if an rdma-rw error occurred */ -static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info, - struct kvec *vec) +static int svc_rdma_iov_write(struct svc_rdma_write_info *info, + const struct kvec *iov) { - info->wi_base = vec->iov_base; + info->wi_base = iov->iov_base; return svc_rdma_build_writes(info, svc_rdma_vec_to_sg, - vec->iov_len); + iov->iov_len); } -/* Send an xdr_buf's page list by itself. A Write chunk is just - * the page list. A Reply chunk is @xdr's head, page list, and - * tail. This function is shared between the two types of chunk. +/** + * svc_rdma_pages_write - Construct RDMA Writes from pages + * @info: pointer to write arguments + * @xdr: xdr_buf with pages to write + * @offset: offset into the content of @xdr + * @length: number of bytes to write + * + * Returns: + * On success, returns zero + * %-E2BIG if the client-provided Write chunk is too small + * %-ENOMEM if a resource has been exhausted + * %-EIO if an rdma-rw error occurred */ -static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, - struct xdr_buf *xdr, - unsigned int offset, - unsigned long length) +static int svc_rdma_pages_write(struct svc_rdma_write_info *info, + const struct xdr_buf *xdr, + unsigned int offset, + unsigned long length) { info->wi_xdr = xdr; info->wi_next_off = offset - xdr->head[0].iov_len; @@ -522,12 +534,48 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, } /** + * svc_rdma_xb_write - Construct RDMA Writes to write an xdr_buf + * @xdr: xdr_buf to write + * @data: pointer to write arguments + * + * Returns: + * On success, returns the length of @xdr in bytes (xdr->len) + * %-E2BIG if the client-provided Write chunk is too small + * %-ENOMEM if a resource has
been exhausted + * %-EIO if an rdma-rw error occurred + */ +static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data) +{ + struct svc_rdma_write_info *info = data; + int ret; + + if (xdr->head[0].iov_len) { + ret = svc_rdma_iov_write(info, &xdr->head[0]); + if (ret < 0) + return ret; + } + + if (xdr->page_len) { + ret = svc_rdma_pages_write(info, xdr, xdr->head[0].iov_len, + xdr->page_len); + if (ret < 0) + return ret; + } + + if (xdr->tail[0].iov_len) { + ret = svc_rdma_iov_write(info, &xdr->tail[0]); + if (ret < 0) + return ret; + } + + return xdr->len; +} + +/** * svc_rdma_send_write_chunk - Write all segments in a Write chunk * @rdma: controlling RDMA transport - * @wr_ch: Write chunk provided by client + * @chunk: Write chunk provided by the client * @xdr: xdr_buf containing the data payload - * @offset: payload's byte offset in @xdr - * @length: size of payload, in bytes * * Returns a non-negative number of bytes the chunk consumed, or * %-E2BIG if the payload was larger than the Write chunk, @@ -536,30 +584,28 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, * %-ENOTCONN if posting failed (connection is lost), * %-EIO if rdma_rw initialization failed (DMA mapping, etc). 
*/ -int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, - struct xdr_buf *xdr, - unsigned int offset, unsigned long length) +int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; + struct svc_rdma_chunk_ctxt *cc; int ret; - if (!length) - return 0; - - info = svc_rdma_write_info_alloc(rdma, wr_ch); + info = svc_rdma_write_info_alloc(rdma, chunk); if (!info) return -ENOMEM; + cc = &info->wi_cc; - ret = svc_rdma_send_xdr_pagelist(info, xdr, offset, length); - if (ret < 0) + ret = svc_rdma_xb_write(xdr, info); + if (ret != xdr->len) goto out_err; - ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); + trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); + ret = svc_rdma_post_chunk_ctxt(cc); if (ret < 0) goto out_err; - - trace_svcrdma_send_write_chunk(xdr->page_len); - return length; + return xdr->len; out_err: svc_rdma_write_info_free(info); @@ -581,62 +627,62 @@ out_err: */ int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_recv_ctxt *rctxt, - struct xdr_buf *xdr) + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; - int consumed, ret; + struct svc_rdma_chunk_ctxt *cc; + struct svc_rdma_chunk *chunk; + int ret; - info = svc_rdma_write_info_alloc(rdma, rctxt->rc_reply_chunk); + if (pcl_is_empty(&rctxt->rc_reply_pcl)) + return 0; + + chunk = pcl_first_chunk(&rctxt->rc_reply_pcl); + info = svc_rdma_write_info_alloc(rdma, chunk); if (!info) return -ENOMEM; + cc = &info->wi_cc; - ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]); + ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_write, info); if (ret < 0) goto out_err; - consumed = xdr->head[0].iov_len; - - /* Send the page list in the Reply chunk only if the - * client did not provide Write chunks. 
- */ - if (!rctxt->rc_write_list && xdr->page_len) { - ret = svc_rdma_send_xdr_pagelist(info, xdr, - xdr->head[0].iov_len, - xdr->page_len); - if (ret < 0) - goto out_err; - consumed += xdr->page_len; - } - - if (xdr->tail[0].iov_len) { - ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]); - if (ret < 0) - goto out_err; - consumed += xdr->tail[0].iov_len; - } - ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); + trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); + ret = svc_rdma_post_chunk_ctxt(cc); if (ret < 0) goto out_err; - trace_svcrdma_send_reply_chunk(consumed); - return consumed; + return xdr->len; out_err: svc_rdma_write_info_free(info); return ret; } +/** + * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment + * @info: context for ongoing I/O + * @segment: co-ordinates of remote memory to be read + * + * Returns: + * %0: the Read WR chain was constructed successfully + * %-EINVAL: there were not enough rq_pages to finish + * %-ENOMEM: allocating a local resources failed + * %-EIO: a DMA mapping error occurred + */ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, - struct svc_rqst *rqstp, - u32 rkey, u32 len, u64 offset) + const struct svc_rdma_segment *segment) { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; struct svc_rdma_chunk_ctxt *cc = &info->ri_cc; + struct svc_rqst *rqstp = info->ri_rqst; struct svc_rdma_rw_ctxt *ctxt; - unsigned int sge_no, seg_len; + unsigned int sge_no, seg_len, len; struct scatterlist *sg; int ret; + len = segment->rs_length; sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT; ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no); if (!ctxt) @@ -670,10 +716,11 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, goto out_overrun; } - ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, offset, rkey, - DMA_FROM_DEVICE); + ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, segment->rs_offset, + segment->rs_handle, DMA_FROM_DEVICE); if (ret < 0) return -EIO; 
+ percpu_counter_inc(&svcrdma_stat_read); list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; @@ -684,54 +731,177 @@ out_overrun: return -EINVAL; } -/* Walk the segments in the Read chunk starting at @p and construct - * RDMA Read operations to pull the chunk to the server. +/** + * svc_rdma_build_read_chunk - Build RDMA Read WQEs to pull one RDMA chunk + * @info: context for ongoing I/O + * @chunk: Read chunk to pull + * + * Return values: + * %0: the Read WR chain was constructed successfully + * %-EINVAL: there were not enough resources to finish + * %-ENOMEM: allocating a local resources failed + * %-EIO: a DMA mapping error occurred */ -static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, - struct svc_rdma_read_info *info, - __be32 *p) +static int svc_rdma_build_read_chunk(struct svc_rdma_read_info *info, + const struct svc_rdma_chunk *chunk) { + const struct svc_rdma_segment *segment; int ret; ret = -EINVAL; - info->ri_chunklen = 0; - while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) { - u32 handle, length; - u64 offset; + pcl_for_each_segment(segment, chunk) { + ret = svc_rdma_build_read_segment(info, segment); + if (ret < 0) + break; + info->ri_totalbytes += segment->rs_length; + } + return ret; +} + +/** + * svc_rdma_copy_inline_range - Copy part of the inline content into pages + * @info: context for RDMA Reads + * @offset: offset into the Receive buffer of region to copy + * @remaining: length of region to copy + * + * Take a page at a time from rqstp->rq_pages and copy the inline + * content from the Receive buffer into that page. Update + * info->ri_pageno and info->ri_pageoff so that the next RDMA Read + * result will land contiguously with the copied content. 
+ * + * Return values: + * %0: Inline content was successfully copied + * %-EINVAL: offset or length was incorrect (NOTE(review): not currently validated here) + */ +static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info, + unsigned int offset, + unsigned int remaining) +{ + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + unsigned char *dst, *src = head->rc_recv_buf; + struct svc_rqst *rqstp = info->ri_rqst; + unsigned int page_no, numpages; + + numpages = PAGE_ALIGN(info->ri_pageoff + remaining) >> PAGE_SHIFT; + for (page_no = 0; page_no < numpages; page_no++) { + unsigned int page_len; + + page_len = min_t(unsigned int, remaining, + PAGE_SIZE - info->ri_pageoff); + + head->rc_arg.pages[info->ri_pageno] = + rqstp->rq_pages[info->ri_pageno]; + if (!info->ri_pageoff) + head->rc_page_count++; + + /* Append at the current fill offset within the sink page, not at the page index */ + dst = page_address(head->rc_arg.pages[info->ri_pageno]); + memcpy(dst + info->ri_pageoff, src + offset, page_len); + + info->ri_totalbytes += page_len; + info->ri_pageoff += page_len; + if (info->ri_pageoff == PAGE_SIZE) { + info->ri_pageno++; + info->ri_pageoff = 0; + } + remaining -= page_len; + offset += page_len; + } + + /* Whole range copied; callers treat only negative values as failure */ + return 0; +} + +/** + * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks + * @info: context for RDMA Reads + * + * The chunk data lands in head->rc_arg as a series of contiguous pages, + * like an incoming TCP call. + * + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: client provided too many chunks or segments, + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
+ */ +static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *info) +{ + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + const struct svc_rdma_pcl *pcl = &head->rc_read_pcl; + struct svc_rdma_chunk *chunk, *next; + struct xdr_buf *buf = &head->rc_arg; + unsigned int start, length; + int ret; - p = xdr_decode_rdma_segment(p, &handle, &length, &offset); - ret = svc_rdma_build_read_segment(info, rqstp, handle, length, - offset); + start = 0; + chunk = pcl_first_chunk(pcl); + length = chunk->ch_position; + ret = svc_rdma_copy_inline_range(info, start, length); + if (ret < 0) + return ret; + + pcl_for_each_chunk(chunk, pcl) { + ret = svc_rdma_build_read_chunk(info, chunk); if (ret < 0) + return ret; + + next = pcl_next_chunk(pcl, chunk); + if (!next) break; - trace_svcrdma_send_rseg(handle, length, offset); - info->ri_chunklen += length; + start += length; + length = next->ch_position - info->ri_totalbytes; + ret = svc_rdma_copy_inline_range(info, start, length); + if (ret < 0) + return ret; } - return ret; + start += length; + length = head->rc_byte_len - start; + ret = svc_rdma_copy_inline_range(info, start, length); + if (ret < 0) + return ret; + + buf->len += info->ri_totalbytes; + buf->buflen += info->ri_totalbytes; + + head->rc_hdr_count = 1; + buf->head[0].iov_base = page_address(head->rc_pages[0]); + buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes); + buf->page_len = info->ri_totalbytes - buf->head[0].iov_len; + return 0; } -/* Construct RDMA Reads to pull over a normal Read chunk. The chunk - * data lands in the page list of head->rc_arg.pages. +/** + * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks + * @info: context for RDMA Reads + * + * The chunk data lands in the page list of head->rc_arg.pages. * * Currently NFSD does not look at the head->rc_arg.tail[0] iovec. 
* Therefore, XDR round-up of the Read chunk and trailing * inline content must both be added at the end of the pagelist. + * + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: client provided too many chunks or segments, + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, - struct svc_rdma_read_info *info, - __be32 *p) +static int svc_rdma_read_data_item(struct svc_rdma_read_info *info) { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + struct xdr_buf *buf = &head->rc_arg; + struct svc_rdma_chunk *chunk; + unsigned int length; int ret; - ret = svc_rdma_build_read_chunk(rqstp, info, p); + chunk = pcl_first_chunk(&head->rc_read_pcl); + ret = svc_rdma_build_read_chunk(info, chunk); if (ret < 0) goto out; - trace_svcrdma_send_read_chunk(info->ri_chunklen, info->ri_position); - head->rc_hdr_count = 0; /* Split the Receive buffer between the head and tail @@ -739,11 +909,9 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, * chunk is not included in either the pagelist or in * the tail. */ - head->rc_arg.tail[0].iov_base = - head->rc_arg.head[0].iov_base + info->ri_position; - head->rc_arg.tail[0].iov_len = - head->rc_arg.head[0].iov_len - info->ri_position; - head->rc_arg.head[0].iov_len = info->ri_position; + buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position; + buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position; + buf->head[0].iov_len = chunk->ch_position; /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2). * @@ -754,50 +922,149 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, * Currently these chunks always start at page offset 0, * thus the rounded-up length never crosses a page boundary. 
*/ - info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2; - - head->rc_arg.page_len = info->ri_chunklen; - head->rc_arg.len += info->ri_chunklen; - head->rc_arg.buflen += info->ri_chunklen; + length = XDR_QUADLEN(info->ri_totalbytes) << 2; + buf->page_len = length; + buf->len += length; + buf->buflen += length; out: return ret; } -/* Construct RDMA Reads to pull over a Position Zero Read chunk. - * The start of the data lands in the first page just after - * the Transport header, and the rest lands in the page list of +/** + * svc_rdma_read_chunk_range - Build RDMA Read WQEs for portion of a chunk + * @info: context for RDMA Reads + * @chunk: parsed Call chunk to pull + * @offset: offset of region to pull + * @length: length of region to pull + * + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: there were not enough resources to finish + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). 
+ */ +static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info, + const struct svc_rdma_chunk *chunk, + unsigned int offset, unsigned int length) +{ + const struct svc_rdma_segment *segment; + int ret; + + ret = -EINVAL; + pcl_for_each_segment(segment, chunk) { + struct svc_rdma_segment dummy; + + /* Skip segments that lie entirely before the requested offset */ + if (offset >= segment->rs_length) { + offset -= segment->rs_length; + continue; + } + + /* Clamp to what is left of this segment past @offset; subtracting @offset after the min could underflow */ + dummy.rs_handle = segment->rs_handle; + dummy.rs_length = min_t(u32, length, segment->rs_length - offset); + dummy.rs_offset = segment->rs_offset + offset; + + ret = svc_rdma_build_read_segment(info, &dummy); + if (ret < 0) + break; + + info->ri_totalbytes += dummy.rs_length; + length -= dummy.rs_length; + offset = 0; + /* Requested range is satisfied; do not build zero-length Reads for the remaining segments */ + if (!length) + break; + } + return ret; +} + +/** + * svc_rdma_read_call_chunk - Build RDMA Read WQEs to pull a Long Message + * @info: context for RDMA Reads + * + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: there were not enough resources to finish + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
+ */ +static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) +{ + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + const struct svc_rdma_chunk *call_chunk = + pcl_first_chunk(&head->rc_call_pcl); + const struct svc_rdma_pcl *pcl = &head->rc_read_pcl; + struct svc_rdma_chunk *chunk, *next; + unsigned int start, length; + int ret; + + if (pcl_is_empty(pcl)) + return svc_rdma_build_read_chunk(info, call_chunk); + + start = 0; + chunk = pcl_first_chunk(pcl); + length = chunk->ch_position; + ret = svc_rdma_read_chunk_range(info, call_chunk, start, length); + if (ret < 0) + return ret; + + pcl_for_each_chunk(chunk, pcl) { + ret = svc_rdma_build_read_chunk(info, chunk); + if (ret < 0) + return ret; + + next = pcl_next_chunk(pcl, chunk); + if (!next) + break; + + start += length; + length = next->ch_position - info->ri_totalbytes; + ret = svc_rdma_read_chunk_range(info, call_chunk, + start, length); + if (ret < 0) + return ret; + } + + start += length; + length = call_chunk->ch_length - start; + return svc_rdma_read_chunk_range(info, call_chunk, start, length); +} + +/** + * svc_rdma_read_special - Build RDMA Read WQEs to pull a Long Message + * @info: context for RDMA Reads + * + * The start of the data lands in the first page just after the + * Transport header, and the rest lands in the page list of * head->rc_arg.pages. * * Assumptions: - * - A PZRC has an XDR-aligned length (no implicit round-up). - * - There can be no trailing inline content (IOW, we assume - * a PZRC is never sent in an RDMA_MSG message, though it's - * allowed by spec). + * - A PZRC is never sent in an RDMA_MSG message, though it's + * allowed by spec. + * + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: client provided too many chunks or segments, + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). 
*/ -static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, - struct svc_rdma_read_info *info, - __be32 *p) +static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info) { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + struct xdr_buf *buf = &head->rc_arg; int ret; - ret = svc_rdma_build_read_chunk(rqstp, info, p); + ret = svc_rdma_read_call_chunk(info); if (ret < 0) goto out; - trace_svcrdma_send_pzr(info->ri_chunklen); - - head->rc_arg.len += info->ri_chunklen; - head->rc_arg.buflen += info->ri_chunklen; + buf->len += info->ri_totalbytes; + buf->buflen += info->ri_totalbytes; head->rc_hdr_count = 1; - head->rc_arg.head[0].iov_base = page_address(head->rc_pages[0]); - head->rc_arg.head[0].iov_len = min_t(size_t, PAGE_SIZE, - info->ri_chunklen); - - head->rc_arg.page_len = info->ri_chunklen - - head->rc_arg.head[0].iov_len; + buf->head[0].iov_base = page_address(head->rc_pages[0]); + buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes); + buf->page_len = info->ri_totalbytes - buf->head[0].iov_len; out: return ret; @@ -824,26 +1091,34 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, } /** - * svc_rdma_recv_read_chunk - Pull a Read chunk from the client + * svc_rdma_process_read_list - Pull list of Read chunks from the client * @rdma: controlling RDMA transport * @rqstp: set of pages to use as Read sink buffers * @head: pages under I/O collect here - * @p: pointer to start of Read chunk * - * Returns: - * %0 if all needed RDMA Reads were posted successfully, - * %-EINVAL if client provided too many segments, - * %-ENOMEM if rdma_rw context pool was exhausted, - * %-ENOTCONN if posting failed (connection is lost), - * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + * The RPC/RDMA protocol assumes that the upper layer's XDR decoders + * pull each Read chunk as they decode an incoming RPC message. * - * Assumptions: - * - All Read segments in @p have the same Position value. 
+ * On Linux, however, the server needs to have a fully-constructed RPC + * message in rqstp->rq_arg when there is a positive return code from + * ->xpo_recvfrom. So the Read list is safety-checked immediately when + * it is received, then here the whole Read list is pulled all at once. + * The ingress RPC message is fully reconstructed once all associated + * RDMA Reads have completed. + * + * Return values: + * %1: all needed RDMA Reads were posted successfully, + * %-EINVAL: client provided too many chunks or segments, + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, - struct svc_rdma_recv_ctxt *head, __be32 *p) +int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, + struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) { struct svc_rdma_read_info *info; + struct svc_rdma_chunk_ctxt *cc; int ret; /* The request (with page list) is constructed in @@ -861,23 +1136,29 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, info = svc_rdma_read_info_alloc(rdma); if (!info) return -ENOMEM; + cc = &info->ri_cc; + info->ri_rqst = rqstp; info->ri_readctxt = head; info->ri_pageno = 0; info->ri_pageoff = 0; - - info->ri_position = be32_to_cpup(p + 1); - if (info->ri_position) - ret = svc_rdma_build_normal_read_chunk(rqstp, info, p); - else - ret = svc_rdma_build_pz_read_chunk(rqstp, info, p); + info->ri_totalbytes = 0; + + if (pcl_is_empty(&head->rc_call_pcl)) { + if (head->rc_read_pcl.cl_count == 1) + ret = svc_rdma_read_data_item(info); + else + ret = svc_rdma_read_multiple_chunks(info); + } else + ret = svc_rdma_read_special(info); if (ret < 0) goto out_err; - ret = svc_rdma_post_chunk_ctxt(&info->ri_cc); + trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount); + ret = svc_rdma_post_chunk_ctxt(cc); if (ret < 0) goto out_err; 
svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count); - return 0; + return 1; out_err: svc_rdma_read_info_free(info); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index c3d588b149aa..52c759a8543e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -317,7 +317,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) /* If the SQ is full, wait until an SQ entry is available */ while (1) { if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { - atomic_inc(&rdma_stat_sq_starve); + percpu_counter_inc(&svcrdma_stat_sq_starve); trace_svcrdma_sq_full(rdma); atomic_inc(&rdma->sc_sq_avail); wait_event(rdma->sc_send_wait, @@ -358,49 +358,42 @@ static ssize_t svc_rdma_encode_read_list(struct svc_rdma_send_ctxt *sctxt) /** * svc_rdma_encode_write_segment - Encode one Write segment - * @src: matching Write chunk in the RPC Call header * @sctxt: Send context for the RPC Reply + * @chunk: Write chunk to push * @remaining: remaining bytes of the payload left in the Write chunk + * @segno: which segment in the chunk * * Return values: * On success, returns length in bytes of the Reply XDR buffer - * that was consumed by the Write segment + * that was consumed by the Write segment, and updates @remaining * %-EMSGSIZE on XDR buffer overflow */ -static ssize_t svc_rdma_encode_write_segment(__be32 *src, - struct svc_rdma_send_ctxt *sctxt, - unsigned int *remaining) +static ssize_t svc_rdma_encode_write_segment(struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_chunk *chunk, + u32 *remaining, unsigned int segno) { + const struct svc_rdma_segment *segment = &chunk->ch_segments[segno]; + const size_t len = rpcrdma_segment_maxsz * sizeof(__be32); + u32 length; __be32 *p; - const size_t len = rpcrdma_segment_maxsz * sizeof(*p); - u32 handle, length; - u64 offset; p = xdr_reserve_space(&sctxt->sc_stream, len); if (!p) return -EMSGSIZE; - xdr_decode_rdma_segment(src, 
&handle, &length, &offset); - - if (*remaining < length) { - /* segment only partly filled */ - length = *remaining; - *remaining = 0; - } else { - /* entire segment was consumed */ - *remaining -= length; - } - xdr_encode_rdma_segment(p, handle, length, offset); - - trace_svcrdma_encode_wseg(handle, length, offset); + length = min_t(u32, *remaining, segment->rs_length); + *remaining -= length; + xdr_encode_rdma_segment(p, segment->rs_handle, length, + segment->rs_offset); + trace_svcrdma_encode_wseg(sctxt, segno, segment->rs_handle, length, + segment->rs_offset); return len; } /** * svc_rdma_encode_write_chunk - Encode one Write chunk - * @src: matching Write chunk in the RPC Call header * @sctxt: Send context for the RPC Reply - * @remaining: size in bytes of the payload in the Write chunk + * @chunk: Write chunk to push * * Copy a Write chunk from the Call transport header to the * Reply transport header. Update each segment's length field @@ -411,33 +404,28 @@ static ssize_t svc_rdma_encode_write_segment(__be32 *src, * that was consumed by the Write chunk * %-EMSGSIZE on XDR buffer overflow */ -static ssize_t svc_rdma_encode_write_chunk(__be32 *src, - struct svc_rdma_send_ctxt *sctxt, - unsigned int remaining) +static ssize_t svc_rdma_encode_write_chunk(struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_chunk *chunk) { - unsigned int i, nsegs; + u32 remaining = chunk->ch_payload_length; + unsigned int segno; ssize_t len, ret; len = 0; - trace_svcrdma_encode_write_chunk(remaining); - - src++; ret = xdr_stream_encode_item_present(&sctxt->sc_stream); if (ret < 0) - return -EMSGSIZE; + return ret; len += ret; - nsegs = be32_to_cpup(src++); - ret = xdr_stream_encode_u32(&sctxt->sc_stream, nsegs); + ret = xdr_stream_encode_u32(&sctxt->sc_stream, chunk->ch_segcount); if (ret < 0) - return -EMSGSIZE; + return ret; len += ret; - for (i = nsegs; i; i--) { - ret = svc_rdma_encode_write_segment(src, sctxt, &remaining); + for (segno = 0; segno < chunk->ch_segcount; 
segno++) { + ret = svc_rdma_encode_write_segment(sctxt, chunk, &remaining, segno); if (ret < 0) - return -EMSGSIZE; - src += rpcrdma_segment_maxsz; + return ret; len += ret; } @@ -448,32 +436,25 @@ static ssize_t svc_rdma_encode_write_chunk(__be32 *src, * svc_rdma_encode_write_list - Encode RPC Reply's Write chunk list * @rctxt: Reply context with information about the RPC Call * @sctxt: Send context for the RPC Reply - * @length: size in bytes of the payload in the first Write chunk - * - * The client provides a Write chunk list in the Call message. Fill - * in the segments in the first Write chunk in the Reply's transport - * header with the number of bytes consumed in each segment. - * Remaining chunks are returned unused. - * - * Assumptions: - * - Client has provided only one Write chunk * * Return values: * On success, returns length in bytes of the Reply XDR buffer * that was consumed by the Reply's Write list * %-EMSGSIZE on XDR buffer overflow */ -static ssize_t -svc_rdma_encode_write_list(const struct svc_rdma_recv_ctxt *rctxt, - struct svc_rdma_send_ctxt *sctxt, - unsigned int length) +static ssize_t svc_rdma_encode_write_list(struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt) { + struct svc_rdma_chunk *chunk; ssize_t len, ret; - ret = svc_rdma_encode_write_chunk(rctxt->rc_write_list, sctxt, length); - if (ret < 0) - return ret; - len = ret; + len = 0; + pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) { + ret = svc_rdma_encode_write_chunk(sctxt, chunk); + if (ret < 0) + return ret; + len += ret; + } /* Terminate the Write list */ ret = xdr_stream_encode_item_absent(&sctxt->sc_stream); @@ -489,56 +470,174 @@ svc_rdma_encode_write_list(const struct svc_rdma_recv_ctxt *rctxt, * @sctxt: Send context for the RPC Reply * @length: size in bytes of the payload in the Reply chunk * - * Assumptions: - * - Reply can always fit in the client-provided Reply chunk - * * Return values: * On success, returns length in bytes of the Reply XDR buffer * 
that was consumed by the Reply's Reply chunk * %-EMSGSIZE on XDR buffer overflow + * %-E2BIG if the RPC message is larger than the Reply chunk */ static ssize_t -svc_rdma_encode_reply_chunk(const struct svc_rdma_recv_ctxt *rctxt, +svc_rdma_encode_reply_chunk(struct svc_rdma_recv_ctxt *rctxt, struct svc_rdma_send_ctxt *sctxt, unsigned int length) { - return svc_rdma_encode_write_chunk(rctxt->rc_reply_chunk, sctxt, - length); + struct svc_rdma_chunk *chunk; + + if (pcl_is_empty(&rctxt->rc_reply_pcl)) + return xdr_stream_encode_item_absent(&sctxt->sc_stream); + + chunk = pcl_first_chunk(&rctxt->rc_reply_pcl); + if (length > chunk->ch_length) + return -E2BIG; + + chunk->ch_payload_length = length; + return svc_rdma_encode_write_chunk(sctxt, chunk); } -static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - struct page *page, - unsigned long offset, - unsigned int len) +struct svc_rdma_map_data { + struct svcxprt_rdma *md_rdma; + struct svc_rdma_send_ctxt *md_ctxt; +}; + +/** + * svc_rdma_page_dma_map - DMA map one page + * @data: pointer to arguments + * @page: struct page to DMA map + * @offset: offset into the page + * @len: number of bytes to map + * + * Returns: + * %0 if DMA mapping was successful + * %-EIO if the page cannot be DMA mapped + */ +static int svc_rdma_page_dma_map(void *data, struct page *page, + unsigned long offset, unsigned int len) { + struct svc_rdma_map_data *args = data; + struct svcxprt_rdma *rdma = args->md_rdma; + struct svc_rdma_send_ctxt *ctxt = args->md_ctxt; struct ib_device *dev = rdma->sc_cm_id->device; dma_addr_t dma_addr; + ++ctxt->sc_cur_sge_no; + dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE); - trace_svcrdma_dma_map_page(rdma, dma_addr, len); if (ib_dma_mapping_error(dev, dma_addr)) goto out_maperr; + trace_svcrdma_dma_map_page(rdma, dma_addr, len); ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr; ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len; 
ctxt->sc_send_wr.num_sge++; return 0; out_maperr: + trace_svcrdma_dma_map_err(rdma, dma_addr, len); return -EIO; } -/* ib_dma_map_page() is used here because svc_rdma_dma_unmap() +/** + * svc_rdma_iov_dma_map - DMA map an iovec + * @data: pointer to arguments + * @iov: kvec to DMA map + * + * ib_dma_map_page() is used here because svc_rdma_dma_unmap() * handles DMA-unmap and it uses ib_dma_unmap_page() exclusively. + * + * Returns: + * %0 if DMA mapping was successful + * %-EIO if the iovec cannot be DMA mapped */ -static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - unsigned char *base, - unsigned int len) +static int svc_rdma_iov_dma_map(void *data, const struct kvec *iov) { - return svc_rdma_dma_map_page(rdma, ctxt, virt_to_page(base), - offset_in_page(base), len); + if (!iov->iov_len) + return 0; + return svc_rdma_page_dma_map(data, virt_to_page(iov->iov_base), + offset_in_page(iov->iov_base), + iov->iov_len); +} + +/** + * svc_rdma_xb_dma_map - DMA map all segments of an xdr_buf + * @xdr: xdr_buf containing portion of an RPC message to transmit + * @data: pointer to arguments + * + * Returns: + * %0 if DMA mapping was successful + * %-EIO if DMA mapping failed + * + * On failure, any DMA mappings that have been already done must be + * unmapped by the caller. 
+ */ +static int svc_rdma_xb_dma_map(const struct xdr_buf *xdr, void *data) +{ + unsigned int len, remaining; + unsigned long pageoff; + struct page **ppages; + int ret; + + ret = svc_rdma_iov_dma_map(data, &xdr->head[0]); + if (ret < 0) + return ret; + + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + pageoff = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + len = min_t(u32, PAGE_SIZE - pageoff, remaining); + + ret = svc_rdma_page_dma_map(data, *ppages++, pageoff, len); + if (ret < 0) + return ret; + + remaining -= len; + pageoff = 0; + } + + ret = svc_rdma_iov_dma_map(data, &xdr->tail[0]); + if (ret < 0) + return ret; + + return xdr->len; +} + +struct svc_rdma_pullup_data { + u8 *pd_dest; + unsigned int pd_length; + unsigned int pd_num_sges; +}; + +/** + * svc_rdma_xb_count_sges - Count how many SGEs will be needed + * @xdr: xdr_buf containing portion of an RPC message to transmit + * @data: pointer to arguments + * + * Returns: + * Number of SGEs needed to Send the contents of @xdr inline + */ +static int svc_rdma_xb_count_sges(const struct xdr_buf *xdr, + void *data) +{ + struct svc_rdma_pullup_data *args = data; + unsigned int remaining; + unsigned long offset; + + if (xdr->head[0].iov_len) + ++args->pd_num_sges; + + offset = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + ++args->pd_num_sges; + remaining -= min_t(u32, PAGE_SIZE - offset, remaining); + offset = 0; + } + + if (xdr->tail[0].iov_len) + ++args->pd_num_sges; + + args->pd_length += xdr->len; + return 0; } /** @@ -549,48 +648,71 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, * @xdr: xdr_buf containing RPC message to transmit * * Returns: - * %true if pull-up must be used - * %false otherwise + * %true if pull-up must be used + * %false otherwise */ -static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *sctxt, +static bool svc_rdma_pull_up_needed(const struct 
svcxprt_rdma *rdma, + const struct svc_rdma_send_ctxt *sctxt, const struct svc_rdma_recv_ctxt *rctxt, - struct xdr_buf *xdr) + const struct xdr_buf *xdr) { - int elements; + /* Resources needed for the transport header */ + struct svc_rdma_pullup_data args = { + .pd_length = sctxt->sc_hdrbuf.len, + .pd_num_sges = 1, + }; + int ret; - /* For small messages, copying bytes is cheaper than DMA mapping. - */ - if (sctxt->sc_hdrbuf.len + xdr->len < RPCRDMA_PULLUP_THRESH) + ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_count_sges, &args); + if (ret < 0) + return false; + + if (args.pd_length < RPCRDMA_PULLUP_THRESH) return true; + return args.pd_num_sges >= rdma->sc_max_send_sges; +} - /* Check whether the xdr_buf has more elements than can - * fit in a single RDMA Send. - */ - /* xdr->head */ - elements = 1; - - /* xdr->pages */ - if (!rctxt || !rctxt->rc_write_list) { - unsigned int remaining; - unsigned long pageoff; - - pageoff = xdr->page_base & ~PAGE_MASK; - remaining = xdr->page_len; - while (remaining) { - ++elements; - remaining -= min_t(u32, PAGE_SIZE - pageoff, - remaining); - pageoff = 0; - } +/** + * svc_rdma_xb_linearize - Copy region of xdr_buf to flat buffer + * @xdr: xdr_buf containing portion of an RPC message to copy + * @data: pointer to arguments + * + * Returns: + * Always zero. 
+ */ +static int svc_rdma_xb_linearize(const struct xdr_buf *xdr, + void *data) +{ + struct svc_rdma_pullup_data *args = data; + unsigned int len, remaining; + unsigned long pageoff; + struct page **ppages; + + if (xdr->head[0].iov_len) { + memcpy(args->pd_dest, xdr->head[0].iov_base, xdr->head[0].iov_len); + args->pd_dest += xdr->head[0].iov_len; } - /* xdr->tail */ - if (xdr->tail[0].iov_len) - ++elements; + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + pageoff = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + len = min_t(u32, PAGE_SIZE - pageoff, remaining); + memcpy(args->pd_dest, page_address(*ppages) + pageoff, len); + remaining -= len; + args->pd_dest += len; + pageoff = 0; + ppages++; + } - /* assume 1 SGE is needed for the transport header */ - return elements >= rdma->sc_max_send_sges; + if (xdr->tail[0].iov_len) { + memcpy(args->pd_dest, xdr->tail[0].iov_base, xdr->tail[0].iov_len); + args->pd_dest += xdr->tail[0].iov_len; + } + + args->pd_length += xdr->len; + return 0; } /** @@ -603,54 +725,30 @@ static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma, * The device is not capable of sending the reply directly. * Assemble the elements of @xdr into the transport header buffer. * - * Returns zero on success, or a negative errno on failure. + * Assumptions: + * pull_up_needed has determined that @xdr will fit in the buffer. 
+ * + * Returns: + * %0 if pull-up was successful + * %-EMSGSIZE if a buffer manipulation problem occurred */ -static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma, +static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, const struct svc_rdma_recv_ctxt *rctxt, const struct xdr_buf *xdr) { - unsigned char *dst, *tailbase; - unsigned int taillen; - - dst = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len; - memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len); - dst += xdr->head[0].iov_len; - - tailbase = xdr->tail[0].iov_base; - taillen = xdr->tail[0].iov_len; - if (rctxt && rctxt->rc_write_list) { - u32 xdrpad; - - xdrpad = xdr_pad_size(xdr->page_len); - if (taillen && xdrpad) { - tailbase += xdrpad; - taillen -= xdrpad; - } - } else { - unsigned int len, remaining; - unsigned long pageoff; - struct page **ppages; - - ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); - pageoff = xdr->page_base & ~PAGE_MASK; - remaining = xdr->page_len; - while (remaining) { - len = min_t(u32, PAGE_SIZE - pageoff, remaining); - - memcpy(dst, page_address(*ppages) + pageoff, len); - remaining -= len; - dst += len; - pageoff = 0; - ppages++; - } - } + struct svc_rdma_pullup_data args = { + .pd_dest = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len, + }; + int ret; - if (taillen) - memcpy(dst, tailbase, taillen); + ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_linearize, &args); + if (ret < 0) + return ret; - sctxt->sc_sges[0].length += xdr->len; - trace_svcrdma_send_pullup(sctxt->sc_sges[0].length); + sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len + args.pd_length; + trace_svcrdma_send_pullup(sctxt, args.pd_length); return 0; } @@ -660,22 +758,22 @@ static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma, * @rctxt: Write and Reply chunks provided by client * @xdr: prepared xdr_buf containing RPC message * - * Load the xdr_buf into the ctxt's sge array, and DMA map each - * element as it is added. 
The Send WR's num_sge field is set. + * Returns: + * %0 if DMA mapping was successful. + * %-EMSGSIZE if a buffer manipulation problem occurred + * %-EIO if DMA mapping failed * - * Returns zero on success, or a negative errno on failure. + * The Send WR's num_sge field is set in all cases. */ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, const struct svc_rdma_recv_ctxt *rctxt, - struct xdr_buf *xdr) + const struct xdr_buf *xdr) { - unsigned int len, remaining; - unsigned long page_off; - struct page **ppages; - unsigned char *base; - u32 xdr_pad; - int ret; + struct svc_rdma_map_data args = { + .md_rdma = rdma, + .md_ctxt = sctxt, + }; /* Set up the (persistently-mapped) transport header SGE. */ sctxt->sc_send_wr.num_sge = 1; @@ -684,7 +782,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, /* If there is a Reply chunk, nothing follows the transport * header, and we're done here. */ - if (rctxt && rctxt->rc_reply_chunk) + if (!pcl_is_empty(&rctxt->rc_reply_pcl)) return 0; /* For pull-up, svc_rdma_send() will sync the transport header. @@ -693,58 +791,8 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr)) return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr); - ++sctxt->sc_cur_sge_no; - ret = svc_rdma_dma_map_buf(rdma, sctxt, - xdr->head[0].iov_base, - xdr->head[0].iov_len); - if (ret < 0) - return ret; - - /* If a Write chunk is present, the xdr_buf's page list - * is not included inline. However the Upper Layer may - * have added XDR padding in the tail buffer, and that - * should not be included inline. 
- */ - if (rctxt && rctxt->rc_write_list) { - base = xdr->tail[0].iov_base; - len = xdr->tail[0].iov_len; - xdr_pad = xdr_pad_size(xdr->page_len); - - if (len && xdr_pad) { - base += xdr_pad; - len -= xdr_pad; - } - - goto tail; - } - - ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); - page_off = xdr->page_base & ~PAGE_MASK; - remaining = xdr->page_len; - while (remaining) { - len = min_t(u32, PAGE_SIZE - page_off, remaining); - - ++sctxt->sc_cur_sge_no; - ret = svc_rdma_dma_map_page(rdma, sctxt, *ppages++, - page_off, len); - if (ret < 0) - return ret; - - remaining -= len; - page_off = 0; - } - - base = xdr->tail[0].iov_base; - len = xdr->tail[0].iov_len; -tail: - if (len) { - ++sctxt->sc_cur_sge_no; - ret = svc_rdma_dma_map_buf(rdma, sctxt, base, len); - if (ret < 0) - return ret; - } - - return 0; + return pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_dma_map, &args); } /* The svc_rqst and all resources it owns are released as soon as @@ -894,9 +942,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) container_of(xprt, struct svcxprt_rdma, sc_xprt); struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; __be32 *rdma_argp = rctxt->rc_recv_buf; - __be32 *wr_lst = rctxt->rc_write_list; - __be32 *rp_ch = rctxt->rc_reply_chunk; - struct xdr_buf *xdr = &rqstp->rq_res; struct svc_rdma_send_ctxt *sctxt; __be32 *p; int ret; @@ -914,45 +959,22 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) rpcrdma_fixed_maxsz * sizeof(*p)); if (!p) goto err0; + + ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res); + if (ret < 0) + goto err2; + *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); *p++ = rdma->sc_fc_credits; - *p = rp_ch ? rdma_nomsg : rdma_msg; + *p = pcl_is_empty(&rctxt->rc_reply_pcl) ? 
rdma_msg : rdma_nomsg; if (svc_rdma_encode_read_list(sctxt) < 0) goto err0; - if (wr_lst) { - /* XXX: Presume the client sent only one Write chunk */ - unsigned long offset; - unsigned int length; - - if (rctxt->rc_read_payload_length) { - offset = rctxt->rc_read_payload_offset; - length = rctxt->rc_read_payload_length; - } else { - offset = xdr->head[0].iov_len; - length = xdr->page_len; - } - ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr, offset, - length); - if (ret < 0) - goto err2; - if (svc_rdma_encode_write_list(rctxt, sctxt, length) < 0) - goto err0; - } else { - if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0) - goto err0; - } - if (rp_ch) { - ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res); - if (ret < 0) - goto err2; - if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0) - goto err0; - } else { - if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0) - goto err0; - } + if (svc_rdma_encode_write_list(rctxt, sctxt) < 0) + goto err0; + if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0) + goto err0; ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp); if (ret < 0) @@ -979,28 +1001,46 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) } /** - * svc_rdma_read_payload - special processing for a READ payload + * svc_rdma_result_payload - special processing for a result payload * @rqstp: svc_rqst to operate on * @offset: payload's byte offset in @xdr * @length: size of payload, in bytes * - * Returns zero on success. - * - * For the moment, just record the xdr_buf location of the READ - * payload. svc_rdma_sendto will use that location later when - * we actually send the payload. 
+ * Return values: + * %0 if successful or nothing needed to be done + * %-EMSGSIZE on XDR buffer overflow + * %-E2BIG if the payload was larger than the Write chunk + * %-EINVAL if client provided too many segments + * %-ENOMEM if rdma_rw context pool was exhausted + * %-ENOTCONN if posting failed (connection is lost) + * %-EIO if rdma_rw initialization failed (DMA mapping, etc) */ -int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset, - unsigned int length) +int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) { struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; + struct svc_rdma_chunk *chunk; + struct svcxprt_rdma *rdma; + struct xdr_buf subbuf; + int ret; - /* XXX: Just one READ payload slot for now, since our - * transport implementation currently supports only one - * Write chunk. - */ - rctxt->rc_read_payload_offset = offset; - rctxt->rc_read_payload_length = length; + chunk = rctxt->rc_cur_result_payload; + if (!length || !chunk) + return 0; + rctxt->rc_cur_result_payload = + pcl_next_chunk(&rctxt->rc_write_pcl, chunk); + if (length > chunk->ch_length) + return -E2BIG; + chunk->ch_position = offset; + chunk->ch_payload_length = length; + + if (xdr_buf_subsegment(&rqstp->rq_res, &subbuf, offset, length)) + return -EMSGSIZE; + + rdma = container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); + ret = svc_rdma_send_write_chunk(rdma, chunk, &subbuf); + if (ret < 0) + return ret; return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index fb044792b571..afba4e9d5425 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -80,7 +80,7 @@ static const struct svc_xprt_ops svc_rdma_ops = { .xpo_create = svc_rdma_create, .xpo_recvfrom = svc_rdma_recvfrom, .xpo_sendto = svc_rdma_sendto, - .xpo_read_payload = svc_rdma_read_payload, + .xpo_result_payload = svc_rdma_result_payload, .xpo_release_rqst = 
svc_rdma_release_rqst, .xpo_detach = svc_rdma_detach, .xpo_free = svc_rdma_free, diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8915e42240d3..78d29d1bcc20 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -599,11 +599,12 @@ static void xprt_rdma_free(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - if (!list_empty(&req->rl_registered)) - frwr_unmap_sync(r_xprt, req); + if (unlikely(!list_empty(&req->rl_registered))) { + trace_xprtrdma_mrs_zap(task); + frwr_unmap_sync(rpcx_to_rdmax(rqst->rq_xprt), req); + } /* XXX: If the RPC is completing because of a signal and * not because a reply was received, we ought to ensure @@ -768,6 +769,7 @@ static struct xprt_class xprt_rdma = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_RDMA, .setup = xprt_setup_rdma, + .netid = { "rdma", "rdma6", "" }, }; void xprt_rdma_cleanup(void) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index ad6e2e4994ce..ec912cf9c618 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -167,7 +167,7 @@ static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_send(sc, wc); + trace_xprtrdma_wc_send(wc, &sc->sc_cid); rpcrdma_sendctx_put_locked(r_xprt, sc); rpcrdma_flush_disconnect(r_xprt, wc); } @@ -186,7 +186,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_receive(wc); + trace_xprtrdma_wc_receive(wc, &rep->rr_cid); --r_xprt->rx_ep->re_receive_count; if (wc->status != IB_WC_SUCCESS) goto out_flushed; @@ -643,6 +643,9 @@ static struct rpcrdma_sendctx 
*rpcrdma_sendctx_create(struct rpcrdma_ep *ep) return NULL; sc->sc_cqe.done = rpcrdma_wc_send; + sc->sc_cid.ci_queue_id = ep->re_attr.send_cq->res.id; + sc->sc_cid.ci_completion_id = + atomic_inc_return(&ep->re_completion_ids); return sc; } @@ -972,6 +975,9 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) goto out_free_regbuf; + rep->rr_cid.ci_completion_id = + atomic_inc_return(&r_xprt->rx_ep->re_completion_ids); + xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf), rdmab_length(rep->rr_rdmabuf)); rep->rr_cqe.done = rpcrdma_wc_receive; @@ -1179,25 +1185,6 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) } /** - * rpcrdma_mr_put - DMA unmap an MR and release it - * @mr: MR to release - * - */ -void rpcrdma_mr_put(struct rpcrdma_mr *mr) -{ - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - - if (mr->mr_dir != DMA_NONE) { - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - mr->mr_dir = DMA_NONE; - } - - rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); -} - -/** * rpcrdma_buffer_get - Get a request buffer * @buffers: Buffer pool from which to obtain a buffer * @@ -1411,6 +1398,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!rep) break; + rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id; trace_xprtrdma_post_recv(rep); rep->rr_recv_wr.next = wr; wr = &rep->rr_recv_wr; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 43974ef39a50..94b28657aeeb 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -53,6 +53,7 @@ #include <rdma/ib_verbs.h> /* RDMA verbs api */ #include <linux/sunrpc/clnt.h> /* rpc_xprt */ +#include <linux/sunrpc/rpc_rdma_cid.h> /* completion IDs */ #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ @@ -93,6 +94,8 @@ struct rpcrdma_ep { unsigned int 
re_max_requests; /* depends on device */ unsigned int re_inline_send; /* negotiated */ unsigned int re_inline_recv; /* negotiated */ + + atomic_t re_completion_ids; }; /* Pre-allocate extra Work Requests for handling backward receives @@ -180,6 +183,8 @@ enum { struct rpcrdma_rep { struct ib_cqe rr_cqe; + struct rpc_rdma_cid rr_cid; + __be32 rr_xid; __be32 rr_vers; __be32 rr_proc; @@ -211,6 +216,7 @@ enum { struct rpcrdma_req; struct rpcrdma_sendctx { struct ib_cqe sc_cqe; + struct rpc_rdma_cid sc_cid; struct rpcrdma_req *sc_req; unsigned int sc_unmap_count; struct ib_sge sc_sges[]; @@ -225,6 +231,7 @@ struct rpcrdma_sendctx { struct rpcrdma_frwr { struct ib_mr *fr_mr; struct ib_cqe fr_cqe; + struct rpc_rdma_cid fr_cid; struct completion fr_linv_done; union { struct ib_reg_wr fr_regwr; @@ -236,6 +243,7 @@ struct rpcrdma_req; struct rpcrdma_mr { struct list_head mr_list; struct rpcrdma_req *mr_req; + struct ib_device *mr_device; struct scatterlist *mr_sg; int mr_nents; enum dma_data_direction mr_dir; @@ -466,7 +474,6 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); -void rpcrdma_mr_put(struct rpcrdma_mr *mr); void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7090bbee0ec5..c56a66cdf4ac 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -433,7 +433,8 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, if (ret <= 0) goto sock_err; xs_flush_bvec(buf->bvec, ret, seek + buf->page_base); - offset += ret - buf->page_base; + ret -= buf->page_base; + offset += ret; if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) @@ -3059,6 +3060,7 @@ static struct xprt_class xs_local_transport = { .owner = THIS_MODULE, .ident = 
XPRT_TRANSPORT_LOCAL, .setup = xs_setup_local, + .netid = { "" }, }; static struct xprt_class xs_udp_transport = { @@ -3067,6 +3069,7 @@ static struct xprt_class xs_udp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_UDP, .setup = xs_setup_udp, + .netid = { "udp", "udp6", "" }, }; static struct xprt_class xs_tcp_transport = { @@ -3075,6 +3078,7 @@ static struct xprt_class xs_tcp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_TCP, .setup = xs_setup_tcp, + .netid = { "tcp", "tcp6", "" }, }; static struct xprt_class xs_bc_tcp_transport = { @@ -3083,6 +3087,7 @@ static struct xprt_class xs_bc_tcp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_BC_TCP, .setup = xs_setup_bc_tcp, + .netid = { "" }, }; /** diff --git a/net/switchdev/Makefile b/net/switchdev/Makefile index bd69a3136e76..c5561d7f3a7c 100644 --- a/net/switchdev/Makefile +++ b/net/switchdev/Makefile @@ -3,4 +3,4 @@ # Makefile for the Switch device API # -obj-$(CONFIG_NET_SWITCHDEV) += switchdev.o +obj-y += switchdev.o diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 23d868545362..89a36db47ab4 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -101,19 +101,18 @@ static int switchdev_deferred_enqueue(struct net_device *dev, static int switchdev_port_attr_notify(enum switchdev_notifier_type nt, struct net_device *dev, const struct switchdev_attr *attr, - struct switchdev_trans *trans) + struct netlink_ext_ack *extack) { int err; int rc; struct switchdev_notifier_port_attr_info attr_info = { .attr = attr, - .trans = trans, .handled = false, }; rc = call_switchdev_blocking_notifiers(nt, dev, - &attr_info.info, NULL); + &attr_info.info, extack); err = notifier_to_errno(rc); if (err) { WARN_ON(!attr_info.handled); @@ -127,36 +126,11 @@ static int switchdev_port_attr_notify(enum switchdev_notifier_type nt, } static int switchdev_port_attr_set_now(struct net_device *dev, - const struct switchdev_attr *attr) + const struct switchdev_attr 
*attr, + struct netlink_ext_ack *extack) { - struct switchdev_trans trans; - int err; - - /* Phase I: prepare for attr set. Driver/device should fail - * here if there are going to be issues in the commit phase, - * such as lack of resources or support. The driver/device - * should reserve resources needed for the commit phase here, - * but should not commit the attr. - */ - - trans.ph_prepare = true; - err = switchdev_port_attr_notify(SWITCHDEV_PORT_ATTR_SET, dev, attr, - &trans); - if (err) - return err; - - /* Phase II: commit attr set. This cannot fail as a fault - * of driver/device. If it does, it's a bug in the driver/device - * because the driver said everythings was OK in phase I. - */ - - trans.ph_prepare = false; - err = switchdev_port_attr_notify(SWITCHDEV_PORT_ATTR_SET, dev, attr, - &trans); - WARN(err, "%s: Commit of attribute (id=%d) failed.\n", - dev->name, attr->id); - - return err; + return switchdev_port_attr_notify(SWITCHDEV_PORT_ATTR_SET, dev, attr, + extack); } static void switchdev_port_attr_set_deferred(struct net_device *dev, @@ -165,7 +139,7 @@ static void switchdev_port_attr_set_deferred(struct net_device *dev, const struct switchdev_attr *attr = data; int err; - err = switchdev_port_attr_set_now(dev, attr); + err = switchdev_port_attr_set_now(dev, attr, NULL); if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to set attribute (id=%d)\n", err, attr->id); @@ -185,21 +159,19 @@ static int switchdev_port_attr_set_defer(struct net_device *dev, * * @dev: port device * @attr: attribute to set - * - * Use a 2-phase prepare-commit transaction model to ensure - * system is not left in a partially updated state due to - * failure from driver/device. + * @extack: netlink extended ack, for error message propagation * * rtnl_lock must be held and must not be in atomic section, * in case SWITCHDEV_F_DEFER flag is not set. 
*/ int switchdev_port_attr_set(struct net_device *dev, - const struct switchdev_attr *attr) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack) { if (attr->flags & SWITCHDEV_F_DEFER) return switchdev_port_attr_set_defer(dev, attr); ASSERT_RTNL(); - return switchdev_port_attr_set_now(dev, attr); + return switchdev_port_attr_set_now(dev, attr, extack); } EXPORT_SYMBOL_GPL(switchdev_port_attr_set); @@ -221,7 +193,6 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj) static int switchdev_port_obj_notify(enum switchdev_notifier_type nt, struct net_device *dev, const struct switchdev_obj *obj, - struct switchdev_trans *trans, struct netlink_ext_ack *extack) { int rc; @@ -229,7 +200,6 @@ static int switchdev_port_obj_notify(enum switchdev_notifier_type nt, struct switchdev_notifier_port_obj_info obj_info = { .obj = obj, - .trans = trans, .handled = false, }; @@ -244,48 +214,15 @@ static int switchdev_port_obj_notify(enum switchdev_notifier_type nt, return 0; } -static int switchdev_port_obj_add_now(struct net_device *dev, - const struct switchdev_obj *obj, - struct netlink_ext_ack *extack) -{ - struct switchdev_trans trans; - int err; - - ASSERT_RTNL(); - - /* Phase I: prepare for obj add. Driver/device should fail - * here if there are going to be issues in the commit phase, - * such as lack of resources or support. The driver/device - * should reserve resources needed for the commit phase here, - * but should not commit the obj. - */ - - trans.ph_prepare = true; - err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, - dev, obj, &trans, extack); - if (err) - return err; - - /* Phase II: commit obj add. This cannot fail as a fault - * of driver/device. If it does, it's a bug in the driver/device - * because the driver said everythings was OK in phase I. 
- */ - - trans.ph_prepare = false; - err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, - dev, obj, &trans, extack); - WARN(err, "%s: Commit of object (id=%d) failed.\n", dev->name, obj->id); - - return err; -} - static void switchdev_port_obj_add_deferred(struct net_device *dev, const void *data) { const struct switchdev_obj *obj = data; int err; - err = switchdev_port_obj_add_now(dev, obj, NULL); + ASSERT_RTNL(); + err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, + dev, obj, NULL); if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to add object (id=%d)\n", err, obj->id); @@ -307,10 +244,6 @@ static int switchdev_port_obj_add_defer(struct net_device *dev, * @obj: object to add * @extack: netlink extended ack * - * Use a 2-phase prepare-commit transaction model to ensure - * system is not left in a partially updated state due to - * failure from driver/device. - * * rtnl_lock must be held and must not be in atomic section, * in case SWITCHDEV_F_DEFER flag is not set. 
*/ @@ -321,7 +254,8 @@ int switchdev_port_obj_add(struct net_device *dev, if (obj->flags & SWITCHDEV_F_DEFER) return switchdev_port_obj_add_defer(dev, obj); ASSERT_RTNL(); - return switchdev_port_obj_add_now(dev, obj, extack); + return switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, + dev, obj, extack); } EXPORT_SYMBOL_GPL(switchdev_port_obj_add); @@ -329,7 +263,7 @@ static int switchdev_port_obj_del_now(struct net_device *dev, const struct switchdev_obj *obj) { return switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_DEL, - dev, obj, NULL, NULL); + dev, obj, NULL); } static void switchdev_port_obj_del_deferred(struct net_device *dev, @@ -449,7 +383,6 @@ static int __switchdev_handle_port_obj_add(struct net_device *dev, bool (*check_cb)(const struct net_device *dev), int (*add_cb)(struct net_device *dev, const struct switchdev_obj *obj, - struct switchdev_trans *trans, struct netlink_ext_ack *extack)) { struct netlink_ext_ack *extack; @@ -460,10 +393,10 @@ static int __switchdev_handle_port_obj_add(struct net_device *dev, extack = switchdev_notifier_info_to_extack(&port_obj_info->info); if (check_cb(dev)) { - /* This flag is only checked if the return value is success. */ - port_obj_info->handled = true; - return add_cb(dev, port_obj_info->obj, port_obj_info->trans, - extack); + err = add_cb(dev, port_obj_info->obj, extack); + if (err != -EOPNOTSUPP) + port_obj_info->handled = true; + return err; } /* Switch ports might be stacked under e.g. a LAG. Ignore the @@ -491,7 +424,6 @@ int switchdev_handle_port_obj_add(struct net_device *dev, bool (*check_cb)(const struct net_device *dev), int (*add_cb)(struct net_device *dev, const struct switchdev_obj *obj, - struct switchdev_trans *trans, struct netlink_ext_ack *extack)) { int err; @@ -515,9 +447,10 @@ static int __switchdev_handle_port_obj_del(struct net_device *dev, int err = -EOPNOTSUPP; if (check_cb(dev)) { - /* This flag is only checked if the return value is success. 
*/ - port_obj_info->handled = true; - return del_cb(dev, port_obj_info->obj); + err = del_cb(dev, port_obj_info->obj); + if (err != -EOPNOTSUPP) + port_obj_info->handled = true; + return err; } /* Switch ports might be stacked under e.g. a LAG. Ignore the @@ -561,16 +494,20 @@ static int __switchdev_handle_port_attr_set(struct net_device *dev, bool (*check_cb)(const struct net_device *dev), int (*set_cb)(struct net_device *dev, const struct switchdev_attr *attr, - struct switchdev_trans *trans)) + struct netlink_ext_ack *extack)) { + struct netlink_ext_ack *extack; struct net_device *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; + extack = switchdev_notifier_info_to_extack(&port_attr_info->info); + if (check_cb(dev)) { - port_attr_info->handled = true; - return set_cb(dev, port_attr_info->attr, - port_attr_info->trans); + err = set_cb(dev, port_attr_info->attr, extack); + if (err != -EOPNOTSUPP) + port_attr_info->handled = true; + return err; } /* Switch ports might be stacked under e.g. a LAG. 
Ignore the @@ -598,7 +535,7 @@ int switchdev_handle_port_attr_set(struct net_device *dev, bool (*check_cb)(const struct net_device *dev), int (*set_cb)(struct net_device *dev, const struct switchdev_attr *attr, - struct switchdev_trans *trans)) + struct netlink_ext_ack *extack)) { int err; diff --git a/net/tipc/addr.c b/net/tipc/addr.c index 0f1eaed1bd1b..abe29d1aa23a 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -55,12 +55,11 @@ bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) void tipc_set_node_id(struct net *net, u8 *id) { struct tipc_net *tn = tipc_net(net); - u32 *tmp = (u32 *)id; memcpy(tn->node_id, id, NODE_ID_LEN); tipc_nodeid2string(tn->node_id_string, id); - tn->trial_addr = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3]; - pr_info("Own node identity %s, cluster identity %u\n", + tn->trial_addr = hash128to32(id); + pr_info("Node identity %s, cluster identity %u\n", tipc_own_id_string(net), tn->net_id); } @@ -76,7 +75,7 @@ void tipc_set_node_addr(struct net *net, u32 addr) } tn->trial_addr = addr; tn->addr_trial_end = jiffies; - pr_info("32-bit node address hash set to %x\n", addr); + pr_info("Node number set to %u\n", addr); } char *tipc_nodeid2string(char *str, u8 *id) diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 31bee0ea7b3e..1a11831bef62 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -3,6 +3,7 @@ * * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, Wind River Systems + * Copyright (c) 2020, Red Hat Inc * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 650414110452..a4389ef08a98 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -72,6 +72,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, /** * tipc_media_find - locates specified media object by name + * @name: name to locate */ struct tipc_media *tipc_media_find(const char *name) { @@ -86,6 +87,7 @@ struct tipc_media *tipc_media_find(const char *name) /** * media_find_id - locates specified media object by type identifier + * @type: type identifier to locate */ static struct tipc_media *media_find_id(u8 type) { @@ -100,6 +102,9 @@ static struct tipc_media *media_find_id(u8 type) /** * tipc_media_addr_printf - record media address in print buffer + * @buf: output buffer + * @len: output buffer size remaining + * @a: input media address */ int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a) { @@ -127,7 +132,7 @@ int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a) * @name: ptr to bearer name string * @name_parts: ptr to area for bearer name components (or NULL if not needed) * - * Returns 1 if bearer name is valid, otherwise 0. + * Return: 1 if bearer name is valid, otherwise 0. 
*/ static int bearer_name_validate(const char *name, struct tipc_bearer_names *name_parts) @@ -139,10 +144,7 @@ static int bearer_name_validate(const char *name, u32 if_len; /* copy bearer name & ensure length is OK */ - name_copy[TIPC_MAX_BEARER_NAME - 1] = 0; - /* need above in case non-Posix strncpy() doesn't pad with nulls */ - strncpy(name_copy, name, TIPC_MAX_BEARER_NAME); - if (name_copy[TIPC_MAX_BEARER_NAME - 1] != 0) + if (strscpy(name_copy, name, TIPC_MAX_BEARER_NAME) < 0) return 0; /* ensure all component parts of bearer name are present */ @@ -169,6 +171,8 @@ static int bearer_name_validate(const char *name, /** * tipc_bearer_find - locates bearer object with matching bearer name + * @net: the applicable net namespace + * @name: bearer name to locate */ struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name) { @@ -231,6 +235,11 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) /** * tipc_enable_bearer - enable bearer with the given name + * @net: the applicable net namespace + * @name: bearer name to enable + * @disc_domain: bearer domain + * @prio: bearer priority + * @attr: nlattr array */ static int tipc_enable_bearer(struct net *net, const char *name, u32 disc_domain, u32 prio, @@ -345,6 +354,8 @@ rejected: /** * tipc_reset_bearer - Reset all links established over this bearer + * @net: the applicable net namespace + * @b: the target bearer */ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b) { @@ -366,7 +377,9 @@ void tipc_bearer_put(struct tipc_bearer *b) } /** - * bearer_disable + * bearer_disable - disable this bearer + * @net: the applicable net namespace + * @b: the bearer to disable * * Note: This routine assumes caller holds RTNL lock. 
*/ @@ -437,6 +450,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, } /* tipc_disable_l2_media - detach TIPC bearer from an L2 interface + * @b: the target bearer * * Mark L2 bearer as inactive so that incoming buffers are thrown away */ @@ -453,6 +467,7 @@ void tipc_disable_l2_media(struct tipc_bearer *b) /** * tipc_l2_send_msg - send a TIPC packet out over an L2 interface + * @net: the associated network namespace * @skb: the packet to be sent * @b: the bearer through which the packet is to be sent * @dest: peer destination address diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index bc0023119da2..6bf4550aa1ac 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -93,7 +93,8 @@ struct tipc_bearer; * @raw2addr: convert from raw addr format to media addr format * @priority: default link (and bearer) priority * @tolerance: default time (in ms) before declaring link failure - * @window: default window (in packets) before declaring link congestion + * @min_win: minimum window (in packets) before declaring link congestion + * @max_win: maximum window (in packets) before declaring link congestion * @mtu: max packet size bearer can support for media type not dependent on * underlying device MTU * @type_id: TIPC media identifier @@ -138,12 +139,15 @@ struct tipc_media { * @pt: packet type for bearer * @rcu: rcu struct for tipc_bearer * @priority: default link priority for bearer - * @window: default window size for bearer + * @min_win: minimum window (in packets) before declaring link congestion + * @max_win: maximum window (in packets) before declaring link congestion * @tolerance: default link tolerance for bearer * @domain: network domain to which links can be established * @identity: array index of this bearer within TIPC bearer array - * @link_req: ptr to (optional) structure making periodic link setup requests + * @disc: ptr to link setup request * @net_plane: network plane ('A' through 'H') currently associated with bearer + * @up: 
bearer up flag (bit 0) + * @refcnt: tipc_bearer reference counter * * Note: media-specific code is responsible for initialization of the fields * indicated below when a bearer is enabled; TIPC's generic bearer code takes diff --git a/net/tipc/core.c b/net/tipc/core.c index c2ff42900b53..5cc1f0307215 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -81,8 +81,6 @@ static int __net_init tipc_init_net(struct net *net) if (err) goto out_nametbl; - INIT_LIST_HEAD(&tn->dist_queue); - err = tipc_bcast_init(net); if (err) goto out_bclink; diff --git a/net/tipc/core.h b/net/tipc/core.h index 1d57a4d3b05e..03de7b213f55 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -3,6 +3,7 @@ * * Copyright (c) 2005-2006, 2013-2018 Ericsson AB * Copyright (c) 2005-2007, 2010-2013, Wind River Systems + * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -132,9 +133,6 @@ struct tipc_net { spinlock_t nametbl_lock; struct name_table *nametbl; - /* Name dist queue */ - struct list_head dist_queue; - /* Topology subscription server */ struct tipc_topsrv *topsrv; atomic_t subscription_count; @@ -213,6 +211,17 @@ static inline u32 tipc_net_hash_mixes(struct net *net, int tn_rand) return net_hash_mix(&init_net) ^ net_hash_mix(net) ^ tn_rand; } +static inline u32 hash128to32(char *bytes) +{ + __be32 *tmp = (__be32 *)bytes; + u32 res; + + res = ntohl(tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3]); + if (likely(res)) + return res; + return ntohl(tmp[0] | tmp[1] | tmp[2] | tmp[3]); +} + #ifdef CONFIG_SYSCTL int tipc_register_sysctl(void); void tipc_unregister_sysctl(void); diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 740ab9ae41a6..f4fca8f7f63f 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/** +/* * net/tipc/crypto.c: TIPC crypto for key handling & packet en/decryption * * Copyright (c) 2019, Ericsson AB @@ -51,7 +51,7 @@ #define TIPC_REKEYING_INTV_DEF 
(60 * 24) /* default: 1 day */ -/** +/* * TIPC Key ids */ enum { @@ -63,7 +63,7 @@ enum { KEY_MAX = KEY_3, }; -/** +/* * TIPC Crypto statistics */ enum { @@ -90,7 +90,7 @@ int sysctl_tipc_max_tfms __read_mostly = TIPC_MAX_TFMS_DEF; /* Key exchange switch, default: on */ int sysctl_tipc_key_exchange_enabled __read_mostly = 1; -/** +/* * struct tipc_key - TIPC keys' status indicator * * 7 6 5 4 3 2 1 0 @@ -123,6 +123,8 @@ struct tipc_key { /** * struct tipc_tfm - TIPC TFM structure to form a list of TFMs + * @tfm: cipher handle/key + * @list: linked list of TFMs */ struct tipc_tfm { struct crypto_aead *tfm; @@ -138,7 +140,7 @@ struct tipc_tfm { * @salt: the key's SALT value * @authsize: authentication tag size (max = 16) * @mode: crypto mode is applied to the key - * @hint[]: a hint for user key + * @hint: a hint for user key * @rcu: struct rcu_head * @key: the aead key * @gen: the key's generation @@ -166,6 +168,7 @@ struct tipc_aead { /** * struct tipc_crypto_stats - TIPC Crypto statistics + * @stat: array of crypto statistics */ struct tipc_crypto_stats { unsigned int stat[MAX_STATS]; @@ -194,6 +197,7 @@ struct tipc_crypto_stats { * @key_master: flag indicates if master key exists * @legacy_user: flag indicates if a peer joins w/o master key (for bwd comp.) 
* @nokey: no key indication + * @flags: combined flags field * @lock: tipc_key lock */ struct tipc_crypto { @@ -324,6 +328,8 @@ do { \ /** * tipc_aead_key_validate - Validate a AEAD user key + * @ukey: pointer to user key data + * @info: netlink info pointer */ int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info) { @@ -477,6 +483,7 @@ static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val) /** * tipc_aead_tfm_next - Move TFM entry to the next one in list and return it + * @aead: the AEAD key pointer */ static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead) { @@ -714,9 +721,9 @@ static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, * @__dnode: TIPC dest node if "known" * * Return: - * 0 : if the encryption has completed - * -EINPROGRESS/-EBUSY : if a callback will be performed - * < 0 : the encryption has failed + * * 0 : if the encryption has completed + * * -EINPROGRESS/-EBUSY : if a callback will be performed + * * < 0 : the encryption has failed */ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b, @@ -870,9 +877,9 @@ static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err) * @b: TIPC bearer where the message has been received * * Return: - * 0 : if the decryption has completed - * -EINPROGRESS/-EBUSY : if a callback will be performed - * < 0 : the decryption has failed + * * 0 : if the decryption has completed + * * -EINPROGRESS/-EBUSY : if a callback will be performed + * * < 0 : the decryption has failed */ static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b) @@ -1001,7 +1008,7 @@ static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr) * tipc_ehdr_validate - Validate an encryption message * @skb: the message buffer * - * Returns "true" if this is a valid encryption message, otherwise "false" + * Return: "true" if this is a valid encryption message, otherwise "false" */ 
bool tipc_ehdr_validate(struct sk_buff *skb) { @@ -1674,12 +1681,12 @@ static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb, * Otherwise, the skb is freed! * * Return: - * 0 : the encryption has succeeded (or no encryption) - * -EINPROGRESS/-EBUSY : the encryption is ongoing, a callback will be made - * -ENOKEK : the encryption has failed due to no key - * -EKEYREVOKED : the encryption has failed due to key revoked - * -ENOMEM : the encryption has failed due to no memory - * < 0 : the encryption has failed due to other reasons + * * 0 : the encryption has succeeded (or no encryption) + * * -EINPROGRESS/-EBUSY : the encryption is ongoing, a callback will be made + * * -ENOKEK : the encryption has failed due to no key + * * -EKEYREVOKED : the encryption has failed due to key revoked + * * -ENOMEM : the encryption has failed due to no memory + * * < 0 : the encryption has failed due to other reasons */ int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, struct tipc_bearer *b, struct tipc_media_addr *dst, @@ -1799,12 +1806,12 @@ exit: * cluster key(s) can be taken for decryption (- recursive). 
* * Return: - * 0 : the decryption has successfully completed - * -EINPROGRESS/-EBUSY : the decryption is ongoing, a callback will be made - * -ENOKEY : the decryption has failed due to no key - * -EBADMSG : the decryption has failed due to bad message - * -ENOMEM : the decryption has failed due to no memory - * < 0 : the decryption has failed due to other reasons + * * 0 : the decryption has successfully completed + * * -EINPROGRESS/-EBUSY : the decryption is ongoing, a callback will be made + * * -ENOKEY : the decryption has failed due to no key + * * -EBADMSG : the decryption has failed due to bad message + * * -ENOMEM : the decryption has failed due to no memory + * * < 0 : the decryption has failed due to other reasons */ int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, struct sk_buff **skb, struct tipc_bearer *b) diff --git a/net/tipc/crypto.h b/net/tipc/crypto.h index e71193bd5e36..ce7d4cc8a9e0 100644 --- a/net/tipc/crypto.h +++ b/net/tipc/crypto.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/** +/* * net/tipc/crypto.h: Include file for TIPC crypto * * Copyright (c) 2019, Ericsson AB @@ -53,7 +53,7 @@ #define TIPC_AES_GCM_IV_SIZE 12 #define TIPC_AES_GCM_TAG_SIZE 16 -/** +/* * TIPC crypto modes: * - CLUSTER_KEY: * One single key is used for both TX & RX in all nodes in the cluster. 
@@ -69,7 +69,7 @@ enum { extern int sysctl_tipc_max_tfms __read_mostly; extern int sysctl_tipc_key_exchange_enabled __read_mostly; -/** +/* * TIPC encryption message format: * * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 diff --git a/net/tipc/discover.c b/net/tipc/discover.c index d4ecacddb40c..5380f605b851 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -74,6 +74,7 @@ struct tipc_discoverer { /** * tipc_disc_init_msg - initialize a link setup message * @net: the applicable net namespace + * @skb: buffer containing message * @mtyp: message type (request or response) * @b: ptr to bearer issuing message */ @@ -341,7 +342,7 @@ exit: * @dest: destination address for request messages * @skb: pointer to created frame * - * Returns 0 if successful, otherwise -errno. + * Return: 0 if successful, otherwise -errno. */ int tipc_disc_create(struct net *net, struct tipc_bearer *b, struct tipc_media_addr *dest, struct sk_buff **skb) @@ -380,7 +381,7 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b, /** * tipc_disc_delete - destroy object sending periodic link setup requests - * @d: ptr to link duest structure + * @d: ptr to link dest structure */ void tipc_disc_delete(struct tipc_discoverer *d) { diff --git a/net/tipc/group.c b/net/tipc/group.c index b1fcd2ad5ecf..3e137d8c9d2f 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -2,6 +2,7 @@ * net/tipc/group.c: TIPC group messaging code * * Copyright (c) 2017, Ericsson AB + * Copyright (c) 2020, Red Hat Inc * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -359,7 +360,7 @@ struct tipc_nlist *tipc_group_dests(struct tipc_group *grp) return &grp->dests; } -void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, +void tipc_group_self(struct tipc_group *grp, struct tipc_service_range *seq, int *scope) { seq->type = grp->type; diff --git a/net/tipc/group.h b/net/tipc/group.h index 76b4e5a7b39d..ea4c3be64c78 100644 --- a/net/tipc/group.h +++ b/net/tipc/group.h @@ -2,6 +2,7 @@ * net/tipc/group.h: Include file for TIPC group unicast/multicast functions * * Copyright (c) 2017, Ericsson AB + * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -50,7 +51,7 @@ void tipc_group_delete(struct net *net, struct tipc_group *grp); void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port, u32 instance); struct tipc_nlist *tipc_group_dests(struct tipc_group *grp); -void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, +void tipc_group_self(struct tipc_group *grp, struct tipc_service_range *seq, int *scope); u32 tipc_group_exclude(struct tipc_group *grp); void tipc_group_filter_msg(struct tipc_group *grp, diff --git a/net/tipc/link.c b/net/tipc/link.c index 06b880da2a8e..115109259430 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -120,6 +120,34 @@ struct tipc_stats { * @reasm_buf: head of partially reassembled inbound message fragments * @bc_rcvr: marks that this is a broadcast receiver link * @stats: collects statistics regarding link activity + * @session: session to be used by link + * @snd_nxt_state: next send seq number + * @rcv_nxt_state: next rcv seq number + * @in_session: have received ACTIVATE_MSG from peer + * @active: link is active + * @if_name: associated interface name + * @rst_cnt: link reset counter + * @drop_point: seq number for failover handling (FIXME) + * @failover_reasm_skb: saved failover msg ptr (FIXME) + * 
@failover_deferdq: deferred message queue for failover processing (FIXME) + * @transmq: the link's transmit queue + * @backlog: link's backlog by priority (importance) + * @snd_nxt: next sequence number to be used + * @rcv_unacked: # messages read by user, but not yet acked back to peer + * @deferdq: deferred receive queue + * @window: sliding window size for congestion handling + * @min_win: minimal send window to be used by link + * @ssthresh: slow start threshold for congestion handling + * @max_win: maximal send window to be used by link + * @cong_acks: congestion acks for congestion avoidance (FIXME) + * @checkpoint: seq number for congestion window size handling + * @reasm_tnlmsg: fragmentation/reassembly area for tunnel protocol message + * @last_gap: last gap ack blocks for bcast (FIXME) + * @last_ga: ptr to gap ack blocks + * @bc_rcvlink: the peer specific link used for broadcast reception + * @bc_sndlink: the namespace global link used for broadcast sending + * @nack_state: bcast nack state + * @bc_peer_is_up: peer has acked the bcast init msg */ struct tipc_link { u32 addr; @@ -450,7 +478,6 @@ u32 tipc_link_state(struct tipc_link *l) * @min_win: minimal send window to be used by link * @max_win: maximal send window to be used by link * @session: session to be used by link - * @ownnode: identity of own node * @peer: node id of peer node * @peer_caps: bitmap describing peer node capabilities * @bc_sndlink: the namespace global link used for broadcast sending @@ -458,8 +485,10 @@ u32 tipc_link_state(struct tipc_link *l) * @inputq: queue to put messages ready for delivery * @namedq: queue to put binding table update messages ready for delivery * @link: return value, pointer to put the created link + * @self: local unicast link id + * @peer_id: 128-bit ID of peer * - * Returns true if link was created, otherwise false + * Return: true if link was created, otherwise false */ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, 
char net_plane, u32 mtu, int priority, @@ -532,8 +561,13 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, * @inputq: queue to put messages ready for delivery * @namedq: queue to put binding table update messages ready for delivery * @link: return value, pointer to put the created link + * @ownnode: identity of own node + * @peer: node id of peer node + * @peer_id: 128-bit ID of peer + * @peer_caps: bitmap describing peer node capabilities + * @bc_sndlink: the namespace global link used for broadcast sending * - * Returns true if link was created, otherwise false + * Return: true if link was created, otherwise false */ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, u8 *peer_id, int mtu, u32 min_win, u32 max_win, u16 peer_caps, @@ -788,7 +822,7 @@ static void link_profile_stats(struct tipc_link *l) * tipc_link_too_silent - check if link is "too silent" * @l: tipc link to be checked * - * Returns true if the link 'silent_intv_cnt' is about to reach the + * Return: true if the link 'silent_intv_cnt' is about to reach the * 'abort_limit' value, otherwise false */ bool tipc_link_too_silent(struct tipc_link *l) @@ -990,13 +1024,12 @@ void tipc_link_reset(struct tipc_link *l) * @xmitq: returned list of packets to be sent by caller * * Consumes the buffer chain. 
- * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted + * Return: 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS */ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff_head *xmitq) { - struct tipc_msg *hdr = buf_msg(skb_peek(list)); struct sk_buff_head *backlogq = &l->backlogq; struct sk_buff_head *transmq = &l->transmq; struct sk_buff *skb, *_skb; @@ -1004,13 +1037,18 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, u16 ack = l->rcv_nxt - 1; u16 seqno = l->snd_nxt; int pkt_cnt = skb_queue_len(list); - int imp = msg_importance(hdr); unsigned int mss = tipc_link_mss(l); unsigned int cwin = l->window; unsigned int mtu = l->mtu; + struct tipc_msg *hdr; bool new_bundle; int rc = 0; + int imp; + if (pkt_cnt <= 0) + return 0; + + hdr = buf_msg(skb_peek(list)); if (unlikely(msg_size(hdr) > mtu)) { pr_warn("Too large msg, purging xmit list %d %d %d %d %d!\n", skb_queue_len(list), msg_user(hdr), @@ -1019,6 +1057,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, return -EMSGSIZE; } + imp = msg_importance(hdr); /* Allow oversubscription of one data msg per source at congestion */ if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) { if (imp == TIPC_SYSTEM_IMPORTANCE) { @@ -1260,7 +1299,7 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, pr_warn("Dropping received illegal msg type\n"); kfree_skb(skb); return true; - }; + } } /* tipc_link_input - process packet that has passed link protocol check @@ -2376,7 +2415,7 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, if (!msg_peer_node_is_up(hdr)) return rc; - /* Open when peer ackowledges our bcast init msg (pkt #1) */ + /* Open when peer acknowledges our bcast init msg (pkt #1) */ if (msg_ack(hdr)) l->bc_peer_is_up = true; @@ -2505,7 +2544,7 @@ void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win) 
} /** - * link_reset_stats - reset link statistics + * tipc_link_reset_stats - reset link statistics * @l: pointer to link */ void tipc_link_reset_stats(struct tipc_link *l) diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 6dce2abf436e..48fac3b17e40 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -108,7 +108,7 @@ const int tipc_max_domain_size = sizeof(struct tipc_mon_domain); */ static int dom_rec_len(struct tipc_mon_domain *dom, u16 mcnt) { - return ((void *)&dom->members - (void *)dom) + (mcnt * sizeof(u32)); + return (offsetof(struct tipc_mon_domain, members)) + (mcnt * sizeof(u32)); } /* dom_size() : calculate size of own domain based on number of peers diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 32c79c59052b..e9263280a2d4 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -58,11 +58,13 @@ static unsigned int align(unsigned int i) /** * tipc_buf_acquire - creates a TIPC message buffer * @size: message size (including TIPC header) + * @gfp: memory allocation flags * - * Returns a new buffer with data pointers set to the specified size. + * Return: a new buffer with data pointers set to the specified size. * - * NOTE: Headroom is reserved to allow prepending of a data link header. - * There may also be unrequested tailroom present at the buffer's end. + * NOTE: + * Headroom is reserved to allow prepending of a data link header. + * There may also be unrequested tailroom present at the buffer's end. 
*/ struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp) { @@ -115,10 +117,6 @@ struct sk_buff *tipc_msg_create(uint user, uint type, msg_set_origport(msg, oport); msg_set_destport(msg, dport); msg_set_errcode(msg, errcode); - if (hdr_sz > SHORT_H_SIZE) { - msg_set_orignode(msg, onode); - msg_set_destnode(msg, dnode); - } return buf; } @@ -207,8 +205,9 @@ err: * @m: the data to be appended * @mss: max allowable size of buffer * @dlen: size of data to be appended - * @txq: queue to appand to - * Returns the number og 1k blocks appended or errno value + * @txq: queue to append to + * + * Return: the number of 1k blocks appended or errno value */ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, int mss, struct sk_buff_head *txq) @@ -312,7 +311,7 @@ bool tipc_msg_validate(struct sk_buff **_skb) * @pktmax: max size of a fragment incl. the header * @frags: returned fragment skb list * - * Returns 0 if the fragmentation is successful, otherwise: -EINVAL + * Return: 0 if the fragmentation is successful, otherwise: -EINVAL * or -ENOMEM */ int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, @@ -367,6 +366,7 @@ error: * tipc_msg_build - create buffer chain containing specified header and data * @mhdr: Message header, to be prepended to data * @m: User message + * @offset: buffer offset for fragmented messages (FIXME) * @dsz: Total length of user data * @pktmax: Max packet size that can be used * @list: Buffer or chain of buffers to be returned to caller @@ -374,7 +374,7 @@ error: * Note that the recursive call we are making here is safe, since it can * logically go only one further level down. 
* - * Returns message data size or errno: -ENOMEM, -EFAULT + * Return: message data size or errno: -ENOMEM, -EFAULT */ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int pktmax, struct sk_buff_head *list) @@ -485,7 +485,7 @@ error: * @msg: message to be appended * @max: max allowable size for the bundle buffer * - * Returns "true" if bundling has been performed, otherwise "false" + * Return: "true" if bundling has been performed, otherwise "false" */ static bool tipc_msg_bundle(struct sk_buff *bskb, struct tipc_msg *msg, u32 max) @@ -580,9 +580,9 @@ bundle: * @skb: buffer to be extracted from. * @iskb: extracted inner buffer, to be returned * @pos: position in outer message of msg to be extracted. - * Returns position of next msg + * Returns position of next msg. * Consumes outer buffer when last packet extracted - * Returns true when there is an extracted buffer, otherwise false + * Return: true when there is an extracted buffer, otherwise false */ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) { @@ -626,7 +626,7 @@ none: * @skb: buffer containing message to be reversed; will be consumed * @err: error code to be set in message, if any * Replaces consumed buffer with new one when successful - * Returns true if success, otherwise false + * Return: true if success, otherwise false */ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) { @@ -698,10 +698,11 @@ bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy) /** * tipc_msg_lookup_dest(): try to find new destination for named message + * @net: pointer to associated network namespace * @skb: the buffer containing the message. 
* @err: error code to be used by caller if lookup fails * Does not consume buffer - * Returns true if a destination is found, false otherwise + * Return: true if a destination is found, false otherwise */ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) { diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index fe4edce459ad..6cf57c3bfa27 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -50,6 +50,8 @@ struct distr_queue_item { /** * publ_to_item - add publication info to a publication message + * @p: publication info + * @i: location of item in the message */ static void publ_to_item(struct distr_item *i, struct publication *p) { @@ -62,6 +64,10 @@ static void publ_to_item(struct distr_item *i, struct publication *p) /** * named_prepare_buf - allocate & initialize a publication message + * @net: the associated network namespace + * @type: message type + * @size: payload size + * @dest: destination node * * The buffer returned is of size INT_H_SIZE + payload size */ @@ -83,6 +89,8 @@ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, /** * tipc_named_publish - tell other nodes about a new publication by this node + * @net: the associated network namespace + * @publ: the new publication */ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ) { @@ -111,6 +119,8 @@ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ) /** * tipc_named_withdraw - tell other nodes about a withdrawn publication by this node + * @net: the associated network namespace + * @publ: the withdrawn publication */ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) { @@ -138,9 +148,11 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) /** * named_distribute - prepare name info for bulk distribution to another node + * @net: the associated network namespace * @list: list of messages (buffers) to be returned from this 
function * @dnode: node to be updated * @pls: linked list of publication items to be packed into buffer chain + * @seqno: sequence number for this message */ static void named_distribute(struct net *net, struct sk_buff_head *list, u32 dnode, struct list_head *pls, u16 seqno) @@ -194,6 +206,9 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, /** * tipc_named_node_up - tell specified node about all publications by this node + * @net: the associated network namespace + * @dnode: destination node + * @capabilities: peer node's capabilities */ void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities) { @@ -217,6 +232,9 @@ void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities) /** * tipc_publ_purge - remove publication associated with a failed node + * @net: the associated network namespace + * @publ: the publication to remove + * @addr: failed node's address * * Invoked for each publication issued by a newly failed node. * Removes publication structure from name table & deletes it. 
@@ -244,24 +262,6 @@ static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr) kfree_rcu(p, rcu); } -/** - * tipc_dist_queue_purge - remove deferred updates from a node that went down - */ -static void tipc_dist_queue_purge(struct net *net, u32 addr) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct distr_queue_item *e, *tmp; - - spin_lock_bh(&tn->nametbl_lock); - list_for_each_entry_safe(e, tmp, &tn->dist_queue, next) { - if (e->node != addr) - continue; - list_del(&e->next); - kfree(e); - } - spin_unlock_bh(&tn->nametbl_lock); -} - void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr, u16 capabilities) { @@ -272,7 +272,6 @@ void tipc_publ_notify(struct net *net, struct list_head *nsub_list, list_for_each_entry_safe(publ, tmp, nsub_list, binding_node) tipc_publ_purge(net, publ, addr); - tipc_dist_queue_purge(net, addr); spin_lock_bh(&tn->nametbl_lock); if (!(capabilities & TIPC_NAMED_BCAST)) nt->rc_dests--; @@ -282,9 +281,13 @@ void tipc_publ_notify(struct net *net, struct list_head *nsub_list, /** * tipc_update_nametbl - try to process a nametable update and notify * subscribers + * @net: the associated network namespace + * @i: location of item in the message + * @node: node address + * @dtype: name distributor message type * * tipc_nametbl_lock must be held. - * Returns the publication item if successful, otherwise NULL. + * Return: the publication item if successful, otherwise NULL. 
*/ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, u32 node, u32 dtype) @@ -366,6 +369,10 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq, /** * tipc_named_rcv - process name table update messages sent by another node + * @net: the associated network namespace + * @namedq: queue to receive from + * @rcv_nxt: store last received seqno here + * @open: last bulk msg was received (FIXME) */ void tipc_named_rcv(struct net *net, struct sk_buff_head *namedq, u16 *rcv_nxt, bool *open) @@ -393,6 +400,7 @@ void tipc_named_rcv(struct net *net, struct sk_buff_head *namedq, /** * tipc_named_reinit - re-initialize local publications + * @net: the associated network namespace * * This routine is called whenever TIPC networking is enabled. * All name table entries published by this node are updated to reflect diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h index 092323158f06..e231e6964d61 100644 --- a/net/tipc/name_distr.h +++ b/net/tipc/name_distr.h @@ -46,7 +46,7 @@ * @type: name sequence type * @lower: name sequence lower bound * @upper: name sequence upper bound - * @ref: publishing port reference + * @port: publishing port reference * @key: publication key * * ===> All fields are stored in network byte order. <=== diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 2ac33d32edc2..ee5ac40ea2b6 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -3,6 +3,7 @@ * * Copyright (c) 2000-2006, 2014-2018, Ericsson AB * Copyright (c) 2004-2008, 2010-2014, Wind River Systems + * Copyright (c) 2020, Red Hat Inc * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -103,7 +104,8 @@ RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks, * range match * @sr: the service range pointer as a loop cursor * @sc: the pointer to tipc service which holds the service range rbtree - * @start, end: the range (end >= start) for matching + * @start: beginning of the search range (end >= start) for matching + * @end: end of the search range (end >= start) for matching */ #define service_range_foreach_match(sr, sc, start, end) \ for (sr = service_range_match_first((sc)->ranges.rb_node, \ @@ -117,7 +119,8 @@ RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks, /** * service_range_match_first - find first service range matching a range * @n: the root node of service range rbtree for searching - * @start, end: the range (end >= start) for matching + * @start: beginning of the search range (end >= start) for matching + * @end: end of the search range (end >= start) for matching * * Return: the leftmost service range node in the rbtree that overlaps the * specific range if any. Otherwise, returns NULL. @@ -166,7 +169,8 @@ static struct service_range *service_range_match_first(struct rb_node *n, /** * service_range_match_next - find next service range matching a range * @n: a node in service range rbtree from which the searching starts - * @start, end: the range (end >= start) for matching + * @start: beginning of the search range (end >= start) for matching + * @end: end of the search range (end >= start) for matching * * Return: the next service range node to the given node in the rbtree that * overlaps the specific range if any. Otherwise, returns NULL. 
@@ -218,6 +222,13 @@ static int hash(int x) /** * tipc_publ_create - create a publication structure + * @type: name sequence type + * @lower: name sequence lower bound + * @upper: name sequence upper bound + * @scope: publication scope + * @node: network address of publishing socket + * @port: publishing port + * @key: publication key */ static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper, u32 scope, u32 node, u32 port, @@ -245,6 +256,8 @@ static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper, /** * tipc_service_create - create a service structure for the specified 'type' + * @type: service type + * @hd: name_table services list * * Allocates a single range structure and sets it to all 0's. */ @@ -361,6 +374,9 @@ err: /** * tipc_service_remove_publ - remove a publication from a service + * @sr: service_range to remove publication from + * @node: target node + * @key: target publication key */ static struct publication *tipc_service_remove_publ(struct service_range *sr, u32 node, u32 key) @@ -377,7 +393,7 @@ static struct publication *tipc_service_remove_publ(struct service_range *sr, return NULL; } -/** +/* * Code reused: time_after32() for the same purpose */ #define publication_after(pa, pb) time_after32((pa)->id, (pb)->id) @@ -395,6 +411,8 @@ static int tipc_publ_sort(void *priv, struct list_head *a, * tipc_service_subscribe - attach a subscription, and optionally * issue the prescribed number of events if there is any service * range overlapping with the requested range + * @service: the tipc_service to attach the @sub to + * @sub: the subscription to attach */ static void tipc_service_subscribe(struct tipc_service *service, struct tipc_subscription *sub) @@ -403,12 +421,12 @@ static void tipc_service_subscribe(struct tipc_service *service, struct publication *p, *first, *tmp; struct list_head publ_list; struct service_range *sr; - struct tipc_name_seq ns; + struct tipc_service_range r; u32 filter; - ns.type = 
tipc_sub_read(sb, seq.type); - ns.lower = tipc_sub_read(sb, seq.lower); - ns.upper = tipc_sub_read(sb, seq.upper); + r.type = tipc_sub_read(sb, seq.type); + r.lower = tipc_sub_read(sb, seq.lower); + r.upper = tipc_sub_read(sb, seq.upper); filter = tipc_sub_read(sb, filter); tipc_sub_get(sub); @@ -418,7 +436,7 @@ static void tipc_service_subscribe(struct tipc_service *service, return; INIT_LIST_HEAD(&publ_list); - service_range_foreach_match(sr, service, ns.lower, ns.upper) { + service_range_foreach_match(sr, service, r.lower, r.upper) { first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { if (filter & TIPC_SUB_PORTS) @@ -528,14 +546,16 @@ exit: /** * tipc_nametbl_translate - perform service instance to socket translation - * - * On entry, 'dnode' is the search domain used during translation. + * @net: network namespace + * @type: message type + * @instance: message instance + * @dnode: the search domain used during translation * * On exit: * - if translation is deferred to another node, leave 'dnode' unchanged and - * return 0 + * return 0 * - if translation is attempted and succeeds, set 'dnode' to the publishing - * node and return the published (non-zero) port number + * node and return the published (non-zero) port number * - if translation is attempted and fails, set 'dnode' to 0 and return 0 * * Note that for legacy users (node configured with Z.C.N address format) the @@ -756,6 +776,11 @@ exit: /** * tipc_nametbl_withdraw - withdraw a service binding + * @net: network namespace + * @type: service type + * @lower: service range lower bound + * @upper: service range upper bound + * @key: target publication key */ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 upper, u32 key) @@ -791,6 +816,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, /** * tipc_nametbl_subscribe - add a subscription object to the name table + * @sub: subscription to add */ bool tipc_nametbl_subscribe(struct tipc_subscription *sub) { @@ 
-821,6 +847,7 @@ bool tipc_nametbl_subscribe(struct tipc_subscription *sub) /** * tipc_nametbl_unsubscribe - remove a subscription object from name table + * @sub: subscription to remove */ void tipc_nametbl_unsubscribe(struct tipc_subscription *sub) { @@ -870,7 +897,9 @@ int tipc_nametbl_init(struct net *net) } /** - * tipc_service_delete - purge all publications for a service and delete it + * tipc_service_delete - purge all publications for a service and delete it + * @net: the associated network namespace + * @sc: tipc_service to delete */ static void tipc_service_delete(struct net *net, struct tipc_service *sc) { diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 8064e1986e2c..5a82a01369d6 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -60,8 +60,8 @@ struct tipc_group; * @key: publication key, unique across the cluster * @id: publication id * @binding_node: all publications from the same node which bound this one - * - Remote publications: in node->publ_list - * Used by node/name distr to withdraw publications when node is lost + * - Remote publications: in node->publ_list; + * Used by node/name distr to withdraw publications when node is lost * - Local/node scope publications: in name_table->node_scope list * - Local/cluster scope publications: in name_table->cluster_scope list * @binding_sock: all publications from the same socket which bound this one @@ -92,13 +92,16 @@ struct publication { /** * struct name_table - table containing all existing port name publications - * @seq_hlist: name sequence hash lists + * @services: name sequence hash lists * @node_scope: all local publications with node scope * - used by name_distr during re-init of name table * @cluster_scope: all local publications with cluster scope * - used by name_distr to send bulk updates to new nodes * - used by name_distr during re-init of name table + * @cluster_scope_lock: lock for accessing @cluster_scope * @local_publ_count: number of publications issued 
by this node + * @rc_dests: destination node counter + * @snd_nxt: next sequence number to be used */ struct name_table { struct hlist_head services[TIPC_NAMETBL_SIZE]; diff --git a/net/tipc/net.c b/net/tipc/net.c index 0bb2323201da..a129f661bee3 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -132,7 +132,7 @@ static void tipc_net_finalize(struct net *net, u32 addr) tipc_named_reinit(net); tipc_sk_reinit(net); tipc_mon_reinit_self(net); - tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, + tipc_nametbl_publish(net, TIPC_NODE_STATE, addr, addr, TIPC_CLUSTER_SCOPE, 0, addr); } diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 1c7aa51cc2a3..5a1ce64039f7 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -118,7 +118,8 @@ static void tipc_tlv_init(struct sk_buff *skb, u16 type) skb_put(skb, sizeof(struct tlv_desc)); } -static int tipc_tlv_sprintf(struct sk_buff *skb, const char *fmt, ...) +static __printf(2, 3) int tipc_tlv_sprintf(struct sk_buff *skb, + const char *fmt, ...) 
{ int n; u16 len; @@ -212,12 +213,14 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, } info.attrs = attrbuf; - err = nlmsg_parse_deprecated(cb.nlh, GENL_HDRLEN, attrbuf, - tipc_genl_family.maxattr, - tipc_genl_family.policy, NULL); - if (err) - goto err_out; + if (nlmsg_len(cb.nlh) > 0) { + err = nlmsg_parse_deprecated(cb.nlh, GENL_HDRLEN, attrbuf, + tipc_genl_family.maxattr, + tipc_genl_family.policy, NULL); + if (err) + goto err_out; + } do { int rem; @@ -588,7 +591,7 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg, return 0; tipc_tlv_sprintf(msg->rep, "\nLink <%s>\n", - nla_data(link[TIPC_NLA_LINK_NAME])); + (char *)nla_data(link[TIPC_NLA_LINK_NAME])); if (link[TIPC_NLA_LINK_BROADCAST]) { __fill_bc_link_stat(msg, prop, stats); @@ -695,7 +698,7 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg, link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]); link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP])); - nla_strlcpy(link_info.str, link[TIPC_NLA_LINK_NAME], + nla_strscpy(link_info.str, link[TIPC_NLA_LINK_NAME], TIPC_MAX_LINK_NAME); return tipc_add_tlv(msg->rep, TIPC_TLV_LINK_INFO, diff --git a/net/tipc/node.c b/net/tipc/node.c index d269ebe382e1..008670d1f43e 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -82,7 +82,7 @@ struct tipc_bclink_entry { /** * struct tipc_node - TIPC node structure * @addr: network address of node - * @ref: reference counter to node object + * @kref: reference counter to node object * @lock: rwlock governing access to structure * @net: the applicable net namespace * @hash: links to adjacent nodes in unsorted hash chain @@ -90,9 +90,11 @@ struct tipc_bclink_entry { * @namedq: pointer to name table input queue with name table messages * @active_links: bearer ids of active links, used as index into links[] array * @links: array containing references to all links to node + * @bc_entry: broadcast link entry * @action_flags: bit mask of different types of 
node actions * @state: connectivity state vs peer node * @preliminary: a preliminary node or not + * @failover_sent: failover sent or not * @sync_point: sequence number where synch/failover is finished * @list: links to adjacent nodes in sorted list of cluster's nodes * @working_links: number of working links to node (both active and standby) @@ -100,9 +102,16 @@ struct tipc_bclink_entry { * @capabilities: bitmap, indicating peer node's functional capabilities * @signature: node instance identifier * @link_id: local and remote bearer ids of changing link, if any + * @peer_id: 128-bit ID of peer + * @peer_id_string: ID string of peer * @publ_list: list of publications + * @conn_sks: list of connections (FIXME) + * @timer: node's keepalive timer + * @keepalive_intv: keepalive interval in milliseconds * @rcu: rcu struct for tipc_node * @delete_at: indicates the time for deleting a down node + * @peer_net: peer's net namespace + * @peer_hash_mix: hash for this peer (FIXME) * @crypto_rx: RX crypto handler */ struct tipc_node { @@ -267,6 +276,7 @@ char *tipc_node_get_id_str(struct tipc_node *node) #ifdef CONFIG_TIPC_CRYPTO /** * tipc_node_crypto_rx - Retrieve crypto RX handle from node + * @__n: target tipc_node * Note: node ref counter must be held first! */ struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n) @@ -814,6 +824,9 @@ static void tipc_node_timeout(struct timer_list *t) /** * __tipc_node_link_up - handle addition of link + * @n: target tipc_node + * @bearer_id: id of the bearer + * @xmitq: queue for messages to be xmited on * Node lock must be held by caller * Link becomes active (alone or shared) or standby, depending on its priority. 
*/ @@ -880,6 +893,9 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, /** * tipc_node_link_up - handle addition of link + * @n: target tipc_node + * @bearer_id: id of the bearer + * @xmitq: queue for messages to be xmited on * * Link becomes active (alone or shared) or standby, depending on its priority. */ @@ -900,10 +916,11 @@ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, * * This function is only called in a very special situation where link * failover can be already started on peer node but not on this node. - * This can happen when e.g. + * This can happen when e.g.:: + * * 1. Both links <1A-2A>, <1B-2B> down * 2. Link endpoint 2A up, but 1A still down (e.g. due to network - * disturbance, wrong session, etc.) + * disturbance, wrong session, etc.) * 3. Link <1B-2B> up * 4. Link endpoint 2A down (e.g. due to link tolerance timeout) * 5. Node 2 starts failover onto link <1B-2B> @@ -940,6 +957,10 @@ static void tipc_node_link_failover(struct tipc_node *n, struct tipc_link *l, /** * __tipc_node_link_down - handle loss of link + * @n: target tipc_node + * @bearer_id: id of the bearer + * @xmitq: queue for messages to be xmited on + * @maddr: output media address of the bearer */ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, @@ -1525,11 +1546,13 @@ static void node_lost_contact(struct tipc_node *n, /** * tipc_node_get_linkname - get the name of a link * + * @net: the applicable net namespace * @bearer_id: id of the bearer * @addr: peer node address * @linkname: link name output buffer + * @len: size of @linkname output buffer * - * Returns 0 on success + * Return: 0 on success */ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, char *linkname, size_t len) @@ -1638,17 +1661,17 @@ static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) return; default: return; - }; + } } /** - * tipc_node_xmit() is the general link level function for 
message sending + * tipc_node_xmit() - general link level function for message sending * @net: the applicable net namespace * @list: chain of buffers containing message * @dnode: address of destination node * @selector: a number used for deterministic link selection * Consumes the buffer chain. - * Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF + * Return: 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF */ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector) @@ -1881,9 +1904,11 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id /** * tipc_node_check_state - check and if necessary update node state + * @n: target tipc_node * @skb: TIPC packet * @bearer_id: identity of bearer delivering the packet - * Returns true if state and msg are ok, otherwise false + * @xmitq: queue for messages to be xmited on + * Return: true if state and msg are ok, otherwise false */ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int bearer_id, struct sk_buff_head *xmitq) @@ -2181,7 +2206,11 @@ void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, &xmitq); else if (prop == TIPC_NLA_PROP_MTU) tipc_link_set_mtu(e->link, b->mtu); + + /* Update MTU for node link entry */ + e->mtu = tipc_link_mss(e->link); } + tipc_node_write_unlock(n); tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr, NULL); } @@ -2195,6 +2224,9 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) struct tipc_net *tn = net_generic(net, tipc_net_id); struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; struct tipc_node *peer, *temp_node; + u8 node_id[NODE_ID_LEN]; + u64 *w0 = (u64 *)&node_id[0]; + u64 *w1 = (u64 *)&node_id[8]; u32 addr; int err; @@ -2208,10 +2240,22 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) if (err) return err; - if (!attrs[TIPC_NLA_NET_ADDR]) - return -EINVAL; + /* attrs[TIPC_NLA_NET_NODEID] and 
attrs[TIPC_NLA_NET_ADDR] are + * mutually exclusive cases + */ + if (attrs[TIPC_NLA_NET_ADDR]) { + addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); + if (!addr) + return -EINVAL; + } - addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); + if (attrs[TIPC_NLA_NET_NODEID]) { + if (!attrs[TIPC_NLA_NET_NODEID_W1]) + return -EINVAL; + *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]); + *w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]); + addr = hash128to32(node_id); + } if (in_own_node(net, addr)) return -ENOTSUPP; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e795a8a2955b..cebcc104dc70 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1,8 +1,9 @@ /* * net/tipc/socket.c: TIPC socket API * - * Copyright (c) 2001-2007, 2012-2017, Ericsson AB + * Copyright (c) 2001-2007, 2012-2019, Ericsson AB * Copyright (c) 2004-2008, 2010-2013, Wind River Systems + * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -79,19 +80,32 @@ struct sockaddr_pair { * @maxnagle: maximum size of msg which can be subject to nagle * @portid: unique port identity in TIPC socket hash table * @phdr: preformatted message header used when sending messages - * #cong_links: list of congested links + * @cong_links: list of congested links * @publications: list of publications for port * @blocking_link: address of the congested link we are currently sleeping on * @pub_count: total # of publications port has made during its lifetime * @conn_timeout: the time we can wait for an unresponded setup request + * @probe_unacked: probe has not received ack yet * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue * @cong_link_cnt: number of congested links * @snt_unacked: # messages sent by socket, and not yet acked by peer + * @snd_win: send window size + * @peer_caps: peer capabilities mask * @rcv_unacked: # messages read by user, but not yet acked back to peer + * @rcv_win: receive window size * @peer: 
'connected' peer for dgram/rdm * @node: hash table node * @mc_method: cookie for use between socket and broadcast layer * @rcu: rcu struct for tipc_sock + * @group: TIPC communications group + * @oneway: message count in one direction (FIXME) + * @nagle_start: current nagle value + * @snd_backlog: send backlog count + * @msg_acc: messages accepted; used in managing backlog and nagle + * @pkt_cnt: TIPC socket packet count + * @expect_ack: whether this TIPC socket is expecting an ack + * @nodelay: setsockopt() TIPC_NODELAY setting + * @group_is_open: TIPC socket group is fully open (FIXME) */ struct tipc_sock { struct sock sk; @@ -138,9 +152,9 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, bool kern); static void tipc_sk_timeout(struct timer_list *t); static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, - struct tipc_name_seq const *seq); + struct tipc_service_range const *seq); static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, - struct tipc_name_seq const *seq); + struct tipc_service_range const *seq); static int tipc_sk_leave(struct tipc_sock *tsk); static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid); static int tipc_sk_insert(struct tipc_sock *tsk); @@ -260,6 +274,7 @@ static void tsk_set_nagle(struct tipc_sock *tsk) /** * tsk_advance_rx_queue - discard first buffer in socket receive queue + * @sk: network socket * * Caller must hold socket lock */ @@ -288,6 +303,8 @@ static void tipc_sk_respond(struct sock *sk, struct sk_buff *skb, int err) /** * tsk_rej_rx_queue - reject all buffers in socket receive queue + * @sk: network socket + * @error: response error code * * Caller must hold socket lock */ @@ -441,7 +458,7 @@ static int tipc_sk_sock_err(struct socket *sock, long *timeout) * This routine creates additional data structures used by the TIPC socket, * initializes them, and links them together. 
* - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ static int tipc_sk_create(struct net *net, struct socket *sock, int protocol, int kern) @@ -606,7 +623,7 @@ static void __tipc_shutdown(struct socket *sock, int error) * are returned or discarded according to the "destination droppable" setting * specified for the message by the sender. * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ static int tipc_release(struct socket *sock) { @@ -644,75 +661,77 @@ static int tipc_release(struct socket *sock) } /** - * tipc_bind - associate or disassocate TIPC name(s) with a socket + * __tipc_bind - associate or disassocate TIPC name(s) with a socket * @sock: socket structure - * @uaddr: socket address describing name(s) and desired operation - * @uaddr_len: size of socket address data structure + * @skaddr: socket address describing name(s) and desired operation + * @alen: size of socket address data structure * * Name and name sequence binding is indicated using a positive scope value; * a negative scope value unbinds the specified name. Specifying no name * (i.e. a socket address length of 0) unbinds all names from the socket. * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise * * NOTE: This routine doesn't need to take the socket lock since it doesn't * access any non-constant socket information. 
*/ -static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, - int uaddr_len) +static int __tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen) { - struct sock *sk = sock->sk; - struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr; - struct tipc_sock *tsk = tipc_sk(sk); - int res = -EINVAL; + struct sockaddr_tipc *addr = (struct sockaddr_tipc *)skaddr; + struct tipc_sock *tsk = tipc_sk(sock->sk); - lock_sock(sk); - if (unlikely(!uaddr_len)) { - res = tipc_sk_withdraw(tsk, 0, NULL); - goto exit; - } - if (tsk->group) { - res = -EACCES; - goto exit; - } - if (uaddr_len < sizeof(struct sockaddr_tipc)) { - res = -EINVAL; - goto exit; - } - if (addr->family != AF_TIPC) { - res = -EAFNOSUPPORT; - goto exit; - } + if (unlikely(!alen)) + return tipc_sk_withdraw(tsk, 0, NULL); - if (addr->addrtype == TIPC_ADDR_NAME) + if (addr->addrtype == TIPC_SERVICE_ADDR) addr->addr.nameseq.upper = addr->addr.nameseq.lower; - else if (addr->addrtype != TIPC_ADDR_NAMESEQ) { - res = -EAFNOSUPPORT; - goto exit; - } - if ((addr->addr.nameseq.type < TIPC_RESERVED_TYPES) && - (addr->addr.nameseq.type != TIPC_TOP_SRV) && - (addr->addr.nameseq.type != TIPC_CFG_SRV)) { - res = -EACCES; - goto exit; - } + if (tsk->group) + return -EACCES; - res = (addr->scope >= 0) ? 
- tipc_sk_publish(tsk, addr->scope, &addr->addr.nameseq) : - tipc_sk_withdraw(tsk, -addr->scope, &addr->addr.nameseq); -exit: - release_sock(sk); + if (addr->scope >= 0) + return tipc_sk_publish(tsk, addr->scope, &addr->addr.nameseq); + else + return tipc_sk_withdraw(tsk, -addr->scope, &addr->addr.nameseq); +} + +int tipc_sk_bind(struct socket *sock, struct sockaddr *skaddr, int alen) +{ + int res; + + lock_sock(sock->sk); + res = __tipc_bind(sock, skaddr, alen); + release_sock(sock->sk); return res; } +static int tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen) +{ + struct sockaddr_tipc *addr = (struct sockaddr_tipc *)skaddr; + + if (alen) { + if (alen < sizeof(struct sockaddr_tipc)) + return -EINVAL; + if (addr->family != AF_TIPC) + return -EAFNOSUPPORT; + if (addr->addrtype > TIPC_SERVICE_ADDR) + return -EAFNOSUPPORT; + if (addr->addr.nameseq.type < TIPC_RESERVED_TYPES) { + pr_warn_once("Can't bind to reserved service type %u\n", + addr->addr.nameseq.type); + return -EACCES; + } + } + return tipc_sk_bind(sock, skaddr, alen); +} + /** * tipc_getname - get port ID of socket or peer socket * @sock: socket structure * @uaddr: area for returned socket address * @peer: 0 = own ID, 1 = current peer ID, 2 = current/former peer ID * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise * * NOTE: This routine doesn't need to take the socket lock since it only * accesses socket information that is unchanging (or which changes in @@ -737,7 +756,7 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, addr->addr.id.node = tipc_own_addr(sock_net(sk)); } - addr->addrtype = TIPC_ADDR_ID; + addr->addrtype = TIPC_SOCKET_ADDR; addr->family = AF_TIPC; addr->scope = 0; addr->addr.name.domain = 0; @@ -751,7 +770,7 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, * @sock: socket for which to calculate the poll bits * @wait: ??? 
* - * Returns pollmask value + * Return: pollmask value * * COMMENTARY: * It appears that the usual socket locking mechanisms are not useful here @@ -813,9 +832,9 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), which has done all sanity checks - * Returns the number of bytes sent on success, or errno + * Return: the number of bytes sent on success, or errno */ -static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, +static int tipc_sendmcast(struct socket *sock, struct tipc_service_range *seq, struct msghdr *msg, size_t dlen, long timeout) { struct sock *sk = sock->sk; @@ -873,6 +892,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, /** * tipc_send_group_msg - send a message to a member in the group * @net: network namespace + * @tsk: tipc socket * @m: message to send * @mb: group member * @dnode: destination node @@ -928,7 +948,7 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), which has done all sanity checks - * Returns the number of bytes sent on success, or errno + * Return: the number of bytes sent on success, or errno */ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m, int dlen, long timeout) @@ -972,7 +992,7 @@ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m, * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), which has done all sanity checks - * Returns the number of bytes sent on success, or errno + * Return: the number of bytes sent on success, or errno */ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, int dlen, long timeout) @@ -1057,7 +1077,7 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), 
which has done all sanity checks - * Returns the number of bytes sent on success, or errno + * Return: the number of bytes sent on success, or errno */ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, int dlen, long timeout) @@ -1131,7 +1151,7 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), which has done all sanity checks - * Returns the number of bytes sent on success, or errno + * Return: the number of bytes sent on success, or errno */ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m, int dlen, long timeout) @@ -1168,6 +1188,7 @@ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m, /** * tipc_sk_mcast_rcv - Deliver multicast messages to all destination sockets + * @net: the associated network namespace * @arrvq: queue with arriving messages, to be cloned after destination lookup * @inputq: queue with cloned messages, delivered to socket after dest lookup * @@ -1307,6 +1328,8 @@ static void tipc_sk_push_backlog(struct tipc_sock *tsk, bool nagle_ack) * tipc_sk_conn_proto_rcv - receive a connection mng protocol message * @tsk: receiving socket * @skb: pointer to message buffer. + * @inputq: buffer list containing the buffers + * @xmitq: output message area */ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, struct sk_buff_head *inputq, @@ -1374,7 +1397,7 @@ exit: * and for 'SYN' messages on SOCK_SEQPACKET and SOCK_STREAM connections. * (Note: 'SYN+' is prohibited on SOCK_STREAM.) 
* - * Returns the number of bytes sent on success, or errno otherwise + * Return: the number of bytes sent on success, or errno otherwise */ static int tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) @@ -1400,7 +1423,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) bool syn = !tipc_sk_type_connectionless(sk); struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; - struct tipc_name_seq *seq; + struct tipc_service_range *seq; struct sk_buff_head pkts; u32 dport = 0, dnode = 0; u32 type = 0, inst = 0; @@ -1419,9 +1442,9 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (grp) { if (!dest) return tipc_send_group_bcast(sock, m, dlen, timeout); - if (dest->addrtype == TIPC_ADDR_NAME) + if (dest->addrtype == TIPC_SERVICE_ADDR) return tipc_send_group_anycast(sock, m, dlen, timeout); - if (dest->addrtype == TIPC_ADDR_ID) + if (dest->addrtype == TIPC_SOCKET_ADDR) return tipc_send_group_unicast(sock, m, dlen, timeout); if (dest->addrtype == TIPC_ADDR_MCAST) return tipc_send_group_mcast(sock, m, dlen, timeout); @@ -1441,7 +1464,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) return -EISCONN; if (tsk->published) return -EOPNOTSUPP; - if (dest->addrtype == TIPC_ADDR_NAME) { + if (dest->addrtype == TIPC_SERVICE_ADDR) { tsk->conn_type = dest->addr.name.name.type; tsk->conn_instance = dest->addr.name.name.instance; } @@ -1452,14 +1475,14 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (dest->addrtype == TIPC_ADDR_MCAST) return tipc_sendmcast(sock, seq, m, dlen, timeout); - if (dest->addrtype == TIPC_ADDR_NAME) { + if (dest->addrtype == TIPC_SERVICE_ADDR) { type = dest->addr.name.name.type; inst = dest->addr.name.name.instance; dnode = dest->addr.name.domain; dport = tipc_nametbl_translate(net, type, inst, &dnode); if (unlikely(!dport && !dnode)) return -EHOSTUNREACH; - } else if (dest->addrtype == TIPC_ADDR_ID) { 
+ } else if (dest->addrtype == TIPC_SOCKET_ADDR) { dnode = dest->addr.id.node; } else { return -EINVAL; @@ -1471,7 +1494,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(rc)) return rc; - if (dest->addrtype == TIPC_ADDR_NAME) { + if (dest->addrtype == TIPC_SERVICE_ADDR) { msg_set_type(hdr, TIPC_NAMED_MSG); msg_set_hdr_sz(hdr, NAMED_H_SIZE); msg_set_nametype(hdr, type); @@ -1479,7 +1502,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); msg_set_destnode(hdr, dnode); msg_set_destport(hdr, dport); - } else { /* TIPC_ADDR_ID */ + } else { /* TIPC_SOCKET_ADDR */ msg_set_type(hdr, TIPC_DIRECT_MSG); msg_set_lookup_scope(hdr, 0); msg_set_destnode(hdr, dnode); @@ -1519,7 +1542,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) * * Used for SOCK_STREAM data. * - * Returns the number of bytes sent on success (or partial success), + * Return: the number of bytes sent on success (or partial success), * or errno if no data sent */ static int tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz) @@ -1627,7 +1650,7 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) * * Used for SOCK_SEQPACKET messages. 
* - * Returns the number of bytes sent on success, or errno otherwise + * Return: the number of bytes sent on success, or errno otherwise */ static int tipc_send_packet(struct socket *sock, struct msghdr *m, size_t dsz) { @@ -1684,7 +1707,7 @@ static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) return; srcaddr->sock.family = AF_TIPC; - srcaddr->sock.addrtype = TIPC_ADDR_ID; + srcaddr->sock.addrtype = TIPC_SOCKET_ADDR; srcaddr->sock.scope = 0; srcaddr->sock.addr.id.ref = msg_origport(hdr); srcaddr->sock.addr.id.node = msg_orignode(hdr); @@ -1696,7 +1719,7 @@ static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) /* Group message users may also want to know sending member's id */ srcaddr->member.family = AF_TIPC; - srcaddr->member.addrtype = TIPC_ADDR_NAME; + srcaddr->member.addrtype = TIPC_SERVICE_ADDR; srcaddr->member.scope = 0; srcaddr->member.addr.name.name.type = msg_nametype(hdr); srcaddr->member.addr.name.name.instance = TIPC_SKB_CB(skb)->orig_member; @@ -1712,7 +1735,7 @@ static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) * * Note: Ancillary data is not captured if not requested by receiver. * - * Returns 0 if successful, otherwise errno + * Return: 0 if successful, otherwise errno */ static int tipc_sk_anc_data_recv(struct msghdr *m, struct sk_buff *skb, struct tipc_sock *tsk) @@ -1862,6 +1885,7 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) /** * tipc_recvmsg - receive packet-oriented message + * @sock: network socket * @m: descriptor for message info * @buflen: length of user buffer area * @flags: receive flags @@ -1869,7 +1893,7 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) * Used for SOCK_DGRAM, SOCK_RDM, and SOCK_SEQPACKET messages. * If the complete message doesn't fit in user area, truncate it. 
* - * Returns size of returned message data, errno otherwise + * Return: size of returned message data, errno otherwise */ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buflen, int flags) @@ -1970,6 +1994,7 @@ exit: /** * tipc_recvstream - receive stream-oriented data + * @sock: network socket * @m: descriptor for message info * @buflen: total size of user buffer area * @flags: receive flags @@ -1977,7 +2002,7 @@ exit: * Used for SOCK_STREAM messages only. If not enough data is available * will optionally wait for more; never truncates data. * - * Returns size of returned message data, errno otherwise + * Return: size of returned message data, errno otherwise */ static int tipc_recvstream(struct socket *sock, struct msghdr *m, size_t buflen, int flags) @@ -2155,7 +2180,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, * @tsk: TIPC socket * @skb: pointer to message buffer. * @xmitq: for Nagle ACK if any - * Returns true if message should be added to receive queue, false otherwise + * Return: true if message should be added to receive queue, false otherwise */ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb, struct sk_buff_head *xmitq) @@ -2269,7 +2294,7 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb, * TIPC_HIGH_IMPORTANCE (8 MB) * TIPC_CRITICAL_IMPORTANCE (16 MB) * - * Returns overload limit according to corresponding message importance + * Return: overload limit according to corresponding message importance */ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) { @@ -2292,12 +2317,12 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) * tipc_sk_filter_rcv - validate incoming message * @sk: socket * @skb: pointer to message. + * @xmitq: output message area (FIXME) * * Enqueues message on receive queue if acceptable; optionally handles * disconnect indication for a connected socket. 
* * Called with socket lock already taken - * */ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *xmitq) @@ -2387,6 +2412,7 @@ static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) * @inputq: list of incoming buffers with potentially different destinations * @sk: socket where the buffers should be enqueued * @dport: port number for the socket + * @xmitq: output queue * * Caller must hold socket lock */ @@ -2439,6 +2465,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, /** * tipc_sk_rcv - handle a chain of incoming buffers + * @net: the associated network namespace * @inputq: buffer list containing the buffers * Consumes all buffers in list until inputq is empty * Note: may be called in multiple threads referring to the same queue @@ -2531,7 +2558,7 @@ static bool tipc_sockaddr_is_sane(struct sockaddr_tipc *addr) * @destlen: size of socket address data structure * @flags: file-related flags associated with socket * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ static int tipc_connect(struct socket *sock, struct sockaddr *dest, int destlen, int flags) @@ -2624,7 +2651,7 @@ exit: * @sock: socket structure * @len: (unused) * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ static int tipc_listen(struct socket *sock, int len) { @@ -2676,8 +2703,9 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo) * @sock: listening socket * @new_sock: new socket that is to be connected * @flags: file-related flags associated with socket + * @kern: caused by kernel or by userspace? * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, bool kern) @@ -2756,7 +2784,7 @@ exit: * * Terminates connection (if necessary), then purges socket's receive queue. 
* - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ static int tipc_shutdown(struct socket *sock, int how) { @@ -2864,7 +2892,7 @@ static void tipc_sk_timeout(struct timer_list *t) } static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, - struct tipc_name_seq const *seq) + struct tipc_service_range const *seq) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); @@ -2892,7 +2920,7 @@ static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, } static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, - struct tipc_name_seq const *seq) + struct tipc_service_range const *seq) { struct net *net = sock_net(&tsk->sk); struct publication *publ; @@ -3039,7 +3067,7 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) struct net *net = sock_net(&tsk->sk); struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; - struct tipc_name_seq seq; + struct tipc_service_range seq; int rc; if (mreq->type < TIPC_RESERVED_TYPES) @@ -3076,7 +3104,7 @@ static int tipc_sk_leave(struct tipc_sock *tsk) { struct net *net = sock_net(&tsk->sk); struct tipc_group *grp = tsk->group; - struct tipc_name_seq seq; + struct tipc_service_range seq; int scope; if (!grp) @@ -3099,7 +3127,7 @@ static int tipc_sk_leave(struct tipc_sock *tsk) * For stream sockets only, accepts and ignores all IPPROTO_TCP options * (to ease compatibility). * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, sockptr_t ov, unsigned int ol) @@ -3193,14 +3221,14 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, * For stream sockets only, returns 0 length result for all IPPROTO_TCP options * (to ease compatibility). 
* - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, char __user *ov, int __user *ol) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_name_seq seq; + struct tipc_service_range seq; int len, scope; u32 value; int res; @@ -3301,12 +3329,12 @@ static int tipc_socketpair(struct socket *sock1, struct socket *sock2) u32 onode = tipc_own_addr(sock_net(sock1->sk)); tsk1->peer.family = AF_TIPC; - tsk1->peer.addrtype = TIPC_ADDR_ID; + tsk1->peer.addrtype = TIPC_SOCKET_ADDR; tsk1->peer.scope = TIPC_NODE_SCOPE; tsk1->peer.addr.id.ref = tsk2->portid; tsk1->peer.addr.id.node = onode; tsk2->peer.family = AF_TIPC; - tsk2->peer.addrtype = TIPC_ADDR_ID; + tsk2->peer.addrtype = TIPC_SOCKET_ADDR; tsk2->peer.scope = TIPC_NODE_SCOPE; tsk2->peer.addr.id.ref = tsk1->portid; tsk2->peer.addr.id.node = onode; @@ -3397,7 +3425,7 @@ static struct proto tipc_proto = { /** * tipc_socket_init - initialize TIPC socket interface * - * Returns 0 on success, errno otherwise + * Return: 0 on success, errno otherwise */ int tipc_socket_init(void) { @@ -3796,10 +3824,11 @@ int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb) /** * tipc_sk_filtering - check if a socket should be traced * @sk: the socket to be examined - * @sysctl_tipc_sk_filter[]: the socket tuple for filtering, - * (portid, sock type, name type, name lower, name upper) * - * Returns true if the socket meets the socket tuple data + * @sysctl_tipc_sk_filter is used as the socket tuple for filtering: + * (portid, sock type, name type, name lower, name upper) + * + * Return: true if the socket meets the socket tuple data * (value 0 = 'any') or when there is no tuple set (all = 0), * otherwise false */ @@ -3864,7 +3893,7 @@ u32 tipc_sock_get_portid(struct sock *sk) * @sk: tipc sk to be checked * @skb: tipc msg to be checked * - * Returns true if the socket rx queue allocation is > 90%, 
otherwise false + * Return: true if the socket rx queue allocation is > 90%, otherwise false */ bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb) @@ -3882,7 +3911,7 @@ bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb) * @sk: tipc sk to be checked * @skb: tipc msg to be checked * - * Returns true if the socket rx queue allocation is > 90%, otherwise false + * Return: true if the socket rx queue allocation is > 90%, otherwise false */ bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb) diff --git a/net/tipc/socket.h b/net/tipc/socket.h index b11575afc66f..02cdf166807d 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -74,7 +74,7 @@ int tipc_dump_done(struct netlink_callback *cb); u32 tipc_sock_get_portid(struct sock *sk); bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb); bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb); - +int tipc_sk_bind(struct socket *sock, struct sockaddr *skaddr, int alen); int tsk_set_importance(struct sock *sk, int imp); #endif diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index f340e53da625..f6ad0005218c 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -3,6 +3,7 @@ * * Copyright (c) 2000-2017, Ericsson AB * Copyright (c) 2005-2007, 2010-2013, Wind River Systems + * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -55,12 +56,14 @@ static void tipc_sub_send_event(struct tipc_subscription *sub, } /** - * tipc_sub_check_overlap - test for subscription overlap with the - * given values + * tipc_sub_check_overlap - test for subscription overlap with the given values + * @seq: tipc_name_seq to check + * @found_lower: lower value to test + * @found_upper: upper value to test * - * Returns 1 if there is overlap, otherwise 0. + * Return: 1 if there is overlap, otherwise 0. 
*/ -int tipc_sub_check_overlap(struct tipc_name_seq *seq, u32 found_lower, +int tipc_sub_check_overlap(struct tipc_service_range *seq, u32 found_lower, u32 found_upper) { if (found_lower < seq->lower) @@ -79,7 +82,7 @@ void tipc_sub_report_overlap(struct tipc_subscription *sub, { struct tipc_subscr *s = &sub->evt.s; u32 filter = tipc_sub_read(s, filter); - struct tipc_name_seq seq; + struct tipc_service_range seq; seq.type = tipc_sub_read(s, seq.type); seq.lower = tipc_sub_read(s, seq.lower); diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index 6ebbec1bedd1..3ded27391d54 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -3,6 +3,7 @@ * * Copyright (c) 2003-2017, Ericsson AB * Copyright (c) 2005-2007, 2012-2013, Wind River Systems + * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -47,12 +48,15 @@ struct tipc_conn; /** * struct tipc_subscription - TIPC network topology subscription object - * @subscriber: pointer to its subscriber - * @seq: name sequence associated with subscription + * @kref: reference count for this subscription + * @net: network namespace associated with subscription * @timer: timer governing subscription duration (optional) - * @nameseq_list: adjacent subscriptions in name sequence's subscription list + * @service_list: adjacent subscriptions in name sequence's subscription list * @sub_list: adjacent subscriptions in subscriber's subscription list * @evt: template for events generated by subscription + * @conid: connection identifier of topology server + * @inactive: true if this subscription is inactive + * @lock: serialize up/down and timer events */ struct tipc_subscription { struct kref kref; @@ -63,7 +67,7 @@ struct tipc_subscription { struct tipc_event evt; int conid; bool inactive; - spinlock_t lock; /* serialize up/down and timer events */ + spinlock_t lock; }; struct tipc_subscription *tipc_sub_subscribe(struct net *net, @@ -71,8 +75,8 @@ struct 
tipc_subscription *tipc_sub_subscribe(struct net *net, int conid); void tipc_sub_unsubscribe(struct tipc_subscription *sub); -int tipc_sub_check_overlap(struct tipc_name_seq *seq, u32 found_lower, - u32 found_upper); +int tipc_sub_check_overlap(struct tipc_service_range *seq, + u32 found_lower, u32 found_upper); void tipc_sub_report_overlap(struct tipc_subscription *sub, u32 found_lower, u32 found_upper, u32 event, u32 port, u32 node, diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 5f6f86051c83..5522865deae9 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -519,13 +519,13 @@ static int tipc_topsrv_create_listener(struct tipc_topsrv *srv) goto err; saddr.family = AF_TIPC; - saddr.addrtype = TIPC_ADDR_NAMESEQ; - saddr.addr.nameseq.type = TIPC_TOP_SRV; + saddr.addrtype = TIPC_SERVICE_RANGE; + saddr.addr.nameseq.type = TIPC_TOP_SRV; saddr.addr.nameseq.lower = TIPC_TOP_SRV; saddr.addr.nameseq.upper = TIPC_TOP_SRV; saddr.scope = TIPC_NODE_SCOPE; - rc = kernel_bind(lsock, (struct sockaddr *)&saddr, sizeof(saddr)); + rc = tipc_sk_bind(lsock, (struct sockaddr *)&saddr, sizeof(saddr)); if (rc < 0) goto err; rc = kernel_listen(lsock, 0); @@ -664,12 +664,18 @@ static int tipc_topsrv_start(struct net *net) ret = tipc_topsrv_work_start(srv); if (ret < 0) - return ret; + goto err_start; ret = tipc_topsrv_create_listener(srv); if (ret < 0) - tipc_topsrv_work_stop(srv); + goto err_create; + return 0; + +err_create: + tipc_topsrv_work_stop(srv); +err_start: + kfree(srv); return ret; } diff --git a/net/tipc/trace.c b/net/tipc/trace.c index 265f6a26aa3d..7d2931521e0e 100644 --- a/net/tipc/trace.c +++ b/net/tipc/trace.c @@ -36,7 +36,7 @@ #define CREATE_TRACE_POINTS #include "trace.h" -/** +/* * socket tuples for filtering in socket traces: * (portid, sock type, name type, name lower, name upper) */ diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 1d17f4470ee2..21e75e28e86a 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -64,6 +64,11 @@ * 
* This is the bearer level originating address used in neighbor discovery * messages, and all fields should be in network byte order + * + * @proto: Ethernet protocol in use + * @port: port being used + * @ipv4: IPv4 address of neighbor + * @ipv6: IPv6 address of neighbor */ struct udp_media_addr { __be16 proto; @@ -88,6 +93,7 @@ struct udp_replicast { * @ubsock: bearer associated socket * @ifindex: local address scope * @work: used to schedule deferred work on a bearer + * @rcast: associated udp_replicast container */ struct udp_bearer { struct tipc_bearer __rcu *bearer; @@ -772,7 +778,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, if (err) goto free; - /** + /* * The bcast media address port is used for all peers and the ip * is used if it's a multicast address. */ diff --git a/net/tls/Kconfig b/net/tls/Kconfig index fa0724fd84b4..0cdc1f7b6b08 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -21,6 +21,7 @@ config TLS_DEVICE bool "Transport Layer Security HW offload" depends on TLS select SOCK_VALIDATE_XMIT + select SOCK_RX_QUEUE_MAPPING default n help Enable kernel support for HW offload of the TLS protocol. 
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index cec86229a6a0..d9cd229aa111 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -113,7 +113,7 @@ static struct net_device *get_netdev_for_sock(struct sock *sk) struct net_device *netdev = NULL; if (likely(dst)) { - netdev = dst->dev; + netdev = netdev_sk_get_lowest_dev(dst->dev, sk); dev_hold(netdev); } @@ -327,7 +327,7 @@ static int tls_device_record_close(struct sock *sk, /* fill prepend */ tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]), record->len - prot->overhead_size, - record_type, prot->version); + record_type); return ret; } @@ -694,36 +694,51 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx, static bool tls_device_rx_resync_async(struct tls_offload_resync_async *resync_async, - s64 resync_req, u32 *seq) + s64 resync_req, u32 *seq, u16 *rcd_delta) { u32 is_async = resync_req & RESYNC_REQ_ASYNC; u32 req_seq = resync_req >> 32; u32 req_end = req_seq + ((resync_req >> 16) & 0xffff); + u16 i; + + *rcd_delta = 0; if (is_async) { + /* shouldn't get to wraparound: + * too long in async stage, something bad happened + */ + if (WARN_ON_ONCE(resync_async->rcd_delta == USHRT_MAX)) + return false; + /* asynchronous stage: log all headers seq such that * req_seq <= seq <= end_seq, and wait for real resync request */ - if (between(*seq, req_seq, req_end) && + if (before(*seq, req_seq)) + return false; + if (!after(*seq, req_end) && resync_async->loglen < TLS_DEVICE_RESYNC_ASYNC_LOGMAX) resync_async->log[resync_async->loglen++] = *seq; + resync_async->rcd_delta++; + return false; } /* synchronous stage: check against the logged entries and * proceed to check the next entries if no match was found */ - while (resync_async->loglen) { - if (req_seq == resync_async->log[resync_async->loglen - 1] && - atomic64_try_cmpxchg(&resync_async->req, - &resync_req, 0)) { - resync_async->loglen = 0; + for (i = 0; i < resync_async->loglen; i++) + if (req_seq == resync_async->log[i] && + 
atomic64_try_cmpxchg(&resync_async->req, &resync_req, 0)) { + *rcd_delta = resync_async->rcd_delta - i; *seq = req_seq; + resync_async->loglen = 0; + resync_async->rcd_delta = 0; return true; } - resync_async->loglen--; - } + + resync_async->loglen = 0; + resync_async->rcd_delta = 0; if (req_seq == *seq && atomic64_try_cmpxchg(&resync_async->req, @@ -741,6 +756,7 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) u32 sock_data, is_req_pending; struct tls_prot_info *prot; s64 resync_req; + u16 rcd_delta; u32 req_seq; if (tls_ctx->rx_conf != TLS_HW) @@ -786,8 +802,9 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) return; if (!tls_device_rx_resync_async(rx_ctx->resync_async, - resync_req, &seq)) + resync_req, &seq, &rcd_delta)) return; + tls_bigint_subtract(rcd_sn, rcd_delta); break; } @@ -981,7 +998,7 @@ static void tls_device_attach(struct tls_context *ctx, struct sock *sk, int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) { - u16 nonce_size, tag_size, iv_size, rec_seq_size; + u16 nonce_size, tag_size, iv_size, rec_seq_size, salt_size; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_record_info *start_marker_record; @@ -1022,6 +1039,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) iv_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; iv = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->iv; rec_seq_size = TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE; + salt_size = TLS_CIPHER_AES_GCM_128_SALT_SIZE; rec_seq = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->rec_seq; break; @@ -1042,6 +1060,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) prot->tag_size = tag_size; prot->overhead_size = prot->prepend_size + prot->tag_size; prot->iv_size = iv_size; + prot->salt_size = salt_size; ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, GFP_KERNEL); if (!ctx->tx.iv) { @@ -1245,6 +1264,8 @@ void 
tls_device_offload_cleanup_rx(struct sock *sk) if (tls_ctx->tx_conf != TLS_HW) { dev_put(netdev); tls_ctx->netdev = NULL; + } else { + set_bit(TLS_RX_DEV_CLOSED, &tls_ctx->flags); } out: up_read(&device_offload_lock); @@ -1274,7 +1295,8 @@ static int tls_device_down(struct net_device *netdev) if (ctx->tx_conf == TLS_HW) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); - if (ctx->rx_conf == TLS_HW) + if (ctx->rx_conf == TLS_HW && + !test_bit(TLS_RX_DEV_CLOSED, &ctx->flags)) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_RX); WRITE_ONCE(ctx->netdev, NULL); @@ -1307,6 +1329,8 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event, switch (event) { case NETDEV_REGISTER: case NETDEV_FEAT_CHANGE: + if (netif_is_bond_master(dev)) + return NOTIFY_DONE; if ((dev->features & NETIF_F_HW_TLS_RX) && !dev->tlsdev_ops->tls_dev_resync) return NOTIFY_BAD; diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 28895333701e..cacf040872c7 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -49,7 +49,8 @@ static int tls_enc_record(struct aead_request *aead_req, struct crypto_aead *aead, char *aad, char *iv, __be64 rcd_sn, struct scatter_walk *in, - struct scatter_walk *out, int *in_len) + struct scatter_walk *out, int *in_len, + struct tls_prot_info *prot) { unsigned char buf[TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE]; struct scatterlist sg_in[3]; @@ -73,8 +74,7 @@ static int tls_enc_record(struct aead_request *aead_req, len -= TLS_CIPHER_AES_GCM_128_IV_SIZE; tls_make_aad(aad, len - TLS_CIPHER_AES_GCM_128_TAG_SIZE, - (char *)&rcd_sn, sizeof(rcd_sn), buf[0], - TLS_1_2_VERSION); + (char *)&rcd_sn, buf[0], prot); memcpy(iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, buf + TLS_HEADER_SIZE, TLS_CIPHER_AES_GCM_128_IV_SIZE); @@ -140,7 +140,7 @@ static struct aead_request *tls_alloc_aead_request(struct crypto_aead *aead, static int tls_enc_records(struct aead_request *aead_req, 
struct crypto_aead *aead, struct scatterlist *sg_in, struct scatterlist *sg_out, char *aad, char *iv, - u64 rcd_sn, int len) + u64 rcd_sn, int len, struct tls_prot_info *prot) { struct scatter_walk out, in; int rc; @@ -150,7 +150,7 @@ static int tls_enc_records(struct aead_request *aead_req, do { rc = tls_enc_record(aead_req, aead, aad, iv, - cpu_to_be64(rcd_sn), &in, &out, &len); + cpu_to_be64(rcd_sn), &in, &out, &len, prot); rcd_sn++; } while (rc == 0 && len); @@ -348,7 +348,8 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, payload_len, sync_size, dummy_buf); if (tls_enc_records(aead_req, ctx->aead_send, sg_in, sg_out, aad, iv, - rcd_sn, sync_size + payload_len) < 0) + rcd_sn, sync_size + payload_len, + &tls_ctx->prot_info) < 0) goto free_nskb; complete_skb(nskb, skb, tcp_payload_offset); @@ -423,7 +424,7 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, struct sk_buff *skb) { - if (dev == tls_get_ctx(sk)->netdev) + if (dev == tls_get_ctx(sk)->netdev || netif_is_bond_master(dev)) return skb; return tls_sw_fallback(sk, skb); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 8d93cea99f2c..47b7c5334c34 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -521,6 +521,9 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, case TLS_CIPHER_AES_CCM_128: optsize = sizeof(struct tls12_crypto_info_aes_ccm_128); break; + case TLS_CIPHER_CHACHA20_POLY1305: + optsize = sizeof(struct tls12_crypto_info_chacha20_poly1305); + break; default: rc = -EINVAL; goto err_crypto_info; diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c index 3a5dd1e07233..feeceb0e4cb4 100644 --- a/net/tls/tls_proc.c +++ b/net/tls/tls_proc.c @@ -37,9 +37,12 @@ static int tls_statistics_seq_show(struct seq_file *seq, void *v) int __net_init tls_proc_init(struct net *net) { +#ifdef CONFIG_PROC_FS if (!proc_create_net_single("tls_stat", 0444, net->proc_net, tls_statistics_seq_show, NULL)) return -ENOMEM; +#endif /* 
CONFIG_PROC_FS */ + return 0; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 95ab5545a931..01d933ae5f16 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -505,7 +505,7 @@ static int tls_do_encryption(struct sock *sk, memcpy(&rec->iv_data[iv_offset], tls_ctx->tx.iv, prot->iv_size + prot->salt_size); - xor_iv_with_seq(prot->version, rec->iv_data, tls_ctx->tx.rec_seq); + xor_iv_with_seq(prot, rec->iv_data, tls_ctx->tx.rec_seq); sge->offset += prot->prepend_size; sge->length -= prot->prepend_size; @@ -748,14 +748,13 @@ static int tls_push_record(struct sock *sk, int flags, sg_chain(rec->sg_aead_out, 2, &msg_en->sg.data[i]); tls_make_aad(rec->aad_space, msg_pl->sg.size + prot->tail_size, - tls_ctx->tx.rec_seq, prot->rec_seq_size, - record_type, prot->version); + tls_ctx->tx.rec_seq, record_type, prot); tls_fill_prepend(tls_ctx, page_address(sg_page(&msg_en->sg.data[i])) + msg_en->sg.data[i].offset, msg_pl->sg.size + prot->tail_size, - record_type, prot->version); + record_type); tls_ctx->pending_open_record_frags = false; @@ -1295,6 +1294,12 @@ static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock, return NULL; } + if (!skb_queue_empty(&sk->sk_receive_queue)) { + __strp_unpause(&ctx->strp); + if (ctx->recv_pkt) + return ctx->recv_pkt; + } + if (sk->sk_shutdown & RCV_SHUTDOWN) return NULL; @@ -1465,19 +1470,19 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, kfree(mem); return err; } - if (prot->version == TLS_1_3_VERSION) + if (prot->version == TLS_1_3_VERSION || + prot->cipher_type == TLS_CIPHER_CHACHA20_POLY1305) memcpy(iv + iv_offset, tls_ctx->rx.iv, crypto_aead_ivsize(ctx->aead_recv)); else memcpy(iv + iv_offset, tls_ctx->rx.iv, prot->salt_size); - xor_iv_with_seq(prot->version, iv, tls_ctx->rx.rec_seq); + xor_iv_with_seq(prot, iv, tls_ctx->rx.rec_seq); /* Prepare AAD */ tls_make_aad(aad, rxm->full_len - prot->overhead_size + prot->tail_size, - tls_ctx->rx.rec_seq, prot->rec_seq_size, - ctx->control, 
prot->version); + tls_ctx->rx.rec_seq, ctx->control, prot); /* Prepare sgin */ sg_init_table(sgin, n_sgin); @@ -1913,7 +1918,7 @@ pick_next_record: * another message type */ msg->msg_flags |= MSG_EOR; - if (ctx->control != TLS_RECORD_TYPE_DATA) + if (control != TLS_RECORD_TYPE_DATA) goto recv_end; } else { break; @@ -2070,7 +2075,8 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) data_len = ((header[4] & 0xFF) | (header[3] << 8)); cipher_overhead = prot->tag_size; - if (prot->version != TLS_1_3_VERSION) + if (prot->version != TLS_1_3_VERSION && + prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305) cipher_overhead += prot->iv_size; if (data_len > TLS_MAX_PAYLOAD_SIZE + cipher_overhead + @@ -2290,6 +2296,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; struct tls12_crypto_info_aes_gcm_256 *gcm_256_info; struct tls12_crypto_info_aes_ccm_128 *ccm_128_info; + struct tls12_crypto_info_chacha20_poly1305 *chacha20_poly1305_info; struct tls_sw_context_tx *sw_ctx_tx = NULL; struct tls_sw_context_rx *sw_ctx_rx = NULL; struct cipher_context *cctx; @@ -2402,6 +2409,21 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) cipher_name = "ccm(aes)"; break; } + case TLS_CIPHER_CHACHA20_POLY1305: { + chacha20_poly1305_info = (void *)crypto_info; + nonce_size = 0; + tag_size = TLS_CIPHER_CHACHA20_POLY1305_TAG_SIZE; + iv_size = TLS_CIPHER_CHACHA20_POLY1305_IV_SIZE; + iv = chacha20_poly1305_info->iv; + rec_seq_size = TLS_CIPHER_CHACHA20_POLY1305_REC_SEQ_SIZE; + rec_seq = chacha20_poly1305_info->rec_seq; + keysize = TLS_CIPHER_CHACHA20_POLY1305_KEY_SIZE; + key = chacha20_poly1305_info->key; + salt = chacha20_poly1305_info->salt; + salt_size = TLS_CIPHER_CHACHA20_POLY1305_SALT_SIZE; + cipher_name = "rfc7539(chacha20,poly1305)"; + break; + } default: rc = -EINVAL; goto free_priv; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 
b4d7b8aba003..5546710d8ac1 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -421,7 +421,8 @@ static void vsock_deassign_transport(struct vsock_sock *vsk) * The vsk->remote_addr is used to decide which transport to use: * - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if * g2h is not loaded, will use local transport; - * - remote CID <= VMADDR_CID_HOST will use guest->host transport; + * - remote CID <= VMADDR_CID_HOST or h2g is not loaded or remote flags field + * includes VMADDR_FLAG_TO_HOST flag value, will use guest->host transport; * - remote CID > VMADDR_CID_HOST will use host->guest transport; */ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) @@ -429,8 +430,23 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) const struct vsock_transport *new_transport; struct sock *sk = sk_vsock(vsk); unsigned int remote_cid = vsk->remote_addr.svm_cid; + __u8 remote_flags; int ret; + /* If the packet is coming with the source and destination CIDs higher + * than VMADDR_CID_HOST, then a vsock channel where all the packets are + * forwarded to the host should be established. Then the host will + * need to forward the packets to the guest. + * + * The flag is set on the (listen) receive path (psk is not NULL). On + * the connect path the flag can be set by the user space application. 
+ */ + if (psk && vsk->local_addr.svm_cid > VMADDR_CID_HOST && + vsk->remote_addr.svm_cid > VMADDR_CID_HOST) + vsk->remote_addr.svm_flags |= VMADDR_FLAG_TO_HOST; + + remote_flags = vsk->remote_addr.svm_flags; + switch (sk->sk_type) { case SOCK_DGRAM: new_transport = transport_dgram; @@ -438,7 +454,8 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) case SOCK_STREAM: if (vsock_use_local_transport(remote_cid)) new_transport = transport_local; - else if (remote_cid <= VMADDR_CID_HOST) + else if (remote_cid <= VMADDR_CID_HOST || !transport_h2g || + (remote_flags & VMADDR_FLAG_TO_HOST)) new_transport = transport_g2h; else new_transport = transport_h2g; @@ -926,10 +943,12 @@ static int vsock_shutdown(struct socket *sock, int mode) */ sk = sock->sk; + + lock_sock(sk); if (sock->state == SS_UNCONNECTED) { err = -ENOTCONN; if (sk->sk_type == SOCK_STREAM) - return err; + goto out; } else { sock->state = SS_DISCONNECTING; err = 0; @@ -938,10 +957,8 @@ static int vsock_shutdown(struct socket *sock, int mode) /* Receive and send shutdowns are treated alike. */ mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN); if (mode) { - lock_sock(sk); sk->sk_shutdown |= mode; sk->sk_state_change(sk); - release_sock(sk); if (sk->sk_type == SOCK_STREAM) { sock_reset_flag(sk, SOCK_DONE); @@ -949,6 +966,8 @@ static int vsock_shutdown(struct socket *sock, int mode) } } +out: + release_sock(sk); return err; } @@ -997,9 +1016,12 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; } else if (sock->type == SOCK_STREAM) { - const struct vsock_transport *transport = vsk->transport; + const struct vsock_transport *transport; + lock_sock(sk); + transport = vsk->transport; + /* Listening sockets that have connections in their accept * queue can be read. 
*/ @@ -1082,10 +1104,11 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, err = 0; sk = sock->sk; vsk = vsock_sk(sk); - transport = vsk->transport; lock_sock(sk); + transport = vsk->transport; + err = vsock_auto_bind(vsk); if (err) goto out; @@ -1212,7 +1235,7 @@ static int vsock_transport_cancel_pkt(struct vsock_sock *vsk) { const struct vsock_transport *transport = vsk->transport; - if (!transport->cancel_pkt) + if (!transport || !transport->cancel_pkt) return -EOPNOTSUPP; return transport->cancel_pkt(vsk); @@ -1222,7 +1245,6 @@ static void vsock_connect_timeout(struct work_struct *work) { struct sock *sk; struct vsock_sock *vsk; - int cancel = 0; vsk = container_of(work, struct vsock_sock, connect_work.work); sk = sk_vsock(vsk); @@ -1233,11 +1255,9 @@ static void vsock_connect_timeout(struct work_struct *work) sk->sk_state = TCP_CLOSE; sk->sk_err = ETIMEDOUT; sk->sk_error_report(sk); - cancel = 1; + vsock_transport_cancel_pkt(vsk); } release_sock(sk); - if (cancel) - vsock_transport_cancel_pkt(vsk); sock_put(sk); } @@ -1544,10 +1564,11 @@ static int vsock_stream_setsockopt(struct socket *sock, err = 0; sk = sock->sk; vsk = vsock_sk(sk); - transport = vsk->transport; lock_sock(sk); + transport = vsk->transport; + switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: COPY_IN(val); @@ -1680,7 +1701,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, sk = sock->sk; vsk = vsock_sk(sk); - transport = vsk->transport; total_written = 0; err = 0; @@ -1689,6 +1709,8 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, lock_sock(sk); + transport = vsk->transport; + /* Callers should not provide a destination with stream sockets. */ if (msg->msg_namelen) { err = sk->sk_state == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; @@ -1823,11 +1845,12 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, sk = sock->sk; vsk = vsock_sk(sk); - transport = vsk->transport; err = 0; lock_sock(sk); + transport = vsk->transport; + if (!transport || sk->sk_state != TCP_ESTABLISHED) { /* Recvmsg is supposed to return 0 if a peer performs an * orderly shutdown. Differentiate between that case and when a @@ -2072,8 +2095,7 @@ static long vsock_dev_do_ioctl(struct file *filp, break; default: - pr_err("Unknown ioctl %d\n", cmd); - retval = -EINVAL; + retval = -ENOIOCTLCMD; } return retval; diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 630b851f8150..cc3bae2659e7 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -474,14 +474,10 @@ static void hvs_shutdown_lock_held(struct hvsock *hvs, int mode) static int hvs_shutdown(struct vsock_sock *vsk, int mode) { - struct sock *sk = sk_vsock(vsk); - if (!(mode & SEND_SHUTDOWN)) return 0; - lock_sock(sk); hvs_shutdown_lock_held(vsk->trans, mode); - release_sock(sk); return 0; } diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 0edda1edf988..e4370b1b7494 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -841,8 +841,10 @@ void virtio_transport_release(struct vsock_sock *vsk) virtio_transport_free_pkt(pkt); } - if (remove_sock) + if (remove_sock) { + sock_set_flag(sk, SOCK_DONE); vsock_remove_sock(vsk); + } } EXPORT_SYMBOL_GPL(virtio_transport_release); @@ -1128,18 +1130,18 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, vsk = vsock_sk(sk); - space_available = virtio_transport_space_update(sk, pkt); - lock_sock(sk); - /* Check if sk has been released before lock_sock */ - if (sk->sk_shutdown == SHUTDOWN_MASK) { + /* Check if sk has been closed before lock_sock */ + if (sock_flag(sk, SOCK_DONE)) { 
(void)virtio_transport_reset_no_sock(t, pkt); release_sock(sk); sock_put(sk); goto free_pkt; } + space_available = virtio_transport_space_update(sk, pkt); + /* Update CID in case it has changed after a transport reset event */ vsk->local_addr.svm_cid = dst.svm_cid; diff --git a/net/vmw_vsock/vsock_addr.c b/net/vmw_vsock/vsock_addr.c index 909de26cb0e7..223b9660a759 100644 --- a/net/vmw_vsock/vsock_addr.c +++ b/net/vmw_vsock/vsock_addr.c @@ -22,13 +22,15 @@ EXPORT_SYMBOL_GPL(vsock_addr_init); int vsock_addr_validate(const struct sockaddr_vm *addr) { + __u8 svm_valid_flags = VMADDR_FLAG_TO_HOST; + if (!addr) return -EFAULT; if (addr->svm_family != AF_VSOCK) return -EAFNOSUPPORT; - if (addr->svm_zero[0] != 0) + if (addr->svm_flags & ~svm_valid_flags) return -EINVAL; return 0; diff --git a/net/wimax/Kconfig b/net/wimax/Kconfig deleted file mode 100644 index d13762bc4abc..000000000000 --- a/net/wimax/Kconfig +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# WiMAX LAN device configuration -# - -menuconfig WIMAX - tristate "WiMAX Wireless Broadband support" - depends on RFKILL || !RFKILL - help - - Select to configure support for devices that provide - wireless broadband connectivity using the WiMAX protocol - (IEEE 802.16). - - Please note that most of these devices require signing up - for a service plan with a provider. - - The different WiMAX drivers can be enabled in the menu entry - - Device Drivers > Network device support > WiMAX Wireless - Broadband devices - - If unsure, it is safe to select M (module). - -config WIMAX_DEBUG_LEVEL - int "WiMAX debug level" - depends on WIMAX - default 8 - help - - Select the maximum debug verbosity level to be compiled into - the WiMAX stack code. - - By default, debug messages are disabled at runtime and can - be selectively enabled for different parts of the code using - the sysfs debug-levels file. - - If set at zero, this will compile out all the debug code. 
- - It is recommended that it is left at 8. diff --git a/net/wimax/Makefile b/net/wimax/Makefile deleted file mode 100644 index c2a71ae487ac..000000000000 --- a/net/wimax/Makefile +++ /dev/null @@ -1,13 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 - -obj-$(CONFIG_WIMAX) += wimax.o - -wimax-y := \ - id-table.o \ - op-msg.o \ - op-reset.o \ - op-rfkill.o \ - op-state-get.o \ - stack.o - -wimax-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/net/wimax/debug-levels.h b/net/wimax/debug-levels.h deleted file mode 100644 index ebc287cde336..000000000000 --- a/net/wimax/debug-levels.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Linux WiMAX Stack - * Debug levels control file for the wimax module - * - * Copyright (C) 2007-2008 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - */ -#ifndef __debug_levels__h__ -#define __debug_levels__h__ - -/* Maximum compile and run time debug level for all submodules */ -#define D_MODULENAME wimax -#define D_MASTER CONFIG_WIMAX_DEBUG_LEVEL - -#include <linux/wimax/debug.h> - -/* List of all the enabled modules */ -enum d_module { - D_SUBMODULE_DECLARE(debugfs), - D_SUBMODULE_DECLARE(id_table), - D_SUBMODULE_DECLARE(op_msg), - D_SUBMODULE_DECLARE(op_reset), - D_SUBMODULE_DECLARE(op_rfkill), - D_SUBMODULE_DECLARE(op_state_get), - D_SUBMODULE_DECLARE(stack), -}; - -#endif /* #ifndef __debug_levels__h__ */ diff --git a/net/wimax/debugfs.c b/net/wimax/debugfs.c deleted file mode 100644 index 3c54bb6b925a..000000000000 --- a/net/wimax/debugfs.c +++ /dev/null @@ -1,38 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * Debugfs support - * - * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - */ -#include <linux/debugfs.h> -#include <linux/wimax.h> -#include "wimax-internal.h" - -#define D_SUBMODULE debugfs -#include "debug-levels.h" - -void 
wimax_debugfs_add(struct wimax_dev *wimax_dev) -{ - struct net_device *net_dev = wimax_dev->net_dev; - struct dentry *dentry; - char buf[128]; - - snprintf(buf, sizeof(buf), "wimax:%s", net_dev->name); - dentry = debugfs_create_dir(buf, NULL); - wimax_dev->debugfs_dentry = dentry; - - d_level_register_debugfs("wimax_dl_", debugfs, dentry); - d_level_register_debugfs("wimax_dl_", id_table, dentry); - d_level_register_debugfs("wimax_dl_", op_msg, dentry); - d_level_register_debugfs("wimax_dl_", op_reset, dentry); - d_level_register_debugfs("wimax_dl_", op_rfkill, dentry); - d_level_register_debugfs("wimax_dl_", op_state_get, dentry); - d_level_register_debugfs("wimax_dl_", stack, dentry); -} - -void wimax_debugfs_rm(struct wimax_dev *wimax_dev) -{ - debugfs_remove_recursive(wimax_dev->debugfs_dentry); -} diff --git a/net/wimax/id-table.c b/net/wimax/id-table.c deleted file mode 100644 index 02eee37b7e31..000000000000 --- a/net/wimax/id-table.c +++ /dev/null @@ -1,130 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * Mappping of generic netlink family IDs to net devices - * - * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - * - * We assign a single generic netlink family ID to each device (to - * simplify lookup). - * - * We need a way to map family ID to a wimax_dev pointer. - * - * The idea is to use a very simple lookup. Using a netlink attribute - * with (for example) the interface name implies a heavier search over - * all the network devices; seemed kind of a waste given that we know - * we are looking for a WiMAX device and that most systems will have - * just a single WiMAX adapter. - * - * We put all the WiMAX devices in the system in a linked list and - * match the generic link family ID against the list. - * - * By using a linked list, the case of a single adapter in the system - * becomes (almost) no overhead, while still working for many more. 
If - * it ever goes beyond two, I'll be surprised. - */ -#include <linux/device.h> -#include <net/genetlink.h> -#include <linux/netdevice.h> -#include <linux/list.h> -#include <linux/wimax.h> -#include "wimax-internal.h" - - -#define D_SUBMODULE id_table -#include "debug-levels.h" - - -static DEFINE_SPINLOCK(wimax_id_table_lock); -static struct list_head wimax_id_table = LIST_HEAD_INIT(wimax_id_table); - - -/* - * wimax_id_table_add - add a gennetlink familiy ID / wimax_dev mapping - * - * @wimax_dev: WiMAX device descriptor to associate to the Generic - * Netlink family ID. - * - * Look for an empty spot in the ID table; if none found, double the - * table's size and get the first spot. - */ -void wimax_id_table_add(struct wimax_dev *wimax_dev) -{ - d_fnstart(3, NULL, "(wimax_dev %p)\n", wimax_dev); - spin_lock(&wimax_id_table_lock); - list_add(&wimax_dev->id_table_node, &wimax_id_table); - spin_unlock(&wimax_id_table_lock); - d_fnend(3, NULL, "(wimax_dev %p)\n", wimax_dev); -} - - -/* - * wimax_get_netdev_by_info - lookup a wimax_dev from the gennetlink info - * - * The generic netlink family ID has been filled out in the - * nlmsghdr->nlmsg_type field, so we pull it from there, look it up in - * the mapping table and reference the wimax_dev. - * - * When done, the reference should be dropped with - * 'dev_put(wimax_dev->net_dev)'. 
- */ -struct wimax_dev *wimax_dev_get_by_genl_info( - struct genl_info *info, int ifindex) -{ - struct wimax_dev *wimax_dev = NULL; - - d_fnstart(3, NULL, "(info %p ifindex %d)\n", info, ifindex); - spin_lock(&wimax_id_table_lock); - list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) { - if (wimax_dev->net_dev->ifindex == ifindex) { - dev_hold(wimax_dev->net_dev); - goto found; - } - } - wimax_dev = NULL; - d_printf(1, NULL, "wimax: no devices found with ifindex %d\n", - ifindex); -found: - spin_unlock(&wimax_id_table_lock); - d_fnend(3, NULL, "(info %p ifindex %d) = %p\n", - info, ifindex, wimax_dev); - return wimax_dev; -} - - -/* - * wimax_id_table_rm - Remove a gennetlink familiy ID / wimax_dev mapping - * - * @id: family ID to remove from the table - */ -void wimax_id_table_rm(struct wimax_dev *wimax_dev) -{ - spin_lock(&wimax_id_table_lock); - list_del_init(&wimax_dev->id_table_node); - spin_unlock(&wimax_id_table_lock); -} - - -/* - * Release the gennetlink family id / mapping table - * - * On debug, verify that the table is empty upon removal. We want the - * code always compiled, to ensure it doesn't bit rot. It will be - * compiled out if CONFIG_BUG is disabled. 
- */ -void wimax_id_table_release(void) -{ - struct wimax_dev *wimax_dev; - -#ifndef CONFIG_BUG - return; -#endif - spin_lock(&wimax_id_table_lock); - list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) { - pr_err("BUG: %s wimax_dev %p ifindex %d not cleared\n", - __func__, wimax_dev, wimax_dev->net_dev->ifindex); - WARN_ON(1); - } - spin_unlock(&wimax_id_table_lock); -} diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c deleted file mode 100644 index 6460b5785758..000000000000 --- a/net/wimax/op-msg.c +++ /dev/null @@ -1,391 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * Generic messaging interface between userspace and driver/device - * - * Copyright (C) 2007-2008 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - * - * This implements a direct communication channel between user space and - * the driver/device, by which free form messages can be sent back and - * forth. - * - * This is intended for device-specific features, vendor quirks, etc. - * - * See include/net/wimax.h - * - * GENERIC NETLINK ENCODING AND CAPACITY - * - * A destination "pipe name" is added to each message; it is up to the - * drivers to assign or use those names (if using them at all). - * - * Messages are encoded as a binary netlink attribute using nla_put() - * using type NLA_UNSPEC (as some versions of libnl still in - * deployment don't yet understand NLA_BINARY). - * - * The maximum capacity of this transport is PAGESIZE per message (so - * the actual payload will be bit smaller depending on the - * netlink/generic netlink attributes and headers). - * - * RECEPTION OF MESSAGES - * - * When a message is received from user space, it is passed verbatim - * to the driver calling wimax_dev->op_msg_from_user(). The return - * value from this function is passed back to user space as an ack - * over the generic netlink protocol. 
- * - * The stack doesn't do any processing or interpretation of these - * messages. - * - * SENDING MESSAGES - * - * Messages can be sent with wimax_msg(). - * - * If the message delivery needs to happen on a different context to - * that of its creation, wimax_msg_alloc() can be used to get a - * pointer to the message that can be delivered later on with - * wimax_msg_send(). - * - * ROADMAP - * - * wimax_gnl_doit_msg_from_user() Process a message from user space - * wimax_dev_get_by_genl_info() - * wimax_dev->op_msg_from_user() Delivery of message to the driver - * - * wimax_msg() Send a message to user space - * wimax_msg_alloc() - * wimax_msg_send() - */ -#include <linux/device.h> -#include <linux/slab.h> -#include <net/genetlink.h> -#include <linux/netdevice.h> -#include <linux/wimax.h> -#include <linux/security.h> -#include <linux/export.h> -#include "wimax-internal.h" - - -#define D_SUBMODULE op_msg -#include "debug-levels.h" - - -/** - * wimax_msg_alloc - Create a new skb for sending a message to userspace - * - * @wimax_dev: WiMAX device descriptor - * @pipe_name: "named pipe" the message will be sent to - * @msg: pointer to the message data to send - * @size: size of the message to send (in bytes), including the header. - * @gfp_flags: flags for memory allocation. - * - * Returns: %0 if ok, negative errno code on error - * - * Description: - * - * Allocates an skb that will contain the message to send to user - * space over the messaging pipe and initializes it, copying the - * payload. - * - * Once this call is done, you can deliver it with - * wimax_msg_send(). - * - * IMPORTANT: - * - * Don't use skb_push()/skb_pull()/skb_reserve() on the skb, as - * wimax_msg_send() depends on skb->data being placed at the - * beginning of the user message. - * - * Unlike other WiMAX stack calls, this call can be used way early, - * even before wimax_dev_add() is called, as long as the - * wimax_dev->net_dev pointer is set to point to a proper - * net_dev. 
This is so that drivers can use it early in case they need - * to send stuff around or communicate with user space. - */ -struct sk_buff *wimax_msg_alloc(struct wimax_dev *wimax_dev, - const char *pipe_name, - const void *msg, size_t size, - gfp_t gfp_flags) -{ - int result; - struct device *dev = wimax_dev_to_dev(wimax_dev); - size_t msg_size; - void *genl_msg; - struct sk_buff *skb; - - msg_size = nla_total_size(size) - + nla_total_size(sizeof(u32)) - + (pipe_name ? nla_total_size(strlen(pipe_name)) : 0); - result = -ENOMEM; - skb = genlmsg_new(msg_size, gfp_flags); - if (skb == NULL) - goto error_new; - genl_msg = genlmsg_put(skb, 0, 0, &wimax_gnl_family, - 0, WIMAX_GNL_OP_MSG_TO_USER); - if (genl_msg == NULL) { - dev_err(dev, "no memory to create generic netlink message\n"); - goto error_genlmsg_put; - } - result = nla_put_u32(skb, WIMAX_GNL_MSG_IFIDX, - wimax_dev->net_dev->ifindex); - if (result < 0) { - dev_err(dev, "no memory to add ifindex attribute\n"); - goto error_nla_put; - } - if (pipe_name) { - result = nla_put_string(skb, WIMAX_GNL_MSG_PIPE_NAME, - pipe_name); - if (result < 0) { - dev_err(dev, "no memory to add pipe_name attribute\n"); - goto error_nla_put; - } - } - result = nla_put(skb, WIMAX_GNL_MSG_DATA, size, msg); - if (result < 0) { - dev_err(dev, "no memory to add payload (msg %p size %zu) in " - "attribute: %d\n", msg, size, result); - goto error_nla_put; - } - genlmsg_end(skb, genl_msg); - return skb; - -error_nla_put: -error_genlmsg_put: -error_new: - nlmsg_free(skb); - return ERR_PTR(result); -} -EXPORT_SYMBOL_GPL(wimax_msg_alloc); - - -/** - * wimax_msg_data_len - Return a pointer and size of a message's payload - * - * @msg: Pointer to a message created with wimax_msg_alloc() - * @size: Pointer to where to store the message's size - * - * Returns the pointer to the message data. 
- */ -const void *wimax_msg_data_len(struct sk_buff *msg, size_t *size) -{ - struct nlmsghdr *nlh = (void *) msg->head; - struct nlattr *nla; - - nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), - WIMAX_GNL_MSG_DATA); - if (nla == NULL) { - pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); - return NULL; - } - *size = nla_len(nla); - return nla_data(nla); -} -EXPORT_SYMBOL_GPL(wimax_msg_data_len); - - -/** - * wimax_msg_data - Return a pointer to a message's payload - * - * @msg: Pointer to a message created with wimax_msg_alloc() - */ -const void *wimax_msg_data(struct sk_buff *msg) -{ - struct nlmsghdr *nlh = (void *) msg->head; - struct nlattr *nla; - - nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), - WIMAX_GNL_MSG_DATA); - if (nla == NULL) { - pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); - return NULL; - } - return nla_data(nla); -} -EXPORT_SYMBOL_GPL(wimax_msg_data); - - -/** - * wimax_msg_len - Return a message's payload length - * - * @msg: Pointer to a message created with wimax_msg_alloc() - */ -ssize_t wimax_msg_len(struct sk_buff *msg) -{ - struct nlmsghdr *nlh = (void *) msg->head; - struct nlattr *nla; - - nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), - WIMAX_GNL_MSG_DATA); - if (nla == NULL) { - pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); - return -EINVAL; - } - return nla_len(nla); -} -EXPORT_SYMBOL_GPL(wimax_msg_len); - - -/** - * wimax_msg_send - Send a pre-allocated message to user space - * - * @wimax_dev: WiMAX device descriptor - * - * @skb: &struct sk_buff returned by wimax_msg_alloc(). Note the - * ownership of @skb is transferred to this function. - * - * Returns: 0 if ok, < 0 errno code on error - * - * Description: - * - * Sends a free-form message that was preallocated with - * wimax_msg_alloc() and filled up. - * - * Assumes that once you pass an skb to this function for sending, it - * owns it and will release it when done (on success). 
- * - * IMPORTANT: - * - * Don't use skb_push()/skb_pull()/skb_reserve() on the skb, as - * wimax_msg_send() depends on skb->data being placed at the - * beginning of the user message. - * - * Unlike other WiMAX stack calls, this call can be used way early, - * even before wimax_dev_add() is called, as long as the - * wimax_dev->net_dev pointer is set to point to a proper - * net_dev. This is so that drivers can use it early in case they need - * to send stuff around or communicate with user space. - */ -int wimax_msg_send(struct wimax_dev *wimax_dev, struct sk_buff *skb) -{ - struct device *dev = wimax_dev_to_dev(wimax_dev); - void *msg = skb->data; - size_t size = skb->len; - might_sleep(); - - d_printf(1, dev, "CTX: wimax msg, %zu bytes\n", size); - d_dump(2, dev, msg, size); - genlmsg_multicast(&wimax_gnl_family, skb, 0, 0, GFP_KERNEL); - d_printf(1, dev, "CTX: genl multicast done\n"); - return 0; -} -EXPORT_SYMBOL_GPL(wimax_msg_send); - - -/** - * wimax_msg - Send a message to user space - * - * @wimax_dev: WiMAX device descriptor (properly referenced) - * @pipe_name: "named pipe" the message will be sent to - * @buf: pointer to the message to send. - * @size: size of the buffer pointed to by @buf (in bytes). - * @gfp_flags: flags for memory allocation. - * - * Returns: %0 if ok, negative errno code on error. - * - * Description: - * - * Sends a free-form message to user space on the device @wimax_dev. - * - * NOTES: - * - * Once the @skb is given to this function, who will own it and will - * release it when done (unless it returns error). 
- */ -int wimax_msg(struct wimax_dev *wimax_dev, const char *pipe_name, - const void *buf, size_t size, gfp_t gfp_flags) -{ - int result = -ENOMEM; - struct sk_buff *skb; - - skb = wimax_msg_alloc(wimax_dev, pipe_name, buf, size, gfp_flags); - if (IS_ERR(skb)) - result = PTR_ERR(skb); - else - result = wimax_msg_send(wimax_dev, skb); - return result; -} -EXPORT_SYMBOL_GPL(wimax_msg); - -/* - * Relays a message from user space to the driver - * - * The skb is passed to the driver-specific function with the netlink - * and generic netlink headers already stripped. - * - * This call will block while handling/relaying the message. - */ -int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info) -{ - int result, ifindex; - struct wimax_dev *wimax_dev; - struct device *dev; - struct nlmsghdr *nlh = info->nlhdr; - char *pipe_name; - void *msg_buf; - size_t msg_len; - - might_sleep(); - d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); - result = -ENODEV; - if (info->attrs[WIMAX_GNL_MSG_IFIDX] == NULL) { - pr_err("WIMAX_GNL_MSG_FROM_USER: can't find IFIDX attribute\n"); - goto error_no_wimax_dev; - } - ifindex = nla_get_u32(info->attrs[WIMAX_GNL_MSG_IFIDX]); - wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); - if (wimax_dev == NULL) - goto error_no_wimax_dev; - dev = wimax_dev_to_dev(wimax_dev); - - /* Unpack arguments */ - result = -EINVAL; - if (info->attrs[WIMAX_GNL_MSG_DATA] == NULL) { - dev_err(dev, "WIMAX_GNL_MSG_FROM_USER: can't find MSG_DATA " - "attribute\n"); - goto error_no_data; - } - msg_buf = nla_data(info->attrs[WIMAX_GNL_MSG_DATA]); - msg_len = nla_len(info->attrs[WIMAX_GNL_MSG_DATA]); - - if (info->attrs[WIMAX_GNL_MSG_PIPE_NAME] == NULL) - pipe_name = NULL; - else { - struct nlattr *attr = info->attrs[WIMAX_GNL_MSG_PIPE_NAME]; - size_t attr_len = nla_len(attr); - /* libnl-1.1 does not yet support NLA_NUL_STRING */ - result = -ENOMEM; - pipe_name = kstrndup(nla_data(attr), attr_len + 1, GFP_KERNEL); - if (pipe_name == NULL) - 
goto error_alloc; - pipe_name[attr_len] = 0; - } - mutex_lock(&wimax_dev->mutex); - result = wimax_dev_is_ready(wimax_dev); - if (result == -ENOMEDIUM) - result = 0; - if (result < 0) - goto error_not_ready; - result = -ENOSYS; - if (wimax_dev->op_msg_from_user == NULL) - goto error_noop; - - d_printf(1, dev, - "CRX: nlmsghdr len %u type %u flags 0x%04x seq 0x%x pid %u\n", - nlh->nlmsg_len, nlh->nlmsg_type, nlh->nlmsg_flags, - nlh->nlmsg_seq, nlh->nlmsg_pid); - d_printf(1, dev, "CRX: wimax message %zu bytes\n", msg_len); - d_dump(2, dev, msg_buf, msg_len); - - result = wimax_dev->op_msg_from_user(wimax_dev, pipe_name, - msg_buf, msg_len, info); -error_noop: -error_not_ready: - mutex_unlock(&wimax_dev->mutex); -error_alloc: - kfree(pipe_name); -error_no_data: - dev_put(wimax_dev->net_dev); -error_no_wimax_dev: - d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); - return result; -} diff --git a/net/wimax/op-reset.c b/net/wimax/op-reset.c deleted file mode 100644 index 9899b2e56721..000000000000 --- a/net/wimax/op-reset.c +++ /dev/null @@ -1,108 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * Implement and export a method for resetting a WiMAX device - * - * Copyright (C) 2008 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - * - * This implements a simple synchronous call to reset a WiMAX device. - * - * Resets aim at being warm, keeping the device handles active; - * however, when that fails, it falls back to a cold reset (that will - * disconnect and reconnect the device). 
- */ - -#include <net/wimax.h> -#include <net/genetlink.h> -#include <linux/wimax.h> -#include <linux/security.h> -#include <linux/export.h> -#include "wimax-internal.h" - -#define D_SUBMODULE op_reset -#include "debug-levels.h" - - -/** - * wimax_reset - Reset a WiMAX device - * - * @wimax_dev: WiMAX device descriptor - * - * Returns: - * - * %0 if ok and a warm reset was done (the device still exists in - * the system). - * - * -%ENODEV if a cold/bus reset had to be done (device has - * disconnected and reconnected, so current handle is not valid - * any more). - * - * -%EINVAL if the device is not even registered. - * - * Any other negative error code shall be considered as - * non-recoverable. - * - * Description: - * - * Called when wanting to reset the device for any reason. Device is - * taken back to power on status. - * - * This call blocks; on successful return, the device has completed the - * reset process and is ready to operate. - */ -int wimax_reset(struct wimax_dev *wimax_dev) -{ - int result = -EINVAL; - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_st state; - - might_sleep(); - d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev); - mutex_lock(&wimax_dev->mutex); - dev_hold(wimax_dev->net_dev); - state = wimax_dev->state; - mutex_unlock(&wimax_dev->mutex); - - if (state >= WIMAX_ST_DOWN) { - mutex_lock(&wimax_dev->mutex_reset); - result = wimax_dev->op_reset(wimax_dev); - mutex_unlock(&wimax_dev->mutex_reset); - } - dev_put(wimax_dev->net_dev); - - d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, result); - return result; -} -EXPORT_SYMBOL(wimax_reset); - - -/* - * Exporting to user space over generic netlink - * - * Parse the reset command from user space, return error code. - * - * No attributes. 
- */ -int wimax_gnl_doit_reset(struct sk_buff *skb, struct genl_info *info) -{ - int result, ifindex; - struct wimax_dev *wimax_dev; - - d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); - result = -ENODEV; - if (info->attrs[WIMAX_GNL_RESET_IFIDX] == NULL) { - pr_err("WIMAX_GNL_OP_RFKILL: can't find IFIDX attribute\n"); - goto error_no_wimax_dev; - } - ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RESET_IFIDX]); - wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); - if (wimax_dev == NULL) - goto error_no_wimax_dev; - /* Execute the operation and send the result back to user space */ - result = wimax_reset(wimax_dev); - dev_put(wimax_dev->net_dev); -error_no_wimax_dev: - d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); - return result; -} diff --git a/net/wimax/op-rfkill.c b/net/wimax/op-rfkill.c deleted file mode 100644 index 248d10b60b05..000000000000 --- a/net/wimax/op-rfkill.c +++ /dev/null @@ -1,431 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * RF-kill framework integration - * - * Copyright (C) 2008 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - * - * This integrates into the Linux Kernel rfkill susbystem so that the - * drivers just have to do the bare minimal work, which is providing a - * method to set the software RF-Kill switch and to report changes in - * the software and hardware switch status. - * - * A non-polled generic rfkill device is embedded into the WiMAX - * subsystem's representation of a device. - * - * FIXME: Need polled support? Let drivers provide a poll routine - * and hand it to rfkill ops then? - * - * All device drivers have to do is after wimax_dev_init(), call - * wimax_report_rfkill_hw() and wimax_report_rfkill_sw() to update - * initial state and then every time it changes. See wimax.h:struct - * wimax_dev for more information. 
- * - * ROADMAP - * - * wimax_gnl_doit_rfkill() User space calling wimax_rfkill() - * wimax_rfkill() Kernel calling wimax_rfkill() - * __wimax_rf_toggle_radio() - * - * wimax_rfkill_set_radio_block() RF-Kill subsystem calling - * __wimax_rf_toggle_radio() - * - * __wimax_rf_toggle_radio() - * wimax_dev->op_rfkill_sw_toggle() Driver backend - * __wimax_state_change() - * - * wimax_report_rfkill_sw() Driver reports state change - * __wimax_state_change() - * - * wimax_report_rfkill_hw() Driver reports state change - * __wimax_state_change() - * - * wimax_rfkill_add() Initialize/shutdown rfkill support - * wimax_rfkill_rm() [called by wimax_dev_add/rm()] - */ - -#include <net/wimax.h> -#include <net/genetlink.h> -#include <linux/wimax.h> -#include <linux/security.h> -#include <linux/rfkill.h> -#include <linux/export.h> -#include "wimax-internal.h" - -#define D_SUBMODULE op_rfkill -#include "debug-levels.h" - -/** - * wimax_report_rfkill_hw - Reports changes in the hardware RF switch - * - * @wimax_dev: WiMAX device descriptor - * - * @state: New state of the RF Kill switch. %WIMAX_RF_ON radio on, - * %WIMAX_RF_OFF radio off. - * - * When the device detects a change in the state of thehardware RF - * switch, it must call this function to let the WiMAX kernel stack - * know that the state has changed so it can be properly propagated. - * - * The WiMAX stack caches the state (the driver doesn't need to). As - * well, as the change is propagated it will come back as a request to - * change the software state to mirror the hardware state. - * - * If the device doesn't have a hardware kill switch, just report - * it on initialization as always on (%WIMAX_RF_ON, radio on). 
- */ -void wimax_report_rfkill_hw(struct wimax_dev *wimax_dev, - enum wimax_rf_state state) -{ - int result; - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_st wimax_state; - - d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); - BUG_ON(state == WIMAX_RF_QUERY); - BUG_ON(state != WIMAX_RF_ON && state != WIMAX_RF_OFF); - - mutex_lock(&wimax_dev->mutex); - result = wimax_dev_is_ready(wimax_dev); - if (result < 0) - goto error_not_ready; - - if (state != wimax_dev->rf_hw) { - wimax_dev->rf_hw = state; - if (wimax_dev->rf_hw == WIMAX_RF_ON && - wimax_dev->rf_sw == WIMAX_RF_ON) - wimax_state = WIMAX_ST_READY; - else - wimax_state = WIMAX_ST_RADIO_OFF; - - result = rfkill_set_hw_state(wimax_dev->rfkill, - state == WIMAX_RF_OFF); - - __wimax_state_change(wimax_dev, wimax_state); - } -error_not_ready: - mutex_unlock(&wimax_dev->mutex); - d_fnend(3, dev, "(wimax_dev %p state %u) = void [%d]\n", - wimax_dev, state, result); -} -EXPORT_SYMBOL_GPL(wimax_report_rfkill_hw); - - -/** - * wimax_report_rfkill_sw - Reports changes in the software RF switch - * - * @wimax_dev: WiMAX device descriptor - * - * @state: New state of the RF kill switch. %WIMAX_RF_ON radio on, - * %WIMAX_RF_OFF radio off. - * - * Reports changes in the software RF switch state to the WiMAX stack. - * - * The main use is during initialization, so the driver can query the - * device for its current software radio kill switch state and feed it - * to the system. - * - * On the side, the device does not change the software state by - * itself. In practice, this can happen, as the device might decide to - * switch (in software) the radio off for different reasons. 
- */ -void wimax_report_rfkill_sw(struct wimax_dev *wimax_dev, - enum wimax_rf_state state) -{ - int result; - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_st wimax_state; - - d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); - BUG_ON(state == WIMAX_RF_QUERY); - BUG_ON(state != WIMAX_RF_ON && state != WIMAX_RF_OFF); - - mutex_lock(&wimax_dev->mutex); - result = wimax_dev_is_ready(wimax_dev); - if (result < 0) - goto error_not_ready; - - if (state != wimax_dev->rf_sw) { - wimax_dev->rf_sw = state; - if (wimax_dev->rf_hw == WIMAX_RF_ON && - wimax_dev->rf_sw == WIMAX_RF_ON) - wimax_state = WIMAX_ST_READY; - else - wimax_state = WIMAX_ST_RADIO_OFF; - __wimax_state_change(wimax_dev, wimax_state); - rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF); - } -error_not_ready: - mutex_unlock(&wimax_dev->mutex); - d_fnend(3, dev, "(wimax_dev %p state %u) = void [%d]\n", - wimax_dev, state, result); -} -EXPORT_SYMBOL_GPL(wimax_report_rfkill_sw); - - -/* - * Callback for the RF Kill toggle operation - * - * This function is called by: - * - * - The rfkill subsystem when the RF-Kill key is pressed in the - * hardware and the driver notifies through - * wimax_report_rfkill_hw(). The rfkill subsystem ends up calling back - * here so the software RF Kill switch state is changed to reflect - * the hardware switch state. - * - * - When the user sets the state through sysfs' rfkill/state file - * - * - When the user calls wimax_rfkill(). - * - * This call blocks! - * - * WARNING! When we call rfkill_unregister(), this will be called with - * state 0! 
- * - * WARNING: wimax_dev must be locked - */ -static -int __wimax_rf_toggle_radio(struct wimax_dev *wimax_dev, - enum wimax_rf_state state) -{ - int result = 0; - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_st wimax_state; - - might_sleep(); - d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); - if (wimax_dev->rf_sw == state) - goto out_no_change; - if (wimax_dev->op_rfkill_sw_toggle != NULL) - result = wimax_dev->op_rfkill_sw_toggle(wimax_dev, state); - else if (state == WIMAX_RF_OFF) /* No op? can't turn off */ - result = -ENXIO; - else /* No op? can turn on */ - result = 0; /* should never happen tho */ - if (result >= 0) { - result = 0; - wimax_dev->rf_sw = state; - wimax_state = state == WIMAX_RF_ON ? - WIMAX_ST_READY : WIMAX_ST_RADIO_OFF; - __wimax_state_change(wimax_dev, wimax_state); - } -out_no_change: - d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n", - wimax_dev, state, result); - return result; -} - - -/* - * Translate from rfkill state to wimax state - * - * NOTE: Special state handling rules here - * - * Just pretend the call didn't happen if we are in a state where - * we know for sure it cannot be handled (WIMAX_ST_DOWN or - * __WIMAX_ST_QUIESCING). rfkill() needs it to register and - * unregister, as it will run this path. - * - * NOTE: This call will block until the operation is completed. 
- */ -static int wimax_rfkill_set_radio_block(void *data, bool blocked) -{ - int result; - struct wimax_dev *wimax_dev = data; - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_rf_state rf_state; - - d_fnstart(3, dev, "(wimax_dev %p blocked %u)\n", wimax_dev, blocked); - rf_state = WIMAX_RF_ON; - if (blocked) - rf_state = WIMAX_RF_OFF; - mutex_lock(&wimax_dev->mutex); - if (wimax_dev->state <= __WIMAX_ST_QUIESCING) - result = 0; - else - result = __wimax_rf_toggle_radio(wimax_dev, rf_state); - mutex_unlock(&wimax_dev->mutex); - d_fnend(3, dev, "(wimax_dev %p blocked %u) = %d\n", - wimax_dev, blocked, result); - return result; -} - -static const struct rfkill_ops wimax_rfkill_ops = { - .set_block = wimax_rfkill_set_radio_block, -}; - -/** - * wimax_rfkill - Set the software RF switch state for a WiMAX device - * - * @wimax_dev: WiMAX device descriptor - * - * @state: New RF state. - * - * Returns: - * - * >= 0 toggle state if ok, < 0 errno code on error. The toggle state - * is returned as a bitmap, bit 0 being the hardware RF state, bit 1 - * the software RF state. - * - * 0 means disabled (%WIMAX_RF_ON, radio on), 1 means enabled radio - * off (%WIMAX_RF_OFF). - * - * Description: - * - * Called by the user when he wants to request the WiMAX radio to be - * switched on (%WIMAX_RF_ON) or off (%WIMAX_RF_OFF). With - * %WIMAX_RF_QUERY, just the current state is returned. - * - * NOTE: - * - * This call will block until the operation is complete. - */ -int wimax_rfkill(struct wimax_dev *wimax_dev, enum wimax_rf_state state) -{ - int result; - struct device *dev = wimax_dev_to_dev(wimax_dev); - - d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); - mutex_lock(&wimax_dev->mutex); - result = wimax_dev_is_ready(wimax_dev); - if (result < 0) { - /* While initializing, < 1.4.3 wimax-tools versions use - * this call to check if the device is a valid WiMAX - * device; so we allow it to proceed always, - * considering the radios are all off. 
*/ - if (result == -ENOMEDIUM && state == WIMAX_RF_QUERY) - result = WIMAX_RF_OFF << 1 | WIMAX_RF_OFF; - goto error_not_ready; - } - switch (state) { - case WIMAX_RF_ON: - case WIMAX_RF_OFF: - result = __wimax_rf_toggle_radio(wimax_dev, state); - if (result < 0) - goto error; - rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF); - break; - case WIMAX_RF_QUERY: - break; - default: - result = -EINVAL; - goto error; - } - result = wimax_dev->rf_sw << 1 | wimax_dev->rf_hw; -error: -error_not_ready: - mutex_unlock(&wimax_dev->mutex); - d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n", - wimax_dev, state, result); - return result; -} -EXPORT_SYMBOL(wimax_rfkill); - - -/* - * Register a new WiMAX device's RF Kill support - * - * WARNING: wimax_dev->mutex must be unlocked - */ -int wimax_rfkill_add(struct wimax_dev *wimax_dev) -{ - int result; - struct rfkill *rfkill; - struct device *dev = wimax_dev_to_dev(wimax_dev); - - d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev); - /* Initialize RF Kill */ - result = -ENOMEM; - rfkill = rfkill_alloc(wimax_dev->name, dev, RFKILL_TYPE_WIMAX, - &wimax_rfkill_ops, wimax_dev); - if (rfkill == NULL) - goto error_rfkill_allocate; - - d_printf(1, dev, "rfkill %p\n", rfkill); - - wimax_dev->rfkill = rfkill; - - rfkill_init_sw_state(rfkill, 1); - result = rfkill_register(wimax_dev->rfkill); - if (result < 0) - goto error_rfkill_register; - - /* If there is no SW toggle op, SW RFKill is always on */ - if (wimax_dev->op_rfkill_sw_toggle == NULL) - wimax_dev->rf_sw = WIMAX_RF_ON; - - d_fnend(3, dev, "(wimax_dev %p) = 0\n", wimax_dev); - return 0; - -error_rfkill_register: - rfkill_destroy(wimax_dev->rfkill); -error_rfkill_allocate: - d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, result); - return result; -} - - -/* - * Deregister a WiMAX device's RF Kill support - * - * Ick, we can't call rfkill_free() after rfkill_unregister()...oh - * well. 
- * - * WARNING: wimax_dev->mutex must be unlocked - */ -void wimax_rfkill_rm(struct wimax_dev *wimax_dev) -{ - struct device *dev = wimax_dev_to_dev(wimax_dev); - d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev); - rfkill_unregister(wimax_dev->rfkill); - rfkill_destroy(wimax_dev->rfkill); - d_fnend(3, dev, "(wimax_dev %p)\n", wimax_dev); -} - - -/* - * Exporting to user space over generic netlink - * - * Parse the rfkill command from user space, return a combination - * value that describe the states of the different toggles. - * - * Only one attribute: the new state requested (on, off or no change, - * just query). - */ - -int wimax_gnl_doit_rfkill(struct sk_buff *skb, struct genl_info *info) -{ - int result, ifindex; - struct wimax_dev *wimax_dev; - struct device *dev; - enum wimax_rf_state new_state; - - d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); - result = -ENODEV; - if (info->attrs[WIMAX_GNL_RFKILL_IFIDX] == NULL) { - pr_err("WIMAX_GNL_OP_RFKILL: can't find IFIDX attribute\n"); - goto error_no_wimax_dev; - } - ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_IFIDX]); - wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); - if (wimax_dev == NULL) - goto error_no_wimax_dev; - dev = wimax_dev_to_dev(wimax_dev); - result = -EINVAL; - if (info->attrs[WIMAX_GNL_RFKILL_STATE] == NULL) { - dev_err(dev, "WIMAX_GNL_RFKILL: can't find RFKILL_STATE " - "attribute\n"); - goto error_no_pid; - } - new_state = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_STATE]); - - /* Execute the operation and send the result back to user space */ - result = wimax_rfkill(wimax_dev, new_state); -error_no_pid: - dev_put(wimax_dev->net_dev); -error_no_wimax_dev: - d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); - return result; -} diff --git a/net/wimax/op-state-get.c b/net/wimax/op-state-get.c deleted file mode 100644 index 5bc712de1563..000000000000 --- a/net/wimax/op-state-get.c +++ /dev/null @@ -1,52 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux 
WiMAX - * Implement and export a method for getting a WiMAX device current state - * - * Copyright (C) 2009 Paulius Zaleckas <paulius.zaleckas@teltonika.lt> - * - * Based on previous WiMAX core work by: - * Copyright (C) 2008 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - */ - -#include <net/wimax.h> -#include <net/genetlink.h> -#include <linux/wimax.h> -#include <linux/security.h> -#include "wimax-internal.h" - -#define D_SUBMODULE op_state_get -#include "debug-levels.h" - - -/* - * Exporting to user space over generic netlink - * - * Parse the state get command from user space, return a combination - * value that describe the current state. - * - * No attributes. - */ -int wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info) -{ - int result, ifindex; - struct wimax_dev *wimax_dev; - - d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); - result = -ENODEV; - if (info->attrs[WIMAX_GNL_STGET_IFIDX] == NULL) { - pr_err("WIMAX_GNL_OP_STATE_GET: can't find IFIDX attribute\n"); - goto error_no_wimax_dev; - } - ifindex = nla_get_u32(info->attrs[WIMAX_GNL_STGET_IFIDX]); - wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); - if (wimax_dev == NULL) - goto error_no_wimax_dev; - /* Execute the operation and send the result back to user space */ - result = wimax_state_get(wimax_dev); - dev_put(wimax_dev->net_dev); -error_no_wimax_dev: - d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); - return result; -} diff --git a/net/wimax/stack.c b/net/wimax/stack.c deleted file mode 100644 index b6dd9d956ed8..000000000000 --- a/net/wimax/stack.c +++ /dev/null @@ -1,609 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Linux WiMAX - * Initialization, addition and removal of wimax devices - * - * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - * - * This implements: - * - * - basic life cycle of 'struct wimax_dev' 
[wimax_dev_*()]; on - * addition/registration initialize all subfields and allocate - * generic netlink resources for user space communication. On - * removal/unregistration, undo all that. - * - * - device state machine [wimax_state_change()] and support to send - * reports to user space when the state changes - * [wimax_gnl_re_state_change*()]. - * - * See include/net/wimax.h for rationales and design. - * - * ROADMAP - * - * [__]wimax_state_change() Called by drivers to update device's state - * wimax_gnl_re_state_change_alloc() - * wimax_gnl_re_state_change_send() - * - * wimax_dev_init() Init a device - * wimax_dev_add() Register - * wimax_rfkill_add() - * wimax_gnl_add() Register all the generic netlink resources. - * wimax_id_table_add() - * wimax_dev_rm() Unregister - * wimax_id_table_rm() - * wimax_gnl_rm() - * wimax_rfkill_rm() - */ -#include <linux/device.h> -#include <linux/gfp.h> -#include <net/genetlink.h> -#include <linux/netdevice.h> -#include <linux/wimax.h> -#include <linux/module.h> -#include "wimax-internal.h" - - -#define D_SUBMODULE stack -#include "debug-levels.h" - -static char wimax_debug_params[128]; -module_param_string(debug, wimax_debug_params, sizeof(wimax_debug_params), - 0644); -MODULE_PARM_DESC(debug, - "String of space-separated NAME:VALUE pairs, where NAMEs " - "are the different debug submodules and VALUE are the " - "initial debug value to set."); - -/* - * Authoritative source for the RE_STATE_CHANGE attribute policy - * - * We don't really use it here, but /me likes to keep the definition - * close to where the data is generated. 
- */ -/* -static const struct nla_policy wimax_gnl_re_status_change[WIMAX_GNL_ATTR_MAX + 1] = { - [WIMAX_GNL_STCH_STATE_OLD] = { .type = NLA_U8 }, - [WIMAX_GNL_STCH_STATE_NEW] = { .type = NLA_U8 }, -}; -*/ - - -/* - * Allocate a Report State Change message - * - * @header: save it, you need it for _send() - * - * Creates and fills a basic state change message; different code - * paths can then add more attributes to the message as needed. - * - * Use wimax_gnl_re_state_change_send() to send the returned skb. - * - * Returns: skb with the genl message if ok, IS_ERR() ptr on error - * with an errno code. - */ -static -struct sk_buff *wimax_gnl_re_state_change_alloc( - struct wimax_dev *wimax_dev, - enum wimax_st new_state, enum wimax_st old_state, - void **header) -{ - int result; - struct device *dev = wimax_dev_to_dev(wimax_dev); - void *data; - struct sk_buff *report_skb; - - d_fnstart(3, dev, "(wimax_dev %p new_state %u old_state %u)\n", - wimax_dev, new_state, old_state); - result = -ENOMEM; - report_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (report_skb == NULL) { - dev_err(dev, "RE_STCH: can't create message\n"); - goto error_new; - } - /* FIXME: sending a group ID as the seq is wrong */ - data = genlmsg_put(report_skb, 0, wimax_gnl_family.mcgrp_offset, - &wimax_gnl_family, 0, WIMAX_GNL_RE_STATE_CHANGE); - if (data == NULL) { - dev_err(dev, "RE_STCH: can't put data into message\n"); - goto error_put; - } - *header = data; - - result = nla_put_u8(report_skb, WIMAX_GNL_STCH_STATE_OLD, old_state); - if (result < 0) { - dev_err(dev, "RE_STCH: Error adding OLD attr: %d\n", result); - goto error_put; - } - result = nla_put_u8(report_skb, WIMAX_GNL_STCH_STATE_NEW, new_state); - if (result < 0) { - dev_err(dev, "RE_STCH: Error adding NEW attr: %d\n", result); - goto error_put; - } - result = nla_put_u32(report_skb, WIMAX_GNL_STCH_IFIDX, - wimax_dev->net_dev->ifindex); - if (result < 0) { - dev_err(dev, "RE_STCH: Error adding IFINDEX attribute\n"); - goto 
error_put; - } - d_fnend(3, dev, "(wimax_dev %p new_state %u old_state %u) = %p\n", - wimax_dev, new_state, old_state, report_skb); - return report_skb; - -error_put: - nlmsg_free(report_skb); -error_new: - d_fnend(3, dev, "(wimax_dev %p new_state %u old_state %u) = %d\n", - wimax_dev, new_state, old_state, result); - return ERR_PTR(result); -} - - -/* - * Send a Report State Change message (as created with _alloc). - * - * @report_skb: as returned by wimax_gnl_re_state_change_alloc() - * @header: as returned by wimax_gnl_re_state_change_alloc() - * - * Returns: 0 if ok, < 0 errno code on error. - * - * If the message is NULL, pretend it didn't happen. - */ -static -int wimax_gnl_re_state_change_send( - struct wimax_dev *wimax_dev, struct sk_buff *report_skb, - void *header) -{ - int result = 0; - struct device *dev = wimax_dev_to_dev(wimax_dev); - d_fnstart(3, dev, "(wimax_dev %p report_skb %p)\n", - wimax_dev, report_skb); - if (report_skb == NULL) { - result = -ENOMEM; - goto out; - } - genlmsg_end(report_skb, header); - genlmsg_multicast(&wimax_gnl_family, report_skb, 0, 0, GFP_KERNEL); -out: - d_fnend(3, dev, "(wimax_dev %p report_skb %p) = %d\n", - wimax_dev, report_skb, result); - return result; -} - - -static -void __check_new_state(enum wimax_st old_state, enum wimax_st new_state, - unsigned int allowed_states_bm) -{ - if (WARN_ON(((1 << new_state) & allowed_states_bm) == 0)) { - pr_err("SW BUG! Forbidden state change %u -> %u\n", - old_state, new_state); - } -} - - -/* - * Set the current state of a WiMAX device [unlocking version of - * wimax_state_change(). 
- */ -void __wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state) -{ - struct device *dev = wimax_dev_to_dev(wimax_dev); - enum wimax_st old_state = wimax_dev->state; - struct sk_buff *stch_skb; - void *header; - - d_fnstart(3, dev, "(wimax_dev %p new_state %u [old %u])\n", - wimax_dev, new_state, old_state); - - if (WARN_ON(new_state >= __WIMAX_ST_INVALID)) { - dev_err(dev, "SW BUG: requesting invalid state %u\n", - new_state); - goto out; - } - if (old_state == new_state) - goto out; - header = NULL; /* gcc complains? can't grok why */ - stch_skb = wimax_gnl_re_state_change_alloc( - wimax_dev, new_state, old_state, &header); - - /* Verify the state transition and do exit-from-state actions */ - switch (old_state) { - case __WIMAX_ST_NULL: - __check_new_state(old_state, new_state, - 1 << WIMAX_ST_DOWN); - break; - case WIMAX_ST_DOWN: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_UNINITIALIZED - | 1 << WIMAX_ST_RADIO_OFF); - break; - case __WIMAX_ST_QUIESCING: - __check_new_state(old_state, new_state, 1 << WIMAX_ST_DOWN); - break; - case WIMAX_ST_UNINITIALIZED: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_RADIO_OFF); - break; - case WIMAX_ST_RADIO_OFF: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_READY); - break; - case WIMAX_ST_READY: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_RADIO_OFF - | 1 << WIMAX_ST_SCANNING - | 1 << WIMAX_ST_CONNECTING - | 1 << WIMAX_ST_CONNECTED); - break; - case WIMAX_ST_SCANNING: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_RADIO_OFF - | 1 << WIMAX_ST_READY - | 1 << WIMAX_ST_CONNECTING - | 1 << WIMAX_ST_CONNECTED); - break; - case WIMAX_ST_CONNECTING: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_RADIO_OFF - | 1 << WIMAX_ST_READY - | 1 << WIMAX_ST_SCANNING - | 1 << 
WIMAX_ST_CONNECTED); - break; - case WIMAX_ST_CONNECTED: - __check_new_state(old_state, new_state, - 1 << __WIMAX_ST_QUIESCING - | 1 << WIMAX_ST_RADIO_OFF - | 1 << WIMAX_ST_READY); - netif_tx_disable(wimax_dev->net_dev); - netif_carrier_off(wimax_dev->net_dev); - break; - case __WIMAX_ST_INVALID: - default: - dev_err(dev, "SW BUG: wimax_dev %p is in unknown state %u\n", - wimax_dev, wimax_dev->state); - WARN_ON(1); - goto out; - } - - /* Execute the actions of entry to the new state */ - switch (new_state) { - case __WIMAX_ST_NULL: - dev_err(dev, "SW BUG: wimax_dev %p entering NULL state " - "from %u\n", wimax_dev, wimax_dev->state); - WARN_ON(1); /* Nobody can enter this state */ - break; - case WIMAX_ST_DOWN: - break; - case __WIMAX_ST_QUIESCING: - break; - case WIMAX_ST_UNINITIALIZED: - break; - case WIMAX_ST_RADIO_OFF: - break; - case WIMAX_ST_READY: - break; - case WIMAX_ST_SCANNING: - break; - case WIMAX_ST_CONNECTING: - break; - case WIMAX_ST_CONNECTED: - netif_carrier_on(wimax_dev->net_dev); - netif_wake_queue(wimax_dev->net_dev); - break; - case __WIMAX_ST_INVALID: - default: - BUG(); - } - __wimax_state_set(wimax_dev, new_state); - if (!IS_ERR(stch_skb)) - wimax_gnl_re_state_change_send(wimax_dev, stch_skb, header); -out: - d_fnend(3, dev, "(wimax_dev %p new_state %u [old %u]) = void\n", - wimax_dev, new_state, old_state); -} - - -/** - * wimax_state_change - Set the current state of a WiMAX device - * - * @wimax_dev: WiMAX device descriptor (properly referenced) - * @new_state: New state to switch to - * - * This implements the state changes for the wimax devices. It will - * - * - verify that the state transition is legal (for now it'll just - * print a warning if not) according to the table in - * linux/wimax.h's documentation for 'enum wimax_st'. - * - * - perform the actions needed for leaving the current state and - * whichever are needed for entering the new state. 
- * - * - issue a report to user space indicating the new state (and an - * optional payload with information about the new state). - * - * NOTE: @wimax_dev must be locked - */ -void wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state) -{ - /* - * A driver cannot take the wimax_dev out of the - * __WIMAX_ST_NULL state unless by calling wimax_dev_add(). If - * the wimax_dev's state is still NULL, we ignore any request - * to change its state because it means it hasn't been yet - * registered. - * - * There is no need to complain about it, as routines that - * call this might be shared from different code paths that - * are called before or after wimax_dev_add() has done its - * job. - */ - mutex_lock(&wimax_dev->mutex); - if (wimax_dev->state > __WIMAX_ST_NULL) - __wimax_state_change(wimax_dev, new_state); - mutex_unlock(&wimax_dev->mutex); -} -EXPORT_SYMBOL_GPL(wimax_state_change); - - -/** - * wimax_state_get() - Return the current state of a WiMAX device - * - * @wimax_dev: WiMAX device descriptor - * - * Returns: Current state of the device according to its driver. - */ -enum wimax_st wimax_state_get(struct wimax_dev *wimax_dev) -{ - enum wimax_st state; - mutex_lock(&wimax_dev->mutex); - state = wimax_dev->state; - mutex_unlock(&wimax_dev->mutex); - return state; -} -EXPORT_SYMBOL_GPL(wimax_state_get); - - -/** - * wimax_dev_init - initialize a newly allocated instance - * - * @wimax_dev: WiMAX device descriptor to initialize. - * - * Initializes fields of a freshly allocated @wimax_dev instance. This - * function assumes that after allocation, the memory occupied by - * @wimax_dev was zeroed. 
- */ -void wimax_dev_init(struct wimax_dev *wimax_dev) -{ - INIT_LIST_HEAD(&wimax_dev->id_table_node); - __wimax_state_set(wimax_dev, __WIMAX_ST_NULL); - mutex_init(&wimax_dev->mutex); - mutex_init(&wimax_dev->mutex_reset); -} -EXPORT_SYMBOL_GPL(wimax_dev_init); - -static const struct nla_policy wimax_gnl_policy[WIMAX_GNL_ATTR_MAX + 1] = { - [WIMAX_GNL_RESET_IFIDX] = { .type = NLA_U32, }, - [WIMAX_GNL_RFKILL_IFIDX] = { .type = NLA_U32, }, - [WIMAX_GNL_RFKILL_STATE] = { - .type = NLA_U32 /* enum wimax_rf_state */ - }, - [WIMAX_GNL_STGET_IFIDX] = { .type = NLA_U32, }, - [WIMAX_GNL_MSG_IFIDX] = { .type = NLA_U32, }, - [WIMAX_GNL_MSG_DATA] = { - .type = NLA_UNSPEC, /* libnl doesn't grok BINARY yet */ - }, -}; - -static const struct genl_small_ops wimax_gnl_ops[] = { - { - .cmd = WIMAX_GNL_OP_MSG_FROM_USER, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, - .doit = wimax_gnl_doit_msg_from_user, - }, - { - .cmd = WIMAX_GNL_OP_RESET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, - .doit = wimax_gnl_doit_reset, - }, - { - .cmd = WIMAX_GNL_OP_RFKILL, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, - .doit = wimax_gnl_doit_rfkill, - }, - { - .cmd = WIMAX_GNL_OP_STATE_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, - .doit = wimax_gnl_doit_state_get, - }, -}; - - -static -size_t wimax_addr_scnprint(char *addr_str, size_t addr_str_size, - unsigned char *addr, size_t addr_len) -{ - unsigned int cnt, total; - - for (total = cnt = 0; cnt < addr_len; cnt++) - total += scnprintf(addr_str + total, addr_str_size - total, - "%02x%c", addr[cnt], - cnt == addr_len - 1 ? '\0' : ':'); - return total; -} - - -/** - * wimax_dev_add - Register a new WiMAX device - * - * @wimax_dev: WiMAX device descriptor (as embedded in your @net_dev's - * priv data). 
You must have called wimax_dev_init() on it before. - * - * @net_dev: net device the @wimax_dev is associated with. The - * function expects SET_NETDEV_DEV() and register_netdev() were - * already called on it. - * - * Registers the new WiMAX device, sets up the user-kernel control - * interface (generic netlink) and common WiMAX infrastructure. - * - * Note that the parts that will allow interaction with user space are - * setup at the very end, when the rest is in place, as once that - * happens, the driver might get user space control requests via - * netlink or from debugfs that might translate into calls into - * wimax_dev->op_*(). - */ -int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev) -{ - int result; - struct device *dev = net_dev->dev.parent; - char addr_str[32]; - - d_fnstart(3, dev, "(wimax_dev %p net_dev %p)\n", wimax_dev, net_dev); - - /* Do the RFKILL setup before locking, as RFKILL will call - * into our functions. - */ - wimax_dev->net_dev = net_dev; - result = wimax_rfkill_add(wimax_dev); - if (result < 0) - goto error_rfkill_add; - - /* Set up user-space interaction */ - mutex_lock(&wimax_dev->mutex); - wimax_id_table_add(wimax_dev); - wimax_debugfs_add(wimax_dev); - - __wimax_state_set(wimax_dev, WIMAX_ST_DOWN); - mutex_unlock(&wimax_dev->mutex); - - wimax_addr_scnprint(addr_str, sizeof(addr_str), - net_dev->dev_addr, net_dev->addr_len); - dev_err(dev, "WiMAX interface %s (%s) ready\n", - net_dev->name, addr_str); - d_fnend(3, dev, "(wimax_dev %p net_dev %p) = 0\n", wimax_dev, net_dev); - return 0; - -error_rfkill_add: - d_fnend(3, dev, "(wimax_dev %p net_dev %p) = %d\n", - wimax_dev, net_dev, result); - return result; -} -EXPORT_SYMBOL_GPL(wimax_dev_add); - - -/** - * wimax_dev_rm - Unregister an existing WiMAX device - * - * @wimax_dev: WiMAX device descriptor - * - * Unregisters a WiMAX device previously registered for use with - * wimax_add_rm(). - * - * IMPORTANT! Must call before calling unregister_netdev(). 
- * - * After this function returns, you will not get any more user space - * control requests (via netlink or debugfs) and thus to wimax_dev->ops. - * - * Reentrancy control is ensured by setting the state to - * %__WIMAX_ST_QUIESCING. rfkill operations coming through - * wimax_*rfkill*() will be stopped by the quiescing state; ops coming - * from the rfkill subsystem will be stopped by the support being - * removed by wimax_rfkill_rm(). - */ -void wimax_dev_rm(struct wimax_dev *wimax_dev) -{ - d_fnstart(3, NULL, "(wimax_dev %p)\n", wimax_dev); - - mutex_lock(&wimax_dev->mutex); - __wimax_state_change(wimax_dev, __WIMAX_ST_QUIESCING); - wimax_debugfs_rm(wimax_dev); - wimax_id_table_rm(wimax_dev); - __wimax_state_change(wimax_dev, WIMAX_ST_DOWN); - mutex_unlock(&wimax_dev->mutex); - wimax_rfkill_rm(wimax_dev); - d_fnend(3, NULL, "(wimax_dev %p) = void\n", wimax_dev); -} -EXPORT_SYMBOL_GPL(wimax_dev_rm); - - -/* Debug framework control of debug levels */ -struct d_level D_LEVEL[] = { - D_SUBMODULE_DEFINE(debugfs), - D_SUBMODULE_DEFINE(id_table), - D_SUBMODULE_DEFINE(op_msg), - D_SUBMODULE_DEFINE(op_reset), - D_SUBMODULE_DEFINE(op_rfkill), - D_SUBMODULE_DEFINE(op_state_get), - D_SUBMODULE_DEFINE(stack), -}; -size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); - - -static const struct genl_multicast_group wimax_gnl_mcgrps[] = { - { .name = "msg", }, -}; - -struct genl_family wimax_gnl_family __ro_after_init = { - .name = "WiMAX", - .version = WIMAX_GNL_VERSION, - .hdrsize = 0, - .maxattr = WIMAX_GNL_ATTR_MAX, - .policy = wimax_gnl_policy, - .module = THIS_MODULE, - .small_ops = wimax_gnl_ops, - .n_small_ops = ARRAY_SIZE(wimax_gnl_ops), - .mcgrps = wimax_gnl_mcgrps, - .n_mcgrps = ARRAY_SIZE(wimax_gnl_mcgrps), -}; - - - -/* Shutdown the wimax stack */ -static -int __init wimax_subsys_init(void) -{ - int result; - - d_fnstart(4, NULL, "()\n"); - d_parse_params(D_LEVEL, D_LEVEL_SIZE, wimax_debug_params, - "wimax.debug"); - - result = genl_register_family(&wimax_gnl_family); - if 
(unlikely(result < 0)) { - pr_err("cannot register generic netlink family: %d\n", result); - goto error_register_family; - } - - d_fnend(4, NULL, "() = 0\n"); - return 0; - -error_register_family: - d_fnend(4, NULL, "() = %d\n", result); - return result; - -} -module_init(wimax_subsys_init); - - -/* Shutdown the wimax stack */ -static -void __exit wimax_subsys_exit(void) -{ - wimax_id_table_release(); - genl_unregister_family(&wimax_gnl_family); -} -module_exit(wimax_subsys_exit); - -MODULE_AUTHOR("Intel Corporation <linux-wimax@intel.com>"); -MODULE_DESCRIPTION("Linux WiMAX stack"); -MODULE_LICENSE("GPL"); diff --git a/net/wimax/wimax-internal.h b/net/wimax/wimax-internal.h deleted file mode 100644 index 40751207296c..000000000000 --- a/net/wimax/wimax-internal.h +++ /dev/null @@ -1,85 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Linux WiMAX - * Internal API for kernel space WiMAX stack - * - * Copyright (C) 2007 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - * - * This header file is for declarations and definitions internal to - * the WiMAX stack. For public APIs and documentation, see - * include/net/wimax.h and include/linux/wimax.h. - */ - -#ifndef __WIMAX_INTERNAL_H__ -#define __WIMAX_INTERNAL_H__ -#ifdef __KERNEL__ - -#ifdef pr_fmt -#undef pr_fmt -#endif - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/device.h> -#include <net/wimax.h> - - -/* - * Decide if a (locked) device is ready for use - * - * Before using the device structure, it must be locked - * (wimax_dev->mutex). As well, most operations need to call this - * function to check if the state is the right one. - * - * An error value will be returned if the state is not the right - * one. In that case, the caller should not attempt to use the device - * and just unlock it. 
- */ -static inline __must_check -int wimax_dev_is_ready(struct wimax_dev *wimax_dev) -{ - if (wimax_dev->state == __WIMAX_ST_NULL) - return -EINVAL; /* Device is not even registered! */ - if (wimax_dev->state == WIMAX_ST_DOWN) - return -ENOMEDIUM; - if (wimax_dev->state == __WIMAX_ST_QUIESCING) - return -ESHUTDOWN; - return 0; -} - - -static inline -void __wimax_state_set(struct wimax_dev *wimax_dev, enum wimax_st state) -{ - wimax_dev->state = state; -} -void __wimax_state_change(struct wimax_dev *, enum wimax_st); - -#ifdef CONFIG_DEBUG_FS -void wimax_debugfs_add(struct wimax_dev *); -void wimax_debugfs_rm(struct wimax_dev *); -#else -static inline void wimax_debugfs_add(struct wimax_dev *wimax_dev) {} -static inline void wimax_debugfs_rm(struct wimax_dev *wimax_dev) {} -#endif - -void wimax_id_table_add(struct wimax_dev *); -struct wimax_dev *wimax_dev_get_by_genl_info(struct genl_info *, int); -void wimax_id_table_rm(struct wimax_dev *); -void wimax_id_table_release(void); - -int wimax_rfkill_add(struct wimax_dev *); -void wimax_rfkill_rm(struct wimax_dev *); - -/* generic netlink */ -extern struct genl_family wimax_gnl_family; - -/* ops */ -int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info); -int wimax_gnl_doit_reset(struct sk_buff *skb, struct genl_info *info); -int wimax_gnl_doit_rfkill(struct sk_buff *skb, struct genl_info *info); -int wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info); - -#endif /* #ifdef __KERNEL__ */ -#endif /* #ifndef __WIMAX_INTERNAL_H__ */ diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 27026f587fa6..f620acd2a0f5 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -21,6 +21,7 @@ config CFG80211 tristate "cfg80211 - wireless configuration API" depends on RFKILL || !RFKILL select FW_LOADER + select CRC32 # may need to update this when certificates are changed and are # using a different algorithm, though right now they shouldn't # (this is here rather than below 
to allow it to be a module) diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 22d1779ab2b1..285b8076054b 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -530,10 +530,10 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: break; + case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: WARN_ON(1); @@ -677,12 +677,12 @@ bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev) case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_P2P_DEVICE: /* Can NAN type be considered as beaconing interface? */ case NL80211_IFTYPE_NAN: break; case NL80211_IFTYPE_UNSPECIFIED: + case NL80211_IFTYPE_WDS: case NUM_NL80211_IFTYPES: WARN_ON(1); } @@ -1093,7 +1093,7 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy, struct wireless_dev *wdev; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); if (!IS_ENABLED(CONFIG_CFG80211_REG_RELAX_NO_IR) || !(wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR)) @@ -1216,9 +1216,10 @@ bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, enum nl80211_iftype iftype) { + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); bool check_no_ir; - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); /* * Under certain conditions suggested by some regulatory bodies a @@ -1324,12 +1325,12 @@ cfg80211_get_chan_state(struct wireless_dev *wdev, break; case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: /* these interface types don't really have a channel */ return; case NL80211_IFTYPE_UNSPECIFIED: + case 
NL80211_IFTYPE_WDS: case NUM_NL80211_IFTYPES: WARN_ON(1); } diff --git a/net/wireless/core.c b/net/wireless/core.c index 240282c083aa..a2785379df6e 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -222,7 +222,7 @@ static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data) void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_P2P_DEVICE)) return; @@ -247,7 +247,7 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_NAN)) return; @@ -273,7 +273,11 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy) dev_close(wdev->netdev); continue; } + /* otherwise, check iftype */ + + wiphy_lock(wiphy); + switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: cfg80211_stop_p2p_device(rdev, wdev); @@ -284,6 +288,8 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy) default: break; } + + wiphy_unlock(wiphy); } } EXPORT_SYMBOL_GPL(cfg80211_shutdown_all_interfaces); @@ -318,9 +324,9 @@ static void cfg80211_event_work(struct work_struct *work) rdev = container_of(work, struct cfg80211_registered_device, event_work); - rtnl_lock(); + wiphy_lock(&rdev->wiphy); cfg80211_process_rdev_events(rdev); - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); } void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev) @@ -328,6 +334,7 @@ void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev) struct wireless_dev *wdev, *tmp; ASSERT_RTNL(); + lockdep_assert_wiphy(&rdev->wiphy); list_for_each_entry_safe(wdev, tmp, &rdev->wiphy.wdev_list, list) { if (wdev->nl_owner_dead) @@ -343,7 +350,9 @@ static void cfg80211_destroy_iface_wk(struct work_struct *work) destroy_work); rtnl_lock(); + 
wiphy_lock(&rdev->wiphy); cfg80211_destroy_ifaces(rdev); + wiphy_unlock(&rdev->wiphy); rtnl_unlock(); } @@ -475,6 +484,7 @@ use_default_name: } } + mutex_init(&rdev->wiphy.mtx); INIT_LIST_HEAD(&rdev->wiphy.wdev_list); INIT_LIST_HEAD(&rdev->beacon_registrations); spin_lock_init(&rdev->beacon_registrations_lock); @@ -631,10 +641,8 @@ static int wiphy_verify_combinations(struct wiphy *wiphy) return -EINVAL; } -#ifndef CONFIG_WIRELESS_WDS if (WARN_ON(all_iftypes & BIT(NL80211_IFTYPE_WDS))) return -EINVAL; -#endif /* You can't even choose that many! */ if (WARN_ON(cnt < c->max_interfaces)) @@ -675,10 +683,8 @@ int wiphy_register(struct wiphy *wiphy) !(wiphy->nan_supported_bands & BIT(NL80211_BAND_2GHZ))))) return -EINVAL; -#ifndef CONFIG_WIRELESS_WDS if (WARN_ON(wiphy->interface_modes & BIT(NL80211_IFTYPE_WDS))) return -EINVAL; -#endif if (WARN_ON(wiphy->pmsr_capa && !wiphy->pmsr_capa->ftm.supported)) return -EINVAL; @@ -1011,15 +1017,16 @@ void wiphy_unregister(struct wiphy *wiphy) wait_event(rdev->dev_wait, ({ int __count; - rtnl_lock(); + wiphy_lock(&rdev->wiphy); __count = rdev->opencount; - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); __count == 0; })); if (rdev->rfkill) rfkill_unregister(rdev->rfkill); rtnl_lock(); + wiphy_lock(&rdev->wiphy); nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY); rdev->wiphy.registered = false; @@ -1042,6 +1049,7 @@ void wiphy_unregister(struct wiphy *wiphy) cfg80211_rdev_list_generation++; device_del(&rdev->wiphy.dev); + wiphy_unlock(&rdev->wiphy); rtnl_unlock(); flush_work(&rdev->scan_done_wk); @@ -1074,6 +1082,7 @@ void cfg80211_dev_free(struct cfg80211_registered_device *rdev) } list_for_each_entry_safe(scan, tmp, &rdev->bss_list, list) cfg80211_put_bss(&rdev->wiphy, &scan->pub); + mutex_destroy(&rdev->wiphy.mtx); kfree(rdev); } @@ -1098,19 +1107,28 @@ void cfg80211_cqm_config_free(struct wireless_dev *wdev) wdev->cqm_config = NULL; } -static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync) +static void 
_cfg80211_unregister_wdev(struct wireless_dev *wdev, + bool unregister_netdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); flush_work(&wdev->pmsr_free_wk); nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); + wdev->registered = false; + + if (wdev->netdev) { + sysfs_remove_link(&wdev->netdev->dev.kobj, "phy80211"); + if (unregister_netdev) + unregister_netdevice(wdev->netdev); + } + list_del_rcu(&wdev->list); - if (sync) - synchronize_rcu(); + synchronize_net(); rdev->devlist_generation++; cfg80211_mlme_purge_registrations(wdev); @@ -1135,14 +1153,23 @@ static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync) flush_work(&wdev->disconnect_wk); cfg80211_cqm_config_free(wdev); + + /* + * Ensure that all events have been processed and + * freed. + */ + cfg80211_process_wdev_events(wdev); + + if (WARN_ON(wdev->current_bss)) { + cfg80211_unhold_bss(wdev->current_bss); + cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); + wdev->current_bss = NULL; + } } void cfg80211_unregister_wdev(struct wireless_dev *wdev) { - if (WARN_ON(wdev->netdev)) - return; - - __cfg80211_unregister_wdev(wdev, true); + _cfg80211_unregister_wdev(wdev, true); } EXPORT_SYMBOL(cfg80211_unregister_wdev); @@ -1153,7 +1180,7 @@ static const struct device_type wiphy_type = { void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, int num) { - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); rdev->num_running_ifaces += num; if (iftype == NL80211_IFTYPE_MONITOR) @@ -1166,7 +1193,7 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev, struct net_device *dev = wdev->netdev; struct cfg80211_sched_scan_request *pos, *tmp; - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); ASSERT_WDEV_LOCK(wdev); cfg80211_pmsr_wdev_down(wdev); @@ -1202,9 +1229,6 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev, case 
NL80211_IFTYPE_OCB: __cfg80211_leave_ocb(rdev, dev); break; - case NL80211_IFTYPE_WDS: - /* must be handled by mac80211/driver, has no APIs */ - break; case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: /* cannot happen, has no netdev */ @@ -1214,6 +1238,7 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev, /* nothing to do */ break; case NL80211_IFTYPE_UNSPECIFIED: + case NL80211_IFTYPE_WDS: case NUM_NL80211_IFTYPES: /* invalid */ break; @@ -1285,6 +1310,9 @@ void cfg80211_init_wdev(struct wireless_dev *wdev) void cfg80211_register_wdev(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { + ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); + /* * We get here also when the interface changes network namespaces, * as it's registered into the new one, but we don't want it to @@ -1296,10 +1324,51 @@ void cfg80211_register_wdev(struct cfg80211_registered_device *rdev, wdev->identifier = ++rdev->wdev_id; list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list); rdev->devlist_generation++; + wdev->registered = true; nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); } +int cfg80211_register_netdevice(struct net_device *dev) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev; + int ret; + + ASSERT_RTNL(); + + if (WARN_ON(!wdev)) + return -EINVAL; + + rdev = wiphy_to_rdev(wdev->wiphy); + + lockdep_assert_held(&rdev->wiphy.mtx); + + /* we'll take care of this */ + wdev->registered = true; + wdev->registering = true; + ret = register_netdevice(dev); + if (ret) + goto out; + + if (sysfs_create_link(&dev->dev.kobj, &rdev->wiphy.dev.kobj, + "phy80211")) { + pr_err("failed to add phy80211 symlink to netdev!\n"); + unregister_netdevice(dev); + ret = -EINVAL; + goto out; + } + + cfg80211_register_wdev(rdev, wdev); + ret = 0; +out: + wdev->registering = false; + if (ret) + wdev->registered = false; + return ret; +} +EXPORT_SYMBOL(cfg80211_register_netdevice); + static int 
cfg80211_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) { @@ -1325,22 +1394,30 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, cfg80211_init_wdev(wdev); break; case NETDEV_REGISTER: + if (!wdev->registered) { + wiphy_lock(&rdev->wiphy); + cfg80211_register_wdev(rdev, wdev); + wiphy_unlock(&rdev->wiphy); + } + break; + case NETDEV_UNREGISTER: /* - * NB: cannot take rdev->mtx here because this may be - * called within code protected by it when interfaces - * are added with nl80211. + * It is possible to get NETDEV_UNREGISTER multiple times, + * so check wdev->registered. */ - if (sysfs_create_link(&dev->dev.kobj, &rdev->wiphy.dev.kobj, - "phy80211")) { - pr_err("failed to add phy80211 symlink to netdev!\n"); + if (wdev->registered && !wdev->registering) { + wiphy_lock(&rdev->wiphy); + _cfg80211_unregister_wdev(wdev, false); + wiphy_unlock(&rdev->wiphy); } - - cfg80211_register_wdev(rdev, wdev); break; case NETDEV_GOING_DOWN: + wiphy_lock(&rdev->wiphy); cfg80211_leave(rdev, wdev); + wiphy_unlock(&rdev->wiphy); break; case NETDEV_DOWN: + wiphy_lock(&rdev->wiphy); cfg80211_update_iface_num(rdev, wdev->iftype, -1); if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified && @@ -1357,9 +1434,11 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, } rdev->opencount--; + wiphy_unlock(&rdev->wiphy); wake_up(&rdev->dev_wait); break; case NETDEV_UP: + wiphy_lock(&rdev->wiphy); cfg80211_update_iface_num(rdev, wdev->iftype, 1); wdev_lock(wdev); switch (wdev->iftype) { @@ -1406,38 +1485,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, /* assume this means it's off */ wdev->ps = false; } - break; - case NETDEV_UNREGISTER: - /* - * It is possible to get NETDEV_UNREGISTER - * multiple times. To detect that, check - * that the interface is still on the list - * of registered interfaces, and only then - * remove and clean it up. 
- */ - if (!list_empty(&wdev->list)) { - __cfg80211_unregister_wdev(wdev, false); - sysfs_remove_link(&dev->dev.kobj, "phy80211"); - } - /* - * synchronise (so that we won't find this netdev - * from other code any more) and then clear the list - * head so that the above code can safely check for - * !list_empty() to avoid double-cleanup. - */ - synchronize_rcu(); - INIT_LIST_HEAD(&wdev->list); - /* - * Ensure that all events have been processed and - * freed. - */ - cfg80211_process_wdev_events(wdev); - - if (WARN_ON(wdev->current_bss)) { - cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); - wdev->current_bss = NULL; - } + wiphy_unlock(&rdev->wiphy); break; case NETDEV_PRE_UP: if (!cfg80211_iftype_allowed(wdev->wiphy, wdev->iftype, diff --git a/net/wireless/core.h b/net/wireless/core.h index e3e9686859d4..a7d19b4b40ac 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -231,7 +231,7 @@ static inline void wdev_unlock(struct wireless_dev *wdev) static inline bool cfg80211_has_monitors_only(struct cfg80211_registered_device *rdev) { - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); return rdev->num_running_ifaces == rdev->num_running_monitor_ifaces && rdev->num_running_ifaces > 0; @@ -433,6 +433,8 @@ void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev); /* internal helpers */ bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher); +bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev, + int key_idx, bool pairwise); int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr); diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c index 76b845f68ac8..aab43469a2f0 100644 --- a/net/wireless/debugfs.c +++ b/net/wireless/debugfs.c @@ -73,8 +73,6 @@ static ssize_t ht40allow_map_read(struct file *file, if (!buf) return -ENOMEM; - rtnl_lock(); - for (band = 0; band < 
NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) @@ -84,8 +82,6 @@ static ssize_t ht40allow_map_read(struct file *file, buf, buf_size, offset); } - rtnl_unlock(); - r = simple_read_from_buffer(user_buf, count, ppos, buf, offset); kfree(buf); diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index a0621bb76d8e..8f98e546becf 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -3,6 +3,7 @@ * Some IBSS support code for cfg80211. * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> + * Copyright (C) 2020-2021 Intel Corporation */ #include <linux/etherdevice.h> @@ -92,7 +93,7 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev = dev->ieee80211_ptr; int err; - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); ASSERT_WDEV_LOCK(wdev); if (wdev->ssid_len) diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 0ac820780437..3aa69b375a10 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -4,7 +4,7 @@ * * Copyright (c) 2009, Jouni Malinen <j@w1.fi> * Copyright (c) 2015 Intel Deutschland GmbH - * Copyright (C) 2019 Intel Corporation + * Copyright (C) 2019-2020 Intel Corporation */ #include <linux/kernel.h> @@ -81,7 +81,8 @@ static void cfg80211_process_auth(struct wireless_dev *wdev, } static void cfg80211_process_deauth(struct wireless_dev *wdev, - const u8 *buf, size_t len) + const u8 *buf, size_t len, + bool reconnect) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; @@ -89,7 +90,7 @@ static void cfg80211_process_deauth(struct wireless_dev *wdev, u16 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); bool from_ap = !ether_addr_equal(mgmt->sa, wdev->netdev->dev_addr); - nl80211_send_deauth(rdev, wdev->netdev, buf, len, GFP_KERNEL); + nl80211_send_deauth(rdev, wdev->netdev, buf, len, reconnect, GFP_KERNEL); if (!wdev->current_bss || !ether_addr_equal(wdev->current_bss->pub.bssid, 
bssid)) @@ -100,7 +101,8 @@ static void cfg80211_process_deauth(struct wireless_dev *wdev, } static void cfg80211_process_disassoc(struct wireless_dev *wdev, - const u8 *buf, size_t len) + const u8 *buf, size_t len, + bool reconnect) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; @@ -108,7 +110,8 @@ static void cfg80211_process_disassoc(struct wireless_dev *wdev, u16 reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code); bool from_ap = !ether_addr_equal(mgmt->sa, wdev->netdev->dev_addr); - nl80211_send_disassoc(rdev, wdev->netdev, buf, len, GFP_KERNEL); + nl80211_send_disassoc(rdev, wdev->netdev, buf, len, reconnect, + GFP_KERNEL); if (WARN_ON(!wdev->current_bss || !ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) @@ -133,9 +136,9 @@ void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len) if (ieee80211_is_auth(mgmt->frame_control)) cfg80211_process_auth(wdev, buf, len); else if (ieee80211_is_deauth(mgmt->frame_control)) - cfg80211_process_deauth(wdev, buf, len); + cfg80211_process_deauth(wdev, buf, len, false); else if (ieee80211_is_disassoc(mgmt->frame_control)) - cfg80211_process_disassoc(wdev, buf, len); + cfg80211_process_disassoc(wdev, buf, len, false); } EXPORT_SYMBOL(cfg80211_rx_mlme_mgmt); @@ -180,22 +183,23 @@ void cfg80211_abandon_assoc(struct net_device *dev, struct cfg80211_bss *bss) } EXPORT_SYMBOL(cfg80211_abandon_assoc); -void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len) +void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len, + bool reconnect) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_mgmt *mgmt = (void *)buf; ASSERT_WDEV_LOCK(wdev); - trace_cfg80211_tx_mlme_mgmt(dev, buf, len); + trace_cfg80211_tx_mlme_mgmt(dev, buf, len, reconnect); if (WARN_ON(len < 2)) return; if (ieee80211_is_deauth(mgmt->frame_control)) - cfg80211_process_deauth(wdev, buf, len); + 
cfg80211_process_deauth(wdev, buf, len, reconnect); else - cfg80211_process_disassoc(wdev, buf, len); + cfg80211_process_disassoc(wdev, buf, len, reconnect); } EXPORT_SYMBOL(cfg80211_tx_mlme_mgmt); @@ -446,7 +450,7 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) struct cfg80211_mgmt_registration *reg; struct mgmt_frame_regs upd = {}; - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); spin_lock_bh(&wdev->mgmt_registrations_lock); if (!wdev->mgmt_registrations_need_update) { @@ -488,10 +492,10 @@ void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk) rdev = container_of(wk, struct cfg80211_registered_device, mgmt_registrations_update_wk); - rtnl_lock(); + wiphy_lock(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_mgmt_registrations_update(wdev); - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); } int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a77174b99b07..521d36bb0803 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -64,9 +64,9 @@ static const struct genl_multicast_group nl80211_mcgrps[] = { /* returns ERR_PTR values */ static struct wireless_dev * -__cfg80211_wdev_from_attrs(struct net *netns, struct nlattr **attrs) +__cfg80211_wdev_from_attrs(struct cfg80211_registered_device *rdev, + struct net *netns, struct nlattr **attrs) { - struct cfg80211_registered_device *rdev; struct wireless_dev *result = NULL; bool have_ifidx = attrs[NL80211_ATTR_IFINDEX]; bool have_wdev_id = attrs[NL80211_ATTR_WDEV]; @@ -74,8 +74,6 @@ __cfg80211_wdev_from_attrs(struct net *netns, struct nlattr **attrs) int wiphy_idx = -1; int ifidx = -1; - ASSERT_RTNL(); - if (!have_ifidx && !have_wdev_id) return ERR_PTR(-EINVAL); @@ -86,6 +84,28 @@ __cfg80211_wdev_from_attrs(struct net *netns, struct nlattr **attrs) wiphy_idx = wdev_id >> 32; } + if (rdev) { + struct wireless_dev *wdev; + + 
lockdep_assert_held(&rdev->wiphy.mtx); + + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { + if (have_ifidx && wdev->netdev && + wdev->netdev->ifindex == ifidx) { + result = wdev; + break; + } + if (have_wdev_id && wdev->identifier == (u32)wdev_id) { + result = wdev; + break; + } + } + + return result ?: ERR_PTR(-ENODEV); + } + + ASSERT_RTNL(); + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { struct wireless_dev *wdev; @@ -399,6 +419,18 @@ nl80211_unsol_bcast_probe_resp_policy[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX + .len = IEEE80211_MAX_DATA_LEN } }; +static const struct nla_policy +sar_specs_policy[NL80211_SAR_ATTR_SPECS_MAX + 1] = { + [NL80211_SAR_ATTR_SPECS_POWER] = { .type = NLA_S32 }, + [NL80211_SAR_ATTR_SPECS_RANGE_INDEX] = {.type = NLA_U32 }, +}; + +static const struct nla_policy +sar_policy[NL80211_SAR_ATTR_MAX + 1] = { + [NL80211_SAR_ATTR_TYPE] = NLA_POLICY_MAX(NLA_U32, NUM_NL80211_SAR_TYPE), + [NL80211_SAR_ATTR_SPECS] = NLA_POLICY_NESTED_ARRAY(sar_specs_policy), +}; + static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, @@ -715,6 +747,12 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NLA_POLICY_EXACT_LEN(IEEE80211_S1G_CAPABILITY_LEN), [NL80211_ATTR_S1G_CAPABILITY_MASK] = NLA_POLICY_EXACT_LEN(IEEE80211_S1G_CAPABILITY_LEN), + [NL80211_ATTR_SAE_PWE] = + NLA_POLICY_RANGE(NLA_U8, NL80211_SAE_PWE_HUNT_AND_PECK, + NL80211_SAE_PWE_BOTH), + [NL80211_ATTR_RECONNECT_REQUESTED] = { .type = NLA_REJECT }, + [NL80211_ATTR_SAR_SPEC] = NLA_POLICY_NESTED(sar_policy), + [NL80211_ATTR_DISABLE_HE] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -897,22 +935,31 @@ int nl80211_prepare_wdev_dump(struct netlink_callback *cb, return err; } - *wdev = __cfg80211_wdev_from_attrs(sock_net(cb->skb->sk), + rtnl_lock(); + *wdev = __cfg80211_wdev_from_attrs(NULL, sock_net(cb->skb->sk), attrbuf); kfree(attrbuf); - 
if (IS_ERR(*wdev)) + if (IS_ERR(*wdev)) { + rtnl_unlock(); return PTR_ERR(*wdev); + } *rdev = wiphy_to_rdev((*wdev)->wiphy); + mutex_lock(&(*rdev)->wiphy.mtx); + rtnl_unlock(); /* 0 is the first index - add 1 to parse only once */ cb->args[0] = (*rdev)->wiphy_idx + 1; cb->args[1] = (*wdev)->identifier; } else { /* subtract the 1 again here */ - struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1); + struct wiphy *wiphy; struct wireless_dev *tmp; - if (!wiphy) + rtnl_lock(); + wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1); + if (!wiphy) { + rtnl_unlock(); return -ENODEV; + } *rdev = wiphy_to_rdev(wiphy); *wdev = NULL; @@ -923,8 +970,12 @@ int nl80211_prepare_wdev_dump(struct netlink_callback *cb, } } - if (!*wdev) + if (!*wdev) { + rtnl_unlock(); return -ENODEV; + } + mutex_lock(&(*rdev)->wiphy.mtx); + rtnl_unlock(); } return 0; @@ -1882,7 +1933,6 @@ static int nl80211_add_commands_unsplit(struct cfg80211_registered_device *rdev, if (nla_put_u32(msg, i, NL80211_CMD_SET_CHANNEL)) goto nla_put_failure; } - CMD(set_wds_peer, SET_WDS_PEER); if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) { CMD(tdls_mgmt, TDLS_MGMT); CMD(tdls_oper, TDLS_OPER); @@ -2092,6 +2142,56 @@ fail: return -ENOBUFS; } +static int +nl80211_put_sar_specs(struct cfg80211_registered_device *rdev, + struct sk_buff *msg) +{ + struct nlattr *sar_capa, *specs, *sub_freq_range; + u8 num_freq_ranges; + int i; + + if (!rdev->wiphy.sar_capa) + return 0; + + num_freq_ranges = rdev->wiphy.sar_capa->num_freq_ranges; + + sar_capa = nla_nest_start(msg, NL80211_ATTR_SAR_SPEC); + if (!sar_capa) + return -ENOSPC; + + if (nla_put_u32(msg, NL80211_SAR_ATTR_TYPE, rdev->wiphy.sar_capa->type)) + goto fail; + + specs = nla_nest_start(msg, NL80211_SAR_ATTR_SPECS); + if (!specs) + goto fail; + + /* report supported freq_ranges */ + for (i = 0; i < num_freq_ranges; i++) { + sub_freq_range = nla_nest_start(msg, i + 1); + if (!sub_freq_range) + goto fail; + + if (nla_put_u32(msg, NL80211_SAR_ATTR_SPECS_START_FREQ, + 
rdev->wiphy.sar_capa->freq_ranges[i].start_freq)) + goto fail; + + if (nla_put_u32(msg, NL80211_SAR_ATTR_SPECS_END_FREQ, + rdev->wiphy.sar_capa->freq_ranges[i].end_freq)) + goto fail; + + nla_nest_end(msg, sub_freq_range); + } + + nla_nest_end(msg, specs); + nla_nest_end(msg, sar_capa); + + return 0; +fail: + nla_nest_cancel(msg, sar_capa); + return -ENOBUFS; +} + struct nl80211_dump_wiphy_state { s64 filter_wiphy; long start; @@ -2341,6 +2441,8 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, CMD(set_multicast_to_unicast, SET_MULTICAST_TO_UNICAST); CMD(update_connect_params, UPDATE_CONNECT_PARAMS); CMD(update_ft_ies, UPDATE_FT_IES); + if (rdev->wiphy.sar_capa) + CMD(set_sar_specs, SET_SAR_SPECS); } #undef CMD @@ -2666,6 +2768,11 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (nl80211_put_tid_config_support(rdev, msg)) goto nla_put_failure; + state->split_start++; + break; + case 16: + if (nl80211_put_sar_specs(rdev, msg)) + goto nla_put_failure; /* done */ state->split_start = 0; @@ -2860,8 +2967,8 @@ static int parse_txq_params(struct nlattr *tb[], static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev) { /* - * You can only set the channel explicitly for WDS interfaces, - * all others have their channel managed via their respective + * You can only set the channel explicitly for some interfaces, + * most have their channel managed via their respective * "establish a connection" command (connect, join, ...) 
* * For AP/GO and mesh mode, the channel can be set with the @@ -3066,32 +3173,9 @@ static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info) return __nl80211_set_channel(rdev, netdev, info); } -static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info) -{ - struct cfg80211_registered_device *rdev = info->user_ptr[0]; - struct net_device *dev = info->user_ptr[1]; - struct wireless_dev *wdev = dev->ieee80211_ptr; - const u8 *bssid; - - if (!info->attrs[NL80211_ATTR_MAC]) - return -EINVAL; - - if (netif_running(dev)) - return -EBUSY; - - if (!rdev->ops->set_wds_peer) - return -EOPNOTSUPP; - - if (wdev->iftype != NL80211_IFTYPE_WDS) - return -EOPNOTSUPP; - - bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - return rdev_set_wds_peer(rdev, dev, bssid); -} - static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = NULL; struct net_device *netdev = NULL; struct wireless_dev *wdev; int result = 0, rem_txq_params = 0; @@ -3102,8 +3186,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) u8 coverage_class = 0; u32 txq_limit = 0, txq_memory_limit = 0, txq_quantum = 0; - ASSERT_RTNL(); - + rtnl_lock(); /* * Try to find the wiphy and netdev. Normally this * function shouldn't need the netdev, but this is @@ -3127,14 +3210,18 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (!netdev) { rdev = __cfg80211_rdev_from_attrs(genl_info_net(info), info->attrs); - if (IS_ERR(rdev)) + if (IS_ERR(rdev)) { + rtnl_unlock(); return PTR_ERR(rdev); + } wdev = NULL; netdev = NULL; result = 0; } else wdev = netdev->ieee80211_ptr; + wiphy_lock(&rdev->wiphy); + /* * end workaround code, by now the rdev is available * and locked, and wdev may or may not be NULL. 
@@ -3143,26 +3230,35 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_WIPHY_NAME]) result = cfg80211_dev_rename( rdev, nla_data(info->attrs[NL80211_ATTR_WIPHY_NAME])); + rtnl_unlock(); if (result) - return result; + goto out; if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) { struct ieee80211_txq_params txq_params; struct nlattr *tb[NL80211_TXQ_ATTR_MAX + 1]; - if (!rdev->ops->set_txq_params) - return -EOPNOTSUPP; + if (!rdev->ops->set_txq_params) { + result = -EOPNOTSUPP; + goto out; + } - if (!netdev) - return -EINVAL; + if (!netdev) { + result = -EINVAL; + goto out; + } if (netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && - netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) - return -EINVAL; + netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { + result = -EINVAL; + goto out; + } - if (!netif_running(netdev)) - return -ENETDOWN; + if (!netif_running(netdev)) { + result = -ENETDOWN; + goto out; + } nla_for_each_nested(nl_txq_params, info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS], @@ -3173,15 +3269,15 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) txq_params_policy, info->extack); if (result) - return result; + goto out; result = parse_txq_params(tb, &txq_params); if (result) - return result; + goto out; result = rdev_set_txq_params(rdev, netdev, &txq_params); if (result) - return result; + goto out; } } @@ -3191,7 +3287,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) nl80211_can_set_dev_channel(wdev) ? 
netdev : NULL, info); if (result) - return result; + goto out; } if (info->attrs[NL80211_ATTR_WIPHY_TX_POWER_SETTING]) { @@ -3202,15 +3298,19 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (!(rdev->wiphy.features & NL80211_FEATURE_VIF_TXPOWER)) txp_wdev = NULL; - if (!rdev->ops->set_tx_power) - return -EOPNOTSUPP; + if (!rdev->ops->set_tx_power) { + result = -EOPNOTSUPP; + goto out; + } idx = NL80211_ATTR_WIPHY_TX_POWER_SETTING; type = nla_get_u32(info->attrs[idx]); if (!info->attrs[NL80211_ATTR_WIPHY_TX_POWER_LEVEL] && - (type != NL80211_TX_POWER_AUTOMATIC)) - return -EINVAL; + (type != NL80211_TX_POWER_AUTOMATIC)) { + result = -EINVAL; + goto out; + } if (type != NL80211_TX_POWER_AUTOMATIC) { idx = NL80211_ATTR_WIPHY_TX_POWER_LEVEL; @@ -3219,7 +3319,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) result = rdev_set_tx_power(rdev, txp_wdev, type, mbm); if (result) - return result; + goto out; } if (info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX] && @@ -3228,8 +3328,10 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if ((!rdev->wiphy.available_antennas_tx && !rdev->wiphy.available_antennas_rx) || - !rdev->ops->set_antenna) - return -EOPNOTSUPP; + !rdev->ops->set_antenna) { + result = -EOPNOTSUPP; + goto out; + } tx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX]); rx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]); @@ -3237,15 +3339,17 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) /* reject antenna configurations which don't match the * available antenna masks, except for the "all" mask */ if ((~tx_ant && (tx_ant & ~rdev->wiphy.available_antennas_tx)) || - (~rx_ant && (rx_ant & ~rdev->wiphy.available_antennas_rx))) - return -EINVAL; + (~rx_ant && (rx_ant & ~rdev->wiphy.available_antennas_rx))) { + result = -EINVAL; + goto out; + } tx_ant = tx_ant & rdev->wiphy.available_antennas_tx; rx_ant = rx_ant & 
rdev->wiphy.available_antennas_rx; result = rdev_set_antenna(rdev, tx_ant, rx_ant); if (result) - return result; + goto out; } changed = 0; @@ -3267,8 +3371,10 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]) { frag_threshold = nla_get_u32( info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]); - if (frag_threshold < 256) - return -EINVAL; + if (frag_threshold < 256) { + result = -EINVAL; + goto out; + } if (frag_threshold != (u32) -1) { /* @@ -3289,8 +3395,10 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]) { - if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) - return -EINVAL; + if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) { + result = -EINVAL; + goto out; + } coverage_class = nla_get_u8( info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]); @@ -3298,16 +3406,20 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) { - if (!(rdev->wiphy.features & NL80211_FEATURE_ACKTO_ESTIMATION)) - return -EOPNOTSUPP; + if (!(rdev->wiphy.features & NL80211_FEATURE_ACKTO_ESTIMATION)) { + result = -EOPNOTSUPP; + goto out; + } changed |= WIPHY_PARAM_DYN_ACK; } if (info->attrs[NL80211_ATTR_TXQ_LIMIT]) { if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_TXQS)) - return -EOPNOTSUPP; + NL80211_EXT_FEATURE_TXQS)) { + result = -EOPNOTSUPP; + goto out; + } txq_limit = nla_get_u32( info->attrs[NL80211_ATTR_TXQ_LIMIT]); changed |= WIPHY_PARAM_TXQ_LIMIT; @@ -3315,8 +3427,10 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]) { if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_TXQS)) - return -EOPNOTSUPP; + NL80211_EXT_FEATURE_TXQS)) { + result = -EOPNOTSUPP; + goto out; + } txq_memory_limit = nla_get_u32( info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]); changed |= 
WIPHY_PARAM_TXQ_MEMORY_LIMIT; @@ -3324,8 +3438,10 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_TXQ_QUANTUM]) { if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_TXQS)) - return -EOPNOTSUPP; + NL80211_EXT_FEATURE_TXQS)) { + result = -EOPNOTSUPP; + goto out; + } txq_quantum = nla_get_u32( info->attrs[NL80211_ATTR_TXQ_QUANTUM]); changed |= WIPHY_PARAM_TXQ_QUANTUM; @@ -3337,8 +3453,10 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) u8 old_coverage_class; u32 old_txq_limit, old_txq_memory_limit, old_txq_quantum; - if (!rdev->ops->set_wiphy_params) - return -EOPNOTSUPP; + if (!rdev->ops->set_wiphy_params) { + result = -EOPNOTSUPP; + goto out; + } old_retry_short = rdev->wiphy.retry_short; old_retry_long = rdev->wiphy.retry_long; @@ -3376,10 +3494,15 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.txq_limit = old_txq_limit; rdev->wiphy.txq_memory_limit = old_txq_memory_limit; rdev->wiphy.txq_quantum = old_txq_quantum; - return result; + goto out; } } - return 0; + + result = 0; + +out: + wiphy_unlock(&rdev->wiphy); + return result; } static int nl80211_send_chandef(struct sk_buff *msg, @@ -3910,6 +4033,17 @@ static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; /* + * We hold RTNL, so this is safe, without RTNL opencount cannot + * reach 0, and thus the rdev cannot be deleted. + * + * We need to do it for the dev_close(), since that will call + * the netdev notifiers, and we need to acquire the mutex there + * but don't know if we get there from here or from some other + * place (e.g. "ip link set ... down"). + */ + mutex_unlock(&rdev->wiphy.mtx); + + /* * If we remove a wireless device without a netdev then clear * user_ptr[1] so that nl80211_post_doit won't dereference it * to check if it needs to do dev_put(). 
Otherwise it crashes @@ -3918,6 +4052,10 @@ static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info) */ if (!wdev->netdev) info->user_ptr[1] = NULL; + else + dev_close(wdev->netdev); + + mutex_lock(&rdev->wiphy.mtx); return rdev_del_virtual_intf(rdev, wdev); } @@ -4260,9 +4398,6 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) if (err) return err; - if (key.idx < 0) - return -EINVAL; - if (info->attrs[NL80211_ATTR_MAC]) mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); @@ -4278,6 +4413,10 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) key.type != NL80211_KEYTYPE_GROUP) return -EINVAL; + if (!cfg80211_valid_key_idx(rdev, key.idx, + key.type == NL80211_KEYTYPE_PAIRWISE)) + return -EINVAL; + if (!rdev->ops->del_key) return -EOPNOTSUPP; @@ -4595,7 +4734,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, struct nlattr *attrs[], enum nl80211_attrs attr, struct cfg80211_bitrate_mask *mask, - struct net_device *dev) + struct net_device *dev, + bool default_all_enabled) { struct nlattr *tb[NL80211_TXRATE_MAX + 1]; struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -4610,6 +4750,9 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, for (i = 0; i < NUM_NL80211_BANDS; i++) { const struct ieee80211_sta_he_cap *he_cap; + if (!default_all_enabled) + break; + sband = rdev->wiphy.bands[i]; if (!sband) @@ -4677,6 +4820,7 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[band].ht_mcs)) return -EINVAL; } + if (tb[NL80211_TXRATE_VHT]) { if (!vht_set_mcs_mask( sband, @@ -4684,6 +4828,7 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[band].vht_mcs)) return -EINVAL; } + if (tb[NL80211_TXRATE_GI]) { mask->control[band].gi = nla_get_u8(tb[NL80211_TXRATE_GI]); @@ -4695,6 +4840,7 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, nla_data(tb[NL80211_TXRATE_HE]), 
mask->control[band].he_mcs)) return -EINVAL; + if (tb[NL80211_TXRATE_HE_GI]) mask->control[band].he_gi = nla_get_u8(tb[NL80211_TXRATE_HE_GI]); @@ -4736,7 +4882,7 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, enum nl80211_band band, struct cfg80211_bitrate_mask *beacon_rate) { - u32 count_ht, count_vht, i; + u32 count_ht, count_vht, count_he, i; u32 rate = beacon_rate->control[band].legacy; /* Allow only one rate */ @@ -4769,7 +4915,21 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, return -EINVAL; } - if ((count_ht && count_vht) || (!rate && !count_ht && !count_vht)) + count_he = 0; + for (i = 0; i < NL80211_HE_NSS_MAX; i++) { + if (hweight16(beacon_rate->control[band].he_mcs[i]) > 1) { + return -EINVAL; + } else if (beacon_rate->control[band].he_mcs[i]) { + count_he++; + if (count_he > 1) + return -EINVAL; + } + if (count_he && rate) + return -EINVAL; + } + + if ((count_ht && count_vht && count_he) || + (!rate && !count_ht && !count_vht && !count_he)) return -EINVAL; if (rate && @@ -4784,6 +4944,10 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, !wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_RATE_VHT)) return -EINVAL; + if (count_he && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_HE)) + return -EINVAL; return 0; } @@ -5013,6 +5177,8 @@ static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, params->vht_required = true; if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HE_PHY) params->he_required = true; + if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_SAE_H2E) + params->sae_h2e_required = true; } } @@ -5244,7 +5410,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, ¶ms.beacon_rate, - dev); + dev, false); if (err) return err; @@ -5806,10 +5972,11 @@ static int nl80211_dump_station(struct sk_buff *skb, int 
sta_idx = cb->args[2]; int err; - rtnl_lock(); err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); if (err) - goto out_err; + return err; + /* nl80211_prepare_wdev_dump acquired it in the successful case */ + __acquire(&rdev->wiphy.mtx); if (!wdev->netdev) { err = -EINVAL; @@ -5844,7 +6011,7 @@ static int nl80211_dump_station(struct sk_buff *skb, cb->args[2] = sta_idx; err = skb->len; out_err: - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); return err; } @@ -6702,10 +6869,11 @@ static int nl80211_dump_mpath(struct sk_buff *skb, int path_idx = cb->args[2]; int err; - rtnl_lock(); err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); if (err) - goto out_err; + return err; + /* nl80211_prepare_wdev_dump acquired it in the successful case */ + __acquire(&rdev->wiphy.mtx); if (!rdev->ops->dump_mpath) { err = -EOPNOTSUPP; @@ -6738,7 +6906,7 @@ static int nl80211_dump_mpath(struct sk_buff *skb, cb->args[2] = path_idx; err = skb->len; out_err: - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); return err; } @@ -6901,10 +7069,11 @@ static int nl80211_dump_mpp(struct sk_buff *skb, int path_idx = cb->args[2]; int err; - rtnl_lock(); err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); if (err) - goto out_err; + return err; + /* nl80211_prepare_wdev_dump acquired it in the successful case */ + __acquire(&rdev->wiphy.mtx); if (!rdev->ops->dump_mpp) { err = -EOPNOTSUPP; @@ -6937,7 +7106,7 @@ static int nl80211_dump_mpp(struct sk_buff *skb, cb->args[2] = path_idx; err = skb->len; out_err: - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); return err; } @@ -7556,12 +7725,15 @@ static int nl80211_get_reg_do(struct sk_buff *skb, struct genl_info *info) if (!hdr) goto put_failure; + rtnl_lock(); + if (info->attrs[NL80211_ATTR_WIPHY]) { bool self_managed; rdev = cfg80211_get_dev_from_info(genl_info_net(info), info); if (IS_ERR(rdev)) { nlmsg_free(msg); + rtnl_unlock(); return PTR_ERR(rdev); } @@ -7573,6 +7745,7 @@ static int nl80211_get_reg_do(struct sk_buff *skb, struct genl_info *info) /* a 
self-managed-reg device must have a private regdom */ if (WARN_ON(!regdom && self_managed)) { nlmsg_free(msg); + rtnl_unlock(); return -EINVAL; } @@ -7597,11 +7770,13 @@ static int nl80211_get_reg_do(struct sk_buff *skb, struct genl_info *info) rcu_read_unlock(); genlmsg_end(msg, hdr); + rtnl_unlock(); return genlmsg_reply(msg, info); nla_put_failure_rcu: rcu_read_unlock(); nla_put_failure: + rtnl_unlock(); put_failure: nlmsg_free(msg); return -EMSGSIZE; @@ -7764,12 +7939,17 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - if (!reg_is_valid_request(alpha2)) - return -EINVAL; + rtnl_lock(); + if (!reg_is_valid_request(alpha2)) { + r = -EINVAL; + goto out; + } rd = kzalloc(struct_size(rd, reg_rules, num_rules), GFP_KERNEL); - if (!rd) - return -ENOMEM; + if (!rd) { + r = -ENOMEM; + goto out; + } rd->n_reg_rules = num_rules; rd->alpha2[0] = alpha2[0]; @@ -7801,10 +7981,13 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) } } + r = set_regdom(rd, REGD_SOURCE_CRDA); /* set_regdom takes ownership of rd */ - return set_regdom(rd, REGD_SOURCE_CRDA); + rd = NULL; bad_reg: kfree(rd); + out: + rtnl_unlock(); return r; } #endif /* CONFIG_CFG80211_CRDA_SUPPORT */ @@ -8237,12 +8420,6 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]) { - if (!wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_SET_SCAN_DWELL)) { - err = -EOPNOTSUPP; - goto out_free; - } - request->duration = nla_get_u16(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]); request->duration_mandatory = @@ -8978,10 +9155,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_csa_settings params; - /* csa_attrs is defined static to avoid waste of stack size - this - * function is called under RTNL lock, so this should not be a 
problem. - */ - static struct nlattr *csa_attrs[NL80211_ATTR_MAX+1]; + struct nlattr **csa_attrs = NULL; int err; bool need_new_beacon = false; bool need_handle_dfs_flag = true; @@ -9046,28 +9220,39 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) if (err) return err; + csa_attrs = kcalloc(NL80211_ATTR_MAX + 1, sizeof(*csa_attrs), + GFP_KERNEL); + if (!csa_attrs) + return -ENOMEM; + err = nla_parse_nested_deprecated(csa_attrs, NL80211_ATTR_MAX, info->attrs[NL80211_ATTR_CSA_IES], nl80211_policy, info->extack); if (err) - return err; + goto free; err = nl80211_parse_beacon(rdev, csa_attrs, ¶ms.beacon_csa); if (err) - return err; + goto free; - if (!csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]) - return -EINVAL; + if (!csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]) { + err = -EINVAL; + goto free; + } len = nla_len(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]); - if (!len || (len % sizeof(u16))) - return -EINVAL; + if (!len || (len % sizeof(u16))) { + err = -EINVAL; + goto free; + } params.n_counter_offsets_beacon = len / sizeof(u16); if (rdev->wiphy.max_num_csa_counters && (params.n_counter_offsets_beacon > - rdev->wiphy.max_num_csa_counters)) - return -EINVAL; + rdev->wiphy.max_num_csa_counters)) { + err = -EINVAL; + goto free; + } params.counter_offsets_beacon = nla_data(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]); @@ -9076,23 +9261,31 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) for (i = 0; i < params.n_counter_offsets_beacon; i++) { u16 offset = params.counter_offsets_beacon[i]; - if (offset >= params.beacon_csa.tail_len) - return -EINVAL; + if (offset >= params.beacon_csa.tail_len) { + err = -EINVAL; + goto free; + } - if (params.beacon_csa.tail[offset] != params.count) - return -EINVAL; + if (params.beacon_csa.tail[offset] != params.count) { + err = -EINVAL; + goto free; + } } if (csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]) { len = nla_len(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]); - if (!len || (len 
% sizeof(u16))) - return -EINVAL; + if (!len || (len % sizeof(u16))) { + err = -EINVAL; + goto free; + } params.n_counter_offsets_presp = len / sizeof(u16); if (rdev->wiphy.max_num_csa_counters && (params.n_counter_offsets_presp > - rdev->wiphy.max_num_csa_counters)) - return -EINVAL; + rdev->wiphy.max_num_csa_counters)) { + err = -EINVAL; + goto free; + } params.counter_offsets_presp = nla_data(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]); @@ -9101,35 +9294,42 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) for (i = 0; i < params.n_counter_offsets_presp; i++) { u16 offset = params.counter_offsets_presp[i]; - if (offset >= params.beacon_csa.probe_resp_len) - return -EINVAL; + if (offset >= params.beacon_csa.probe_resp_len) { + err = -EINVAL; + goto free; + } if (params.beacon_csa.probe_resp[offset] != - params.count) - return -EINVAL; + params.count) { + err = -EINVAL; + goto free; + } } } skip_beacons: err = nl80211_parse_chandef(rdev, info, ¶ms.chandef); if (err) - return err; + goto free; if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, ¶ms.chandef, - wdev->iftype)) - return -EINVAL; + wdev->iftype)) { + err = -EINVAL; + goto free; + } err = cfg80211_chandef_dfs_required(wdev->wiphy, ¶ms.chandef, wdev->iftype); if (err < 0) - return err; + goto free; if (err > 0) { params.radar_required = true; if (need_handle_dfs_flag && !nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS])) { - return -EINVAL; + err = -EINVAL; + goto free; } } @@ -9140,6 +9340,8 @@ skip_beacons: err = rdev_channel_switch(rdev, dev, ¶ms); wdev_unlock(wdev); +free: + kfree(csa_attrs); return err; } @@ -9290,12 +9492,11 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb) int start = cb->args[2], idx = 0; int err; - rtnl_lock(); err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); - if (err) { - rtnl_unlock(); + if (err) return err; - } + /* nl80211_prepare_wdev_dump acquired it in the successful case */ + __acquire(&rdev->wiphy.mtx); 
wdev_lock(wdev); spin_lock_bh(&rdev->bss_lock); @@ -9326,7 +9527,7 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb) wdev_unlock(wdev); cb->args[2] = idx; - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); return skb->len; } @@ -9424,10 +9625,13 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) if (!attrbuf) return -ENOMEM; - rtnl_lock(); res = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); - if (res) - goto out_err; + if (res) { + kfree(attrbuf); + return res; + } + /* nl80211_prepare_wdev_dump acquired it in the successful case */ + __acquire(&rdev->wiphy.mtx); /* prepare_wdev_dump parsed the attributes */ radio_stats = attrbuf[NL80211_ATTR_SURVEY_RADIO_STATS]; @@ -9469,7 +9673,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) res = skb->len; out_err: kfree(attrbuf); - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); return res; } @@ -9732,6 +9936,12 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, nla_len(info->attrs[NL80211_ATTR_SAE_PASSWORD]); } + if (info->attrs[NL80211_ATTR_SAE_PWE]) + settings->sae_pwe = + nla_get_u8(info->attrs[NL80211_ATTR_SAE_PWE]); + else + settings->sae_pwe = NL80211_SAE_PWE_UNSPECIFIED; + return 0; } @@ -9810,6 +10020,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_VHT])) req.flags |= ASSOC_REQ_DISABLE_VHT; + if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_HE])) + req.flags |= ASSOC_REQ_DISABLE_HE; + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) memcpy(&req.vht_capa_mask, nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]), @@ -10325,10 +10538,14 @@ EXPORT_SYMBOL(__cfg80211_send_event_skb); static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; - struct wireless_dev *wdev = - __cfg80211_wdev_from_attrs(genl_info_net(info), info->attrs); + 
struct wireless_dev *wdev; int err; + lockdep_assert_held(&rdev->wiphy.mtx); + + wdev = __cfg80211_wdev_from_attrs(rdev, genl_info_net(info), + info->attrs); + if (!rdev->ops->testmode_cmd) return -EOPNOTSUPP; @@ -10589,6 +10806,9 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_VHT])) connect.flags |= ASSOC_REQ_DISABLE_VHT; + if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_HE])) + connect.flags |= ASSOC_REQ_DISABLE_HE; + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) memcpy(&connect.vht_capa_mask, nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]), @@ -11088,7 +11308,7 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, &mask, - dev); + dev, true); if (err) return err; @@ -11165,6 +11385,7 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) case NL80211_IFTYPE_P2P_DEVICE: if (!info->attrs[NL80211_ATTR_WIPHY_FREQ]) return -EINVAL; + break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_P2P_CLIENT: @@ -11697,7 +11918,7 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, &setup.beacon_rate, - dev); + dev, false); if (err) return err; @@ -12634,7 +12855,7 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; struct nlattr *tb[NUM_NL80211_REKEY_DATA]; - struct cfg80211_gtk_rekey_data rekey_data; + struct cfg80211_gtk_rekey_data rekey_data = {}; int err; if (!info->attrs[NL80211_ATTR_REKEY_DATA]) @@ -13512,7 +13733,8 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev = - 
__cfg80211_wdev_from_attrs(genl_info_net(info), info->attrs); + __cfg80211_wdev_from_attrs(rdev, genl_info_net(info), + info->attrs); int i, err; u32 vid, subcmd; @@ -13636,7 +13858,7 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, goto out; } - *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk), attrbuf); + *wdev = __cfg80211_wdev_from_attrs(NULL, sock_net(skb->sk), attrbuf); if (IS_ERR(*wdev)) *wdev = NULL; @@ -14477,7 +14699,8 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, if (tid_conf->txrate_type != NL80211_TX_RATE_AUTOMATIC) { attr = NL80211_TID_CONFIG_ATTR_TX_RATE; err = nl80211_parse_tx_bitrate_mask(info, attrs, attr, - &tid_conf->txrate_mask, dev); + &tid_conf->txrate_mask, dev, + true); if (err) return err; @@ -14570,31 +14793,24 @@ bad_tid_conf: static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = NULL; struct wireless_dev *wdev; struct net_device *dev; - bool rtnl = ops->internal_flags & NL80211_FLAG_NEED_RTNL; - - if (rtnl) - rtnl_lock(); + rtnl_lock(); if (ops->internal_flags & NL80211_FLAG_NEED_WIPHY) { rdev = cfg80211_get_dev_from_info(genl_info_net(info), info); if (IS_ERR(rdev)) { - if (rtnl) - rtnl_unlock(); + rtnl_unlock(); return PTR_ERR(rdev); } info->user_ptr[0] = rdev; } else if (ops->internal_flags & NL80211_FLAG_NEED_NETDEV || ops->internal_flags & NL80211_FLAG_NEED_WDEV) { - ASSERT_RTNL(); - - wdev = __cfg80211_wdev_from_attrs(genl_info_net(info), + wdev = __cfg80211_wdev_from_attrs(NULL, genl_info_net(info), info->attrs); if (IS_ERR(wdev)) { - if (rtnl) - rtnl_unlock(); + rtnl_unlock(); return PTR_ERR(wdev); } @@ -14603,8 +14819,7 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, if (ops->internal_flags & NL80211_FLAG_NEED_NETDEV) { if (!dev) { - if (rtnl) - rtnl_unlock(); + rtnl_unlock(); return -EINVAL; } @@ -14615,8 +14830,7 @@ 
static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP && !wdev_running(wdev)) { - if (rtnl) - rtnl_unlock(); + rtnl_unlock(); return -ENETDOWN; } @@ -14626,6 +14840,14 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, info->user_ptr[0] = rdev; } + if (rdev) { + wiphy_lock(&rdev->wiphy); + /* we keep the mutex locked until post_doit */ + __release(&rdev->wiphy.mtx); + } + if (!(ops->internal_flags & NL80211_FLAG_NEED_RTNL)) + rtnl_unlock(); + return 0; } @@ -14643,6 +14865,14 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, } } + if (info->user_ptr[0]) { + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + + /* we kept the mutex locked since pre_doit */ + __acquire(&rdev->wiphy.mtx); + wiphy_unlock(&rdev->wiphy); + } + if (ops->internal_flags & NL80211_FLAG_NEED_RTNL) rtnl_unlock(); @@ -14658,6 +14888,111 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, } } +static int nl80211_set_sar_sub_specs(struct cfg80211_registered_device *rdev, + struct cfg80211_sar_specs *sar_specs, + struct nlattr *spec[], int index) +{ + u32 range_index, i; + + if (!sar_specs || !spec) + return -EINVAL; + + if (!spec[NL80211_SAR_ATTR_SPECS_POWER] || + !spec[NL80211_SAR_ATTR_SPECS_RANGE_INDEX]) + return -EINVAL; + + range_index = nla_get_u32(spec[NL80211_SAR_ATTR_SPECS_RANGE_INDEX]); + + /* check if range_index exceeds num_freq_ranges */ + if (range_index >= rdev->wiphy.sar_capa->num_freq_ranges) + return -EINVAL; + + /* check if range_index duplicates */ + for (i = 0; i < index; i++) { + if (sar_specs->sub_specs[i].freq_range_index == range_index) + return -EINVAL; + } + + sar_specs->sub_specs[index].power = + nla_get_s32(spec[NL80211_SAR_ATTR_SPECS_POWER]); + + sar_specs->sub_specs[index].freq_range_index = range_index; + + return 0; +} + +static int nl80211_set_sar_specs(struct sk_buff *skb, struct 
genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct nlattr *spec[NL80211_SAR_ATTR_SPECS_MAX + 1]; + struct nlattr *tb[NL80211_SAR_ATTR_MAX + 1]; + struct cfg80211_sar_specs *sar_spec; + enum nl80211_sar_type type; + struct nlattr *spec_list; + u32 specs; + int rem, err; + + if (!rdev->wiphy.sar_capa || !rdev->ops->set_sar_specs) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_SAR_SPEC]) + return -EINVAL; + + nla_parse_nested(tb, NL80211_SAR_ATTR_MAX, + info->attrs[NL80211_ATTR_SAR_SPEC], + NULL, NULL); + + if (!tb[NL80211_SAR_ATTR_TYPE] || !tb[NL80211_SAR_ATTR_SPECS]) + return -EINVAL; + + type = nla_get_u32(tb[NL80211_SAR_ATTR_TYPE]); + if (type != rdev->wiphy.sar_capa->type) + return -EINVAL; + + specs = 0; + nla_for_each_nested(spec_list, tb[NL80211_SAR_ATTR_SPECS], rem) + specs++; + + if (specs > rdev->wiphy.sar_capa->num_freq_ranges) + return -EINVAL; + + sar_spec = kzalloc(sizeof(*sar_spec) + + specs * sizeof(struct cfg80211_sar_sub_specs), + GFP_KERNEL); + if (!sar_spec) + return -ENOMEM; + + sar_spec->type = type; + specs = 0; + nla_for_each_nested(spec_list, tb[NL80211_SAR_ATTR_SPECS], rem) { + nla_parse_nested(spec, NL80211_SAR_ATTR_SPECS_MAX, + spec_list, NULL, NULL); + + switch (type) { + case NL80211_SAR_TYPE_POWER: + if (nl80211_set_sar_sub_specs(rdev, sar_spec, + spec, specs)) { + err = -EINVAL; + goto error; + } + break; + default: + err = -EINVAL; + goto error; + } + specs++; + } + + sar_spec->num_sub_specs = specs; + + rdev->cur_cmd_info = info; + err = rdev_set_sar_specs(rdev, sar_spec); + rdev->cur_cmd_info = NULL; +error: + kfree(sar_spec); + return err; +} + static const struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_GET_WIPHY, @@ -14666,8 +15001,7 @@ static const struct genl_ops nl80211_ops[] = { .dumpit = nl80211_dump_wiphy, .done = nl80211_dump_wiphy_done, /* can be retrieved by unprivileged users */ - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + 
.internal_flags = NL80211_FLAG_NEED_WIPHY, }, }; @@ -14677,7 +15011,6 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_wiphy, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_INTERFACE, @@ -14685,8 +15018,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_get_interface, .dumpit = nl80211_dump_interface, /* can be retrieved by unprivileged users */ - .internal_flags = NL80211_FLAG_NEED_WDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV, }, { .cmd = NL80211_CMD_SET_INTERFACE, @@ -14717,8 +15049,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_key, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_SET_KEY, @@ -14726,7 +15057,6 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_set_key, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | NL80211_FLAG_CLEAR_SKB, }, { @@ -14735,7 +15065,6 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_new_key, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | NL80211_FLAG_CLEAR_SKB, }, { @@ -14743,64 +15072,56 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_key, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_SET_BEACON, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = 
nl80211_set_beacon, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_START_AP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_start_ap, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_STOP_AP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_stop_ap, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_GET_STATION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_station, .dumpit = nl80211_dump_station, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_SET_STATION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_station, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_NEW_STATION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_new_station, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_DEL_STATION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_station, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_GET_MPATH, @@ -14808,8 +15129,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_get_mpath, .dumpit = 
nl80211_dump_mpath, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_GET_MPP, @@ -14817,47 +15137,42 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_get_mpp, .dumpit = nl80211_dump_mpp, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_SET_MPATH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_mpath, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_NEW_MPATH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_new_mpath, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_DEL_MPATH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_mpath, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_SET_BSS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_bss, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_GET_REG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_reg_do, .dumpit = nl80211_get_reg_dump, - .internal_flags = NL80211_FLAG_NEED_RTNL, + .internal_flags = 0, /* can be retrieved by unprivileged users */ }, #ifdef CONFIG_CFG80211_CRDA_SUPPORT @@ -14866,7 +15181,7 @@ static const struct genl_small_ops 
nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_reg, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_RTNL, + .internal_flags = 0, }, #endif { @@ -14886,32 +15201,28 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_mesh_config, /* can be retrieved by unprivileged users */ - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_SET_MESH_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_update_mesh_config, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_TRIGGER_SCAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_trigger_scan, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_ABORT_SCAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_abort_scan, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_GET_SCAN, @@ -14923,16 +15234,14 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_start_sched_scan, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_STOP_SCHED_SCAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_stop_sched_scan, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = 
NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_AUTHENTICATE, @@ -14940,7 +15249,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_authenticate, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | + 0 | NL80211_FLAG_CLEAR_SKB, }, { @@ -14949,7 +15258,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_associate, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | + 0 | NL80211_FLAG_CLEAR_SKB, }, { @@ -14957,32 +15266,28 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_deauthenticate, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_DISASSOCIATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_disassociate, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_JOIN_IBSS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_join_ibss, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_LEAVE_IBSS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_leave_ibss, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, #ifdef CONFIG_NL80211_TESTMODE { @@ -14991,8 +15296,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_testmode_do, .dumpit = 
nl80211_testmode_dump, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WIPHY, }, #endif { @@ -15001,7 +15305,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_connect, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | + 0 | NL80211_FLAG_CLEAR_SKB, }, { @@ -15010,7 +15314,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_update_connect_params, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | + 0 | NL80211_FLAG_CLEAR_SKB, }, { @@ -15018,16 +15322,14 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_disconnect, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_SET_WIPHY_NETNS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_wiphy_netns, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WIPHY, }, { .cmd = NL80211_CMD_GET_SURVEY, @@ -15040,7 +15342,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_setdel_pmksa, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | + 0 | NL80211_FLAG_CLEAR_SKB, }, { @@ -15048,136 +15350,112 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_setdel_pmksa, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_FLUSH_PMKSA, .validate = 
GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_flush_pmksa, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_REMAIN_ON_CHANNEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_remain_on_channel, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_cancel_remain_on_channel, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_SET_TX_BITRATE_MASK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_tx_bitrate_mask, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_REGISTER_FRAME, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_register_mgmt, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV, }, { .cmd = NL80211_CMD_FRAME, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tx_mgmt, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_FRAME_WAIT_CANCEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tx_mgmt_cancel_wait, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = 
NL80211_CMD_SET_POWER_SAVE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_power_save, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_GET_POWER_SAVE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_power_save, /* can be retrieved by unprivileged users */ - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_SET_CQM, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_cqm, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_SET_CHANNEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_channel, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, - }, - { - .cmd = NL80211_CMD_SET_WDS_PEER, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = nl80211_set_wds_peer, - .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_JOIN_MESH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_join_mesh, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_LEAVE_MESH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_leave_mesh, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_JOIN_OCB, .validate = 
GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_join_ocb, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_LEAVE_OCB, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_leave_ocb, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, #ifdef CONFIG_PM { @@ -15185,16 +15463,14 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_wowlan, /* can be retrieved by unprivileged users */ - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WIPHY, }, { .cmd = NL80211_CMD_SET_WOWLAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_wowlan, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WIPHY, }, #endif { @@ -15203,7 +15479,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_set_rekey_data, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | + 0 | NL80211_FLAG_CLEAR_SKB, }, { @@ -15211,48 +15487,42 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tdls_mgmt, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_TDLS_OPER, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tdls_oper, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + 
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_UNEXPECTED_FRAME, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_register_unexpected_frame, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_PROBE_CLIENT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_probe_client, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_REGISTER_BEACONS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_register_beacons, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WIPHY, }, { .cmd = NL80211_CMD_SET_NOACK_MAP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_noack_map, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_START_P2P_DEVICE, @@ -15291,48 +15561,42 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_nan_add_func, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_DEL_NAN_FUNCTION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_nan_del_func, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_CHANGE_NAN_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = 
nl80211_nan_change_config, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_SET_MCAST_RATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_mcast_rate, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_SET_MAC_ACL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_mac_acl, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_RADAR_DETECT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_start_radar_detection, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_GET_PROTOCOL_FEATURES, @@ -15344,47 +15608,41 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_update_ft_ies, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_CRIT_PROTOCOL_START, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_crit_protocol_start, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_CRIT_PROTOCOL_STOP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_crit_protocol_stop, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = 
NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_GET_COALESCE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_coalesce, - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WIPHY, }, { .cmd = NL80211_CMD_SET_COALESCE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_coalesce, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WIPHY, }, { .cmd = NL80211_CMD_CHANNEL_SWITCH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_channel_switch, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_VENDOR, @@ -15393,7 +15651,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .dumpit = nl80211_vendor_cmd_dump, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL | + 0 | NL80211_FLAG_CLEAR_SKB, }, { @@ -15401,122 +15659,115 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_qos_map, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_ADD_TX_TS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_add_tx_ts, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_DEL_TX_TS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_tx_ts, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + 
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_TDLS_CHANNEL_SWITCH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tdls_channel_switch, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_TDLS_CANCEL_CHANNEL_SWITCH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tdls_cancel_channel_switch, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_SET_MULTICAST_TO_UNICAST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_multicast_to_unicast, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_SET_PMK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_pmk, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | + 0 | NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_DEL_PMK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_pmk, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_EXTERNAL_AUTH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_external_auth, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_CONTROL_PORT_FRAME, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tx_control_port, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags 
= NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_GET_FTM_RESPONDER_STATS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_ftm_responder_stats, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_PEER_MEASUREMENT_START, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_pmsr_start, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_NOTIFY_RADAR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_notify_radar_detection, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_UPDATE_OWE_INFO, .doit = nl80211_update_owe_info, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_PROBE_MESH_LINK, .doit = nl80211_probe_mesh_link, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_SET_TID_CONFIG, .doit = nl80211_set_tid_config, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV, + }, + { + .cmd = NL80211_CMD_SET_SAR_SPECS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = nl80211_set_sar_specs, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, }; @@ -15854,7 +16105,7 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, const u8 *buf, size_t len, enum nl80211_commands cmd, gfp_t gfp, int uapsd_queues, const u8 *req_ies, 
- size_t req_ies_len) + size_t req_ies_len, bool reconnect) { struct sk_buff *msg; void *hdr; @@ -15876,6 +16127,9 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_REQ_IE, req_ies_len, req_ies))) goto nla_put_failure; + if (reconnect && nla_put_flag(msg, NL80211_ATTR_RECONNECT_REQUESTED)) + goto nla_put_failure; + if (uapsd_queues >= 0) { struct nlattr *nla_wmm = nla_nest_start_noflag(msg, NL80211_ATTR_STA_WME); @@ -15904,7 +16158,8 @@ void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev, size_t len, gfp_t gfp) { nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_AUTHENTICATE, gfp, -1, NULL, 0); + NL80211_CMD_AUTHENTICATE, gfp, -1, NULL, 0, + false); } void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, @@ -15914,23 +16169,25 @@ void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, { nl80211_send_mlme_event(rdev, netdev, buf, len, NL80211_CMD_ASSOCIATE, gfp, uapsd_queues, - req_ies, req_ies_len); + req_ies, req_ies_len, false); } void nl80211_send_deauth(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, - size_t len, gfp_t gfp) + size_t len, bool reconnect, gfp_t gfp) { nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_DEAUTHENTICATE, gfp, -1, NULL, 0); + NL80211_CMD_DEAUTHENTICATE, gfp, -1, NULL, 0, + reconnect); } void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, - size_t len, gfp_t gfp) + size_t len, bool reconnect, gfp_t gfp) { nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_DISASSOCIATE, gfp, -1, NULL, 0); + NL80211_CMD_DISASSOCIATE, gfp, -1, NULL, 0, + reconnect); } void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, @@ -15961,7 +16218,7 @@ void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, trace_cfg80211_rx_unprot_mlme_mgmt(dev, buf, len); nl80211_send_mlme_event(rdev, dev, buf, len, cmd, 
GFP_ATOMIC, -1, - NULL, 0); + NULL, 0, false); } EXPORT_SYMBOL(cfg80211_rx_unprot_mlme_mgmt); @@ -17062,7 +17319,7 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, struct cfg80211_chan_def *chandef, gfp_t gfp, enum nl80211_commands notif, - u8 count) + u8 count, bool quiet) { struct sk_buff *msg; void *hdr; @@ -17083,9 +17340,13 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, if (nl80211_send_chandef(msg, chandef)) goto nla_put_failure; - if ((notif == NL80211_CMD_CH_SWITCH_STARTED_NOTIFY) && - (nla_put_u32(msg, NL80211_ATTR_CH_SWITCH_COUNT, count))) + if (notif == NL80211_CMD_CH_SWITCH_STARTED_NOTIFY) { + if (nla_put_u32(msg, NL80211_ATTR_CH_SWITCH_COUNT, count)) goto nla_put_failure; + if (quiet && + nla_put_flag(msg, NL80211_ATTR_CH_SWITCH_BLOCK_TX)) + goto nla_put_failure; + } genlmsg_end(msg, hdr); @@ -17118,13 +17379,13 @@ void cfg80211_ch_switch_notify(struct net_device *dev, cfg80211_sched_dfs_chan_update(rdev); nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL, - NL80211_CMD_CH_SWITCH_NOTIFY, 0); + NL80211_CMD_CH_SWITCH_NOTIFY, 0, false); } EXPORT_SYMBOL(cfg80211_ch_switch_notify); void cfg80211_ch_switch_started_notify(struct net_device *dev, struct cfg80211_chan_def *chandef, - u8 count) + u8 count, bool quiet) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; @@ -17133,7 +17394,8 @@ void cfg80211_ch_switch_started_notify(struct net_device *dev, trace_cfg80211_ch_switch_started_notify(dev, chandef); nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL, - NL80211_CMD_CH_SWITCH_STARTED_NOTIFY, count); + NL80211_CMD_CH_SWITCH_STARTED_NOTIFY, + count, quiet); } EXPORT_SYMBOL(cfg80211_ch_switch_started_notify); diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index d3e8e426c486..a3f387770f1b 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions of this file - * 
Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018, 2020 Intel Corporation */ #ifndef __NET_WIRELESS_NL80211_H #define __NET_WIRELESS_NL80211_H @@ -69,10 +69,12 @@ void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, const u8 *req_ies, size_t req_ies_len); void nl80211_send_deauth(struct cfg80211_registered_device *rdev, struct net_device *netdev, - const u8 *buf, size_t len, gfp_t gfp); + const u8 *buf, size_t len, + bool reconnect, gfp_t gfp); void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, - const u8 *buf, size_t len, gfp_t gfp); + const u8 *buf, size_t len, + bool reconnect, gfp_t gfp); void nl80211_send_auth_timeout(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *addr, gfp_t gfp); diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 950d57494168..8b1358d04ca2 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -582,16 +582,6 @@ static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev, return ret; } -static inline int rdev_set_wds_peer(struct cfg80211_registered_device *rdev, - struct net_device *dev, const u8 *addr) -{ - int ret; - trace_rdev_set_wds_peer(&rdev->wiphy, dev, addr); - ret = rdev->ops->set_wds_peer(&rdev->wiphy, dev, addr); - trace_rdev_return_int(&rdev->wiphy, ret); - return ret; -} - static inline int rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev, struct net_device *dev, @@ -1356,4 +1346,16 @@ static inline int rdev_reset_tid_config(struct cfg80211_registered_device *rdev, return ret; } +static inline int rdev_set_sar_specs(struct cfg80211_registered_device *rdev, + struct cfg80211_sar_specs *sar) +{ + int ret; + + trace_rdev_set_sar_specs(&rdev->wiphy, sar); + ret = rdev->ops->set_sar_specs(&rdev->wiphy, sar); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 
a04fdfb35f07..21536c48deec 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -5,7 +5,7 @@ * Copyright 2008-2011 Luis R. Rodriguez <mcgrof@qca.qualcomm.com> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2021 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -139,10 +139,18 @@ static const struct ieee80211_regdomain *get_cfg80211_regdom(void) return rcu_dereference_rtnl(cfg80211_regdomain); } +/* + * Returns the regulatory domain associated with the wiphy. + * + * Requires any of RTNL, wiphy mutex or RCU protection. + */ const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy) { - return rcu_dereference_rtnl(wiphy->regd); + return rcu_dereference_check(wiphy->regd, + lockdep_is_held(&wiphy->mtx) || + lockdep_rtnl_is_held()); } +EXPORT_SYMBOL(get_wiphy_regdom); static const char *reg_dfs_region_str(enum nl80211_dfs_regions dfs_region) { @@ -164,7 +172,9 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy) const struct ieee80211_regdomain *regd = NULL; const struct ieee80211_regdomain *wiphy_regd = NULL; + rcu_read_lock(); regd = get_cfg80211_regdom(); + if (!wiphy) goto out; @@ -181,6 +191,8 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy) reg_dfs_region_str(regd->dfs_region)); out: + rcu_read_unlock(); + return regd->dfs_region; } @@ -1616,8 +1628,8 @@ static const struct ieee80211_reg_rule * __freq_reg_info(struct wiphy *wiphy, u32 center_freq, u32 min_bw) { const struct ieee80211_regdomain *regd = reg_get_regdomain(wiphy); - const u32 bws[] = {0, 1, 2, 4, 5, 8, 10, 16, 20}; - const struct ieee80211_reg_rule *reg_rule; + static const u32 bws[] = {0, 1, 2, 4, 5, 8, 10, 16, 20}; + const struct ieee80211_reg_rule *reg_rule = ERR_PTR(-ERANGE); int i = ARRAY_SIZE(bws) - 1; 
u32 bw; @@ -2547,6 +2559,7 @@ static void handle_band_custom(struct wiphy *wiphy, void wiphy_apply_custom_regulatory(struct wiphy *wiphy, const struct ieee80211_regdomain *regd) { + const struct ieee80211_regdomain *new_regd, *tmp; enum nl80211_band band; unsigned int bands_set = 0; @@ -2566,6 +2579,19 @@ void wiphy_apply_custom_regulatory(struct wiphy *wiphy, * on your device's supported bands. */ WARN_ON(!bands_set); + new_regd = reg_copy_regd(regd); + if (IS_ERR(new_regd)) + return; + + rtnl_lock(); + wiphy_lock(wiphy); + + tmp = get_wiphy_regdom(wiphy); + rcu_assign_pointer(wiphy->regd, new_regd); + rcu_free_regdom(tmp); + + wiphy_unlock(wiphy); + rtnl_unlock(); } EXPORT_SYMBOL(wiphy_apply_custom_regulatory); @@ -2727,7 +2753,10 @@ reg_process_hint_driver(struct wiphy *wiphy, return REG_REQ_IGNORE; tmp = get_wiphy_regdom(wiphy); + ASSERT_RTNL(); + wiphy_lock(wiphy); rcu_assign_pointer(wiphy->regd, regd); + wiphy_unlock(wiphy); rcu_free_regdom(tmp); } @@ -3059,41 +3088,52 @@ static void reg_process_pending_beacon_hints(void) spin_unlock_bh(&reg_pending_beacons_lock); } -static void reg_process_self_managed_hints(void) +static void reg_process_self_managed_hint(struct wiphy *wiphy) { - struct cfg80211_registered_device *rdev; - struct wiphy *wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); const struct ieee80211_regdomain *tmp; const struct ieee80211_regdomain *regd; enum nl80211_band band; struct regulatory_request request = {}; - list_for_each_entry(rdev, &cfg80211_rdev_list, list) { - wiphy = &rdev->wiphy; + ASSERT_RTNL(); + lockdep_assert_wiphy(wiphy); - spin_lock(&reg_requests_lock); - regd = rdev->requested_regd; - rdev->requested_regd = NULL; - spin_unlock(&reg_requests_lock); + spin_lock(&reg_requests_lock); + regd = rdev->requested_regd; + rdev->requested_regd = NULL; + spin_unlock(&reg_requests_lock); - if (regd == NULL) - continue; + if (!regd) + return; - tmp = get_wiphy_regdom(wiphy); - rcu_assign_pointer(wiphy->regd, regd); -
rcu_free_regdom(tmp); + tmp = get_wiphy_regdom(wiphy); + rcu_assign_pointer(wiphy->regd, regd); + rcu_free_regdom(tmp); + + for (band = 0; band < NUM_NL80211_BANDS; band++) + handle_band_custom(wiphy, wiphy->bands[band], regd); + + reg_process_ht_flags(wiphy); - for (band = 0; band < NUM_NL80211_BANDS; band++) - handle_band_custom(wiphy, wiphy->bands[band], regd); + request.wiphy_idx = get_wiphy_idx(wiphy); + request.alpha2[0] = regd->alpha2[0]; + request.alpha2[1] = regd->alpha2[1]; + request.initiator = NL80211_REGDOM_SET_BY_DRIVER; - reg_process_ht_flags(wiphy); + nl80211_send_wiphy_reg_change_event(&request); +} + +static void reg_process_self_managed_hints(void) +{ + struct cfg80211_registered_device *rdev; - request.wiphy_idx = get_wiphy_idx(wiphy); - request.alpha2[0] = regd->alpha2[0]; - request.alpha2[1] = regd->alpha2[1]; - request.initiator = NL80211_REGDOM_SET_BY_DRIVER; + ASSERT_RTNL(); - nl80211_send_wiphy_reg_change_event(&request); + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { + wiphy_lock(&rdev->wiphy); + reg_process_self_managed_hint(&rdev->wiphy); + wiphy_unlock(&rdev->wiphy); } reg_check_channels(); @@ -3772,14 +3812,21 @@ static int reg_set_rd_driver(const struct ieee80211_regdomain *rd, return -ENODEV; if (!driver_request->intersect) { - if (request_wiphy->regd) + ASSERT_RTNL(); + wiphy_lock(request_wiphy); + if (request_wiphy->regd) { + wiphy_unlock(request_wiphy); return -EALREADY; + } regd = reg_copy_regd(rd); - if (IS_ERR(regd)) + if (IS_ERR(regd)) { + wiphy_unlock(request_wiphy); return PTR_ERR(regd); + } rcu_assign_pointer(request_wiphy->regd, regd); + wiphy_unlock(request_wiphy); reset_regdomains(false, rd); return 0; } @@ -3961,8 +4008,8 @@ int regulatory_set_wiphy_regd(struct wiphy *wiphy, } EXPORT_SYMBOL(regulatory_set_wiphy_regd); -int regulatory_set_wiphy_regd_sync_rtnl(struct wiphy *wiphy, - struct ieee80211_regdomain *rd) +int regulatory_set_wiphy_regd_sync(struct wiphy *wiphy, + struct ieee80211_regdomain *rd) { int 
ret; @@ -3973,10 +4020,11 @@ int regulatory_set_wiphy_regd_sync_rtnl(struct wiphy *wiphy, return ret; /* process the request immediately */ - reg_process_self_managed_hints(); + reg_process_self_managed_hint(wiphy); + reg_check_channels(); return 0; } -EXPORT_SYMBOL(regulatory_set_wiphy_regd_sync_rtnl); +EXPORT_SYMBOL(regulatory_set_wiphy_regd_sync); void wiphy_regulatory_register(struct wiphy *wiphy) { diff --git a/net/wireless/reg.h b/net/wireless/reg.h index f9e83031a40a..f3707f729024 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -63,7 +63,6 @@ unsigned int reg_get_max_bandwidth(const struct ieee80211_regdomain *rd, const struct ieee80211_reg_rule *rule); bool reg_last_request_cell_base(void); -const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy); /** * regulatory_hint_found_beacon - hints a beacon was found on a channel diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 8d0e49c46db3..019952d4fc7d 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -694,7 +694,7 @@ static void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request, static bool cfg80211_find_ssid_match(struct cfg80211_colocated_ap *ap, struct cfg80211_scan_request *request) { - u8 i; + int i; u32 s_ssid; for (i = 0; i < request->n_ssids; i++) { @@ -726,7 +726,7 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) int n_channels, count = 0, err; struct cfg80211_scan_request *request, *rdev_req = rdev->scan_req; LIST_HEAD(coloc_ap_list); - bool need_scan_psc; + bool need_scan_psc = true; const struct ieee80211_sband_iftype_data *iftd; rdev_req->scan_6ghz = true; @@ -770,20 +770,18 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) (void *)&request->channels[n_channels]; /* - * PSC channels should not be scanned if all the reported co-located APs - * are indicating that all APs in the same ESS are co-located + * PSC channels should not be scanned in case of direct scan with 1 SSID + * and at 
least one of the reported co-located APs with same SSID + * indicating that all APs in the same ESS are co-located */ - if (count) { - need_scan_psc = false; - + if (count && request->n_ssids == 1 && request->ssids[0].ssid_len) { list_for_each_entry(ap, &coloc_ap_list, list) { - if (!ap->colocated_ess) { - need_scan_psc = true; + if (ap->colocated_ess && + cfg80211_find_ssid_match(ap, request)) { + need_scan_psc = false; break; } } - } else { - need_scan_psc = true; } /* @@ -920,7 +918,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, union iwreq_data wrqu; #endif - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); if (rdev->scan_msg) { nl80211_send_scan_msg(rdev, rdev->scan_msg); @@ -989,9 +987,9 @@ void __cfg80211_scan_done(struct work_struct *wk) rdev = container_of(wk, struct cfg80211_registered_device, scan_done_wk); - rtnl_lock(); + wiphy_lock(&rdev->wiphy); ___cfg80211_scan_done(rdev, true); - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); } void cfg80211_scan_done(struct cfg80211_scan_request *request, @@ -1024,7 +1022,7 @@ EXPORT_SYMBOL(cfg80211_scan_done); void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req) { - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); list_add_rcu(&req->list, &rdev->sched_scan_req_list); } @@ -1032,7 +1030,7 @@ void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev, static void cfg80211_del_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req) { - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); list_del_rcu(&req->list); kfree_rcu(req, rcu_head); @@ -1044,7 +1042,7 @@ cfg80211_find_sched_scan_req(struct cfg80211_registered_device *rdev, u64 reqid) struct cfg80211_sched_scan_request *pos; list_for_each_entry_rcu(pos, &rdev->sched_scan_req_list, list, - lockdep_rtnl_is_held()) { + lockdep_is_held(&rdev->wiphy.mtx)) { if (pos->reqid == reqid) return pos; } @@ -1092,7 
+1090,7 @@ void cfg80211_sched_scan_results_wk(struct work_struct *work) rdev = container_of(work, struct cfg80211_registered_device, sched_scan_res_wk); - rtnl_lock(); + wiphy_lock(&rdev->wiphy); list_for_each_entry_safe(req, tmp, &rdev->sched_scan_req_list, list) { if (req->report_results) { req->report_results = false; @@ -1107,7 +1105,7 @@ void cfg80211_sched_scan_results_wk(struct work_struct *work) NL80211_CMD_SCHED_SCAN_RESULTS); } } - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); } void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid) @@ -1128,23 +1126,23 @@ void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid) } EXPORT_SYMBOL(cfg80211_sched_scan_results); -void cfg80211_sched_scan_stopped_rtnl(struct wiphy *wiphy, u64 reqid) +void cfg80211_sched_scan_stopped_locked(struct wiphy *wiphy, u64 reqid) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - ASSERT_RTNL(); + lockdep_assert_held(&wiphy->mtx); trace_cfg80211_sched_scan_stopped(wiphy, reqid); __cfg80211_stop_sched_scan(rdev, reqid, true); } -EXPORT_SYMBOL(cfg80211_sched_scan_stopped_rtnl); +EXPORT_SYMBOL(cfg80211_sched_scan_stopped_locked); void cfg80211_sched_scan_stopped(struct wiphy *wiphy, u64 reqid) { - rtnl_lock(); - cfg80211_sched_scan_stopped_rtnl(wiphy, reqid); - rtnl_unlock(); + wiphy_lock(wiphy); + cfg80211_sched_scan_stopped_locked(wiphy, reqid); + wiphy_unlock(wiphy); } EXPORT_SYMBOL(cfg80211_sched_scan_stopped); @@ -1152,7 +1150,7 @@ int cfg80211_stop_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req, bool driver_initiated) { - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); if (!driver_initiated) { int err = rdev_sched_scan_stop(rdev, req->dev, req->reqid); @@ -1172,7 +1170,7 @@ int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, { struct cfg80211_sched_scan_request *sched_scan_req; - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); sched_scan_req = 
cfg80211_find_sched_scan_req(rdev, reqid); if (!sched_scan_req) @@ -1901,6 +1899,9 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, tmp.pub.beacon_interval = beacon_interval; tmp.pub.capability = capability; tmp.ts_boottime = data->boottime_ns; + tmp.parent_tsf = data->parent_tsf; + ether_addr_copy(tmp.parent_bssid, data->parent_bssid); + if (non_tx_data) { tmp.pub.transmitted_bss = non_tx_data->tx_bss; ts = bss_from_pub(non_tx_data->tx_bss)->ts; @@ -2773,6 +2774,8 @@ int cfg80211_wext_siwscan(struct net_device *dev, eth_broadcast_addr(creq->bssid); + wiphy_lock(&rdev->wiphy); + rdev->scan_req = creq; err = rdev_scan(rdev, creq); if (err) { @@ -2784,6 +2787,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, creq = NULL; dev_hold(dev); } + wiphy_unlock(&rdev->wiphy); out: kfree(creq); return err; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 38df713f2e2e..07756ca5e3b5 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -67,7 +67,6 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) struct cfg80211_scan_request *request; int n_channels, err; - ASSERT_RTNL(); ASSERT_WDEV_LOCK(wdev); if (rdev->scan_req || rdev->scan_msg) @@ -233,7 +232,7 @@ void cfg80211_conn_work(struct work_struct *work) u8 bssid_buf[ETH_ALEN], *bssid = NULL; enum nl80211_timeout_reason treason; - rtnl_lock(); + wiphy_lock(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) @@ -266,7 +265,7 @@ void cfg80211_conn_work(struct work_struct *work) wdev_unlock(wdev); } - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); } /* Returned bss is reference counted and must be cleaned up appropriately. 
*/ diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 3ac1f48195d2..9b959e3b09c6 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -5,6 +5,7 @@ * * Copyright 2005-2006 Jiri Benc <jbenc@suse.cz> * Copyright 2006 Johannes Berg <johannes@sipsolutions.net> + * Copyright (C) 2020-2021 Intel Corporation */ #include <linux/device.h> @@ -81,12 +82,6 @@ static void wiphy_dev_release(struct device *dev) cfg80211_dev_free(rdev); } -static int wiphy_uevent(struct device *dev, struct kobj_uevent_env *env) -{ - /* TODO, we probably need stuff here */ - return 0; -} - #ifdef CONFIG_PM_SLEEP static void cfg80211_leave_all(struct cfg80211_registered_device *rdev) { @@ -104,6 +99,7 @@ static int wiphy_suspend(struct device *dev) rdev->suspend_at = ktime_get_boottime_seconds(); rtnl_lock(); + wiphy_lock(&rdev->wiphy); if (rdev->wiphy.registered) { if (!rdev->wiphy.wowlan_config) { cfg80211_leave_all(rdev); @@ -118,6 +114,7 @@ static int wiphy_suspend(struct device *dev) ret = rdev_suspend(rdev, NULL); } } + wiphy_unlock(&rdev->wiphy); rtnl_unlock(); return ret; @@ -132,8 +129,10 @@ static int wiphy_resume(struct device *dev) cfg80211_bss_age(rdev, ktime_get_boottime_seconds() - rdev->suspend_at); rtnl_lock(); + wiphy_lock(&rdev->wiphy); if (rdev->wiphy.registered && rdev->ops->resume) ret = rdev_resume(rdev); + wiphy_unlock(&rdev->wiphy); rtnl_unlock(); return ret; @@ -157,7 +156,6 @@ struct class ieee80211_class = { .owner = THIS_MODULE, .dev_release = wiphy_dev_release, .dev_groups = ieee80211_groups, - .dev_uevent = wiphy_uevent, .pm = WIPHY_PM_OPS, .ns_type = &net_ns_type_operations, .namespace = wiphy_namespace, diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 6e218a0acd4e..76b777d5903f 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -838,11 +838,6 @@ DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_del_mpath, TP_ARGS(wiphy, netdev, mac) ); -DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_set_wds_peer, - TP_PROTO(struct wiphy *wiphy, struct 
net_device *netdev, const u8 *mac), - TP_ARGS(wiphy, netdev, mac) -); - TRACE_EVENT(rdev_dump_station, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx, u8 *mac), @@ -2684,19 +2679,23 @@ DEFINE_EVENT(netdev_frame_event, cfg80211_rx_mlme_mgmt, ); TRACE_EVENT(cfg80211_tx_mlme_mgmt, - TP_PROTO(struct net_device *netdev, const u8 *buf, int len), - TP_ARGS(netdev, buf, len), + TP_PROTO(struct net_device *netdev, const u8 *buf, int len, + bool reconnect), + TP_ARGS(netdev, buf, len, reconnect), TP_STRUCT__entry( NETDEV_ENTRY __dynamic_array(u8, frame, len) + __field(int, reconnect) ), TP_fast_assign( NETDEV_ASSIGN; memcpy(__get_dynamic_array(frame), buf, len); + __entry->reconnect = reconnect; ), - TP_printk(NETDEV_PR_FMT ", ftype:0x%.2x", + TP_printk(NETDEV_PR_FMT ", ftype:0x%.2x reconnect:%d", NETDEV_PR_ARG, - le16_to_cpup((__le16 *)__get_dynamic_array(frame))) + le16_to_cpup((__le16 *)__get_dynamic_array(frame)), + __entry->reconnect) ); DECLARE_EVENT_CLASS(netdev_mac_evt, @@ -3547,6 +3546,25 @@ TRACE_EVENT(rdev_reset_tid_config, TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT ", tids: 0x%x", WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tids) ); + +TRACE_EVENT(rdev_set_sar_specs, + TP_PROTO(struct wiphy *wiphy, struct cfg80211_sar_specs *sar), + TP_ARGS(wiphy, sar), + TP_STRUCT__entry( + WIPHY_ENTRY + __field(u16, type) + __field(u16, num) + ), + TP_fast_assign( + WIPHY_ASSIGN; + __entry->type = sar->type; + __entry->num = sar->num_sub_specs; + + ), + TP_printk(WIPHY_PR_FMT ", Set type:%d, num_specs:%d", + WIPHY_PR_ARG, __entry->type, __entry->num) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/wireless/util.c b/net/wireless/util.c index f01746894a4e..1bf0200f562a 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -272,18 +272,53 @@ bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher) return false; } -int 
cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, - struct key_params *params, int key_idx, - bool pairwise, const u8 *mac_addr) +static bool +cfg80211_igtk_cipher_supported(struct cfg80211_registered_device *rdev) { - int max_key_idx = 5; + struct wiphy *wiphy = &rdev->wiphy; + int i; - if (wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_BEACON_PROTECTION) || - wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) + for (i = 0; i < wiphy->n_cipher_suites; i++) { + switch (wiphy->cipher_suites[i]) { + case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + case WLAN_CIPHER_SUITE_BIP_GMAC_128: + case WLAN_CIPHER_SUITE_BIP_GMAC_256: + return true; + } + } + + return false; +} + +bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev, + int key_idx, bool pairwise) +{ + int max_key_idx; + + if (pairwise) + max_key_idx = 3; + else if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION) || + wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; + else if (cfg80211_igtk_cipher_supported(rdev)) + max_key_idx = 5; + else + max_key_idx = 3; + if (key_idx < 0 || key_idx > max_key_idx) + return false; + + return true; +} + +int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, + struct key_params *params, int key_idx, + bool pairwise, const u8 *mac_addr) +{ + if (!cfg80211_valid_key_idx(rdev, key_idx, pairwise)) return -EINVAL; if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) @@ -335,6 +370,7 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, case WLAN_CIPHER_SUITE_WEP104: if (key_idx > 3) return -EINVAL; + break; default: break; } @@ -550,8 +586,7 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, return -1; break; case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS): - if (unlikely(iftype != 
NL80211_IFTYPE_WDS && - iftype != NL80211_IFTYPE_MESH_POINT && + if (unlikely(iftype != NL80211_IFTYPE_MESH_POINT && iftype != NL80211_IFTYPE_AP_VLAN && iftype != NL80211_IFTYPE_STATION)) return -1; @@ -962,7 +997,7 @@ void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_process_wdev_events(wdev); @@ -975,7 +1010,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, int err; enum nl80211_iftype otype = dev->ieee80211_ptr->iftype; - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); /* don't support changing VLANs, you just re-create them */ if (otype == NL80211_IFTYPE_AP_VLAN) @@ -1051,7 +1086,6 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MESH_POINT: /* bridging OK */ break; @@ -1063,6 +1097,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, /* not happening */ break; case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_NAN: WARN_ON(1); break; @@ -1153,6 +1188,25 @@ static u32 cfg80211_calculate_bitrate_dmg(struct rate_info *rate) return __mcs2bitrate[rate->mcs]; } +static u32 cfg80211_calculate_bitrate_extended_sc_dmg(struct rate_info *rate) +{ + static const u32 __mcs2bitrate[] = { + [6 - 6] = 26950, /* MCS 9.1 : 2695.0 mbps */ + [7 - 6] = 50050, /* MCS 12.1 */ + [8 - 6] = 53900, + [9 - 6] = 57750, + [10 - 6] = 63900, + [11 - 6] = 75075, + [12 - 6] = 80850, + }; + + /* Extended SC MCS not defined for base MCS below 6 or above 12 */ + if (WARN_ON_ONCE(rate->mcs < 6 || rate->mcs > 12)) + return 0; + + return __mcs2bitrate[rate->mcs - 6]; +} + static u32 cfg80211_calculate_bitrate_edmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { @@ -1189,7 +1243,7 @@ static 
u32 cfg80211_calculate_bitrate_edmg(struct rate_info *rate) static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) { - static const u32 base[4][10] = { + static const u32 base[4][12] = { { 6500000, 13000000, 19500000, @@ -1200,7 +1254,9 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) 65000000, 78000000, /* not in the spec, but some devices use this: */ - 86500000, + 86700000, + 97500000, + 108300000, }, { 13500000, 27000000, @@ -1212,6 +1268,8 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) 135000000, 162000000, 180000000, + 202500000, + 225000000, }, { 29300000, 58500000, @@ -1223,6 +1281,8 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) 292500000, 351000000, 390000000, + 438800000, + 487500000, }, { 58500000, 117000000, @@ -1234,12 +1294,14 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) 585000000, 702000000, 780000000, + 877500000, + 975000000, }, }; u32 bitrate; int idx; - if (rate->mcs > 9) + if (rate->mcs > 11) goto warn; switch (rate->bw) { @@ -1276,20 +1338,22 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) { -#define SCALE 2048 - u16 mcs_divisors[12] = { - 34133, /* 16.666666... */ - 17067, /* 8.333333... */ - 11378, /* 5.555555... */ - 8533, /* 4.166666... */ - 5689, /* 2.777777... */ - 4267, /* 2.083333... */ - 3923, /* 1.851851... */ - 3413, /* 1.666666... */ - 2844, /* 1.388888... */ - 2560, /* 1.250000... */ - 2276, /* 1.111111... */ - 2048, /* 1.000000... */ +#define SCALE 6144 + u32 mcs_divisors[14] = { + 102399, /* 16.666666... */ + 51201, /* 8.333333... */ + 34134, /* 5.555555... */ + 25599, /* 4.166666... */ + 17067, /* 2.777777... */ + 12801, /* 2.083333... */ + 11769, /* 1.851851... */ + 10239, /* 1.666666... */ + 8532, /* 1.388888... */ + 7680, /* 1.250000... */ + 6828, /* 1.111111... */ + 6144, /* 1.000000... */ + 5690, /* 0.926106... */ + 5120, /* 0.833333... 
*/ }; u32 rates_160M[3] = { 960777777, 907400000, 816666666 }; u32 rates_969[3] = { 480388888, 453700000, 408333333 }; @@ -1301,7 +1365,7 @@ static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) u64 tmp; u32 result; - if (WARN_ON_ONCE(rate->mcs > 11)) + if (WARN_ON_ONCE(rate->mcs > 13)) return 0; if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2)) @@ -1361,6 +1425,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate) return cfg80211_calculate_bitrate_ht(rate); if (rate->flags & RATE_INFO_FLAGS_DMG) return cfg80211_calculate_bitrate_dmg(rate); + if (rate->flags & RATE_INFO_FLAGS_EXTENDED_SC_DMG) + return cfg80211_calculate_bitrate_extended_sc_dmg(rate); if (rate->flags & RATE_INFO_FLAGS_EDMG) return cfg80211_calculate_bitrate_edmg(rate); if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 78f2927ead7f..a8320dc59af7 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -7,7 +7,7 @@ * we directly assign the wireless handlers of wireless interfaces. 
* * Copyright 2008-2009 Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2019 Intel Corporation + * Copyright (C) 2019-2021 Intel Corporation */ #include <linux/export.h> @@ -39,6 +39,7 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, struct cfg80211_registered_device *rdev; struct vif_params vifparams; enum nl80211_iftype type; + int ret; rdev = wiphy_to_rdev(wdev->wiphy); @@ -49,9 +50,6 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, case IW_MODE_ADHOC: type = NL80211_IFTYPE_ADHOC; break; - case IW_MODE_REPEAT: - type = NL80211_IFTYPE_WDS; - break; case IW_MODE_MONITOR: type = NL80211_IFTYPE_MONITOR; break; @@ -64,7 +62,11 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, memset(&vifparams, 0, sizeof(vifparams)); - return cfg80211_change_iface(rdev, dev, type, &vifparams); + wiphy_lock(wdev->wiphy); + ret = cfg80211_change_iface(rdev, dev, type, &vifparams); + wiphy_unlock(wdev->wiphy); + + return ret; } EXPORT_WEXT_HANDLER(cfg80211_wext_siwmode); @@ -256,17 +258,23 @@ int cfg80211_wext_siwrts(struct net_device *dev, u32 orts = wdev->wiphy->rts_threshold; int err; - if (rts->disabled || !rts->fixed) + wiphy_lock(&rdev->wiphy); + if (rts->disabled || !rts->fixed) { wdev->wiphy->rts_threshold = (u32) -1; - else if (rts->value < 0) - return -EINVAL; - else + } else if (rts->value < 0) { + err = -EINVAL; + goto out; + } else { wdev->wiphy->rts_threshold = rts->value; + } err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_RTS_THRESHOLD); + if (err) wdev->wiphy->rts_threshold = orts; +out: + wiphy_unlock(&rdev->wiphy); return err; } EXPORT_WEXT_HANDLER(cfg80211_wext_siwrts); @@ -294,11 +302,13 @@ int cfg80211_wext_siwfrag(struct net_device *dev, u32 ofrag = wdev->wiphy->frag_threshold; int err; - if (frag->disabled || !frag->fixed) + wiphy_lock(&rdev->wiphy); + if (frag->disabled || !frag->fixed) { wdev->wiphy->frag_threshold = (u32) -1; - else if (frag->value < 
256) - return -EINVAL; - else { + } else if (frag->value < 256) { + err = -EINVAL; + goto out; + } else { /* Fragment length must be even, so strip LSB. */ wdev->wiphy->frag_threshold = frag->value & ~0x1; } @@ -306,6 +316,8 @@ int cfg80211_wext_siwfrag(struct net_device *dev, err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_FRAG_THRESHOLD); if (err) wdev->wiphy->frag_threshold = ofrag; +out: + wiphy_unlock(&rdev->wiphy); return err; } @@ -340,6 +352,7 @@ static int cfg80211_wext_siwretry(struct net_device *dev, (retry->flags & IW_RETRY_TYPE) != IW_RETRY_LIMIT) return -EINVAL; + wiphy_lock(&rdev->wiphy); if (retry->flags & IW_RETRY_LONG) { wdev->wiphy->retry_long = retry->value; changed |= WIPHY_PARAM_RETRY_LONG; @@ -358,6 +371,7 @@ static int cfg80211_wext_siwretry(struct net_device *dev, wdev->wiphy->retry_short = oshort; wdev->wiphy->retry_long = olong; } + wiphy_unlock(&rdev->wiphy); return err; } @@ -580,15 +594,18 @@ static int cfg80211_wext_siwencode(struct net_device *dev, !rdev->ops->set_default_key) return -EOPNOTSUPP; + wiphy_lock(&rdev->wiphy); idx = erq->flags & IW_ENCODE_INDEX; if (idx == 0) { idx = wdev->wext.default_key; if (idx < 0) idx = 0; - } else if (idx < 1 || idx > 4) - return -EINVAL; - else + } else if (idx < 1 || idx > 4) { + err = -EINVAL; + goto out; + } else { idx--; + } if (erq->flags & IW_ENCODE_DISABLED) remove = true; @@ -602,22 +619,28 @@ static int cfg80211_wext_siwencode(struct net_device *dev, if (!err) wdev->wext.default_key = idx; wdev_unlock(wdev); - return err; + goto out; } memset(&params, 0, sizeof(params)); params.key = keybuf; params.key_len = erq->length; - if (erq->length == 5) + if (erq->length == 5) { params.cipher = WLAN_CIPHER_SUITE_WEP40; - else if (erq->length == 13) + } else if (erq->length == 13) { params.cipher = WLAN_CIPHER_SUITE_WEP104; - else if (!remove) - return -EINVAL; + } else if (!remove) { + err = -EINVAL; + goto out; + } + + err = cfg80211_set_encryption(rdev, dev, false, NULL, remove, + 
wdev->wext.default_key == -1, + idx, &params); +out: + wiphy_unlock(&rdev->wiphy); - return cfg80211_set_encryption(rdev, dev, false, NULL, remove, - wdev->wext.default_key == -1, - idx, &params); + return err; } static int cfg80211_wext_siwencodeext(struct net_device *dev, @@ -632,6 +655,7 @@ static int cfg80211_wext_siwencodeext(struct net_device *dev, bool remove = false; struct key_params params; u32 cipher; + int ret; if (wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_ADHOC) @@ -703,12 +727,16 @@ static int cfg80211_wext_siwencodeext(struct net_device *dev, params.seq_len = 6; } - return cfg80211_set_encryption( + wiphy_lock(wdev->wiphy); + ret = cfg80211_set_encryption( rdev, dev, !(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY), addr, remove, ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY, idx, &params); + wiphy_unlock(wdev->wiphy); + + return ret; } static int cfg80211_wext_giwencode(struct net_device *dev, @@ -757,38 +785,61 @@ static int cfg80211_wext_siwfreq(struct net_device *dev, struct cfg80211_chan_def chandef = { .width = NL80211_CHAN_WIDTH_20_NOHT, }; - int freq; + int freq, ret; + + wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_STATION: - return cfg80211_mgd_wext_siwfreq(dev, info, wextfreq, extra); + ret = cfg80211_mgd_wext_siwfreq(dev, info, wextfreq, extra); + break; case NL80211_IFTYPE_ADHOC: - return cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra); + ret = cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra); + break; case NL80211_IFTYPE_MONITOR: freq = cfg80211_wext_freq(wextfreq); - if (freq < 0) - return freq; - if (freq == 0) - return -EINVAL; + if (freq < 0) { + ret = freq; + break; + } + if (freq == 0) { + ret = -EINVAL; + break; + } chandef.center_freq1 = freq; chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); - if (!chandef.chan) - return -EINVAL; - return cfg80211_set_monitor_channel(rdev, &chandef); + if (!chandef.chan) { + ret = -EINVAL; + break; + } + ret = 
cfg80211_set_monitor_channel(rdev, &chandef); + break; case NL80211_IFTYPE_MESH_POINT: freq = cfg80211_wext_freq(wextfreq); - if (freq < 0) - return freq; - if (freq == 0) - return -EINVAL; + if (freq < 0) { + ret = freq; + break; + } + if (freq == 0) { + ret = -EINVAL; + break; + } chandef.center_freq1 = freq; chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); - if (!chandef.chan) - return -EINVAL; - return cfg80211_set_mesh_channel(rdev, wdev, &chandef); + if (!chandef.chan) { + ret = -EINVAL; + break; + } + ret = cfg80211_set_mesh_channel(rdev, wdev, &chandef); + break; default: - return -EOPNOTSUPP; + ret = -EOPNOTSUPP; + break; } + + wiphy_unlock(&rdev->wiphy); + + return ret; } static int cfg80211_wext_giwfreq(struct net_device *dev, @@ -800,24 +851,35 @@ static int cfg80211_wext_giwfreq(struct net_device *dev, struct cfg80211_chan_def chandef = {}; int ret; + wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_STATION: - return cfg80211_mgd_wext_giwfreq(dev, info, freq, extra); + ret = cfg80211_mgd_wext_giwfreq(dev, info, freq, extra); + break; case NL80211_IFTYPE_ADHOC: - return cfg80211_ibss_wext_giwfreq(dev, info, freq, extra); + ret = cfg80211_ibss_wext_giwfreq(dev, info, freq, extra); + break; case NL80211_IFTYPE_MONITOR: - if (!rdev->ops->get_channel) - return -EINVAL; + if (!rdev->ops->get_channel) { + ret = -EINVAL; + break; + } ret = rdev_get_channel(rdev, wdev, &chandef); if (ret) - return ret; + break; freq->m = chandef.chan->center_freq; freq->e = 6; - return 0; + ret = 0; + break; default: - return -EINVAL; + ret = -EINVAL; + break; } + + wiphy_unlock(&rdev->wiphy); + + return ret; } static int cfg80211_wext_siwtxpower(struct net_device *dev, @@ -828,6 +890,7 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); enum nl80211_tx_power_setting type; int dbm = 0; + int ret; if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM) return 
-EINVAL; @@ -869,7 +932,11 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, return 0; } - return rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm)); + wiphy_lock(&rdev->wiphy); + ret = rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm)); + wiphy_unlock(&rdev->wiphy); + + return ret; } static int cfg80211_wext_giwtxpower(struct net_device *dev, @@ -888,7 +955,9 @@ static int cfg80211_wext_giwtxpower(struct net_device *dev, if (!rdev->ops->get_tx_power) return -EOPNOTSUPP; + wiphy_lock(&rdev->wiphy); err = rdev_get_tx_power(rdev, wdev, &val); + wiphy_unlock(&rdev->wiphy); if (err) return err; @@ -1128,7 +1197,9 @@ static int cfg80211_wext_siwpower(struct net_device *dev, timeout = wrq->value / 1000; } + wiphy_lock(&rdev->wiphy); err = rdev_set_power_mgmt(rdev, dev, ps, timeout); + wiphy_unlock(&rdev->wiphy); if (err) return err; @@ -1150,50 +1221,6 @@ static int cfg80211_wext_giwpower(struct net_device *dev, return 0; } -static int cfg80211_wds_wext_siwap(struct net_device *dev, - struct iw_request_info *info, - struct sockaddr *addr, char *extra) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - int err; - - if (WARN_ON(wdev->iftype != NL80211_IFTYPE_WDS)) - return -EINVAL; - - if (addr->sa_family != ARPHRD_ETHER) - return -EINVAL; - - if (netif_running(dev)) - return -EBUSY; - - if (!rdev->ops->set_wds_peer) - return -EOPNOTSUPP; - - err = rdev_set_wds_peer(rdev, dev, (u8 *)&addr->sa_data); - if (err) - return err; - - memcpy(&wdev->wext.bssid, (u8 *) &addr->sa_data, ETH_ALEN); - - return 0; -} - -static int cfg80211_wds_wext_giwap(struct net_device *dev, - struct iw_request_info *info, - struct sockaddr *addr, char *extra) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - - if (WARN_ON(wdev->iftype != NL80211_IFTYPE_WDS)) - return -EINVAL; - - addr->sa_family = ARPHRD_ETHER; - memcpy(&addr->sa_data, wdev->wext.bssid, ETH_ALEN); - - return 0; -} - static int 
cfg80211_wext_siwrate(struct net_device *dev, struct iw_request_info *info, struct iw_param *rate, char *extra) @@ -1203,7 +1230,7 @@ static int cfg80211_wext_siwrate(struct net_device *dev, struct cfg80211_bitrate_mask mask; u32 fixed, maxrate; struct ieee80211_supported_band *sband; - int band, ridx; + int band, ridx, ret; bool match = false; if (!rdev->ops->set_bitrate_mask) @@ -1242,7 +1269,11 @@ static int cfg80211_wext_siwrate(struct net_device *dev, if (!match) return -EINVAL; - return rdev_set_bitrate_mask(rdev, dev, NULL, &mask); + wiphy_lock(&rdev->wiphy); + ret = rdev_set_bitrate_mask(rdev, dev, NULL, &mask); + wiphy_unlock(&rdev->wiphy); + + return ret; } static int cfg80211_wext_giwrate(struct net_device *dev, @@ -1271,7 +1302,9 @@ static int cfg80211_wext_giwrate(struct net_device *dev, if (err) return err; + wiphy_lock(&rdev->wiphy); err = rdev_get_station(rdev, dev, addr, &sinfo); + wiphy_unlock(&rdev->wiphy); if (err) return err; @@ -1296,6 +1329,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) static struct iw_statistics wstats; static struct station_info sinfo = {}; u8 bssid[ETH_ALEN]; + int ret; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) return NULL; @@ -1314,7 +1348,11 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) memset(&sinfo, 0, sizeof(sinfo)); - if (rdev_get_station(rdev, dev, bssid, &sinfo)) + wiphy_lock(&rdev->wiphy); + ret = rdev_get_station(rdev, dev, bssid, &sinfo); + wiphy_unlock(&rdev->wiphy); + + if (ret) return NULL; memset(&wstats, 0, sizeof(wstats)); @@ -1365,17 +1403,24 @@ static int cfg80211_wext_siwap(struct net_device *dev, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + int ret; + wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - return cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra); + ret = 
cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra); + break; case NL80211_IFTYPE_STATION: - return cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra); - case NL80211_IFTYPE_WDS: - return cfg80211_wds_wext_siwap(dev, info, ap_addr, extra); + ret = cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra); + break; default: - return -EOPNOTSUPP; + ret = -EOPNOTSUPP; + break; } + wiphy_unlock(&rdev->wiphy); + + return ret; } static int cfg80211_wext_giwap(struct net_device *dev, @@ -1383,17 +1428,24 @@ static int cfg80211_wext_giwap(struct net_device *dev, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + int ret; + wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - return cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra); + ret = cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra); + break; case NL80211_IFTYPE_STATION: - return cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra); - case NL80211_IFTYPE_WDS: - return cfg80211_wds_wext_giwap(dev, info, ap_addr, extra); + ret = cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra); + break; default: - return -EOPNOTSUPP; + ret = -EOPNOTSUPP; + break; } + wiphy_unlock(&rdev->wiphy); + + return ret; } static int cfg80211_wext_siwessid(struct net_device *dev, @@ -1401,15 +1453,24 @@ static int cfg80211_wext_siwessid(struct net_device *dev, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + int ret; + wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - return cfg80211_ibss_wext_siwessid(dev, info, data, ssid); + ret = cfg80211_ibss_wext_siwessid(dev, info, data, ssid); + break; case NL80211_IFTYPE_STATION: - return cfg80211_mgd_wext_siwessid(dev, info, data, ssid); + ret = cfg80211_mgd_wext_siwessid(dev, info, data, ssid); + break; default: - return -EOPNOTSUPP; 
+ ret = -EOPNOTSUPP; + break; } + wiphy_unlock(&rdev->wiphy); + + return ret; } static int cfg80211_wext_giwessid(struct net_device *dev, @@ -1417,18 +1478,27 @@ static int cfg80211_wext_giwessid(struct net_device *dev, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + int ret; data->flags = 0; data->length = 0; + wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - return cfg80211_ibss_wext_giwessid(dev, info, data, ssid); + ret = cfg80211_ibss_wext_giwessid(dev, info, data, ssid); + break; case NL80211_IFTYPE_STATION: - return cfg80211_mgd_wext_giwessid(dev, info, data, ssid); + ret = cfg80211_mgd_wext_giwessid(dev, info, data, ssid); + break; default: - return -EOPNOTSUPP; + ret = -EOPNOTSUPP; + break; } + wiphy_unlock(&rdev->wiphy); + + return ret; } static int cfg80211_wext_siwpmksa(struct net_device *dev, @@ -1439,6 +1509,7 @@ static int cfg80211_wext_siwpmksa(struct net_device *dev, struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_pmksa cfg_pmksa; struct iw_pmksa *pmksa = (struct iw_pmksa *)extra; + int ret; memset(&cfg_pmksa, 0, sizeof(struct cfg80211_pmksa)); @@ -1448,63 +1519,113 @@ static int cfg80211_wext_siwpmksa(struct net_device *dev, cfg_pmksa.bssid = pmksa->bssid.sa_data; cfg_pmksa.pmkid = pmksa->pmkid; + wiphy_lock(&rdev->wiphy); switch (pmksa->cmd) { case IW_PMKSA_ADD: - if (!rdev->ops->set_pmksa) - return -EOPNOTSUPP; - - return rdev_set_pmksa(rdev, dev, &cfg_pmksa); + if (!rdev->ops->set_pmksa) { + ret = -EOPNOTSUPP; + break; + } + ret = rdev_set_pmksa(rdev, dev, &cfg_pmksa); + break; case IW_PMKSA_REMOVE: - if (!rdev->ops->del_pmksa) - return -EOPNOTSUPP; - - return rdev_del_pmksa(rdev, dev, &cfg_pmksa); + if (!rdev->ops->del_pmksa) { + ret = -EOPNOTSUPP; + break; + } + ret = rdev_del_pmksa(rdev, dev, &cfg_pmksa); + break; case IW_PMKSA_FLUSH: - if (!rdev->ops->flush_pmksa) 
- return -EOPNOTSUPP; - - return rdev_flush_pmksa(rdev, dev); + if (!rdev->ops->flush_pmksa) { + ret = -EOPNOTSUPP; + break; + } + ret = rdev_flush_pmksa(rdev, dev); + break; default: - return -EOPNOTSUPP; + ret = -EOPNOTSUPP; + break; } + wiphy_unlock(&rdev->wiphy); + + return ret; } +#define DEFINE_WEXT_COMPAT_STUB(func, type) \ + static int __ ## func(struct net_device *dev, \ + struct iw_request_info *info, \ + union iwreq_data *wrqu, \ + char *extra) \ + { \ + return func(dev, info, (type *)wrqu, extra); \ + } + +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwname, char) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwfreq, struct iw_freq) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwfreq, struct iw_freq) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwmode, u32) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwmode, u32) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwrange, struct iw_point) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwap, struct sockaddr) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwap, struct sockaddr) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwmlme, struct iw_point) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwscan, struct iw_point) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwessid, struct iw_point) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwessid, struct iw_point) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwrate, struct iw_param) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwrate, struct iw_param) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwrts, struct iw_param) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwrts, struct iw_param) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwfrag, struct iw_param) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwfrag, struct iw_param) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwretry, struct iw_param) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwretry, struct iw_param) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwencode, struct iw_point) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwencode, struct iw_point) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwpower, struct iw_param) 
+DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwpower, struct iw_param) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwgenie, struct iw_point) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwauth, struct iw_param) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwauth, struct iw_param) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwencodeext, struct iw_point) +DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwpmksa, struct iw_point) + static const iw_handler cfg80211_handlers[] = { - [IW_IOCTL_IDX(SIOCGIWNAME)] = (iw_handler) cfg80211_wext_giwname, - [IW_IOCTL_IDX(SIOCSIWFREQ)] = (iw_handler) cfg80211_wext_siwfreq, - [IW_IOCTL_IDX(SIOCGIWFREQ)] = (iw_handler) cfg80211_wext_giwfreq, - [IW_IOCTL_IDX(SIOCSIWMODE)] = (iw_handler) cfg80211_wext_siwmode, - [IW_IOCTL_IDX(SIOCGIWMODE)] = (iw_handler) cfg80211_wext_giwmode, - [IW_IOCTL_IDX(SIOCGIWRANGE)] = (iw_handler) cfg80211_wext_giwrange, - [IW_IOCTL_IDX(SIOCSIWAP)] = (iw_handler) cfg80211_wext_siwap, - [IW_IOCTL_IDX(SIOCGIWAP)] = (iw_handler) cfg80211_wext_giwap, - [IW_IOCTL_IDX(SIOCSIWMLME)] = (iw_handler) cfg80211_wext_siwmlme, - [IW_IOCTL_IDX(SIOCSIWSCAN)] = (iw_handler) cfg80211_wext_siwscan, - [IW_IOCTL_IDX(SIOCGIWSCAN)] = (iw_handler) cfg80211_wext_giwscan, - [IW_IOCTL_IDX(SIOCSIWESSID)] = (iw_handler) cfg80211_wext_siwessid, - [IW_IOCTL_IDX(SIOCGIWESSID)] = (iw_handler) cfg80211_wext_giwessid, - [IW_IOCTL_IDX(SIOCSIWRATE)] = (iw_handler) cfg80211_wext_siwrate, - [IW_IOCTL_IDX(SIOCGIWRATE)] = (iw_handler) cfg80211_wext_giwrate, - [IW_IOCTL_IDX(SIOCSIWRTS)] = (iw_handler) cfg80211_wext_siwrts, - [IW_IOCTL_IDX(SIOCGIWRTS)] = (iw_handler) cfg80211_wext_giwrts, - [IW_IOCTL_IDX(SIOCSIWFRAG)] = (iw_handler) cfg80211_wext_siwfrag, - [IW_IOCTL_IDX(SIOCGIWFRAG)] = (iw_handler) cfg80211_wext_giwfrag, - [IW_IOCTL_IDX(SIOCSIWTXPOW)] = (iw_handler) cfg80211_wext_siwtxpower, - [IW_IOCTL_IDX(SIOCGIWTXPOW)] = (iw_handler) cfg80211_wext_giwtxpower, - [IW_IOCTL_IDX(SIOCSIWRETRY)] = (iw_handler) cfg80211_wext_siwretry, - [IW_IOCTL_IDX(SIOCGIWRETRY)] = (iw_handler) 
cfg80211_wext_giwretry, - [IW_IOCTL_IDX(SIOCSIWENCODE)] = (iw_handler) cfg80211_wext_siwencode, - [IW_IOCTL_IDX(SIOCGIWENCODE)] = (iw_handler) cfg80211_wext_giwencode, - [IW_IOCTL_IDX(SIOCSIWPOWER)] = (iw_handler) cfg80211_wext_siwpower, - [IW_IOCTL_IDX(SIOCGIWPOWER)] = (iw_handler) cfg80211_wext_giwpower, - [IW_IOCTL_IDX(SIOCSIWGENIE)] = (iw_handler) cfg80211_wext_siwgenie, - [IW_IOCTL_IDX(SIOCSIWAUTH)] = (iw_handler) cfg80211_wext_siwauth, - [IW_IOCTL_IDX(SIOCGIWAUTH)] = (iw_handler) cfg80211_wext_giwauth, - [IW_IOCTL_IDX(SIOCSIWENCODEEXT)]= (iw_handler) cfg80211_wext_siwencodeext, - [IW_IOCTL_IDX(SIOCSIWPMKSA)] = (iw_handler) cfg80211_wext_siwpmksa, + [IW_IOCTL_IDX(SIOCGIWNAME)] = __cfg80211_wext_giwname, + [IW_IOCTL_IDX(SIOCSIWFREQ)] = __cfg80211_wext_siwfreq, + [IW_IOCTL_IDX(SIOCGIWFREQ)] = __cfg80211_wext_giwfreq, + [IW_IOCTL_IDX(SIOCSIWMODE)] = __cfg80211_wext_siwmode, + [IW_IOCTL_IDX(SIOCGIWMODE)] = __cfg80211_wext_giwmode, + [IW_IOCTL_IDX(SIOCGIWRANGE)] = __cfg80211_wext_giwrange, + [IW_IOCTL_IDX(SIOCSIWAP)] = __cfg80211_wext_siwap, + [IW_IOCTL_IDX(SIOCGIWAP)] = __cfg80211_wext_giwap, + [IW_IOCTL_IDX(SIOCSIWMLME)] = __cfg80211_wext_siwmlme, + [IW_IOCTL_IDX(SIOCSIWSCAN)] = cfg80211_wext_siwscan, + [IW_IOCTL_IDX(SIOCGIWSCAN)] = __cfg80211_wext_giwscan, + [IW_IOCTL_IDX(SIOCSIWESSID)] = __cfg80211_wext_siwessid, + [IW_IOCTL_IDX(SIOCGIWESSID)] = __cfg80211_wext_giwessid, + [IW_IOCTL_IDX(SIOCSIWRATE)] = __cfg80211_wext_siwrate, + [IW_IOCTL_IDX(SIOCGIWRATE)] = __cfg80211_wext_giwrate, + [IW_IOCTL_IDX(SIOCSIWRTS)] = __cfg80211_wext_siwrts, + [IW_IOCTL_IDX(SIOCGIWRTS)] = __cfg80211_wext_giwrts, + [IW_IOCTL_IDX(SIOCSIWFRAG)] = __cfg80211_wext_siwfrag, + [IW_IOCTL_IDX(SIOCGIWFRAG)] = __cfg80211_wext_giwfrag, + [IW_IOCTL_IDX(SIOCSIWTXPOW)] = cfg80211_wext_siwtxpower, + [IW_IOCTL_IDX(SIOCGIWTXPOW)] = cfg80211_wext_giwtxpower, + [IW_IOCTL_IDX(SIOCSIWRETRY)] = __cfg80211_wext_siwretry, + [IW_IOCTL_IDX(SIOCGIWRETRY)] = __cfg80211_wext_giwretry, + 
[IW_IOCTL_IDX(SIOCSIWENCODE)] = __cfg80211_wext_siwencode, + [IW_IOCTL_IDX(SIOCGIWENCODE)] = __cfg80211_wext_giwencode, + [IW_IOCTL_IDX(SIOCSIWPOWER)] = __cfg80211_wext_siwpower, + [IW_IOCTL_IDX(SIOCGIWPOWER)] = __cfg80211_wext_giwpower, + [IW_IOCTL_IDX(SIOCSIWGENIE)] = __cfg80211_wext_siwgenie, + [IW_IOCTL_IDX(SIOCSIWAUTH)] = __cfg80211_wext_siwauth, + [IW_IOCTL_IDX(SIOCGIWAUTH)] = __cfg80211_wext_giwauth, + [IW_IOCTL_IDX(SIOCSIWENCODEEXT)]= __cfg80211_wext_siwencodeext, + [IW_IOCTL_IDX(SIOCSIWPMKSA)] = __cfg80211_wext_siwpmksa, }; const struct iw_handler_def cfg80211_wext_handler = { diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 69102fda9ebd..76a80a41615b 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -896,8 +896,9 @@ out: int call_commit_handler(struct net_device *dev) { #ifdef CONFIG_WIRELESS_EXT - if ((netif_running(dev)) && - (dev->wireless_handlers->standard[0] != NULL)) + if (netif_running(dev) && + dev->wireless_handlers && + dev->wireless_handlers->standard[0]) /* Call the commit handler on the driver */ return dev->wireless_handlers->standard[0](dev, NULL, NULL, NULL); diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index 73df23570d43..193a18a53142 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -3,7 +3,7 @@ * cfg80211 wext compat for managed mode. * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2009 Intel Corporation. All rights reserved. + * Copyright (C) 2009, 2020-2021 Intel Corporation. 
*/ #include <linux/export.h> @@ -379,6 +379,7 @@ int cfg80211_wext_siwmlme(struct net_device *dev, if (mlme->addr.sa_family != ARPHRD_ETHER) return -EINVAL; + wiphy_lock(&rdev->wiphy); wdev_lock(wdev); switch (mlme->cmd) { case IW_MLME_DEAUTH: @@ -390,6 +391,7 @@ int cfg80211_wext_siwmlme(struct net_device *dev, break; } wdev_unlock(wdev); + wiphy_unlock(&rdev->wiphy); return err; } diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 0bbb283f23c9..ff687b97b2d9 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -200,22 +200,6 @@ static void x25_remove_socket(struct sock *sk) } /* - * Kill all bound sockets on a dropped device. - */ -static void x25_kill_by_device(struct net_device *dev) -{ - struct sock *s; - - write_lock_bh(&x25_list_lock); - - sk_for_each(s, &x25_list) - if (x25_sk(s)->neighbour && x25_sk(s)->neighbour->dev == dev) - x25_disconnect(s, ENETUNREACH, 0, 0); - - write_unlock_bh(&x25_list_lock); -} - -/* * Handle device status changes. */ static int x25_device_event(struct notifier_block *this, unsigned long event, @@ -227,27 +211,33 @@ static int x25_device_event(struct notifier_block *this, unsigned long event, if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; - if (dev->type == ARPHRD_X25 -#if IS_ENABLED(CONFIG_LLC) - || dev->type == ARPHRD_ETHER -#endif - ) { + if (dev->type == ARPHRD_X25) { switch (event) { - case NETDEV_UP: + case NETDEV_REGISTER: + case NETDEV_POST_TYPE_CHANGE: x25_link_device_up(dev); break; - case NETDEV_GOING_DOWN: + case NETDEV_DOWN: nb = x25_get_neigh(dev); if (nb) { - x25_terminate_link(nb); + x25_link_terminated(nb); x25_neigh_put(nb); } - break; - case NETDEV_DOWN: - x25_kill_by_device(dev); x25_route_device_down(dev); + break; + case NETDEV_PRE_TYPE_CHANGE: + case NETDEV_UNREGISTER: x25_link_device_down(dev); break; + case NETDEV_CHANGE: + if (!netif_carrier_ok(dev)) { + nb = x25_get_neigh(dev); + if (nb) { + x25_link_terminated(nb); + x25_neigh_put(nb); + } + } + break; } } @@ -681,7 +671,8 @@ static 
int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) int len, i, rc = 0; if (addr_len != sizeof(struct sockaddr_x25) || - addr->sx25_family != AF_X25) { + addr->sx25_family != AF_X25 || + strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) { rc = -EINVAL; goto out; } @@ -775,7 +766,8 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr, rc = -EINVAL; if (addr_len != sizeof(struct sockaddr_x25) || - addr->sx25_family != AF_X25) + addr->sx25_family != AF_X25 || + strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) goto out; rc = -ENETUNREACH; @@ -825,7 +817,7 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr, sock->state = SS_CONNECTED; rc = 0; out_put_neigh: - if (rc) { + if (rc && x25->neighbour) { read_lock_bh(&x25_list_lock); x25_neigh_put(x25->neighbour); x25->neighbour = NULL; @@ -1050,6 +1042,7 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, makex25->lci = lci; makex25->dest_addr = dest_addr; makex25->source_addr = source_addr; + x25_neigh_hold(nb); makex25->neighbour = nb; makex25->facilities = facilities; makex25->dte_facilities= dte_facilities; diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index 25bf72ee6cad..5259ef8f5242 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -160,10 +160,6 @@ void x25_establish_link(struct x25_neigh *nb) *ptr = X25_IFACE_CONNECT; break; -#if IS_ENABLED(CONFIG_LLC) - case ARPHRD_ETHER: - return; -#endif default: return; } @@ -179,10 +175,6 @@ void x25_terminate_link(struct x25_neigh *nb) struct sk_buff *skb; unsigned char *ptr; -#if IS_ENABLED(CONFIG_LLC) - if (nb->dev->type == ARPHRD_ETHER) - return; -#endif if (nb->dev->type != ARPHRD_X25) return; @@ -212,11 +204,6 @@ void x25_send_frame(struct sk_buff *skb, struct x25_neigh *nb) *dptr = X25_IFACE_DATA; break; -#if IS_ENABLED(CONFIG_LLC) - case ARPHRD_ETHER: - kfree_skb(skb); - return; -#endif default: kfree_skb(skb); return; diff --git a/net/x25/x25_link.c 
b/net/x25/x25_link.c index fdae054b7dc1..57a81100c5da 100644 --- a/net/x25/x25_link.c +++ b/net/x25/x25_link.c @@ -58,11 +58,6 @@ static inline void x25_stop_t20timer(struct x25_neigh *nb) del_timer(&nb->t20timer); } -static inline int x25_t20timer_pending(struct x25_neigh *nb) -{ - return timer_pending(&nb->t20timer); -} - /* * This handles all restart and diagnostic frames. */ @@ -70,20 +65,45 @@ void x25_link_control(struct sk_buff *skb, struct x25_neigh *nb, unsigned short frametype) { struct sk_buff *skbn; - int confirm; switch (frametype) { case X25_RESTART_REQUEST: - confirm = !x25_t20timer_pending(nb); - x25_stop_t20timer(nb); - nb->state = X25_LINK_STATE_3; - if (confirm) + switch (nb->state) { + case X25_LINK_STATE_0: + /* This can happen when the x25 module just gets loaded + * and doesn't know layer 2 has already connected + */ + nb->state = X25_LINK_STATE_3; x25_transmit_restart_confirmation(nb); + break; + case X25_LINK_STATE_2: + x25_stop_t20timer(nb); + nb->state = X25_LINK_STATE_3; + break; + case X25_LINK_STATE_3: + /* clear existing virtual calls */ + x25_kill_by_neigh(nb); + + x25_transmit_restart_confirmation(nb); + break; + } break; case X25_RESTART_CONFIRMATION: - x25_stop_t20timer(nb); - nb->state = X25_LINK_STATE_3; + switch (nb->state) { + case X25_LINK_STATE_2: + x25_stop_t20timer(nb); + nb->state = X25_LINK_STATE_3; + break; + case X25_LINK_STATE_3: + /* clear existing virtual calls */ + x25_kill_by_neigh(nb); + + x25_transmit_restart_request(nb); + nb->state = X25_LINK_STATE_2; + x25_start_t20timer(nb); + break; + } break; case X25_DIAGNOSTIC: @@ -214,8 +234,6 @@ void x25_link_established(struct x25_neigh *nb) { switch (nb->state) { case X25_LINK_STATE_0: - nb->state = X25_LINK_STATE_2; - break; case X25_LINK_STATE_1: x25_transmit_restart_request(nb); nb->state = X25_LINK_STATE_2; @@ -232,6 +250,9 @@ void x25_link_established(struct x25_neigh *nb) void x25_link_terminated(struct x25_neigh *nb) { nb->state = X25_LINK_STATE_0; + 
skb_queue_purge(&nb->queue); + x25_stop_t20timer(nb); + /* Out of order: clear existing virtual calls (X.25 03/93 4.6.3) */ x25_kill_by_neigh(nb); } @@ -277,9 +298,6 @@ void x25_link_device_up(struct net_device *dev) */ static void __x25_remove_neigh(struct x25_neigh *nb) { - skb_queue_purge(&nb->queue); - x25_stop_t20timer(nb); - if (nb->node.next) { list_del(&nb->node); x25_neigh_put(nb); diff --git a/net/x25/x25_route.c b/net/x25/x25_route.c index 00e46c9a5280..9fbe4bb38d94 100644 --- a/net/x25/x25_route.c +++ b/net/x25/x25_route.c @@ -115,9 +115,6 @@ void x25_route_device_down(struct net_device *dev) __x25_remove_route(rt); } write_unlock_bh(&x25_route_list_lock); - - /* Remove any related forwarding */ - x25_clear_forward_by_dev(dev); } /* @@ -127,12 +124,7 @@ struct net_device *x25_dev_get(char *devname) { struct net_device *dev = dev_get_by_name(&init_net, devname); - if (dev && - (!(dev->flags & IFF_UP) || (dev->type != ARPHRD_X25 -#if IS_ENABLED(CONFIG_LLC) - && dev->type != ARPHRD_ETHER -#endif - ))){ + if (dev && (!(dev->flags & IFF_UP) || dev->type != ARPHRD_X25)) { dev_put(dev); dev = NULL; } diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 56d052bc65cb..56a28a686988 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -66,18 +66,31 @@ static void xdp_umem_release(struct xdp_umem *umem) kfree(umem); } +static void xdp_umem_release_deferred(struct work_struct *work) +{ + struct xdp_umem *umem = container_of(work, struct xdp_umem, work); + + xdp_umem_release(umem); +} + void xdp_get_umem(struct xdp_umem *umem) { refcount_inc(&umem->users); } -void xdp_put_umem(struct xdp_umem *umem) +void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup) { if (!umem) return; - if (refcount_dec_and_test(&umem->users)) - xdp_umem_release(umem); + if (refcount_dec_and_test(&umem->users)) { + if (defer_cleanup) { + INIT_WORK(&umem->work, xdp_umem_release_deferred); + schedule_work(&umem->work); + } else { + xdp_umem_release(umem); + } + } } static int 
xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 181fdda2f2a8..aa9fe2780410 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -9,7 +9,7 @@ #include <net/xdp_sock_drv.h> void xdp_get_umem(struct xdp_umem *umem); -void xdp_put_umem(struct xdp_umem *umem); +void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup); struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr); #endif /* XDP_UMEM_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b71a32eeae65..4faabd1ecfd1 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -23,6 +23,7 @@ #include <linux/netdevice.h> #include <linux/rculist.h> #include <net/xdp_sock_drv.h> +#include <net/busy_poll.h> #include <net/xdp.h> #include "xsk_queue.h" @@ -107,9 +108,9 @@ EXPORT_SYMBOL(xsk_get_pool_from_qid); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) { - if (queue_id < dev->real_num_rx_queues) + if (queue_id < dev->num_rx_queues) dev->_rx[queue_id].pool = NULL; - if (queue_id < dev->real_num_tx_queues) + if (queue_id < dev->num_tx_queues) dev->_tx[queue_id].pool = NULL; } @@ -183,12 +184,13 @@ static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) memcpy(to_buf, from_buf, len + metalen); } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, - bool explicit_free) +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { struct xdp_buff *xsk_xdp; int err; + u32 len; + len = xdp->data_end - xdp->data; if (len > xsk_pool_get_rx_frame_size(xs->pool)) { xs->rx_dropped++; return -ENOSPC; @@ -206,11 +208,17 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, xsk_buff_free(xsk_xdp); return err; } - if (explicit_free) - xdp_return_buff(xdp); return 0; } +static bool xsk_tx_writeable(struct xdp_sock *xs) +{ + if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2) + return false; + + return true; +} + static bool xsk_is_bound(struct xdp_sock *xs) { 
if (READ_ONCE(xs->state) == XSK_BOUND) { @@ -221,22 +229,16 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, - bool explicit_free) +static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp) { - u32 len; - if (!xsk_is_bound(xs)) return -EINVAL; if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; - len = xdp->data_end - xdp->data; - - return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? - __xsk_rcv_zc(xs, xdp, len) : - __xsk_rcv(xs, xdp, len, explicit_free); + sk_mark_napi_id_once_xdp(&xs->sk, xdp); + return 0; } static void xsk_flush(struct xdp_sock *xs) @@ -251,18 +253,41 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) int err; spin_lock_bh(&xs->rx_lock); - err = xsk_rcv(xs, xdp, false); - xsk_flush(xs); + err = xsk_rcv_check(xs, xdp); + if (!err) { + err = __xsk_rcv(xs, xdp); + xsk_flush(xs); + } spin_unlock_bh(&xs->rx_lock); return err; } +static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + int err; + u32 len; + + err = xsk_rcv_check(xs, xdp); + if (err) + return err; + + if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { + len = xdp->data_end - xdp->data; + return __xsk_rcv_zc(xs, xdp, len); + } + + err = __xsk_rcv(xs, xdp); + if (!err) + xdp_return_buff(xdp); + return err; +} + int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); int err; - err = xsk_rcv(xs, xdp, true); + err = xsk_rcv(xs, xdp); if (err) return err; @@ -296,7 +321,8 @@ void xsk_tx_release(struct xsk_buff_pool *pool) rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { __xskq_cons_release(xs->tx); - xs->sk.sk_write_space(&xs->sk); + if (xsk_tx_writeable(xs)) + xs->sk.sk_write_space(&xs->sk); } rcu_read_unlock(); } @@ -332,6 +358,63 @@ out: } EXPORT_SYMBOL(xsk_tx_peek_desc); +static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool 
*pool, struct xdp_desc *descs, + u32 max_entries) +{ + u32 nb_pkts = 0; + + while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts])) + nb_pkts++; + + xsk_tx_release(pool); + return nb_pkts; +} + +u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs, + u32 max_entries) +{ + struct xdp_sock *xs; + u32 nb_pkts; + + rcu_read_lock(); + if (!list_is_singular(&pool->xsk_tx_list)) { + /* Fallback to the non-batched version */ + rcu_read_unlock(); + return xsk_tx_peek_release_fallback(pool, descs, max_entries); + } + + xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); + if (!xs) { + nb_pkts = 0; + goto out; + } + + nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries); + if (!nb_pkts) { + xs->tx->queue_empty_descs++; + goto out; + } + + /* This is the backpressure mechanism for the Tx path. Try to + * reserve space in the completion queue for all packets, but + * if there are fewer slots available, just process that many + * packets. This avoids having to implement any buffering in + * the Tx path. 
+ */ + nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts); + if (!nb_pkts) + goto out; + + xskq_cons_release_n(xs->tx, nb_pkts); + __xskq_cons_release(xs->tx); + xs->sk.sk_write_space(&xs->sk); + +out: + rcu_read_unlock(); + return nb_pkts; +} +EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch); + static int xsk_wakeup(struct xdp_sock *xs, u8 flags) { struct net_device *dev = xs->dev; @@ -355,9 +438,9 @@ static void xsk_destruct_skb(struct sk_buff *skb) struct xdp_sock *xs = xdp_sk(skb->sk); unsigned long flags; - spin_lock_irqsave(&xs->tx_completion_lock, flags); + spin_lock_irqsave(&xs->pool->cq_lock, flags); xskq_prod_submit_addr(xs->pool->cq, addr); - spin_unlock_irqrestore(&xs->tx_completion_lock, flags); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); sock_wfree(skb); } @@ -369,6 +452,7 @@ static int xsk_generic_xmit(struct sock *sk) bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; + unsigned long flags; int err = 0; mutex_lock(&xs->mutex); @@ -400,10 +484,13 @@ static int xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. 
*/ + spin_lock_irqsave(&xs->pool->cq_lock, flags); if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) { + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); kfree_skb(skb); goto out; } + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); skb->dev = xs->dev; skb->priority = sk->sk_priority; @@ -411,14 +498,13 @@ static int xsk_generic_xmit(struct sock *sk) skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr; skb->destructor = xsk_destruct_skb; - /* Hinder dev_direct_xmit from freeing the packet and - * therefore completing it in the destructor - */ - refcount_inc(&skb->users); - err = dev_direct_xmit(skb, xs->queue_id); + err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ skb->destructor = sock_wfree; + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_cancel(xs->pool->cq); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); /* Free skb without triggering the perf drop trace */ consume_skb(skb); err = -EAGAIN; @@ -429,12 +515,10 @@ static int xsk_generic_xmit(struct sock *sk) /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP) { /* SKB completed but not sent */ - kfree_skb(skb); err = -EBUSY; goto out; } - consume_skb(skb); sent_frame = true; } @@ -442,7 +526,8 @@ static int xsk_generic_xmit(struct sock *sk) out: if (sent_frame) - sk->sk_write_space(sk); + if (xsk_tx_writeable(xs)) + sk->sk_write_space(sk); mutex_unlock(&xs->mutex); return err; @@ -460,28 +545,77 @@ static int __xsk_sendmsg(struct sock *sk) return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk); } +static bool xsk_no_wakeup(struct sock *sk) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + /* Prefer busy-polling, skip the wakeup. 
*/ + return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && + READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID; +#else + return false; +#endif +} + static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { bool need_wait = !(m->msg_flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); + struct xsk_buff_pool *pool; + + if (unlikely(!xsk_is_bound(xs))) + return -ENXIO; + if (unlikely(need_wait)) + return -EOPNOTSUPP; + + if (sk_can_busy_loop(sk)) + sk_busy_loop(sk, 1); /* only support non-blocking sockets */ + + if (xsk_no_wakeup(sk)) + return 0; + + pool = xs->pool; + if (pool->cached_need_wakeup & XDP_WAKEUP_TX) + return __xsk_sendmsg(sk); + return 0; +} + +static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) +{ + bool need_wait = !(flags & MSG_DONTWAIT); + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); if (unlikely(!xsk_is_bound(xs))) return -ENXIO; + if (unlikely(!(xs->dev->flags & IFF_UP))) + return -ENETDOWN; + if (unlikely(!xs->rx)) + return -ENOBUFS; if (unlikely(need_wait)) return -EOPNOTSUPP; - return __xsk_sendmsg(sk); + if (sk_can_busy_loop(sk)) + sk_busy_loop(sk, 1); /* only support non-blocking sockets */ + + if (xsk_no_wakeup(sk)) + return 0; + + if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc) + return xsk_wakeup(xs, XDP_WAKEUP_RX); + return 0; } static __poll_t xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { - __poll_t mask = datagram_poll(file, sock, wait); + __poll_t mask = 0; struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct xsk_buff_pool *pool; + sock_poll_wait(file, sock, wait); + if (unlikely(!xsk_is_bound(xs))) return mask; @@ -497,7 +631,7 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock, if (xs->rx && !xskq_prod_is_empty(xs->rx)) mask |= EPOLLIN | EPOLLRDNORM; - if (xs->tx && !xskq_cons_is_full(xs->tx)) + if (xs->tx && xsk_tx_writeable(xs)) mask 
|= EPOLLOUT | EPOLLWRNORM; return mask; @@ -548,7 +682,7 @@ static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, node); if (node) { - WARN_ON(xsk_map_inc(node->map)); + bpf_map_inc(&node->map->map); map = node->map; *map_entry = node->map_entry; } @@ -578,7 +712,7 @@ static void xsk_delete_from_maps(struct xdp_sock *xs) while ((map = xsk_get_map_list_entry(xs, &map_entry))) { xsk_map_try_sock_delete(map, xs, map_entry); - xsk_map_put(map); + bpf_map_put(&map->map); } } @@ -766,6 +900,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } } + /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */ + xs->fq_tmp = NULL; + xs->cq_tmp = NULL; + xs->dev = dev; xs->zc = xs->umem->zc; xs->queue_id = qid; @@ -1134,7 +1272,7 @@ static const struct proto_ops xsk_proto_ops = { .setsockopt = xsk_setsockopt, .getsockopt = xsk_getsockopt, .sendmsg = xsk_sendmsg, - .recvmsg = sock_no_recvmsg, + .recvmsg = xsk_recvmsg, .mmap = xsk_mmap, .sendpage = sock_no_sendpage, }; @@ -1146,7 +1284,8 @@ static void xsk_destruct(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) return; - xp_put_pool(xs->pool); + if (!xp_put_pool(xs->pool)) + xdp_put_umem(xs->umem, !xs->pool); sk_refcnt_debug_dec(sk); } @@ -1186,7 +1325,6 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs->state = XSK_READY; mutex_init(&xs->mutex); spin_lock_init(&xs->rx_lock); - spin_lock_init(&xs->tx_completion_lock); INIT_LIST_HEAD(&xs->map_list); spin_lock_init(&xs->map_list_lock); diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index b9e896cee5bb..edcf249ad1f1 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -41,8 +41,6 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk) void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry); -int xsk_map_inc(struct xsk_map *map); -void xsk_map_put(struct xsk_map *map); void 
xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id); int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 64c9e55d4d4e..8de01aaac4a0 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -71,12 +71,11 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, INIT_LIST_HEAD(&pool->free_list); INIT_LIST_HEAD(&pool->xsk_tx_list); spin_lock_init(&pool->xsk_tx_list_lock); + spin_lock_init(&pool->cq_lock); refcount_set(&pool->users, 1); pool->fq = xs->fq_tmp; pool->cq = xs->cq_tmp; - xs->fq_tmp = NULL; - xs->cq_tmp = NULL; for (i = 0; i < pool->free_heads_cnt; i++) { xskb = &pool->heads[i]; @@ -120,8 +119,8 @@ static void xp_disable_drv_zc(struct xsk_buff_pool *pool) } } -static int __xp_assign_dev(struct xsk_buff_pool *pool, - struct net_device *netdev, u16 queue_id, u16 flags) +int xp_assign_dev(struct xsk_buff_pool *pool, + struct net_device *netdev, u16 queue_id, u16 flags) { bool force_zc, force_copy; struct netdev_bpf bpf; @@ -144,14 +143,13 @@ static int __xp_assign_dev(struct xsk_buff_pool *pool, if (err) return err; - if (flags & XDP_USE_NEED_WAKEUP) { + if (flags & XDP_USE_NEED_WAKEUP) pool->uses_need_wakeup = true; - /* Tx needs to be explicitly woken up the first time. - * Also for supporting drivers that do not implement this - * feature. They will always have to call sendto(). - */ - pool->cached_need_wakeup = XDP_WAKEUP_TX; - } + /* Tx needs to be explicitly woken up the first time. Also + * for supporting drivers that do not implement this + * feature. They will always have to call sendto() or poll(). 
+ */ + pool->cached_need_wakeup = XDP_WAKEUP_TX; dev_hold(netdev); @@ -175,6 +173,7 @@ static int __xp_assign_dev(struct xsk_buff_pool *pool, if (!pool->dma_pages) { WARN(1, "Driver did not DMA map zero-copy buffers"); + err = -EINVAL; goto err_unreg_xsk; } pool->umem->zc = true; @@ -185,17 +184,13 @@ err_unreg_xsk: err_unreg_pool: if (!force_zc) err = 0; /* fallback to copy mode */ - if (err) + if (err) { xsk_clear_pool_at_qid(netdev, queue_id); + dev_put(netdev); + } return err; } -int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, - u16 queue_id, u16 flags) -{ - return __xp_assign_dev(pool, dev, queue_id, flags); -} - int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, struct net_device *dev, u16 queue_id) { @@ -209,7 +204,7 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, if (pool->uses_need_wakeup) flags |= XDP_USE_NEED_WAKEUP; - return __xp_assign_dev(pool, dev, queue_id, flags); + return xp_assign_dev(pool, dev, queue_id, flags); } void xp_clear_dev(struct xsk_buff_pool *pool) @@ -242,7 +237,7 @@ static void xp_release_deferred(struct work_struct *work) pool->cq = NULL; } - xdp_put_umem(pool->umem); + xdp_put_umem(pool->umem, false); xp_destroy(pool); } @@ -251,15 +246,18 @@ void xp_get_pool(struct xsk_buff_pool *pool) refcount_inc(&pool->users); } -void xp_put_pool(struct xsk_buff_pool *pool) +bool xp_put_pool(struct xsk_buff_pool *pool) { if (!pool) - return; + return false; if (refcount_dec_and_test(&pool->users)) { INIT_WORK(&pool->work, xp_release_deferred); schedule_work(&pool->work); + return true; } + + return false; } static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index cdb9cf3cd136..2823b7c3302d 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -18,9 +18,11 @@ struct xdp_ring { /* Hinder the adjacent cache prefetcher to prefetch the consumer * pointer if the producer pointer is touched 
and vice versa. */ - u32 pad ____cacheline_aligned_in_smp; + u32 pad1 ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; + u32 pad2 ____cacheline_aligned_in_smp; u32 flags; + u32 pad3 ____cacheline_aligned_in_smp; }; /* Used for the RX and TX queues for packets */ @@ -197,6 +199,30 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q, return false; } +static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, + struct xdp_desc *descs, + struct xsk_buff_pool *pool, u32 max) +{ + u32 cached_cons = q->cached_cons, nb_entries = 0; + + while (cached_cons != q->cached_prod && nb_entries < max) { + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + u32 idx = cached_cons & q->ring_mask; + + descs[nb_entries] = ring->desc[idx]; + if (unlikely(!xskq_cons_is_valid_desc(q, &descs[nb_entries], pool))) { + /* Skip the entry */ + cached_cons++; + continue; + } + + nb_entries++; + cached_cons++; + } + + return nb_entries; +} + /* Functions for consumers */ static inline void __xskq_cons_release(struct xsk_queue *q) @@ -218,17 +244,22 @@ static inline void xskq_cons_get_entries(struct xsk_queue *q) __xskq_cons_peek(q); } -static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) +static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max) { u32 entries = q->cached_prod - q->cached_cons; - if (entries >= cnt) - return true; + if (entries >= max) + return max; __xskq_cons_peek(q); entries = q->cached_prod - q->cached_cons; - return entries >= cnt; + return entries >= max ? max : entries; +} + +static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) +{ + return xskq_cons_nb_entries(q, cnt) >= cnt ? 
true : false; } static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) @@ -247,16 +278,28 @@ static inline bool xskq_cons_peek_desc(struct xsk_queue *q, return xskq_cons_read_desc(q, desc, pool); } +static inline u32 xskq_cons_peek_desc_batch(struct xsk_queue *q, struct xdp_desc *descs, + struct xsk_buff_pool *pool, u32 max) +{ + u32 entries = xskq_cons_nb_entries(q, max); + + return xskq_cons_read_desc_batch(q, descs, pool, entries); +} + +/* To improve performance in the xskq_cons_release functions, only update local state here. + * Reflect this to global state when we get new entries from the ring in + * xskq_cons_get_entries() and whenever Rx or Tx processing are completed in the NAPI loop. + */ static inline void xskq_cons_release(struct xsk_queue *q) { - /* To improve performance, only update local state here. - * Reflect this to global state when we get new entries - * from the ring in xskq_cons_get_entries() and whenever - * Rx or Tx processing are completed in the NAPI loop. 
- */ q->cached_cons++; } +static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) +{ + q->cached_cons += cnt; +} + static inline bool xskq_cons_is_full(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ @@ -264,20 +307,36 @@ static inline bool xskq_cons_is_full(struct xsk_queue *q) q->nentries; } +static inline u32 xskq_cons_present_entries(struct xsk_queue *q) +{ + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer); +} + /* Functions for producers */ -static inline bool xskq_prod_is_full(struct xsk_queue *q) +static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); - if (free_entries) - return false; + if (free_entries >= max) + return max; /* Refresh the local tail pointer */ q->cached_cons = READ_ONCE(q->ring->consumer); free_entries = q->nentries - (q->cached_prod - q->cached_cons); - return !free_entries; + return free_entries >= max ? max : free_entries; +} + +static inline bool xskq_prod_is_full(struct xsk_queue *q) +{ + return xskq_prod_nb_free(q, 1) ? 
false : true; +} + +static inline void xskq_prod_cancel(struct xsk_queue *q) +{ + q->cached_prod--; } static inline int xskq_prod_reserve(struct xsk_queue *q) @@ -302,6 +361,23 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) return 0; } +static inline u32 xskq_prod_reserve_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, + u32 max) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 nb_entries, i, cached_prod; + + nb_entries = xskq_prod_nb_free(q, max); + + /* A, matches D */ + cached_prod = q->cached_prod; + for (i = 0; i < nb_entries; i++) + ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr; + q->cached_prod = cached_prod; + + return nb_entries; +} + static inline int xskq_prod_reserve_desc(struct xsk_queue *q, u64 addr, u32 len) { diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 49da2b8ace8b..113fd9017203 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -11,32 +11,17 @@ #include "xsk.h" -int xsk_map_inc(struct xsk_map *map) -{ - bpf_map_inc(&map->map); - return 0; -} - -void xsk_map_put(struct xsk_map *map) -{ - bpf_map_put(&map->map); -} - static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, struct xdp_sock **map_entry) { struct xsk_map_node *node; - int err; - node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); + node = bpf_map_kzalloc(&map->map, sizeof(*node), + GFP_ATOMIC | __GFP_NOWARN); if (!node) return ERR_PTR(-ENOMEM); - err = xsk_map_inc(map); - if (err) { - kfree(node); - return ERR_PTR(err); - } + bpf_map_inc(&map->map); node->map = map; node->map_entry = map_entry; @@ -45,7 +30,7 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, static void xsk_map_node_free(struct xsk_map_node *node) { - xsk_map_put(node->map); + bpf_map_put(&node->map->map); kfree(node); } @@ -73,9 +58,8 @@ static void xsk_map_sock_delete(struct xdp_sock *xs, static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { - struct bpf_map_memory mem; - int err, 
numa_node; struct xsk_map *m; + int numa_node; u64 size; if (!capable(CAP_NET_ADMIN)) @@ -89,18 +73,11 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) numa_node = bpf_map_attr_numa_node(attr); size = struct_size(m, xsk_map, attr->max_entries); - err = bpf_map_charge_init(&mem, size); - if (err < 0) - return ERR_PTR(err); - m = bpf_map_area_alloc(size, numa_node); - if (!m) { - bpf_map_charge_finish(&mem); + if (!m) return ERR_PTR(-ENOMEM); - } bpf_map_init_from_attr(&m->map, attr); - bpf_map_charge_move(&m->map.memory, &mem); spin_lock_init(&m->lock); return &m->map; diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index e28f0c9ecd6a..d8e8a11ca845 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -234,6 +234,7 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) case XFRMA_PAD: /* Ignore */ return 0; + case XFRMA_UNSPEC: case XFRMA_ALG_AUTH: case XFRMA_ALG_CRYPT: case XFRMA_ALG_COMP: @@ -387,7 +388,7 @@ static int xfrm_attr_cpy32(void *dst, size_t *pos, const struct nlattr *src, memcpy(nla, src, nla_attr_size(copy_len)); nla->nla_len = nla_attr_size(payload); - *pos += nla_attr_size(payload); + *pos += nla_attr_size(copy_len); nlmsg->nlmsg_len += nla->nla_len; memset(dst + *pos, 0, payload - copy_len); @@ -563,7 +564,7 @@ static struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *h32, return NULL; len += NLMSG_HDRLEN; - h64 = kvmalloc(len, GFP_KERNEL | __GFP_ZERO); + h64 = kvmalloc(len, GFP_KERNEL); if (!h64) return ERR_PTR(-ENOMEM); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 37456d022cfa..1158cd0311d7 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -660,7 +660,7 @@ resume: /* only the first xfrm gets the encap type */ encap_type = 0; - if (async && x->repl->recheck(x, skb, seq)) { + if (x->repl->recheck(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } @@ -760,9 +760,9 @@ int xfrm_input_resume(struct 
sk_buff *skb, int nexthdr) } EXPORT_SYMBOL(xfrm_input_resume); -static void xfrm_trans_reinject(unsigned long data) +static void xfrm_trans_reinject(struct tasklet_struct *t) { - struct xfrm_trans_tasklet *trans = (void *)data; + struct xfrm_trans_tasklet *trans = from_tasklet(trans, t, tasklet); struct sk_buff_head queue; struct sk_buff *skb; @@ -818,7 +818,6 @@ void __init xfrm_input_init(void) trans = &per_cpu(xfrm_trans_tasklet, i); __skb_queue_head_init(&trans->queue); - tasklet_init(&trans->tasklet, xfrm_trans_reinject, - (unsigned long)trans); + tasklet_setup(&trans->tasklet, xfrm_trans_reinject); } } diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 9b8e292a7c6a..495b1f5c979b 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -296,7 +296,8 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) } mtu = dst_mtu(dst); - if (skb->len > mtu) { + if ((!skb_is_gso(skb) && skb->len > mtu) || + (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { @@ -319,12 +320,7 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) err = dst_output(xi->net, skb->sk, skb); if (net_xmit_eval(err) == 0) { - struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); - - u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += length; - tstats->tx_packets++; - u64_stats_update_end(&tstats->syncp); + dev_sw_netstats_tx_add(dev, 1, length); } else { stats->tx_errors++; stats->tx_aborted_errors++; @@ -538,15 +534,6 @@ static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) return err; } -static void xfrmi_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *s) -{ - dev_fetch_sw_netstats(s, dev->tstats); - - s->rx_dropped = dev->stats.rx_dropped; - s->tx_dropped = dev->stats.tx_dropped; -} - static int xfrmi_get_iflink(const struct net_device *dev) { struct xfrm_if *xi = 
netdev_priv(dev); @@ -554,12 +541,11 @@ static int xfrmi_get_iflink(const struct net_device *dev) return xi->p.link; } - static const struct net_device_ops xfrmi_netdev_ops = { .ndo_init = xfrmi_dev_init, .ndo_uninit = xfrmi_dev_uninit, .ndo_start_xmit = xfrmi_xmit, - .ndo_get_stats64 = xfrmi_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = xfrmi_get_iflink, }; @@ -579,6 +565,11 @@ static void xfrmi_dev_setup(struct net_device *dev) eth_broadcast_addr(dev->broadcast); } +#define XFRMI_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_GSO_SOFTWARE | \ + NETIF_F_HW_CSUM) + static int xfrmi_dev_init(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); @@ -596,6 +587,8 @@ static int xfrmi_dev_init(struct net_device *dev) } dev->features |= NETIF_F_LLTX; + dev->features |= XFRMI_FEATURES; + dev->hw_features |= XFRMI_FEATURES; if (phydev) { dev->needed_headroom = phydev->needed_headroom; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index d622c2548d22..b74f28cabe24 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -793,15 +793,22 @@ static int xfrm_policy_addr_delta(const xfrm_address_t *a, const xfrm_address_t *b, u8 prefixlen, u16 family) { + u32 ma, mb, mask; unsigned int pdw, pbi; int delta = 0; switch (family) { case AF_INET: - if (sizeof(long) == 4 && prefixlen == 0) - return ntohl(a->a4) - ntohl(b->a4); - return (ntohl(a->a4) & ((~0UL << (32 - prefixlen)))) - - (ntohl(b->a4) & ((~0UL << (32 - prefixlen)))); + if (prefixlen == 0) + return 0; + mask = ~0U << (32 - prefixlen); + ma = ntohl(a->a4) & mask; + mb = ntohl(b->a4) & mask; + if (ma < mb) + delta = -1; + else if (ma > mb) + delta = 1; + break; case AF_INET6: pdw = prefixlen >> 5; pbi = prefixlen & 0x1f; @@ -812,10 +819,13 @@ static int xfrm_policy_addr_delta(const xfrm_address_t *a, return delta; } if (pbi) { - u32 mask = ~0u << (32 - pbi); - - delta = (ntohl(a->a6[pdw]) & mask) - - (ntohl(b->a6[pdw]) & mask); + mask = ~0U << (32 - 
pbi); + ma = ntohl(a->a6[pdw]) & mask; + mb = ntohl(b->a6[pdw]) & mask; + if (ma < mb) + delta = -1; + else if (ma > mb) + delta = 1; } break; default: @@ -3078,8 +3088,8 @@ struct dst_entry *xfrm_lookup_with_ifid(struct net *net, xflo.flags = flags; /* To accelerate a bit... */ - if ((dst_orig->flags & DST_NOXFRM) || - !net->xfrm.policy_count[XFRM_POLICY_OUT]) + if (!if_id && ((dst_orig->flags & DST_NOXFRM) || + !net->xfrm.policy_count[XFRM_POLICY_OUT])) goto nopol; xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index a77da7aae6fe..d01ca1a18418 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1021,7 +1021,8 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, if ((x->sel.family && (x->sel.family != family || !xfrm_selector_match(&x->sel, fl, family))) || - !security_xfrm_state_pol_flow_match(x, pol, fl)) + !security_xfrm_state_pol_flow_match(x, pol, + &fl->u.__fl_common)) return; if (!*best || @@ -1036,7 +1037,8 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, if ((!x->sel.family || (x->sel.family == family && xfrm_selector_match(&x->sel, fl, family))) && - security_xfrm_state_pol_flow_match(x, pol, fl)) + security_xfrm_state_pol_flow_match(x, pol, + &fl->u.__fl_common)) *error = -ESRCH; } } @@ -2382,8 +2384,10 @@ int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen) if (in_compat_syscall()) { struct xfrm_translator *xtr = xfrm_get_translator(); - if (!xtr) + if (!xtr) { + kfree(data); return -EOPNOTSUPP; + } err = xtr->xlate_user_policy_sockptr(&data, optlen); xfrm_put_translator(xtr); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d0c32a8fcc4a..5a0ef4361e43 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -848,21 +848,84 @@ static int copy_user_offload(struct xfrm_state_offload *xso, struct sk_buff *skb return 0; } +static bool xfrm_redact(void) +{ + 
return IS_ENABLED(CONFIG_SECURITY) && + security_locked_down(LOCKDOWN_XFRM_SECRET); +} + static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) { struct xfrm_algo *algo; + struct xfrm_algo_auth *ap; struct nlattr *nla; + bool redact_secret = xfrm_redact(); nla = nla_reserve(skb, XFRMA_ALG_AUTH, sizeof(*algo) + (auth->alg_key_len + 7) / 8); if (!nla) return -EMSGSIZE; - algo = nla_data(nla); strncpy(algo->alg_name, auth->alg_name, sizeof(algo->alg_name)); - memcpy(algo->alg_key, auth->alg_key, (auth->alg_key_len + 7) / 8); + + if (redact_secret && auth->alg_key_len) + memset(algo->alg_key, 0, (auth->alg_key_len + 7) / 8); + else + memcpy(algo->alg_key, auth->alg_key, + (auth->alg_key_len + 7) / 8); algo->alg_key_len = auth->alg_key_len; + nla = nla_reserve(skb, XFRMA_ALG_AUTH_TRUNC, xfrm_alg_auth_len(auth)); + if (!nla) + return -EMSGSIZE; + ap = nla_data(nla); + memcpy(ap, auth, sizeof(struct xfrm_algo_auth)); + if (redact_secret && auth->alg_key_len) + memset(ap->alg_key, 0, (auth->alg_key_len + 7) / 8); + else + memcpy(ap->alg_key, auth->alg_key, + (auth->alg_key_len + 7) / 8); + return 0; +} + +static int copy_to_user_aead(struct xfrm_algo_aead *aead, struct sk_buff *skb) +{ + struct nlattr *nla = nla_reserve(skb, XFRMA_ALG_AEAD, aead_len(aead)); + struct xfrm_algo_aead *ap; + bool redact_secret = xfrm_redact(); + + if (!nla) + return -EMSGSIZE; + + ap = nla_data(nla); + memcpy(ap, aead, sizeof(*aead)); + + if (redact_secret && aead->alg_key_len) + memset(ap->alg_key, 0, (aead->alg_key_len + 7) / 8); + else + memcpy(ap->alg_key, aead->alg_key, + (aead->alg_key_len + 7) / 8); + return 0; +} + +static int copy_to_user_ealg(struct xfrm_algo *ealg, struct sk_buff *skb) +{ + struct xfrm_algo *ap; + bool redact_secret = xfrm_redact(); + struct nlattr *nla = nla_reserve(skb, XFRMA_ALG_CRYPT, + xfrm_alg_len(ealg)); + if (!nla) + return -EMSGSIZE; + + ap = nla_data(nla); + memcpy(ap, ealg, sizeof(*ealg)); + + if (redact_secret && 
ealg->alg_key_len) + memset(ap->alg_key, 0, (ealg->alg_key_len + 7) / 8); + else + memcpy(ap->alg_key, ealg->alg_key, + (ealg->alg_key_len + 7) / 8); + return 0; } @@ -906,20 +969,17 @@ static int copy_to_user_state_extra(struct xfrm_state *x, goto out; } if (x->aead) { - ret = nla_put(skb, XFRMA_ALG_AEAD, aead_len(x->aead), x->aead); + ret = copy_to_user_aead(x->aead, skb); if (ret) goto out; } if (x->aalg) { ret = copy_to_user_auth(x->aalg, skb); - if (!ret) - ret = nla_put(skb, XFRMA_ALG_AUTH_TRUNC, - xfrm_alg_auth_len(x->aalg), x->aalg); if (ret) goto out; } if (x->ealg) { - ret = nla_put(skb, XFRMA_ALG_CRYPT, xfrm_alg_len(x->ealg), x->ealg); + ret = copy_to_user_ealg(x->ealg, skb); if (ret) goto out; } @@ -2444,7 +2504,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]), sizeof(*encap), GFP_KERNEL); if (!encap) - return 0; + return -ENOMEM; } err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap); |