summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/802/garp.c2
-rw-r--r--net/802/mrp.c20
-rw-r--r--net/8021q/vlan_dev.c4
-rw-r--r--net/9p/trans_fd.c28
-rw-r--r--net/9p/trans_xen.c9
-rw-r--r--net/atm/mpoa_proc.c3
-rw-r--r--net/ax25/af_ax25.c4
-rw-r--r--net/batman-adv/netlink.c6
-rw-r--r--net/bluetooth/hci_conn.c18
-rw-r--r--net/bluetooth/iso.c14
-rw-r--r--net/bluetooth/l2cap_core.c86
-rw-r--r--net/bpf/bpf_dummy_struct_ops.c14
-rw-r--r--net/bpf/test_run.c4
-rw-r--r--net/bridge/br.c5
-rw-r--r--net/bridge/br_fdb.c46
-rw-r--r--net/bridge/br_input.c21
-rw-r--r--net/bridge/br_mdb.c11
-rw-r--r--net/bridge/br_multicast.c12
-rw-r--r--net/bridge/br_netlink.c23
-rw-r--r--net/bridge/br_private.h5
-rw-r--r--net/bridge/br_switchdev.c6
-rw-r--r--net/bridge/br_sysfs_br.c2
-rw-r--r--net/bridge/br_vlan.c21
-rw-r--r--net/caif/chnl_net.c3
-rw-r--r--net/can/af_can.c3
-rw-r--r--net/can/isotp.c71
-rw-r--r--net/can/j1939/main.c3
-rw-r--r--net/can/j1939/transport.c6
-rw-r--r--net/ceph/mon_client.c2
-rw-r--r--net/ceph/osd_client.c2
-rw-r--r--net/core/bpf_sk_storage.c39
-rw-r--r--net/core/dev.c173
-rw-r--r--net/core/dev.h7
-rw-r--r--net/core/dev_ioctl.c2
-rw-r--r--net/core/devlink.c351
-rw-r--r--net/core/drop_monitor.c12
-rw-r--r--net/core/failover.c6
-rw-r--r--net/core/filter.c118
-rw-r--r--net/core/flow_dissector.c6
-rw-r--r--net/core/flow_offload.c7
-rw-r--r--net/core/gen_stats.c16
-rw-r--r--net/core/gro.c74
-rw-r--r--net/core/link_watch.c20
-rw-r--r--net/core/lwtunnel.c4
-rw-r--r--net/core/neighbour.c60
-rw-r--r--net/core/net-sysfs.c4
-rw-r--r--net/core/net_namespace.c12
-rw-r--r--net/core/of_net.c5
-rw-r--r--net/core/pktgen.c47
-rw-r--r--net/core/rtnetlink.c90
-rw-r--r--net/core/skbuff.c133
-rw-r--r--net/core/skmsg.c15
-rw-r--r--net/core/sock.c20
-rw-r--r--net/core/sock_diag.c15
-rw-r--r--net/core/sock_map.c7
-rw-r--r--net/core/sock_reuseport.c110
-rw-r--r--net/core/stream.c2
-rw-r--r--net/core/utils.c4
-rw-r--r--net/dcb/dcbnl.c153
-rw-r--r--net/dccp/dccp.h1
-rw-r--r--net/dccp/ipv4.c27
-rw-r--r--net/dccp/ipv6.c39
-rw-r--r--net/dccp/proto.c11
-rw-r--r--net/dsa/Kconfig6
-rw-r--r--net/dsa/Makefile4
-rw-r--r--net/dsa/devlink.c391
-rw-r--r--net/dsa/devlink.h16
-rw-r--r--net/dsa/dsa.c1727
-rw-r--r--net/dsa/dsa.h40
-rw-r--r--net/dsa/dsa2.c1812
-rw-r--r--net/dsa/dsa_priv.h587
-rw-r--r--net/dsa/master.c28
-rw-r--r--net/dsa/master.h19
-rw-r--r--net/dsa/netlink.c3
-rw-r--r--net/dsa/netlink.h8
-rw-r--r--net/dsa/port.c40
-rw-r--r--net/dsa/port.h114
-rw-r--r--net/dsa/slave.c77
-rw-r--r--net/dsa/slave.h69
-rw-r--r--net/dsa/switch.c53
-rw-r--r--net/dsa/switch.h120
-rw-r--r--net/dsa/tag.c243
-rw-r--r--net/dsa/tag.h310
-rw-r--r--net/dsa/tag_8021q.c19
-rw-r--r--net/dsa/tag_8021q.h27
-rw-r--r--net/dsa/tag_ar9331.c8
-rw-r--r--net/dsa/tag_brcm.c18
-rw-r--r--net/dsa/tag_dsa.c13
-rw-r--r--net/dsa/tag_gswip.c8
-rw-r--r--net/dsa/tag_hellcreek.c8
-rw-r--r--net/dsa/tag_ksz.c24
-rw-r--r--net/dsa/tag_lan9303.c8
-rw-r--r--net/dsa/tag_mtk.c10
-rw-r--r--net/dsa/tag_none.c30
-rw-r--r--net/dsa/tag_ocelot.c14
-rw-r--r--net/dsa/tag_ocelot_8021q.c10
-rw-r--r--net/dsa/tag_qca.c8
-rw-r--r--net/dsa/tag_rtl4_a.c8
-rw-r--r--net/dsa/tag_rtl8_4.c9
-rw-r--r--net/dsa/tag_rzn1_a5psw.c8
-rw-r--r--net/dsa/tag_sja1105.c15
-rw-r--r--net/dsa/tag_trailer.c8
-rw-r--r--net/dsa/tag_xrs700x.c8
-rw-r--r--net/ethtool/channels.c19
-rw-r--r--net/ethtool/common.c80
-rw-r--r--net/ethtool/common.h1
-rw-r--r--net/ethtool/eeprom.c2
-rw-r--r--net/ethtool/ioctl.c44
-rw-r--r--net/ethtool/linkstate.c24
-rw-r--r--net/ethtool/pse-pd.c2
-rw-r--r--net/hsr/hsr_forward.c17
-rw-r--r--net/ieee802154/core.c3
-rw-r--r--net/ieee802154/nl802154.c6
-rw-r--r--net/ieee802154/socket.c4
-rw-r--r--net/ipv4/Kconfig10
-rw-r--r--net/ipv4/Makefile2
-rw-r--r--net/ipv4/af_inet.c21
-rw-r--r--net/ipv4/bpf_tcp_ca.c17
-rw-r--r--net/ipv4/datagram.c4
-rw-r--r--net/ipv4/esp4_offload.c3
-rw-r--r--net/ipv4/fib_frontend.c4
-rw-r--r--net/ipv4/fib_semantics.c10
-rw-r--r--net/ipv4/fib_trie.c6
-rw-r--r--net/ipv4/igmp.c6
-rw-r--r--net/ipv4/inet_connection_sock.c9
-rw-r--r--net/ipv4/inet_fragment.c14
-rw-r--r--net/ipv4/inet_hashtables.c96
-rw-r--r--net/ipv4/ip_fragment.c19
-rw-r--r--net/ipv4/ip_gre.c12
-rw-r--r--net/ipv4/ip_input.c5
-rw-r--r--net/ipv4/ip_output.c2
-rw-r--r--net/ipv4/ip_sockglue.c3
-rw-r--r--net/ipv4/ip_tunnel.c32
-rw-r--r--net/ipv4/ip_vti.c20
-rw-r--r--net/ipv4/ipip.c2
-rw-r--r--net/ipv4/ipmr.c12
-rw-r--r--net/ipv4/metrics.c3
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c4
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c1
-rw-r--r--net/ipv4/netfilter/nft_dup_ipv4.c3
-rw-r--r--net/ipv4/netfilter/nft_fib_ipv4.c6
-rw-r--r--net/ipv4/nexthop.c2
-rw-r--r--net/ipv4/ping.c2
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/ipv4/route.c4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c83
-rw-r--r--net/ipv4/tcp.c11
-rw-r--r--net/ipv4/tcp_bpf.c12
-rw-r--r--net/ipv4/tcp_cdg.c2
-rw-r--r--net/ipv4/tcp_dctcp.c23
-rw-r--r--net/ipv4/tcp_input.c70
-rw-r--r--net/ipv4/tcp_ipv4.c38
-rw-r--r--net/ipv4/tcp_output.c37
-rw-r--r--net/ipv4/tcp_plb.c109
-rw-r--r--net/ipv4/tcp_ulp.c3
-rw-r--r--net/ipv4/udp.c222
-rw-r--r--net/ipv4/udp_bpf.c4
-rw-r--r--net/ipv4/udp_diag.c6
-rw-r--r--net/ipv4/udp_offload.c5
-rw-r--r--net/ipv4/udp_tunnel_nic.c2
-rw-r--r--net/ipv6/addrconf.c10
-rw-r--r--net/ipv6/addrlabel.c1
-rw-r--r--net/ipv6/af_inet6.c13
-rw-r--r--net/ipv6/datagram.c5
-rw-r--r--net/ipv6/esp6_offload.c3
-rw-r--r--net/ipv6/ip6_fib.c7
-rw-r--r--net/ipv6/ip6_flowlabel.c2
-rw-r--r--net/ipv6/ip6_gre.c23
-rw-r--r--net/ipv6/ip6_tunnel.c37
-rw-r--r--net/ipv6/ip6_vti.c16
-rw-r--r--net/ipv6/ip6mr.c10
-rw-r--r--net/ipv6/ipv6_sockglue.c6
-rw-r--r--net/ipv6/mcast.c10
-rw-r--r--net/ipv6/netfilter/ip6t_rpfilter.c1
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c2
-rw-r--r--net/ipv6/netfilter/nft_dup_ipv6.c3
-rw-r--r--net/ipv6/netfilter/nft_fib_ipv6.c2
-rw-r--r--net/ipv6/output_core.c2
-rw-r--r--net/ipv6/ping.c6
-rw-r--r--net/ipv6/raw.c2
-rw-r--r--net/ipv6/reassembly.c13
-rw-r--r--net/ipv6/route.c14
-rw-r--r--net/ipv6/seg6_local.c4
-rw-r--r--net/ipv6/sit.c30
-rw-r--r--net/ipv6/tcp_ipv6.c28
-rw-r--r--net/ipv6/udp.c44
-rw-r--r--net/ipv6/udp_offload.c5
-rw-r--r--net/ipv6/xfrm6_policy.c6
-rw-r--r--net/kcm/kcmsock.c83
-rw-r--r--net/key/af_key.c34
-rw-r--r--net/l2tp/l2tp_core.c22
-rw-r--r--net/l2tp/l2tp_ip6.c2
-rw-r--r--net/mac80211/agg-rx.c25
-rw-r--r--net/mac80211/agg-tx.c2
-rw-r--r--net/mac80211/airtime.c3
-rw-r--r--net/mac80211/cfg.c43
-rw-r--r--net/mac80211/debugfs.c4
-rw-r--r--net/mac80211/debugfs_netdev.c3
-rw-r--r--net/mac80211/debugfs_sta.c148
-rw-r--r--net/mac80211/debugfs_sta.h12
-rw-r--r--net/mac80211/driver-ops.c27
-rw-r--r--net/mac80211/driver-ops.h16
-rw-r--r--net/mac80211/ieee80211_i.h22
-rw-r--r--net/mac80211/iface.c69
-rw-r--r--net/mac80211/link.c17
-rw-r--r--net/mac80211/main.c31
-rw-r--r--net/mac80211/mesh_pathtbl.c2
-rw-r--r--net/mac80211/mlme.c131
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c5
-rw-r--r--net/mac80211/rc80211_minstrel_ht.h1
-rw-r--r--net/mac80211/rx.c3
-rw-r--r--net/mac80211/s1g.c3
-rw-r--r--net/mac80211/scan.c2
-rw-r--r--net/mac80211/sta_info.c118
-rw-r--r--net/mac80211/sta_info.h7
-rw-r--r--net/mac80211/tdls.c1
-rw-r--r--net/mac80211/tx.c35
-rw-r--r--net/mac80211/util.c246
-rw-r--r--net/mac80211/wme.c63
-rw-r--r--net/mac80211/wme.h4
-rw-r--r--net/mac802154/cfg.c6
-rw-r--r--net/mac802154/driver-ops.h253
-rw-r--r--net/mac802154/ieee802154_i.h56
-rw-r--r--net/mac802154/iface.c44
-rw-r--r--net/mac802154/main.c2
-rw-r--r--net/mac802154/rx.c34
-rw-r--r--net/mac802154/tx.c132
-rw-r--r--net/mac802154/util.c71
-rw-r--r--net/mctp/af_mctp.c4
-rw-r--r--net/mctp/route.c2
-rw-r--r--net/mpls/af_mpls.c4
-rw-r--r--net/mptcp/Makefile2
-rw-r--r--net/mptcp/fastopen.c73
-rw-r--r--net/mptcp/options.c25
-rw-r--r--net/mptcp/pm_netlink.c59
-rw-r--r--net/mptcp/pm_userspace.c4
-rw-r--r--net/mptcp/protocol.c279
-rw-r--r--net/mptcp/protocol.h33
-rw-r--r--net/mptcp/sockopt.c43
-rw-r--r--net/mptcp/subflow.c120
-rw-r--r--net/mptcp/token.c4
-rw-r--r--net/netfilter/Kconfig6
-rw-r--r--net/netfilter/Makefile4
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h30
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c10
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c32
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_twos.c4
-rw-r--r--net/netfilter/nf_conntrack_bpf.c17
-rw-r--r--net/netfilter/nf_conntrack_core.c32
-rw-r--r--net/netfilter/nf_conntrack_helper.c100
-rw-r--r--net/netfilter/nf_conntrack_netlink.c24
-rw-r--r--net/netfilter/nf_conntrack_standalone.c2
-rw-r--r--net/netfilter/nf_flow_table_offload.c4
-rw-r--r--net/netfilter/nf_nat_core.c15
-rw-r--r--net/netfilter/nf_tables_api.c112
-rw-r--r--net/netfilter/nf_tables_core.c2
-rw-r--r--net/netfilter/nfnetlink.c1
-rw-r--r--net/netfilter/nft_bitwise.c6
-rw-r--r--net/netfilter/nft_byteorder.c3
-rw-r--r--net/netfilter/nft_cmp.c9
-rw-r--r--net/netfilter/nft_compat.c9
-rw-r--r--net/netfilter/nft_connlimit.c3
-rw-r--r--net/netfilter/nft_counter.c5
-rw-r--r--net/netfilter/nft_ct.c12
-rw-r--r--net/netfilter/nft_dup_netdev.c3
-rw-r--r--net/netfilter/nft_dynset.c7
-rw-r--r--net/netfilter/nft_exthdr.c10
-rw-r--r--net/netfilter/nft_fib.c2
-rw-r--r--net/netfilter/nft_flow_offload.c3
-rw-r--r--net/netfilter/nft_fwd_netdev.c6
-rw-r--r--net/netfilter/nft_hash.c4
-rw-r--r--net/netfilter/nft_immediate.c3
-rw-r--r--net/netfilter/nft_inner.c385
-rw-r--r--net/netfilter/nft_last.c3
-rw-r--r--net/netfilter/nft_limit.c5
-rw-r--r--net/netfilter/nft_log.c3
-rw-r--r--net/netfilter/nft_lookup.c3
-rw-r--r--net/netfilter/nft_masq.c3
-rw-r--r--net/netfilter/nft_meta.c67
-rw-r--r--net/netfilter/nft_nat.c3
-rw-r--r--net/netfilter/nft_numgen.c6
-rw-r--r--net/netfilter/nft_objref.c28
-rw-r--r--net/netfilter/nft_osf.c3
-rw-r--r--net/netfilter/nft_payload.c147
-rw-r--r--net/netfilter/nft_queue.c6
-rw-r--r--net/netfilter/nft_quota.c5
-rw-r--r--net/netfilter/nft_range.c3
-rw-r--r--net/netfilter/nft_redir.c3
-rw-r--r--net/netfilter/nft_reject.c3
-rw-r--r--net/netfilter/nft_rt.c2
-rw-r--r--net/netfilter/nft_socket.c2
-rw-r--r--net/netfilter/nft_synproxy.c3
-rw-r--r--net/netfilter/nft_tproxy.c2
-rw-r--r--net/netfilter/nft_tunnel.c2
-rw-r--r--net/netfilter/nft_xfrm.c2
-rw-r--r--net/netfilter/xt_connmark.c18
-rw-r--r--net/netfilter/xt_sctp.c1
-rw-r--r--net/netfilter/xt_statistic.c2
-rw-r--r--net/netlink/af_netlink.c42
-rw-r--r--net/netlink/genetlink.c516
-rw-r--r--net/nfc/nci/core.c10
-rw-r--r--net/nfc/nci/data.c4
-rw-r--r--net/nfc/nci/hci.c4
-rw-r--r--net/nfc/rawsock.c3
-rw-r--r--net/openvswitch/actions.c2
-rw-r--r--net/openvswitch/conntrack.c113
-rw-r--r--net/openvswitch/datapath.c8
-rw-r--r--net/openvswitch/flow_netlink.c2
-rw-r--r--net/openvswitch/flow_table.c9
-rw-r--r--net/openvswitch/vport-geneve.c2
-rw-r--r--net/openvswitch/vport-gre.c2
-rw-r--r--net/openvswitch/vport-netdev.c2
-rw-r--r--net/openvswitch/vport-vxlan.c2
-rw-r--r--net/packet/af_packet.c19
-rw-r--r--net/rds/bind.c2
-rw-r--r--net/rds/message.c2
-rw-r--r--net/rds/send.c3
-rw-r--r--net/rds/tcp.c3
-rw-r--r--net/rose/rose_link.c3
-rw-r--r--net/rxrpc/Makefile1
-rw-r--r--net/rxrpc/af_rxrpc.c14
-rw-r--r--net/rxrpc/ar-internal.h225
-rw-r--r--net/rxrpc/call_accept.c8
-rw-r--r--net/rxrpc/call_event.c427
-rw-r--r--net/rxrpc/call_object.c63
-rw-r--r--net/rxrpc/conn_client.c41
-rw-r--r--net/rxrpc/conn_object.c4
-rw-r--r--net/rxrpc/input.c770
-rw-r--r--net/rxrpc/insecure.c16
-rw-r--r--net/rxrpc/local_object.c21
-rw-r--r--net/rxrpc/misc.c23
-rw-r--r--net/rxrpc/net_ns.c2
-rw-r--r--net/rxrpc/output.c398
-rw-r--r--net/rxrpc/peer_event.c282
-rw-r--r--net/rxrpc/peer_object.c7
-rw-r--r--net/rxrpc/proc.c110
-rw-r--r--net/rxrpc/protocol.h9
-rw-r--r--net/rxrpc/recvmsg.c268
-rw-r--r--net/rxrpc/rxkad.c251
-rw-r--r--net/rxrpc/sendmsg.c218
-rw-r--r--net/rxrpc/skbuff.c20
-rw-r--r--net/rxrpc/sysctl.c11
-rw-r--r--net/rxrpc/txbuf.c135
-rw-r--r--net/sched/Kconfig2
-rw-r--r--net/sched/act_connmark.c4
-rw-r--r--net/sched/act_ct.c132
-rw-r--r--net/sched/act_ctinfo.c6
-rw-r--r--net/sched/act_gact.c2
-rw-r--r--net/sched/act_sample.c2
-rw-r--r--net/sched/act_skbedit.c14
-rw-r--r--net/sched/cls_api.c7
-rw-r--r--net/sched/sch_api.c5
-rw-r--r--net/sched/sch_cake.c12
-rw-r--r--net/sched/sch_fq_codel.c25
-rw-r--r--net/sched/sch_netem.c22
-rw-r--r--net/sched/sch_pie.c2
-rw-r--r--net/sched/sch_red.c4
-rw-r--r--net/sched/sch_sfb.c5
-rw-r--r--net/sctp/associola.c4
-rw-r--r--net/sctp/diag.c3
-rw-r--r--net/sctp/endpointola.c13
-rw-r--r--net/sctp/input.c108
-rw-r--r--net/sctp/ipv6.c22
-rw-r--r--net/sctp/outqueue.c13
-rw-r--r--net/sctp/protocol.c19
-rw-r--r--net/sctp/sm_statefuns.c2
-rw-r--r--net/sctp/socket.c42
-rw-r--r--net/sctp/stream.c25
-rw-r--r--net/sctp/stream_interleave.c12
-rw-r--r--net/sctp/stream_sched.c5
-rw-r--r--net/sctp/stream_sched_prio.c19
-rw-r--r--net/sctp/stream_sched_rr.c5
-rw-r--r--net/sctp/sysctl.c11
-rw-r--r--net/sctp/ulpqueue.c10
-rw-r--r--net/smc/af_smc.c6
-rw-r--r--net/smc/smc_core.c3
-rw-r--r--net/socket.c8
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c2
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_wrap.c4
-rw-r--r--net/sunrpc/cache.c2
-rw-r--r--net/sunrpc/sysfs.c12
-rw-r--r--net/sunrpc/xprt.c2
-rw-r--r--net/sunrpc/xprtsock.c2
-rw-r--r--net/tipc/crypto.c3
-rw-r--r--net/tipc/discover.c7
-rw-r--r--net/tipc/netlink_compat.c2
-rw-r--r--net/tipc/socket.c2
-rw-r--r--net/tipc/topsrv.c38
-rw-r--r--net/tls/tls_device_fallback.c5
-rw-r--r--net/tls/tls_strp.c32
-rw-r--r--net/unix/af_unix.c2
-rw-r--r--net/unix/garbage.c20
-rw-r--r--net/unix/unix_bpf.c8
-rw-r--r--net/vmw_vsock/af_vsock.c7
-rw-r--r--net/wireless/core.h5
-rw-r--r--net/wireless/mlme.c4
-rw-r--r--net/wireless/nl80211.c23
-rw-r--r--net/wireless/nl80211.h3
-rw-r--r--net/wireless/reg.c12
-rw-r--r--net/wireless/scan.c17
-rw-r--r--net/wireless/sme.c26
-rw-r--r--net/wireless/util.c10
-rw-r--r--net/wireless/wext-compat.c180
-rw-r--r--net/wireless/wext-compat.h8
-rw-r--r--net/wireless/wext-sme.c5
-rw-r--r--net/x25/x25_dev.c2
-rw-r--r--net/xdp/xskmap.c4
-rw-r--r--net/xfrm/xfrm_device.c15
-rw-r--r--net/xfrm/xfrm_replay.c2
-rw-r--r--net/xfrm/xfrm_state.c2
412 files changed, 10910 insertions, 7286 deletions
diff --git a/net/802/garp.c b/net/802/garp.c
index f6012f8e59f0..fc9eb02a912f 100644
--- a/net/802/garp.c
+++ b/net/802/garp.c
@@ -407,7 +407,7 @@ static void garp_join_timer_arm(struct garp_applicant *app)
{
unsigned long delay;
- delay = (u64)msecs_to_jiffies(garp_join_time) * prandom_u32() >> 32;
+ delay = prandom_u32_max(msecs_to_jiffies(garp_join_time));
mod_timer(&app->join_timer, jiffies + delay);
}
diff --git a/net/802/mrp.c b/net/802/mrp.c
index 35e04cc5390c..6c927d4b35f0 100644
--- a/net/802/mrp.c
+++ b/net/802/mrp.c
@@ -592,7 +592,7 @@ static void mrp_join_timer_arm(struct mrp_applicant *app)
{
unsigned long delay;
- delay = (u64)msecs_to_jiffies(mrp_join_time) * prandom_u32() >> 32;
+ delay = prandom_u32_max(msecs_to_jiffies(mrp_join_time));
mod_timer(&app->join_timer, jiffies + delay);
}
@@ -606,7 +606,10 @@ static void mrp_join_timer(struct timer_list *t)
spin_unlock(&app->lock);
mrp_queue_xmit(app);
- mrp_join_timer_arm(app);
+ spin_lock(&app->lock);
+ if (likely(app->active))
+ mrp_join_timer_arm(app);
+ spin_unlock(&app->lock);
}
static void mrp_periodic_timer_arm(struct mrp_applicant *app)
@@ -620,11 +623,12 @@ static void mrp_periodic_timer(struct timer_list *t)
struct mrp_applicant *app = from_timer(app, t, periodic_timer);
spin_lock(&app->lock);
- mrp_mad_event(app, MRP_EVENT_PERIODIC);
- mrp_pdu_queue(app);
+ if (likely(app->active)) {
+ mrp_mad_event(app, MRP_EVENT_PERIODIC);
+ mrp_pdu_queue(app);
+ mrp_periodic_timer_arm(app);
+ }
spin_unlock(&app->lock);
-
- mrp_periodic_timer_arm(app);
}
static int mrp_pdu_parse_end_mark(struct sk_buff *skb, int *offset)
@@ -872,6 +876,7 @@ int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl)
app->dev = dev;
app->app = appl;
app->mad = RB_ROOT;
+ app->active = true;
spin_lock_init(&app->lock);
skb_queue_head_init(&app->queue);
rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app);
@@ -900,6 +905,9 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl)
RCU_INIT_POINTER(port->applicants[appl->type], NULL);
+ spin_lock_bh(&app->lock);
+ app->active = false;
+ spin_unlock_bh(&app->lock);
/* Delete timer and generate a final TX event to flush out
* all pending messages before the applicant is gone.
*/
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index e1bb41a443c4..296d0145932f 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -712,13 +712,13 @@ static void vlan_dev_get_stats64(struct net_device *dev,
p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i);
do {
- start = u64_stats_fetch_begin_irq(&p->syncp);
+ start = u64_stats_fetch_begin(&p->syncp);
rxpackets = u64_stats_read(&p->rx_packets);
rxbytes = u64_stats_read(&p->rx_bytes);
rxmulticast = u64_stats_read(&p->rx_multicast);
txpackets = u64_stats_read(&p->tx_packets);
txbytes = u64_stats_read(&p->tx_bytes);
- } while (u64_stats_fetch_retry_irq(&p->syncp, start));
+ } while (u64_stats_fetch_retry(&p->syncp, start));
stats->rx_packets += rxpackets;
stats->rx_bytes += rxbytes;
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 56a186768750..07db2f436d44 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -120,7 +120,7 @@ struct p9_conn {
struct list_head unsent_req_list;
struct p9_req_t *rreq;
struct p9_req_t *wreq;
- char tmp_buf[7];
+ char tmp_buf[P9_HDRSZ];
struct p9_fcall rc;
int wpos;
int wsize;
@@ -202,9 +202,11 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) {
list_move(&req->req_list, &cancel_list);
+ req->status = REQ_STATUS_ERROR;
}
list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) {
list_move(&req->req_list, &cancel_list);
+ req->status = REQ_STATUS_ERROR;
}
spin_unlock(&m->req_lock);
@@ -291,7 +293,7 @@ static void p9_read_work(struct work_struct *work)
if (!m->rc.sdata) {
m->rc.sdata = m->tmp_buf;
m->rc.offset = 0;
- m->rc.capacity = 7; /* start by reading header */
+ m->rc.capacity = P9_HDRSZ; /* start by reading header */
}
clear_bit(Rpending, &m->wsched);
@@ -314,7 +316,7 @@ static void p9_read_work(struct work_struct *work)
p9_debug(P9_DEBUG_TRANS, "got new header\n");
/* Header size */
- m->rc.size = 7;
+ m->rc.size = P9_HDRSZ;
err = p9_parse_header(&m->rc, &m->rc.size, NULL, NULL, 0);
if (err) {
p9_debug(P9_DEBUG_ERROR,
@@ -322,14 +324,6 @@ static void p9_read_work(struct work_struct *work)
goto error;
}
- if (m->rc.size >= m->client->msize) {
- p9_debug(P9_DEBUG_ERROR,
- "requested packet size too big: %d\n",
- m->rc.size);
- err = -EIO;
- goto error;
- }
-
p9_debug(P9_DEBUG_TRANS,
"mux %p pkt: size: %d bytes tag: %d\n",
m, m->rc.size, m->rc.tag);
@@ -342,6 +336,14 @@ static void p9_read_work(struct work_struct *work)
goto error;
}
+ if (m->rc.size > m->rreq->rc.capacity) {
+ p9_debug(P9_DEBUG_ERROR,
+ "requested packet size too big: %d for tag %d with capacity %zd\n",
+ m->rc.size, m->rc.tag, m->rreq->rc.capacity);
+ err = -EIO;
+ goto error;
+ }
+
if (!m->rreq->rc.sdata) {
p9_debug(P9_DEBUG_ERROR,
"No recv fcall for tag %d (req %p), disconnecting!\n",
@@ -860,8 +862,10 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket)
struct file *file;
p = kzalloc(sizeof(struct p9_trans_fd), GFP_KERNEL);
- if (!p)
+ if (!p) {
+ sock_release(csocket);
return -ENOMEM;
+ }
csocket->sk->sk_allocation = GFP_NOIO;
file = sock_alloc_file(csocket, 0, NULL);
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index b15c64128c3e..aaa5fd364691 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -208,6 +208,14 @@ static void p9_xen_response(struct work_struct *work)
continue;
}
+ if (h.size > req->rc.capacity) {
+ dev_warn(&priv->dev->dev,
+ "requested packet size too big: %d for tag %d with capacity %zd\n",
+ h.size, h.tag, req->rc.capacity);
+ req->status = REQ_STATUS_ERROR;
+ goto recv_error;
+ }
+
memcpy(&req->rc, &h, sizeof(h));
req->rc.offset = 0;
@@ -217,6 +225,7 @@ static void p9_xen_response(struct work_struct *work)
masked_prod, &masked_cons,
XEN_9PFS_RING_SIZE(ring));
+recv_error:
virt_mb();
cons += h.size;
ring->intf->in_cons = cons;
diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c
index 829db9eba0cb..aaf64b953915 100644
--- a/net/atm/mpoa_proc.c
+++ b/net/atm/mpoa_proc.c
@@ -219,11 +219,12 @@ static ssize_t proc_mpc_write(struct file *file, const char __user *buff,
if (!page)
return -ENOMEM;
- for (p = page, len = 0; len < nbytes; p++, len++) {
+ for (p = page, len = 0; len < nbytes; p++) {
if (get_user(*p, buff++)) {
free_page((unsigned long)page);
return -EFAULT;
}
+ len += 1;
if (*p == '\0' || *p == '\n')
break;
}
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 6b4c25a92377..d8da400cb4de 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -723,7 +723,7 @@ static int ax25_getsockopt(struct socket *sock, int level, int optname,
if (maxlen < 1)
return -EFAULT;
- valptr = (void *) &val;
+ valptr = &val;
length = min_t(unsigned int, maxlen, sizeof(int));
lock_sock(sk);
@@ -785,7 +785,7 @@ static int ax25_getsockopt(struct socket *sock, int level, int optname,
length = 1;
}
- valptr = (void *) devname;
+ valptr = devname;
break;
default:
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index a5e4a4e976cf..ad5714f737be 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -1267,7 +1267,8 @@ batadv_get_vlan_from_info(struct batadv_priv *bat_priv, struct net *net,
*
* Return: 0 on success or negative error number in case of failure
*/
-static int batadv_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
+static int batadv_pre_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
struct genl_info *info)
{
struct net *net = genl_info_net(info);
@@ -1332,7 +1333,8 @@ err_put_softif:
* @skb: Netlink message with request data
* @info: receiver information
*/
-static void batadv_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
+static void batadv_post_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
struct genl_info *info)
{
struct batadv_hard_iface *hard_iface;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 7a59c4487050..a6c12863a253 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1067,10 +1067,21 @@ int hci_conn_del(struct hci_conn *conn)
hdev->acl_cnt += conn->sent;
} else {
struct hci_conn *acl = conn->link;
+
if (acl) {
acl->link = NULL;
hci_conn_drop(acl);
}
+
+ /* Unacked ISO frames */
+ if (conn->type == ISO_LINK) {
+ if (hdev->iso_pkts)
+ hdev->iso_cnt += conn->sent;
+ else if (hdev->le_pkts)
+ hdev->le_cnt += conn->sent;
+ else
+ hdev->acl_cnt += conn->sent;
+ }
}
if (conn->amp_mgr)
@@ -1761,6 +1772,7 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
if (!cis)
return ERR_PTR(-ENOMEM);
cis->cleanup = cis_cleanup;
+ cis->dst_type = dst_type;
}
if (cis->state == BT_CONNECTED)
@@ -2140,12 +2152,6 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
struct hci_conn *le;
struct hci_conn *cis;
- /* Convert from ISO socket address type to HCI address type */
- if (dst_type == BDADDR_LE_PUBLIC)
- dst_type = ADDR_LE_DEV_PUBLIC;
- else
- dst_type = ADDR_LE_DEV_RANDOM;
-
if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
le = hci_connect_le(hdev, dst, dst_type, false,
BT_SECURITY_LOW,
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 613039ba5dbf..f825857db6d0 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -235,6 +235,14 @@ static int iso_chan_add(struct iso_conn *conn, struct sock *sk,
return err;
}
+static inline u8 le_addr_type(u8 bdaddr_type)
+{
+ if (bdaddr_type == BDADDR_LE_PUBLIC)
+ return ADDR_LE_DEV_PUBLIC;
+ else
+ return ADDR_LE_DEV_RANDOM;
+}
+
static int iso_connect_bis(struct sock *sk)
{
struct iso_conn *conn;
@@ -328,14 +336,16 @@ static int iso_connect_cis(struct sock *sk)
/* Just bind if DEFER_SETUP has been set */
if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst,
- iso_pi(sk)->dst_type, &iso_pi(sk)->qos);
+ le_addr_type(iso_pi(sk)->dst_type),
+ &iso_pi(sk)->qos);
if (IS_ERR(hcon)) {
err = PTR_ERR(hcon);
goto done;
}
} else {
hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst,
- iso_pi(sk)->dst_type, &iso_pi(sk)->qos);
+ le_addr_type(iso_pi(sk)->dst_type),
+ &iso_pi(sk)->qos);
if (IS_ERR(hcon)) {
err = PTR_ERR(hcon);
goto done;
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 1f34b82ca0ec..9c24947aa41e 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -1990,7 +1990,7 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm,
if (link_type == LE_LINK && c->src_type == BDADDR_BREDR)
continue;
- if (c->psm == psm) {
+ if (c->chan_type != L2CAP_CHAN_FIXED && c->psm == psm) {
int src_match, dst_match;
int src_any, dst_any;
@@ -3764,7 +3764,8 @@ done:
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
sizeof(rfc), (unsigned long) &rfc, endptr - ptr);
- if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) {
+ if (remote_efs &&
+ test_bit(FLAG_EFS_ENABLE, &chan->flags)) {
chan->remote_id = efs.id;
chan->remote_stype = efs.stype;
chan->remote_msdu = le16_to_cpu(efs.msdu);
@@ -5813,6 +5814,19 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
BT_DBG("psm 0x%2.2x scid 0x%4.4x mtu %u mps %u", __le16_to_cpu(psm),
scid, mtu, mps);
+ /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 3, Part A
+ * page 1059:
+ *
+ * Valid range: 0x0001-0x00ff
+ *
+ * Table 4.15: L2CAP_LE_CREDIT_BASED_CONNECTION_REQ SPSM ranges
+ */
+ if (!psm || __le16_to_cpu(psm) > L2CAP_PSM_LE_DYN_END) {
+ result = L2CAP_CR_LE_BAD_PSM;
+ chan = NULL;
+ goto response;
+ }
+
/* Check if we have socket listening on psm */
pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src,
&conn->hcon->dst, LE_LINK);
@@ -6001,6 +6015,18 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn,
psm = req->psm;
+ /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 3, Part A
+ * page 1059:
+ *
+ * Valid range: 0x0001-0x00ff
+ *
+ * Table 4.15: L2CAP_LE_CREDIT_BASED_CONNECTION_REQ SPSM ranges
+ */
+ if (!psm || __le16_to_cpu(psm) > L2CAP_PSM_LE_DYN_END) {
+ result = L2CAP_CR_LE_BAD_PSM;
+ goto response;
+ }
+
BT_DBG("psm 0x%2.2x mtu %u mps %u", __le16_to_cpu(psm), mtu, mps);
memset(&pdu, 0, sizeof(pdu));
@@ -6885,6 +6911,7 @@ static int l2cap_rx_state_recv(struct l2cap_chan *chan,
struct l2cap_ctrl *control,
struct sk_buff *skb, u8 event)
{
+ struct l2cap_ctrl local_control;
int err = 0;
bool skb_in_use = false;
@@ -6909,15 +6936,32 @@ static int l2cap_rx_state_recv(struct l2cap_chan *chan,
chan->buffer_seq = chan->expected_tx_seq;
skb_in_use = true;
+ /* l2cap_reassemble_sdu may free skb, hence invalidate
+ * control, so make a copy in advance to use it after
+ * l2cap_reassemble_sdu returns and to avoid the race
+ * condition, for example:
+ *
+ * The current thread calls:
+ * l2cap_reassemble_sdu
+ * chan->ops->recv == l2cap_sock_recv_cb
+ * __sock_queue_rcv_skb
+ * Another thread calls:
+ * bt_sock_recvmsg
+ * skb_recv_datagram
+ * skb_free_datagram
+ * Then the current thread tries to access control, but
+ * it was freed by skb_free_datagram.
+ */
+ local_control = *control;
err = l2cap_reassemble_sdu(chan, skb, control);
if (err)
break;
- if (control->final) {
+ if (local_control.final) {
if (!test_and_clear_bit(CONN_REJ_ACT,
&chan->conn_state)) {
- control->final = 0;
- l2cap_retransmit_all(chan, control);
+ local_control.final = 0;
+ l2cap_retransmit_all(chan, &local_control);
l2cap_ertm_send(chan);
}
}
@@ -7297,11 +7341,27 @@ static int l2cap_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
struct sk_buff *skb)
{
+ /* l2cap_reassemble_sdu may free skb, hence invalidate control, so store
+ * the txseq field in advance to use it after l2cap_reassemble_sdu
+ * returns and to avoid the race condition, for example:
+ *
+ * The current thread calls:
+ * l2cap_reassemble_sdu
+ * chan->ops->recv == l2cap_sock_recv_cb
+ * __sock_queue_rcv_skb
+ * Another thread calls:
+ * bt_sock_recvmsg
+ * skb_recv_datagram
+ * skb_free_datagram
+ * Then the current thread tries to access control, but it was freed by
+ * skb_free_datagram.
+ */
+ u16 txseq = control->txseq;
+
BT_DBG("chan %p, control %p, skb %p, state %d", chan, control, skb,
chan->rx_state);
- if (l2cap_classify_txseq(chan, control->txseq) ==
- L2CAP_TXSEQ_EXPECTED) {
+ if (l2cap_classify_txseq(chan, txseq) == L2CAP_TXSEQ_EXPECTED) {
l2cap_pass_to_tx(chan, control);
BT_DBG("buffer_seq %u->%u", chan->buffer_seq,
@@ -7324,8 +7384,8 @@ static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
}
}
- chan->last_acked_seq = control->txseq;
- chan->expected_tx_seq = __next_seq(chan, control->txseq);
+ chan->last_acked_seq = txseq;
+ chan->expected_tx_seq = __next_seq(chan, txseq);
return 0;
}
@@ -7581,6 +7641,7 @@ static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid,
return;
}
+ l2cap_chan_hold(chan);
l2cap_chan_lock(chan);
} else {
BT_DBG("unknown cid 0x%4.4x", cid);
@@ -8426,9 +8487,8 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
* expected length.
*/
if (skb->len < L2CAP_LEN_SIZE) {
- if (l2cap_recv_frag(conn, skb, conn->mtu) < 0)
- goto drop;
- return;
+ l2cap_recv_frag(conn, skb, conn->mtu);
+ break;
}
len = get_unaligned_le16(skb->data) + L2CAP_HDR_SIZE;
@@ -8472,7 +8532,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
/* Header still could not be read just continue */
if (conn->rx_skb->len < L2CAP_LEN_SIZE)
- return;
+ break;
}
if (skb->len > conn->rx_len) {
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index e78dadfc5829..2d434c1f4617 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -156,29 +156,29 @@ static bool bpf_dummy_ops_is_valid_access(int off, int size,
}
static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
- const struct btf *btf,
- const struct btf_type *t, int off,
- int size, enum bpf_access_type atype,
+ const struct bpf_reg_state *reg,
+ int off, int size, enum bpf_access_type atype,
u32 *next_btf_id,
enum bpf_type_flag *flag)
{
const struct btf_type *state;
+ const struct btf_type *t;
s32 type_id;
int err;
- type_id = btf_find_by_name_kind(btf, "bpf_dummy_ops_state",
+ type_id = btf_find_by_name_kind(reg->btf, "bpf_dummy_ops_state",
BTF_KIND_STRUCT);
if (type_id < 0)
return -EINVAL;
- state = btf_type_by_id(btf, type_id);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ state = btf_type_by_id(reg->btf, type_id);
if (t != state) {
bpf_log(log, "only access to bpf_dummy_ops_state is supported\n");
return -EACCES;
}
- err = btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
- flag);
+ err = btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
if (err < 0)
return err;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 13d578ce2a09..6094ef7cffcd 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -774,6 +774,7 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size,
if (user_size > size)
return ERR_PTR(-EMSGSIZE);
+ size = SKB_DATA_ALIGN(size);
data = kzalloc(size + headroom + tailroom, GFP_USER);
if (!data)
return ERR_PTR(-ENOMEM);
@@ -979,9 +980,6 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb)
{
struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb;
- if (!skb->len)
- return -EINVAL;
-
if (!__skb)
return 0;
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 96e91d69a9a8..4f5098d33a46 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -166,13 +166,14 @@ static int br_switchdev_event(struct notifier_block *unused,
case SWITCHDEV_FDB_ADD_TO_BRIDGE:
fdb_info = ptr;
err = br_fdb_external_learn_add(br, p, fdb_info->addr,
- fdb_info->vid, false);
+ fdb_info->vid,
+ fdb_info->locked, false);
if (err) {
err = notifier_from_errno(err);
break;
}
br_fdb_offloaded_set(br, p, fdb_info->addr,
- fdb_info->vid, true);
+ fdb_info->vid, fdb_info->offloaded);
break;
case SWITCHDEV_FDB_DEL_TO_BRIDGE:
fdb_info = ptr;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index e7f4fccb6adb..e69a872bfc1d 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -105,6 +105,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
struct nda_cacheinfo ci;
struct nlmsghdr *nlh;
struct ndmsg *ndm;
+ u32 ext_flags = 0;
nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
if (nlh == NULL)
@@ -125,11 +126,16 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
ndm->ndm_flags |= NTF_EXT_LEARNED;
if (test_bit(BR_FDB_STICKY, &fdb->flags))
ndm->ndm_flags |= NTF_STICKY;
+ if (test_bit(BR_FDB_LOCKED, &fdb->flags))
+ ext_flags |= NTF_EXT_LOCKED;
if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr))
goto nla_put_failure;
if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex))
goto nla_put_failure;
+ if (nla_put_u32(skb, NDA_FLAGS_EXT, ext_flags))
+ goto nla_put_failure;
+
ci.ndm_used = jiffies_to_clock_t(now - fdb->used);
ci.ndm_confirmed = 0;
ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated);
@@ -171,6 +177,7 @@ static inline size_t fdb_nlmsg_size(void)
return NLMSG_ALIGN(sizeof(struct ndmsg))
+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
+ nla_total_size(sizeof(u32)) /* NDA_MASTER */
+ + nla_total_size(sizeof(u32)) /* NDA_FLAGS_EXT */
+ nla_total_size(sizeof(u16)) /* NDA_VLAN */
+ nla_total_size(sizeof(struct nda_cacheinfo))
+ nla_total_size(0) /* NDA_FDB_EXT_ATTRS */
@@ -879,6 +886,11 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
&fdb->flags)))
clear_bit(BR_FDB_ADDED_BY_EXT_LEARN,
&fdb->flags);
+ /* Clear locked flag when roaming to an
+ * unlocked port.
+ */
+ if (unlikely(test_bit(BR_FDB_LOCKED, &fdb->flags)))
+ clear_bit(BR_FDB_LOCKED, &fdb->flags);
}
if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags)))
@@ -1082,6 +1094,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
modified = true;
}
+ if (test_and_clear_bit(BR_FDB_LOCKED, &fdb->flags))
+ modified = true;
+
if (fdb_handle_notify(fdb, notify))
modified = true;
@@ -1124,7 +1139,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
"FDB entry towards bridge must be permanent");
return -EINVAL;
}
- err = br_fdb_external_learn_add(br, p, addr, vid, true);
+ err = br_fdb_external_learn_add(br, p, addr, vid, false, true);
} else {
spin_lock_bh(&br->hash_lock);
err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid, nfea_tb);
@@ -1150,6 +1165,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
struct net_bridge_port *p = NULL;
struct net_bridge_vlan *v;
struct net_bridge *br = NULL;
+ u32 ext_flags = 0;
int err = 0;
trace_br_fdb_add(ndm, dev, addr, vid, nlh_flags);
@@ -1178,6 +1194,14 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
vg = nbp_vlan_group(p);
}
+ if (tb[NDA_FLAGS_EXT])
+ ext_flags = nla_get_u32(tb[NDA_FLAGS_EXT]);
+
+ if (ext_flags & NTF_EXT_LOCKED) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot add FDB entry with \"locked\" flag set");
+ return -EINVAL;
+ }
+
if (tb[NDA_FDB_EXT_ATTRS]) {
attr = tb[NDA_FDB_EXT_ATTRS];
err = nla_parse_nested(nfea_tb, NFEA_MAX, attr,
@@ -1353,7 +1377,7 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p)
}
int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid,
+ const unsigned char *addr, u16 vid, bool locked,
bool swdev_notify)
{
struct net_bridge_fdb_entry *fdb;
@@ -1362,6 +1386,9 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
trace_br_fdb_external_learn_add(br, p, addr, vid);
+ if (locked && (!p || !(p->flags & BR_PORT_MAB)))
+ return -EINVAL;
+
spin_lock_bh(&br->hash_lock);
fdb = br_fdb_find(br, addr, vid);
@@ -1374,6 +1401,9 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
if (!p)
flags |= BIT(BR_FDB_LOCAL);
+ if (locked)
+ flags |= BIT(BR_FDB_LOCKED);
+
fdb = fdb_create(br, p, addr, vid, flags);
if (!fdb) {
err = -ENOMEM;
@@ -1381,6 +1411,13 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
}
fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
} else {
+ if (locked &&
+ (!test_bit(BR_FDB_LOCKED, &fdb->flags) ||
+ READ_ONCE(fdb->dst) != p)) {
+ err = -EINVAL;
+ goto err_unlock;
+ }
+
fdb->updated = jiffies;
if (READ_ONCE(fdb->dst) != p) {
@@ -1397,6 +1434,11 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
modified = true;
}
+ if (locked != test_bit(BR_FDB_LOCKED, &fdb->flags)) {
+ change_bit(BR_FDB_LOCKED, &fdb->flags);
+ modified = true;
+ }
+
if (swdev_notify)
set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 68b3e850bcb9..3027e8f6be15 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -109,9 +109,26 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
struct net_bridge_fdb_entry *fdb_src =
br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid);
- if (!fdb_src || READ_ONCE(fdb_src->dst) != p ||
- test_bit(BR_FDB_LOCAL, &fdb_src->flags))
+ if (!fdb_src) {
+ /* FDB miss. Create locked FDB entry if MAB is enabled
+ * and drop the packet.
+ */
+ if (p->flags & BR_PORT_MAB)
+ br_fdb_update(br, p, eth_hdr(skb)->h_source,
+ vid, BIT(BR_FDB_LOCKED));
goto drop;
+ } else if (READ_ONCE(fdb_src->dst) != p ||
+ test_bit(BR_FDB_LOCAL, &fdb_src->flags)) {
+ /* FDB mismatch. Drop the packet without roaming. */
+ goto drop;
+ } else if (test_bit(BR_FDB_LOCKED, &fdb_src->flags)) {
+ /* FDB match, but entry is locked. Refresh it and drop
+ * the packet.
+ */
+ br_fdb_update(br, p, eth_hdr(skb)->h_source, vid,
+ BIT(BR_FDB_LOCKED));
+ goto drop;
+ }
}
nbp_switchdev_frame_mark(p, skb);
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 589ff497d50c..321be94c445a 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -866,7 +866,6 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
unsigned long now = jiffies;
unsigned char flags = 0;
u8 filter_mode;
- int err;
__mdb_entry_to_br_ip(entry, &group, mdb_attrs);
@@ -892,13 +891,9 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
return -EINVAL;
}
- mp = br_mdb_ip_get(br, &group);
- if (!mp) {
- mp = br_multicast_new_group(br, &group);
- err = PTR_ERR_OR_ZERO(mp);
- if (err)
- return err;
- }
+ mp = br_multicast_new_group(br, &group);
+ if (IS_ERR(mp))
+ return PTR_ERR(mp);
/* host join */
if (!port) {
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index db4f2641d1cd..5e988f0ed2c0 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -2669,7 +2669,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge_mcast *brmctx,
if (!pmctx || igmpv2)
continue;
- spin_lock_bh(&brmctx->br->multicast_lock);
+ spin_lock(&brmctx->br->multicast_lock);
if (!br_multicast_ctx_should_use(brmctx, pmctx))
goto unlock_continue;
@@ -2717,7 +2717,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge_mcast *brmctx,
if (changed)
br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB);
unlock_continue:
- spin_unlock_bh(&brmctx->br->multicast_lock);
+ spin_unlock(&brmctx->br->multicast_lock);
}
return err;
@@ -2807,7 +2807,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge_mcast *brmctx,
if (!pmctx || mldv1)
continue;
- spin_lock_bh(&brmctx->br->multicast_lock);
+ spin_lock(&brmctx->br->multicast_lock);
if (!br_multicast_ctx_should_use(brmctx, pmctx))
goto unlock_continue;
@@ -2859,7 +2859,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge_mcast *brmctx,
if (changed)
br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB);
unlock_continue:
- spin_unlock_bh(&brmctx->br->multicast_lock);
+ spin_unlock(&brmctx->br->multicast_lock);
}
return err;
@@ -4899,9 +4899,9 @@ void br_multicast_get_stats(const struct net_bridge *br,
unsigned int start;
do {
- start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
memcpy(&temp, &cpu_stats->mstats, sizeof(temp));
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries);
mcast_stats_add_dir(tdst.igmp_v2queries, temp.igmp_v2queries);
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 5aeb3646e74c..4316cc82ae17 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -188,6 +188,7 @@ static inline size_t br_port_info_size(void)
+ nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */
+ nla_total_size(1) /* IFLA_BRPORT_ISOLATED */
+ nla_total_size(1) /* IFLA_BRPORT_LOCKED */
+ + nla_total_size(1) /* IFLA_BRPORT_MAB */
+ nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */
+ nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */
+ nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */
@@ -274,7 +275,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN,
!!(p->flags & BR_MRP_LOST_IN_CONT)) ||
nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) ||
- nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED)))
+ nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED)) ||
+ nla_put_u8(skb, IFLA_BRPORT_MAB, !!(p->flags & BR_PORT_MAB)))
return -EMSGSIZE;
timerval = br_timer_value(&p->message_age_timer);
@@ -876,6 +878,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
[IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 },
[IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 },
[IFLA_BRPORT_LOCKED] = { .type = NLA_U8 },
+ [IFLA_BRPORT_MAB] = { .type = NLA_U8 },
[IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 },
[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 },
};
@@ -943,6 +946,22 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[],
br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS);
br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED);
br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED);
+ br_set_port_flag(p, tb, IFLA_BRPORT_MAB, BR_PORT_MAB);
+
+ if ((p->flags & BR_PORT_MAB) &&
+ (!(p->flags & BR_PORT_LOCKED) || !(p->flags & BR_LEARNING))) {
+ NL_SET_ERR_MSG(extack, "Bridge port must be locked and have learning enabled when MAB is enabled");
+ p->flags = old_flags;
+ return -EINVAL;
+ } else if (!(p->flags & BR_PORT_MAB) && (old_flags & BR_PORT_MAB)) {
+ struct net_bridge_fdb_flush_desc desc = {
+ .flags = BIT(BR_FDB_LOCKED),
+ .flags_mask = BIT(BR_FDB_LOCKED),
+ .port_ifindex = p->dev->ifindex,
+ };
+
+ br_fdb_flush(p->br, &desc);
+ }
changed_mask = old_flags ^ p->flags;
@@ -1332,7 +1351,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
if (data[IFLA_BR_FDB_FLUSH]) {
struct net_bridge_fdb_flush_desc desc = {
- .flags_mask = BR_FDB_STATIC
+ .flags_mask = BIT(BR_FDB_STATIC)
};
br_fdb_flush(br, &desc);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 06e5f6faa431..4c4fda930068 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -251,7 +251,8 @@ enum {
BR_FDB_ADDED_BY_EXT_LEARN,
BR_FDB_OFFLOADED,
BR_FDB_NOTIFY,
- BR_FDB_NOTIFY_INACTIVE
+ BR_FDB_NOTIFY_INACTIVE,
+ BR_FDB_LOCKED,
};
struct net_bridge_fdb_key {
@@ -810,7 +811,7 @@ int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p);
void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p);
int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
const unsigned char *addr, u16 vid,
- bool swdev_notify);
+ bool locked, bool swdev_notify);
int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
const unsigned char *addr, u16 vid,
bool swdev_notify);
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index 8f3d76c751dd..7eb6fd5bb917 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -71,7 +71,7 @@ bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
}
/* Flags that can be offloaded to hardware */
-#define BR_PORT_FLAGS_HW_OFFLOAD (BR_LEARNING | BR_FLOOD | \
+#define BR_PORT_FLAGS_HW_OFFLOAD (BR_LEARNING | BR_FLOOD | BR_PORT_MAB | \
BR_MCAST_FLOOD | BR_BCAST_FLOOD | BR_PORT_LOCKED | \
BR_HAIRPIN_MODE | BR_ISOLATED | BR_MULTICAST_TO_UNICAST)
@@ -136,6 +136,7 @@ static void br_switchdev_fdb_populate(struct net_bridge *br,
item->added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
item->offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags);
item->is_local = test_bit(BR_FDB_LOCAL, &fdb->flags);
+ item->locked = false;
item->info.dev = (!p || item->is_local) ? br->dev : p->dev;
item->info.ctx = ctx;
}
@@ -146,6 +147,9 @@ br_switchdev_fdb_notify(struct net_bridge *br,
{
struct switchdev_notifier_fdb_info item;
+ if (test_bit(BR_FDB_LOCKED, &fdb->flags))
+ return;
+
br_switchdev_fdb_populate(br, &item, fdb, NULL);
switch (type) {
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 612e367fff20..ea733542244c 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -345,7 +345,7 @@ static int set_flush(struct net_bridge *br, unsigned long val,
struct netlink_ext_ack *extack)
{
struct net_bridge_fdb_flush_desc desc = {
- .flags_mask = BR_FDB_STATIC
+ .flags_mask = BIT(BR_FDB_STATIC)
};
br_fdb_flush(br, &desc);
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 6e53dc991409..bc75fa1e4666 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -959,6 +959,8 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto,
list_for_each_entry(p, &br->port_list, list) {
vg = nbp_vlan_group(p);
list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+ continue;
err = vlan_vid_add(p->dev, proto, vlan->vid);
if (err)
goto err_filt;
@@ -973,8 +975,11 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto,
/* Delete VLANs for the old proto from the device filter. */
list_for_each_entry(p, &br->port_list, list) {
vg = nbp_vlan_group(p);
- list_for_each_entry(vlan, &vg->vlan_list, vlist)
+ list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+ continue;
vlan_vid_del(p->dev, oldproto, vlan->vid);
+ }
}
return 0;
@@ -983,13 +988,19 @@ err_filt:
attr.u.vlan_protocol = ntohs(oldproto);
switchdev_port_attr_set(br->dev, &attr, NULL);
- list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist)
+ list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) {
+ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+ continue;
vlan_vid_del(p->dev, proto, vlan->vid);
+ }
list_for_each_entry_continue_reverse(p, &br->port_list, list) {
vg = nbp_vlan_group(p);
- list_for_each_entry(vlan, &vg->vlan_list, vlist)
+ list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+ continue;
vlan_vid_del(p->dev, proto, vlan->vid);
+ }
}
return err;
@@ -1378,12 +1389,12 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v,
cpu_stats = per_cpu_ptr(v->stats, i);
do {
- start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
rxpackets = u64_stats_read(&cpu_stats->rx_packets);
rxbytes = u64_stats_read(&cpu_stats->rx_bytes);
txbytes = u64_stats_read(&cpu_stats->tx_bytes);
txpackets = u64_stats_read(&cpu_stats->tx_packets);
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
u64_stats_add(&stats->rx_packets, rxpackets);
u64_stats_add(&stats->rx_bytes, rxbytes);
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 4d63ef13a1fd..f35fc87c453a 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -310,9 +310,6 @@ static int chnl_net_open(struct net_device *dev)
if (result == 0) {
pr_debug("connect timeout\n");
- caif_disconnect_client(dev_net(dev), &priv->chnl);
- priv->state = CAIF_DISCONNECTED;
- pr_debug("state disconnected\n");
result = -ETIMEDOUT;
goto error;
}
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 9503ab10f9b8..27dcdcc0b808 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -450,7 +450,7 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id,
/* insert new receiver (dev,canid,mask) -> (func,data) */
- if (dev && dev->type != ARPHRD_CAN)
+ if (dev && (dev->type != ARPHRD_CAN || !can_get_ml_priv(dev)))
return -ENODEV;
if (dev && !net_eq(net, dev_net(dev)))
@@ -902,6 +902,7 @@ out_pernet:
static __exit void can_exit(void)
{
/* protocol unregister */
+ dev_remove_pack(&canxl_packet);
dev_remove_pack(&canfd_packet);
dev_remove_pack(&can_packet);
sock_unregister(PF_CAN);
diff --git a/net/can/isotp.c b/net/can/isotp.c
index a9d1357f8489..608f8c24ae46 100644
--- a/net/can/isotp.c
+++ b/net/can/isotp.c
@@ -111,6 +111,9 @@ MODULE_ALIAS("can-proto-6");
#define ISOTP_FC_WT 1 /* wait */
#define ISOTP_FC_OVFLW 2 /* overflow */
+#define ISOTP_FC_TIMEOUT 1 /* 1 sec */
+#define ISOTP_ECHO_TIMEOUT 2 /* 2 secs */
+
enum {
ISOTP_IDLE = 0,
ISOTP_WAIT_FIRST_FC,
@@ -258,7 +261,8 @@ static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus)
so->lastrxcf_tstamp = ktime_set(0, 0);
/* start rx timeout watchdog */
- hrtimer_start(&so->rxtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT);
+ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0),
+ HRTIMER_MODE_REL_SOFT);
return 0;
}
@@ -344,6 +348,8 @@ static int check_pad(struct isotp_sock *so, struct canfd_frame *cf,
return 0;
}
+static void isotp_send_cframe(struct isotp_sock *so);
+
static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae)
{
struct sock *sk = &so->sk;
@@ -398,14 +404,15 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae)
case ISOTP_FC_CTS:
so->tx.bs = 0;
so->tx.state = ISOTP_SENDING;
- /* start cyclic timer for sending CF frame */
- hrtimer_start(&so->txtimer, so->tx_gap,
+ /* send CF frame and enable echo timeout handling */
+ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0),
HRTIMER_MODE_REL_SOFT);
+ isotp_send_cframe(so);
break;
case ISOTP_FC_WT:
/* start timer to wait for next FC frame */
- hrtimer_start(&so->txtimer, ktime_set(1, 0),
+ hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0),
HRTIMER_MODE_REL_SOFT);
break;
@@ -600,7 +607,7 @@ static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae,
/* perform blocksize handling, if enabled */
if (!so->rxfc.bs || ++so->rx.bs < so->rxfc.bs) {
/* start rx timeout watchdog */
- hrtimer_start(&so->rxtimer, ktime_set(1, 0),
+ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0),
HRTIMER_MODE_REL_SOFT);
return 0;
}
@@ -829,7 +836,7 @@ static void isotp_rcv_echo(struct sk_buff *skb, void *data)
struct isotp_sock *so = isotp_sk(sk);
struct canfd_frame *cf = (struct canfd_frame *)skb->data;
- /* only handle my own local echo skb's */
+ /* only handle my own local echo CF/SF skb's (no FF!) */
if (skb->sk != sk || so->cfecho != *(u32 *)cf->data)
return;
@@ -849,13 +856,16 @@ static void isotp_rcv_echo(struct sk_buff *skb, void *data)
if (so->txfc.bs && so->tx.bs >= so->txfc.bs) {
/* stop and wait for FC with timeout */
so->tx.state = ISOTP_WAIT_FC;
- hrtimer_start(&so->txtimer, ktime_set(1, 0),
+ hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0),
HRTIMER_MODE_REL_SOFT);
return;
}
/* no gap between data frames needed => use burst mode */
if (!so->tx_gap) {
+ /* enable echo timeout handling */
+ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0),
+ HRTIMER_MODE_REL_SOFT);
isotp_send_cframe(so);
return;
}
@@ -879,7 +889,7 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer)
/* start timeout for unlikely lost echo skb */
hrtimer_set_expires(&so->txtimer,
ktime_add(ktime_get(),
- ktime_set(2, 0)));
+ ktime_set(ISOTP_ECHO_TIMEOUT, 0)));
restart = HRTIMER_RESTART;
/* push out the next consecutive frame */
@@ -907,7 +917,8 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer)
break;
default:
- WARN_ON_ONCE(1);
+ WARN_ONCE(1, "can-isotp: tx timer state %08X cfecho %08X\n",
+ so->tx.state, so->cfecho);
}
return restart;
@@ -923,7 +934,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
struct canfd_frame *cf;
int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0;
int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0;
- s64 hrtimer_sec = 0;
+ s64 hrtimer_sec = ISOTP_ECHO_TIMEOUT;
int off;
int err;
@@ -942,6 +953,8 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
if (err)
goto err_out;
+
+ so->tx.state = ISOTP_SENDING;
}
if (!size || size > MAX_MSG_LENGTH) {
@@ -986,6 +999,10 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
cf = (struct canfd_frame *)skb->data;
skb_put_zero(skb, so->ll.mtu);
+ /* cfecho should have been zero'ed by init / former isotp_rcv_echo() */
+ if (so->cfecho)
+ pr_notice_once("can-isotp: uninit cfecho %08X\n", so->cfecho);
+
/* check for single frame transmission depending on TX_DL */
if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) {
/* The message size generally fits into a SingleFrame - good.
@@ -1011,11 +1028,8 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
else
cf->data[ae] |= size;
- so->tx.state = ISOTP_IDLE;
- wake_up_interruptible(&so->wait);
-
- /* don't enable wait queue for a single frame transmission */
- wait_tx_done = 0;
+ /* set CF echo tag for isotp_rcv_echo() (SF-mode) */
+ so->cfecho = *(u32 *)cf->data;
} else {
/* send first frame */
@@ -1031,31 +1045,23 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
/* disable wait for FCs due to activated block size */
so->txfc.bs = 0;
- /* cfecho should have been zero'ed by init */
- if (so->cfecho)
- pr_notice_once("can-isotp: no fc cfecho %08X\n",
- so->cfecho);
-
- /* set consecutive frame echo tag */
+ /* set CF echo tag for isotp_rcv_echo() (CF-mode) */
so->cfecho = *(u32 *)cf->data;
-
- /* switch directly to ISOTP_SENDING state */
- so->tx.state = ISOTP_SENDING;
-
- /* start timeout for unlikely lost echo skb */
- hrtimer_sec = 2;
} else {
/* standard flow control check */
so->tx.state = ISOTP_WAIT_FIRST_FC;
/* start timeout for FC */
- hrtimer_sec = 1;
- }
+ hrtimer_sec = ISOTP_FC_TIMEOUT;
- hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0),
- HRTIMER_MODE_REL_SOFT);
+ /* no CF echo tag for isotp_rcv_echo() (FF-mode) */
+ so->cfecho = 0;
+ }
}
+ hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0),
+ HRTIMER_MODE_REL_SOFT);
+
/* send the first or only CAN frame */
cf->flags = so->ll.tx_flags;
@@ -1068,8 +1074,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
__func__, ERR_PTR(err));
/* no transmission -> no timeout monitoring */
- if (hrtimer_sec)
- hrtimer_cancel(&so->txtimer);
+ hrtimer_cancel(&so->txtimer);
/* reset consecutive frame echo tag */
so->cfecho = 0;
diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c
index 144c86b0e3ff..821d4ff303b3 100644
--- a/net/can/j1939/main.c
+++ b/net/can/j1939/main.c
@@ -336,6 +336,9 @@ int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb)
/* re-claim the CAN_HDR from the SKB */
cf = skb_push(skb, J1939_CAN_HDR);
+ /* initialize header structure */
+ memset(cf, 0, J1939_CAN_HDR);
+
/* make it a full can frame again */
skb_put(skb, J1939_CAN_FTR + (8 - dlc));
diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c
index d7d86c944d76..f26f4cfa9e63 100644
--- a/net/can/j1939/transport.c
+++ b/net/can/j1939/transport.c
@@ -342,10 +342,12 @@ static void j1939_session_skb_drop_old(struct j1939_session *session)
__skb_unlink(do_skb, &session->skb_queue);
/* drop ref taken in j1939_session_skb_queue() */
skb_unref(do_skb);
+ spin_unlock_irqrestore(&session->skb_queue.lock, flags);
kfree_skb(do_skb);
+ } else {
+ spin_unlock_irqrestore(&session->skb_queue.lock, flags);
}
- spin_unlock_irqrestore(&session->skb_queue.lock, flags);
}
void j1939_session_skb_queue(struct j1939_session *session,
@@ -985,7 +987,7 @@ static int j1939_session_tx_eoma(struct j1939_session *session)
/* wait for the EOMA packet to come in */
j1939_tp_set_rxtimeout(session, 1250);
- netdev_dbg(session->priv->ndev, "%p: 0x%p\n", __func__, session);
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
return 0;
}
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 6a6898ee4049..db60217f911b 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -222,7 +222,7 @@ static void pick_new_mon(struct ceph_mon_client *monc)
max--;
}
- n = prandom_u32() % max;
+ n = prandom_u32_max(max);
if (o >= 0 && n >= o)
n++;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 87b883c7bfd6..4e4f1e4bc265 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1479,7 +1479,7 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc,
static int pick_random_replica(const struct ceph_osds *acting)
{
- int i = prandom_u32() % acting->size;
+ int i = prandom_u32_max(acting->size);
dout("%s picked osd%d, primary osd%d\n", __func__,
acting->osds[i], acting->primary);
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 94374d529ea4..9d2288c0736e 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -48,10 +48,8 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
/* Called by __sk_destruct() & bpf_sk_storage_clone() */
void bpf_sk_storage_free(struct sock *sk)
{
- struct bpf_local_storage_elem *selem;
struct bpf_local_storage *sk_storage;
bool free_sk_storage = false;
- struct hlist_node *n;
rcu_read_lock();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
@@ -60,24 +58,8 @@ void bpf_sk_storage_free(struct sock *sk)
return;
}
- /* Netiher the bpf_prog nor the bpf-map's syscall
- * could be modifying the sk_storage->list now.
- * Thus, no elem can be added-to or deleted-from the
- * sk_storage->list by the bpf_prog or by the bpf-map's syscall.
- *
- * It is racing with bpf_local_storage_map_free() alone
- * when unlinking elem from the sk_storage->list and
- * the map's bucket->list.
- */
raw_spin_lock_bh(&sk_storage->lock);
- hlist_for_each_entry_safe(selem, n, &sk_storage->list, snode) {
- /* Always unlink from map before unlinking from
- * sk_storage.
- */
- bpf_selem_unlink_map(selem);
- free_sk_storage = bpf_selem_unlink_storage_nolock(
- sk_storage, selem, true, false);
- }
+ free_sk_storage = bpf_local_storage_unlink_nolock(sk_storage);
raw_spin_unlock_bh(&sk_storage->lock);
rcu_read_unlock();
@@ -87,23 +69,12 @@ void bpf_sk_storage_free(struct sock *sk)
static void bpf_sk_storage_map_free(struct bpf_map *map)
{
- struct bpf_local_storage_map *smap;
-
- smap = (struct bpf_local_storage_map *)map;
- bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap, NULL);
+ bpf_local_storage_map_free(map, &sk_cache, NULL);
}
static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
{
- struct bpf_local_storage_map *smap;
-
- smap = bpf_local_storage_map_alloc(attr);
- if (IS_ERR(smap))
- return ERR_CAST(smap);
-
- smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache);
- return &smap->map;
+ return bpf_local_storage_map_alloc(attr, &sk_cache);
}
static int notsupp_get_next_key(struct bpf_map *map, void *key,
@@ -176,7 +147,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk,
if (!copy_selem)
return NULL;
- if (map_value_has_spin_lock(&smap->map))
+ if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))
copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
SDATA(selem)->data, true);
else
@@ -595,7 +566,7 @@ static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb)
if (!nla_value)
goto errout;
- if (map_value_has_spin_lock(&smap->map))
+ if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))
copy_map_value_locked(&smap->map, nla_data(nla_value),
sdata->data, true);
else
diff --git a/net/core/dev.c b/net/core/dev.c
index fa53830d0683..7627c475d991 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1163,22 +1163,6 @@ int dev_change_name(struct net_device *dev, const char *newname)
net = dev_net(dev);
- /* Some auto-enslaved devices e.g. failover slaves are
- * special, as userspace might rename the device after
- * the interface had been brought up and running since
- * the point kernel initiated auto-enslavement. Allow
- * live name change even when these slave devices are
- * up and running.
- *
- * Typically, users of these auto-enslaving devices
- * don't actually care about slave name change, as
- * they are supposed to operate on master interface
- * directly.
- */
- if (dev->flags & IFF_UP &&
- likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
- return -EBUSY;
-
down_write(&devnet_rename_sem);
if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
@@ -1195,7 +1179,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
}
if (oldname[0] && !strchr(oldname, '%'))
- netdev_info(dev, "renamed from %s\n", oldname);
+ netdev_info(dev, "renamed from %s%s\n", oldname,
+ dev->flags & IFF_UP ? " (while UP)" : "");
old_assign_type = dev->name_assign_type;
dev->name_assign_type = NET_NAME_RENAMED;
@@ -1333,7 +1318,7 @@ void netdev_state_change(struct net_device *dev)
call_netdevice_notifiers_info(NETDEV_CHANGE,
&change_info.info);
- rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
}
}
EXPORT_SYMBOL(netdev_state_change);
@@ -1469,7 +1454,7 @@ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
if (ret < 0)
return ret;
- rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
call_netdevice_notifiers(NETDEV_UP, dev);
return ret;
@@ -1541,7 +1526,7 @@ void dev_close_many(struct list_head *head, bool unlink)
__dev_close_many(head);
list_for_each_entry_safe(dev, tmp, head, close_list) {
- rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
call_netdevice_notifiers(NETDEV_DOWN, dev);
if (unlink)
list_del_init(&dev->close_list);
@@ -1621,10 +1606,10 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
- N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
- N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
- N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
- N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
+ N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
+ N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
+ N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
+ N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
@@ -1876,6 +1861,22 @@ int unregister_netdevice_notifier_net(struct net *net,
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);
+static void __move_netdevice_notifier_net(struct net *src_net,
+ struct net *dst_net,
+ struct notifier_block *nb)
+{
+ __unregister_netdevice_notifier_net(src_net, nb);
+ __register_netdevice_notifier_net(dst_net, nb, true);
+}
+
+void move_netdevice_notifier_net(struct net *src_net, struct net *dst_net,
+ struct notifier_block *nb)
+{
+ rtnl_lock();
+ __move_netdevice_notifier_net(src_net, dst_net, nb);
+ rtnl_unlock();
+}
+
int register_netdevice_notifier_dev_net(struct net_device *dev,
struct notifier_block *nb,
struct netdev_net_notifier *nn)
@@ -1912,10 +1913,8 @@ static void move_netdevice_notifiers_dev_net(struct net_device *dev,
{
struct netdev_net_notifier *nn;
- list_for_each_entry(nn, &dev->net_notifier_list, list) {
- __unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
- __register_netdevice_notifier_net(net, nn->nb, true);
- }
+ list_for_each_entry(nn, &dev->net_notifier_list, list)
+ __move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
}
/**
@@ -2074,13 +2073,10 @@ static DECLARE_WORK(netstamp_work, netstamp_clear);
void net_enable_timestamp(void)
{
#ifdef CONFIG_JUMP_LABEL
- int wanted;
+ int wanted = atomic_read(&netstamp_wanted);
- while (1) {
- wanted = atomic_read(&netstamp_wanted);
- if (wanted <= 0)
- break;
- if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
+ while (wanted > 0) {
+ if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
return;
}
atomic_inc(&netstamp_needed_deferred);
@@ -2094,13 +2090,10 @@ EXPORT_SYMBOL(net_enable_timestamp);
void net_disable_timestamp(void)
{
#ifdef CONFIG_JUMP_LABEL
- int wanted;
+ int wanted = atomic_read(&netstamp_wanted);
- while (1) {
- wanted = atomic_read(&netstamp_wanted);
- if (wanted <= 1)
- break;
- if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
+ while (wanted > 1) {
+ if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
return;
}
atomic_dec(&netstamp_needed_deferred);
@@ -5136,11 +5129,13 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
case TC_ACT_SHOT:
mini_qdisc_qstats_cpu_drop(miniq);
kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
+ *ret = NET_RX_DROP;
return NULL;
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
consume_skb(skb);
+ *ret = NET_RX_SUCCESS;
return NULL;
case TC_ACT_REDIRECT:
/* skb_mac_header check was done by cls/act_bpf, so
@@ -5153,8 +5148,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
*another = true;
break;
}
+ *ret = NET_RX_SUCCESS;
return NULL;
case TC_ACT_CONSUMED:
+ *ret = NET_RX_SUCCESS;
return NULL;
default:
break;
@@ -5982,10 +5979,9 @@ EXPORT_SYMBOL(__napi_schedule);
*/
bool napi_schedule_prep(struct napi_struct *n)
{
- unsigned long val, new;
+ unsigned long new, val = READ_ONCE(n->state);
do {
- val = READ_ONCE(n->state);
if (unlikely(val & NAPIF_STATE_DISABLE))
return false;
new = val | NAPIF_STATE_SCHED;
@@ -5998,7 +5994,7 @@ bool napi_schedule_prep(struct napi_struct *n)
*/
new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
NAPIF_STATE_MISSED;
- } while (cmpxchg(&n->state, val, new) != val);
+ } while (!try_cmpxchg(&n->state, &val, new));
return !(val & NAPIF_STATE_SCHED);
}
@@ -6066,9 +6062,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
local_irq_restore(flags);
}
+ val = READ_ONCE(n->state);
do {
- val = READ_ONCE(n->state);
-
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
@@ -6081,7 +6076,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
*/
new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
NAPIF_STATE_SCHED;
- } while (cmpxchg(&n->state, val, new) != val);
+ } while (!try_cmpxchg(&n->state, &val, new));
if (unlikely(val & NAPIF_STATE_MISSED)) {
__napi_schedule(n);
@@ -6402,19 +6397,16 @@ void napi_disable(struct napi_struct *n)
might_sleep();
set_bit(NAPI_STATE_DISABLE, &n->state);
- for ( ; ; ) {
- val = READ_ONCE(n->state);
- if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
+ val = READ_ONCE(n->state);
+ do {
+ while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
usleep_range(20, 200);
- continue;
+ val = READ_ONCE(n->state);
}
new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
-
- if (cmpxchg(&n->state, val, new) == val)
- break;
- }
+ } while (!try_cmpxchg(&n->state, &val, new));
hrtimer_cancel(&n->timer);
@@ -6431,16 +6423,15 @@ EXPORT_SYMBOL(napi_disable);
*/
void napi_enable(struct napi_struct *n)
{
- unsigned long val, new;
+ unsigned long new, val = READ_ONCE(n->state);
do {
- val = READ_ONCE(n->state);
BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
if (n->dev->threaded && n->thread)
new |= NAPIF_STATE_THREADED;
- } while (cmpxchg(&n->state, val, new) != val);
+ } while (!try_cmpxchg(&n->state, &val, new));
}
EXPORT_SYMBOL(napi_enable);
@@ -8347,7 +8338,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
dev_change_rx_flags(dev, IFF_PROMISC);
}
if (notify)
- __dev_notify_flags(dev, old_flags, IFF_PROMISC);
+ __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
return 0;
}
@@ -8402,7 +8393,7 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
dev_set_rx_mode(dev);
if (notify)
__dev_notify_flags(dev, old_flags,
- dev->gflags ^ old_gflags);
+ dev->gflags ^ old_gflags, 0, NULL);
}
return 0;
}
@@ -8565,12 +8556,13 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags,
}
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
- unsigned int gchanges)
+ unsigned int gchanges, u32 portid,
+ const struct nlmsghdr *nlh)
{
unsigned int changes = dev->flags ^ old_flags;
if (gchanges)
- rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
if (changes & IFF_UP) {
if (dev->flags & IFF_UP)
@@ -8612,7 +8604,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags,
return ret;
changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
- __dev_notify_flags(dev, old_flags, changes);
+ __dev_notify_flags(dev, old_flags, changes, 0, NULL);
return ret;
}
EXPORT_SYMBOL(dev_change_flags);
@@ -8818,7 +8810,7 @@ EXPORT_SYMBOL(dev_set_mac_address_user);
int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
{
- size_t size = sizeof(sa->sa_data);
+ size_t size = sizeof(sa->sa_data_min);
struct net_device *dev;
int ret = 0;
@@ -10055,7 +10047,7 @@ int register_netdevice(struct net_device *dev)
dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
write_unlock(&dev_base_lock);
if (ret)
- goto err_uninit;
+ goto err_uninit_notify;
__netdev_update_features(dev);
@@ -10097,11 +10089,13 @@ int register_netdevice(struct net_device *dev)
*/
if (!dev->rtnl_link_ops ||
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
- rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
out:
return ret;
+err_uninit_notify:
+ call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
err_uninit:
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
@@ -10375,24 +10369,16 @@ void netdev_run_todo(void)
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
const struct net_device_stats *netdev_stats)
{
-#if BITS_PER_LONG == 64
- BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
- memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
- /* zero out counters that only exist in rtnl_link_stats64 */
- memset((char *)stats64 + sizeof(*netdev_stats), 0,
- sizeof(*stats64) - sizeof(*netdev_stats));
-#else
- size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
- const unsigned long *src = (const unsigned long *)netdev_stats;
+ size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
+ const atomic_long_t *src = (atomic_long_t *)netdev_stats;
u64 *dst = (u64 *)stats64;
BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
for (i = 0; i < n; i++)
- dst[i] = src[i];
+ dst[i] = atomic_long_read(&src[i]);
/* zero out counters that only exist in rtnl_link_stats64 */
memset((char *)stats64 + n * sizeof(u64), 0,
sizeof(*stats64) - n * sizeof(u64));
-#endif
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
@@ -10473,12 +10459,12 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
stats = per_cpu_ptr(netstats, cpu);
do {
- start = u64_stats_fetch_begin_irq(&stats->syncp);
+ start = u64_stats_fetch_begin(&stats->syncp);
rx_packets = u64_stats_read(&stats->rx_packets);
rx_bytes = u64_stats_read(&stats->rx_bytes);
tx_packets = u64_stats_read(&stats->tx_packets);
tx_bytes = u64_stats_read(&stats->tx_bytes);
- } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+ } while (u64_stats_fetch_retry(&stats->syncp, start));
s->rx_packets += rx_packets;
s->rx_bytes += rx_bytes;
@@ -10776,14 +10762,8 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
}
EXPORT_SYMBOL(unregister_netdevice_queue);
-/**
- * unregister_netdevice_many - unregister many devices
- * @head: list of devices
- *
- * Note: As most callers use a stack allocated list_head,
- * we force a list_del() to make sure stack wont be corrupted later.
- */
-void unregister_netdevice_many(struct list_head *head)
+void unregister_netdevice_many_notify(struct list_head *head,
+ u32 portid, const struct nlmsghdr *nlh)
{
struct net_device *dev, *tmp;
LIST_HEAD(close_head);
@@ -10845,7 +10825,8 @@ void unregister_netdevice_many(struct list_head *head)
if (!dev->rtnl_link_ops ||
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
- GFP_KERNEL, NULL, 0);
+ GFP_KERNEL, NULL, 0,
+ portid, nlmsg_seq(nlh));
/*
* Flush the unicast and multicast chains
@@ -10856,11 +10837,13 @@ void unregister_netdevice_many(struct list_head *head)
netdev_name_node_alt_flush(dev);
netdev_name_node_free(dev->name_node);
+ call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
+
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
if (skb)
- rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
+ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
/* Notifier chain MUST detach us all upper devices. */
WARN_ON(netdev_has_any_upper_dev(dev));
@@ -10883,6 +10866,18 @@ void unregister_netdevice_many(struct list_head *head)
list_del(head);
}
+
+/**
+ * unregister_netdevice_many - unregister many devices
+ * @head: list of devices
+ *
+ * Note: As most callers use a stack allocated list_head,
+ * we force a list_del() to make sure the stack won't be corrupted later.
+ */
+void unregister_netdevice_many(struct list_head *head)
+{
+ unregister_netdevice_many_notify(head, 0, NULL); /* portid 0 and a NULL nlh */
+}
EXPORT_SYMBOL(unregister_netdevice_many);
/**
@@ -11038,7 +11033,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
* Prevent userspace races by waiting until the network
* device is fully setup before sending notifications.
*/
- rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
synchronize_net();
err = 0;
diff --git a/net/core/dev.h b/net/core/dev.h
index cbb8a925175a..814ed5b7b960 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -88,6 +88,13 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier);
void __dev_set_rx_mode(struct net_device *dev);
+void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
+ unsigned int gchanges, u32 portid,
+ const struct nlmsghdr *nlh);
+
+void unregister_netdevice_many_notify(struct list_head *head,
+ u32 portid, const struct nlmsghdr *nlh);
+
static inline void netif_set_gso_max_size(struct net_device *dev,
unsigned int size)
{
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 7674bb9f3076..5cdbfbf9a7dc 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -342,7 +342,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
if (ifr->ifr_hwaddr.sa_family != dev->type)
return -EINVAL;
memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
- min(sizeof(ifr->ifr_hwaddr.sa_data),
+ min(sizeof(ifr->ifr_hwaddr.sa_data_min),
(size_t)dev->addr_len));
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
return 0;
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 89baa7c0938b..0e10a8a68c5e 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -71,6 +71,7 @@ struct devlink {
refcount_t refcount;
struct completion comp;
struct rcu_head rcu;
+ struct notifier_block netdevice_nb;
char priv[] __aligned(NETDEV_ALIGN);
};
@@ -769,7 +770,7 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id)
#define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3)
#define DEVLINK_NL_FLAG_NEED_LINECARD BIT(4)
-static int devlink_nl_pre_doit(const struct genl_ops *ops,
+static int devlink_nl_pre_doit(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info)
{
struct devlink_linecard *linecard;
@@ -827,7 +828,7 @@ unlock:
return err;
}
-static void devlink_nl_post_doit(const struct genl_ops *ops,
+static void devlink_nl_post_doit(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info)
{
struct devlink_linecard *linecard;
@@ -879,6 +880,24 @@ nla_put_failure:
return -EMSGSIZE;
}
+int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port)
+{
+ if (devlink_nl_put_handle(msg, devlink_port->devlink)) /* handle of the port's devlink */
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) /* then the port index */
+ return -EMSGSIZE;
+ return 0; /* both puts succeeded */
+}
+
+size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port)
+{
+ struct devlink *devlink = devlink_port->devlink; /* bus and device names are read from its ->dev */
+
+ return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME, +1 for the NUL */
+ + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME, +1 for the NUL */
+ + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX, a 4-byte payload */
+}
+
struct devlink_reload_combination {
enum devlink_reload_action action;
enum devlink_reload_limit limit;
@@ -1184,6 +1203,14 @@ static int devlink_nl_rate_fill(struct sk_buff *msg,
devlink_rate->tx_max, DEVLINK_ATTR_PAD))
goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY,
+ devlink_rate->tx_priority))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_WEIGHT,
+ devlink_rate->tx_weight))
+ goto nla_put_failure;
+
if (devlink_rate->parent)
if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME,
devlink_rate->parent->name))
@@ -1292,8 +1319,6 @@ static int devlink_nl_port_fill(struct sk_buff *msg,
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
goto nla_put_failure;
- /* Hold rtnl lock while accessing port's netdev attributes. */
- rtnl_lock();
spin_lock_bh(&devlink_port->type_lock);
if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
goto nla_put_failure_type_locked;
@@ -1302,18 +1327,15 @@ static int devlink_nl_port_fill(struct sk_buff *msg,
devlink_port->desired_type))
goto nla_put_failure_type_locked;
if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
- struct net *net = devlink_net(devlink_port->devlink);
- struct net_device *netdev = devlink_port->type_dev;
-
- if (netdev && net_eq(net, dev_net(netdev)) &&
+ if (devlink_port->type_eth.netdev &&
(nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
- netdev->ifindex) ||
+ devlink_port->type_eth.ifindex) ||
nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
- netdev->name)))
+ devlink_port->type_eth.ifname)))
goto nla_put_failure_type_locked;
}
if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {
- struct ib_device *ibdev = devlink_port->type_dev;
+ struct ib_device *ibdev = devlink_port->type_ib.ibdev;
if (ibdev &&
nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME,
@@ -1321,7 +1343,6 @@ static int devlink_nl_port_fill(struct sk_buff *msg,
goto nla_put_failure_type_locked;
}
spin_unlock_bh(&devlink_port->type_lock);
- rtnl_unlock();
if (devlink_nl_port_attrs_put(msg, devlink_port))
goto nla_put_failure;
if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack))
@@ -1336,7 +1357,6 @@ static int devlink_nl_port_fill(struct sk_buff *msg,
nla_put_failure_type_locked:
spin_unlock_bh(&devlink_port->type_lock);
- rtnl_unlock();
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
@@ -1867,10 +1887,8 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
int err = -EOPNOTSUPP;
parent = devlink_rate->parent;
- if (parent && len) {
- NL_SET_ERR_MSG_MOD(info->extack, "Rate object already has parent.");
- return -EBUSY;
- } else if (parent && !len) {
+
+ if (parent && !len) {
if (devlink_rate_is_leaf(devlink_rate))
err = ops->rate_leaf_parent_set(devlink_rate, NULL,
devlink_rate->priv, NULL,
@@ -1884,7 +1902,7 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
refcount_dec(&parent->refcnt);
devlink_rate->parent = NULL;
- } else if (!parent && len) {
+ } else if (len) {
parent = devlink_rate_node_get_by_name(devlink, parent_name);
if (IS_ERR(parent))
return -ENODEV;
@@ -1911,6 +1929,10 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
if (err)
return err;
+ if (devlink_rate->parent)
+ /* we're reassigning to other parent in this case */
+ refcount_dec(&devlink_rate->parent->refcnt);
+
refcount_inc(&parent->refcnt);
devlink_rate->parent = parent;
}
@@ -1924,6 +1946,8 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
{
struct nlattr *nla_parent, **attrs = info->attrs;
int err = -EOPNOTSUPP;
+ u32 priority;
+ u32 weight;
u64 rate;
if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) {
@@ -1952,6 +1976,34 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
devlink_rate->tx_max = rate;
}
+ if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]) {
+ priority = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]);
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tx_priority_set(devlink_rate, devlink_rate->priv,
+ priority, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tx_priority_set(devlink_rate, devlink_rate->priv,
+ priority, info->extack);
+
+ if (err)
+ return err;
+ devlink_rate->tx_priority = priority;
+ }
+
+ if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]) {
+ weight = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]);
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tx_weight_set(devlink_rate, devlink_rate->priv,
+ weight, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tx_weight_set(devlink_rate, devlink_rate->priv,
+ weight, info->extack);
+
+ if (err)
+ return err;
+ devlink_rate->tx_weight = weight;
+ }
+
nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME];
if (nla_parent) {
err = devlink_nl_rate_parent_node_set(devlink_rate, info,
@@ -1983,6 +2035,18 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the leafs");
return false;
}
+ if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TX_PRIORITY],
+ "TX priority set isn't supported for the leafs");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_leaf_tx_weight_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TX_WEIGHT],
+ "TX weight set isn't supported for the leafs");
+ return false;
+ }
} else if (type == DEVLINK_RATE_TYPE_NODE) {
if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) {
NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes");
@@ -1997,6 +2061,18 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the nodes");
return false;
}
+ if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TX_PRIORITY],
+ "TX priority set isn't supported for the nodes");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_node_tx_weight_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TX_WEIGHT],
+ "TX weight set isn't supported for the nodes");
+ return false;
+ }
} else {
WARN(1, "Unknown type of rate object");
return false;
@@ -4490,8 +4566,11 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net,
if (err)
return err;
- if (dest_net && !net_eq(dest_net, curr_net))
+ if (dest_net && !net_eq(dest_net, curr_net)) {
+ move_netdevice_notifier_net(curr_net, dest_net,
+ &devlink->netdevice_nb);
write_pnet(&devlink->_net, dest_net);
+ }
err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack);
devlink_reload_failed_set(devlink, !!err);
@@ -7767,8 +7846,6 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
return -ECANCELED;
}
- reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
-
if (reporter->auto_dump) {
mutex_lock(&reporter->dump_lock);
/* store current dump of current error, for later analysis */
@@ -8304,10 +8381,10 @@ static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats,
cpu_stats = per_cpu_ptr(trap_stats, i);
do {
- start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
rx_packets = u64_stats_read(&cpu_stats->rx_packets);
rx_bytes = u64_stats_read(&cpu_stats->rx_bytes);
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
u64_stats_add(&stats->rx_packets, rx_packets);
u64_stats_add(&stats->rx_bytes, rx_bytes);
@@ -9172,6 +9249,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 },
[DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED },
+ [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32 },
};
static const struct genl_small_ops devlink_nl_ops[] = {
@@ -9602,6 +9681,9 @@ void devlink_set_features(struct devlink *devlink, u64 features)
}
EXPORT_SYMBOL_GPL(devlink_set_features);
+static int devlink_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr);
+
/**
* devlink_alloc_ns - Allocate new devlink instance resources
* in specific namespace
@@ -9632,10 +9714,13 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b,
&last_id, GFP_KERNEL);
- if (ret < 0) {
- kfree(devlink);
- return NULL;
- }
+ if (ret < 0)
+ goto err_xa_alloc;
+
+ devlink->netdevice_nb.notifier_call = devlink_netdevice_event;
+ ret = register_netdevice_notifier_net(net, &devlink->netdevice_nb);
+ if (ret)
+ goto err_register_netdevice_notifier;
devlink->dev = dev;
devlink->ops = ops;
@@ -9662,6 +9747,12 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
init_completion(&devlink->comp);
return devlink;
+
+err_register_netdevice_notifier:
+ xa_erase(&devlinks, devlink->index);
+err_xa_alloc:
+ kfree(devlink);
+ return NULL;
}
EXPORT_SYMBOL_GPL(devlink_alloc_ns);
@@ -9815,6 +9906,10 @@ void devlink_free(struct devlink *devlink)
WARN_ON(!list_empty(&devlink->port_list));
xa_destroy(&devlink->snapshot_ids);
+
+ WARN_ON_ONCE(unregister_netdevice_notifier_net(devlink_net(devlink),
+ &devlink->netdevice_nb));
+
xa_erase(&devlinks, devlink->index);
kfree(devlink);
@@ -9967,6 +10062,7 @@ EXPORT_SYMBOL_GPL(devlink_port_register);
void devl_port_unregister(struct devlink_port *devlink_port)
{
lockdep_assert_held(&devlink_port->devlink->lock);
+ WARN_ON(devlink_port->type != DEVLINK_PORT_TYPE_NOTSET);
devlink_port_type_warn_cancel(devlink_port);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
@@ -9994,20 +10090,6 @@ void devlink_port_unregister(struct devlink_port *devlink_port)
}
EXPORT_SYMBOL_GPL(devlink_port_unregister);
-static void __devlink_port_type_set(struct devlink_port *devlink_port,
- enum devlink_port_type type,
- void *type_dev)
-{
- ASSERT_DEVLINK_PORT_REGISTERED(devlink_port);
-
- devlink_port_type_warn_cancel(devlink_port);
- spin_lock_bh(&devlink_port->type_lock);
- devlink_port->type = type;
- devlink_port->type_dev = type_dev;
- spin_unlock_bh(&devlink_port->type_lock);
- devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
-}
-
static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port,
struct net_device *netdev)
{
@@ -10045,23 +10127,58 @@ static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port,
}
}
+static void __devlink_port_type_set(struct devlink_port *devlink_port,
+ enum devlink_port_type type,
+ void *type_dev)
+{
+ struct net_device *netdev = type_dev; /* only used when type is DEVLINK_PORT_TYPE_ETH */
+
+ ASSERT_DEVLINK_PORT_REGISTERED(devlink_port);
+ /* Clearing the type schedules the port type warning; setting a type cancels it. */
+ if (type == DEVLINK_PORT_TYPE_NOTSET) {
+ devlink_port_type_warn_schedule(devlink_port);
+ } else {
+ devlink_port_type_warn_cancel(devlink_port);
+ if (type == DEVLINK_PORT_TYPE_ETH && netdev)
+ devlink_port_type_netdev_checks(devlink_port, netdev);
+ }
+ /* Store the type and the matching per-type data under type_lock. */
+ spin_lock_bh(&devlink_port->type_lock);
+ devlink_port->type = type;
+ switch (type) {
+ case DEVLINK_PORT_TYPE_ETH:
+ devlink_port->type_eth.netdev = netdev;
+ if (netdev) {
+ ASSERT_RTNL(); /* the netdev's ifindex and name are copied below */
+ devlink_port->type_eth.ifindex = netdev->ifindex;
+ BUILD_BUG_ON(sizeof(devlink_port->type_eth.ifname) !=
+ sizeof(netdev->name));
+ strcpy(devlink_port->type_eth.ifname, netdev->name); /* buffer sizes are checked equal at build time above */
+ }
+ break;
+ case DEVLINK_PORT_TYPE_IB:
+ devlink_port->type_ib.ibdev = type_dev;
+ break;
+ default:
+ break;
+ }
+ spin_unlock_bh(&devlink_port->type_lock);
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); /* sent unconditionally, whatever the new type */
+}
+
/**
* devlink_port_type_eth_set - Set port type to Ethernet
*
* @devlink_port: devlink port
- * @netdev: related netdevice
+ *
+ * If driver is calling this, most likely it is doing something wrong.
*/
-void devlink_port_type_eth_set(struct devlink_port *devlink_port,
- struct net_device *netdev)
+void devlink_port_type_eth_set(struct devlink_port *devlink_port)
{
- if (netdev)
- devlink_port_type_netdev_checks(devlink_port, netdev);
- else
- dev_warn(devlink_port->devlink->dev,
- "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n",
- devlink_port->index);
-
- __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, netdev);
+ dev_warn(devlink_port->devlink->dev,
+ "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n",
+ devlink_port->index);
+ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL);
}
EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
@@ -10082,14 +10199,71 @@ EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
* devlink_port_type_clear - Clear port type
*
* @devlink_port: devlink port
+ *
+ * If driver is calling this for clearing Ethernet type, most likely
+ * it is doing something wrong.
*/
void devlink_port_type_clear(struct devlink_port *devlink_port)
{
+ if (devlink_port->type == DEVLINK_PORT_TYPE_ETH)
+ dev_warn(devlink_port->devlink->dev,
+ "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n",
+ devlink_port->index);
__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL);
- devlink_port_type_warn_schedule(devlink_port);
}
EXPORT_SYMBOL_GPL(devlink_port_type_clear);
+static int devlink_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct devlink_port *devlink_port = netdev->devlink_port;
+ struct devlink *devlink;
+
+ devlink = container_of(nb, struct devlink, netdevice_nb); /* nb is embedded in struct devlink */
+ /* Skip netdevs with no devlink port, or whose port belongs to another devlink. */
+ if (!devlink_port || devlink_port->devlink != devlink)
+ return NOTIFY_OK;
+
+ switch (event) {
+ case NETDEV_POST_INIT:
+ /* Set the type but not netdev pointer. It is going to be set
+ * later on by NETDEV_REGISTER event. Happens once during
+ * netdevice register
+ */
+ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH,
+ NULL);
+ break;
+ case NETDEV_REGISTER:
+ case NETDEV_CHANGENAME:
+ /* Set the netdev on top of previously set type. Note this
+ * event happens also during net namespace change so here
+ * we take into account netdev pointer appearing in this
+ * namespace.
+ */
+ __devlink_port_type_set(devlink_port, devlink_port->type,
+ netdev);
+ break;
+ case NETDEV_UNREGISTER:
+ /* Clear netdev pointer, but not the type. This event happens
+ * also during net namespace change so we need to clear
+ * pointer to netdev that is going to another net namespace.
+ */
+ __devlink_port_type_set(devlink_port, devlink_port->type,
+ NULL);
+ break;
+ case NETDEV_PRE_UNINIT:
+ /* Clear the type and the netdev pointer. Happens once during
+ * netdevice unregister.
+ */
+ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET,
+ NULL);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
static int __devlink_port_attrs_set(struct devlink_port *devlink_port,
enum devlink_port_flavour flavour)
{
@@ -10211,13 +10385,60 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set);
/**
+ * devl_rate_node_create - create devlink rate node
+ * @devlink: devlink instance
+ * @priv: driver private data
+ * @node_name: name of the resulting node
+ * @parent: parent devlink_rate struct
+ *
+ * Create devlink rate object of type node
+ */
+struct devlink_rate *
+devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name,
+ struct devlink_rate *parent)
+{
+ struct devlink_rate *rate_node;
+
+ rate_node = devlink_rate_node_get_by_name(devlink, node_name);
+ if (!IS_ERR(rate_node))
+ return ERR_PTR(-EEXIST);
+
+ rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL);
+ if (!rate_node)
+ return ERR_PTR(-ENOMEM);
+
+ rate_node->type = DEVLINK_RATE_TYPE_NODE;
+ rate_node->devlink = devlink;
+ rate_node->priv = priv;
+
+ rate_node->name = kstrdup(node_name, GFP_KERNEL);
+ if (!rate_node->name) {
+ kfree(rate_node);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* Take the parent reference only after the last failure point, so no error path leaks it. */
+ if (parent) {
+ rate_node->parent = parent;
+ refcount_inc(&rate_node->parent->refcnt);
+ }
+ refcount_set(&rate_node->refcnt, 1);
+ list_add(&rate_node->list, &devlink->rate_list);
+ devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW);
+ return rate_node;
+}
+EXPORT_SYMBOL_GPL(devl_rate_node_create);
+
+/**
* devl_rate_leaf_create - create devlink rate leaf
* @devlink_port: devlink port object to create rate object on
* @priv: driver private data
+ * @parent: parent devlink_rate struct
*
* Create devlink rate object of type leaf on provided @devlink_port.
*/
-int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv)
+int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv,
+ struct devlink_rate *parent)
{
struct devlink *devlink = devlink_port->devlink;
struct devlink_rate *devlink_rate;
@@ -10231,6 +10452,11 @@ int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv)
if (!devlink_rate)
return -ENOMEM;
+ if (parent) {
+ devlink_rate->parent = parent;
+ refcount_inc(&devlink_rate->parent->refcnt);
+ }
+
devlink_rate->type = DEVLINK_RATE_TYPE_LEAF;
devlink_rate->devlink = devlink;
devlink_rate->devlink_port = devlink_port;
@@ -11624,6 +11850,8 @@ static const struct devlink_trap devlink_trap_generic[] = {
DEVLINK_TRAP(ESP_PARSING, DROP),
DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP),
DEVLINK_TRAP(DMAC_FILTER, DROP),
+ DEVLINK_TRAP(EAPOL, CONTROL),
+ DEVLINK_TRAP(LOCKED_PORT, DROP),
};
#define DEVLINK_TRAP_GROUP(_id) \
@@ -11659,6 +11887,7 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = {
DEVLINK_TRAP_GROUP(ACL_SAMPLE),
DEVLINK_TRAP_GROUP(ACL_TRAP),
DEVLINK_TRAP_GROUP(PARSER_ERROR_DROPS),
+ DEVLINK_TRAP_GROUP(EAPOL),
};
static int devlink_trap_generic_verify(const struct devlink_trap *trap)
@@ -12016,7 +12245,7 @@ devlink_trap_report_metadata_set(struct devlink_trap_metadata *metadata,
spin_lock(&in_devlink_port->type_lock);
if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH)
- metadata->input_dev = in_devlink_port->type_dev;
+ metadata->input_dev = in_devlink_port->type_eth.netdev;
spin_unlock(&in_devlink_port->type_lock);
}
@@ -12416,14 +12645,6 @@ free_msg:
nlmsg_free(msg);
}
-static struct devlink_port *netdev_to_devlink_port(struct net_device *dev)
-{
- if (!dev->netdev_ops->ndo_get_devlink_port)
- return NULL;
-
- return dev->netdev_ops->ndo_get_devlink_port(dev);
-}
-
void devlink_compat_running_version(struct devlink *devlink,
char *buf, size_t len)
{
@@ -12469,7 +12690,7 @@ int devlink_compat_phys_port_name_get(struct net_device *dev,
*/
ASSERT_RTNL();
- devlink_port = netdev_to_devlink_port(dev);
+ devlink_port = dev->devlink_port;
if (!devlink_port)
return -EOPNOTSUPP;
@@ -12485,7 +12706,7 @@ int devlink_compat_switch_id_get(struct net_device *dev,
* devlink_port instance cannot disappear in the middle. No need to take
* any devlink lock as only permanent values are accessed.
*/
- devlink_port = netdev_to_devlink_port(dev);
+ devlink_port = dev->devlink_port;
if (!devlink_port || !devlink_port->switch_port)
return -EOPNOTSUPP;
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index f084a4a6b7ab..5a782d1d8fd3 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -1432,9 +1432,9 @@ static void net_dm_stats_read(struct net_dm_stats *stats)
u64 dropped;
do {
- start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
dropped = u64_stats_read(&cpu_stats->dropped);
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
u64_stats_add(&stats->dropped, dropped);
}
@@ -1476,9 +1476,9 @@ static void net_dm_hw_stats_read(struct net_dm_stats *stats)
u64 dropped;
do {
- start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
dropped = u64_stats_read(&cpu_stats->dropped);
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
u64_stats_add(&stats->dropped, dropped);
}
@@ -1620,7 +1620,7 @@ static const struct genl_small_ops dropmon_ops[] = {
},
};
-static int net_dm_nl_pre_doit(const struct genl_ops *ops,
+static int net_dm_nl_pre_doit(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info)
{
mutex_lock(&net_dm_mutex);
@@ -1628,7 +1628,7 @@ static int net_dm_nl_pre_doit(const struct genl_ops *ops,
return 0;
}
-static void net_dm_nl_post_doit(const struct genl_ops *ops,
+static void net_dm_nl_post_doit(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info)
{
mutex_unlock(&net_dm_mutex);
diff --git a/net/core/failover.c b/net/core/failover.c
index 864d2d83eff4..655411c4ca51 100644
--- a/net/core/failover.c
+++ b/net/core/failover.c
@@ -80,14 +80,14 @@ static int failover_slave_register(struct net_device *slave_dev)
goto err_upper_link;
}
- slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK);
+ slave_dev->priv_flags |= IFF_FAILOVER_SLAVE;
if (fops && fops->slave_register &&
!fops->slave_register(slave_dev, failover_dev))
return NOTIFY_OK;
netdev_upper_dev_unlink(slave_dev, failover_dev);
- slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK);
+ slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
err_upper_link:
netdev_rx_handler_unregister(slave_dev);
done:
@@ -121,7 +121,7 @@ int failover_slave_unregister(struct net_device *slave_dev)
netdev_rx_handler_unregister(slave_dev);
netdev_upper_dev_unlink(slave_dev, failover_dev);
- slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK);
+ slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
if (fops && fops->slave_unregister &&
!fops->slave_unregister(slave_dev, failover_dev))
diff --git a/net/core/filter.c b/net/core/filter.c
index bb0136e7a8e4..37baaa6b8fc3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -325,11 +325,11 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
offsetof(struct sk_buff, vlan_tci));
break;
case SKF_AD_VLAN_TAG_PRESENT:
- *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET);
- if (PKT_VLAN_PRESENT_BIT)
- *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT);
- if (PKT_VLAN_PRESENT_BIT < 7)
- *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4);
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, vlan_all));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
+ *insn++ = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1);
break;
}
@@ -2124,6 +2124,11 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
{
unsigned int mlen = skb_network_offset(skb);
+ if (unlikely(skb->len <= mlen)) {
+ kfree_skb(skb);
+ return -ERANGE;
+ }
+
if (mlen) {
__skb_pull(skb, mlen);
@@ -2145,7 +2150,7 @@ static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
u32 flags)
{
/* Verify that a link layer header is carried */
- if (unlikely(skb->mac_header >= skb->network_header)) {
+ if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) {
kfree_skb(skb);
return -ERANGE;
}
@@ -4104,7 +4109,10 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
.arg2_type = ARG_ANYTHING,
};
-/* XDP_REDIRECT works by a three-step process, implemented in the functions
+/**
+ * DOC: xdp redirect
+ *
+ * XDP_REDIRECT works by a three-step process, implemented in the functions
* below:
*
* 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
@@ -4119,7 +4127,8 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
* 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(),
* which will flush all the different bulk queues, thus completing the
* redirect.
- *
+ */
+/*
* Pointers to the map entries will be kept around for this whole sequence of
* steps, protected by RCU. However, there is no top-level rcu_read_lock() in
* the core code; instead, the RCU protection relies on everything happening
@@ -4410,10 +4419,10 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = {
.arg2_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
+BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key,
u64, flags)
{
- return map->ops->map_redirect(map, ifindex, flags);
+ return map->ops->map_redirect(map, key, flags);
}
static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
@@ -6428,7 +6437,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
else
sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
dst4, tuple->ipv4.dport,
- dif, sdif, &udp_table, NULL);
+ dif, sdif, net->ipv4.udp_table, NULL);
#if IS_ENABLED(CONFIG_IPV6)
} else {
struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
@@ -6444,7 +6453,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
src6, tuple->ipv6.sport,
dst6, tuple->ipv6.dport,
dif, sdif,
- &udp_table, NULL);
+ net->ipv4.udp_table, NULL);
#endif
}
@@ -8647,28 +8656,25 @@ static bool tc_cls_act_is_valid_access(int off, int size,
DEFINE_MUTEX(nf_conn_btf_access_lock);
EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock);
-int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf,
- const struct btf_type *t, int off, int size,
- enum bpf_access_type atype, u32 *next_btf_id,
- enum bpf_type_flag *flag);
+int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size, enum bpf_access_type atype,
+ u32 *next_btf_id, enum bpf_type_flag *flag);
EXPORT_SYMBOL_GPL(nfct_btf_struct_access);
static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
- const struct btf *btf,
- const struct btf_type *t, int off,
- int size, enum bpf_access_type atype,
- u32 *next_btf_id,
- enum bpf_type_flag *flag)
+ const struct bpf_reg_state *reg,
+ int off, int size, enum bpf_access_type atype,
+ u32 *next_btf_id, enum bpf_type_flag *flag)
{
int ret = -EACCES;
if (atype == BPF_READ)
- return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
- flag);
+ return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
mutex_lock(&nf_conn_btf_access_lock);
if (nfct_btf_struct_access)
- ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag);
+ ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
mutex_unlock(&nf_conn_btf_access_lock);
return ret;
@@ -8734,21 +8740,18 @@ void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog,
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
static int xdp_btf_struct_access(struct bpf_verifier_log *log,
- const struct btf *btf,
- const struct btf_type *t, int off,
- int size, enum bpf_access_type atype,
- u32 *next_btf_id,
- enum bpf_type_flag *flag)
+ const struct bpf_reg_state *reg,
+ int off, int size, enum bpf_access_type atype,
+ u32 *next_btf_id, enum bpf_type_flag *flag)
{
int ret = -EACCES;
if (atype == BPF_READ)
- return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
- flag);
+ return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
mutex_lock(&nf_conn_btf_access_lock);
if (nfct_btf_struct_access)
- ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag);
+ ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
mutex_unlock(&nf_conn_btf_access_lock);
return ret;
@@ -8921,6 +8924,10 @@ static bool sock_ops_is_valid_access(int off, int size,
bpf_ctx_record_field_size(info, size_default);
return bpf_ctx_narrow_access_ok(off, size,
size_default);
+ case offsetof(struct bpf_sock_ops, skb_hwtstamp):
+ if (size != sizeof(__u64))
+ return false;
+ break;
default:
if (size != size_default)
return false;
@@ -9104,21 +9111,21 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
return insn;
}
-static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si,
+static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg,
struct bpf_insn *insn)
{
/* si->dst_reg = skb_shinfo(SKB); */
#ifdef NET_SKBUFF_DATA_USES_OFFSET
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
- BPF_REG_AX, si->src_reg,
+ BPF_REG_AX, skb_reg,
offsetof(struct sk_buff, end));
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
- si->dst_reg, si->src_reg,
+ dst_reg, skb_reg,
offsetof(struct sk_buff, head));
- *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
+ *insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX);
#else
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
- si->dst_reg, si->src_reg,
+ dst_reg, skb_reg,
offsetof(struct sk_buff, end));
#endif
@@ -9290,13 +9297,11 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct __sk_buff, vlan_present):
- *target_size = 1;
- *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
- PKT_VLAN_PRESENT_OFFSET);
- if (PKT_VLAN_PRESENT_BIT)
- *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT);
- if (PKT_VLAN_PRESENT_BIT < 7)
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1);
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff,
+ vlan_all, 4, target_size));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1);
break;
case offsetof(struct __sk_buff, vlan_tci):
@@ -9511,7 +9516,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct __sk_buff, gso_segs):
- insn = bpf_convert_shinfo_access(si, insn);
+ insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
si->dst_reg, si->dst_reg,
bpf_target_off(struct skb_shared_info,
@@ -9519,7 +9524,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
target_size));
break;
case offsetof(struct __sk_buff, gso_size):
- insn = bpf_convert_shinfo_access(si, insn);
+ insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size),
si->dst_reg, si->dst_reg,
bpf_target_off(struct skb_shared_info,
@@ -9546,7 +9551,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8);
BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0);
- insn = bpf_convert_shinfo_access(si, insn);
+ insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
*insn++ = BPF_LDX_MEM(BPF_DW,
si->dst_reg, si->dst_reg,
bpf_target_off(struct skb_shared_info,
@@ -10396,6 +10401,25 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
tcp_flags),
si->dst_reg, si->dst_reg, off);
break;
+ case offsetof(struct bpf_sock_ops, skb_hwtstamp): {
+ struct bpf_insn *jmp_on_null_skb;
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ /* Reserve one insn to test skb == NULL */
+ jmp_on_null_skb = insn++;
+ insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn);
+ *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct skb_shared_info,
+ hwtstamps, 8,
+ target_size));
+ *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0,
+ insn - jmp_on_null_skb - 1);
+ break;
+ }
}
return insn - insn_buf;
}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 25cd35f5922e..25fb0bbc310f 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -296,7 +296,7 @@ skb_flow_dissect_ct(const struct sk_buff *skb,
key->ct_zone = ct->zone.id;
#endif
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
- key->ct_mark = ct->mark;
+ key->ct_mark = READ_ONCE(ct->mark);
#endif
cl = nf_ct_labels_find(ct);
@@ -971,12 +971,14 @@ bool __skb_flow_dissect(const struct net *net,
#if IS_ENABLED(CONFIG_NET_DSA)
if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) &&
proto == htons(ETH_P_XDSA))) {
+ struct metadata_dst *md_dst = skb_metadata_dst(skb);
const struct dsa_device_ops *ops;
int offset = 0;
ops = skb->dev->dsa_ptr->tag_ops;
/* Only DSA header taggers break flow dissection */
- if (ops->needed_headroom) {
+ if (ops->needed_headroom &&
+ (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)) {
if (ops->flow_dissect)
ops->flow_dissect(skb, &proto, &offset);
else
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index abe423fd5736..acfc1f88ea79 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -97,6 +97,13 @@ void flow_rule_match_cvlan(const struct flow_rule *rule,
}
EXPORT_SYMBOL(flow_rule_match_cvlan);
+void flow_rule_match_arp(const struct flow_rule *rule,
+ struct flow_match_arp *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ARP, out);
+}
+EXPORT_SYMBOL(flow_rule_match_arp);
+
void flow_rule_match_ipv4_addrs(const struct flow_rule *rule,
struct flow_match_ipv4_addrs *out)
{
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index c8d137ef5980..b71ccaec0991 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -135,10 +135,10 @@ static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats,
u64 bytes, packets;
do {
- start = u64_stats_fetch_begin_irq(&bcpu->syncp);
+ start = u64_stats_fetch_begin(&bcpu->syncp);
bytes = u64_stats_read(&bcpu->bytes);
packets = u64_stats_read(&bcpu->packets);
- } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
+ } while (u64_stats_fetch_retry(&bcpu->syncp, start));
t_bytes += bytes;
t_packets += packets;
@@ -162,10 +162,10 @@ void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats,
}
do {
if (running)
- start = u64_stats_fetch_begin_irq(&b->syncp);
+ start = u64_stats_fetch_begin(&b->syncp);
bytes = u64_stats_read(&b->bytes);
packets = u64_stats_read(&b->packets);
- } while (running && u64_stats_fetch_retry_irq(&b->syncp, start));
+ } while (running && u64_stats_fetch_retry(&b->syncp, start));
_bstats_update(bstats, bytes, packets);
}
@@ -187,10 +187,10 @@ static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets,
u64 bytes, packets;
do {
- start = u64_stats_fetch_begin_irq(&bcpu->syncp);
+ start = u64_stats_fetch_begin(&bcpu->syncp);
bytes = u64_stats_read(&bcpu->bytes);
packets = u64_stats_read(&bcpu->packets);
- } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
+ } while (u64_stats_fetch_retry(&bcpu->syncp, start));
t_bytes += bytes;
t_packets += packets;
@@ -201,10 +201,10 @@ static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets,
}
do {
if (running)
- start = u64_stats_fetch_begin_irq(&b->syncp);
+ start = u64_stats_fetch_begin(&b->syncp);
*ret_bytes = u64_stats_read(&b->bytes);
*ret_packets = u64_stats_read(&b->packets);
- } while (running && u64_stats_fetch_retry_irq(&b->syncp, start));
+ } while (running && u64_stats_fetch_retry(&b->syncp, start));
}
static int
diff --git a/net/core/gro.c b/net/core/gro.c
index bc9451743307..fd8c6a7e8d3e 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -370,9 +370,7 @@ static void gro_list_prepare(const struct list_head *head,
}
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
- diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
- if (skb_vlan_tag_present(p))
- diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
+ diffs |= p->vlan_all ^ skb->vlan_all;
diffs |= skb_metadata_differs(p, skb);
if (maclen == ETH_HLEN)
diffs |= compare_ether_header(skb_mac_header(p),
@@ -489,45 +487,45 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
- if (ptype->type != type || !ptype->callbacks.gro_receive)
- continue;
-
- skb_set_network_header(skb, skb_gro_offset(skb));
- skb_reset_mac_len(skb);
- BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32));
- BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
- sizeof(u32))); /* Avoid slow unaligned acc */
- *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
- NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
- NAPI_GRO_CB(skb)->is_atomic = 1;
- NAPI_GRO_CB(skb)->count = 1;
- if (unlikely(skb_is_gso(skb))) {
- NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs;
- /* Only support TCP at the moment. */
- if (!skb_is_gso_tcp(skb))
- NAPI_GRO_CB(skb)->flush = 1;
- }
-
- /* Setup for GRO checksum validation */
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- NAPI_GRO_CB(skb)->csum = skb->csum;
- NAPI_GRO_CB(skb)->csum_valid = 1;
- break;
- case CHECKSUM_UNNECESSARY:
- NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
- break;
- }
+ if (ptype->type == type && ptype->callbacks.gro_receive)
+ goto found_ptype;
+ }
+ rcu_read_unlock();
+ goto normal;
+
+found_ptype:
+ skb_set_network_header(skb, skb_gro_offset(skb));
+ skb_reset_mac_len(skb);
+ BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32));
+ BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
+ sizeof(u32))); /* Avoid slow unaligned acc */
+ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
+ NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
+ NAPI_GRO_CB(skb)->is_atomic = 1;
+ NAPI_GRO_CB(skb)->count = 1;
+ if (unlikely(skb_is_gso(skb))) {
+ NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs;
+ /* Only support TCP at the moment. */
+ if (!skb_is_gso_tcp(skb))
+ NAPI_GRO_CB(skb)->flush = 1;
+ }
- pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
- ipv6_gro_receive, inet_gro_receive,
- &gro_list->list, skb);
+ /* Setup for GRO checksum validation */
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ NAPI_GRO_CB(skb)->csum = skb->csum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+ break;
+ case CHECKSUM_UNNECESSARY:
+ NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
break;
}
- rcu_read_unlock();
- if (&ptype->list == head)
- goto normal;
+ pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
+ ipv6_gro_receive, inet_gro_receive,
+ &gro_list->list, skb);
+
+ rcu_read_unlock();
if (PTR_ERR(pp) == -EINPROGRESS) {
ret = GRO_CONSUMED;
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index aa6cb1f90966..c469d1c4db5d 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -38,9 +38,23 @@ static unsigned char default_operstate(const struct net_device *dev)
if (netif_testing(dev))
return IF_OPER_TESTING;
- if (!netif_carrier_ok(dev))
- return (dev->ifindex != dev_get_iflink(dev) ?
- IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN);
+ /* Some uppers (DSA) have additional sources for being down, so
+ * first check whether lower is indeed the source of its down state.
+ */
+ if (!netif_carrier_ok(dev)) {
+ int iflink = dev_get_iflink(dev);
+ struct net_device *peer;
+
+ if (iflink == dev->ifindex)
+ return IF_OPER_DOWN;
+
+ peer = __dev_get_by_index(dev_net(dev), iflink);
+ if (!peer)
+ return IF_OPER_DOWN;
+
+ return netif_carrier_ok(peer) ? IF_OPER_DOWN :
+ IF_OPER_LOWERLAYERDOWN;
+ }
if (netif_dormant(dev))
return IF_OPER_DORMANT;
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 6fac2f0ef074..711cd3b4347a 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -48,9 +48,11 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
return "RPL";
case LWTUNNEL_ENCAP_IOAM6:
return "IOAM6";
+ case LWTUNNEL_ENCAP_XFRM:
+ /* module autoload not supported for encap type */
+ return NULL;
case LWTUNNEL_ENCAP_IP6:
case LWTUNNEL_ENCAP_IP:
- case LWTUNNEL_ENCAP_XFRM:
case LWTUNNEL_ENCAP_NONE:
case __LWTUNNEL_ENCAP_MAX:
/* should not have got here */
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index e93edb810103..952a54763358 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -111,7 +111,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)
unsigned long neigh_rand_reach_time(unsigned long base)
{
- return base ? (prandom_u32() % base) + (base >> 1) : 0;
+ return base ? prandom_u32_max(base) + (base >> 1) : 0;
}
EXPORT_SYMBOL(neigh_rand_reach_time);
@@ -307,7 +307,31 @@ static int neigh_del_timer(struct neighbour *n)
return 0;
}
-static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net)
+static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev,
+ int family)
+{
+ switch (family) {
+ case AF_INET:
+ return __in_dev_arp_parms_get_rcu(dev);
+ case AF_INET6:
+ return __in6_dev_nd_parms_get_rcu(dev);
+ }
+ return NULL;
+}
+
+static void neigh_parms_qlen_dec(struct net_device *dev, int family)
+{
+ struct neigh_parms *p;
+
+ rcu_read_lock();
+ p = neigh_get_dev_parms_rcu(dev, family);
+ if (p)
+ p->qlen--;
+ rcu_read_unlock();
+}
+
+static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net,
+ int family)
{
struct sk_buff_head tmp;
unsigned long flags;
@@ -321,13 +345,7 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net)
struct net_device *dev = skb->dev;
if (net == NULL || net_eq(dev_net(dev), net)) {
- struct in_device *in_dev;
-
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(dev);
- if (in_dev)
- in_dev->arp_parms->qlen--;
- rcu_read_unlock();
+ neigh_parms_qlen_dec(dev, family);
__skb_unlink(skb, list);
__skb_queue_tail(&tmp, skb);
}
@@ -409,7 +427,8 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
write_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev, skip_perm);
pneigh_ifdown_and_unlock(tbl, dev);
- pneigh_queue_purge(&tbl->proxy_queue, dev_net(dev));
+ pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL,
+ tbl->family);
if (skb_queue_empty_lockless(&tbl->proxy_queue))
del_timer_sync(&tbl->proxy_timer);
return 0;
@@ -1621,13 +1640,8 @@ static void neigh_proxy_process(struct timer_list *t)
if (tdif <= 0) {
struct net_device *dev = skb->dev;
- struct in_device *in_dev;
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(dev);
- if (in_dev)
- in_dev->arp_parms->qlen--;
- rcu_read_unlock();
+ neigh_parms_qlen_dec(dev, tbl->family);
__skb_unlink(skb, &tbl->proxy_queue);
if (tbl->proxy_redo && netif_running(dev)) {
@@ -1821,7 +1835,7 @@ int neigh_table_clear(int index, struct neigh_table *tbl)
cancel_delayed_work_sync(&tbl->managed_work);
cancel_delayed_work_sync(&tbl->gc_work);
del_timer_sync(&tbl->proxy_timer);
- pneigh_queue_purge(&tbl->proxy_queue, NULL);
+ pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family);
neigh_ifdown(tbl, NULL);
if (atomic_read(&tbl->entries))
pr_crit("neighbour leakage\n");
@@ -3539,18 +3553,6 @@ static int proc_unres_qlen(struct ctl_table *ctl, int write,
return ret;
}
-static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev,
- int family)
-{
- switch (family) {
- case AF_INET:
- return __in_dev_arp_parms_get_rcu(dev);
- case AF_INET6:
- return __in6_dev_nd_parms_get_rcu(dev);
- }
- return NULL;
-}
-
static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p,
int index)
{
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 8409d41405df..679b84cc8794 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -532,7 +532,7 @@ static ssize_t phys_port_name_show(struct device *dev,
* returning early without hitting the trylock/restart below.
*/
if (!netdev->netdev_ops->ndo_get_phys_port_name &&
- !netdev->netdev_ops->ndo_get_devlink_port)
+ !netdev->devlink_port)
return -EOPNOTSUPP;
if (!rtnl_trylock())
@@ -562,7 +562,7 @@ static ssize_t phys_switch_id_show(struct device *dev,
* because recurse is false when calling dev_get_port_parent_id.
*/
if (!netdev->netdev_ops->ndo_get_port_parent_id &&
- !netdev->netdev_ops->ndo_get_devlink_port)
+ !netdev->devlink_port)
return -EOPNOTSUPP;
if (!rtnl_trylock())
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 0ec2f5906a27..5581d22cc191 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -117,6 +117,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
static int ops_init(const struct pernet_operations *ops, struct net *net)
{
+ struct net_generic *ng;
int err = -ENOMEM;
void *data = NULL;
@@ -135,7 +136,13 @@ static int ops_init(const struct pernet_operations *ops, struct net *net)
if (!err)
return 0;
+ if (ops->id && ops->size) {
cleanup:
+ ng = rcu_dereference_protected(net->gen,
+ lockdep_is_held(&pernet_ops_rwsem));
+ ng->ptr[*ops->id] = NULL;
+ }
+
kfree(data);
out:
@@ -309,6 +316,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
refcount_set(&net->ns.count, 1);
ref_tracker_dir_init(&net->refcnt_tracker, 128);
+ ref_tracker_dir_init(&net->notrefcnt_tracker, 128);
refcount_set(&net->passive, 1);
get_random_bytes(&net->hash_mix, sizeof(u32));
@@ -429,6 +437,10 @@ static void net_free(struct net *net)
{
if (refcount_dec_and_test(&net->passive)) {
kfree(rcu_access_pointer(net->gen));
+
+ /* There should not be any trackers left there. */
+ ref_tracker_dir_exit(&net->notrefcnt_tracker);
+
kmem_cache_free(net_cachep, net);
}
}
diff --git a/net/core/of_net.c b/net/core/of_net.c
index f1a9bf7578e7..55d3fe229269 100644
--- a/net/core/of_net.c
+++ b/net/core/of_net.c
@@ -57,7 +57,7 @@ static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr)
return -ENODEV;
}
-static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr)
+int of_get_mac_address_nvmem(struct device_node *np, u8 *addr)
{
struct platform_device *pdev = of_find_device_by_node(np);
struct nvmem_cell *cell;
@@ -94,6 +94,7 @@ static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr)
return 0;
}
+EXPORT_SYMBOL(of_get_mac_address_nvmem);
/**
* of_get_mac_address()
@@ -140,7 +141,7 @@ int of_get_mac_address(struct device_node *np, u8 *addr)
if (!ret)
return 0;
- return of_get_mac_addr_nvmem(np, addr);
+ return of_get_mac_address_nvmem(np, addr);
}
EXPORT_SYMBOL(of_get_mac_address);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 88906ba6d9a7..c3763056c554 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2324,7 +2324,7 @@ static inline int f_pick(struct pktgen_dev *pkt_dev)
pkt_dev->curfl = 0; /*reset */
}
} else {
- flow = prandom_u32() % pkt_dev->cflows;
+ flow = prandom_u32_max(pkt_dev->cflows);
pkt_dev->curfl = flow;
if (pkt_dev->flows[flow].count > pkt_dev->lflow) {
@@ -2380,10 +2380,9 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev)
else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) {
__u16 t;
if (pkt_dev->flags & F_QUEUE_MAP_RND) {
- t = prandom_u32() %
- (pkt_dev->queue_map_max -
- pkt_dev->queue_map_min + 1)
- + pkt_dev->queue_map_min;
+ t = prandom_u32_max(pkt_dev->queue_map_max -
+ pkt_dev->queue_map_min + 1) +
+ pkt_dev->queue_map_min;
} else {
t = pkt_dev->cur_queue_map + 1;
if (t > pkt_dev->queue_map_max)
@@ -2412,7 +2411,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
__u32 tmp;
if (pkt_dev->flags & F_MACSRC_RND)
- mc = prandom_u32() % pkt_dev->src_mac_count;
+ mc = prandom_u32_max(pkt_dev->src_mac_count);
else {
mc = pkt_dev->cur_src_mac_offset++;
if (pkt_dev->cur_src_mac_offset >=
@@ -2438,7 +2437,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
__u32 tmp;
if (pkt_dev->flags & F_MACDST_RND)
- mc = prandom_u32() % pkt_dev->dst_mac_count;
+ mc = prandom_u32_max(pkt_dev->dst_mac_count);
else {
mc = pkt_dev->cur_dst_mac_offset++;
@@ -2465,23 +2464,23 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
for (i = 0; i < pkt_dev->nr_labels; i++)
if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM)
pkt_dev->labels[i] = MPLS_STACK_BOTTOM |
- ((__force __be32)prandom_u32() &
+ ((__force __be32)get_random_u32() &
htonl(0x000fffff));
}
if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) {
- pkt_dev->vlan_id = prandom_u32() & (4096 - 1);
+ pkt_dev->vlan_id = prandom_u32_max(4096);
}
if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) {
- pkt_dev->svlan_id = prandom_u32() & (4096 - 1);
+ pkt_dev->svlan_id = prandom_u32_max(4096);
}
if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) {
if (pkt_dev->flags & F_UDPSRC_RND)
- pkt_dev->cur_udp_src = prandom_u32() %
- (pkt_dev->udp_src_max - pkt_dev->udp_src_min)
- + pkt_dev->udp_src_min;
+ pkt_dev->cur_udp_src = prandom_u32_max(
+ pkt_dev->udp_src_max - pkt_dev->udp_src_min) +
+ pkt_dev->udp_src_min;
else {
pkt_dev->cur_udp_src++;
@@ -2492,9 +2491,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) {
if (pkt_dev->flags & F_UDPDST_RND) {
- pkt_dev->cur_udp_dst = prandom_u32() %
- (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)
- + pkt_dev->udp_dst_min;
+ pkt_dev->cur_udp_dst = prandom_u32_max(
+ pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) +
+ pkt_dev->udp_dst_min;
} else {
pkt_dev->cur_udp_dst++;
if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max)
@@ -2509,7 +2508,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (imn < imx) {
__u32 t;
if (pkt_dev->flags & F_IPSRC_RND)
- t = prandom_u32() % (imx - imn) + imn;
+ t = prandom_u32_max(imx - imn) + imn;
else {
t = ntohl(pkt_dev->cur_saddr);
t++;
@@ -2531,8 +2530,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->flags & F_IPDST_RND) {
do {
- t = prandom_u32() %
- (imx - imn) + imn;
+ t = prandom_u32_max(imx - imn) +
+ imn;
s = htonl(t);
} while (ipv4_is_loopback(s) ||
ipv4_is_multicast(s) ||
@@ -2569,7 +2568,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
for (i = 0; i < 4; i++) {
pkt_dev->cur_in6_daddr.s6_addr32[i] =
- (((__force __be32)prandom_u32() |
+ (((__force __be32)get_random_u32() |
pkt_dev->min_in6_daddr.s6_addr32[i]) &
pkt_dev->max_in6_daddr.s6_addr32[i]);
}
@@ -2579,9 +2578,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {
__u32 t;
if (pkt_dev->flags & F_TXSIZE_RND) {
- t = prandom_u32() %
- (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size)
- + pkt_dev->min_pkt_size;
+ t = prandom_u32_max(pkt_dev->max_pkt_size -
+ pkt_dev->min_pkt_size) +
+ pkt_dev->min_pkt_size;
} else {
t = pkt_dev->cur_pkt_size + 1;
if (t > pkt_dev->max_pkt_size)
@@ -2590,7 +2589,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
pkt_dev->cur_pkt_size = t;
} else if (pkt_dev->n_imix_entries > 0) {
struct imix_pkt *entry;
- __u32 t = prandom_u32() % IMIX_PRECISION;
+ __u32 t = prandom_u32_max(IMIX_PRECISION);
__u8 entry_index = pkt_dev->imix_distribution[t];
entry = &pkt_dev->imix_entries[entry_index];
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 74864dc46a7e..64289bc98887 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -53,6 +53,7 @@
#include <net/fib_rules.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
+#include <net/devlink.h>
#include "dev.h"
@@ -760,7 +761,7 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
EXPORT_SYMBOL(rtnl_unicast);
void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
- struct nlmsghdr *nlh, gfp_t flags)
+ const struct nlmsghdr *nlh, gfp_t flags)
{
struct sock *rtnl = net->rtnl;
@@ -1038,6 +1039,16 @@ static size_t rtnl_proto_down_size(const struct net_device *dev)
return size;
}
+static size_t rtnl_devlink_port_size(const struct net_device *dev)
+{
+ size_t size = nla_total_size(0); /* nest IFLA_DEVLINK_PORT */
+
+ if (dev->devlink_port)
+ size += devlink_nl_port_handle_size(dev->devlink_port);
+
+ return size;
+}
+
static noinline size_t if_nlmsg_size(const struct net_device *dev,
u32 ext_filter_mask)
{
@@ -1091,6 +1102,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_MAX_MTU */
+ rtnl_prop_list_size(dev)
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */
+ + rtnl_devlink_port_size(dev)
+ 0;
}
@@ -1728,6 +1740,30 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int rtnl_fill_devlink_port(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct nlattr *devlink_port_nest;
+ int ret;
+
+ devlink_port_nest = nla_nest_start(skb, IFLA_DEVLINK_PORT);
+ if (!devlink_port_nest)
+ return -EMSGSIZE;
+
+ if (dev->devlink_port) {
+ ret = devlink_nl_port_handle_fill(skb, dev->devlink_port);
+ if (ret < 0)
+ goto nest_cancel;
+ }
+
+ nla_nest_end(skb, devlink_port_nest);
+ return 0;
+
+nest_cancel:
+ nla_nest_cancel(skb, devlink_port_nest);
+ return ret;
+}
+
static int rtnl_fill_ifinfo(struct sk_buff *skb,
struct net_device *dev, struct net *src_net,
int type, u32 pid, u32 seq, u32 change,
@@ -1865,6 +1901,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
dev->dev.parent->bus->name))
goto nla_put_failure;
+ if (rtnl_fill_devlink_port(skb, dev))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -3110,7 +3149,7 @@ static int rtnl_group_dellink(const struct net *net, int group)
return 0;
}
-int rtnl_delete_link(struct net_device *dev)
+int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh)
{
const struct rtnl_link_ops *ops;
LIST_HEAD(list_kill);
@@ -3120,7 +3159,7 @@ int rtnl_delete_link(struct net_device *dev)
return -EOPNOTSUPP;
ops->dellink(dev, &list_kill);
- unregister_netdevice_many(&list_kill);
+ unregister_netdevice_many_notify(&list_kill, portid, nlh);
return 0;
}
@@ -3130,6 +3169,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
+ u32 portid = NETLINK_CB(skb).portid;
struct net *tgt_net = net;
struct net_device *dev = NULL;
struct ifinfomsg *ifm;
@@ -3171,7 +3211,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
goto out;
}
- err = rtnl_delete_link(dev);
+ err = rtnl_delete_link(dev, portid, nlh);
out:
if (netnsid >= 0)
@@ -3180,7 +3220,8 @@ out:
return err;
}
-int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
+int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm,
+ u32 portid, const struct nlmsghdr *nlh)
{
unsigned int old_flags;
int err;
@@ -3194,10 +3235,10 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
}
if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) {
- __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags));
+ __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags), portid, nlh);
} else {
dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
- __dev_notify_flags(dev, old_flags, ~0U);
+ __dev_notify_flags(dev, old_flags, ~0U, portid, nlh);
}
return 0;
}
@@ -3311,11 +3352,13 @@ static int rtnl_group_changelink(const struct sk_buff *skb,
static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
const struct rtnl_link_ops *ops,
+ const struct nlmsghdr *nlh,
struct nlattr **tb, struct nlattr **data,
struct netlink_ext_ack *extack)
{
unsigned char name_assign_type = NET_NAME_USER;
struct net *net = sock_net(skb->sk);
+ u32 portid = NETLINK_CB(skb).portid;
struct net *dest_net, *link_net;
struct net_device *dev;
char ifname[IFNAMSIZ];
@@ -3369,7 +3412,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
goto out;
}
- err = rtnl_configure_link(dev, ifm);
+ err = rtnl_configure_link(dev, ifm, portid, nlh);
if (err < 0)
goto out_unregister;
if (link_net) {
@@ -3578,7 +3621,7 @@ replay:
return -EOPNOTSUPP;
}
- return rtnl_newlink_create(skb, ifm, ops, tb, data, extack);
+ return rtnl_newlink_create(skb, ifm, ops, nlh, tb, data, extack);
}
static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -3896,7 +3939,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
unsigned int change,
u32 event, gfp_t flags, int *new_nsid,
- int new_ifindex)
+ int new_ifindex, u32 portid, u32 seq)
{
struct net *net = dev_net(dev);
struct sk_buff *skb;
@@ -3907,7 +3950,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
goto errout;
err = rtnl_fill_ifinfo(skb, dev, dev_net(dev),
- type, 0, 0, change, 0, 0, event,
+ type, portid, seq, change, 0, 0, event,
new_nsid, new_ifindex, -1, flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in if_nlmsg_size() */
@@ -3922,16 +3965,18 @@ errout:
return NULL;
}
-void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)
+void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags,
+ u32 portid, const struct nlmsghdr *nlh)
{
struct net *net = dev_net(dev);
- rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);
+ rtnl_notify(skb, net, portid, RTNLGRP_LINK, nlh, flags);
}
static void rtmsg_ifinfo_event(int type, struct net_device *dev,
unsigned int change, u32 event,
- gfp_t flags, int *new_nsid, int new_ifindex)
+ gfp_t flags, int *new_nsid, int new_ifindex,
+ u32 portid, const struct nlmsghdr *nlh)
{
struct sk_buff *skb;
@@ -3939,23 +3984,23 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev,
return;
skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid,
- new_ifindex);
+ new_ifindex, portid, nlmsg_seq(nlh));
if (skb)
- rtmsg_ifinfo_send(skb, dev, flags);
+ rtmsg_ifinfo_send(skb, dev, flags, portid, nlh);
}
void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
- gfp_t flags)
+ gfp_t flags, u32 portid, const struct nlmsghdr *nlh)
{
rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
- NULL, 0);
+ NULL, 0, portid, nlh);
}
void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
gfp_t flags, int *new_nsid, int new_ifindex)
{
rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
- new_nsid, new_ifindex);
+ new_nsid, new_ifindex, 0, NULL);
}
static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
@@ -4045,6 +4090,11 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm,
return err;
}
+ if (tb[NDA_FLAGS_EXT]) {
+ netdev_info(dev, "invalid flags given to default FDB implementation\n");
+ return err;
+ }
+
if (vid) {
netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n");
return err;
@@ -6140,7 +6190,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
case NETDEV_CHANGELOWERSTATE:
case NETDEV_CHANGE_TX_QUEUE_LEN:
rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event),
- GFP_KERNEL, NULL, 0);
+ GFP_KERNEL, NULL, 0, 0, NULL);
break;
default:
break;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1d9719e72f9d..4bf95e36ed16 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -94,6 +94,7 @@ EXPORT_SYMBOL(sysctl_max_skb_frags);
#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
const char * const drop_reasons[] = {
+ [SKB_CONSUMED] = "CONSUMED",
DEFINE_DROP_REASON(FN, FN)
};
EXPORT_SYMBOL(drop_reasons);
@@ -506,14 +507,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
*/
size = SKB_DATA_ALIGN(size);
size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
+ osize = kmalloc_size_roundup(size);
+ data = kmalloc_reserve(osize, gfp_mask, node, &pfmemalloc);
if (unlikely(!data))
goto nodata;
- /* kmalloc(size) might give us more room than requested.
+ /* kmalloc_size_roundup() might give us more room than requested.
* Put skb_shared_info exactly at the end of allocated zone,
* to allow max possible filling before reallocation.
*/
- osize = ksize(data);
size = SKB_WITH_OVERHEAD(osize);
prefetchw(data + size);
@@ -748,6 +749,13 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
+static bool skb_pp_recycle(struct sk_buff *skb, void *data)
+{
+ if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
+ return false;
+ return page_pool_return_skb_page(virt_to_page(data));
+}
+
static void skb_free_head(struct sk_buff *skb)
{
unsigned char *head = skb->head;
@@ -761,7 +769,7 @@ static void skb_free_head(struct sk_buff *skb)
}
}
-static void skb_release_data(struct sk_buff *skb)
+static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
int i;
@@ -784,7 +792,7 @@ static void skb_release_data(struct sk_buff *skb)
free_head:
if (shinfo->frag_list)
- kfree_skb_list(shinfo->frag_list);
+ kfree_skb_list_reason(shinfo->frag_list, reason);
skb_free_head(skb);
exit:
@@ -847,11 +855,11 @@ void skb_release_head_state(struct sk_buff *skb)
}
/* Free everything but the sk_buff shell. */
-static void skb_release_all(struct sk_buff *skb)
+static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
skb_release_head_state(skb);
if (likely(skb->head))
- skb_release_data(skb);
+ skb_release_data(skb, reason);
}
/**
@@ -865,7 +873,7 @@ static void skb_release_all(struct sk_buff *skb)
void __kfree_skb(struct sk_buff *skb)
{
- skb_release_all(skb);
+ skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
@@ -887,7 +895,10 @@ kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX);
- trace_kfree_skb(skb, __builtin_return_address(0), reason);
+ if (reason == SKB_CONSUMED)
+ trace_consume_skb(skb);
+ else
+ trace_kfree_skb(skb, __builtin_return_address(0), reason);
__kfree_skb(skb);
}
EXPORT_SYMBOL(kfree_skb_reason);
@@ -1045,7 +1056,7 @@ EXPORT_SYMBOL(consume_skb);
void __consume_stateless_skb(struct sk_buff *skb)
{
trace_consume_skb(skb);
- skb_release_data(skb);
+ skb_release_data(skb, SKB_CONSUMED);
kfree_skbmem(skb);
}
@@ -1070,7 +1081,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
void __kfree_skb_defer(struct sk_buff *skb)
{
- skb_release_all(skb);
+ skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
napi_skb_cache_put(skb);
}
@@ -1108,7 +1119,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
- skb_release_all(skb);
+ skb_release_all(skb, SKB_CONSUMED);
napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
@@ -1239,7 +1250,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
*/
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
- skb_release_all(dst);
+ skb_release_all(dst, SKB_CONSUMED);
return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);
@@ -1256,13 +1267,12 @@ int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
user = mmp->user ? : current_user();
+ old_pg = atomic_long_read(&user->locked_vm);
do {
- old_pg = atomic_long_read(&user->locked_vm);
new_pg = old_pg + num_pg;
if (new_pg > max_pg)
return -ENOBUFS;
- } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) !=
- old_pg);
+ } while (!atomic_long_try_cmpxchg(&user->locked_vm, &old_pg, new_pg));
if (!mmp->user) {
mmp->user = get_uid(user);
@@ -1814,10 +1824,11 @@ EXPORT_SYMBOL(__pskb_copy_fclone);
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
gfp_t gfp_mask)
{
- int i, osize = skb_end_offset(skb);
- int size = osize + nhead + ntail;
+ unsigned int osize = skb_end_offset(skb);
+ unsigned int size = osize + nhead + ntail;
long off;
u8 *data;
+ int i;
BUG_ON(nhead < 0);
@@ -1825,15 +1836,16 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
skb_zcopy_downgrade_managed(skb);
- size = SKB_DATA_ALIGN(size);
-
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
- gfp_mask, NUMA_NO_NODE, NULL);
+
+ size = SKB_DATA_ALIGN(size);
+ size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ size = kmalloc_size_roundup(size);
+ data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
goto nodata;
- size = SKB_WITH_OVERHEAD(ksize(data));
+ size = SKB_WITH_OVERHEAD(size);
/* Copy only real data... and, alas, header. This should be
* optimized for the cases when header is void.
@@ -1860,7 +1872,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- skb_release_data(skb);
+ skb_release_data(skb, SKB_CONSUMED);
} else {
skb_free_head(skb);
}
@@ -3971,7 +3983,7 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
} else if (i < MAX_SKB_FRAGS) {
skb_zcopy_downgrade_managed(skb);
get_page(page);
- skb_fill_page_desc(skb, i, page, offset, size);
+ skb_fill_page_desc_noacc(skb, i, page, offset, size);
} else {
return -EMSGSIZE;
}
@@ -4134,23 +4146,25 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
int i = 0;
int pos;
- if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) &&
- (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) {
- /* gso_size is untrusted, and we have a frag_list with a linear
- * non head_frag head.
- *
- * (we assume checking the first list_skb member suffices;
- * i.e if either of the list_skb members have non head_frag
- * head, then the first one has too).
- *
- * If head_skb's headlen does not fit requested gso_size, it
- * means that the frag_list members do NOT terminate on exact
- * gso_size boundaries. Hence we cannot perform skb_frag_t page
- * sharing. Therefore we must fallback to copying the frag_list
- * skbs; we do so by disabling SG.
- */
- if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb))
- features &= ~NETIF_F_SG;
+ if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) &&
+ mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) {
+ struct sk_buff *check_skb;
+
+ for (check_skb = list_skb; check_skb; check_skb = check_skb->next) {
+ if (skb_headlen(check_skb) && !check_skb->head_frag) {
+ /* gso_size is untrusted, and we have a frag_list with
+ * a linear non head_frag item.
+ *
+ * If head_skb's headlen does not fit requested gso_size,
+ * it means that the frag_list members do NOT terminate
+ * on exact gso_size boundaries. Hence we cannot perform
+ * skb_frag_t page sharing. Therefore we must fallback to
+ * copying the frag_list skbs; we do so by disabling SG.
+ */
+ features &= ~NETIF_F_SG;
+ break;
+ }
+ }
}
__skb_push(head_skb, doffset);
@@ -6167,21 +6181,20 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
const int headlen, gfp_t gfp_mask)
{
int i;
- int size = skb_end_offset(skb);
+ unsigned int size = skb_end_offset(skb);
int new_hlen = headlen - off;
u8 *data;
- size = SKB_DATA_ALIGN(size);
-
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- data = kmalloc_reserve(size +
- SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
- gfp_mask, NUMA_NO_NODE, NULL);
+
+ size = SKB_DATA_ALIGN(size);
+ size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ size = kmalloc_size_roundup(size);
+ data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
return -ENOMEM;
-
- size = SKB_WITH_OVERHEAD(ksize(data));
+ size = SKB_WITH_OVERHEAD(size);
/* Copy real data, and all frags */
skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
@@ -6201,7 +6214,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
skb_frag_ref(skb, i);
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- skb_release_data(skb);
+ skb_release_data(skb, SKB_CONSUMED);
} else {
/* we can reuse existing refcount- all we did was
* relocate values
@@ -6286,22 +6299,21 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
int pos, gfp_t gfp_mask)
{
int i, k = 0;
- int size = skb_end_offset(skb);
+ unsigned int size = skb_end_offset(skb);
u8 *data;
const int nfrags = skb_shinfo(skb)->nr_frags;
struct skb_shared_info *shinfo;
- size = SKB_DATA_ALIGN(size);
-
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- data = kmalloc_reserve(size +
- SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
- gfp_mask, NUMA_NO_NODE, NULL);
+
+ size = SKB_DATA_ALIGN(size);
+ size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ size = kmalloc_size_roundup(size);
+ data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
return -ENOMEM;
-
- size = SKB_WITH_OVERHEAD(ksize(data));
+ size = SKB_WITH_OVERHEAD(size);
memcpy((struct skb_shared_info *)(data + size),
skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
@@ -6345,7 +6357,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
kfree(data);
return -ENOMEM;
}
- skb_release_data(skb);
+ skb_release_data(skb, SKB_CONSUMED);
skb->head = data;
skb->head_frag = 0;
@@ -6424,6 +6436,7 @@ void skb_condense(struct sk_buff *skb)
*/
skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
}
+EXPORT_SYMBOL(skb_condense);
#ifdef CONFIG_SKB_EXTENSIONS
static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index ca70525621c7..e6b9ced3eda8 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -500,11 +500,11 @@ bool sk_msg_is_readable(struct sock *sk)
}
EXPORT_SYMBOL_GPL(sk_msg_is_readable);
-static struct sk_msg *alloc_sk_msg(void)
+static struct sk_msg *alloc_sk_msg(gfp_t gfp)
{
struct sk_msg *msg;
- msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL);
+ msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN);
if (unlikely(!msg))
return NULL;
sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS);
@@ -520,7 +520,7 @@ static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
if (!sk_rmem_schedule(sk, skb, skb->truesize))
return NULL;
- return alloc_sk_msg();
+ return alloc_sk_msg(GFP_KERNEL);
}
static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
@@ -597,7 +597,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len)
{
- struct sk_msg *msg = alloc_sk_msg();
+ struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC);
struct sock *sk = psock->sk;
int err;
@@ -803,16 +803,13 @@ static void sk_psock_link_destroy(struct sk_psock *psock)
}
}
-void sk_psock_stop(struct sk_psock *psock, bool wait)
+void sk_psock_stop(struct sk_psock *psock)
{
spin_lock_bh(&psock->ingress_lock);
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
sk_psock_cork_free(psock);
__sk_psock_zap_ingress(psock);
spin_unlock_bh(&psock->ingress_lock);
-
- if (wait)
- cancel_work_sync(&psock->work);
}
static void sk_psock_done_strp(struct sk_psock *psock);
@@ -850,7 +847,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
- sk_psock_stop(psock, false);
+ sk_psock_stop(psock);
INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
queue_rcu_work(system_wq, &psock->rwork);
diff --git a/net/core/sock.c b/net/core/sock.c
index a3ba0358c77c..4571914a4aa8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1436,7 +1436,7 @@ set_sndbuf:
break;
}
case SO_INCOMING_CPU:
- WRITE_ONCE(sk->sk_incoming_cpu, val);
+ reuseport_update_incoming_cpu(sk, val);
break;
case SO_CNX_ADVICE:
@@ -2094,6 +2094,9 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
if (likely(sk->sk_net_refcnt)) {
get_net_track(net, &sk->ns_tracker, priority);
sock_inuse_add(net, 1);
+ } else {
+ __netns_tracker_alloc(net, &sk->ns_tracker,
+ false, priority);
}
sock_net_set(sk, net);
@@ -2149,6 +2152,9 @@ static void __sk_destruct(struct rcu_head *head)
if (likely(sk->sk_net_refcnt))
put_net_track(sock_net(sk), &sk->ns_tracker);
+ else
+ __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
+
sk_prot_free(sk->sk_prot_creator, sk);
}
@@ -2237,6 +2243,14 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
if (likely(newsk->sk_net_refcnt)) {
get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
sock_inuse_add(sock_net(newsk), 1);
+ } else {
+ /* Kernel sockets are not elevating the struct net refcount.
+ * Instead, use a tracker to more easily detect if a layer
+ * is not properly dismantling its kernel sockets at netns
+ * destroy time.
+ */
+ __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
+ false, priority);
}
sk_node_init(&newsk->sk_node);
sock_lock_init(newsk);
@@ -2730,7 +2744,7 @@ failure:
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
-int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
+int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
struct sockcm_cookie *sockc)
{
u32 tsflags;
@@ -2784,7 +2798,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
return -EINVAL;
if (cmsg->cmsg_level != SOL_SOCKET)
continue;
- ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
+ ret = __sock_cmsg_send(sk, cmsg, sockc);
if (ret)
return ret;
}
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index f7cf74cdd3db..b1e29e18d1d6 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -25,14 +25,17 @@ DEFINE_COOKIE(sock_cookie);
u64 __sock_gen_cookie(struct sock *sk)
{
- while (1) {
- u64 res = atomic64_read(&sk->sk_cookie);
+ u64 res = atomic64_read(&sk->sk_cookie);
- if (res)
- return res;
- res = gen_cookie_next(&sock_cookie);
- atomic64_cmpxchg(&sk->sk_cookie, 0, res);
+ if (!res) {
+ u64 new = gen_cookie_next(&sock_cookie);
+
+ atomic64_cmpxchg(&sk->sk_cookie, res, new);
+
+ /* Another thread might have changed sk_cookie before us. */
+ res = atomic64_read(&sk->sk_cookie);
}
+ return res;
}
int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie)
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index a660baedd9e7..81beb16ab1eb 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1596,7 +1596,7 @@ void sock_map_destroy(struct sock *sk)
saved_destroy = psock->saved_destroy;
sock_map_remove_links(sk, psock);
rcu_read_unlock();
- sk_psock_stop(psock, false);
+ sk_psock_stop(psock);
sk_psock_put(sk, psock);
saved_destroy(sk);
}
@@ -1619,9 +1619,10 @@ void sock_map_close(struct sock *sk, long timeout)
saved_close = psock->saved_close;
sock_map_remove_links(sk, psock);
rcu_read_unlock();
- sk_psock_stop(psock, true);
- sk_psock_put(sk, psock);
+ sk_psock_stop(psock);
release_sock(sk);
+ cancel_work_sync(&psock->work);
+ sk_psock_put(sk, psock);
saved_close(sk, timeout);
}
EXPORT_SYMBOL_GPL(sock_map_close);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 5daa1fa54249..5a165286e4d8 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -21,6 +21,86 @@ static DEFINE_IDA(reuseport_ida);
static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
struct sock_reuseport *reuse, bool bind_inany);
+void reuseport_has_conns_set(struct sock *sk)
+{
+ struct sock_reuseport *reuse;
+
+ if (!rcu_access_pointer(sk->sk_reuseport_cb))
+ return;
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ if (likely(reuse))
+ reuse->has_conns = 1;
+ spin_unlock_bh(&reuseport_lock);
+}
+EXPORT_SYMBOL(reuseport_has_conns_set);
+
+static void __reuseport_get_incoming_cpu(struct sock_reuseport *reuse)
+{
+ /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */
+ WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu + 1);
+}
+
+static void __reuseport_put_incoming_cpu(struct sock_reuseport *reuse)
+{
+ /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */
+ WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu - 1);
+}
+
+static void reuseport_get_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse)
+{
+ if (sk->sk_incoming_cpu >= 0)
+ __reuseport_get_incoming_cpu(reuse);
+}
+
+static void reuseport_put_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse)
+{
+ if (sk->sk_incoming_cpu >= 0)
+ __reuseport_put_incoming_cpu(reuse);
+}
+
+void reuseport_update_incoming_cpu(struct sock *sk, int val)
+{
+ struct sock_reuseport *reuse;
+ int old_sk_incoming_cpu;
+
+ if (unlikely(!rcu_access_pointer(sk->sk_reuseport_cb))) {
+ /* Paired with READ_ONCE() in sk_incoming_cpu_update()
+ * and compute_score().
+ */
+ WRITE_ONCE(sk->sk_incoming_cpu, val);
+ return;
+ }
+
+ spin_lock_bh(&reuseport_lock);
+
+ /* This must be done under reuseport_lock to avoid a race with
+ * reuseport_grow(), which accesses sk->sk_incoming_cpu without
+ * lock_sock() when detaching a shutdown()ed sk.
+ *
+ * Paired with READ_ONCE() in reuseport_select_sock_by_hash().
+ */
+ old_sk_incoming_cpu = sk->sk_incoming_cpu;
+ WRITE_ONCE(sk->sk_incoming_cpu, val);
+
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+
+ /* reuseport_grow() has detached a closed sk. */
+ if (!reuse)
+ goto out;
+
+ if (old_sk_incoming_cpu < 0 && val >= 0)
+ __reuseport_get_incoming_cpu(reuse);
+ else if (old_sk_incoming_cpu >= 0 && val < 0)
+ __reuseport_put_incoming_cpu(reuse);
+
+out:
+ spin_unlock_bh(&reuseport_lock);
+}
+
static int reuseport_sock_index(struct sock *sk,
const struct sock_reuseport *reuse,
bool closed)
@@ -48,6 +128,7 @@ static void __reuseport_add_sock(struct sock *sk,
/* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
smp_wmb();
reuse->num_socks++;
+ reuseport_get_incoming_cpu(sk, reuse);
}
static bool __reuseport_detach_sock(struct sock *sk,
@@ -60,6 +141,7 @@ static bool __reuseport_detach_sock(struct sock *sk,
reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
reuse->num_socks--;
+ reuseport_put_incoming_cpu(sk, reuse);
return true;
}
@@ -70,6 +152,7 @@ static void __reuseport_add_closed_sock(struct sock *sk,
reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
/* paired with READ_ONCE() in inet_csk_bind_conflict() */
WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
+ reuseport_get_incoming_cpu(sk, reuse);
}
static bool __reuseport_detach_closed_sock(struct sock *sk,
@@ -83,6 +166,7 @@ static bool __reuseport_detach_closed_sock(struct sock *sk,
reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
/* paired with READ_ONCE() in inet_csk_bind_conflict() */
WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1);
+ reuseport_put_incoming_cpu(sk, reuse);
return true;
}
@@ -150,6 +234,7 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
reuse->bind_inany = bind_inany;
reuse->socks[0] = sk;
reuse->num_socks = 1;
+ reuseport_get_incoming_cpu(sk, reuse);
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
out:
@@ -193,6 +278,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
more_reuse->reuseport_id = reuse->reuseport_id;
more_reuse->bind_inany = reuse->bind_inany;
more_reuse->has_conns = reuse->has_conns;
+ more_reuse->incoming_cpu = reuse->incoming_cpu;
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
@@ -442,18 +528,32 @@ static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
u32 hash, u16 num_socks)
{
+ struct sock *first_valid_sk = NULL;
int i, j;
i = j = reciprocal_scale(hash, num_socks);
- while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
+ do {
+ struct sock *sk = reuse->socks[i];
+
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ /* Paired with WRITE_ONCE() in __reuseport_(get|put)_incoming_cpu(). */
+ if (!READ_ONCE(reuse->incoming_cpu))
+ return sk;
+
+ /* Paired with WRITE_ONCE() in reuseport_update_incoming_cpu(). */
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
+ return sk;
+
+ if (!first_valid_sk)
+ first_valid_sk = sk;
+ }
+
i++;
if (i >= num_socks)
i = 0;
- if (i == j)
- return NULL;
- }
+ } while (i != j);
- return reuse->socks[i];
+ return first_valid_sk;
}
/**
diff --git a/net/core/stream.c b/net/core/stream.c
index 1105057ce00a..75fded8495f5 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -123,7 +123,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (sk_stream_memory_free(sk))
- current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
+ current_timeo = vm_wait = prandom_u32_max(HZ / 5) + 2;
add_wait_queue(sk_sleep(sk), &wait);
diff --git a/net/core/utils.c b/net/core/utils.c
index 938495bc1d34..c994e95172ac 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -302,7 +302,7 @@ static int inet4_pton(const char *src, u16 port_num,
struct sockaddr_storage *addr)
{
struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
- int srclen = strlen(src);
+ size_t srclen = strlen(src);
if (srclen > INET_ADDRSTRLEN)
return -EINVAL;
@@ -322,7 +322,7 @@ static int inet6_pton(struct net *net, const char *src, u16 port_num,
{
struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
const char *scope_delim;
- int srclen = strlen(src);
+ size_t srclen = strlen(src);
if (srclen > INET6_ADDRSTRLEN)
return -EINVAL;
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index dc4fb699b56c..f9949e051f49 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -166,6 +166,7 @@ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = {
[DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)},
[DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)},
[DCB_ATTR_DCB_BUFFER] = {.len = sizeof(struct dcbnl_buffer)},
+ [DCB_ATTR_DCB_APP_TRUST_TABLE] = {.type = NLA_NESTED},
};
/* DCB number of traffic classes nested attributes. */
@@ -179,6 +180,38 @@ static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = {
static LIST_HEAD(dcb_app_list);
static DEFINE_SPINLOCK(dcb_lock);
+static enum ieee_attrs_app dcbnl_app_attr_type_get(u8 selector)
+{
+ switch (selector) {
+ case IEEE_8021QAZ_APP_SEL_ETHERTYPE:
+ case IEEE_8021QAZ_APP_SEL_STREAM:
+ case IEEE_8021QAZ_APP_SEL_DGRAM:
+ case IEEE_8021QAZ_APP_SEL_ANY:
+ case IEEE_8021QAZ_APP_SEL_DSCP:
+ return DCB_ATTR_IEEE_APP;
+ case DCB_APP_SEL_PCP:
+ return DCB_ATTR_DCB_APP;
+ default:
+ return DCB_ATTR_IEEE_APP_UNSPEC;
+ }
+}
+
+static bool dcbnl_app_attr_type_validate(enum ieee_attrs_app type)
+{
+ switch (type) {
+ case DCB_ATTR_IEEE_APP:
+ case DCB_ATTR_DCB_APP:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool dcbnl_app_selector_validate(enum ieee_attrs_app type, u8 selector)
+{
+ return dcbnl_app_attr_type_get(selector) == type;
+}
+
static struct sk_buff *dcbnl_newmsg(int type, u8 cmd, u32 port, u32 seq,
u32 flags, struct nlmsghdr **nlhp)
{
@@ -1027,12 +1060,51 @@ nla_put_failure:
return err;
}
+static int dcbnl_getapptrust(struct net_device *netdev, struct sk_buff *skb)
+{
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+ enum ieee_attrs_app type;
+ struct nlattr *apptrust;
+ int nselectors, err, i;
+ u8 *selectors;
+
+ selectors = kzalloc(IEEE_8021QAZ_APP_SEL_MAX + 1, GFP_KERNEL);
+ if (!selectors)
+ return -ENOMEM;
+
+ err = ops->dcbnl_getapptrust(netdev, selectors, &nselectors);
+ if (err) {
+ err = 0;
+ goto out;
+ }
+
+ apptrust = nla_nest_start(skb, DCB_ATTR_DCB_APP_TRUST_TABLE);
+ if (!apptrust) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ for (i = 0; i < nselectors; i++) {
+ type = dcbnl_app_attr_type_get(selectors[i]);
+ err = nla_put_u8(skb, type, selectors[i]);
+ if (err) {
+ nla_nest_cancel(skb, apptrust);
+ goto out;
+ }
+ }
+ nla_nest_end(skb, apptrust);
+
+out:
+ kfree(selectors);
+ return err;
+}
+
/* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */
static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
{
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
struct nlattr *ieee, *app;
struct dcb_app_type *itr;
- const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
int dcbx;
int err;
@@ -1116,8 +1188,9 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
spin_lock_bh(&dcb_lock);
list_for_each_entry(itr, &dcb_app_list, list) {
if (itr->ifindex == netdev->ifindex) {
- err = nla_put(skb, DCB_ATTR_IEEE_APP, sizeof(itr->app),
- &itr->app);
+ enum ieee_attrs_app type =
+ dcbnl_app_attr_type_get(itr->app.selector);
+ err = nla_put(skb, type, sizeof(itr->app), &itr->app);
if (err) {
spin_unlock_bh(&dcb_lock);
return -EMSGSIZE;
@@ -1133,6 +1206,12 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
spin_unlock_bh(&dcb_lock);
nla_nest_end(skb, app);
+ if (ops->dcbnl_getapptrust) {
+ err = dcbnl_getapptrust(netdev, skb);
+ if (err)
+ return err;
+ }
+
/* get peer info if available */
if (ops->ieee_peer_getets) {
struct ieee_ets ets;
@@ -1493,9 +1572,10 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh,
int rem;
nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) {
+ enum ieee_attrs_app type = nla_type(attr);
struct dcb_app *app_data;
- if (nla_type(attr) != DCB_ATTR_IEEE_APP)
+ if (!dcbnl_app_attr_type_validate(type))
continue;
if (nla_len(attr) < sizeof(struct dcb_app)) {
@@ -1504,6 +1584,13 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh,
}
app_data = nla_data(attr);
+
+ if (!dcbnl_app_selector_validate(type,
+ app_data->selector)) {
+ err = -EINVAL;
+ goto err;
+ }
+
if (ops->ieee_setapp)
err = ops->ieee_setapp(netdev, app_data);
else
@@ -1513,6 +1600,53 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh,
}
}
+ if (ieee[DCB_ATTR_DCB_APP_TRUST_TABLE]) {
+ u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0};
+ struct nlattr *attr;
+ int nselectors = 0;
+ int rem;
+
+ if (!ops->dcbnl_setapptrust) {
+ err = -EOPNOTSUPP;
+ goto err;
+ }
+
+ nla_for_each_nested(attr, ieee[DCB_ATTR_DCB_APP_TRUST_TABLE],
+ rem) {
+ enum ieee_attrs_app type = nla_type(attr);
+ u8 selector;
+ int i;
+
+ if (!dcbnl_app_attr_type_validate(type) ||
+ nla_len(attr) != 1 ||
+ nselectors >= sizeof(selectors)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ selector = nla_get_u8(attr);
+
+ if (!dcbnl_app_selector_validate(type, selector)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ /* Duplicate selector ? */
+ for (i = 0; i < nselectors; i++) {
+ if (selectors[i] == selector) {
+ err = -EINVAL;
+ goto err;
+ }
+ }
+
+ selectors[nselectors++] = selector;
+ }
+
+ err = ops->dcbnl_setapptrust(netdev, selectors, nselectors);
+ if (err)
+ goto err;
+ }
+
err:
err = nla_put_u8(skb, DCB_ATTR_IEEE, err);
dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_SET, seq, 0);
@@ -1554,11 +1688,20 @@ static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh,
int rem;
nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) {
+ enum ieee_attrs_app type = nla_type(attr);
struct dcb_app *app_data;
- if (nla_type(attr) != DCB_ATTR_IEEE_APP)
+ if (!dcbnl_app_attr_type_validate(type))
continue;
+
app_data = nla_data(attr);
+
+ if (!dcbnl_app_selector_validate(type,
+ app_data->selector)) {
+ err = -EINVAL;
+ goto err;
+ }
+
if (ops->ieee_delapp)
err = ops->ieee_delapp(netdev, app_data);
else
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 7dfc00c9fb32..9ddc3a9e89e4 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -278,6 +278,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
const struct dccp_hdr *dh, const unsigned int len);
+void dccp_destruct_common(struct sock *sk);
int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized);
void dccp_destroy_sock(struct sock *sk);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 6a6e121dc00c..b780827f5e0a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -45,11 +45,10 @@ static unsigned int dccp_v4_pernet_id __read_mostly;
int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
- struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
- __be32 daddr, nexthop, prev_sk_rcv_saddr;
struct inet_sock *inet = inet_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
__be16 orig_sport, orig_dport;
+ __be32 daddr, nexthop;
struct flowi4 *fl4;
struct rtable *rt;
int err;
@@ -91,26 +90,13 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
daddr = fl4->daddr;
if (inet->inet_saddr == 0) {
- if (inet_csk(sk)->icsk_bind2_hash) {
- prev_addr_hashbucket =
- inet_bhashfn_portaddr(&dccp_hashinfo, sk,
- sock_net(sk),
- inet->inet_num);
- prev_sk_rcv_saddr = sk->sk_rcv_saddr;
- }
- inet->inet_saddr = fl4->saddr;
- }
-
- sk_rcv_saddr_set(sk, inet->inet_saddr);
-
- if (prev_addr_hashbucket) {
- err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
+ err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
if (err) {
- inet->inet_saddr = 0;
- sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
ip_rt_put(rt);
return err;
}
+ } else {
+ sk_rcv_saddr_set(sk, inet->inet_saddr);
}
inet->inet_dport = usin->sin_port;
@@ -144,7 +130,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_daddr,
inet->inet_sport,
inet->inet_dport);
- inet->inet_id = prandom_u32();
+ inet->inet_id = get_random_u16();
err = dccp_connect(sk);
rt = NULL;
@@ -157,6 +143,7 @@ failure:
* This unhashes the socket and releases the local port, if necessary.
*/
dccp_set_state(sk, DCCP_CLOSED);
+ inet_bhash2_reset_saddr(sk);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->inet_dport = 0;
@@ -443,7 +430,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt));
newinet->mc_index = inet_iif(skb);
newinet->mc_ttl = ip_hdr(skb)->ttl;
- newinet->inet_id = prandom_u32();
+ newinet->inet_id = get_random_u16();
if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
goto put_and_exit;
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index e57b43006074..4260fe466993 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -934,26 +934,11 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
}
if (saddr == NULL) {
- struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
- struct in6_addr prev_v6_rcv_saddr;
-
- if (icsk->icsk_bind2_hash) {
- prev_addr_hashbucket = inet_bhashfn_portaddr(&dccp_hashinfo,
- sk, sock_net(sk),
- inet->inet_num);
- prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
- }
-
saddr = &fl6.saddr;
- sk->sk_v6_rcv_saddr = *saddr;
-
- if (prev_addr_hashbucket) {
- err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
- if (err) {
- sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr;
- goto failure;
- }
- }
+
+ err = inet_bhash2_update_saddr(sk, saddr, AF_INET6);
+ if (err)
+ goto failure;
}
/* set the source address */
@@ -985,6 +970,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
late_failure:
dccp_set_state(sk, DCCP_CLOSED);
+ inet_bhash2_reset_saddr(sk);
__sk_dst_reset(sk);
failure:
inet->inet_dport = 0;
@@ -1021,6 +1007,12 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
.sockaddr_len = sizeof(struct sockaddr_in6),
};
+static void dccp_v6_sk_destruct(struct sock *sk)
+{ /* sk->sk_destruct handler installed by dccp_v6_init_sock() */
+ dccp_destruct_common(sk);
+ inet6_sock_destruct(sk);
+}
+
/* NOTE: A lot of things set to zero explicitly by call to
* sk_alloc() so need not be done here.
*/
@@ -1033,17 +1025,12 @@ static int dccp_v6_init_sock(struct sock *sk)
if (unlikely(!dccp_v6_ctl_sock_initialized))
dccp_v6_ctl_sock_initialized = 1;
inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops;
+ sk->sk_destruct = dccp_v6_sk_destruct;
}
return err;
}
-static void dccp_v6_destroy_sock(struct sock *sk)
-{
- dccp_destroy_sock(sk);
- inet6_destroy_sock(sk);
-}
-
static struct timewait_sock_ops dccp6_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct dccp6_timewait_sock),
};
@@ -1066,7 +1053,7 @@ static struct proto dccp_v6_prot = {
.accept = inet_csk_accept,
.get_port = inet_csk_get_port,
.shutdown = dccp_shutdown,
- .destroy = dccp_v6_destroy_sock,
+ .destroy = dccp_destroy_sock,
.orphan_count = &dccp_orphan_count,
.max_header = MAX_DCCP_HEADER,
.obj_size = sizeof(struct dccp6_sock),
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index c548ca3e9b0e..a06b5641287a 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -171,12 +171,18 @@ const char *dccp_packet_name(const int type)
EXPORT_SYMBOL_GPL(dccp_packet_name);
-static void dccp_sk_destruct(struct sock *sk)
+void dccp_destruct_common(struct sock *sk)
{
struct dccp_sock *dp = dccp_sk(sk);
ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
dp->dccps_hc_tx_ccid = NULL;
+}
+EXPORT_SYMBOL_GPL(dccp_destruct_common);
+
+static void dccp_sk_destruct(struct sock *sk)
+{
+ dccp_destruct_common(sk);
inet_sock_destruct(sk);
}
@@ -279,8 +285,7 @@ int dccp_disconnect(struct sock *sk, int flags)
inet->inet_dport = 0;
- if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
- inet_reset_saddr(sk);
+ inet_bhash2_reset_saddr(sk);
sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE);
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 3eef72ce99a4..8e698bea99a3 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -18,6 +18,12 @@ if NET_DSA
# Drivers must select the appropriate tagging format(s)
+config NET_DSA_TAG_NONE
+ tristate "No-op tag driver"
+ help
+ Say Y or M if you want to enable support for switches which don't tag
+ frames over the CPU port.
+
config NET_DSA_TAG_AR9331
tristate "Tag driver for Atheros AR9331 SoC with built-in switch"
help
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index bf57ef3bce2a..cc7e93a562fe 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -2,13 +2,14 @@
# the core
obj-$(CONFIG_NET_DSA) += dsa_core.o
dsa_core-y += \
+ devlink.o \
dsa.o \
- dsa2.o \
master.o \
netlink.o \
port.o \
slave.o \
switch.o \
+ tag.o \
tag_8021q.o
# tagging formats
@@ -20,6 +21,7 @@ obj-$(CONFIG_NET_DSA_TAG_HELLCREEK) += tag_hellcreek.o
obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o
obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o
obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o
+obj-$(CONFIG_NET_DSA_TAG_NONE) += tag_none.o
obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o
obj-$(CONFIG_NET_DSA_TAG_OCELOT_8021Q) += tag_ocelot_8021q.o
obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o
diff --git a/net/dsa/devlink.c b/net/dsa/devlink.c
new file mode 100644
index 000000000000..431bf52290a1
--- /dev/null
+++ b/net/dsa/devlink.c
@@ -0,0 +1,391 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * DSA devlink handling
+ */
+
+#include <net/dsa.h>
+#include <net/devlink.h>
+
+#include "devlink.h"
+/*
+ * devlink .info_get handler: forward the request to the switch driver's
+ * devlink_info_get() op, or return -EOPNOTSUPP when the driver has none.
+ */
+static int dsa_devlink_info_get(struct devlink *dl,
+ struct devlink_info_req *req,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (ds->ops->devlink_info_get)
+ return ds->ops->devlink_info_get(ds, req, extack);
+
+ return -EOPNOTSUPP;
+}
+/*
+ * devlink .sb_pool_get handler: forward to the switch driver's
+ * devlink_sb_pool_get() op, or return -EOPNOTSUPP when it is not set.
+ */
+static int dsa_devlink_sb_pool_get(struct devlink *dl,
+ unsigned int sb_index, u16 pool_index,
+ struct devlink_sb_pool_info *pool_info)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_sb_pool_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_pool_get(ds, sb_index, pool_index,
+ pool_info);
+}
+/*
+ * devlink .sb_pool_set handler: forward to the switch driver's
+ * devlink_sb_pool_set() op, or return -EOPNOTSUPP when it is not set.
+ */
+static int dsa_devlink_sb_pool_set(struct devlink *dl, unsigned int sb_index,
+ u16 pool_index, u32 size,
+ enum devlink_sb_threshold_type threshold_type,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_sb_pool_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_pool_set(ds, sb_index, pool_index, size,
+ threshold_type, extack);
+}
+/*
+ * devlink .sb_port_pool_get handler. The switch and port index are
+ * derived from @dlp, then the call is forwarded to the driver's
+ * devlink_sb_port_pool_get() op (-EOPNOTSUPP when it is not set).
+ */
+static int dsa_devlink_sb_port_pool_get(struct devlink_port *dlp,
+ unsigned int sb_index, u16 pool_index,
+ u32 *p_threshold)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_port_pool_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_port_pool_get(ds, port, sb_index,
+ pool_index, p_threshold);
+}
+/*
+ * devlink .sb_port_pool_set handler. The switch and port index are
+ * derived from @dlp, then the call is forwarded to the driver's
+ * devlink_sb_port_pool_set() op (-EOPNOTSUPP when it is not set).
+ */
+static int dsa_devlink_sb_port_pool_set(struct devlink_port *dlp,
+ unsigned int sb_index, u16 pool_index,
+ u32 threshold,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_port_pool_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_port_pool_set(ds, port, sb_index,
+ pool_index, threshold, extack);
+}
+/*
+ * devlink .sb_tc_pool_bind_get handler. The switch and port index are
+ * derived from @dlp, then the call is forwarded to the driver's
+ * devlink_sb_tc_pool_bind_get() op (-EOPNOTSUPP when it is not set).
+ */
+static int
+dsa_devlink_sb_tc_pool_bind_get(struct devlink_port *dlp,
+ unsigned int sb_index, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ u16 *p_pool_index, u32 *p_threshold)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_tc_pool_bind_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_tc_pool_bind_get(ds, port, sb_index,
+ tc_index, pool_type,
+ p_pool_index, p_threshold);
+}
+/*
+ * devlink .sb_tc_pool_bind_set handler. The switch and port index are
+ * derived from @dlp, then the call is forwarded to the driver's
+ * devlink_sb_tc_pool_bind_set() op (-EOPNOTSUPP when it is not set).
+ */
+static int
+dsa_devlink_sb_tc_pool_bind_set(struct devlink_port *dlp,
+ unsigned int sb_index, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ u16 pool_index, u32 threshold,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_tc_pool_bind_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_tc_pool_bind_set(ds, port, sb_index,
+ tc_index, pool_type,
+ pool_index, threshold,
+ extack);
+}
+/*
+ * devlink .sb_occ_snapshot handler: forward to the switch driver's
+ * devlink_sb_occ_snapshot() op, or return -EOPNOTSUPP when it is not set.
+ */
+static int dsa_devlink_sb_occ_snapshot(struct devlink *dl,
+ unsigned int sb_index)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_sb_occ_snapshot)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_occ_snapshot(ds, sb_index);
+}
+/*
+ * devlink .sb_occ_max_clear handler: forward to the switch driver's
+ * devlink_sb_occ_max_clear() op, or return -EOPNOTSUPP when it is not set.
+ */
+static int dsa_devlink_sb_occ_max_clear(struct devlink *dl,
+ unsigned int sb_index)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_sb_occ_max_clear)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_occ_max_clear(ds, sb_index);
+}
+/*
+ * devlink .sb_occ_port_pool_get handler. The switch and port index are
+ * derived from @dlp, then the call is forwarded to the driver's
+ * devlink_sb_occ_port_pool_get() op (-EOPNOTSUPP when it is not set).
+ */
+static int dsa_devlink_sb_occ_port_pool_get(struct devlink_port *dlp,
+ unsigned int sb_index,
+ u16 pool_index, u32 *p_cur,
+ u32 *p_max)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_occ_port_pool_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_occ_port_pool_get(ds, port, sb_index,
+ pool_index, p_cur, p_max);
+}
+/*
+ * devlink .sb_occ_tc_port_bind_get handler. The switch and port index
+ * are derived from @dlp, then the call is forwarded to the driver's
+ * devlink_sb_occ_tc_port_bind_get() op (-EOPNOTSUPP when it is not set).
+ */
+static int
+dsa_devlink_sb_occ_tc_port_bind_get(struct devlink_port *dlp,
+ unsigned int sb_index, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ u32 *p_cur, u32 *p_max)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_occ_tc_port_bind_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_occ_tc_port_bind_get(ds, port,
+ sb_index, tc_index,
+ pool_type, p_cur,
+ p_max);
+}
+/*
+ * devlink operations shared by all DSA switches; handed to
+ * devlink_alloc() by dsa_switch_devlink_alloc(). Each handler forwards
+ * to the matching optional op of the switch driver.
+ */
+static const struct devlink_ops dsa_devlink_ops = {
+ .info_get = dsa_devlink_info_get,
+ .sb_pool_get = dsa_devlink_sb_pool_get,
+ .sb_pool_set = dsa_devlink_sb_pool_set,
+ .sb_port_pool_get = dsa_devlink_sb_port_pool_get,
+ .sb_port_pool_set = dsa_devlink_sb_port_pool_set,
+ .sb_tc_pool_bind_get = dsa_devlink_sb_tc_pool_bind_get,
+ .sb_tc_pool_bind_set = dsa_devlink_sb_tc_pool_bind_set,
+ .sb_occ_snapshot = dsa_devlink_sb_occ_snapshot,
+ .sb_occ_max_clear = dsa_devlink_sb_occ_max_clear,
+ .sb_occ_port_pool_get = dsa_devlink_sb_occ_port_pool_get,
+ .sb_occ_tc_port_bind_get = dsa_devlink_sb_occ_tc_port_bind_get,
+};
+/*
+ * Exported helper: forward a devlink parameter read for @id to the
+ * switch driver's devlink_param_get() op, or return -EOPNOTSUPP when
+ * the driver does not provide one.
+ */
+int dsa_devlink_param_get(struct devlink *dl, u32 id,
+ struct devlink_param_gset_ctx *ctx)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_param_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_param_get(ds, id, ctx);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_param_get);
+/*
+ * Exported helper: forward a devlink parameter write for @id to the
+ * switch driver's devlink_param_set() op, or return -EOPNOTSUPP when
+ * the driver does not provide one.
+ */
+int dsa_devlink_param_set(struct devlink *dl, u32 id,
+ struct devlink_param_gset_ctx *ctx)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_param_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_param_set(ds, id, ctx);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_param_set);
+/*
+ * Exported wrapper: register @params_count devlink parameters from
+ * @params on the switch's devlink instance (ds->devlink) and return the
+ * result of devlink_params_register().
+ */
+int dsa_devlink_params_register(struct dsa_switch *ds,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ return devlink_params_register(ds->devlink, params, params_count);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_params_register);
+/*
+ * Exported wrapper: unregister the given devlink parameters from the
+ * switch's devlink instance (ds->devlink).
+ */
+void dsa_devlink_params_unregister(struct dsa_switch *ds,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ devlink_params_unregister(ds->devlink, params, params_count);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_params_unregister);
+/*
+ * Exported wrapper: register a devlink resource on the switch's devlink
+ * instance (ds->devlink). All arguments are passed through unchanged and
+ * the result of devlink_resource_register() is returned.
+ */
+int dsa_devlink_resource_register(struct dsa_switch *ds,
+ const char *resource_name,
+ u64 resource_size,
+ u64 resource_id,
+ u64 parent_resource_id,
+ const struct devlink_resource_size_params *size_params)
+{
+ return devlink_resource_register(ds->devlink, resource_name,
+ resource_size, resource_id,
+ parent_resource_id,
+ size_params);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resource_register);
+/*
+ * Exported wrapper: call devlink_resources_unregister() on the switch's
+ * devlink instance (ds->devlink).
+ */
+void dsa_devlink_resources_unregister(struct dsa_switch *ds)
+{
+ devlink_resources_unregister(ds->devlink);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resources_unregister);
+
+/* Exported wrapper: register @occ_get (called with @occ_get_priv) as the
+ * occupancy getter of resource @resource_id on the switch's devlink
+ * instance (ds->devlink).
+ *
+ * This function returns void, so the wrapped call is a plain statement
+ * rather than the operand of a return.
+ */
+void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds,
+ u64 resource_id,
+ devlink_resource_occ_get_t *occ_get,
+ void *occ_get_priv)
+{
+ devlink_resource_occ_get_register(ds->devlink, resource_id,
+ occ_get, occ_get_priv);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_register);
+/*
+ * Exported wrapper: remove the occupancy getter of resource @resource_id
+ * from the switch's devlink instance (ds->devlink).
+ */
+void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds,
+ u64 resource_id)
+{
+ devlink_resource_occ_get_unregister(ds->devlink, resource_id);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister);
+/*
+ * Exported wrapper: create a devlink region on the switch's devlink
+ * instance (ds->devlink) and return whatever devlink_region_create()
+ * returns.
+ */
+struct devlink_region *
+dsa_devlink_region_create(struct dsa_switch *ds,
+ const struct devlink_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
+{
+ return devlink_region_create(ds->devlink, ops, region_max_snapshots,
+ region_size);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_region_create);
+/*
+ * Exported wrapper: create a devlink region on the devlink port of
+ * switch port @port (looked up with dsa_to_port()) and return whatever
+ * devlink_port_region_create() returns.
+ */
+struct devlink_region *
+dsa_devlink_port_region_create(struct dsa_switch *ds,
+ int port,
+ const struct devlink_port_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
+{
+ struct dsa_port *dp = dsa_to_port(ds, port);
+
+ return devlink_port_region_create(&dp->devlink_port, ops,
+ region_max_snapshots,
+ region_size);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_port_region_create);
+/*
+ * Exported wrapper around devlink_region_destroy() for @region.
+ */
+void dsa_devlink_region_destroy(struct devlink_region *region)
+{
+ devlink_region_destroy(region);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_region_destroy);
+/*
+ * Initialise and register the devlink port embedded in @dp.
+ *
+ * The driver's optional port_setup() op runs after devlink_port_init()
+ * and before devlink_port_register(); if registration then fails, the
+ * optional port_teardown() op is called to undo it. The tree index is
+ * copied in as the switch ID and the DSA port type selects the devlink
+ * port flavour.
+ *
+ * Returns 0 on success, or the error from port_setup() or
+ * devlink_port_register().
+ * NOTE(review): neither error path calls devlink_port_fini() - confirm
+ * that this is intended.
+ */
+int dsa_port_devlink_setup(struct dsa_port *dp)
+{
+ struct devlink_port *dlp = &dp->devlink_port;
+ struct dsa_switch_tree *dst = dp->ds->dst;
+ struct devlink_port_attrs attrs = {};
+ struct devlink *dl = dp->ds->devlink;
+ struct dsa_switch *ds = dp->ds;
+ const unsigned char *id;
+ unsigned char len;
+ int err;
+
+ memset(dlp, 0, sizeof(*dlp));
+ devlink_port_init(dl, dlp);
+
+ if (ds->ops->port_setup) {
+ err = ds->ops->port_setup(ds, dp->index);
+ if (err)
+ return err;
+ }
+
+ id = (const unsigned char *)&dst->index;
+ len = sizeof(dst->index);
+
+ attrs.phys.port_number = dp->index;
+ memcpy(attrs.switch_id.id, id, len);
+ attrs.switch_id.id_len = len;
+
+ switch (dp->type) {
+ case DSA_PORT_TYPE_UNUSED:
+ attrs.flavour = DEVLINK_PORT_FLAVOUR_UNUSED;
+ break;
+ case DSA_PORT_TYPE_CPU:
+ attrs.flavour = DEVLINK_PORT_FLAVOUR_CPU;
+ break;
+ case DSA_PORT_TYPE_DSA:
+ attrs.flavour = DEVLINK_PORT_FLAVOUR_DSA;
+ break;
+ case DSA_PORT_TYPE_USER:
+ attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
+ break;
+ }
+
+ devlink_port_attrs_set(dlp, &attrs);
+ err = devlink_port_register(dl, dlp, dp->index);
+ if (err) {
+ if (ds->ops->port_teardown)
+ ds->ops->port_teardown(ds, dp->index);
+ return err;
+ }
+
+ return 0;
+}
+/*
+ * Undo dsa_port_devlink_setup(): unregister the devlink port of @dp,
+ * call the driver's optional port_teardown() op, then finalise the
+ * devlink port with devlink_port_fini().
+ */
+void dsa_port_devlink_teardown(struct dsa_port *dp)
+{
+ struct devlink_port *dlp = &dp->devlink_port;
+ struct dsa_switch *ds = dp->ds;
+
+ devlink_port_unregister(dlp);
+
+ if (ds->ops->port_teardown)
+ ds->ops->port_teardown(ds, dp->index);
+
+ devlink_port_fini(dlp);
+}
+/*
+ * Register the switch's devlink instance (ds->devlink) with devlink.
+ */
+void dsa_switch_devlink_register(struct dsa_switch *ds)
+{
+ devlink_register(ds->devlink);
+}
+/*
+ * Unregister the switch's devlink instance (ds->devlink) from devlink.
+ */
+void dsa_switch_devlink_unregister(struct dsa_switch *ds)
+{
+ devlink_unregister(ds->devlink);
+}
+/*
+ * Allocate the devlink instance for @ds using dsa_devlink_ops, with a
+ * struct dsa_devlink_priv private area that points back at @ds, and
+ * store it in ds->devlink.
+ *
+ * Returns 0 on success or -ENOMEM if devlink_alloc() fails.
+ */
+int dsa_switch_devlink_alloc(struct dsa_switch *ds)
+{
+ struct dsa_devlink_priv *dl_priv;
+ struct devlink *dl;
+
+ /* Add the switch to devlink before calling setup, so that setup can
+ * add dpipe tables
+ */
+ dl = devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv), ds->dev);
+ if (!dl)
+ return -ENOMEM;
+
+ ds->devlink = dl;
+
+ dl_priv = devlink_priv(ds->devlink);
+ dl_priv->ds = ds;
+
+ return 0;
+}
+/*
+ * Free the switch's devlink instance and clear ds->devlink so the stale
+ * pointer cannot be reused.
+ */
+void dsa_switch_devlink_free(struct dsa_switch *ds)
+{
+ devlink_free(ds->devlink);
+ ds->devlink = NULL;
+}
diff --git a/net/dsa/devlink.h b/net/dsa/devlink.h
new file mode 100644
index 000000000000..4d9f4f23705b
--- /dev/null
+++ b/net/dsa/devlink.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_DEVLINK_H
+#define __DSA_DEVLINK_H
+
+struct dsa_port;
+struct dsa_switch;
+
+int dsa_port_devlink_setup(struct dsa_port *dp);
+void dsa_port_devlink_teardown(struct dsa_port *dp);
+void dsa_switch_devlink_register(struct dsa_switch *ds);
+void dsa_switch_devlink_unregister(struct dsa_switch *ds);
+int dsa_switch_devlink_alloc(struct dsa_switch *ds);
+void dsa_switch_devlink_free(struct dsa_switch *ds);
+
+#endif
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 64b14f655b23..e5f156940c67 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -1,453 +1,1637 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * net/dsa/dsa.c - Hardware switch handling
+ * DSA topology and switch handling
+ *
* Copyright (c) 2008-2009 Marvell Semiconductor
* Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
+ * Copyright (c) 2016 Andrew Lunn <andrew@lunn.ch>
*/
#include <linux/device.h>
+#include <linux/err.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/netdevice.h>
-#include <linux/sysfs.h>
-#include <linux/ptp_classify.h>
+#include <linux/slab.h>
+#include <linux/rtnetlink.h>
+#include <linux/of.h>
+#include <linux/of_mdio.h>
+#include <linux/of_net.h>
+#include <net/sch_generic.h>
+
+#include "devlink.h"
+#include "dsa.h"
+#include "master.h"
+#include "netlink.h"
+#include "port.h"
+#include "slave.h"
+#include "switch.h"
+#include "tag.h"
+
+#define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG
+
+static DEFINE_MUTEX(dsa2_mutex);
+LIST_HEAD(dsa_tree_list);
-#include "dsa_priv.h"
+static struct workqueue_struct *dsa_owq;
-static LIST_HEAD(dsa_tag_drivers_list);
-static DEFINE_MUTEX(dsa_tag_drivers_lock);
+/* Track the bridges with forwarding offload enabled */
+static unsigned long dsa_fwd_offloading_bridges;
-static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
- struct net_device *dev)
+bool dsa_schedule_work(struct work_struct *work)
{
- /* Just return the original SKB */
- return skb;
+ return queue_work(dsa_owq, work);
}
-static const struct dsa_device_ops none_ops = {
- .name = "none",
- .proto = DSA_TAG_PROTO_NONE,
- .xmit = dsa_slave_notag_xmit,
- .rcv = NULL,
-};
+void dsa_flush_workqueue(void)
+{
+ flush_workqueue(dsa_owq); /* dsa_owq is the queue dsa_schedule_work() uses */
+}
+EXPORT_SYMBOL_GPL(dsa_flush_workqueue);
-DSA_TAG_DRIVER(none_ops);
+/**
+ * dsa_lag_map() - Map LAG structure to a linear LAG array
+ * @dst: Tree in which to record the mapping.
+ * @lag: LAG structure that is to be mapped to the tree's array.
+ *
+ * dsa_lag_id/dsa_lag_by_id can then be used to translate between the
+ * two spaces. The size of the mapping space is determined by the
+ * driver by setting ds->num_lag_ids. It is perfectly legal to leave
+ * it unset if it is not needed, in which case these functions become
+ * no-ops.
+ */
+void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag)
+{
+ unsigned int id;
+
+ for (id = 1; id <= dst->lags_len; id++) {
+ if (!dsa_lag_by_id(dst, id)) {
+ dst->lags[id - 1] = lag;
+ lag->id = id;
+ return;
+ }
+ }
-static void dsa_tag_driver_register(struct dsa_tag_driver *dsa_tag_driver,
- struct module *owner)
+ /* No IDs left, which is OK. Some drivers do not need it. The
+ * ones that do, e.g. mv88e6xxx, will discover that dsa_lag_id
+ * returns an error for this device when joining the LAG. The
+ * driver can then return -EOPNOTSUPP back to DSA, which will
+ * fall back to a software LAG.
+ */
+}
+
+/**
+ * dsa_lag_unmap() - Remove a LAG ID mapping
+ * @dst: Tree in which the mapping is recorded.
+ * @lag: LAG structure that was mapped.
+ *
+ * As there may be multiple users of the mapping, it is only removed
+ * if there are no other references to it.
+ */
+void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag)
{
- dsa_tag_driver->owner = owner;
+ unsigned int id;
- mutex_lock(&dsa_tag_drivers_lock);
- list_add_tail(&dsa_tag_driver->list, &dsa_tag_drivers_list);
- mutex_unlock(&dsa_tag_drivers_lock);
+ dsa_lags_foreach_id(id, dst) {
+ if (dsa_lag_by_id(dst, id) == lag) {
+ dst->lags[id - 1] = NULL;
+ lag->id = 0;
+ break;
+ }
+ }
}
-void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[],
- unsigned int count, struct module *owner)
+struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst,
+ const struct net_device *lag_dev)
{
- unsigned int i;
+ struct dsa_port *dp;
- for (i = 0; i < count; i++)
- dsa_tag_driver_register(dsa_tag_driver_array[i], owner);
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_lag_dev_get(dp) == lag_dev)
+ return dp->lag;
+
+ return NULL;
}
-static void dsa_tag_driver_unregister(struct dsa_tag_driver *dsa_tag_driver)
+struct dsa_bridge *dsa_tree_bridge_find(struct dsa_switch_tree *dst,
+ const struct net_device *br)
{
- mutex_lock(&dsa_tag_drivers_lock);
- list_del(&dsa_tag_driver->list);
- mutex_unlock(&dsa_tag_drivers_lock);
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_bridge_dev_get(dp) == br)
+ return dp->bridge;
+
+ return NULL;
}
-EXPORT_SYMBOL_GPL(dsa_tag_drivers_register);
-void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[],
- unsigned int count)
+static int dsa_bridge_num_find(const struct net_device *bridge_dev)
{
- unsigned int i;
+ struct dsa_switch_tree *dst;
- for (i = 0; i < count; i++)
- dsa_tag_driver_unregister(dsa_tag_driver_array[i]);
+ list_for_each_entry(dst, &dsa_tree_list, list) {
+ struct dsa_bridge *bridge;
+
+ bridge = dsa_tree_bridge_find(dst, bridge_dev);
+ if (bridge)
+ return bridge->num;
+ }
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(dsa_tag_drivers_unregister);
-const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops)
+unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max)
{
- return ops->name;
-};
+ unsigned int bridge_num = dsa_bridge_num_find(bridge_dev);
+
+ /* Switches without FDB isolation support don't get unique
+ * bridge numbering
+ */
+ if (!max)
+ return 0;
+
+ if (!bridge_num) {
+ /* First port that requests FDB isolation or TX forwarding
+ * offload for this bridge
+ */
+ bridge_num = find_next_zero_bit(&dsa_fwd_offloading_bridges,
+ DSA_MAX_NUM_OFFLOADING_BRIDGES,
+ 1);
+ if (bridge_num >= max)
+ return 0;
+
+ set_bit(bridge_num, &dsa_fwd_offloading_bridges);
+ }
+
+ return bridge_num;
+}
+/*
+ * Release @bridge_num by clearing its bit in dsa_fwd_offloading_bridges.
+ * @bridge_dev is not used by this function.
+ */
+void dsa_bridge_num_put(const struct net_device *bridge_dev,
+ unsigned int bridge_num)
+{
+ /* Since we refcount bridges, we know that when we call this function
+ * it is no longer in use, so we can just go ahead and remove it from
+ * the bit mask.
+ */
+ clear_bit(bridge_num, &dsa_fwd_offloading_bridges);
+}
+/*
+ * Look up a switch by tree index and switch index.
+ *
+ * Walks the ports of every tree in dsa_tree_list whose index equals
+ * @tree_index and returns the switch of the first port whose switch
+ * index equals @sw_index, or NULL if there is no such port.
+ */
+struct dsa_switch *dsa_switch_find(int tree_index, int sw_index)
+{
+ struct dsa_switch_tree *dst;
+ struct dsa_port *dp;
+
+ list_for_each_entry(dst, &dsa_tree_list, list) {
+ if (dst->index != tree_index)
+ continue;
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dp->ds->index != sw_index)
+ continue;
+
+ return dp->ds;
+ }
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(dsa_switch_find);
+/*
+ * Return the tree in dsa_tree_list whose index equals @index, or NULL.
+ * No reference is taken on the returned tree.
+ */
+static struct dsa_switch_tree *dsa_tree_find(int index)
+{
+ struct dsa_switch_tree *dst;
+
+ list_for_each_entry(dst, &dsa_tree_list, list)
+ if (dst->index == index)
+ return dst;
+
+ return NULL;
+}
+
+static struct dsa_switch_tree *dsa_tree_alloc(int index)
+{
+ struct dsa_switch_tree *dst;
+
+ dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+ if (!dst)
+ return NULL;
+
+ dst->index = index;
+
+ INIT_LIST_HEAD(&dst->rtable);
+
+ INIT_LIST_HEAD(&dst->ports);
+
+ INIT_LIST_HEAD(&dst->list);
+ list_add_tail(&dst->list, &dsa_tree_list);
+
+ kref_init(&dst->refcount);
-/* Function takes a reference on the module owning the tagger,
- * so dsa_tag_driver_put must be called afterwards.
+ return dst;
+}
+/*
+ * Destroy @dst: drop the tag driver reference if one is held, unlink
+ * the tree from dsa_tree_list and free it.
+ */
+static void dsa_tree_free(struct dsa_switch_tree *dst)
+{
+ if (dst->tag_ops)
+ dsa_tag_driver_put(dst->tag_ops);
+ list_del(&dst->list);
+ kfree(dst);
+}
+/*
+ * Take a reference on @dst (a NULL @dst is tolerated) and return @dst.
+ */
+static struct dsa_switch_tree *dsa_tree_get(struct dsa_switch_tree *dst)
+{
+ if (dst)
+ kref_get(&dst->refcount);
+
+ return dst;
+}
+/*
+ * Return the tree with index @index: an existing tree gets an extra
+ * reference via dsa_tree_get(), otherwise a new tree is allocated with
+ * dsa_tree_alloc() and that result is returned.
+ */
+static struct dsa_switch_tree *dsa_tree_touch(int index)
+{
+ struct dsa_switch_tree *dst;
+
+ dst = dsa_tree_find(index);
+ if (dst)
+ return dsa_tree_get(dst);
+ else
+ return dsa_tree_alloc(index);
+}
+/*
+ * kref release callback passed to kref_put() by dsa_tree_put(): recover
+ * the tree that embeds @ref and free it.
+ */
+static void dsa_tree_release(struct kref *ref)
+{
+ struct dsa_switch_tree *dst;
+
+ dst = container_of(ref, struct dsa_switch_tree, refcount);
+
+ dsa_tree_free(dst);
+}
+/*
+ * Drop a reference on @dst (a NULL @dst is tolerated); dsa_tree_release()
+ * is the kref release function.
+ */
+static void dsa_tree_put(struct dsa_switch_tree *dst)
+{
+ if (dst)
+ kref_put(&dst->refcount, dsa_tree_release);
+}
+/*
+ * Return the port of @dst whose device node (dp->dn) is @dn, or NULL if
+ * no port in the tree uses that node.
+ */
+static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst,
+ struct device_node *dn)
+{
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dp->dn == dn)
+ return dp;
+
+ return NULL;
+}
+/*
+ * Return the link from @dp to @link_dp in the routing table of @dp's
+ * tree, creating it and appending it to the table if it does not exist
+ * yet. Returns NULL if a new link cannot be allocated.
+ */
+static struct dsa_link *dsa_link_touch(struct dsa_port *dp,
+ struct dsa_port *link_dp)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_switch_tree *dst;
+ struct dsa_link *dl;
+
+ dst = ds->dst;
+
+ list_for_each_entry(dl, &dst->rtable, list)
+ if (dl->dp == dp && dl->link_dp == link_dp)
+ return dl;
+
+ dl = kzalloc(sizeof(*dl), GFP_KERNEL);
+ if (!dl)
+ return NULL;
+
+ dl->dp = dp;
+ dl->link_dp = link_dp;
+
+ INIT_LIST_HEAD(&dl->list);
+ list_add_tail(&dl->list, &dst->rtable);
+
+ return dl;
+}
+/*
+ * For every phandle in the "link" property of @dp's device node, find
+ * the port of the same tree that owns the referenced node and make sure
+ * a link from @dp to it exists in the routing table.
+ *
+ * Returns false when a referenced port is not in the tree or a link
+ * cannot be allocated (the iterator's node reference is dropped before
+ * leaving the loop early), true once all phandles have been handled.
+ */
+static bool dsa_port_setup_routing_table(struct dsa_port *dp)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_switch_tree *dst = ds->dst;
+ struct device_node *dn = dp->dn;
+ struct of_phandle_iterator it;
+ struct dsa_port *link_dp;
+ struct dsa_link *dl;
+ int err;
+
+ of_for_each_phandle(&it, err, dn, "link", NULL, 0) {
+ link_dp = dsa_tree_find_port_by_node(dst, it.node);
+ if (!link_dp) {
+ of_node_put(it.node);
+ return false;
+ }
+
+ dl = dsa_link_touch(dp, link_dp);
+ if (!dl) {
+ of_node_put(it.node);
+ return false;
+ }
+ }
+
+ return true;
+}
+/*
+ * Run dsa_port_setup_routing_table() on every DSA port of @dst, stopping
+ * at the first port that reports an incomplete table. Returns true if
+ * all DSA ports succeeded (or the tree has none), false otherwise.
+ */
+static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst)
+{
+ bool complete = true;
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dsa_port_is_dsa(dp)) {
+ complete = dsa_port_setup_routing_table(dp);
+ if (!complete)
+ break;
+ }
+ }
+
+ return complete;
+}
+/*
+ * Return the first port in @dst's port list for which dsa_port_is_cpu()
+ * is true, or NULL if the tree has no such port.
+ */
+static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_is_cpu(dp))
+ return dp;
+
+ return NULL;
+}
+/*
+ * Resolve the net_device referenced by the "ethernet" phandle of the
+ * first CPU port in @dst and return whatever
+ * of_find_net_device_by_node() yields for it.
+ * NOTE(review): cpu_dp is dereferenced without a NULL check, so this
+ * presumably relies on the tree having at least one CPU port - confirm
+ * against the callers.
+ */
+struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst)
+{
+ struct device_node *ethernet;
+ struct net_device *master;
+ struct dsa_port *cpu_dp;
+
+ cpu_dp = dsa_tree_find_first_cpu(dst);
+ ethernet = of_parse_phandle(cpu_dp->dn, "ethernet", 0);
+ master = of_find_net_device_by_node(ethernet);
+ of_node_put(ethernet);
+
+ return master;
+}
+
+/* Assign the default CPU port (the first one in the tree) to all ports of the
+ * fabric which don't already have one as part of their own switch.
+ *
+ * Returns 0 on success, or -EINVAL when the tree has no CPU port at all.
*/
-const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf)
+static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst)
{
- const struct dsa_device_ops *ops = ERR_PTR(-ENOPROTOOPT);
- struct dsa_tag_driver *dsa_tag_driver;
+ struct dsa_port *cpu_dp, *dp;
- mutex_lock(&dsa_tag_drivers_lock);
- list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) {
- const struct dsa_device_ops *tmp = dsa_tag_driver->ops;
+ cpu_dp = dsa_tree_find_first_cpu(dst);
+ if (!cpu_dp) {
+ pr_err("DSA: tree %d has no CPU port\n", dst->index);
+ return -EINVAL;
+ }
- if (!sysfs_streq(buf, tmp->name))
+ list_for_each_entry(dp, &dst->ports, list) {
+ /* Keep any CPU port that was already assigned to this port */
+ if (dp->cpu_dp)
continue;
- if (!try_module_get(dsa_tag_driver->owner))
- break;
+ /* Only user and DSA ports get a CPU port assigned */
+ if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
+ dp->cpu_dp = cpu_dp;
+ }
- ops = tmp;
- break;
+ return 0;
+}
+
+/* Perform initial assignment of CPU ports to user ports and DSA links in the
+ * fabric, giving preference to CPU ports local to each switch. Default to
+ * using the first CPU port in the switch tree if the port does not have a CPU
+ * port local to this switch.
+ *
+ * Returns the result of dsa_tree_setup_default_cpu().
+ */
+static int dsa_tree_setup_cpu_ports(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *cpu_dp, *dp;
+
+ list_for_each_entry(cpu_dp, &dst->ports, list) {
+ if (!dsa_port_is_cpu(cpu_dp))
+ continue;
+
+ /* Prefer a local CPU port */
+ dsa_switch_for_each_port(dp, cpu_dp->ds) {
+ /* Prefer the first local CPU port found */
+ if (dp->cpu_dp)
+ continue;
+
+ if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
+ dp->cpu_dp = cpu_dp;
+ }
}
- mutex_unlock(&dsa_tag_drivers_lock);
- return ops;
+ /* Ports still without a cpu_dp fall back to the tree-wide default */
+ return dsa_tree_setup_default_cpu(dst);
}
-const struct dsa_device_ops *dsa_tag_driver_get(int tag_protocol)
+/* Clear the CPU port assignment (dp->cpu_dp) of every user and DSA port in
+ * the tree.
+ */
+static void dsa_tree_teardown_cpu_ports(struct dsa_switch_tree *dst)
{
- struct dsa_tag_driver *dsa_tag_driver;
- const struct dsa_device_ops *ops;
- bool found = false;
+ struct dsa_port *dp;
- request_module("%s%d", DSA_TAG_DRIVER_ALIAS, tag_protocol);
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
+ dp->cpu_dp = NULL;
+}
- mutex_lock(&dsa_tag_drivers_lock);
- list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) {
- ops = dsa_tag_driver->ops;
- if (ops->proto == tag_protocol) {
- found = true;
+/* Set up a single port according to its type. A port that is already set up
+ * is left untouched and 0 is returned.
+ *
+ * On failure, every step that succeeded (enable, shared link registration,
+ * devlink port setup) is undone before the error code is returned.
+ */
+static int dsa_port_setup(struct dsa_port *dp)
+{
+ bool dsa_port_link_registered = false;
+ struct dsa_switch *ds = dp->ds;
+ bool dsa_port_enabled = false;
+ int err = 0;
+
+ if (dp->setup)
+ return 0;
+
+ err = dsa_port_devlink_setup(dp);
+ if (err)
+ return err;
+
+ switch (dp->type) {
+ case DSA_PORT_TYPE_UNUSED:
+ dsa_port_disable(dp);
+ break;
+ case DSA_PORT_TYPE_CPU:
+ /* Link registration needs a device tree node for the port */
+ if (dp->dn) {
+ err = dsa_shared_port_link_register_of(dp);
+ if (err)
+ break;
+ dsa_port_link_registered = true;
+ } else {
+ dev_warn(ds->dev,
+ "skipping link registration for CPU port %d\n",
+ dp->index);
+ }
+
+ err = dsa_port_enable(dp, NULL);
+ if (err)
break;
+ dsa_port_enabled = true;
+
+ break;
+ case DSA_PORT_TYPE_DSA:
+ /* Same sequence as for CPU ports */
+ if (dp->dn) {
+ err = dsa_shared_port_link_register_of(dp);
+ if (err)
+ break;
+ dsa_port_link_registered = true;
+ } else {
+ dev_warn(ds->dev,
+ "skipping link registration for DSA port %d\n",
+ dp->index);
}
+
+ err = dsa_port_enable(dp, NULL);
+ if (err)
+ break;
+ dsa_port_enabled = true;
+
+ break;
+ case DSA_PORT_TYPE_USER:
+ /* The return value is ignored: a missing MAC address in the
+ * device tree is not treated as an error here.
+ */
+ of_get_mac_address(dp->dn, dp->mac);
+ err = dsa_slave_create(dp);
+ break;
}
- if (found) {
- if (!try_module_get(dsa_tag_driver->owner))
- ops = ERR_PTR(-ENOPROTOOPT);
- } else {
- ops = ERR_PTR(-ENOPROTOOPT);
+ /* Unwind in reverse order of the steps taken above */
+ if (err && dsa_port_enabled)
+ dsa_port_disable(dp);
+ if (err && dsa_port_link_registered)
+ dsa_shared_port_link_unregister_of(dp);
+ if (err) {
+ dsa_port_devlink_teardown(dp);
+ return err;
}
- mutex_unlock(&dsa_tag_drivers_lock);
+ dp->setup = true;
- return ops;
+ return 0;
}
-void dsa_tag_driver_put(const struct dsa_device_ops *ops)
+/* Undo dsa_port_setup() for a port, according to its type. Does nothing when
+ * the port is not marked as set up.
+ */
+static void dsa_port_teardown(struct dsa_port *dp)
{
- struct dsa_tag_driver *dsa_tag_driver;
+ if (!dp->setup)
+ return;
- mutex_lock(&dsa_tag_drivers_lock);
- list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) {
- if (dsa_tag_driver->ops == ops) {
- module_put(dsa_tag_driver->owner);
- break;
+ switch (dp->type) {
+ case DSA_PORT_TYPE_UNUSED:
+ break;
+ case DSA_PORT_TYPE_CPU:
+ dsa_port_disable(dp);
+ /* The link was only registered when the port has a DT node */
+ if (dp->dn)
+ dsa_shared_port_link_unregister_of(dp);
+ break;
+ case DSA_PORT_TYPE_DSA:
+ dsa_port_disable(dp);
+ if (dp->dn)
+ dsa_shared_port_link_unregister_of(dp);
+ break;
+ case DSA_PORT_TYPE_USER:
+ if (dp->slave) {
+ dsa_slave_destroy(dp->slave);
+ dp->slave = NULL;
}
+ break;
}
- mutex_unlock(&dsa_tag_drivers_lock);
+
+ dsa_port_devlink_teardown(dp);
+
+ dp->setup = false;
}
-static int dev_is_class(struct device *dev, void *class)
+/* Re-type @dp as an unused port and run the regular port setup on it.
+ * Returns the result of dsa_port_setup().
+ */
+static int dsa_port_setup_as_unused(struct dsa_port *dp)
{
- if (dev->class != NULL && !strcmp(dev->class->name, class))
- return 1;
+ dp->type = DSA_PORT_TYPE_UNUSED;
+ return dsa_port_setup(dp);
+}
+
+/* Make the switch use the tagging protocol selected for its tree and connect
+ * the switch and the tagger to each other.
+ *
+ * Returns 0 on success or a negative error code. When the driver's
+ * connect_tag_protocol() fails, the tagger is disconnected again.
+ */
+static int dsa_switch_setup_tag_protocol(struct dsa_switch *ds)
+{
+ const struct dsa_device_ops *tag_ops = ds->dst->tag_ops;
+ struct dsa_switch_tree *dst = ds->dst;
+ int err;
+
+ /* Nothing to change when the tree already uses the default protocol */
+ if (tag_ops->proto == dst->default_proto)
+ goto connect;
+
+ /* change_tag_protocol() is called with the rtnl lock held */
+ rtnl_lock();
+ err = ds->ops->change_tag_protocol(ds, tag_ops->proto);
+ rtnl_unlock();
+ if (err) {
+ dev_err(ds->dev, "Unable to use tag protocol \"%s\": %pe\n",
+ tag_ops->name, ERR_PTR(err));
+ return err;
+ }
+
+connect:
+ /* Both connect hooks are optional */
+ if (tag_ops->connect) {
+ err = tag_ops->connect(ds);
+ if (err)
+ return err;
+ }
+
+ if (ds->ops->connect_tag_protocol) {
+ err = ds->ops->connect_tag_protocol(ds, tag_ops->proto);
+ if (err) {
+ dev_err(ds->dev,
+ "Unable to connect to tag protocol \"%s\": %pe\n",
+ tag_ops->name, ERR_PTR(err));
+ goto disconnect;
+ }
+ }
return 0;
+
+disconnect:
+ if (tag_ops->disconnect)
+ tag_ops->disconnect(ds);
+
+ return err;
}
-static struct device *dev_find_class(struct device *parent, char *class)
+/* Call the optional disconnect() hook of the tree's tagger for this switch */
+static void dsa_switch_teardown_tag_protocol(struct dsa_switch *ds)
{
- if (dev_is_class(parent, class)) {
- get_device(parent);
- return parent;
- }
+ const struct dsa_device_ops *tag_ops = ds->dst->tag_ops;
- return device_find_child(parent, class, dev_is_class);
+ if (tag_ops->disconnect)
+ tag_ops->disconnect(ds);
}
-struct net_device *dsa_dev_to_net_device(struct device *dev)
+/* Set up one switch: devlink instance, notifier, driver setup(), tagging
+ * protocol and, when the driver provides phy_read() but no MDIO bus of its
+ * own, a slave MDIO bus. A switch that is already set up is left untouched.
+ *
+ * Returns 0 on success or a negative error code; on failure every step that
+ * already succeeded is undone in reverse order.
+ */
+static int dsa_switch_setup(struct dsa_switch *ds)
{
- struct device *d;
+ struct device_node *dn;
+ int err;
- d = dev_find_class(dev, "net");
- if (d != NULL) {
- struct net_device *nd;
+ if (ds->setup)
+ return 0;
- nd = to_net_dev(d);
- dev_hold(nd);
- put_device(d);
+ /* Initialize ds->phys_mii_mask before registering the slave MDIO bus
+ * driver and before ops->setup() has run, since the switch drivers and
+ * the slave MDIO bus driver rely on these values for probing PHY
+ * devices or not
+ */
+ ds->phys_mii_mask |= dsa_user_ports(ds);
- return nd;
+ err = dsa_switch_devlink_alloc(ds);
+ if (err)
+ return err;
+
+ err = dsa_switch_register_notifier(ds);
+ if (err)
+ goto devlink_free;
+
+ ds->configure_vlan_while_not_filtering = true;
+
+ err = ds->ops->setup(ds);
+ if (err < 0)
+ goto unregister_notifier;
+
+ err = dsa_switch_setup_tag_protocol(ds);
+ if (err)
+ goto teardown;
+
+ if (!ds->slave_mii_bus && ds->ops->phy_read) {
+ ds->slave_mii_bus = mdiobus_alloc();
+ if (!ds->slave_mii_bus) {
+ err = -ENOMEM;
+ goto disconnect_tagger;
+ }
+
+ dsa_slave_mii_bus_init(ds);
+
+ dn = of_get_child_by_name(ds->dev->of_node, "mdio");
+
+ err = of_mdiobus_register(ds->slave_mii_bus, dn);
+ of_node_put(dn);
+ if (err < 0)
+ goto free_slave_mii_bus;
}
- return NULL;
+ dsa_switch_devlink_register(ds);
+
+ ds->setup = true;
+ return 0;
+
+free_slave_mii_bus:
+ /* Clear the pointer so that a later setup attempt allocates a fresh
+ * bus instead of reusing the freed one.
+ */
+ if (ds->slave_mii_bus && ds->ops->phy_read) {
+ mdiobus_free(ds->slave_mii_bus);
+ ds->slave_mii_bus = NULL;
+ }
+disconnect_tagger:
+ /* The tagger was connected by dsa_switch_setup_tag_protocol() */
+ dsa_switch_teardown_tag_protocol(ds);
+teardown:
+ if (ds->ops->teardown)
+ ds->ops->teardown(ds);
+unregister_notifier:
+ dsa_switch_unregister_notifier(ds);
+devlink_free:
+ dsa_switch_devlink_free(ds);
+ return err;
}
-EXPORT_SYMBOL_GPL(dsa_dev_to_net_device);
-/* Determine if we should defer delivery of skb until we have a rx timestamp.
- *
- * Called from dsa_switch_rcv. For now, this will only work if tagging is
- * enabled on the switch. Normally the MAC driver would retrieve the hardware
- * timestamp when it reads the packet out of the hardware. However in a DSA
- * switch, the DSA driver owning the interface to which the packet is
- * delivered is never notified unless we do so here.
+/* Undo dsa_switch_setup(), releasing resources in the reverse order of their
+ * setup. Does nothing when the switch is not marked as set up.
+ */
+static void dsa_switch_teardown(struct dsa_switch *ds)
+{
+ if (!ds->setup)
+ return;
+
+ dsa_switch_devlink_unregister(ds);
+
+ /* Same condition as the error path of dsa_switch_setup() uses to
+ * decide whether the slave MDIO bus must be freed.
+ */
+ if (ds->slave_mii_bus && ds->ops->phy_read) {
+ mdiobus_unregister(ds->slave_mii_bus);
+ mdiobus_free(ds->slave_mii_bus);
+ ds->slave_mii_bus = NULL;
+ }
+
+ dsa_switch_teardown_tag_protocol(ds);
+
+ if (ds->ops->teardown)
+ ds->ops->teardown(ds);
+
+ dsa_switch_unregister_notifier(ds);
+
+ dsa_switch_devlink_free(ds);
+
+ ds->setup = false;
+}
+
+/* First tear down the non-shared, then the shared ports. This ensures that
+ * all work items scheduled by our switchdev handlers for user ports have
+ * completed before we destroy the refcounting kept on the shared ports.
*/
-static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p,
- struct sk_buff *skb)
+static void dsa_tree_teardown_ports(struct dsa_switch_tree *dst)
{
- struct dsa_switch *ds = p->dp->ds;
- unsigned int type;
+ struct dsa_port *dp;
- if (skb_headroom(skb) < ETH_HLEN)
- return false;
+ /* Pass 1: user and unused ports */
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_is_user(dp) || dsa_port_is_unused(dp))
+ dsa_port_teardown(dp);
+
+ /* Flush pending work before touching the shared ports */
+ dsa_flush_workqueue();
+
+ /* Pass 2: DSA and CPU ports */
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp))
+ dsa_port_teardown(dp);
+}
- __skb_push(skb, ETH_HLEN);
+/* Tear down the switch of every port in the tree. The same switch is visited
+ * once per port; dsa_switch_teardown() returns early for a switch that is no
+ * longer set up.
+ */
+static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *dp;
- type = ptp_classify_raw(skb);
+ list_for_each_entry(dp, &dst->ports, list)
+ dsa_switch_teardown(dp->ds);
+}
- __skb_pull(skb, ETH_HLEN);
+/* Bring shared ports up first, then non-shared ports */
+static int dsa_tree_setup_ports(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *dp;
+ int err = 0;
- if (type == PTP_CLASS_NONE)
- return false;
+ /* A DSA or CPU port that fails to set up fails the whole tree */
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp)) {
+ err = dsa_port_setup(dp);
+ if (err)
+ goto teardown;
+ }
+ }
- if (likely(ds->ops->port_rxtstamp))
- return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type);
+ /* A user or unused port that fails to set up is retried as an unused
+ * port; only a failure of that retry fails the tree.
+ */
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dsa_port_is_user(dp) || dsa_port_is_unused(dp)) {
+ err = dsa_port_setup(dp);
+ if (err) {
+ err = dsa_port_setup_as_unused(dp);
+ if (err)
+ goto teardown;
+ }
+ }
+ }
- return false;
+ return 0;
+
+teardown:
+ dsa_tree_teardown_ports(dst);
+
+ return err;
}
-static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt, struct net_device *unused)
+/* Set up the switch of every port in the tree. The same switch is visited
+ * once per port; dsa_switch_setup() returns 0 right away for a switch that
+ * is already set up. On the first failure all switches are torn down again
+ * and the error is returned.
+ */
+static int dsa_tree_setup_switches(struct dsa_switch_tree *dst)
{
- struct dsa_port *cpu_dp = dev->dsa_ptr;
- struct sk_buff *nskb = NULL;
- struct dsa_slave_priv *p;
+ struct dsa_port *dp;
+ int err = 0;
- if (unlikely(!cpu_dp)) {
- kfree_skb(skb);
- return 0;
+ list_for_each_entry(dp, &dst->ports, list) {
+ err = dsa_switch_setup(dp->ds);
+ if (err) {
+ dsa_tree_teardown_switches(dst);
+ break;
+ }
}
- skb = skb_unshare(skb, GFP_ATOMIC);
- if (!skb)
- return 0;
+ return err;
+}
- nskb = cpu_dp->rcv(skb, dev);
- if (!nskb) {
- kfree_skb(skb);
- return 0;
+/* Set up the master net_device of every CPU port in the tree, under the rtnl
+ * lock, and replay its current admin/oper state to the tree.
+ *
+ * Returns 0 or the first error from dsa_master_setup().
+ * NOTE(review): masters set up before the failing one are not torn down
+ * here - confirm whether the caller's unwind covers them.
+ */
+static int dsa_tree_setup_master(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *cpu_dp;
+ int err = 0;
+
+ rtnl_lock();
+
+ dsa_tree_for_each_cpu_port(cpu_dp, dst) {
+ struct net_device *master = cpu_dp->master;
+ /* Sampled before dsa_master_setup() is called */
+ bool admin_up = (master->flags & IFF_UP) &&
+ !qdisc_tx_is_noop(master);
+
+ err = dsa_master_setup(master, cpu_dp);
+ if (err)
+ break;
+
+ /* Replay master state event */
+ dsa_tree_master_admin_state_change(dst, master, admin_up);
+ dsa_tree_master_oper_state_change(dst, master,
+ netif_oper_up(master));
}
- skb = nskb;
- skb_push(skb, ETH_HLEN);
- skb->pkt_type = PACKET_HOST;
- skb->protocol = eth_type_trans(skb, skb->dev);
+ rtnl_unlock();
+
+ return err;
+}
+
+/* Under the rtnl lock, report every CPU port's master as administratively
+ * down and then tear the master down.
+ */
+static void dsa_tree_teardown_master(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *cpu_dp;
- if (unlikely(!dsa_slave_dev_check(skb->dev))) {
- /* Packet is to be injected directly on an upper
- * device, e.g. a team/bond, so skip all DSA-port
- * specific actions.
+ rtnl_lock();
+
+ dsa_tree_for_each_cpu_port(cpu_dp, dst) {
+ struct net_device *master = cpu_dp->master;
+
+ /* Synthesizing an "admin down" state is sufficient for
+ * the switches to get a notification if the master is
+ * currently up and running.
*/
- netif_rx(skb);
- return 0;
+ dsa_tree_master_admin_state_change(dst, master, false);
+
+ dsa_master_teardown(master);
}
- p = netdev_priv(skb->dev);
+ rtnl_unlock();
+}
- if (unlikely(cpu_dp->ds->untag_bridge_pvid)) {
- nskb = dsa_untag_bridge_pvid(skb);
- if (!nskb) {
- kfree_skb(skb);
- return 0;
- }
- skb = nskb;
+/* Allocate the tree's LAG table, sized by the largest num_lag_ids of any
+ * switch in the tree. Nothing is allocated when no switch has LAG ids.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int dsa_tree_setup_lags(struct dsa_switch_tree *dst)
+{
+ unsigned int len = 0;
+ struct dsa_port *dp;
+
+ /* Find the maximum over all switches, reached through their ports */
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dp->ds->num_lag_ids > len)
+ len = dp->ds->num_lag_ids;
}
- dev_sw_netstats_rx_add(skb->dev, skb->len);
+ if (!len)
+ return 0;
+
+ dst->lags = kcalloc(len, sizeof(*dst->lags), GFP_KERNEL);
+ if (!dst->lags)
+ return -ENOMEM;
+
+ dst->lags_len = len;
+ return 0;
+}
+
+/* Free the tree's LAG table and reset the bookkeeping.
+ *
+ * dsa_tree_setup_lags() leaves dst->lags and dst->lags_len untouched when no
+ * switch has LAG ids, so stale values must not survive a teardown: a later
+ * setup/teardown cycle would otherwise free the same table twice.
+ */
+static void dsa_tree_teardown_lags(struct dsa_switch_tree *dst)
+{
+ kfree(dst->lags);
+ dst->lags = NULL;
+ dst->lags_len = 0;
+}
+
+/* Set up a whole switch tree: routing table, CPU port assignment, switches,
+ * ports, masters and LAG table, in that order.
+ *
+ * Returns -EEXIST when the tree is already set up, 0 when the routing table
+ * is not complete yet (nothing else is done in that case) or when setup
+ * succeeded, and a negative error code otherwise. On error the steps that
+ * already succeeded are undone in reverse order.
+ */
+static int dsa_tree_setup(struct dsa_switch_tree *dst)
+{
+ bool complete;
+ int err;
+
+ if (dst->setup) {
+ pr_err("DSA: tree %d already setup! Disjoint trees?\n",
+ dst->index);
+ return -EEXIST;
+ }
- if (dsa_skb_defer_rx_timestamp(p, skb))
+ /* An incomplete routing table is not an error; dst->setup stays
+ * false so that a later call can try again.
+ */
+ complete = dsa_tree_setup_routing_table(dst);
+ if (!complete)
return 0;
- gro_cells_receive(&p->gcells, skb);
+ err = dsa_tree_setup_cpu_ports(dst);
+ if (err)
+ return err;
+
+ err = dsa_tree_setup_switches(dst);
+ if (err)
+ goto teardown_cpu_ports;
+
+ err = dsa_tree_setup_ports(dst);
+ if (err)
+ goto teardown_switches;
+
+ err = dsa_tree_setup_master(dst);
+ if (err)
+ goto teardown_ports;
+
+ err = dsa_tree_setup_lags(dst);
+ if (err)
+ goto teardown_master;
+
+ dst->setup = true;
+
+ pr_info("DSA: tree %d setup\n", dst->index);
return 0;
+
+teardown_master:
+ dsa_tree_teardown_master(dst);
+teardown_ports:
+ dsa_tree_teardown_ports(dst);
+teardown_switches:
+ dsa_tree_teardown_switches(dst);
+teardown_cpu_ports:
+ dsa_tree_teardown_cpu_ports(dst);
+
+ return err;
}
-#ifdef CONFIG_PM_SLEEP
-static bool dsa_port_is_initialized(const struct dsa_port *dp)
+/* Undo dsa_tree_setup() in reverse order and free every routing table link.
+ * Does nothing when the tree is not marked as set up.
+ */
+static void dsa_tree_teardown(struct dsa_switch_tree *dst)
{
- return dp->type == DSA_PORT_TYPE_USER && dp->slave;
+ struct dsa_link *dl, *next;
+
+ if (!dst->setup)
+ return;
+
+ dsa_tree_teardown_lags(dst);
+
+ dsa_tree_teardown_master(dst);
+
+ dsa_tree_teardown_ports(dst);
+
+ dsa_tree_teardown_switches(dst);
+
+ dsa_tree_teardown_cpu_ports(dst);
+
+ /* Entries are unlinked while iterating, hence the _safe variant */
+ list_for_each_entry_safe(dl, next, &dst->rtable, list) {
+ list_del(&dl->list);
+ kfree(dl);
+ }
+
+ pr_info("DSA: tree %d torn down\n", dst->index);
+
+ dst->setup = false;
}
-int dsa_switch_suspend(struct dsa_switch *ds)
+/* Switch the tree over to @tag_ops: notify the switches to connect to the new
+ * tagger, then to disconnect from the old one.
+ *
+ * -EOPNOTSUPP from the connect notification is not treated as a failure.
+ * On any other error the connection to the new tagger is undone, the old
+ * tagger is restored in dst->tag_ops and the error is returned.
+ */
+static int dsa_tree_bind_tag_proto(struct dsa_switch_tree *dst,
+ const struct dsa_device_ops *tag_ops)
+{
+ const struct dsa_device_ops *old_tag_ops = dst->tag_ops;
+ struct dsa_notifier_tag_proto_info info;
+ int err;
+
+ dst->tag_ops = tag_ops;
+
+ /* Notify the switches from this tree about the connection
+ * to the new tagger
+ */
+ info.tag_ops = tag_ops;
+ err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_CONNECT, &info);
+ if (err && err != -EOPNOTSUPP)
+ goto out_disconnect;
+
+ /* Notify the old tagger about the disconnection from this tree */
+ info.tag_ops = old_tag_ops;
+ dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info);
+
+ return 0;
+
+out_disconnect:
+ info.tag_ops = tag_ops;
+ dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info);
+ dst->tag_ops = old_tag_ops;
+
+ return err;
+}
+
+/* Since the dsa/tagging sysfs device attribute is per master, the assumption
+ * is that all DSA switches within a tree share the same tagger, otherwise
+ * they would have formed disjoint trees (different "dsa,member" values).
+ *
+ * Returns 0 on success, -EBUSY when a master or slave interface of any user
+ * port is up, the result of restart_syscall() when the rtnl lock cannot be
+ * taken right away, or the error from the notification/bind step. On such an
+ * error the switches are notified again with @old_tag_ops.
+ */
+int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst,
+ const struct dsa_device_ops *tag_ops,
+ const struct dsa_device_ops *old_tag_ops)
{
+ struct dsa_notifier_tag_proto_info info;
struct dsa_port *dp;
- int ret = 0;
+ int err = -EBUSY;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ /* At the moment we don't allow changing the tag protocol under
+ * traffic. The rtnl_mutex also happens to serialize concurrent
+ * attempts to change the tagging protocol. If we ever lift the IFF_UP
+ * restriction, there needs to be another mutex which serializes this.
+ */
+ dsa_tree_for_each_user_port(dp, dst) {
+ if (dsa_port_to_master(dp)->flags & IFF_UP)
+ goto out_unlock;
+
+ if (dp->slave->flags & IFF_UP)
+ goto out_unlock;
+ }
- /* Suspend slave network devices */
- dsa_switch_for_each_port(dp, ds) {
- if (!dsa_port_is_initialized(dp))
- continue;
+ /* Notify the tag protocol change */
+ info.tag_ops = tag_ops;
+ err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info);
+ if (err)
+ goto out_unwind_tagger;
- ret = dsa_slave_suspend(dp->slave);
- if (ret)
- return ret;
+ err = dsa_tree_bind_tag_proto(dst, tag_ops);
+ if (err)
+ goto out_unwind_tagger;
+
+ rtnl_unlock();
+
+ return 0;
+
+out_unwind_tagger:
+ info.tag_ops = old_tag_ops;
+ dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info);
+out_unlock:
+ rtnl_unlock();
+ return err;
+}
+
+/* Send a DSA_NOTIFIER_MASTER_STATE_CHANGE notification on @dst carrying
+ * @master and the current operational state of its CPU port.
+ */
+static void dsa_tree_master_state_change(struct dsa_switch_tree *dst,
+ struct net_device *master)
+{
+ struct dsa_notifier_master_state_info info;
+
+ info.operational = dsa_port_master_is_operational(master->dsa_ptr);
+ info.master = master;
+
+ dsa_tree_notify(dst, DSA_NOTIFIER_MASTER_STATE_CHANGE, &info);
+}
+
+/* Record the new administrative state @up of @master on its CPU port and
+ * notify the tree when that changes the master's operational state.
+ */
+void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst,
+ struct net_device *master,
+ bool up)
+{
+ struct dsa_port *cpu_dp = master->dsa_ptr;
+ bool changed;
+
+ /* Only physical DSA masters have their admin state tracked;
+ * LAG DSA masters are ignored.
+ */
+ if (netif_is_lag_master(master))
+ return;
+
+ /* Must be evaluated before master_admin_up is overwritten below */
+ changed = dsa_port_master_is_operational(cpu_dp) !=
+ (up && cpu_dp->master_oper_up);
+
+ cpu_dp->master_admin_up = up;
+
+ if (!changed)
+ return;
+
+ dsa_tree_master_state_change(dst, master);
+}
+
+/* Record the new operational state @up of @master on its CPU port and notify
+ * the tree when that changes the master's overall operational state.
+ */
+void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst,
+ struct net_device *master,
+ bool up)
+{
+ struct dsa_port *cpu_dp = master->dsa_ptr;
+ bool changed;
+
+ /* Only physical DSA masters have their oper state tracked;
+ * LAG DSA masters are ignored.
+ */
+ if (netif_is_lag_master(master))
+ return;
+
+ /* Must be evaluated before master_oper_up is overwritten below */
+ changed = dsa_port_master_is_operational(cpu_dp) !=
+ (cpu_dp->master_admin_up && up);
+
+ cpu_dp->master_oper_up = up;
+
+ if (!changed)
+ return;
+
+ dsa_tree_master_state_change(dst, master);
+}
+
+/* Return the port of @ds with the given @index, allocating it and adding it
+ * to the tree's port list when it does not exist yet.
+ *
+ * Returns NULL when the allocation fails.
+ */
+static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index)
+{
+ struct dsa_switch_tree *dst = ds->dst;
+ struct dsa_port *dp;
+
+ /* Reuse an existing port with the same index */
+ dsa_switch_for_each_port(dp, ds)
+ if (dp->index == index)
+ return dp;
+
+ dp = kzalloc(sizeof(*dp), GFP_KERNEL);
+ if (!dp)
+ return NULL;
+
+ dp->ds = ds;
+ dp->index = index;
+
+ mutex_init(&dp->addr_lists_lock);
+ mutex_init(&dp->vlans_lock);
+ INIT_LIST_HEAD(&dp->fdbs);
+ INIT_LIST_HEAD(&dp->mdbs);
+ INIT_LIST_HEAD(&dp->vlans);
+ INIT_LIST_HEAD(&dp->list);
+ list_add_tail(&dp->list, &dst->ports);
+
+ return dp;
+}
+
+/* Mark @dp as a user port and store @name as its name. Always returns 0. */
+static int dsa_port_parse_user(struct dsa_port *dp, const char *name)
+{
+ dp->name = name;
+ dp->type = DSA_PORT_TYPE_USER;
+
+ return 0;
+}
+
+/* Mark @dp as a DSA (switch-to-switch) port. Always returns 0. */
+static int dsa_port_parse_dsa(struct dsa_port *dp)
+{
+ dp->type = DSA_PORT_TYPE_DSA;
+
+ return 0;
+}
+
+/* Ask the switch driver which tagging protocol it wants to use on CPU port
+ * @dp. When @master is itself a DSA slave, the tagging protocol reported for
+ * the upstream port of the master's switch is passed along to the driver.
+ */
+static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp,
+ struct net_device *master)
+{
+ enum dsa_tag_protocol tag_protocol = DSA_TAG_PROTO_NONE;
+ struct dsa_switch *mds, *ds = dp->ds;
+ unsigned int mdp_upstream;
+ struct dsa_port *mdp;
+
+ /* It is possible to stack DSA switches onto one another. When that
+ * happens, the switch driver may want to know if its tagging protocol
+ * is going to work in such a configuration.
+ */
+ if (dsa_slave_dev_check(master)) {
+ mdp = dsa_slave_to_port(master);
+ mds = mdp->ds;
+ mdp_upstream = dsa_upstream_port(mds, mdp->index);
+ tag_protocol = mds->ops->get_tag_protocol(mds, mdp_upstream,
+ DSA_TAG_PROTO_NONE);
}
- if (ds->ops->suspend)
- ret = ds->ops->suspend(ds);
+ /* If the master device is not itself a DSA slave in a disjoint DSA
+ * tree, tag_protocol is still DSA_TAG_PROTO_NONE at this point.
+ */
+ return ds->ops->get_tag_protocol(ds, dp->index, tag_protocol);
+}
- return ret;
+/* Configure @dp as a CPU port attached to @master and select the tagging
+ * protocol of its tree.
+ *
+ * @user_protocol: optional tagger name requested by the user; NULL selects
+ * the protocol preferred by the switch driver.
+ *
+ * Returns 0 on success, -EINVAL when the tree would end up with more than
+ * one tagging protocol or the driver cannot change protocols, -EPROBE_DEFER
+ * when the tagging driver lookup reports -ENOPROTOOPT, or another error
+ * from the tagging driver lookup.
+ */
+static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master,
+ const char *user_protocol)
+{
+ const struct dsa_device_ops *tag_ops = NULL;
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_switch_tree *dst = ds->dst;
+ enum dsa_tag_protocol default_proto;
+
+ /* Find out which protocol the switch would prefer. */
+ default_proto = dsa_get_tag_protocol(dp, master);
+ if (dst->default_proto) {
+ if (dst->default_proto != default_proto) {
+ dev_err(ds->dev,
+ "A DSA switch tree can have only one tagging protocol\n");
+ return -EINVAL;
+ }
+ } else {
+ dst->default_proto = default_proto;
+ }
+
+ /* See if the user wants to override that preference. */
+ if (user_protocol) {
+ if (!ds->ops->change_tag_protocol) {
+ dev_err(ds->dev, "Tag protocol cannot be modified\n");
+ return -EINVAL;
+ }
+
+ tag_ops = dsa_tag_driver_get_by_name(user_protocol);
+ if (IS_ERR(tag_ops)) {
+ dev_warn(ds->dev,
+ "Failed to find a tagging driver for protocol %s, using default\n",
+ user_protocol);
+ tag_ops = NULL;
+ }
+ }
+
+ /* No (usable) user override: fall back to the driver's preference */
+ if (!tag_ops)
+ tag_ops = dsa_tag_driver_get_by_id(default_proto);
+
+ if (IS_ERR(tag_ops)) {
+ if (PTR_ERR(tag_ops) == -ENOPROTOOPT)
+ return -EPROBE_DEFER;
+
+ dev_warn(ds->dev, "No tagger for this switch\n");
+ return PTR_ERR(tag_ops);
+ }
+
+ if (dst->tag_ops) {
+ if (dst->tag_ops != tag_ops) {
+ dev_err(ds->dev,
+ "A DSA switch tree can have only one tagging protocol\n");
+
+ dsa_tag_driver_put(tag_ops);
+ return -EINVAL;
+ }
+
+ /* In the case of multiple CPU ports per switch, the tagging
+ * protocol is still reference-counted only per switch tree.
+ */
+ dsa_tag_driver_put(tag_ops);
+ } else {
+ dst->tag_ops = tag_ops;
+ }
+
+ dp->master = master;
+ dp->type = DSA_PORT_TYPE_CPU;
+ dsa_port_set_tag_protocol(dp, dst->tag_ops);
+ dp->dst = dst;
+
+ /* At this point, the tree may be configured to use a different
+ * tagger than the one chosen by the switch driver during
+ * .setup, in the case when a user selects a custom protocol
+ * through the DT.
+ *
+ * This is resolved by syncing the driver with the tree in
+ * dsa_switch_setup_tag_protocol once .setup has run and the
+ * driver is ready to accept calls to .change_tag_protocol. If
+ * the driver does not support the custom protocol at that
+ * point, the tree is wholly rejected, thereby ensuring that the
+ * tree and driver are always in agreement on the protocol to
+ * use.
+ */
+ return 0;
}
-EXPORT_SYMBOL_GPL(dsa_switch_suspend);
-int dsa_switch_resume(struct dsa_switch *ds)
+/* Classify port @dp from its device tree node @dn: a port with an "ethernet"
+ * phandle is a CPU port, one with a "link" property is a DSA port, and
+ * anything else is a user port named after its "label" property.
+ *
+ * Returns -EPROBE_DEFER when the net_device behind the "ethernet" phandle
+ * cannot be found, otherwise the result of the type-specific parse helper.
+ */
+static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn)
+{
+ struct device_node *ethernet = of_parse_phandle(dn, "ethernet", 0);
+ const char *name = of_get_property(dn, "label", NULL);
+ bool link = of_property_read_bool(dn, "link");
+
+ dp->dn = dn;
+
+ if (ethernet) {
+ struct net_device *master;
+ const char *user_protocol;
+
+ master = of_find_net_device_by_node(ethernet);
+ /* The node is only needed for the lookup above */
+ of_node_put(ethernet);
+ if (!master)
+ return -EPROBE_DEFER;
+
+ user_protocol = of_get_property(dn, "dsa-tag-protocol", NULL);
+ return dsa_port_parse_cpu(dp, master, user_protocol);
+ }
+
+ if (link)
+ return dsa_port_parse_dsa(dp);
+
+ return dsa_port_parse_user(dp, name);
+}
+
+/* Parse every available child of the "ports" (or "ethernet-ports") node of
+ * @dn and configure the matching port of @ds from it.
+ *
+ * Returns 0 on success, -EINVAL when no ports container node exists or a
+ * port's "reg" is not below ds->num_ports, or the error from reading "reg"
+ * or from dsa_port_parse_of().
+ */
+static int dsa_switch_parse_ports_of(struct dsa_switch *ds,
+ struct device_node *dn)
{
+ struct device_node *ports, *port;
struct dsa_port *dp;
- int ret = 0;
+ int err = 0;
+ u32 reg;
+
+ ports = of_get_child_by_name(dn, "ports");
+ if (!ports) {
+ /* The second possibility is "ethernet-ports" */
+ ports = of_get_child_by_name(dn, "ethernet-ports");
+ if (!ports) {
+ dev_err(ds->dev, "no ports child node found\n");
+ return -EINVAL;
+ }
+ }
- if (ds->ops->resume)
- ret = ds->ops->resume(ds);
+ /* Every early exit from this loop puts the current child node */
+ for_each_available_child_of_node(ports, port) {
+ err = of_property_read_u32(port, "reg", &reg);
+ if (err) {
+ of_node_put(port);
+ goto out_put_node;
+ }
- if (ret)
- return ret;
+ if (reg >= ds->num_ports) {
+ dev_err(ds->dev, "port %pOF index %u exceeds num_ports (%u)\n",
+ port, reg, ds->num_ports);
+ of_node_put(port);
+ err = -EINVAL;
+ goto out_put_node;
+ }
- /* Resume slave network devices */
- dsa_switch_for_each_port(dp, ds) {
- if (!dsa_port_is_initialized(dp))
- continue;
+ dp = dsa_to_port(ds, reg);
- ret = dsa_slave_resume(dp->slave);
- if (ret)
- return ret;
+ err = dsa_port_parse_of(dp, port);
+ if (err) {
+ of_node_put(port);
+ goto out_put_node;
+ }
+ }
+
+out_put_node:
+ of_node_put(ports);
+ return err;
+}
+
+/* Read the optional "dsa,member" property (tree index, switch index) of @dn
+ * and attach @ds to the corresponding tree. Both indices default to 0.
+ *
+ * Returns 0 on success, -ENOMEM when the tree cannot be obtained, -EEXIST
+ * when the tree already has a switch with that index, or the error from
+ * reading the property (other than -EINVAL).
+ */
+static int dsa_switch_parse_member_of(struct dsa_switch *ds,
+ struct device_node *dn)
+{
+ u32 m[2] = { 0, 0 };
+ int sz;
+
+ /* Don't error out if this optional property isn't found */
+ sz = of_property_read_variable_u32_array(dn, "dsa,member", m, 2, 2);
+ if (sz < 0 && sz != -EINVAL)
+ return sz;
+
+ ds->index = m[1];
+
+ ds->dst = dsa_tree_touch(m[0]);
+ if (!ds->dst)
+ return -ENOMEM;
+
+ if (dsa_switch_find(ds->dst->index, ds->index)) {
+ dev_err(ds->dev,
+ "A DSA switch with index %d already exists in tree %d\n",
+ ds->index, ds->dst->index);
+ return -EEXIST;
}
+ /* Track the highest switch index seen in this tree */
+ if (ds->dst->last_switch < ds->index)
+ ds->dst->last_switch = ds->index;
+
return 0;
}
-EXPORT_SYMBOL_GPL(dsa_switch_resume);
-#endif
-static struct packet_type dsa_pack_type __read_mostly = {
- .type = cpu_to_be16(ETH_P_XDSA),
- .func = dsa_switch_rcv,
-};
+/* Make sure a dsa_port exists for every port index of @ds.
+ * Returns 0 on success or -ENOMEM.
+ */
+static int dsa_switch_touch_ports(struct dsa_switch *ds)
+{
+ struct dsa_port *dp;
+ int port;
-static struct workqueue_struct *dsa_owq;
+ for (port = 0; port < ds->num_ports; port++) {
+ dp = dsa_port_touch(ds, port);
+ if (!dp)
+ return -ENOMEM;
+ }
-bool dsa_schedule_work(struct work_struct *work)
+ return 0;
+}
+
+/* Parse the device tree description of a switch: tree membership first, then
+ * port allocation, then the per-port nodes. Returns 0 or the first error.
+ */
+static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn)
{
- return queue_work(dsa_owq, work);
+ int err;
+
+ err = dsa_switch_parse_member_of(ds, dn);
+ if (err)
+ return err;
+
+ err = dsa_switch_touch_ports(ds);
+ if (err)
+ return err;
+
+ return dsa_switch_parse_ports_of(ds, dn);
}
-void dsa_flush_workqueue(void)
+/* Match callback: returns 1 when @dev belongs to the device class whose name
+ * equals the string passed in @class, 0 otherwise.
+ */
+static int dev_is_class(struct device *dev, void *class)
{
- flush_workqueue(dsa_owq);
+ if (dev->class != NULL && !strcmp(dev->class->name, class))
+ return 1;
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(dsa_flush_workqueue);
-int dsa_devlink_param_get(struct devlink *dl, u32 id,
- struct devlink_param_gset_ctx *ctx)
+/* Find a device of class @class: @parent itself when it matches (returned
+ * after get_device()), otherwise whatever device_find_child() finds among
+ * its children.
+ */
+static struct device *dev_find_class(struct device *parent, char *class)
{
- struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+ if (dev_is_class(parent, class)) {
+ get_device(parent);
+ return parent;
+ }
- if (!ds->ops->devlink_param_get)
- return -EOPNOTSUPP;
+ return device_find_child(parent, class, dev_is_class);
+}
+
+/* Resolve @dev to the net_device found by dev_find_class(dev, "net").
+ *
+ * Returns the net_device after dev_hold() has been called on it, or NULL
+ * when no device of class "net" is found.
+ */
+static struct net_device *dsa_dev_to_net_device(struct device *dev)
+{
+ struct device *d;
+
+ d = dev_find_class(dev, "net");
+ if (d != NULL) {
+ struct net_device *nd;
+
+ nd = to_net_dev(d);
+ /* Hold the net_device before putting the struct device */
+ dev_hold(nd);
+ put_device(d);
- return ds->ops->devlink_param_get(ds, id, ctx);
+ return nd;
+ }
+
+ return NULL;
}
-EXPORT_SYMBOL_GPL(dsa_devlink_param_get);
-int dsa_devlink_param_set(struct devlink *dl, u32 id,
- struct devlink_param_gset_ctx *ctx)
+/* Classify port @dp from its platform data @name: "cpu" is a CPU port whose
+ * master is looked up from @dev, "dsa" is a DSA port, and any other name is
+ * a user port with that name.
+ *
+ * Returns -EPROBE_DEFER when the master net_device cannot be found,
+ * otherwise the result of the type-specific parse helper.
+ */
+static int dsa_port_parse(struct dsa_port *dp, const char *name,
+ struct device *dev)
{
- struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+ if (!strcmp(name, "cpu")) {
+ struct net_device *master;
+
+ master = dsa_dev_to_net_device(dev);
+ if (!master)
+ return -EPROBE_DEFER;
+
+ /* NOTE(review): the hold taken by dsa_dev_to_net_device() is
+ * dropped before master is used below - confirm this is
+ * intentional.
+ */
+ dev_put(master);
- if (!ds->ops->devlink_param_set)
- return -EOPNOTSUPP;
+ return dsa_port_parse_cpu(dp, master, NULL);
+ }
+
+ if (!strcmp(name, "dsa"))
+ return dsa_port_parse_dsa(dp);
- return ds->ops->devlink_param_set(ds, id, ctx);
+ return dsa_port_parse_user(dp, name);
}
-EXPORT_SYMBOL_GPL(dsa_devlink_param_set);
-int dsa_devlink_params_register(struct dsa_switch *ds,
- const struct devlink_param *params,
- size_t params_count)
+/* Configure the ports of @ds from platform data @cd. Entries without a name
+ * are skipped.
+ *
+ * Returns 0 on success, the first error from dsa_port_parse(), or -EINVAL
+ * when none of the DSA_MAX_PORTS entries has a name.
+ */
+static int dsa_switch_parse_ports(struct dsa_switch *ds,
+ struct dsa_chip_data *cd)
{
- return devlink_params_register(ds->devlink, params, params_count);
+ bool valid_name_found = false;
+ struct dsa_port *dp;
+ struct device *dev;
+ const char *name;
+ unsigned int i;
+ int err;
+
+ for (i = 0; i < DSA_MAX_PORTS; i++) {
+ name = cd->port_names[i];
+ dev = cd->netdev[i];
+ dp = dsa_to_port(ds, i);
+
+ if (!name)
+ continue;
+
+ err = dsa_port_parse(dp, name, dev);
+ if (err)
+ return err;
+
+ valid_name_found = true;
+ }
+
+ /* The loop only ends with i == DSA_MAX_PORTS, so this effectively
+ * tests whether any named port was found.
+ */
+ if (!valid_name_found && i == DSA_MAX_PORTS)
+ return -EINVAL;
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(dsa_devlink_params_register);
-void dsa_devlink_params_unregister(struct dsa_switch *ds,
- const struct devlink_param *params,
- size_t params_count)
+/* Parse the platform data description @cd of a switch: attach it to tree 0
+ * as switch 0, allocate its ports and configure them.
+ *
+ * Returns 0 on success, -ENOMEM when the tree cannot be obtained, or the
+ * first error from the port helpers.
+ */
+static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd)
{
- devlink_params_unregister(ds->devlink, params, params_count);
+ int err;
+
+ ds->cd = cd;
+
+ /* We don't support interconnected switches nor multiple trees via
+ * platform data, so this is the unique switch of the tree.
+ */
+ ds->index = 0;
+ ds->dst = dsa_tree_touch(0);
+ if (!ds->dst)
+ return -ENOMEM;
+
+ err = dsa_switch_touch_ports(ds);
+ if (err)
+ return err;
+
+ return dsa_switch_parse_ports(ds, cd);
}
-EXPORT_SYMBOL_GPL(dsa_devlink_params_unregister);
-int dsa_devlink_resource_register(struct dsa_switch *ds,
- const char *resource_name,
- u64 resource_size,
- u64 resource_id,
- u64 parent_resource_id,
- const struct devlink_resource_size_params *size_params)
+/* Unlink and free every port of @ds. Warns when a port still has entries on
+ * its fdbs, mdbs or vlans list.
+ */
+static void dsa_switch_release_ports(struct dsa_switch *ds)
{
- return devlink_resource_register(ds->devlink, resource_name,
- resource_size, resource_id,
- parent_resource_id,
- size_params);
+ struct dsa_port *dp, *next;
+
+ /* Ports are freed while iterating, hence the _safe variant */
+ dsa_switch_for_each_port_safe(dp, next, ds) {
+ WARN_ON(!list_empty(&dp->fdbs));
+ WARN_ON(!list_empty(&dp->mdbs));
+ WARN_ON(!list_empty(&dp->vlans));
+ list_del(&dp->list);
+ kfree(dp);
+ }
}
-EXPORT_SYMBOL_GPL(dsa_devlink_resource_register);
-void dsa_devlink_resources_unregister(struct dsa_switch *ds)
+/* Parse the description of @ds (device tree when available, platform data
+ * otherwise) and try to set up the tree it belongs to.
+ *
+ * Returns 0 on success, -ENODEV when the switch has no device or no
+ * description, -EINVAL when it has no ports, or the error from parsing or
+ * from dsa_tree_setup(). The switch's ports are released on failure.
+ */
+static int dsa_switch_probe(struct dsa_switch *ds)
{
- devlink_resources_unregister(ds->devlink);
+ struct dsa_switch_tree *dst;
+ struct dsa_chip_data *pdata;
+ struct device_node *np;
+ int err;
+
+ if (!ds->dev)
+ return -ENODEV;
+
+ pdata = ds->dev->platform_data;
+ np = ds->dev->of_node;
+
+ if (!ds->num_ports)
+ return -EINVAL;
+
+ /* The device tree description takes precedence over platform data */
+ if (np) {
+ err = dsa_switch_parse_of(ds, np);
+ if (err)
+ dsa_switch_release_ports(ds);
+ } else if (pdata) {
+ err = dsa_switch_parse(ds, pdata);
+ if (err)
+ dsa_switch_release_ports(ds);
+ } else {
+ err = -ENODEV;
+ }
+
+ if (err)
+ return err;
+
+ /* This reference is dropped again right below when setup fails */
+ dst = ds->dst;
+ dsa_tree_get(dst);
+ err = dsa_tree_setup(dst);
+ if (err) {
+ dsa_switch_release_ports(ds);
+ dsa_tree_put(dst);
+ }
+
+ return err;
}
-EXPORT_SYMBOL_GPL(dsa_devlink_resources_unregister);
-void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds,
- u64 resource_id,
- devlink_resource_occ_get_t *occ_get,
- void *occ_get_priv)
+/* Register a switch with DSA: probe it under dsa2_mutex and return the
+ * result of dsa_switch_probe().
+ */
+int dsa_register_switch(struct dsa_switch *ds)
{
- return devlink_resource_occ_get_register(ds->devlink, resource_id,
- occ_get, occ_get_priv);
+ int err;
+
+ mutex_lock(&dsa2_mutex);
+ err = dsa_switch_probe(ds);
+ /* NOTE(review): presumably drops the reference obtained when the
+ * tree was looked up during parsing; ds->dst may still be unset when
+ * the probe failed early - confirm dsa_tree_put() tolerates NULL.
+ */
+ dsa_tree_put(ds->dst);
+ mutex_unlock(&dsa2_mutex);
+
+ return err;
}
-EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_register);
+EXPORT_SYMBOL_GPL(dsa_register_switch);
-void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds,
- u64 resource_id)
+/* Tear down the tree @ds belongs to, release the switch's ports and drop a
+ * reference on the tree.
+ */
+static void dsa_switch_remove(struct dsa_switch *ds)
{
- devlink_resource_occ_get_unregister(ds->devlink, resource_id);
+ struct dsa_switch_tree *dst = ds->dst;
+
+ dsa_tree_teardown(dst);
+ dsa_switch_release_ports(ds);
+ dsa_tree_put(dst);
}
-EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister);
-struct devlink_region *
-dsa_devlink_region_create(struct dsa_switch *ds,
- const struct devlink_region_ops *ops,
- u32 region_max_snapshots, u64 region_size)
+/* Unregister a switch from DSA: runs dsa_switch_remove() under dsa2_mutex. */
+void dsa_unregister_switch(struct dsa_switch *ds)
{
- return devlink_region_create(ds->devlink, ops, region_max_snapshots,
- region_size);
+ mutex_lock(&dsa2_mutex);
+ dsa_switch_remove(ds);
+ mutex_unlock(&dsa2_mutex);
}
-EXPORT_SYMBOL_GPL(dsa_devlink_region_create);
+EXPORT_SYMBOL_GPL(dsa_unregister_switch);
-struct devlink_region *
-dsa_devlink_port_region_create(struct dsa_switch *ds,
- int port,
- const struct devlink_port_region_ops *ops,
- u32 region_max_snapshots, u64 region_size)
+/* If the DSA master chooses to unregister its net_device on .shutdown, DSA is
+ * blocking that operation from completion, due to the dev_hold taken inside
+ * netdev_upper_dev_link. Unlink the DSA slave interfaces from being uppers of
+ * the DSA master, so that the system can reboot successfully.
+ *
+ * Does nothing when the switch is not set up. Runs under dsa2_mutex and the
+ * rtnl lock.
+ */
+void dsa_switch_shutdown(struct dsa_switch *ds)
{
- struct dsa_port *dp = dsa_to_port(ds, port);
+ struct net_device *master, *slave_dev;
+ struct dsa_port *dp;
+
+ mutex_lock(&dsa2_mutex);
+
+ if (!ds->setup)
+ goto out;
+
+ rtnl_lock();
+
+ /* Detach every user port's slave interface from its master */
+ dsa_switch_for_each_user_port(dp, ds) {
+ master = dsa_port_to_master(dp);
+ slave_dev = dp->slave;
+
+ netdev_upper_dev_unlink(master, slave_dev);
+ }
+
+ /* Disconnect from further netdevice notifiers on the master,
+ * since netdev_uses_dsa() will now return false.
+ */
+ dsa_switch_for_each_cpu_port(dp, ds)
+ dp->master->dsa_ptr = NULL;
+
+ rtnl_unlock();
+out:
+ mutex_unlock(&dsa2_mutex);
+}
+EXPORT_SYMBOL_GPL(dsa_switch_shutdown);
+
+#ifdef CONFIG_PM_SLEEP
+/* A port counts as initialized for suspend/resume purposes only when it is a
+ * user port that has a slave attached.
+ */
+static bool dsa_port_is_initialized(const struct dsa_port *dp)
+{
+ if (dp->type != DSA_PORT_TYPE_USER)
+ return false;
+
+ return dp->slave != NULL;
+}
+
+/* Suspend a switch: first the slave interface of every initialized user
+ * port, then the switch itself through the driver's optional suspend() hook.
+ *
+ * Returns 0 on success or the first error encountered; slaves suspended
+ * before a failure are not resumed here.
+ */
+int dsa_switch_suspend(struct dsa_switch *ds)
+{
+ struct dsa_port *dp;
+ int ret = 0;
+
+ /* Suspend slave network devices */
+ dsa_switch_for_each_port(dp, ds) {
+ if (!dsa_port_is_initialized(dp))
+ continue;
+
+ ret = dsa_slave_suspend(dp->slave);
+ if (ret)
+ return ret;
+ }
+
+ if (ds->ops->suspend)
+ ret = ds->ops->suspend(ds);
- return devlink_port_region_create(&dp->devlink_port, ops,
- region_max_snapshots,
- region_size);
+ return ret;
}
-EXPORT_SYMBOL_GPL(dsa_devlink_port_region_create);
+EXPORT_SYMBOL_GPL(dsa_switch_suspend);
-void dsa_devlink_region_destroy(struct devlink_region *region)
+/* Resume a switch, in the reverse order of dsa_switch_suspend(): first the
+ * switch itself through the driver's optional resume() hook, then the slave
+ * interface of every initialized user port.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+int dsa_switch_resume(struct dsa_switch *ds)
{
- devlink_region_destroy(region);
+ struct dsa_port *dp;
+ int ret = 0;
+
+ if (ds->ops->resume)
+ ret = ds->ops->resume(ds);
+
+ if (ret)
+ return ret;
+
+ /* Resume slave network devices */
+ dsa_switch_for_each_port(dp, ds) {
+ if (!dsa_port_is_initialized(dp))
+ continue;
+
+ ret = dsa_slave_resume(dp->slave);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(dsa_devlink_region_destroy);
+EXPORT_SYMBOL_GPL(dsa_switch_resume);
+#endif
struct dsa_port *dsa_port_from_netdev(struct net_device *netdev)
{
@@ -533,9 +1717,6 @@ static int __init dsa_init_module(void)
dev_add_pack(&dsa_pack_type);
- dsa_tag_driver_register(&DSA_TAG_DRIVER_NAME(none_ops),
- THIS_MODULE);
-
rc = rtnl_link_register(&dsa_link_ops);
if (rc)
goto netlink_register_fail;
@@ -543,7 +1724,6 @@ static int __init dsa_init_module(void)
return 0;
netlink_register_fail:
- dsa_tag_driver_unregister(&DSA_TAG_DRIVER_NAME(none_ops));
dsa_slave_unregister_notifier();
dev_remove_pack(&dsa_pack_type);
register_notifier_fail:
@@ -556,7 +1736,6 @@ module_init(dsa_init_module);
static void __exit dsa_cleanup_module(void)
{
rtnl_link_unregister(&dsa_link_ops);
- dsa_tag_driver_unregister(&DSA_TAG_DRIVER_NAME(none_ops));
dsa_slave_unregister_notifier();
dev_remove_pack(&dsa_pack_type);
diff --git a/net/dsa/dsa.h b/net/dsa/dsa.h
new file mode 100644
index 000000000000..b7e17ae1094d
--- /dev/null
+++ b/net/dsa/dsa.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_H
+#define __DSA_H
+
+#include <linux/list.h>
+#include <linux/types.h>
+
+struct dsa_db;
+struct dsa_device_ops;
+struct dsa_lag;
+struct dsa_switch_tree;
+struct net_device;
+struct work_struct;
+
+extern struct list_head dsa_tree_list;
+
+bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b);
+bool dsa_schedule_work(struct work_struct *work);
+void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag);
+void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag);
+struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst,
+ const struct net_device *lag_dev);
+struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst);
+int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst,
+ const struct dsa_device_ops *tag_ops,
+ const struct dsa_device_ops *old_tag_ops);
+void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst,
+ struct net_device *master,
+ bool up);
+void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst,
+ struct net_device *master,
+ bool up);
+unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max);
+void dsa_bridge_num_put(const struct net_device *bridge_dev,
+ unsigned int bridge_num);
+struct dsa_bridge *dsa_tree_bridge_find(struct dsa_switch_tree *dst,
+ const struct net_device *br);
+
+#endif
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
deleted file mode 100644
index af0e2c0394ac..000000000000
--- a/net/dsa/dsa2.c
+++ /dev/null
@@ -1,1812 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/dsa/dsa2.c - Hardware switch handling, binding version 2
- * Copyright (c) 2008-2009 Marvell Semiconductor
- * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
- * Copyright (c) 2016 Andrew Lunn <andrew@lunn.ch>
- */
-
-#include <linux/device.h>
-#include <linux/err.h>
-#include <linux/list.h>
-#include <linux/netdevice.h>
-#include <linux/slab.h>
-#include <linux/rtnetlink.h>
-#include <linux/of.h>
-#include <linux/of_mdio.h>
-#include <linux/of_net.h>
-#include <net/devlink.h>
-#include <net/sch_generic.h>
-
-#include "dsa_priv.h"
-
-static DEFINE_MUTEX(dsa2_mutex);
-LIST_HEAD(dsa_tree_list);
-
-/* Track the bridges with forwarding offload enabled */
-static unsigned long dsa_fwd_offloading_bridges;
-
-/**
- * dsa_tree_notify - Execute code for all switches in a DSA switch tree.
- * @dst: collection of struct dsa_switch devices to notify.
- * @e: event, must be of type DSA_NOTIFIER_*
- * @v: event-specific value.
- *
- * Given a struct dsa_switch_tree, this can be used to run a function once for
- * each member DSA switch. The other alternative of traversing the tree is only
- * through its ports list, which does not uniquely list the switches.
- */
-int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v)
-{
- struct raw_notifier_head *nh = &dst->nh;
- int err;
-
- err = raw_notifier_call_chain(nh, e, v);
-
- return notifier_to_errno(err);
-}
-
-/**
- * dsa_broadcast - Notify all DSA trees in the system.
- * @e: event, must be of type DSA_NOTIFIER_*
- * @v: event-specific value.
- *
- * Can be used to notify the switching fabric of events such as cross-chip
- * bridging between disjoint trees (such as islands of tagger-compatible
- * switches bridged by an incompatible middle switch).
- *
- * WARNING: this function is not reliable during probe time, because probing
- * between trees is asynchronous and not all DSA trees might have probed.
- */
-int dsa_broadcast(unsigned long e, void *v)
-{
- struct dsa_switch_tree *dst;
- int err = 0;
-
- list_for_each_entry(dst, &dsa_tree_list, list) {
- err = dsa_tree_notify(dst, e, v);
- if (err)
- break;
- }
-
- return err;
-}
-
-/**
- * dsa_lag_map() - Map LAG structure to a linear LAG array
- * @dst: Tree in which to record the mapping.
- * @lag: LAG structure that is to be mapped to the tree's array.
- *
- * dsa_lag_id/dsa_lag_by_id can then be used to translate between the
- * two spaces. The size of the mapping space is determined by the
- * driver by setting ds->num_lag_ids. It is perfectly legal to leave
- * it unset if it is not needed, in which case these functions become
- * no-ops.
- */
-void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag)
-{
- unsigned int id;
-
- for (id = 1; id <= dst->lags_len; id++) {
- if (!dsa_lag_by_id(dst, id)) {
- dst->lags[id - 1] = lag;
- lag->id = id;
- return;
- }
- }
-
- /* No IDs left, which is OK. Some drivers do not need it. The
- * ones that do, e.g. mv88e6xxx, will discover that dsa_lag_id
- * returns an error for this device when joining the LAG. The
- * driver can then return -EOPNOTSUPP back to DSA, which will
- * fall back to a software LAG.
- */
-}
-
-/**
- * dsa_lag_unmap() - Remove a LAG ID mapping
- * @dst: Tree in which the mapping is recorded.
- * @lag: LAG structure that was mapped.
- *
- * As there may be multiple users of the mapping, it is only removed
- * if there are no other references to it.
- */
-void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag)
-{
- unsigned int id;
-
- dsa_lags_foreach_id(id, dst) {
- if (dsa_lag_by_id(dst, id) == lag) {
- dst->lags[id - 1] = NULL;
- lag->id = 0;
- break;
- }
- }
-}
-
-struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst,
- const struct net_device *lag_dev)
-{
- struct dsa_port *dp;
-
- list_for_each_entry(dp, &dst->ports, list)
- if (dsa_port_lag_dev_get(dp) == lag_dev)
- return dp->lag;
-
- return NULL;
-}
-
-struct dsa_bridge *dsa_tree_bridge_find(struct dsa_switch_tree *dst,
- const struct net_device *br)
-{
- struct dsa_port *dp;
-
- list_for_each_entry(dp, &dst->ports, list)
- if (dsa_port_bridge_dev_get(dp) == br)
- return dp->bridge;
-
- return NULL;
-}
-
-static int dsa_bridge_num_find(const struct net_device *bridge_dev)
-{
- struct dsa_switch_tree *dst;
-
- list_for_each_entry(dst, &dsa_tree_list, list) {
- struct dsa_bridge *bridge;
-
- bridge = dsa_tree_bridge_find(dst, bridge_dev);
- if (bridge)
- return bridge->num;
- }
-
- return 0;
-}
-
-unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max)
-{
- unsigned int bridge_num = dsa_bridge_num_find(bridge_dev);
-
- /* Switches without FDB isolation support don't get unique
- * bridge numbering
- */
- if (!max)
- return 0;
-
- if (!bridge_num) {
- /* First port that requests FDB isolation or TX forwarding
- * offload for this bridge
- */
- bridge_num = find_next_zero_bit(&dsa_fwd_offloading_bridges,
- DSA_MAX_NUM_OFFLOADING_BRIDGES,
- 1);
- if (bridge_num >= max)
- return 0;
-
- set_bit(bridge_num, &dsa_fwd_offloading_bridges);
- }
-
- return bridge_num;
-}
-
-void dsa_bridge_num_put(const struct net_device *bridge_dev,
- unsigned int bridge_num)
-{
- /* Since we refcount bridges, we know that when we call this function
- * it is no longer in use, so we can just go ahead and remove it from
- * the bit mask.
- */
- clear_bit(bridge_num, &dsa_fwd_offloading_bridges);
-}
-
-struct dsa_switch *dsa_switch_find(int tree_index, int sw_index)
-{
- struct dsa_switch_tree *dst;
- struct dsa_port *dp;
-
- list_for_each_entry(dst, &dsa_tree_list, list) {
- if (dst->index != tree_index)
- continue;
-
- list_for_each_entry(dp, &dst->ports, list) {
- if (dp->ds->index != sw_index)
- continue;
-
- return dp->ds;
- }
- }
-
- return NULL;
-}
-EXPORT_SYMBOL_GPL(dsa_switch_find);
-
-static struct dsa_switch_tree *dsa_tree_find(int index)
-{
- struct dsa_switch_tree *dst;
-
- list_for_each_entry(dst, &dsa_tree_list, list)
- if (dst->index == index)
- return dst;
-
- return NULL;
-}
-
-static struct dsa_switch_tree *dsa_tree_alloc(int index)
-{
- struct dsa_switch_tree *dst;
-
- dst = kzalloc(sizeof(*dst), GFP_KERNEL);
- if (!dst)
- return NULL;
-
- dst->index = index;
-
- INIT_LIST_HEAD(&dst->rtable);
-
- INIT_LIST_HEAD(&dst->ports);
-
- INIT_LIST_HEAD(&dst->list);
- list_add_tail(&dst->list, &dsa_tree_list);
-
- kref_init(&dst->refcount);
-
- return dst;
-}
-
-static void dsa_tree_free(struct dsa_switch_tree *dst)
-{
- if (dst->tag_ops)
- dsa_tag_driver_put(dst->tag_ops);
- list_del(&dst->list);
- kfree(dst);
-}
-
-static struct dsa_switch_tree *dsa_tree_get(struct dsa_switch_tree *dst)
-{
- if (dst)
- kref_get(&dst->refcount);
-
- return dst;
-}
-
-static struct dsa_switch_tree *dsa_tree_touch(int index)
-{
- struct dsa_switch_tree *dst;
-
- dst = dsa_tree_find(index);
- if (dst)
- return dsa_tree_get(dst);
- else
- return dsa_tree_alloc(index);
-}
-
-static void dsa_tree_release(struct kref *ref)
-{
- struct dsa_switch_tree *dst;
-
- dst = container_of(ref, struct dsa_switch_tree, refcount);
-
- dsa_tree_free(dst);
-}
-
-static void dsa_tree_put(struct dsa_switch_tree *dst)
-{
- if (dst)
- kref_put(&dst->refcount, dsa_tree_release);
-}
-
-static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst,
- struct device_node *dn)
-{
- struct dsa_port *dp;
-
- list_for_each_entry(dp, &dst->ports, list)
- if (dp->dn == dn)
- return dp;
-
- return NULL;
-}
-
-static struct dsa_link *dsa_link_touch(struct dsa_port *dp,
- struct dsa_port *link_dp)
-{
- struct dsa_switch *ds = dp->ds;
- struct dsa_switch_tree *dst;
- struct dsa_link *dl;
-
- dst = ds->dst;
-
- list_for_each_entry(dl, &dst->rtable, list)
- if (dl->dp == dp && dl->link_dp == link_dp)
- return dl;
-
- dl = kzalloc(sizeof(*dl), GFP_KERNEL);
- if (!dl)
- return NULL;
-
- dl->dp = dp;
- dl->link_dp = link_dp;
-
- INIT_LIST_HEAD(&dl->list);
- list_add_tail(&dl->list, &dst->rtable);
-
- return dl;
-}
-
-static bool dsa_port_setup_routing_table(struct dsa_port *dp)
-{
- struct dsa_switch *ds = dp->ds;
- struct dsa_switch_tree *dst = ds->dst;
- struct device_node *dn = dp->dn;
- struct of_phandle_iterator it;
- struct dsa_port *link_dp;
- struct dsa_link *dl;
- int err;
-
- of_for_each_phandle(&it, err, dn, "link", NULL, 0) {
- link_dp = dsa_tree_find_port_by_node(dst, it.node);
- if (!link_dp) {
- of_node_put(it.node);
- return false;
- }
-
- dl = dsa_link_touch(dp, link_dp);
- if (!dl) {
- of_node_put(it.node);
- return false;
- }
- }
-
- return true;
-}
-
-static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst)
-{
- bool complete = true;
- struct dsa_port *dp;
-
- list_for_each_entry(dp, &dst->ports, list) {
- if (dsa_port_is_dsa(dp)) {
- complete = dsa_port_setup_routing_table(dp);
- if (!complete)
- break;
- }
- }
-
- return complete;
-}
-
-static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst)
-{
- struct dsa_port *dp;
-
- list_for_each_entry(dp, &dst->ports, list)
- if (dsa_port_is_cpu(dp))
- return dp;
-
- return NULL;
-}
-
-struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst)
-{
- struct device_node *ethernet;
- struct net_device *master;
- struct dsa_port *cpu_dp;
-
- cpu_dp = dsa_tree_find_first_cpu(dst);
- ethernet = of_parse_phandle(cpu_dp->dn, "ethernet", 0);
- master = of_find_net_device_by_node(ethernet);
- of_node_put(ethernet);
-
- return master;
-}
-
-/* Assign the default CPU port (the first one in the tree) to all ports of the
- * fabric which don't already have one as part of their own switch.
- */
-static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst)
-{
- struct dsa_port *cpu_dp, *dp;
-
- cpu_dp = dsa_tree_find_first_cpu(dst);
- if (!cpu_dp) {
- pr_err("DSA: tree %d has no CPU port\n", dst->index);
- return -EINVAL;
- }
-
- list_for_each_entry(dp, &dst->ports, list) {
- if (dp->cpu_dp)
- continue;
-
- if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
- dp->cpu_dp = cpu_dp;
- }
-
- return 0;
-}
-
-/* Perform initial assignment of CPU ports to user ports and DSA links in the
- * fabric, giving preference to CPU ports local to each switch. Default to
- * using the first CPU port in the switch tree if the port does not have a CPU
- * port local to this switch.
- */
-static int dsa_tree_setup_cpu_ports(struct dsa_switch_tree *dst)
-{
- struct dsa_port *cpu_dp, *dp;
-
- list_for_each_entry(cpu_dp, &dst->ports, list) {
- if (!dsa_port_is_cpu(cpu_dp))
- continue;
-
- /* Prefer a local CPU port */
- dsa_switch_for_each_port(dp, cpu_dp->ds) {
- /* Prefer the first local CPU port found */
- if (dp->cpu_dp)
- continue;
-
- if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
- dp->cpu_dp = cpu_dp;
- }
- }
-
- return dsa_tree_setup_default_cpu(dst);
-}
-
-static void dsa_tree_teardown_cpu_ports(struct dsa_switch_tree *dst)
-{
- struct dsa_port *dp;
-
- list_for_each_entry(dp, &dst->ports, list)
- if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
- dp->cpu_dp = NULL;
-}
-
-static int dsa_port_devlink_setup(struct dsa_port *dp)
-{
- struct devlink_port *dlp = &dp->devlink_port;
- struct dsa_switch_tree *dst = dp->ds->dst;
- struct devlink_port_attrs attrs = {};
- struct devlink *dl = dp->ds->devlink;
- struct dsa_switch *ds = dp->ds;
- const unsigned char *id;
- unsigned char len;
- int err;
-
- memset(dlp, 0, sizeof(*dlp));
- devlink_port_init(dl, dlp);
-
- if (ds->ops->port_setup) {
- err = ds->ops->port_setup(ds, dp->index);
- if (err)
- return err;
- }
-
- id = (const unsigned char *)&dst->index;
- len = sizeof(dst->index);
-
- attrs.phys.port_number = dp->index;
- memcpy(attrs.switch_id.id, id, len);
- attrs.switch_id.id_len = len;
-
- switch (dp->type) {
- case DSA_PORT_TYPE_UNUSED:
- attrs.flavour = DEVLINK_PORT_FLAVOUR_UNUSED;
- break;
- case DSA_PORT_TYPE_CPU:
- attrs.flavour = DEVLINK_PORT_FLAVOUR_CPU;
- break;
- case DSA_PORT_TYPE_DSA:
- attrs.flavour = DEVLINK_PORT_FLAVOUR_DSA;
- break;
- case DSA_PORT_TYPE_USER:
- attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
- break;
- }
-
- devlink_port_attrs_set(dlp, &attrs);
- err = devlink_port_register(dl, dlp, dp->index);
- if (err) {
- if (ds->ops->port_teardown)
- ds->ops->port_teardown(ds, dp->index);
- return err;
- }
-
- return 0;
-}
-
-static void dsa_port_devlink_teardown(struct dsa_port *dp)
-{
- struct devlink_port *dlp = &dp->devlink_port;
- struct dsa_switch *ds = dp->ds;
-
- devlink_port_unregister(dlp);
-
- if (ds->ops->port_teardown)
- ds->ops->port_teardown(ds, dp->index);
-
- devlink_port_fini(dlp);
-}
-
-static int dsa_port_setup(struct dsa_port *dp)
-{
- struct devlink_port *dlp = &dp->devlink_port;
- bool dsa_port_link_registered = false;
- struct dsa_switch *ds = dp->ds;
- bool dsa_port_enabled = false;
- int err = 0;
-
- if (dp->setup)
- return 0;
-
- err = dsa_port_devlink_setup(dp);
- if (err)
- return err;
-
- switch (dp->type) {
- case DSA_PORT_TYPE_UNUSED:
- dsa_port_disable(dp);
- break;
- case DSA_PORT_TYPE_CPU:
- if (dp->dn) {
- err = dsa_shared_port_link_register_of(dp);
- if (err)
- break;
- dsa_port_link_registered = true;
- } else {
- dev_warn(ds->dev,
- "skipping link registration for CPU port %d\n",
- dp->index);
- }
-
- err = dsa_port_enable(dp, NULL);
- if (err)
- break;
- dsa_port_enabled = true;
-
- break;
- case DSA_PORT_TYPE_DSA:
- if (dp->dn) {
- err = dsa_shared_port_link_register_of(dp);
- if (err)
- break;
- dsa_port_link_registered = true;
- } else {
- dev_warn(ds->dev,
- "skipping link registration for DSA port %d\n",
- dp->index);
- }
-
- err = dsa_port_enable(dp, NULL);
- if (err)
- break;
- dsa_port_enabled = true;
-
- break;
- case DSA_PORT_TYPE_USER:
- of_get_mac_address(dp->dn, dp->mac);
- err = dsa_slave_create(dp);
- if (err)
- break;
-
- devlink_port_type_eth_set(dlp, dp->slave);
- break;
- }
-
- if (err && dsa_port_enabled)
- dsa_port_disable(dp);
- if (err && dsa_port_link_registered)
- dsa_shared_port_link_unregister_of(dp);
- if (err) {
- dsa_port_devlink_teardown(dp);
- return err;
- }
-
- dp->setup = true;
-
- return 0;
-}
-
-static void dsa_port_teardown(struct dsa_port *dp)
-{
- struct devlink_port *dlp = &dp->devlink_port;
-
- if (!dp->setup)
- return;
-
- devlink_port_type_clear(dlp);
-
- switch (dp->type) {
- case DSA_PORT_TYPE_UNUSED:
- break;
- case DSA_PORT_TYPE_CPU:
- dsa_port_disable(dp);
- if (dp->dn)
- dsa_shared_port_link_unregister_of(dp);
- break;
- case DSA_PORT_TYPE_DSA:
- dsa_port_disable(dp);
- if (dp->dn)
- dsa_shared_port_link_unregister_of(dp);
- break;
- case DSA_PORT_TYPE_USER:
- if (dp->slave) {
- dsa_slave_destroy(dp->slave);
- dp->slave = NULL;
- }
- break;
- }
-
- dsa_port_devlink_teardown(dp);
-
- dp->setup = false;
-}
-
-static int dsa_port_setup_as_unused(struct dsa_port *dp)
-{
- dp->type = DSA_PORT_TYPE_UNUSED;
- return dsa_port_setup(dp);
-}
-
-static int dsa_devlink_info_get(struct devlink *dl,
- struct devlink_info_req *req,
- struct netlink_ext_ack *extack)
-{
- struct dsa_switch *ds = dsa_devlink_to_ds(dl);
-
- if (ds->ops->devlink_info_get)
- return ds->ops->devlink_info_get(ds, req, extack);
-
- return -EOPNOTSUPP;
-}
-
-static int dsa_devlink_sb_pool_get(struct devlink *dl,
- unsigned int sb_index, u16 pool_index,
- struct devlink_sb_pool_info *pool_info)
-{
- struct dsa_switch *ds = dsa_devlink_to_ds(dl);
-
- if (!ds->ops->devlink_sb_pool_get)
- return -EOPNOTSUPP;
-
- return ds->ops->devlink_sb_pool_get(ds, sb_index, pool_index,
- pool_info);
-}
-
-static int dsa_devlink_sb_pool_set(struct devlink *dl, unsigned int sb_index,
- u16 pool_index, u32 size,
- enum devlink_sb_threshold_type threshold_type,
- struct netlink_ext_ack *extack)
-{
- struct dsa_switch *ds = dsa_devlink_to_ds(dl);
-
- if (!ds->ops->devlink_sb_pool_set)
- return -EOPNOTSUPP;
-
- return ds->ops->devlink_sb_pool_set(ds, sb_index, pool_index, size,
- threshold_type, extack);
-}
-
-static int dsa_devlink_sb_port_pool_get(struct devlink_port *dlp,
- unsigned int sb_index, u16 pool_index,
- u32 *p_threshold)
-{
- struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
- int port = dsa_devlink_port_to_port(dlp);
-
- if (!ds->ops->devlink_sb_port_pool_get)
- return -EOPNOTSUPP;
-
- return ds->ops->devlink_sb_port_pool_get(ds, port, sb_index,
- pool_index, p_threshold);
-}
-
-static int dsa_devlink_sb_port_pool_set(struct devlink_port *dlp,
- unsigned int sb_index, u16 pool_index,
- u32 threshold,
- struct netlink_ext_ack *extack)
-{
- struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
- int port = dsa_devlink_port_to_port(dlp);
-
- if (!ds->ops->devlink_sb_port_pool_set)
- return -EOPNOTSUPP;
-
- return ds->ops->devlink_sb_port_pool_set(ds, port, sb_index,
- pool_index, threshold, extack);
-}
-
-static int
-dsa_devlink_sb_tc_pool_bind_get(struct devlink_port *dlp,
- unsigned int sb_index, u16 tc_index,
- enum devlink_sb_pool_type pool_type,
- u16 *p_pool_index, u32 *p_threshold)
-{
- struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
- int port = dsa_devlink_port_to_port(dlp);
-
- if (!ds->ops->devlink_sb_tc_pool_bind_get)
- return -EOPNOTSUPP;
-
- return ds->ops->devlink_sb_tc_pool_bind_get(ds, port, sb_index,
- tc_index, pool_type,
- p_pool_index, p_threshold);
-}
-
-static int
-dsa_devlink_sb_tc_pool_bind_set(struct devlink_port *dlp,
- unsigned int sb_index, u16 tc_index,
- enum devlink_sb_pool_type pool_type,
- u16 pool_index, u32 threshold,
- struct netlink_ext_ack *extack)
-{
- struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
- int port = dsa_devlink_port_to_port(dlp);
-
- if (!ds->ops->devlink_sb_tc_pool_bind_set)
- return -EOPNOTSUPP;
-
- return ds->ops->devlink_sb_tc_pool_bind_set(ds, port, sb_index,
- tc_index, pool_type,
- pool_index, threshold,
- extack);
-}
-
-static int dsa_devlink_sb_occ_snapshot(struct devlink *dl,
- unsigned int sb_index)
-{
- struct dsa_switch *ds = dsa_devlink_to_ds(dl);
-
- if (!ds->ops->devlink_sb_occ_snapshot)
- return -EOPNOTSUPP;
-
- return ds->ops->devlink_sb_occ_snapshot(ds, sb_index);
-}
-
-static int dsa_devlink_sb_occ_max_clear(struct devlink *dl,
- unsigned int sb_index)
-{
- struct dsa_switch *ds = dsa_devlink_to_ds(dl);
-
- if (!ds->ops->devlink_sb_occ_max_clear)
- return -EOPNOTSUPP;
-
- return ds->ops->devlink_sb_occ_max_clear(ds, sb_index);
-}
-
-static int dsa_devlink_sb_occ_port_pool_get(struct devlink_port *dlp,
- unsigned int sb_index,
- u16 pool_index, u32 *p_cur,
- u32 *p_max)
-{
- struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
- int port = dsa_devlink_port_to_port(dlp);
-
- if (!ds->ops->devlink_sb_occ_port_pool_get)
- return -EOPNOTSUPP;
-
- return ds->ops->devlink_sb_occ_port_pool_get(ds, port, sb_index,
- pool_index, p_cur, p_max);
-}
-
-static int
-dsa_devlink_sb_occ_tc_port_bind_get(struct devlink_port *dlp,
- unsigned int sb_index, u16 tc_index,
- enum devlink_sb_pool_type pool_type,
- u32 *p_cur, u32 *p_max)
-{
- struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
- int port = dsa_devlink_port_to_port(dlp);
-
- if (!ds->ops->devlink_sb_occ_tc_port_bind_get)
- return -EOPNOTSUPP;
-
- return ds->ops->devlink_sb_occ_tc_port_bind_get(ds, port,
- sb_index, tc_index,
- pool_type, p_cur,
- p_max);
-}
-
-static const struct devlink_ops dsa_devlink_ops = {
- .info_get = dsa_devlink_info_get,
- .sb_pool_get = dsa_devlink_sb_pool_get,
- .sb_pool_set = dsa_devlink_sb_pool_set,
- .sb_port_pool_get = dsa_devlink_sb_port_pool_get,
- .sb_port_pool_set = dsa_devlink_sb_port_pool_set,
- .sb_tc_pool_bind_get = dsa_devlink_sb_tc_pool_bind_get,
- .sb_tc_pool_bind_set = dsa_devlink_sb_tc_pool_bind_set,
- .sb_occ_snapshot = dsa_devlink_sb_occ_snapshot,
- .sb_occ_max_clear = dsa_devlink_sb_occ_max_clear,
- .sb_occ_port_pool_get = dsa_devlink_sb_occ_port_pool_get,
- .sb_occ_tc_port_bind_get = dsa_devlink_sb_occ_tc_port_bind_get,
-};
-
-static int dsa_switch_setup_tag_protocol(struct dsa_switch *ds)
-{
- const struct dsa_device_ops *tag_ops = ds->dst->tag_ops;
- struct dsa_switch_tree *dst = ds->dst;
- int err;
-
- if (tag_ops->proto == dst->default_proto)
- goto connect;
-
- rtnl_lock();
- err = ds->ops->change_tag_protocol(ds, tag_ops->proto);
- rtnl_unlock();
- if (err) {
- dev_err(ds->dev, "Unable to use tag protocol \"%s\": %pe\n",
- tag_ops->name, ERR_PTR(err));
- return err;
- }
-
-connect:
- if (tag_ops->connect) {
- err = tag_ops->connect(ds);
- if (err)
- return err;
- }
-
- if (ds->ops->connect_tag_protocol) {
- err = ds->ops->connect_tag_protocol(ds, tag_ops->proto);
- if (err) {
- dev_err(ds->dev,
- "Unable to connect to tag protocol \"%s\": %pe\n",
- tag_ops->name, ERR_PTR(err));
- goto disconnect;
- }
- }
-
- return 0;
-
-disconnect:
- if (tag_ops->disconnect)
- tag_ops->disconnect(ds);
-
- return err;
-}
-
-static int dsa_switch_setup(struct dsa_switch *ds)
-{
- struct dsa_devlink_priv *dl_priv;
- struct device_node *dn;
- int err;
-
- if (ds->setup)
- return 0;
-
- /* Initialize ds->phys_mii_mask before registering the slave MDIO bus
- * driver and before ops->setup() has run, since the switch drivers and
- * the slave MDIO bus driver rely on these values for probing PHY
- * devices or not
- */
- ds->phys_mii_mask |= dsa_user_ports(ds);
-
- /* Add the switch to devlink before calling setup, so that setup can
- * add dpipe tables
- */
- ds->devlink =
- devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv), ds->dev);
- if (!ds->devlink)
- return -ENOMEM;
- dl_priv = devlink_priv(ds->devlink);
- dl_priv->ds = ds;
-
- err = dsa_switch_register_notifier(ds);
- if (err)
- goto devlink_free;
-
- ds->configure_vlan_while_not_filtering = true;
-
- err = ds->ops->setup(ds);
- if (err < 0)
- goto unregister_notifier;
-
- err = dsa_switch_setup_tag_protocol(ds);
- if (err)
- goto teardown;
-
- if (!ds->slave_mii_bus && ds->ops->phy_read) {
- ds->slave_mii_bus = mdiobus_alloc();
- if (!ds->slave_mii_bus) {
- err = -ENOMEM;
- goto teardown;
- }
-
- dsa_slave_mii_bus_init(ds);
-
- dn = of_get_child_by_name(ds->dev->of_node, "mdio");
-
- err = of_mdiobus_register(ds->slave_mii_bus, dn);
- of_node_put(dn);
- if (err < 0)
- goto free_slave_mii_bus;
- }
-
- ds->setup = true;
- devlink_register(ds->devlink);
- return 0;
-
-free_slave_mii_bus:
- if (ds->slave_mii_bus && ds->ops->phy_read)
- mdiobus_free(ds->slave_mii_bus);
-teardown:
- if (ds->ops->teardown)
- ds->ops->teardown(ds);
-unregister_notifier:
- dsa_switch_unregister_notifier(ds);
-devlink_free:
- devlink_free(ds->devlink);
- ds->devlink = NULL;
- return err;
-}
-
-static void dsa_switch_teardown(struct dsa_switch *ds)
-{
- if (!ds->setup)
- return;
-
- if (ds->devlink)
- devlink_unregister(ds->devlink);
-
- if (ds->slave_mii_bus && ds->ops->phy_read) {
- mdiobus_unregister(ds->slave_mii_bus);
- mdiobus_free(ds->slave_mii_bus);
- ds->slave_mii_bus = NULL;
- }
-
- if (ds->ops->teardown)
- ds->ops->teardown(ds);
-
- dsa_switch_unregister_notifier(ds);
-
- if (ds->devlink) {
- devlink_free(ds->devlink);
- ds->devlink = NULL;
- }
-
- ds->setup = false;
-}
-
-/* First tear down the non-shared, then the shared ports. This ensures that
- * all work items scheduled by our switchdev handlers for user ports have
- * completed before we destroy the refcounting kept on the shared ports.
- */
-static void dsa_tree_teardown_ports(struct dsa_switch_tree *dst)
-{
- struct dsa_port *dp;
-
- list_for_each_entry(dp, &dst->ports, list)
- if (dsa_port_is_user(dp) || dsa_port_is_unused(dp))
- dsa_port_teardown(dp);
-
- dsa_flush_workqueue();
-
- list_for_each_entry(dp, &dst->ports, list)
- if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp))
- dsa_port_teardown(dp);
-}
-
-static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst)
-{
- struct dsa_port *dp;
-
- list_for_each_entry(dp, &dst->ports, list)
- dsa_switch_teardown(dp->ds);
-}
-
-/* Bring shared ports up first, then non-shared ports */
-static int dsa_tree_setup_ports(struct dsa_switch_tree *dst)
-{
- struct dsa_port *dp;
- int err = 0;
-
- list_for_each_entry(dp, &dst->ports, list) {
- if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp)) {
- err = dsa_port_setup(dp);
- if (err)
- goto teardown;
- }
- }
-
- list_for_each_entry(dp, &dst->ports, list) {
- if (dsa_port_is_user(dp) || dsa_port_is_unused(dp)) {
- err = dsa_port_setup(dp);
- if (err) {
- err = dsa_port_setup_as_unused(dp);
- if (err)
- goto teardown;
- }
- }
- }
-
- return 0;
-
-teardown:
- dsa_tree_teardown_ports(dst);
-
- return err;
-}
-
-static int dsa_tree_setup_switches(struct dsa_switch_tree *dst)
-{
- struct dsa_port *dp;
- int err = 0;
-
- list_for_each_entry(dp, &dst->ports, list) {
- err = dsa_switch_setup(dp->ds);
- if (err) {
- dsa_tree_teardown_switches(dst);
- break;
- }
- }
-
- return err;
-}
-
-static int dsa_tree_setup_master(struct dsa_switch_tree *dst)
-{
- struct dsa_port *cpu_dp;
- int err = 0;
-
- rtnl_lock();
-
- dsa_tree_for_each_cpu_port(cpu_dp, dst) {
- struct net_device *master = cpu_dp->master;
- bool admin_up = (master->flags & IFF_UP) &&
- !qdisc_tx_is_noop(master);
-
- err = dsa_master_setup(master, cpu_dp);
- if (err)
- break;
-
- /* Replay master state event */
- dsa_tree_master_admin_state_change(dst, master, admin_up);
- dsa_tree_master_oper_state_change(dst, master,
- netif_oper_up(master));
- }
-
- rtnl_unlock();
-
- return err;
-}
-
-static void dsa_tree_teardown_master(struct dsa_switch_tree *dst)
-{
- struct dsa_port *cpu_dp;
-
- rtnl_lock();
-
- dsa_tree_for_each_cpu_port(cpu_dp, dst) {
- struct net_device *master = cpu_dp->master;
-
- /* Synthesizing an "admin down" state is sufficient for
- * the switches to get a notification if the master is
- * currently up and running.
- */
- dsa_tree_master_admin_state_change(dst, master, false);
-
- dsa_master_teardown(master);
- }
-
- rtnl_unlock();
-}
-
-static int dsa_tree_setup_lags(struct dsa_switch_tree *dst)
-{
- unsigned int len = 0;
- struct dsa_port *dp;
-
- list_for_each_entry(dp, &dst->ports, list) {
- if (dp->ds->num_lag_ids > len)
- len = dp->ds->num_lag_ids;
- }
-
- if (!len)
- return 0;
-
- dst->lags = kcalloc(len, sizeof(*dst->lags), GFP_KERNEL);
- if (!dst->lags)
- return -ENOMEM;
-
- dst->lags_len = len;
- return 0;
-}
-
-static void dsa_tree_teardown_lags(struct dsa_switch_tree *dst)
-{
- kfree(dst->lags);
-}
-
-static int dsa_tree_setup(struct dsa_switch_tree *dst)
-{
- bool complete;
- int err;
-
- if (dst->setup) {
- pr_err("DSA: tree %d already setup! Disjoint trees?\n",
- dst->index);
- return -EEXIST;
- }
-
- complete = dsa_tree_setup_routing_table(dst);
- if (!complete)
- return 0;
-
- err = dsa_tree_setup_cpu_ports(dst);
- if (err)
- return err;
-
- err = dsa_tree_setup_switches(dst);
- if (err)
- goto teardown_cpu_ports;
-
- err = dsa_tree_setup_ports(dst);
- if (err)
- goto teardown_switches;
-
- err = dsa_tree_setup_master(dst);
- if (err)
- goto teardown_ports;
-
- err = dsa_tree_setup_lags(dst);
- if (err)
- goto teardown_master;
-
- dst->setup = true;
-
- pr_info("DSA: tree %d setup\n", dst->index);
-
- return 0;
-
-teardown_master:
- dsa_tree_teardown_master(dst);
-teardown_ports:
- dsa_tree_teardown_ports(dst);
-teardown_switches:
- dsa_tree_teardown_switches(dst);
-teardown_cpu_ports:
- dsa_tree_teardown_cpu_ports(dst);
-
- return err;
-}
-
-static void dsa_tree_teardown(struct dsa_switch_tree *dst)
-{
- struct dsa_link *dl, *next;
-
- if (!dst->setup)
- return;
-
- dsa_tree_teardown_lags(dst);
-
- dsa_tree_teardown_master(dst);
-
- dsa_tree_teardown_ports(dst);
-
- dsa_tree_teardown_switches(dst);
-
- dsa_tree_teardown_cpu_ports(dst);
-
- list_for_each_entry_safe(dl, next, &dst->rtable, list) {
- list_del(&dl->list);
- kfree(dl);
- }
-
- pr_info("DSA: tree %d torn down\n", dst->index);
-
- dst->setup = false;
-}
-
-static int dsa_tree_bind_tag_proto(struct dsa_switch_tree *dst,
- const struct dsa_device_ops *tag_ops)
-{
- const struct dsa_device_ops *old_tag_ops = dst->tag_ops;
- struct dsa_notifier_tag_proto_info info;
- int err;
-
- dst->tag_ops = tag_ops;
-
- /* Notify the switches from this tree about the connection
- * to the new tagger
- */
- info.tag_ops = tag_ops;
- err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_CONNECT, &info);
- if (err && err != -EOPNOTSUPP)
- goto out_disconnect;
-
- /* Notify the old tagger about the disconnection from this tree */
- info.tag_ops = old_tag_ops;
- dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info);
-
- return 0;
-
-out_disconnect:
- info.tag_ops = tag_ops;
- dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info);
- dst->tag_ops = old_tag_ops;
-
- return err;
-}
-
-/* Since the dsa/tagging sysfs device attribute is per master, the assumption
- * is that all DSA switches within a tree share the same tagger, otherwise
- * they would have formed disjoint trees (different "dsa,member" values).
- */
-int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst,
- const struct dsa_device_ops *tag_ops,
- const struct dsa_device_ops *old_tag_ops)
-{
- struct dsa_notifier_tag_proto_info info;
- struct dsa_port *dp;
- int err = -EBUSY;
-
- if (!rtnl_trylock())
- return restart_syscall();
-
- /* At the moment we don't allow changing the tag protocol under
- * traffic. The rtnl_mutex also happens to serialize concurrent
- * attempts to change the tagging protocol. If we ever lift the IFF_UP
- * restriction, there needs to be another mutex which serializes this.
- */
- dsa_tree_for_each_user_port(dp, dst) {
- if (dsa_port_to_master(dp)->flags & IFF_UP)
- goto out_unlock;
-
- if (dp->slave->flags & IFF_UP)
- goto out_unlock;
- }
-
- /* Notify the tag protocol change */
- info.tag_ops = tag_ops;
- err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info);
- if (err)
- goto out_unwind_tagger;
-
- err = dsa_tree_bind_tag_proto(dst, tag_ops);
- if (err)
- goto out_unwind_tagger;
-
- rtnl_unlock();
-
- return 0;
-
-out_unwind_tagger:
- info.tag_ops = old_tag_ops;
- dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info);
-out_unlock:
- rtnl_unlock();
- return err;
-}
-
-static void dsa_tree_master_state_change(struct dsa_switch_tree *dst,
- struct net_device *master)
-{
- struct dsa_notifier_master_state_info info;
- struct dsa_port *cpu_dp = master->dsa_ptr;
-
- info.master = master;
- info.operational = dsa_port_master_is_operational(cpu_dp);
-
- dsa_tree_notify(dst, DSA_NOTIFIER_MASTER_STATE_CHANGE, &info);
-}
-
-void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst,
- struct net_device *master,
- bool up)
-{
- struct dsa_port *cpu_dp = master->dsa_ptr;
- bool notify = false;
-
- /* Don't keep track of admin state on LAG DSA masters,
- * but rather just of physical DSA masters
- */
- if (netif_is_lag_master(master))
- return;
-
- if ((dsa_port_master_is_operational(cpu_dp)) !=
- (up && cpu_dp->master_oper_up))
- notify = true;
-
- cpu_dp->master_admin_up = up;
-
- if (notify)
- dsa_tree_master_state_change(dst, master);
-}
-
-void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst,
- struct net_device *master,
- bool up)
-{
- struct dsa_port *cpu_dp = master->dsa_ptr;
- bool notify = false;
-
- /* Don't keep track of oper state on LAG DSA masters,
- * but rather just of physical DSA masters
- */
- if (netif_is_lag_master(master))
- return;
-
- if ((dsa_port_master_is_operational(cpu_dp)) !=
- (cpu_dp->master_admin_up && up))
- notify = true;
-
- cpu_dp->master_oper_up = up;
-
- if (notify)
- dsa_tree_master_state_change(dst, master);
-}
-
-static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index)
-{
- struct dsa_switch_tree *dst = ds->dst;
- struct dsa_port *dp;
-
- dsa_switch_for_each_port(dp, ds)
- if (dp->index == index)
- return dp;
-
- dp = kzalloc(sizeof(*dp), GFP_KERNEL);
- if (!dp)
- return NULL;
-
- dp->ds = ds;
- dp->index = index;
-
- mutex_init(&dp->addr_lists_lock);
- mutex_init(&dp->vlans_lock);
- INIT_LIST_HEAD(&dp->fdbs);
- INIT_LIST_HEAD(&dp->mdbs);
- INIT_LIST_HEAD(&dp->vlans);
- INIT_LIST_HEAD(&dp->list);
- list_add_tail(&dp->list, &dst->ports);
-
- return dp;
-}
-
-static int dsa_port_parse_user(struct dsa_port *dp, const char *name)
-{
- if (!name)
- name = "eth%d";
-
- dp->type = DSA_PORT_TYPE_USER;
- dp->name = name;
-
- return 0;
-}
-
-static int dsa_port_parse_dsa(struct dsa_port *dp)
-{
- dp->type = DSA_PORT_TYPE_DSA;
-
- return 0;
-}
-
-static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp,
- struct net_device *master)
-{
- enum dsa_tag_protocol tag_protocol = DSA_TAG_PROTO_NONE;
- struct dsa_switch *mds, *ds = dp->ds;
- unsigned int mdp_upstream;
- struct dsa_port *mdp;
-
- /* It is possible to stack DSA switches onto one another when that
- * happens the switch driver may want to know if its tagging protocol
- * is going to work in such a configuration.
- */
- if (dsa_slave_dev_check(master)) {
- mdp = dsa_slave_to_port(master);
- mds = mdp->ds;
- mdp_upstream = dsa_upstream_port(mds, mdp->index);
- tag_protocol = mds->ops->get_tag_protocol(mds, mdp_upstream,
- DSA_TAG_PROTO_NONE);
- }
-
- /* If the master device is not itself a DSA slave in a disjoint DSA
- * tree, then return immediately.
- */
- return ds->ops->get_tag_protocol(ds, dp->index, tag_protocol);
-}
-
-static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master,
- const char *user_protocol)
-{
- struct dsa_switch *ds = dp->ds;
- struct dsa_switch_tree *dst = ds->dst;
- const struct dsa_device_ops *tag_ops;
- enum dsa_tag_protocol default_proto;
-
- /* Find out which protocol the switch would prefer. */
- default_proto = dsa_get_tag_protocol(dp, master);
- if (dst->default_proto) {
- if (dst->default_proto != default_proto) {
- dev_err(ds->dev,
- "A DSA switch tree can have only one tagging protocol\n");
- return -EINVAL;
- }
- } else {
- dst->default_proto = default_proto;
- }
-
- /* See if the user wants to override that preference. */
- if (user_protocol) {
- if (!ds->ops->change_tag_protocol) {
- dev_err(ds->dev, "Tag protocol cannot be modified\n");
- return -EINVAL;
- }
-
- tag_ops = dsa_find_tagger_by_name(user_protocol);
- } else {
- tag_ops = dsa_tag_driver_get(default_proto);
- }
-
- if (IS_ERR(tag_ops)) {
- if (PTR_ERR(tag_ops) == -ENOPROTOOPT)
- return -EPROBE_DEFER;
-
- dev_warn(ds->dev, "No tagger for this switch\n");
- return PTR_ERR(tag_ops);
- }
-
- if (dst->tag_ops) {
- if (dst->tag_ops != tag_ops) {
- dev_err(ds->dev,
- "A DSA switch tree can have only one tagging protocol\n");
-
- dsa_tag_driver_put(tag_ops);
- return -EINVAL;
- }
-
- /* In the case of multiple CPU ports per switch, the tagging
- * protocol is still reference-counted only per switch tree.
- */
- dsa_tag_driver_put(tag_ops);
- } else {
- dst->tag_ops = tag_ops;
- }
-
- dp->master = master;
- dp->type = DSA_PORT_TYPE_CPU;
- dsa_port_set_tag_protocol(dp, dst->tag_ops);
- dp->dst = dst;
-
- /* At this point, the tree may be configured to use a different
- * tagger than the one chosen by the switch driver during
- * .setup, in the case when a user selects a custom protocol
- * through the DT.
- *
- * This is resolved by syncing the driver with the tree in
- * dsa_switch_setup_tag_protocol once .setup has run and the
- * driver is ready to accept calls to .change_tag_protocol. If
- * the driver does not support the custom protocol at that
- * point, the tree is wholly rejected, thereby ensuring that the
- * tree and driver are always in agreement on the protocol to
- * use.
- */
- return 0;
-}
-
-static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn)
-{
- struct device_node *ethernet = of_parse_phandle(dn, "ethernet", 0);
- const char *name = of_get_property(dn, "label", NULL);
- bool link = of_property_read_bool(dn, "link");
-
- dp->dn = dn;
-
- if (ethernet) {
- struct net_device *master;
- const char *user_protocol;
-
- master = of_find_net_device_by_node(ethernet);
- of_node_put(ethernet);
- if (!master)
- return -EPROBE_DEFER;
-
- user_protocol = of_get_property(dn, "dsa-tag-protocol", NULL);
- return dsa_port_parse_cpu(dp, master, user_protocol);
- }
-
- if (link)
- return dsa_port_parse_dsa(dp);
-
- return dsa_port_parse_user(dp, name);
-}
-
-static int dsa_switch_parse_ports_of(struct dsa_switch *ds,
- struct device_node *dn)
-{
- struct device_node *ports, *port;
- struct dsa_port *dp;
- int err = 0;
- u32 reg;
-
- ports = of_get_child_by_name(dn, "ports");
- if (!ports) {
- /* The second possibility is "ethernet-ports" */
- ports = of_get_child_by_name(dn, "ethernet-ports");
- if (!ports) {
- dev_err(ds->dev, "no ports child node found\n");
- return -EINVAL;
- }
- }
-
- for_each_available_child_of_node(ports, port) {
- err = of_property_read_u32(port, "reg", &reg);
- if (err) {
- of_node_put(port);
- goto out_put_node;
- }
-
- if (reg >= ds->num_ports) {
- dev_err(ds->dev, "port %pOF index %u exceeds num_ports (%u)\n",
- port, reg, ds->num_ports);
- of_node_put(port);
- err = -EINVAL;
- goto out_put_node;
- }
-
- dp = dsa_to_port(ds, reg);
-
- err = dsa_port_parse_of(dp, port);
- if (err) {
- of_node_put(port);
- goto out_put_node;
- }
- }
-
-out_put_node:
- of_node_put(ports);
- return err;
-}
-
-static int dsa_switch_parse_member_of(struct dsa_switch *ds,
- struct device_node *dn)
-{
- u32 m[2] = { 0, 0 };
- int sz;
-
- /* Don't error out if this optional property isn't found */
- sz = of_property_read_variable_u32_array(dn, "dsa,member", m, 2, 2);
- if (sz < 0 && sz != -EINVAL)
- return sz;
-
- ds->index = m[1];
-
- ds->dst = dsa_tree_touch(m[0]);
- if (!ds->dst)
- return -ENOMEM;
-
- if (dsa_switch_find(ds->dst->index, ds->index)) {
- dev_err(ds->dev,
- "A DSA switch with index %d already exists in tree %d\n",
- ds->index, ds->dst->index);
- return -EEXIST;
- }
-
- if (ds->dst->last_switch < ds->index)
- ds->dst->last_switch = ds->index;
-
- return 0;
-}
-
-static int dsa_switch_touch_ports(struct dsa_switch *ds)
-{
- struct dsa_port *dp;
- int port;
-
- for (port = 0; port < ds->num_ports; port++) {
- dp = dsa_port_touch(ds, port);
- if (!dp)
- return -ENOMEM;
- }
-
- return 0;
-}
-
-static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn)
-{
- int err;
-
- err = dsa_switch_parse_member_of(ds, dn);
- if (err)
- return err;
-
- err = dsa_switch_touch_ports(ds);
- if (err)
- return err;
-
- return dsa_switch_parse_ports_of(ds, dn);
-}
-
-static int dsa_port_parse(struct dsa_port *dp, const char *name,
- struct device *dev)
-{
- if (!strcmp(name, "cpu")) {
- struct net_device *master;
-
- master = dsa_dev_to_net_device(dev);
- if (!master)
- return -EPROBE_DEFER;
-
- dev_put(master);
-
- return dsa_port_parse_cpu(dp, master, NULL);
- }
-
- if (!strcmp(name, "dsa"))
- return dsa_port_parse_dsa(dp);
-
- return dsa_port_parse_user(dp, name);
-}
-
-static int dsa_switch_parse_ports(struct dsa_switch *ds,
- struct dsa_chip_data *cd)
-{
- bool valid_name_found = false;
- struct dsa_port *dp;
- struct device *dev;
- const char *name;
- unsigned int i;
- int err;
-
- for (i = 0; i < DSA_MAX_PORTS; i++) {
- name = cd->port_names[i];
- dev = cd->netdev[i];
- dp = dsa_to_port(ds, i);
-
- if (!name)
- continue;
-
- err = dsa_port_parse(dp, name, dev);
- if (err)
- return err;
-
- valid_name_found = true;
- }
-
- if (!valid_name_found && i == DSA_MAX_PORTS)
- return -EINVAL;
-
- return 0;
-}
-
-static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd)
-{
- int err;
-
- ds->cd = cd;
-
- /* We don't support interconnected switches nor multiple trees via
- * platform data, so this is the unique switch of the tree.
- */
- ds->index = 0;
- ds->dst = dsa_tree_touch(0);
- if (!ds->dst)
- return -ENOMEM;
-
- err = dsa_switch_touch_ports(ds);
- if (err)
- return err;
-
- return dsa_switch_parse_ports(ds, cd);
-}
-
-static void dsa_switch_release_ports(struct dsa_switch *ds)
-{
- struct dsa_port *dp, *next;
-
- dsa_switch_for_each_port_safe(dp, next, ds) {
- WARN_ON(!list_empty(&dp->fdbs));
- WARN_ON(!list_empty(&dp->mdbs));
- WARN_ON(!list_empty(&dp->vlans));
- list_del(&dp->list);
- kfree(dp);
- }
-}
-
-static int dsa_switch_probe(struct dsa_switch *ds)
-{
- struct dsa_switch_tree *dst;
- struct dsa_chip_data *pdata;
- struct device_node *np;
- int err;
-
- if (!ds->dev)
- return -ENODEV;
-
- pdata = ds->dev->platform_data;
- np = ds->dev->of_node;
-
- if (!ds->num_ports)
- return -EINVAL;
-
- if (np) {
- err = dsa_switch_parse_of(ds, np);
- if (err)
- dsa_switch_release_ports(ds);
- } else if (pdata) {
- err = dsa_switch_parse(ds, pdata);
- if (err)
- dsa_switch_release_ports(ds);
- } else {
- err = -ENODEV;
- }
-
- if (err)
- return err;
-
- dst = ds->dst;
- dsa_tree_get(dst);
- err = dsa_tree_setup(dst);
- if (err) {
- dsa_switch_release_ports(ds);
- dsa_tree_put(dst);
- }
-
- return err;
-}
-
-int dsa_register_switch(struct dsa_switch *ds)
-{
- int err;
-
- mutex_lock(&dsa2_mutex);
- err = dsa_switch_probe(ds);
- dsa_tree_put(ds->dst);
- mutex_unlock(&dsa2_mutex);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(dsa_register_switch);
-
-static void dsa_switch_remove(struct dsa_switch *ds)
-{
- struct dsa_switch_tree *dst = ds->dst;
-
- dsa_tree_teardown(dst);
- dsa_switch_release_ports(ds);
- dsa_tree_put(dst);
-}
-
-void dsa_unregister_switch(struct dsa_switch *ds)
-{
- mutex_lock(&dsa2_mutex);
- dsa_switch_remove(ds);
- mutex_unlock(&dsa2_mutex);
-}
-EXPORT_SYMBOL_GPL(dsa_unregister_switch);
-
-/* If the DSA master chooses to unregister its net_device on .shutdown, DSA is
- * blocking that operation from completion, due to the dev_hold taken inside
- * netdev_upper_dev_link. Unlink the DSA slave interfaces from being uppers of
- * the DSA master, so that the system can reboot successfully.
- */
-void dsa_switch_shutdown(struct dsa_switch *ds)
-{
- struct net_device *master, *slave_dev;
- struct dsa_port *dp;
-
- mutex_lock(&dsa2_mutex);
-
- if (!ds->setup)
- goto out;
-
- rtnl_lock();
-
- dsa_switch_for_each_user_port(dp, ds) {
- master = dsa_port_to_master(dp);
- slave_dev = dp->slave;
-
- netdev_upper_dev_unlink(master, slave_dev);
- }
-
- /* Disconnect from further netdevice notifiers on the master,
- * since netdev_uses_dsa() will now return false.
- */
- dsa_switch_for_each_cpu_port(dp, ds)
- dp->master->dsa_ptr = NULL;
-
- rtnl_unlock();
-out:
- mutex_unlock(&dsa2_mutex);
-}
-EXPORT_SYMBOL_GPL(dsa_switch_shutdown);
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
deleted file mode 100644
index 6e65c7ffd6f3..000000000000
--- a/net/dsa/dsa_priv.h
+++ /dev/null
@@ -1,587 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * net/dsa/dsa_priv.h - Hardware switch handling
- * Copyright (c) 2008-2009 Marvell Semiconductor
- */
-
-#ifndef __DSA_PRIV_H
-#define __DSA_PRIV_H
-
-#include <linux/if_bridge.h>
-#include <linux/if_vlan.h>
-#include <linux/phy.h>
-#include <linux/netdevice.h>
-#include <linux/netpoll.h>
-#include <net/dsa.h>
-#include <net/gro_cells.h>
-
-#define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG
-
-enum {
- DSA_NOTIFIER_AGEING_TIME,
- DSA_NOTIFIER_BRIDGE_JOIN,
- DSA_NOTIFIER_BRIDGE_LEAVE,
- DSA_NOTIFIER_FDB_ADD,
- DSA_NOTIFIER_FDB_DEL,
- DSA_NOTIFIER_HOST_FDB_ADD,
- DSA_NOTIFIER_HOST_FDB_DEL,
- DSA_NOTIFIER_LAG_FDB_ADD,
- DSA_NOTIFIER_LAG_FDB_DEL,
- DSA_NOTIFIER_LAG_CHANGE,
- DSA_NOTIFIER_LAG_JOIN,
- DSA_NOTIFIER_LAG_LEAVE,
- DSA_NOTIFIER_MDB_ADD,
- DSA_NOTIFIER_MDB_DEL,
- DSA_NOTIFIER_HOST_MDB_ADD,
- DSA_NOTIFIER_HOST_MDB_DEL,
- DSA_NOTIFIER_VLAN_ADD,
- DSA_NOTIFIER_VLAN_DEL,
- DSA_NOTIFIER_HOST_VLAN_ADD,
- DSA_NOTIFIER_HOST_VLAN_DEL,
- DSA_NOTIFIER_MTU,
- DSA_NOTIFIER_TAG_PROTO,
- DSA_NOTIFIER_TAG_PROTO_CONNECT,
- DSA_NOTIFIER_TAG_PROTO_DISCONNECT,
- DSA_NOTIFIER_TAG_8021Q_VLAN_ADD,
- DSA_NOTIFIER_TAG_8021Q_VLAN_DEL,
- DSA_NOTIFIER_MASTER_STATE_CHANGE,
-};
-
-/* DSA_NOTIFIER_AGEING_TIME */
-struct dsa_notifier_ageing_time_info {
- unsigned int ageing_time;
-};
-
-/* DSA_NOTIFIER_BRIDGE_* */
-struct dsa_notifier_bridge_info {
- const struct dsa_port *dp;
- struct dsa_bridge bridge;
- bool tx_fwd_offload;
- struct netlink_ext_ack *extack;
-};
-
-/* DSA_NOTIFIER_FDB_* */
-struct dsa_notifier_fdb_info {
- const struct dsa_port *dp;
- const unsigned char *addr;
- u16 vid;
- struct dsa_db db;
-};
-
-/* DSA_NOTIFIER_LAG_FDB_* */
-struct dsa_notifier_lag_fdb_info {
- struct dsa_lag *lag;
- const unsigned char *addr;
- u16 vid;
- struct dsa_db db;
-};
-
-/* DSA_NOTIFIER_MDB_* */
-struct dsa_notifier_mdb_info {
- const struct dsa_port *dp;
- const struct switchdev_obj_port_mdb *mdb;
- struct dsa_db db;
-};
-
-/* DSA_NOTIFIER_LAG_* */
-struct dsa_notifier_lag_info {
- const struct dsa_port *dp;
- struct dsa_lag lag;
- struct netdev_lag_upper_info *info;
- struct netlink_ext_ack *extack;
-};
-
-/* DSA_NOTIFIER_VLAN_* */
-struct dsa_notifier_vlan_info {
- const struct dsa_port *dp;
- const struct switchdev_obj_port_vlan *vlan;
- struct netlink_ext_ack *extack;
-};
-
-/* DSA_NOTIFIER_MTU */
-struct dsa_notifier_mtu_info {
- const struct dsa_port *dp;
- int mtu;
-};
-
-/* DSA_NOTIFIER_TAG_PROTO_* */
-struct dsa_notifier_tag_proto_info {
- const struct dsa_device_ops *tag_ops;
-};
-
-/* DSA_NOTIFIER_TAG_8021Q_VLAN_* */
-struct dsa_notifier_tag_8021q_vlan_info {
- const struct dsa_port *dp;
- u16 vid;
-};
-
-/* DSA_NOTIFIER_MASTER_STATE_CHANGE */
-struct dsa_notifier_master_state_info {
- const struct net_device *master;
- bool operational;
-};
-
-struct dsa_switchdev_event_work {
- struct net_device *dev;
- struct net_device *orig_dev;
- struct work_struct work;
- unsigned long event;
- /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and
- * SWITCHDEV_FDB_DEL_TO_DEVICE
- */
- unsigned char addr[ETH_ALEN];
- u16 vid;
- bool host_addr;
-};
-
-enum dsa_standalone_event {
- DSA_UC_ADD,
- DSA_UC_DEL,
- DSA_MC_ADD,
- DSA_MC_DEL,
-};
-
-struct dsa_standalone_event_work {
- struct work_struct work;
- struct net_device *dev;
- enum dsa_standalone_event event;
- unsigned char addr[ETH_ALEN];
- u16 vid;
-};
-
-struct dsa_slave_priv {
- /* Copy of CPU port xmit for faster access in slave transmit hot path */
- struct sk_buff * (*xmit)(struct sk_buff *skb,
- struct net_device *dev);
-
- struct gro_cells gcells;
-
- /* DSA port data, such as switch, port index, etc. */
- struct dsa_port *dp;
-
-#ifdef CONFIG_NET_POLL_CONTROLLER
- struct netpoll *netpoll;
-#endif
-
- /* TC context */
- struct list_head mall_tc_list;
-};
-
-/* dsa.c */
-const struct dsa_device_ops *dsa_tag_driver_get(int tag_protocol);
-void dsa_tag_driver_put(const struct dsa_device_ops *ops);
-const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf);
-
-bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b);
-
-bool dsa_schedule_work(struct work_struct *work);
-const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops);
-
-static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops)
-{
- return ops->needed_headroom + ops->needed_tailroom;
-}
-
-/* master.c */
-int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp);
-void dsa_master_teardown(struct net_device *dev);
-int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
- struct netdev_lag_upper_info *uinfo,
- struct netlink_ext_ack *extack);
-void dsa_master_lag_teardown(struct net_device *lag_dev,
- struct dsa_port *cpu_dp);
-
-static inline struct net_device *dsa_master_find_slave(struct net_device *dev,
- int device, int port)
-{
- struct dsa_port *cpu_dp = dev->dsa_ptr;
- struct dsa_switch_tree *dst = cpu_dp->dst;
- struct dsa_port *dp;
-
- list_for_each_entry(dp, &dst->ports, list)
- if (dp->ds->index == device && dp->index == port &&
- dp->type == DSA_PORT_TYPE_USER)
- return dp->slave;
-
- return NULL;
-}
-
-/* netlink.c */
-extern struct rtnl_link_ops dsa_link_ops __read_mostly;
-
-/* port.c */
-void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp,
- const struct dsa_device_ops *tag_ops);
-int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age);
-int dsa_port_set_mst_state(struct dsa_port *dp,
- const struct switchdev_mst_state *state,
- struct netlink_ext_ack *extack);
-int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy);
-int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy);
-void dsa_port_disable_rt(struct dsa_port *dp);
-void dsa_port_disable(struct dsa_port *dp);
-int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
- struct netlink_ext_ack *extack);
-void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br);
-void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br);
-int dsa_port_lag_change(struct dsa_port *dp,
- struct netdev_lag_lower_state_info *linfo);
-int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev,
- struct netdev_lag_upper_info *uinfo,
- struct netlink_ext_ack *extack);
-void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev);
-void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev);
-int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
- struct netlink_ext_ack *extack);
-bool dsa_port_skip_vlan_configuration(struct dsa_port *dp);
-int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock);
-int dsa_port_mst_enable(struct dsa_port *dp, bool on,
- struct netlink_ext_ack *extack);
-int dsa_port_vlan_msti(struct dsa_port *dp,
- const struct switchdev_vlan_msti *msti);
-int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu);
-int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
- u16 vid);
-int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
- u16 vid);
-int dsa_port_standalone_host_fdb_add(struct dsa_port *dp,
- const unsigned char *addr, u16 vid);
-int dsa_port_standalone_host_fdb_del(struct dsa_port *dp,
- const unsigned char *addr, u16 vid);
-int dsa_port_bridge_host_fdb_add(struct dsa_port *dp, const unsigned char *addr,
- u16 vid);
-int dsa_port_bridge_host_fdb_del(struct dsa_port *dp, const unsigned char *addr,
- u16 vid);
-int dsa_port_lag_fdb_add(struct dsa_port *dp, const unsigned char *addr,
- u16 vid);
-int dsa_port_lag_fdb_del(struct dsa_port *dp, const unsigned char *addr,
- u16 vid);
-int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data);
-int dsa_port_mdb_add(const struct dsa_port *dp,
- const struct switchdev_obj_port_mdb *mdb);
-int dsa_port_mdb_del(const struct dsa_port *dp,
- const struct switchdev_obj_port_mdb *mdb);
-int dsa_port_standalone_host_mdb_add(const struct dsa_port *dp,
- const struct switchdev_obj_port_mdb *mdb);
-int dsa_port_standalone_host_mdb_del(const struct dsa_port *dp,
- const struct switchdev_obj_port_mdb *mdb);
-int dsa_port_bridge_host_mdb_add(const struct dsa_port *dp,
- const struct switchdev_obj_port_mdb *mdb);
-int dsa_port_bridge_host_mdb_del(const struct dsa_port *dp,
- const struct switchdev_obj_port_mdb *mdb);
-int dsa_port_pre_bridge_flags(const struct dsa_port *dp,
- struct switchdev_brport_flags flags,
- struct netlink_ext_ack *extack);
-int dsa_port_bridge_flags(struct dsa_port *dp,
- struct switchdev_brport_flags flags,
- struct netlink_ext_ack *extack);
-int dsa_port_vlan_add(struct dsa_port *dp,
- const struct switchdev_obj_port_vlan *vlan,
- struct netlink_ext_ack *extack);
-int dsa_port_vlan_del(struct dsa_port *dp,
- const struct switchdev_obj_port_vlan *vlan);
-int dsa_port_host_vlan_add(struct dsa_port *dp,
- const struct switchdev_obj_port_vlan *vlan,
- struct netlink_ext_ack *extack);
-int dsa_port_host_vlan_del(struct dsa_port *dp,
- const struct switchdev_obj_port_vlan *vlan);
-int dsa_port_mrp_add(const struct dsa_port *dp,
- const struct switchdev_obj_mrp *mrp);
-int dsa_port_mrp_del(const struct dsa_port *dp,
- const struct switchdev_obj_mrp *mrp);
-int dsa_port_mrp_add_ring_role(const struct dsa_port *dp,
- const struct switchdev_obj_ring_role_mrp *mrp);
-int dsa_port_mrp_del_ring_role(const struct dsa_port *dp,
- const struct switchdev_obj_ring_role_mrp *mrp);
-int dsa_port_phylink_create(struct dsa_port *dp);
-void dsa_port_phylink_destroy(struct dsa_port *dp);
-int dsa_shared_port_link_register_of(struct dsa_port *dp);
-void dsa_shared_port_link_unregister_of(struct dsa_port *dp);
-int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr);
-void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr);
-int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast);
-void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast);
-void dsa_port_set_host_flood(struct dsa_port *dp, bool uc, bool mc);
-int dsa_port_change_master(struct dsa_port *dp, struct net_device *master,
- struct netlink_ext_ack *extack);
-
-/* slave.c */
-extern const struct dsa_device_ops notag_netdev_ops;
-extern struct notifier_block dsa_slave_switchdev_notifier;
-extern struct notifier_block dsa_slave_switchdev_blocking_notifier;
-
-void dsa_slave_mii_bus_init(struct dsa_switch *ds);
-int dsa_slave_create(struct dsa_port *dp);
-void dsa_slave_destroy(struct net_device *slave_dev);
-int dsa_slave_suspend(struct net_device *slave_dev);
-int dsa_slave_resume(struct net_device *slave_dev);
-int dsa_slave_register_notifier(void);
-void dsa_slave_unregister_notifier(void);
-void dsa_slave_sync_ha(struct net_device *dev);
-void dsa_slave_unsync_ha(struct net_device *dev);
-void dsa_slave_setup_tagger(struct net_device *slave);
-int dsa_slave_change_mtu(struct net_device *dev, int new_mtu);
-int dsa_slave_change_master(struct net_device *dev, struct net_device *master,
- struct netlink_ext_ack *extack);
-int dsa_slave_manage_vlan_filtering(struct net_device *dev,
- bool vlan_filtering);
-
-static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev)
-{
- struct dsa_slave_priv *p = netdev_priv(dev);
-
- return p->dp;
-}
-
-static inline struct net_device *
-dsa_slave_to_master(const struct net_device *dev)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
-
- return dsa_port_to_master(dp);
-}
-
-/* If under a bridge with vlan_filtering=0, make sure to send pvid-tagged
- * frames as untagged, since the bridge will not untag them.
- */
-static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb)
-{
- struct dsa_port *dp = dsa_slave_to_port(skb->dev);
- struct net_device *br = dsa_port_bridge_dev_get(dp);
- struct net_device *dev = skb->dev;
- struct net_device *upper_dev;
- u16 vid, pvid, proto;
- int err;
-
- if (!br || br_vlan_enabled(br))
- return skb;
-
- err = br_vlan_get_proto(br, &proto);
- if (err)
- return skb;
-
- /* Move VLAN tag from data to hwaccel */
- if (!skb_vlan_tag_present(skb) && skb->protocol == htons(proto)) {
- skb = skb_vlan_untag(skb);
- if (!skb)
- return NULL;
- }
-
- if (!skb_vlan_tag_present(skb))
- return skb;
-
- vid = skb_vlan_tag_get_id(skb);
-
- /* We already run under an RCU read-side critical section since
- * we are called from netif_receive_skb_list_internal().
- */
- err = br_vlan_get_pvid_rcu(dev, &pvid);
- if (err)
- return skb;
-
- if (vid != pvid)
- return skb;
-
- /* The sad part about attempting to untag from DSA is that we
- * don't know, unless we check, if the skb will end up in
- * the bridge's data path - br_allowed_ingress() - or not.
- * For example, there might be an 8021q upper for the
- * default_pvid of the bridge, which will steal VLAN-tagged traffic
- * from the bridge's data path. This is a configuration that DSA
- * supports because vlan_filtering is 0. In that case, we should
- * definitely keep the tag, to make sure it keeps working.
- */
- upper_dev = __vlan_find_dev_deep_rcu(br, htons(proto), vid);
- if (upper_dev)
- return skb;
-
- __vlan_hwaccel_clear_tag(skb);
-
- return skb;
-}
-
-/* For switches without hardware support for DSA tagging to be able
- * to support termination through the bridge.
- */
-static inline struct net_device *
-dsa_find_designated_bridge_port_by_vid(struct net_device *master, u16 vid)
-{
- struct dsa_port *cpu_dp = master->dsa_ptr;
- struct dsa_switch_tree *dst = cpu_dp->dst;
- struct bridge_vlan_info vinfo;
- struct net_device *slave;
- struct dsa_port *dp;
- int err;
-
- list_for_each_entry(dp, &dst->ports, list) {
- if (dp->type != DSA_PORT_TYPE_USER)
- continue;
-
- if (!dp->bridge)
- continue;
-
- if (dp->stp_state != BR_STATE_LEARNING &&
- dp->stp_state != BR_STATE_FORWARDING)
- continue;
-
- /* Since the bridge might learn this packet, keep the CPU port
- * affinity with the port that will be used for the reply on
- * xmit.
- */
- if (dp->cpu_dp != cpu_dp)
- continue;
-
- slave = dp->slave;
-
- err = br_vlan_get_info_rcu(slave, vid, &vinfo);
- if (err)
- continue;
-
- return slave;
- }
-
- return NULL;
-}
-
-/* If the ingress port offloads the bridge, we mark the frame as autonomously
- * forwarded by hardware, so the software bridge doesn't forward in twice, back
- * to us, because we already did. However, if we're in fallback mode and we do
- * software bridging, we are not offloading it, therefore the dp->bridge
- * pointer is not populated, and flooding needs to be done by software (we are
- * effectively operating in standalone ports mode).
- */
-static inline void dsa_default_offload_fwd_mark(struct sk_buff *skb)
-{
- struct dsa_port *dp = dsa_slave_to_port(skb->dev);
-
- skb->offload_fwd_mark = !!(dp->bridge);
-}
-
-/* Helper for removing DSA header tags from packets in the RX path.
- * Must not be called before skb_pull(len).
- * skb->data
- * |
- * v
- * | | | | | | | | | | | | | | | | | | |
- * +-----------------------+-----------------------+---------------+-------+
- * | Destination MAC | Source MAC | DSA header | EType |
- * +-----------------------+-----------------------+---------------+-------+
- * | |
- * <----- len -----> <----- len ----->
- * |
- * >>>>>>> v
- * >>>>>>> | | | | | | | | | | | | | | |
- * >>>>>>> +-----------------------+-----------------------+-------+
- * >>>>>>> | Destination MAC | Source MAC | EType |
- * +-----------------------+-----------------------+-------+
- * ^
- * |
- * skb->data
- */
-static inline void dsa_strip_etype_header(struct sk_buff *skb, int len)
-{
- memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - len, 2 * ETH_ALEN);
-}
-
-/* Helper for creating space for DSA header tags in TX path packets.
- * Must not be called before skb_push(len).
- *
- * Before:
- *
- * <<<<<<< | | | | | | | | | | | | | | |
- * ^ <<<<<<< +-----------------------+-----------------------+-------+
- * | <<<<<<< | Destination MAC | Source MAC | EType |
- * | +-----------------------+-----------------------+-------+
- * <----- len ----->
- * |
- * |
- * skb->data
- *
- * After:
- *
- * | | | | | | | | | | | | | | | | | | |
- * +-----------------------+-----------------------+---------------+-------+
- * | Destination MAC | Source MAC | DSA header | EType |
- * +-----------------------+-----------------------+---------------+-------+
- * ^ | |
- * | <----- len ----->
- * skb->data
- */
-static inline void dsa_alloc_etype_header(struct sk_buff *skb, int len)
-{
- memmove(skb->data, skb->data + len, 2 * ETH_ALEN);
-}
-
-/* On RX, eth_type_trans() on the DSA master pulls ETH_HLEN bytes starting from
- * skb_mac_header(skb), which leaves skb->data pointing at the first byte after
- * what the DSA master perceives as the EtherType (the beginning of the L3
- * protocol). Since DSA EtherType header taggers treat the EtherType as part of
- * the DSA tag itself, and the EtherType is 2 bytes in length, the DSA header
- * is located 2 bytes behind skb->data. Note that EtherType in this context
- * means the first 2 bytes of the DSA header, not the encapsulated EtherType
- * that will become visible after the DSA header is stripped.
- */
-static inline void *dsa_etype_header_pos_rx(struct sk_buff *skb)
-{
- return skb->data - 2;
-}
-
-/* On TX, skb->data points to skb_mac_header(skb), which means that EtherType
- * header taggers start exactly where the EtherType is (the EtherType is
- * treated as part of the DSA header).
- */
-static inline void *dsa_etype_header_pos_tx(struct sk_buff *skb)
-{
- return skb->data + 2 * ETH_ALEN;
-}
-
-/* switch.c */
-int dsa_switch_register_notifier(struct dsa_switch *ds);
-void dsa_switch_unregister_notifier(struct dsa_switch *ds);
-
-static inline bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds)
-{
- return ds->ops->port_fdb_add && ds->ops->port_fdb_del &&
- ds->fdb_isolation && !ds->vlan_filtering_is_global &&
- !ds->needs_standalone_vlan_filtering;
-}
-
-static inline bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds)
-{
- return ds->ops->port_mdb_add && ds->ops->port_mdb_del &&
- ds->fdb_isolation && !ds->vlan_filtering_is_global &&
- !ds->needs_standalone_vlan_filtering;
-}
-
-/* dsa2.c */
-void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag);
-void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag);
-struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst,
- const struct net_device *lag_dev);
-struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst);
-int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v);
-int dsa_broadcast(unsigned long e, void *v);
-int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst,
- const struct dsa_device_ops *tag_ops,
- const struct dsa_device_ops *old_tag_ops);
-void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst,
- struct net_device *master,
- bool up);
-void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst,
- struct net_device *master,
- bool up);
-unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max);
-void dsa_bridge_num_put(const struct net_device *bridge_dev,
- unsigned int bridge_num);
-struct dsa_bridge *dsa_tree_bridge_find(struct dsa_switch_tree *dst,
- const struct net_device *br);
-
-/* tag_8021q.c */
-int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds,
- struct dsa_notifier_tag_8021q_vlan_info *info);
-int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds,
- struct dsa_notifier_tag_8021q_vlan_info *info);
-
-extern struct list_head dsa_tree_list;
-
-#endif
diff --git a/net/dsa/master.c b/net/dsa/master.c
index 40367ab41cf8..26d90140d271 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -6,7 +6,15 @@
* Vivien Didelot <vivien.didelot@savoirfairelinux.com>
*/
-#include "dsa_priv.h"
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <net/dsa.h>
+
+#include "dsa.h"
+#include "master.h"
+#include "port.h"
+#include "tag.h"
static int dsa_master_get_regs_len(struct net_device *dev)
{
@@ -204,8 +212,7 @@ static int dsa_master_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
* switch in the tree that is PTP capable.
*/
list_for_each_entry(dp, &dst->ports, list)
- if (dp->ds->ops->port_hwtstamp_get ||
- dp->ds->ops->port_hwtstamp_set)
+ if (dsa_port_supports_hwtstamp(dp, ifr))
return -EBUSY;
break;
}
@@ -300,13 +307,24 @@ static ssize_t tagging_store(struct device *d, struct device_attribute *attr,
const char *buf, size_t count)
{
const struct dsa_device_ops *new_tag_ops, *old_tag_ops;
+ const char *end = strchrnul(buf, '\n'), *name;
struct net_device *dev = to_net_dev(d);
struct dsa_port *cpu_dp = dev->dsa_ptr;
+ size_t len = end - buf;
int err;
+ /* Empty string passed */
+ if (!len)
+ return -ENOPROTOOPT;
+
+ name = kstrndup(buf, len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
old_tag_ops = cpu_dp->tag_ops;
- new_tag_ops = dsa_find_tagger_by_name(buf);
- /* Bad tagger name, or module is not loaded? */
+ new_tag_ops = dsa_tag_driver_get_by_name(name);
+ kfree(name);
+ /* Bad tagger name? */
if (IS_ERR(new_tag_ops))
return PTR_ERR(new_tag_ops);
diff --git a/net/dsa/master.h b/net/dsa/master.h
new file mode 100644
index 000000000000..3fc0e610b5b5
--- /dev/null
+++ b/net/dsa/master.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_MASTER_H
+#define __DSA_MASTER_H
+
+struct dsa_port;
+struct net_device;
+struct netdev_lag_upper_info;
+struct netlink_ext_ack;
+
+int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp);
+void dsa_master_teardown(struct net_device *dev);
+int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
+ struct netdev_lag_upper_info *uinfo,
+ struct netlink_ext_ack *extack);
+void dsa_master_lag_teardown(struct net_device *lag_dev,
+ struct dsa_port *cpu_dp);
+
+#endif
diff --git a/net/dsa/netlink.c b/net/dsa/netlink.c
index ecf9ed1de185..bd4bbaf851de 100644
--- a/net/dsa/netlink.c
+++ b/net/dsa/netlink.c
@@ -4,7 +4,8 @@
#include <linux/netdevice.h>
#include <net/rtnetlink.h>
-#include "dsa_priv.h"
+#include "netlink.h"
+#include "slave.h"
static const struct nla_policy dsa_policy[IFLA_DSA_MAX + 1] = {
[IFLA_DSA_MASTER] = { .type = NLA_U32 },
diff --git a/net/dsa/netlink.h b/net/dsa/netlink.h
new file mode 100644
index 000000000000..7eda2fa15722
--- /dev/null
+++ b/net/dsa/netlink.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_NETLINK_H
+#define __DSA_NETLINK_H
+
+extern struct rtnl_link_ops dsa_link_ops __read_mostly;
+
+#endif
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 208168276995..67ad1adec2a2 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -12,7 +12,11 @@
#include <linux/of_mdio.h>
#include <linux/of_net.h>
-#include "dsa_priv.h"
+#include "dsa.h"
+#include "port.h"
+#include "slave.h"
+#include "switch.h"
+#include "tag_8021q.h"
/**
* dsa_port_notify - Notify the switching fabric of changes to a port
@@ -110,6 +114,22 @@ static bool dsa_port_can_configure_learning(struct dsa_port *dp)
return !err;
}
+/* Tell whether the switch behind @dp offers hardware timestamping: both the
+ * "get" and "set" hwtstamp ops must be present, and calling "get" for this
+ * port must return something other than -EOPNOTSUPP.
+ */
+bool dsa_port_supports_hwtstamp(struct dsa_port *dp, struct ifreq *ifr)
+{
+	struct dsa_switch *ds = dp->ds;
+
+	if (!(ds->ops->port_hwtstamp_get && ds->ops->port_hwtstamp_set))
+		return false;
+
+	/* Probe through the "get" method so that shim implementations are
+	 * seen through. The call clobbers @ifr, but we will either return an
+	 * error, or the master will overwrite it with proper values.
+	 */
+	return ds->ops->port_hwtstamp_get(ds, dp->index, ifr) != -EOPNOTSUPP;
+}
+
int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age)
{
struct dsa_switch *ds = dp->ds;
@@ -1536,16 +1556,14 @@ static void dsa_port_phylink_validate(struct phylink_config *config,
unsigned long *supported,
struct phylink_link_state *state)
{
- struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->phylink_validate) {
- if (config->mac_capabilities)
- phylink_generic_validate(config, supported, state);
- return;
- }
-
- ds->ops->phylink_validate(ds, dp->index, supported, state);
+ /* Skip call for drivers which don't yet set mac_capabilities,
+ * since validating in that case would mean their PHY will advertise
+ * nothing. In turn, skipping validation makes them advertise
+ * everything that the PHY supports, so those drivers should be
+ * converted ASAP.
+ */
+ if (config->mac_capabilities)
+ phylink_generic_validate(config, supported, state);
}
static void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config,
diff --git a/net/dsa/port.h b/net/dsa/port.h
new file mode 100644
index 000000000000..9c218660d223
--- /dev/null
+++ b/net/dsa/port.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_PORT_H
+#define __DSA_PORT_H
+
+#include <linux/types.h>
+#include <net/dsa.h>
+
+struct ifreq;
+struct netdev_lag_lower_state_info;
+struct netdev_lag_upper_info;
+struct netlink_ext_ack;
+struct switchdev_mst_state;
+struct switchdev_obj_port_mdb;
+struct switchdev_vlan_msti;
+struct phy_device;
+
+bool dsa_port_supports_hwtstamp(struct dsa_port *dp, struct ifreq *ifr);
+void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp,
+ const struct dsa_device_ops *tag_ops);
+int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age);
+int dsa_port_set_mst_state(struct dsa_port *dp,
+ const struct switchdev_mst_state *state,
+ struct netlink_ext_ack *extack);
+int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy);
+int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy);
+void dsa_port_disable_rt(struct dsa_port *dp);
+void dsa_port_disable(struct dsa_port *dp);
+int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
+ struct netlink_ext_ack *extack);
+void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br);
+void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br);
+int dsa_port_lag_change(struct dsa_port *dp,
+ struct netdev_lag_lower_state_info *linfo);
+int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev,
+ struct netdev_lag_upper_info *uinfo,
+ struct netlink_ext_ack *extack);
+void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev);
+void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev);
+int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
+ struct netlink_ext_ack *extack);
+bool dsa_port_skip_vlan_configuration(struct dsa_port *dp);
+int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock);
+int dsa_port_mst_enable(struct dsa_port *dp, bool on,
+ struct netlink_ext_ack *extack);
+int dsa_port_vlan_msti(struct dsa_port *dp,
+ const struct switchdev_vlan_msti *msti);
+int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu);
+int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_standalone_host_fdb_add(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid);
+int dsa_port_standalone_host_fdb_del(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid);
+int dsa_port_bridge_host_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_bridge_host_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_lag_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_lag_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data);
+int dsa_port_mdb_add(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_mdb_del(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_standalone_host_mdb_add(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_standalone_host_mdb_del(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_bridge_host_mdb_add(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_bridge_host_mdb_del(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_pre_bridge_flags(const struct dsa_port *dp,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack);
+int dsa_port_bridge_flags(struct dsa_port *dp,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack);
+int dsa_port_vlan_add(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ struct netlink_ext_ack *extack);
+int dsa_port_vlan_del(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan);
+int dsa_port_host_vlan_add(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ struct netlink_ext_ack *extack);
+int dsa_port_host_vlan_del(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan);
+int dsa_port_mrp_add(const struct dsa_port *dp,
+ const struct switchdev_obj_mrp *mrp);
+int dsa_port_mrp_del(const struct dsa_port *dp,
+ const struct switchdev_obj_mrp *mrp);
+int dsa_port_mrp_add_ring_role(const struct dsa_port *dp,
+ const struct switchdev_obj_ring_role_mrp *mrp);
+int dsa_port_mrp_del_ring_role(const struct dsa_port *dp,
+ const struct switchdev_obj_ring_role_mrp *mrp);
+int dsa_port_phylink_create(struct dsa_port *dp);
+void dsa_port_phylink_destroy(struct dsa_port *dp);
+int dsa_shared_port_link_register_of(struct dsa_port *dp);
+void dsa_shared_port_link_unregister_of(struct dsa_port *dp);
+int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr);
+void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr);
+int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast);
+void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast);
+void dsa_port_set_host_flood(struct dsa_port *dp, bool uc, bool mc);
+int dsa_port_change_master(struct dsa_port *dp, struct net_device *master,
+ struct netlink_ext_ack *extack);
+
+#endif
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 1a59918d3b30..aab79c355224 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -22,7 +22,54 @@
#include <net/dcbnl.h>
#include <linux/netpoll.h>
-#include "dsa_priv.h"
+#include "dsa.h"
+#include "port.h"
+#include "master.h"
+#include "netlink.h"
+#include "slave.h"
+#include "tag.h"
+
+struct dsa_switchdev_event_work {
+ struct net_device *dev;
+ struct net_device *orig_dev;
+ struct work_struct work;
+ unsigned long event;
+ /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and
+ * SWITCHDEV_FDB_DEL_TO_DEVICE
+ */
+ unsigned char addr[ETH_ALEN];
+ u16 vid;
+ bool host_addr;
+};
+
+enum dsa_standalone_event {
+ DSA_UC_ADD,
+ DSA_UC_DEL,
+ DSA_MC_ADD,
+ DSA_MC_DEL,
+};
+
+struct dsa_standalone_event_work {
+ struct work_struct work;
+ struct net_device *dev;
+ enum dsa_standalone_event event;
+ unsigned char addr[ETH_ALEN];
+ u16 vid;
+};
+
+/* Unicast filtering needs both FDB ops, FDB isolation, and neither global
+ * VLAN filtering nor standalone VLAN filtering on the switch.
+ */
+static bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds)
+{
+	if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del)
+		return false;
+
+	if (!ds->fdb_isolation)
+		return false;
+
+	return !(ds->vlan_filtering_is_global ||
+		 ds->needs_standalone_vlan_filtering);
+}
+
+/* Multicast filtering needs both MDB ops, FDB isolation, and neither global
+ * VLAN filtering nor standalone VLAN filtering on the switch.
+ */
+static bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds)
+{
+	if (!ds->ops->port_mdb_add || !ds->ops->port_mdb_del)
+		return false;
+
+	if (!ds->fdb_isolation)
+		return false;
+
+	return !(ds->vlan_filtering_is_global ||
+		 ds->needs_standalone_vlan_filtering);
+}
static void dsa_slave_standalone_event_work(struct work_struct *work)
{
@@ -976,12 +1023,12 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev,
s = per_cpu_ptr(dev->tstats, i);
do {
- start = u64_stats_fetch_begin_irq(&s->syncp);
+ start = u64_stats_fetch_begin(&s->syncp);
tx_packets = u64_stats_read(&s->tx_packets);
tx_bytes = u64_stats_read(&s->tx_bytes);
rx_packets = u64_stats_read(&s->rx_packets);
rx_bytes = u64_stats_read(&s->rx_bytes);
- } while (u64_stats_fetch_retry_irq(&s->syncp, start));
+ } while (u64_stats_fetch_retry(&s->syncp, start));
data[0] += tx_packets;
data[1] += tx_bytes;
data[2] += rx_packets;
@@ -2165,13 +2212,6 @@ static const struct dcbnl_rtnl_ops __maybe_unused dsa_slave_dcbnl_ops = {
.ieee_delapp = dsa_slave_dcbnl_ieee_delapp,
};
-static struct devlink_port *dsa_slave_get_devlink_port(struct net_device *dev)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
-
- return &dp->devlink_port;
-}
-
static void dsa_slave_get_stats64(struct net_device *dev,
struct rtnl_link_stats64 *s)
{
@@ -2219,7 +2259,6 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_get_stats64 = dsa_slave_get_stats64,
.ndo_vlan_rx_add_vid = dsa_slave_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = dsa_slave_vlan_rx_kill_vid,
- .ndo_get_devlink_port = dsa_slave_get_devlink_port,
.ndo_change_mtu = dsa_slave_change_mtu,
.ndo_fill_forward_path = dsa_slave_fill_forward_path,
};
@@ -2374,16 +2413,25 @@ int dsa_slave_create(struct dsa_port *port)
{
struct net_device *master = dsa_port_to_master(port);
struct dsa_switch *ds = port->ds;
- const char *name = port->name;
struct net_device *slave_dev;
struct dsa_slave_priv *p;
+ const char *name;
+ int assign_type;
int ret;
if (!ds->num_tx_queues)
ds->num_tx_queues = 1;
+ if (port->name) {
+ name = port->name;
+ assign_type = NET_NAME_PREDICTABLE;
+ } else {
+ name = "eth%d";
+ assign_type = NET_NAME_ENUM;
+ }
+
slave_dev = alloc_netdev_mqs(sizeof(struct dsa_slave_priv), name,
- NET_NAME_UNKNOWN, ether_setup,
+ assign_type, ether_setup,
ds->num_tx_queues, 1);
if (slave_dev == NULL)
return -ENOMEM;
@@ -2406,6 +2454,7 @@ int dsa_slave_create(struct dsa_port *port)
SET_NETDEV_DEVTYPE(slave_dev, &dsa_type);
SET_NETDEV_DEV(slave_dev, port->ds->dev);
+ SET_NETDEV_DEVLINK_PORT(slave_dev, &port->devlink_port);
slave_dev->dev.of_node = port->dn;
slave_dev->vlan_features = master->vlan_features;
@@ -3145,7 +3194,7 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb,
case NETDEV_CHANGELOWERSTATE: {
struct netdev_notifier_changelowerstate_info *info = ptr;
struct dsa_port *dp;
- int err;
+ int err = 0;
if (dsa_slave_dev_check(dev)) {
dp = dsa_slave_to_port(dev);
diff --git a/net/dsa/slave.h b/net/dsa/slave.h
new file mode 100644
index 000000000000..d0abe609e00d
--- /dev/null
+++ b/net/dsa/slave.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_SLAVE_H
+#define __DSA_SLAVE_H
+
+#include <linux/if_bridge.h>
+#include <linux/if_vlan.h>
+#include <linux/list.h>
+#include <linux/netpoll.h>
+#include <linux/types.h>
+#include <net/dsa.h>
+#include <net/gro_cells.h>
+
+struct net_device;
+struct netlink_ext_ack;
+
+extern struct notifier_block dsa_slave_switchdev_notifier;
+extern struct notifier_block dsa_slave_switchdev_blocking_notifier;
+
+struct dsa_slave_priv {
+ /* Copy of CPU port xmit for faster access in slave transmit hot path */
+ struct sk_buff * (*xmit)(struct sk_buff *skb,
+ struct net_device *dev);
+
+ struct gro_cells gcells;
+
+ /* DSA port data, such as switch, port index, etc. */
+ struct dsa_port *dp;
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ struct netpoll *netpoll;
+#endif
+
+ /* TC context */
+ struct list_head mall_tc_list;
+};
+
+void dsa_slave_mii_bus_init(struct dsa_switch *ds);
+int dsa_slave_create(struct dsa_port *dp);
+void dsa_slave_destroy(struct net_device *slave_dev);
+int dsa_slave_suspend(struct net_device *slave_dev);
+int dsa_slave_resume(struct net_device *slave_dev);
+int dsa_slave_register_notifier(void);
+void dsa_slave_unregister_notifier(void);
+void dsa_slave_sync_ha(struct net_device *dev);
+void dsa_slave_unsync_ha(struct net_device *dev);
+void dsa_slave_setup_tagger(struct net_device *slave);
+int dsa_slave_change_mtu(struct net_device *dev, int new_mtu);
+int dsa_slave_change_master(struct net_device *dev, struct net_device *master,
+ struct netlink_ext_ack *extack);
+int dsa_slave_manage_vlan_filtering(struct net_device *dev,
+ bool vlan_filtering);
+
+/* Fetch the DSA port stored in a slave net_device's private data. */
+static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev)
+{
+	const struct dsa_slave_priv *priv = netdev_priv(dev);
+
+	return priv->dp;
+}
+
+/* Resolve a slave net_device to its port, then to that port's master. */
+static inline struct net_device *
+dsa_slave_to_master(const struct net_device *dev)
+{
+	return dsa_port_to_master(dsa_slave_to_port(dev));
+}
+
+#endif
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index ce56acdba203..d5bc4bb7310d 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -12,7 +12,12 @@
#include <linux/if_vlan.h>
#include <net/switchdev.h>
-#include "dsa_priv.h"
+#include "dsa.h"
+#include "netlink.h"
+#include "port.h"
+#include "slave.h"
+#include "switch.h"
+#include "tag_8021q.h"
static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds,
unsigned int ageing_time)
@@ -1013,6 +1018,52 @@ static int dsa_switch_event(struct notifier_block *nb,
return notifier_from_errno(err);
}
+/**
+ * dsa_tree_notify - Execute code for all switches in a DSA switch tree.
+ * @dst: collection of struct dsa_switch devices to notify.
+ * @e: event, must be of type DSA_NOTIFIER_*
+ * @v: event-specific value.
+ *
+ * Runs the tree's raw notifier chain, which lets a function be executed once
+ * for each member DSA switch. Walking the tree's ports list is the only other
+ * way to traverse it, and that list does not uniquely list the switches.
+ *
+ * Return: the notifier chain result converted with notifier_to_errno().
+ */
+int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v)
+{
+	return notifier_to_errno(raw_notifier_call_chain(&dst->nh, e, v));
+}
+
+/**
+ * dsa_broadcast - Notify all DSA trees in the system.
+ * @e: event, must be of type DSA_NOTIFIER_*
+ * @v: event-specific value.
+ *
+ * Can be used to notify the switching fabric of events such as cross-chip
+ * bridging between disjoint trees (such as islands of tagger-compatible
+ * switches bridged by an incompatible middle switch).
+ *
+ * WARNING: this function is not reliable during probe time, because probing
+ * between trees is asynchronous and not all DSA trees might have probed.
+ *
+ * Return: 0 when every tree on dsa_tree_list accepted the event, otherwise
+ * the first non-zero error from dsa_tree_notify(); later trees are skipped.
+ */
+int dsa_broadcast(unsigned long e, void *v)
+{
+	struct dsa_switch_tree *tree;
+	int rc;
+
+	list_for_each_entry(tree, &dsa_tree_list, list) {
+		rc = dsa_tree_notify(tree, e, v);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
int dsa_switch_register_notifier(struct dsa_switch *ds)
{
ds->nb.notifier_call = dsa_switch_event;
diff --git a/net/dsa/switch.h b/net/dsa/switch.h
new file mode 100644
index 000000000000..15e67b95eb6e
--- /dev/null
+++ b/net/dsa/switch.h
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_SWITCH_H
+#define __DSA_SWITCH_H
+
+#include <net/dsa.h>
+
+struct netlink_ext_ack;
+
+enum {
+ DSA_NOTIFIER_AGEING_TIME,
+ DSA_NOTIFIER_BRIDGE_JOIN,
+ DSA_NOTIFIER_BRIDGE_LEAVE,
+ DSA_NOTIFIER_FDB_ADD,
+ DSA_NOTIFIER_FDB_DEL,
+ DSA_NOTIFIER_HOST_FDB_ADD,
+ DSA_NOTIFIER_HOST_FDB_DEL,
+ DSA_NOTIFIER_LAG_FDB_ADD,
+ DSA_NOTIFIER_LAG_FDB_DEL,
+ DSA_NOTIFIER_LAG_CHANGE,
+ DSA_NOTIFIER_LAG_JOIN,
+ DSA_NOTIFIER_LAG_LEAVE,
+ DSA_NOTIFIER_MDB_ADD,
+ DSA_NOTIFIER_MDB_DEL,
+ DSA_NOTIFIER_HOST_MDB_ADD,
+ DSA_NOTIFIER_HOST_MDB_DEL,
+ DSA_NOTIFIER_VLAN_ADD,
+ DSA_NOTIFIER_VLAN_DEL,
+ DSA_NOTIFIER_HOST_VLAN_ADD,
+ DSA_NOTIFIER_HOST_VLAN_DEL,
+ DSA_NOTIFIER_MTU,
+ DSA_NOTIFIER_TAG_PROTO,
+ DSA_NOTIFIER_TAG_PROTO_CONNECT,
+ DSA_NOTIFIER_TAG_PROTO_DISCONNECT,
+ DSA_NOTIFIER_TAG_8021Q_VLAN_ADD,
+ DSA_NOTIFIER_TAG_8021Q_VLAN_DEL,
+ DSA_NOTIFIER_MASTER_STATE_CHANGE,
+};
+
+/* DSA_NOTIFIER_AGEING_TIME */
+struct dsa_notifier_ageing_time_info {
+ unsigned int ageing_time;
+};
+
+/* DSA_NOTIFIER_BRIDGE_* */
+struct dsa_notifier_bridge_info {
+ const struct dsa_port *dp;
+ struct dsa_bridge bridge;
+ bool tx_fwd_offload;
+ struct netlink_ext_ack *extack;
+};
+
+/* DSA_NOTIFIER_FDB_* */
+struct dsa_notifier_fdb_info {
+ const struct dsa_port *dp;
+ const unsigned char *addr;
+ u16 vid;
+ struct dsa_db db;
+};
+
+/* DSA_NOTIFIER_LAG_FDB_* */
+struct dsa_notifier_lag_fdb_info {
+ struct dsa_lag *lag;
+ const unsigned char *addr;
+ u16 vid;
+ struct dsa_db db;
+};
+
+/* DSA_NOTIFIER_MDB_* */
+struct dsa_notifier_mdb_info {
+ const struct dsa_port *dp;
+ const struct switchdev_obj_port_mdb *mdb;
+ struct dsa_db db;
+};
+
+/* DSA_NOTIFIER_LAG_* */
+struct dsa_notifier_lag_info {
+ const struct dsa_port *dp;
+ struct dsa_lag lag;
+ struct netdev_lag_upper_info *info;
+ struct netlink_ext_ack *extack;
+};
+
+/* DSA_NOTIFIER_VLAN_* */
+struct dsa_notifier_vlan_info {
+ const struct dsa_port *dp;
+ const struct switchdev_obj_port_vlan *vlan;
+ struct netlink_ext_ack *extack;
+};
+
+/* DSA_NOTIFIER_MTU */
+struct dsa_notifier_mtu_info {
+ const struct dsa_port *dp;
+ int mtu;
+};
+
+/* DSA_NOTIFIER_TAG_PROTO_* */
+struct dsa_notifier_tag_proto_info {
+ const struct dsa_device_ops *tag_ops;
+};
+
+/* DSA_NOTIFIER_TAG_8021Q_VLAN_* */
+struct dsa_notifier_tag_8021q_vlan_info {
+ const struct dsa_port *dp;
+ u16 vid;
+};
+
+/* DSA_NOTIFIER_MASTER_STATE_CHANGE */
+struct dsa_notifier_master_state_info {
+ const struct net_device *master;
+ bool operational;
+};
+
+int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v);
+int dsa_broadcast(unsigned long e, void *v);
+
+int dsa_switch_register_notifier(struct dsa_switch *ds);
+void dsa_switch_unregister_notifier(struct dsa_switch *ds);
+
+#endif
diff --git a/net/dsa/tag.c b/net/dsa/tag.c
new file mode 100644
index 000000000000..383721e167d6
--- /dev/null
+++ b/net/dsa/tag.c
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * DSA tagging protocol handling
+ *
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
+ * Copyright (c) 2016 Andrew Lunn <andrew@lunn.ch>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/ptp_classify.h>
+#include <linux/skbuff.h>
+#include <net/dsa.h>
+#include <net/dst_metadata.h>
+
+#include "slave.h"
+#include "tag.h"
+
+static LIST_HEAD(dsa_tag_drivers_list);
+static DEFINE_MUTEX(dsa_tag_drivers_lock);
+
+/* Decide whether delivery of @skb must wait for an RX timestamp.
+ *
+ * Called from dsa_switch_rcv. For now, this will only work if tagging is
+ * enabled on the switch. Normally the MAC driver would retrieve the hardware
+ * timestamp when it reads the packet out of the hardware. However in a DSA
+ * switch, the DSA driver owning the interface to which the packet is
+ * delivered is never notified unless we do so here.
+ *
+ * Returns false when there is less than ETH_HLEN of headroom, when the frame
+ * is not classified as PTP, or when the switch has no port_rxtstamp op;
+ * otherwise returns whatever port_rxtstamp reports.
+ */
+static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p,
+				       struct sk_buff *skb)
+{
+	struct dsa_port *dp = p->dp;
+	struct dsa_switch *ds = dp->ds;
+	unsigned int ptp_class;
+
+	if (skb_headroom(skb) < ETH_HLEN)
+		return false;
+
+	/* Classify with ETH_HLEN bytes pushed back, then restore the skb */
+	__skb_push(skb, ETH_HLEN);
+	ptp_class = ptp_classify_raw(skb);
+	__skb_pull(skb, ETH_HLEN);
+
+	if (ptp_class == PTP_CLASS_NONE)
+		return false;
+
+	if (unlikely(!ds->ops->port_rxtstamp))
+		return false;
+
+	return ds->ops->port_rxtstamp(ds, dp->index, skb, ptp_class);
+}
+
+/* Receive hook installed through dsa_pack_type for ETH_P_XDSA frames.
+ *
+ * Resolves the frame to a destination net_device, either from a
+ * METADATA_HW_PORT_MUX metadata dst or through the CPU port's rcv() callback,
+ * and then delivers it with netif_rx() (destination is not a DSA slave) or
+ * through the slave's GRO cells. Every path returns 0; frames that cannot be
+ * delivered are freed here.
+ */
+static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *unused)
+{
+ struct metadata_dst *md_dst = skb_metadata_dst(skb);
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ struct sk_buff *nskb = NULL;
+ struct dsa_slave_priv *p;
+
+ /* dev carries no dsa_ptr, so there is no CPU port to decode with: drop */
+ if (unlikely(!cpu_dp)) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ /* Continue with whatever skb_unshare() hands back; NULL ends processing */
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
+ return 0;
+
+ if (md_dst && md_dst->type == METADATA_HW_PORT_MUX) {
+ /* Port id comes from the metadata dst; the lookup below always
+ * passes 0 as the switch (device) index.
+ */
+ unsigned int port = md_dst->u.port_info.port_id;
+
+ skb_dst_drop(skb);
+ if (!skb_has_extensions(skb))
+ skb->slow_gro = 0;
+
+ skb->dev = dsa_master_find_slave(dev, 0, port);
+ if (likely(skb->dev)) {
+ dsa_default_offload_fwd_mark(skb);
+ nskb = skb;
+ }
+ } else {
+ /* No port-mux metadata: let the CPU port's rcv() hook decode */
+ nskb = cpu_dp->rcv(skb, dev);
+ }
+
+ /* Neither path produced a frame to deliver */
+ if (!nskb) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ skb = nskb;
+ skb_push(skb, ETH_HLEN);
+ skb->pkt_type = PACKET_HOST;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+
+ if (unlikely(!dsa_slave_dev_check(skb->dev))) {
+ /* Packet is to be injected directly on an upper
+ * device, e.g. a team/bond, so skip all DSA-port
+ * specific actions.
+ */
+ netif_rx(skb);
+ return 0;
+ }
+
+ p = netdev_priv(skb->dev);
+
+ if (unlikely(cpu_dp->ds->untag_bridge_pvid)) {
+ nskb = dsa_untag_bridge_pvid(skb);
+ /* NOTE(review): dsa_untag_bridge_pvid() returns NULL right after
+ * skb_vlan_untag() fails; confirm whether the skb has already
+ * been released at that point before it is freed again here.
+ */
+ if (!nskb) {
+ kfree_skb(skb);
+ return 0;
+ }
+ skb = nskb;
+ }
+
+ dev_sw_netstats_rx_add(skb->dev, skb->len);
+
+ /* A true return means the skb is not handed to GRO from here */
+ if (dsa_skb_defer_rx_timestamp(p, skb))
+ return 0;
+
+ gro_cells_receive(&p->gcells, skb);
+
+ return 0;
+}
+
+/* Packet type entry that routes ETH_P_XDSA frames to dsa_switch_rcv(). */
+struct packet_type dsa_pack_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_XDSA),
+ .func = dsa_switch_rcv,
+};
+
+/* Record @owner in the tag driver, then append the driver to
+ * dsa_tag_drivers_list while holding dsa_tag_drivers_lock.
+ */
+static void dsa_tag_driver_register(struct dsa_tag_driver *dsa_tag_driver,
+ struct module *owner)
+{
+ dsa_tag_driver->owner = owner;
+
+ mutex_lock(&dsa_tag_drivers_lock);
+ list_add_tail(&dsa_tag_driver->list, &dsa_tag_drivers_list);
+ mutex_unlock(&dsa_tag_drivers_lock);
+}
+
+/* Register the first @count entries of @dsa_tag_driver_array, recording
+ * @owner as the module that owns each of them.
+ */
+void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[],
+			      unsigned int count, struct module *owner)
+{
+	unsigned int i;
+
+	for (i = 0; i < count; i++)
+		dsa_tag_driver_register(dsa_tag_driver_array[i], owner);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_drivers_register);
+
+/* Unlink one tag driver from dsa_tag_drivers_list under
+ * dsa_tag_drivers_lock.
+ */
+static void dsa_tag_driver_unregister(struct dsa_tag_driver *dsa_tag_driver)
+{
+	mutex_lock(&dsa_tag_drivers_lock);
+	list_del(&dsa_tag_driver->list);
+	mutex_unlock(&dsa_tag_drivers_lock);
+}
+
+/* Unregister the first @count entries of @dsa_tag_driver_array, one
+ * dsa_tag_driver_unregister() call per entry.
+ */
+void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[],
+ unsigned int count)
+{
+ unsigned int i;
+
+ for (i = 0; i < count; i++)
+ dsa_tag_driver_unregister(dsa_tag_driver_array[i]);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_drivers_unregister);
+
+/* Return the name string stored in a tagging protocol's ops. */
+const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops)
+{
+	return ops->name;
+}
+
+/* Look up a registered tagger by @name, after asking for its module to be
+ * loaded. On success a reference on the module owning the tagger is held,
+ * so dsa_tag_driver_put must be called afterwards. Returns
+ * ERR_PTR(-ENOPROTOOPT) when no tagger matches or the module reference
+ * cannot be taken.
+ */
+const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name)
+{
+	const struct dsa_device_ops *found = ERR_PTR(-ENOPROTOOPT);
+	struct dsa_tag_driver *drv;
+
+	request_module("%s%s", DSA_TAG_DRIVER_ALIAS, name);
+
+	mutex_lock(&dsa_tag_drivers_lock);
+	list_for_each_entry(drv, &dsa_tag_drivers_list, list) {
+		if (strcmp(name, drv->ops->name) != 0)
+			continue;
+
+		/* The first name match ends the search, pinned or not */
+		if (try_module_get(drv->owner))
+			found = drv->ops;
+		break;
+	}
+	mutex_unlock(&dsa_tag_drivers_lock);
+
+	return found;
+}
+
+/* Look up a registered tagger by protocol id, after asking for its module to
+ * be loaded. On success a reference on the owning module is held. Returns
+ * ERR_PTR(-ENOPROTOOPT) when no tagger matches or the module reference
+ * cannot be taken.
+ */
+const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol)
+{
+	const struct dsa_device_ops *result = ERR_PTR(-ENOPROTOOPT);
+	struct dsa_tag_driver *drv;
+
+	request_module("%sid-%d", DSA_TAG_DRIVER_ALIAS, tag_protocol);
+
+	mutex_lock(&dsa_tag_drivers_lock);
+	list_for_each_entry(drv, &dsa_tag_drivers_list, list) {
+		if (drv->ops->proto != tag_protocol)
+			continue;
+
+		/* Only the first matching entry is considered */
+		if (try_module_get(drv->owner))
+			result = drv->ops;
+		break;
+	}
+	mutex_unlock(&dsa_tag_drivers_lock);
+
+	return result;
+}
+
+/* Drop one module reference for the tag driver whose ops pointer equals
+ * @ops. Nothing happens when no registered driver matches.
+ */
+void dsa_tag_driver_put(const struct dsa_device_ops *ops)
+{
+	struct dsa_tag_driver *drv;
+
+	mutex_lock(&dsa_tag_drivers_lock);
+	list_for_each_entry(drv, &dsa_tag_drivers_list, list) {
+		if (drv->ops != ops)
+			continue;
+
+		module_put(drv->owner);
+		break;
+	}
+	mutex_unlock(&dsa_tag_drivers_lock);
+}
diff --git a/net/dsa/tag.h b/net/dsa/tag.h
new file mode 100644
index 000000000000..7cfbca824f1c
--- /dev/null
+++ b/net/dsa/tag.h
@@ -0,0 +1,310 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_TAG_H
+#define __DSA_TAG_H
+
+#include <linux/if_vlan.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <net/dsa.h>
+
+#include "port.h"
+#include "slave.h"
+
+/* Registration record for one tagging protocol driver */
+struct dsa_tag_driver {
+ /* Tagger operations; lookups match on ops->name or ops->proto */
+ const struct dsa_device_ops *ops;
+ /* Linkage used when walking dsa_tag_drivers_list */
+ struct list_head list;
+ /* Module pinned by the get helpers and released by dsa_tag_driver_put() */
+ struct module *owner;
+};
+
+extern struct packet_type dsa_pack_type;
+
+const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol);
+const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name);
+void dsa_tag_driver_put(const struct dsa_device_ops *ops);
+const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops);
+
+/* Total number of extra bytes a tagging protocol asks for: the sum of its
+ * ->needed_headroom and ->needed_tailroom.
+ */
+static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops)
+{
+	return ops->needed_tailroom + ops->needed_headroom;
+}
+
+/* Find the net_device of the user port identified by switch index @device and
+ * port index @port, within the switch tree reached through @dev->dsa_ptr.
+ * Returns NULL when the tree has no such user port.
+ */
+static inline struct net_device *dsa_master_find_slave(struct net_device *dev,
+						       int device, int port)
+{
+	struct dsa_switch_tree *tree = dev->dsa_ptr->dst;
+	struct dsa_port *candidate;
+
+	list_for_each_entry(candidate, &tree->ports, list) {
+		if (candidate->ds->index != device)
+			continue;
+		if (candidate->index != port)
+			continue;
+		if (candidate->type != DSA_PORT_TYPE_USER)
+			continue;
+
+		return candidate->slave;
+	}
+
+	return NULL;
+}
+
+/* If under a bridge with vlan_filtering=0, make sure to send pvid-tagged
+ * frames as untagged, since the bridge will not untag them.
+ *
+ * @skb: received frame; skb->dev is the DSA user port it arrived on.
+ *
+ * Returns the skb, possibly with its hwaccel VLAN tag cleared, or NULL if
+ * skb_vlan_untag() returned NULL. The skb is returned untouched when the port
+ * is not under a bridge, the bridge is VLAN-aware, the bridge VLAN protocol
+ * or the port pvid cannot be read, the frame carries no VLAN tag, the VID
+ * differs from the pvid, or an 8021q upper of the bridge exists for that VID.
+ */
+static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb)
+{
+ struct dsa_port *dp = dsa_slave_to_port(skb->dev);
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+ struct net_device *dev = skb->dev;
+ struct net_device *upper_dev;
+ u16 vid, pvid, proto;
+ int err;
+
+ if (!br || br_vlan_enabled(br))
+ return skb;
+
+ err = br_vlan_get_proto(br, &proto);
+ if (err)
+ return skb;
+
+ /* Move VLAN tag from data to hwaccel */
+ if (!skb_vlan_tag_present(skb) && skb->protocol == htons(proto)) {
+ skb = skb_vlan_untag(skb);
+ if (!skb)
+ return NULL;
+ }
+
+ if (!skb_vlan_tag_present(skb))
+ return skb;
+
+ vid = skb_vlan_tag_get_id(skb);
+
+ /* We already run under an RCU read-side critical section since
+ * we are called from netif_receive_skb_list_internal().
+ */
+ err = br_vlan_get_pvid_rcu(dev, &pvid);
+ if (err)
+ return skb;
+
+ if (vid != pvid)
+ return skb;
+
+ /* The sad part about attempting to untag from DSA is that we
+ * don't know, unless we check, if the skb will end up in
+ * the bridge's data path - br_allowed_ingress() - or not.
+ * For example, there might be an 8021q upper for the
+ * default_pvid of the bridge, which will steal VLAN-tagged traffic
+ * from the bridge's data path. This is a configuration that DSA
+ * supports because vlan_filtering is 0. In that case, we should
+ * definitely keep the tag, to make sure it keeps working.
+ */
+ upper_dev = __vlan_find_dev_deep_rcu(br, htons(proto), vid);
+ if (upper_dev)
+ return skb;
+
+ /* pvid-tagged and no 8021q upper claims it: deliver as untagged */
+ __vlan_hwaccel_clear_tag(skb);
+
+ return skb;
+}
+
+/* For switches without hardware support for DSA tagging to be able
+ * to support termination through the bridge.
+ *
+ * Returns the net_device of the first user port in the tree of @master that
+ * has a bridge, is in the LEARNING or FORWARDING STP state, uses the same CPU
+ * port as @master, and for which br_vlan_get_info_rcu() succeeds for @vid.
+ * Returns NULL when no port qualifies.
+ */
+static inline struct net_device *
+dsa_find_designated_bridge_port_by_vid(struct net_device *master, u16 vid)
+{
+	struct dsa_port *cpu_dp = master->dsa_ptr;
+	struct dsa_switch_tree *tree = cpu_dp->dst;
+	struct bridge_vlan_info vinfo;
+	struct dsa_port *dp;
+
+	list_for_each_entry(dp, &tree->ports, list) {
+		struct net_device *slave;
+		bool stp_allows;
+
+		if (dp->type != DSA_PORT_TYPE_USER || !dp->bridge)
+			continue;
+
+		stp_allows = dp->stp_state == BR_STATE_LEARNING ||
+			     dp->stp_state == BR_STATE_FORWARDING;
+		if (!stp_allows)
+			continue;
+
+		/* Since the bridge might learn this packet, keep the CPU port
+		 * affinity with the port that will be used for the reply on
+		 * xmit.
+		 */
+		if (dp->cpu_dp != cpu_dp)
+			continue;
+
+		slave = dp->slave;
+		if (!br_vlan_get_info_rcu(slave, vid, &vinfo))
+			return slave;
+	}
+
+	return NULL;
+}
+
+/* If the ingress port offloads the bridge, we mark the frame as autonomously
+ * forwarded by hardware, so the software bridge doesn't forward it twice, back
+ * to us, because we already did. However, if we're in fallback mode and we do
+ * software bridging, we are not offloading it, therefore the dp->bridge
+ * pointer is not populated, and flooding needs to be done by software (we are
+ * effectively operating in standalone ports mode).
+ */
+static inline void dsa_default_offload_fwd_mark(struct sk_buff *skb)
+{
+	const struct dsa_port *ingress_dp = dsa_slave_to_port(skb->dev);
+
+	/* Set the mark only when the ingress port has a bridge populated */
+	skb->offload_fwd_mark = ingress_dp->bridge ? 1 : 0;
+}
+
+/* Helper for removing DSA header tags from packets in the RX path.
+ * Must not be called before skb_pull(len).
+ * skb->data
+ * |
+ * v
+ * | | | | | | | | | | | | | | | | | | |
+ * +-----------------------+-----------------------+---------------+-------+
+ * | Destination MAC | Source MAC | DSA header | EType |
+ * +-----------------------+-----------------------+---------------+-------+
+ * | |
+ * <----- len -----> <----- len ----->
+ * |
+ * >>>>>>> v
+ * >>>>>>> | | | | | | | | | | | | | | |
+ * >>>>>>> +-----------------------+-----------------------+-------+
+ * >>>>>>> | Destination MAC | Source MAC | EType |
+ * +-----------------------+-----------------------+-------+
+ * ^
+ * |
+ * skb->data
+ */
+static inline void dsa_strip_etype_header(struct sk_buff *skb, int len)
+{
+	unsigned char *new_mac_start = skb->data - ETH_HLEN;
+
+	/* Slide both MAC addresses forward by @len bytes, so that they end up
+	 * ETH_HLEN bytes before skb->data.
+	 */
+	memmove(new_mac_start, new_mac_start - len, 2 * ETH_ALEN);
+}
+
+/* Helper for creating space for DSA header tags in TX path packets.
+ * Must not be called before skb_push(len).
+ *
+ * Before:
+ *
+ * <<<<<<< | | | | | | | | | | | | | | |
+ * ^ <<<<<<< +-----------------------+-----------------------+-------+
+ * | <<<<<<< | Destination MAC | Source MAC | EType |
+ * | +-----------------------+-----------------------+-------+
+ * <----- len ----->
+ * |
+ * |
+ * skb->data
+ *
+ * After:
+ *
+ * | | | | | | | | | | | | | | | | | | |
+ * +-----------------------+-----------------------+---------------+-------+
+ * | Destination MAC | Source MAC | DSA header | EType |
+ * +-----------------------+-----------------------+---------------+-------+
+ * ^ | |
+ * | <----- len ----->
+ * skb->data
+ */
+static inline void dsa_alloc_etype_header(struct sk_buff *skb, int len)
+{
+	unsigned char *new_mac_start = skb->data;
+
+	/* Slide both MAC addresses back by @len bytes to skb->data, leaving
+	 * a @len byte gap after them.
+	 */
+	memmove(new_mac_start, new_mac_start + len, 2 * ETH_ALEN);
+}
+
+/* On RX, eth_type_trans() on the DSA master pulls ETH_HLEN bytes starting from
+ * skb_mac_header(skb), which leaves skb->data pointing at the first byte after
+ * what the DSA master perceives as the EtherType (the beginning of the L3
+ * protocol). Since DSA EtherType header taggers treat the EtherType as part of
+ * the DSA tag itself, and the EtherType is 2 bytes in length, the DSA header
+ * is located 2 bytes behind skb->data. Note that EtherType in this context
+ * means the first 2 bytes of the DSA header, not the encapsulated EtherType
+ * that will become visible after the DSA header is stripped.
+ */
+static inline void *dsa_etype_header_pos_rx(struct sk_buff *skb)
+{
+	unsigned char *after_etype = skb->data;
+
+	/* Step back over the 2-byte EtherType */
+	return after_etype - 2;
+}
+
+/* On TX, skb->data points to skb_mac_header(skb), which means that EtherType
+ * header taggers start exactly where the EtherType is (the EtherType is
+ * treated as part of the DSA header).
+ */
+static inline void *dsa_etype_header_pos_tx(struct sk_buff *skb)
+{
+	unsigned char *mac_hdr = skb->data;
+
+	/* Skip the destination and source MAC addresses */
+	return mac_hdr + 2 * ETH_ALEN;
+}
+
+/* Create 2 modaliases per tagging protocol, one to auto-load the module
+ * given the ID reported by get_tag_protocol(), and the other by name.
+ *
+ * The name alias is DSA_TAG_DRIVER_ALIAS followed by @__name. The ID alias is
+ * DSA_TAG_DRIVER_ALIAS "id-" followed by the stringified expansion of
+ * @__proto with "_VALUE" pasted on. For it to match the "%sid-%d" module
+ * request, that <__proto>_VALUE macro must expand to the numeric protocol ID.
+ */
+#define DSA_TAG_DRIVER_ALIAS "dsa_tag:"
+#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto, __name) \
+ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __name); \
+ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS "id-" \
+ __stringify(__proto##_VALUE))
+
+void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[],
+ unsigned int count,
+ struct module *owner);
+void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[],
+ unsigned int count);
+
+/* Emit module_init() and module_exit() handlers for a tag driver module. The
+ * init handler registers the @__count entries of @__dsa_tag_drivers_array
+ * with THIS_MODULE as owner and always returns 0; the exit handler
+ * unregisters the same array. The expansion ends without a semicolon, so the
+ * user of the macro supplies it.
+ */
+#define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count) \
+static int __init dsa_tag_driver_module_init(void) \
+{ \
+ dsa_tag_drivers_register(__dsa_tag_drivers_array, __count, \
+ THIS_MODULE); \
+ return 0; \
+} \
+module_init(dsa_tag_driver_module_init); \
+ \
+static void __exit dsa_tag_driver_module_exit(void) \
+{ \
+ dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count); \
+} \
+module_exit(dsa_tag_driver_module_exit)
+
+/**
+ * module_dsa_tag_drivers() - Helper macro for registering DSA tag
+ * drivers
+ * @__ops_array: Array of tag driver structures
+ *
+ * Helper macro for DSA tag drivers which do not do anything special
+ * in module init/exit. Each module may only use this macro once, and
+ * calling it replaces module_init() and module_exit().
+ */
+#define module_dsa_tag_drivers(__ops_array) \
+dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array))
+
+#define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops
+
+/* Create a static structure from which a linked list of dsa_tag
+ * drivers can be built
+ */
+#define DSA_TAG_DRIVER(__ops) \
+static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = { \
+ .ops = &__ops, \
+}
+
+/**
+ * module_dsa_tag_driver() - Helper macro for registering a single DSA tag
+ * driver
+ * @__ops: Single tag driver structure
+ *
+ * Helper macro for DSA tag drivers which do not do anything special
+ * in module init/exit. Each module may only use this macro once, and
+ * calling it replaces module_init() and module_exit().
+ */
+#define module_dsa_tag_driver(__ops) \
+DSA_TAG_DRIVER(__ops); \
+ \
+static struct dsa_tag_driver *dsa_tag_driver_array[] = { \
+ &DSA_TAG_DRIVER_NAME(__ops) \
+}; \
+module_dsa_tag_drivers(dsa_tag_driver_array)
+
+#endif
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 34e5ec5d3e23..b1263917fcb2 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -7,7 +7,10 @@
#include <linux/if_vlan.h>
#include <linux/dsa/8021q.h>
-#include "dsa_priv.h"
+#include "port.h"
+#include "switch.h"
+#include "tag.h"
+#include "tag_8021q.h"
/* Binary structure of the fake 12-bit VID field (when the TPID is
* ETH_P_DSA_8021Q):
@@ -60,6 +63,20 @@
#define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \
DSA_8021Q_PORT_MASK)
+/* One tag_8021q VLAN entry: a (port, vid) pair with a reference count.
+ * NOTE(review): presumably linked on dsa_8021q_context::vlans - confirm
+ * against the users of this structure.
+ */
+struct dsa_tag_8021q_vlan {
+ struct list_head list;
+ int port;
+ u16 vid;
+ refcount_t refcount;
+};
+
+/* tag_8021q state associated with one switch */
+struct dsa_8021q_context {
+ struct dsa_switch *ds;
+ /* List head; presumably of struct dsa_tag_8021q_vlan entries - confirm */
+ struct list_head vlans;
+ /* EtherType of RX VID, used for filtering on master interface */
+ __be16 proto;
+};
+
u16 dsa_tag_8021q_bridge_vid(unsigned int bridge_num)
{
/* The VBID value of 0 is reserved for precise TX, but it is also
diff --git a/net/dsa/tag_8021q.h b/net/dsa/tag_8021q.h
new file mode 100644
index 000000000000..b75cbaa028ef
--- /dev/null
+++ b/net/dsa/tag_8021q.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_TAG_8021Q_H
+#define __DSA_TAG_8021Q_H
+
+#include <net/dsa.h>
+
+#include "switch.h"
+
+struct sk_buff;
+struct net_device;
+
+struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
+ u16 tpid, u16 tci);
+
+void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id,
+ int *vbid);
+
+struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master,
+ int vbid);
+
+int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds,
+ struct dsa_notifier_tag_8021q_vlan_info *info);
+int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds,
+ struct dsa_notifier_tag_8021q_vlan_info *info);
+
+#endif
diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c
index 8a02ac44282f..7f3b7d730b85 100644
--- a/net/dsa/tag_ar9331.c
+++ b/net/dsa/tag_ar9331.c
@@ -7,7 +7,9 @@
#include <linux/bitfield.h>
#include <linux/etherdevice.h>
-#include "dsa_priv.h"
+#include "tag.h"
+
+#define AR9331_NAME "ar9331"
#define AR9331_HDR_LEN 2
#define AR9331_HDR_VERSION 1
@@ -80,7 +82,7 @@ static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb,
}
static const struct dsa_device_ops ar9331_netdev_ops = {
- .name = "ar9331",
+ .name = AR9331_NAME,
.proto = DSA_TAG_PROTO_AR9331,
.xmit = ar9331_tag_xmit,
.rcv = ar9331_tag_rcv,
@@ -88,5 +90,5 @@ static const struct dsa_device_ops ar9331_netdev_ops = {
};
MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_AR9331);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_AR9331, AR9331_NAME);
module_dsa_tag_driver(ar9331_netdev_ops);
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index 16889ea3e0a7..10239daa5745 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -10,7 +10,11 @@
#include <linux/list.h>
#include <linux/slab.h>
-#include "dsa_priv.h"
+#include "tag.h"
+
+#define BRCM_NAME "brcm"
+#define BRCM_LEGACY_NAME "brcm-legacy"
+#define BRCM_PREPEND_NAME "brcm-prepend"
/* Legacy Broadcom tag (6 bytes) */
#define BRCM_LEG_TAG_LEN 6
@@ -196,7 +200,7 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev)
}
static const struct dsa_device_ops brcm_netdev_ops = {
- .name = "brcm",
+ .name = BRCM_NAME,
.proto = DSA_TAG_PROTO_BRCM,
.xmit = brcm_tag_xmit,
.rcv = brcm_tag_rcv,
@@ -204,7 +208,7 @@ static const struct dsa_device_ops brcm_netdev_ops = {
};
DSA_TAG_DRIVER(brcm_netdev_ops);
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM, BRCM_NAME);
#endif
#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY)
@@ -273,7 +277,7 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb,
}
static const struct dsa_device_ops brcm_legacy_netdev_ops = {
- .name = "brcm-legacy",
+ .name = BRCM_LEGACY_NAME,
.proto = DSA_TAG_PROTO_BRCM_LEGACY,
.xmit = brcm_leg_tag_xmit,
.rcv = brcm_leg_tag_rcv,
@@ -281,7 +285,7 @@ static const struct dsa_device_ops brcm_legacy_netdev_ops = {
};
DSA_TAG_DRIVER(brcm_legacy_netdev_ops);
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY, BRCM_LEGACY_NAME);
#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY */
#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND)
@@ -300,7 +304,7 @@ static struct sk_buff *brcm_tag_rcv_prepend(struct sk_buff *skb,
}
static const struct dsa_device_ops brcm_prepend_netdev_ops = {
- .name = "brcm-prepend",
+ .name = BRCM_PREPEND_NAME,
.proto = DSA_TAG_PROTO_BRCM_PREPEND,
.xmit = brcm_tag_xmit_prepend,
.rcv = brcm_tag_rcv_prepend,
@@ -308,7 +312,7 @@ static const struct dsa_device_ops brcm_prepend_netdev_ops = {
};
DSA_TAG_DRIVER(brcm_prepend_netdev_ops);
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_PREPEND);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_PREPEND, BRCM_PREPEND_NAME);
#endif
static struct dsa_tag_driver *dsa_tag_driver_array[] = {
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index e4b6e3f2a3db..1fd7fa26db64 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -50,7 +50,10 @@
#include <linux/list.h>
#include <linux/slab.h>
-#include "dsa_priv.h"
+#include "tag.h"
+
+#define DSA_NAME "dsa"
+#define EDSA_NAME "edsa"
#define DSA_HLEN 4
@@ -339,7 +342,7 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev)
}
static const struct dsa_device_ops dsa_netdev_ops = {
- .name = "dsa",
+ .name = DSA_NAME,
.proto = DSA_TAG_PROTO_DSA,
.xmit = dsa_xmit,
.rcv = dsa_rcv,
@@ -347,7 +350,7 @@ static const struct dsa_device_ops dsa_netdev_ops = {
};
DSA_TAG_DRIVER(dsa_netdev_ops);
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_DSA);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_DSA, DSA_NAME);
#endif /* CONFIG_NET_DSA_TAG_DSA */
#if IS_ENABLED(CONFIG_NET_DSA_TAG_EDSA)
@@ -381,7 +384,7 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev)
}
static const struct dsa_device_ops edsa_netdev_ops = {
- .name = "edsa",
+ .name = EDSA_NAME,
.proto = DSA_TAG_PROTO_EDSA,
.xmit = edsa_xmit,
.rcv = edsa_rcv,
@@ -389,7 +392,7 @@ static const struct dsa_device_ops edsa_netdev_ops = {
};
DSA_TAG_DRIVER(edsa_netdev_ops);
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_EDSA);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_EDSA, EDSA_NAME);
#endif /* CONFIG_NET_DSA_TAG_EDSA */
static struct dsa_tag_driver *dsa_tag_drivers[] = {
diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c
index df7140984da3..e279cd9057b0 100644
--- a/net/dsa/tag_gswip.c
+++ b/net/dsa/tag_gswip.c
@@ -10,7 +10,9 @@
#include <linux/skbuff.h>
#include <net/dsa.h>
-#include "dsa_priv.h"
+#include "tag.h"
+
+#define GSWIP_NAME "gswip"
#define GSWIP_TX_HEADER_LEN 4
@@ -98,7 +100,7 @@ static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb,
}
static const struct dsa_device_ops gswip_netdev_ops = {
- .name = "gswip",
+ .name = GSWIP_NAME,
.proto = DSA_TAG_PROTO_GSWIP,
.xmit = gswip_tag_xmit,
.rcv = gswip_tag_rcv,
@@ -106,6 +108,6 @@ static const struct dsa_device_ops gswip_netdev_ops = {
};
MODULE_LICENSE("GPL");
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_GSWIP);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_GSWIP, GSWIP_NAME);
module_dsa_tag_driver(gswip_netdev_ops);
diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c
index 846588c0070a..71884296fc70 100644
--- a/net/dsa/tag_hellcreek.c
+++ b/net/dsa/tag_hellcreek.c
@@ -11,7 +11,9 @@
#include <linux/skbuff.h>
#include <net/dsa.h>
-#include "dsa_priv.h"
+#include "tag.h"
+
+#define HELLCREEK_NAME "hellcreek"
#define HELLCREEK_TAG_LEN 1
@@ -57,7 +59,7 @@ static struct sk_buff *hellcreek_rcv(struct sk_buff *skb,
}
static const struct dsa_device_ops hellcreek_netdev_ops = {
- .name = "hellcreek",
+ .name = HELLCREEK_NAME,
.proto = DSA_TAG_PROTO_HELLCREEK,
.xmit = hellcreek_xmit,
.rcv = hellcreek_rcv,
@@ -65,6 +67,6 @@ static const struct dsa_device_ops hellcreek_netdev_ops = {
};
MODULE_LICENSE("Dual MIT/GPL");
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_HELLCREEK);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_HELLCREEK, HELLCREEK_NAME);
module_dsa_tag_driver(hellcreek_netdev_ops);
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index 38fa19c1e2d5..0f6ae143afc9 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -7,7 +7,13 @@
#include <linux/etherdevice.h>
#include <linux/list.h>
#include <net/dsa.h>
-#include "dsa_priv.h"
+
+#include "tag.h"
+
+#define KSZ8795_NAME "ksz8795"
+#define KSZ9477_NAME "ksz9477"
+#define KSZ9893_NAME "ksz9893"
+#define LAN937X_NAME "lan937x"
/* Typically only one byte is used for tail tag. */
#define KSZ_EGRESS_TAG_LEN 1
@@ -74,7 +80,7 @@ static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev)
}
static const struct dsa_device_ops ksz8795_netdev_ops = {
- .name = "ksz8795",
+ .name = KSZ8795_NAME,
.proto = DSA_TAG_PROTO_KSZ8795,
.xmit = ksz8795_xmit,
.rcv = ksz8795_rcv,
@@ -82,7 +88,7 @@ static const struct dsa_device_ops ksz8795_netdev_ops = {
};
DSA_TAG_DRIVER(ksz8795_netdev_ops);
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795, KSZ8795_NAME);
/*
* For Ingress (Host -> KSZ9477), 2 bytes are added before FCS.
@@ -147,7 +153,7 @@ static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev)
}
static const struct dsa_device_ops ksz9477_netdev_ops = {
- .name = "ksz9477",
+ .name = KSZ9477_NAME,
.proto = DSA_TAG_PROTO_KSZ9477,
.xmit = ksz9477_xmit,
.rcv = ksz9477_rcv,
@@ -155,7 +161,7 @@ static const struct dsa_device_ops ksz9477_netdev_ops = {
};
DSA_TAG_DRIVER(ksz9477_netdev_ops);
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9477);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9477, KSZ9477_NAME);
#define KSZ9893_TAIL_TAG_OVERRIDE BIT(5)
#define KSZ9893_TAIL_TAG_LOOKUP BIT(6)
@@ -183,7 +189,7 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb,
}
static const struct dsa_device_ops ksz9893_netdev_ops = {
- .name = "ksz9893",
+ .name = KSZ9893_NAME,
.proto = DSA_TAG_PROTO_KSZ9893,
.xmit = ksz9893_xmit,
.rcv = ksz9477_rcv,
@@ -191,7 +197,7 @@ static const struct dsa_device_ops ksz9893_netdev_ops = {
};
DSA_TAG_DRIVER(ksz9893_netdev_ops);
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893, KSZ9893_NAME);
/* For xmit, 2 bytes are added before FCS.
* ---------------------------------------------------------------------------
@@ -241,7 +247,7 @@ static struct sk_buff *lan937x_xmit(struct sk_buff *skb,
}
static const struct dsa_device_ops lan937x_netdev_ops = {
- .name = "lan937x",
+ .name = LAN937X_NAME,
.proto = DSA_TAG_PROTO_LAN937X,
.xmit = lan937x_xmit,
.rcv = ksz9477_rcv,
@@ -249,7 +255,7 @@ static const struct dsa_device_ops lan937x_netdev_ops = {
};
DSA_TAG_DRIVER(lan937x_netdev_ops);
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN937X);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN937X, LAN937X_NAME);
static struct dsa_tag_driver *dsa_tag_driver_array[] = {
&DSA_TAG_DRIVER_NAME(ksz8795_netdev_ops),
diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c
index 98d7d7120bab..c25f5536706b 100644
--- a/net/dsa/tag_lan9303.c
+++ b/net/dsa/tag_lan9303.c
@@ -7,7 +7,7 @@
#include <linux/list.h>
#include <linux/slab.h>
-#include "dsa_priv.h"
+#include "tag.h"
/* To define the outgoing port and to discover the incoming port a regular
* VLAN tag is used by the LAN9303. But its VID meaning is 'special':
@@ -30,6 +30,8 @@
* Required when no forwarding between the external ports should happen.
*/
+#define LAN9303_NAME "lan9303"
+
#define LAN9303_TAG_LEN 4
# define LAN9303_TAG_TX_USE_ALR BIT(3)
# define LAN9303_TAG_TX_STP_OVERRIDE BIT(4)
@@ -110,7 +112,7 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev)
}
static const struct dsa_device_ops lan9303_netdev_ops = {
- .name = "lan9303",
+ .name = LAN9303_NAME,
.proto = DSA_TAG_PROTO_LAN9303,
.xmit = lan9303_xmit,
.rcv = lan9303_rcv,
@@ -118,6 +120,6 @@ static const struct dsa_device_ops lan9303_netdev_ops = {
};
MODULE_LICENSE("GPL");
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN9303);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN9303, LAN9303_NAME);
module_dsa_tag_driver(lan9303_netdev_ops);
diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index 415d8ece242a..40af80452747 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -8,7 +8,9 @@
#include <linux/etherdevice.h>
#include <linux/if_vlan.h>
-#include "dsa_priv.h"
+#include "tag.h"
+
+#define MTK_NAME "mtk"
#define MTK_HDR_LEN 4
#define MTK_HDR_XMIT_UNTAGGED 0
@@ -25,6 +27,8 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
u8 xmit_tpid;
u8 *mtk_tag;
+ skb_set_queue_mapping(skb, dp->index);
+
/* Build the special tag after the MAC Source Address. If VLAN header
* is present, it's required that VLAN header and special tag is
* being combined. Only in this way we can allow the switch can parse
@@ -91,7 +95,7 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev)
}
static const struct dsa_device_ops mtk_netdev_ops = {
- .name = "mtk",
+ .name = MTK_NAME,
.proto = DSA_TAG_PROTO_MTK,
.xmit = mtk_tag_xmit,
.rcv = mtk_tag_rcv,
@@ -99,6 +103,6 @@ static const struct dsa_device_ops mtk_netdev_ops = {
};
MODULE_LICENSE("GPL");
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_MTK);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_MTK, MTK_NAME);
module_dsa_tag_driver(mtk_netdev_ops);
diff --git a/net/dsa/tag_none.c b/net/dsa/tag_none.c
new file mode 100644
index 000000000000..d2fd179c4227
--- /dev/null
+++ b/net/dsa/tag_none.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * net/dsa/tag_none.c - Traffic handling for switches with no tag
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
+ *
+ * WARNING: do not use this for new switches. In case of no hardware
+ * tagging support, look at tag_8021q.c instead.
+ */
+
+#include "tag.h"
+
+#define NONE_NAME "none"
+
+/* Transmit hook for switches with no tag: the original SKB is handed back
+ * unmodified and @dev is not used.
+ */
+static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
+					    struct net_device *dev)
+{
+	return skb;
+}
+
+/* Tagger ops for DSA_TAG_PROTO_NONE: only a pass-through .xmit is provided,
+ * no .rcv handler is set.
+ */
+static const struct dsa_device_ops none_ops = {
+ .name = NONE_NAME,
+ .proto = DSA_TAG_PROTO_NONE,
+ .xmit = dsa_slave_notag_xmit,
+};
+
+module_dsa_tag_driver(none_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_NONE, NONE_NAME);
+MODULE_LICENSE("GPL");
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index 0d81f172b7a6..28ebecafdd24 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -2,7 +2,11 @@
/* Copyright 2019 NXP
*/
#include <linux/dsa/ocelot.h>
-#include "dsa_priv.h"
+
+#include "tag.h"
+
+#define OCELOT_NAME "ocelot"
+#define SEVILLE_NAME "seville"
/* If the port is under a VLAN-aware bridge, remove the VLAN header from the
* payload and move it into the DSA tag, which will make the switch classify
@@ -183,7 +187,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
}
static const struct dsa_device_ops ocelot_netdev_ops = {
- .name = "ocelot",
+ .name = OCELOT_NAME,
.proto = DSA_TAG_PROTO_OCELOT,
.xmit = ocelot_xmit,
.rcv = ocelot_rcv,
@@ -192,10 +196,10 @@ static const struct dsa_device_ops ocelot_netdev_ops = {
};
DSA_TAG_DRIVER(ocelot_netdev_ops);
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT, OCELOT_NAME);
static const struct dsa_device_ops seville_netdev_ops = {
- .name = "seville",
+ .name = SEVILLE_NAME,
.proto = DSA_TAG_PROTO_SEVILLE,
.xmit = seville_xmit,
.rcv = ocelot_rcv,
@@ -204,7 +208,7 @@ static const struct dsa_device_ops seville_netdev_ops = {
};
DSA_TAG_DRIVER(seville_netdev_ops);
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SEVILLE);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SEVILLE, SEVILLE_NAME);
static struct dsa_tag_driver *ocelot_tag_driver_array[] = {
&DSA_TAG_DRIVER_NAME(ocelot_netdev_ops),
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
index 37ccf00404ea..1f0b8c20eba5 100644
--- a/net/dsa/tag_ocelot_8021q.c
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -10,7 +10,11 @@
*/
#include <linux/dsa/8021q.h>
#include <linux/dsa/ocelot.h>
-#include "dsa_priv.h"
+
+#include "tag.h"
+#include "tag_8021q.h"
+
+#define OCELOT_8021Q_NAME "ocelot-8021q"
struct ocelot_8021q_tagger_private {
struct ocelot_8021q_tagger_data data; /* Must be first */
@@ -119,7 +123,7 @@ static int ocelot_connect(struct dsa_switch *ds)
}
static const struct dsa_device_ops ocelot_8021q_netdev_ops = {
- .name = "ocelot-8021q",
+ .name = OCELOT_8021Q_NAME,
.proto = DSA_TAG_PROTO_OCELOT_8021Q,
.xmit = ocelot_xmit,
.rcv = ocelot_rcv,
@@ -130,6 +134,6 @@ static const struct dsa_device_ops ocelot_8021q_netdev_ops = {
};
MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT_8021Q);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT_8021Q, OCELOT_8021Q_NAME);
module_dsa_tag_driver(ocelot_8021q_netdev_ops);
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 57d2e00f1e5d..e757c8de06f1 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -8,7 +8,9 @@
#include <net/dsa.h>
#include <linux/dsa/tag_qca.h>
-#include "dsa_priv.h"
+#include "tag.h"
+
+#define QCA_NAME "qca"
static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
{
@@ -107,7 +109,7 @@ static void qca_tag_disconnect(struct dsa_switch *ds)
}
static const struct dsa_device_ops qca_netdev_ops = {
- .name = "qca",
+ .name = QCA_NAME,
.proto = DSA_TAG_PROTO_QCA,
.connect = qca_tag_connect,
.disconnect = qca_tag_disconnect,
@@ -118,6 +120,6 @@ static const struct dsa_device_ops qca_netdev_ops = {
};
MODULE_LICENSE("GPL");
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_QCA);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_QCA, QCA_NAME);
module_dsa_tag_driver(qca_netdev_ops);
diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c
index 6d928ee3ef7a..c327314b95e3 100644
--- a/net/dsa/tag_rtl4_a.c
+++ b/net/dsa/tag_rtl4_a.c
@@ -18,7 +18,9 @@
#include <linux/etherdevice.h>
#include <linux/bits.h>
-#include "dsa_priv.h"
+#include "tag.h"
+
+#define RTL4_A_NAME "rtl4a"
#define RTL4_A_HDR_LEN 4
#define RTL4_A_ETHERTYPE 0x8899
@@ -112,7 +114,7 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb,
}
static const struct dsa_device_ops rtl4a_netdev_ops = {
- .name = "rtl4a",
+ .name = RTL4_A_NAME,
.proto = DSA_TAG_PROTO_RTL4_A,
.xmit = rtl4a_tag_xmit,
.rcv = rtl4a_tag_rcv,
@@ -121,4 +123,4 @@ static const struct dsa_device_ops rtl4a_netdev_ops = {
module_dsa_tag_driver(rtl4a_netdev_ops);
MODULE_LICENSE("GPL");
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL4_A);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL4_A, RTL4_A_NAME);
diff --git a/net/dsa/tag_rtl8_4.c b/net/dsa/tag_rtl8_4.c
index a593ead7ff26..4f67834fd121 100644
--- a/net/dsa/tag_rtl8_4.c
+++ b/net/dsa/tag_rtl8_4.c
@@ -77,13 +77,16 @@
#include <linux/bits.h>
#include <linux/etherdevice.h>
-#include "dsa_priv.h"
+#include "tag.h"
/* Protocols supported:
*
* 0x04 = RTL8365MB DSA protocol
*/
+#define RTL8_4_NAME "rtl8_4"
+#define RTL8_4T_NAME "rtl8_4t"
+
#define RTL8_4_TAG_LEN 8
#define RTL8_4_PROTOCOL GENMASK(15, 8)
@@ -234,7 +237,7 @@ static const struct dsa_device_ops rtl8_4_netdev_ops = {
DSA_TAG_DRIVER(rtl8_4_netdev_ops);
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL8_4);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL8_4, RTL8_4_NAME);
/* Tail version */
static const struct dsa_device_ops rtl8_4t_netdev_ops = {
@@ -247,7 +250,7 @@ static const struct dsa_device_ops rtl8_4t_netdev_ops = {
DSA_TAG_DRIVER(rtl8_4t_netdev_ops);
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL8_4T);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL8_4T, RTL8_4T_NAME);
static struct dsa_tag_driver *dsa_tag_drivers[] = {
&DSA_TAG_DRIVER_NAME(rtl8_4_netdev_ops),
diff --git a/net/dsa/tag_rzn1_a5psw.c b/net/dsa/tag_rzn1_a5psw.c
index e2a5ee6ae688..437a6820ac42 100644
--- a/net/dsa/tag_rzn1_a5psw.c
+++ b/net/dsa/tag_rzn1_a5psw.c
@@ -10,7 +10,7 @@
#include <linux/if_ether.h>
#include <net/dsa.h>
-#include "dsa_priv.h"
+#include "tag.h"
/* To define the outgoing port and to discover the incoming port a TAG is
* inserted after Src MAC :
@@ -22,6 +22,8 @@
* See struct a5psw_tag for layout
*/
+#define A5PSW_NAME "a5psw"
+
#define ETH_P_DSA_A5PSW 0xE001
#define A5PSW_TAG_LEN 8
#define A5PSW_CTRL_DATA_FORCE_FORWARD BIT(0)
@@ -101,7 +103,7 @@ static struct sk_buff *a5psw_tag_rcv(struct sk_buff *skb,
}
static const struct dsa_device_ops a5psw_netdev_ops = {
- .name = "a5psw",
+ .name = A5PSW_NAME,
.proto = DSA_TAG_PROTO_RZN1_A5PSW,
.xmit = a5psw_tag_xmit,
.rcv = a5psw_tag_rcv,
@@ -109,5 +111,5 @@ static const struct dsa_device_ops a5psw_netdev_ops = {
};
MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_A5PSW);
+/* Use the same protocol token as a5psw_netdev_ops.proto, so that the "id-"
+ * alias is generated from that protocol's _VALUE macro.
+ */
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RZN1_A5PSW, A5PSW_NAME);
module_dsa_tag_driver(a5psw_netdev_ops);
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 83e4136516b0..f14f51b41491 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -5,7 +5,12 @@
#include <linux/dsa/sja1105.h>
#include <linux/dsa/8021q.h>
#include <linux/packing.h>
-#include "dsa_priv.h"
+
+#include "tag.h"
+#include "tag_8021q.h"
+
+#define SJA1105_NAME "sja1105"
+#define SJA1110_NAME "sja1110"
/* Is this a TX or an RX header? */
#define SJA1110_HEADER_HOST_TO_SWITCH BIT(15)
@@ -786,7 +791,7 @@ static int sja1105_connect(struct dsa_switch *ds)
}
static const struct dsa_device_ops sja1105_netdev_ops = {
- .name = "sja1105",
+ .name = SJA1105_NAME,
.proto = DSA_TAG_PROTO_SJA1105,
.xmit = sja1105_xmit,
.rcv = sja1105_rcv,
@@ -798,10 +803,10 @@ static const struct dsa_device_ops sja1105_netdev_ops = {
};
DSA_TAG_DRIVER(sja1105_netdev_ops);
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1105);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1105, SJA1105_NAME);
static const struct dsa_device_ops sja1110_netdev_ops = {
- .name = "sja1110",
+ .name = SJA1110_NAME,
.proto = DSA_TAG_PROTO_SJA1110,
.xmit = sja1110_xmit,
.rcv = sja1110_rcv,
@@ -813,7 +818,7 @@ static const struct dsa_device_ops sja1110_netdev_ops = {
};
DSA_TAG_DRIVER(sja1110_netdev_ops);
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1110);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1110, SJA1110_NAME);
static struct dsa_tag_driver *sja1105_tag_driver_array[] = {
&DSA_TAG_DRIVER_NAME(sja1105_netdev_ops),
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index 5749ba85c2b8..7361b9106382 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -8,7 +8,9 @@
#include <linux/list.h>
#include <linux/slab.h>
-#include "dsa_priv.h"
+#include "tag.h"
+
+#define TRAILER_NAME "trailer"
static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev)
{
@@ -50,7 +52,7 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev)
}
static const struct dsa_device_ops trailer_netdev_ops = {
- .name = "trailer",
+ .name = TRAILER_NAME,
.proto = DSA_TAG_PROTO_TRAILER,
.xmit = trailer_xmit,
.rcv = trailer_rcv,
@@ -58,6 +60,6 @@ static const struct dsa_device_ops trailer_netdev_ops = {
};
MODULE_LICENSE("GPL");
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_TRAILER);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_TRAILER, TRAILER_NAME);
module_dsa_tag_driver(trailer_netdev_ops);
diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c
index ff442b8af636..af19969f9bc4 100644
--- a/net/dsa/tag_xrs700x.c
+++ b/net/dsa/tag_xrs700x.c
@@ -7,7 +7,9 @@
#include <linux/bitops.h>
-#include "dsa_priv.h"
+#include "tag.h"
+
+#define XRS700X_NAME "xrs700x"
static struct sk_buff *xrs700x_xmit(struct sk_buff *skb, struct net_device *dev)
{
@@ -51,7 +53,7 @@ static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev)
}
static const struct dsa_device_ops xrs700x_netdev_ops = {
- .name = "xrs700x",
+ .name = XRS700X_NAME,
.proto = DSA_TAG_PROTO_XRS700X,
.xmit = xrs700x_xmit,
.rcv = xrs700x_rcv,
@@ -59,6 +61,6 @@ static const struct dsa_device_ops xrs700x_netdev_ops = {
};
MODULE_LICENSE("GPL");
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_XRS700X);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_XRS700X, XRS700X_NAME);
module_dsa_tag_driver(xrs700x_netdev_ops);
diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c
index 403158862011..c7e37130647e 100644
--- a/net/ethtool/channels.c
+++ b/net/ethtool/channels.c
@@ -116,9 +116,10 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info)
struct ethtool_channels channels = {};
struct ethnl_req_info req_info = {};
struct nlattr **tb = info->attrs;
- u32 err_attr, max_rx_in_use = 0;
+ u32 err_attr, max_rxfh_in_use;
const struct ethtool_ops *ops;
struct net_device *dev;
+ u64 max_rxnfc_in_use;
int ret;
ret = ethnl_parse_header_dev_get(&req_info,
@@ -189,15 +190,23 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info)
}
/* ensure the new Rx count fits within the configured Rx flow
- * indirection table settings
+ * indirection table/rxnfc settings
*/
- if (netif_is_rxfh_configured(dev) &&
- !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) &&
- (channels.combined_count + channels.rx_count) <= max_rx_in_use) {
+ if (ethtool_get_max_rxnfc_channel(dev, &max_rxnfc_in_use))
+ max_rxnfc_in_use = 0;
+ if (!netif_is_rxfh_configured(dev) ||
+ ethtool_get_max_rxfh_channel(dev, &max_rxfh_in_use))
+ max_rxfh_in_use = 0;
+ if (channels.combined_count + channels.rx_count <= max_rxfh_in_use) {
ret = -EINVAL;
GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing indirection table settings");
goto out_ops;
}
+ if (channels.combined_count + channels.rx_count <= max_rxnfc_in_use) {
+ ret = -EINVAL;
+ GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing ntuple filter settings");
+ goto out_ops;
+ }
/* Disabling channels, query zero-copy AF_XDP sockets */
from_channel = channels.combined_count +
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 566adf85e658..21cfe8557205 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -202,6 +202,12 @@ const char link_mode_names[][ETH_GSTRING_LEN] = {
__DEFINE_LINK_MODE_NAME(100, FX, Half),
__DEFINE_LINK_MODE_NAME(100, FX, Full),
__DEFINE_LINK_MODE_NAME(10, T1L, Full),
+ __DEFINE_LINK_MODE_NAME(800000, CR8, Full),
+ __DEFINE_LINK_MODE_NAME(800000, KR8, Full),
+ __DEFINE_LINK_MODE_NAME(800000, DR8, Full),
+ __DEFINE_LINK_MODE_NAME(800000, DR8_2, Full),
+ __DEFINE_LINK_MODE_NAME(800000, SR8, Full),
+ __DEFINE_LINK_MODE_NAME(800000, VR8, Full),
};
static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS);
@@ -238,6 +244,8 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS);
#define __LINK_MODE_LANES_X 1
#define __LINK_MODE_LANES_FX 1
#define __LINK_MODE_LANES_T1L 1
+#define __LINK_MODE_LANES_VR8 8
+#define __LINK_MODE_LANES_DR8_2 8
#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \
[ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \
@@ -352,6 +360,12 @@ const struct link_mode_info link_mode_params[] = {
__DEFINE_LINK_MODE_PARAMS(100, FX, Half),
__DEFINE_LINK_MODE_PARAMS(100, FX, Full),
__DEFINE_LINK_MODE_PARAMS(10, T1L, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, CR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, KR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, DR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, DR8_2, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, SR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, VR8, Full),
};
static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS);
@@ -498,6 +512,72 @@ int __ethtool_get_link(struct net_device *dev)
return netif_running(dev) && dev->ethtool_ops->get_link(dev);
}
+static int ethtool_get_rxnfc_rule_count(struct net_device *dev)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc info = {
+ .cmd = ETHTOOL_GRXCLSRLCNT,
+ };
+ int err;
+
+ err = ops->get_rxnfc(dev, &info, NULL);
+ if (err)
+ return err;
+
+ return info.rule_cnt;
+}
+
+int ethtool_get_max_rxnfc_channel(struct net_device *dev, u64 *max)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc *info;
+ int err, i, rule_cnt;
+ u64 max_ring = 0;
+
+ if (!ops->get_rxnfc)
+ return -EOPNOTSUPP;
+
+ rule_cnt = ethtool_get_rxnfc_rule_count(dev);
+ if (rule_cnt <= 0)
+ return -EINVAL;
+
+ info = kvzalloc(struct_size(info, rule_locs, rule_cnt), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ info->cmd = ETHTOOL_GRXCLSRLALL;
+ info->rule_cnt = rule_cnt;
+ err = ops->get_rxnfc(dev, info, info->rule_locs);
+ if (err)
+ goto err_free_info;
+
+ for (i = 0; i < rule_cnt; i++) {
+ struct ethtool_rxnfc rule_info = {
+ .cmd = ETHTOOL_GRXCLSRULE,
+ .fs.location = info->rule_locs[i],
+ };
+
+ err = ops->get_rxnfc(dev, &rule_info, NULL);
+ if (err)
+ goto err_free_info;
+
+ if (rule_info.fs.ring_cookie != RX_CLS_FLOW_DISC &&
+ rule_info.fs.ring_cookie != RX_CLS_FLOW_WAKE &&
+ !(rule_info.flow_type & FLOW_RSS) &&
+ !ethtool_get_flow_spec_ring_vf(rule_info.fs.ring_cookie))
+ max_ring =
+ max_t(u64, max_ring, rule_info.fs.ring_cookie);
+ }
+
+ kvfree(info);
+ *max = max_ring;
+ return 0;
+
+err_free_info:
+ kvfree(info);
+ return err;
+}
+
int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max)
{
u32 dev_size, current_max = 0;
diff --git a/net/ethtool/common.h b/net/ethtool/common.h
index c1779657e074..b1b9db810eca 100644
--- a/net/ethtool/common.h
+++ b/net/ethtool/common.h
@@ -43,6 +43,7 @@ bool convert_legacy_settings_to_link_ksettings(
struct ethtool_link_ksettings *link_ksettings,
const struct ethtool_cmd *legacy_settings);
int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max);
+int ethtool_get_max_rxnfc_channel(struct net_device *dev, u64 *max);
int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info);
extern const struct ethtool_phy_ops *ethtool_phy_ops;
diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c
index 1c94bb8ea03f..49c0a2a77f02 100644
--- a/net/ethtool/eeprom.c
+++ b/net/ethtool/eeprom.c
@@ -124,7 +124,7 @@ static int eeprom_prepare_data(const struct ethnl_req_info *req_base,
if (ret)
goto err_free;
- ret = get_module_eeprom_by_page(dev, &page_data, info->extack);
+ ret = get_module_eeprom_by_page(dev, &page_data, info ? info->extack : NULL);
if (ret < 0)
goto err_ops;
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 57e7238a4136..c2f1a542e6fa 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -44,16 +44,9 @@ struct ethtool_devlink_compat {
static struct devlink *netdev_to_devlink_get(struct net_device *dev)
{
- struct devlink_port *devlink_port;
-
- if (!dev->netdev_ops->ndo_get_devlink_port)
- return NULL;
-
- devlink_port = dev->netdev_ops->ndo_get_devlink_port(dev);
- if (!devlink_port)
+ if (!dev->devlink_port)
return NULL;
-
- return devlink_try_get(devlink_port->devlink);
+ return devlink_try_get(dev->devlink_port->devlink);
}
/*
@@ -713,15 +706,22 @@ static int
ethtool_get_drvinfo(struct net_device *dev, struct ethtool_devlink_compat *rsp)
{
const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct device *parent = dev->dev.parent;
rsp->info.cmd = ETHTOOL_GDRVINFO;
strscpy(rsp->info.version, UTS_RELEASE, sizeof(rsp->info.version));
if (ops->get_drvinfo) {
ops->get_drvinfo(dev, &rsp->info);
- } else if (dev->dev.parent && dev->dev.parent->driver) {
- strscpy(rsp->info.bus_info, dev_name(dev->dev.parent),
+ if (!rsp->info.bus_info[0] && parent)
+ strscpy(rsp->info.bus_info, dev_name(parent),
+ sizeof(rsp->info.bus_info));
+ if (!rsp->info.driver[0] && parent && parent->driver)
+ strscpy(rsp->info.driver, parent->driver->name,
+ sizeof(rsp->info.driver));
+ } else if (parent && parent->driver) {
+ strscpy(rsp->info.bus_info, dev_name(parent),
sizeof(rsp->info.bus_info));
- strscpy(rsp->info.driver, dev->dev.parent->driver->name,
+ strscpy(rsp->info.driver, parent->driver->name,
sizeof(rsp->info.driver));
} else if (dev->rtnl_link_ops) {
strscpy(rsp->info.driver, dev->rtnl_link_ops->kind,
@@ -1796,7 +1796,8 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
{
struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS };
u16 from_channel, to_channel;
- u32 max_rx_in_use = 0;
+ u64 max_rxnfc_in_use;
+ u32 max_rxfh_in_use;
unsigned int i;
int ret;
@@ -1827,11 +1828,15 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
return -EINVAL;
/* ensure the new Rx count fits within the configured Rx flow
- * indirection table settings */
- if (netif_is_rxfh_configured(dev) &&
- !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) &&
- (channels.combined_count + channels.rx_count) <= max_rx_in_use)
- return -EINVAL;
+ * indirection table/rxnfc settings */
+ if (ethtool_get_max_rxnfc_channel(dev, &max_rxnfc_in_use))
+ max_rxnfc_in_use = 0;
+ if (!netif_is_rxfh_configured(dev) ||
+ ethtool_get_max_rxfh_channel(dev, &max_rxfh_in_use))
+ max_rxfh_in_use = 0;
+ if (channels.combined_count + channels.rx_count <=
+ max_t(u64, max_rxnfc_in_use, max_rxfh_in_use))
+ return -EINVAL;
/* Disabling channels, query zero-copy AF_XDP sockets */
from_channel = channels.combined_count +
@@ -2008,7 +2013,8 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
} else {
/* Driver expects to be called at twice the frequency in rc */
int n = rc * 2, interval = HZ / n;
- u64 count = n * id.data, i = 0;
+ u64 count = mul_u32_u32(n, id.data);
+ u64 i = 0;
do {
rtnl_lock();
diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c
index fb676f349455..2158c17a0b32 100644
--- a/net/ethtool/linkstate.c
+++ b/net/ethtool/linkstate.c
@@ -13,6 +13,7 @@ struct linkstate_reply_data {
int link;
int sqi;
int sqi_max;
+ struct ethtool_link_ext_stats link_stats;
bool link_ext_state_provided;
struct ethtool_link_ext_state_info ethtool_link_ext_state_info;
};
@@ -22,7 +23,7 @@ struct linkstate_reply_data {
const struct nla_policy ethnl_linkstate_get_policy[] = {
[ETHTOOL_A_LINKSTATE_HEADER] =
- NLA_POLICY_NESTED(ethnl_header_policy),
+ NLA_POLICY_NESTED(ethnl_header_policy_stats),
};
static int linkstate_get_sqi(struct net_device *dev)
@@ -107,6 +108,19 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base,
goto out;
}
+ ethtool_stats_init((u64 *)&data->link_stats,
+ sizeof(data->link_stats) / 8);
+
+ if (req_base->flags & ETHTOOL_FLAG_STATS) {
+ if (dev->phydev)
+ data->link_stats.link_down_events =
+ READ_ONCE(dev->phydev->link_down_events);
+
+ if (dev->ethtool_ops->get_link_ext_stats)
+ dev->ethtool_ops->get_link_ext_stats(dev,
+ &data->link_stats);
+ }
+
ret = 0;
out:
ethnl_ops_complete(dev);
@@ -134,6 +148,9 @@ static int linkstate_reply_size(const struct ethnl_req_info *req_base,
if (data->ethtool_link_ext_state_info.__link_ext_substate)
len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_SUBSTATE */
+ if (data->link_stats.link_down_events != ETHTOOL_STAT_NOT_SET)
+ len += nla_total_size(sizeof(u32));
+
return len;
}
@@ -166,6 +183,11 @@ static int linkstate_fill_reply(struct sk_buff *skb,
return -EMSGSIZE;
}
+ if (data->link_stats.link_down_events != ETHTOOL_STAT_NOT_SET)
+ if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT,
+ data->link_stats.link_down_events))
+ return -EMSGSIZE;
+
return 0;
}
diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c
index 5a471e115b66..e8683e485dc9 100644
--- a/net/ethtool/pse-pd.c
+++ b/net/ethtool/pse-pd.c
@@ -64,7 +64,7 @@ static int pse_prepare_data(const struct ethnl_req_info *req_base,
if (ret < 0)
return ret;
- ret = pse_get_pse_attributes(dev, info->extack, data);
+ ret = pse_get_pse_attributes(dev, info ? info->extack : NULL, data);
ethnl_ops_complete(dev);
diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
index 5bf357734b11..56bb27d67a2e 100644
--- a/net/hsr/hsr_forward.c
+++ b/net/hsr/hsr_forward.c
@@ -150,15 +150,15 @@ struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame,
struct hsr_port *port)
{
if (!frame->skb_std) {
- if (frame->skb_hsr) {
+ if (frame->skb_hsr)
frame->skb_std =
create_stripped_skb_hsr(frame->skb_hsr, frame);
- } else {
- /* Unexpected */
- WARN_ONCE(1, "%s:%d: Unexpected frame received (port_src %s)\n",
- __FILE__, __LINE__, port->dev->name);
+ else
+ netdev_warn_once(port->dev,
+ "Unexpected frame received in hsr_get_untagged_frame()\n");
+
+ if (!frame->skb_std)
return NULL;
- }
}
return skb_clone(frame->skb_std, GFP_ATOMIC);
@@ -351,17 +351,18 @@ static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev,
struct hsr_node *node_src)
{
bool was_multicast_frame;
- int res;
+ int res, recv_len;
was_multicast_frame = (skb->pkt_type == PACKET_MULTICAST);
hsr_addr_subst_source(node_src, skb);
skb_pull(skb, ETH_HLEN);
+ recv_len = skb->len;
res = netif_rx(skb);
if (res == NET_RX_DROP) {
dev->stats.rx_dropped++;
} else {
dev->stats.rx_packets++;
- dev->stats.rx_bytes += skb->len;
+ dev->stats.rx_bytes += recv_len;
if (was_multicast_frame)
dev->stats.multicast++;
}
diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
index de259b5170ab..57546e07e06a 100644
--- a/net/ieee802154/core.c
+++ b/net/ieee802154/core.c
@@ -129,6 +129,9 @@ wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size)
wpan_phy_net_set(&rdev->wpan_phy, &init_net);
init_waitqueue_head(&rdev->dev_wait);
+ init_waitqueue_head(&rdev->wpan_phy.sync_txq);
+
+ spin_lock_init(&rdev->wpan_phy.queue_lock);
return &rdev->wpan_phy;
}
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index 38c4f3cb010e..b33d1b5eda87 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -2157,7 +2157,8 @@ static int nl802154_del_llsec_seclevel(struct sk_buff *skb,
#define NL802154_FLAG_CHECK_NETDEV_UP 0x08
#define NL802154_FLAG_NEED_WPAN_DEV 0x10
-static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
+static int nl802154_pre_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
struct genl_info *info)
{
struct cfg802154_registered_device *rdev;
@@ -2219,7 +2220,8 @@ static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
return 0;
}
-static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
+static void nl802154_post_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
struct genl_info *info)
{
if (info->user_ptr[1]) {
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index 6e55fae4c686..1fa2fe041ec0 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -502,8 +502,10 @@ static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len)
if (err < 0)
goto out;
- if (addr->family != AF_IEEE802154)
+ if (addr->family != AF_IEEE802154) {
+ err = -EINVAL;
goto out;
+ }
ieee802154_addr_from_sa(&haddr, &addr->addr);
dev = ieee802154_get_dev(sock_net(sk), &haddr);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e983bb0c5012..2dfb12230f08 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -402,6 +402,16 @@ config INET_IPCOMP
If unsure, say Y.
+config INET_TABLE_PERTURB_ORDER
+ int "INET: Source port perturbation table size (as power of 2)" if EXPERT
+ default 16
+ help
+ Source port perturbation table size (as power of 2) for
+ RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm.
+
+ The default is almost always what you want.
+ Only change this if you know what you are doing.
+
config INET_XFRM_TUNNEL
tristate
select INET_TUNNEL
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index bbdd9c44f14e..af7d2cf490fb 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -10,7 +10,7 @@ obj-y := route.o inetpeer.o protocol.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
tcp_rate.o tcp_recovery.o tcp_ulp.o \
- tcp_offload.o datagram.o raw.o udp.o udplite.o \
+ tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 3dd02396517d..ab4a06be489b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -522,9 +522,9 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
/* Make sure we are allowed to bind here. */
if (snum || !(inet->bind_address_no_port ||
(flags & BIND_FORCE_ADDRESS_NO_PORT))) {
- if (sk->sk_prot->get_port(sk, snum)) {
+ err = sk->sk_prot->get_port(sk, snum);
+ if (err) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
- err = -EADDRINUSE;
goto out_release_sock;
}
if (!(flags & BIND_FROM_BPF)) {
@@ -754,6 +754,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags,
(TCPF_ESTABLISHED | TCPF_SYN_RECV |
TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+ if (test_bit(SOCK_SUPPORT_ZC, &sock->flags))
+ set_bit(SOCK_SUPPORT_ZC, &newsock->flags);
sock_graft(sk2, newsock);
newsock->state = SS_CONNECTED;
@@ -1228,7 +1230,6 @@ EXPORT_SYMBOL(inet_unregister_protosw);
static int inet_sk_reselect_saddr(struct sock *sk)
{
- struct inet_bind_hashbucket *prev_addr_hashbucket;
struct inet_sock *inet = inet_sk(sk);
__be32 old_saddr = inet->inet_saddr;
__be32 daddr = inet->inet_daddr;
@@ -1258,16 +1259,8 @@ static int inet_sk_reselect_saddr(struct sock *sk)
return 0;
}
- prev_addr_hashbucket =
- inet_bhashfn_portaddr(tcp_or_dccp_get_hashinfo(sk), sk,
- sock_net(sk), inet->inet_num);
-
- inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
-
- err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
+ err = inet_bhash2_update_saddr(sk, &new_saddr, AF_INET);
if (err) {
- inet->inet_saddr = old_saddr;
- inet->inet_rcv_saddr = old_saddr;
ip_rt_put(rt);
return err;
}
@@ -1706,9 +1699,9 @@ u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt,
bhptr = per_cpu_ptr(mib, cpu);
syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
do {
- start = u64_stats_fetch_begin_irq(syncp);
+ start = u64_stats_fetch_begin(syncp);
v = *(((u64 *)bhptr) + offt);
- } while (u64_stats_fetch_retry_irq(syncp, start));
+ } while (u64_stats_fetch_retry(syncp, start));
return v;
}
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 6da16ae6a962..4517d2bd186a 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -61,7 +61,9 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size,
if (!bpf_tracing_btf_ctx_access(off, size, type, prog, info))
return false;
- if (info->reg_type == PTR_TO_BTF_ID && info->btf_id == sock_id)
+ if (base_type(info->reg_type) == PTR_TO_BTF_ID &&
+ !bpf_type_has_unsafe_modifiers(info->reg_type) &&
+ info->btf_id == sock_id)
/* promote it to tcp_sock */
info->btf_id = tcp_sock_id;
@@ -69,18 +71,17 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size,
}
static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
- const struct btf *btf,
- const struct btf_type *t, int off,
- int size, enum bpf_access_type atype,
- u32 *next_btf_id,
- enum bpf_type_flag *flag)
+ const struct bpf_reg_state *reg,
+ int off, int size, enum bpf_access_type atype,
+ u32 *next_btf_id, enum bpf_type_flag *flag)
{
+ const struct btf_type *t;
size_t end;
if (atype == BPF_READ)
- return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
- flag);
+ return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
if (t != tcp_sock_type) {
bpf_log(log, "only read is supported\n");
return -EACCES;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 405a8c2aea64..4d1af0cd7d99 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -70,10 +70,10 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
}
inet->inet_daddr = fl4->daddr;
inet->inet_dport = usin->sin_port;
- reuseport_has_conns(sk, true);
+ reuseport_has_conns_set(sk);
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
- inet->inet_id = prandom_u32();
+ inet->inet_id = get_random_u16();
sk_dst_set(sk, &rt->dst);
err = 0;
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 170152772d33..3969fa805679 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -314,6 +314,9 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
xo->seq.low += skb_shinfo(skb)->gso_segs;
}
+ if (xo->seq.low < seq)
+ xo->seq.hi++;
+
esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32));
ip_hdr(skb)->tot_len = htons(skb->len);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 943edf4ad4db..f361d3d56be2 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -389,7 +389,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
dev_match = dev_match || (res.type == RTN_LOCAL &&
dev == net->loopback_dev);
if (dev_match) {
- ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK;
+ ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
return ret;
}
if (no_addr)
@@ -401,7 +401,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
ret = 0;
if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
if (res.type == RTN_UNICAST)
- ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK;
+ ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
}
return ret;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e9a7f70a54df..19a662003eef 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -888,9 +888,11 @@ int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
return 1;
}
- /* cannot match on nexthop object attributes */
- if (fi->nh)
- return 1;
+ if (fi->nh) {
+ if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_mp)
+ return 1;
+ return 0;
+ }
if (cfg->fc_oif || cfg->fc_gw_family) {
struct fib_nh *nh;
@@ -1231,7 +1233,7 @@ static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh,
nh->fib_nh_dev = in_dev->dev;
netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
- nh->fib_nh_scope = RT_SCOPE_LINK;
+ nh->fib_nh_scope = RT_SCOPE_HOST;
if (!netif_carrier_ok(nh->fib_nh_dev))
nh->fib_nh_flags |= RTNH_F_LINKDOWN;
err = 0;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 452ff177e4da..74d403dbd2b4 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -126,7 +126,7 @@ struct key_vector {
/* This list pointer if valid if (pos | bits) == 0 (LEAF) */
struct hlist_head leaf;
/* This array is valid if (pos | bits) > 0 (TNODE) */
- struct key_vector __rcu *tnode[0];
+ DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode);
};
};
@@ -1381,8 +1381,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
/* The alias was already inserted, so the node must exist. */
l = l ? l : fib_find_node(t, &tp, key);
- if (WARN_ON_ONCE(!l))
+ if (WARN_ON_ONCE(!l)) {
+ err = -ENOENT;
goto out_free_new_fa;
+ }
if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) ==
new_fa) {
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index df0660d818ac..81be3e0f0e70 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -213,7 +213,7 @@ static void igmp_stop_timer(struct ip_mc_list *im)
/* It must be called with locked im->lock */
static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
{
- int tv = prandom_u32() % max_delay;
+ int tv = prandom_u32_max(max_delay);
im->tm_running = 1;
if (!mod_timer(&im->timer, jiffies+tv+2))
@@ -222,7 +222,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
static void igmp_gq_start_timer(struct in_device *in_dev)
{
- int tv = prandom_u32() % in_dev->mr_maxdelay;
+ int tv = prandom_u32_max(in_dev->mr_maxdelay);
unsigned long exp = jiffies + tv + 2;
if (in_dev->mr_gq_running &&
@@ -236,7 +236,7 @@ static void igmp_gq_start_timer(struct in_device *in_dev)
static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
{
- int tv = prandom_u32() % delay;
+ int tv = prandom_u32_max(delay);
if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))
in_dev_hold(in_dev);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index ebca860e113f..4a34bc7cb15e 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -314,7 +314,7 @@ other_half_scan:
if (likely(remaining > 1))
remaining &= ~1U;
- offset = prandom_u32() % remaining;
+ offset = prandom_u32_max(remaining);
/* __inet_hash_connect() favors ports having @low parity
* We do the opposite to not pollute connect() users.
*/
@@ -471,11 +471,11 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
bool found_port = false, check_bind_conflict = true;
bool bhash_created = false, bhash2_created = false;
+ int ret = -EADDRINUSE, port = snum, l3mdev;
struct inet_bind_hashbucket *head, *head2;
struct inet_bind2_bucket *tb2 = NULL;
struct inet_bind_bucket *tb = NULL;
bool head2_lock_acquired = false;
- int ret = 1, port = snum, l3mdev;
struct net *net = sock_net(sk);
l3mdev = inet_sk_bound_l3mdev(sk);
@@ -1186,7 +1186,7 @@ int inet_csk_listen_start(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet = inet_sk(sk);
- int err = -EADDRINUSE;
+ int err;
reqsk_queue_alloc(&icsk->icsk_accept_queue);
@@ -1202,7 +1202,8 @@ int inet_csk_listen_start(struct sock *sk)
* after validation is complete.
*/
inet_sk_state_store(sk, TCP_LISTEN);
- if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
+ err = sk->sk_prot->get_port(sk, inet->inet_num);
+ if (!err) {
inet->inet_sport = htons(inet->inet_num);
sk_dst_reset(sk);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index c9f9ac5013a7..7072fc0783ef 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -133,6 +133,7 @@ static void inet_frags_free_cb(void *ptr, void *arg)
count = del_timer_sync(&fq->timer) ? 1 : 0;
spin_lock_bh(&fq->lock);
+ fq->flags |= INET_FRAG_DROP;
if (!(fq->flags & INET_FRAG_COMPLETE)) {
fq->flags |= INET_FRAG_COMPLETE;
count++;
@@ -260,7 +261,8 @@ static void inet_frag_destroy_rcu(struct rcu_head *head)
kmem_cache_free(f->frags_cachep, q);
}
-unsigned int inet_frag_rbtree_purge(struct rb_root *root)
+unsigned int inet_frag_rbtree_purge(struct rb_root *root,
+ enum skb_drop_reason reason)
{
struct rb_node *p = rb_first(root);
unsigned int sum = 0;
@@ -274,7 +276,7 @@ unsigned int inet_frag_rbtree_purge(struct rb_root *root)
struct sk_buff *next = FRAG_CB(skb)->next_frag;
sum += skb->truesize;
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
skb = next;
}
}
@@ -284,17 +286,21 @@ EXPORT_SYMBOL(inet_frag_rbtree_purge);
void inet_frag_destroy(struct inet_frag_queue *q)
{
- struct fqdir *fqdir;
unsigned int sum, sum_truesize = 0;
+ enum skb_drop_reason reason;
struct inet_frags *f;
+ struct fqdir *fqdir;
WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
+ reason = (q->flags & INET_FRAG_DROP) ?
+ SKB_DROP_REASON_FRAG_REASM_TIMEOUT :
+ SKB_CONSUMED;
WARN_ON(del_timer(&q->timer) != 0);
/* Release all fragment data. */
fqdir = q->fqdir;
f = fqdir->f;
- sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
+ sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason);
sum = sum_truesize + f->qsize;
call_rcu(&q->rcu, inet_frag_destroy_rcu);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index a0ad34e4f044..3cec471a2cd2 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -858,34 +858,80 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in
return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
}
-int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk)
+static void inet_update_saddr(struct sock *sk, void *saddr, int family)
+{
+ if (family == AF_INET) {
+ inet_sk(sk)->inet_saddr = *(__be32 *)saddr;
+ sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else {
+ sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr;
+ }
+#endif
+}
+
+static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
{
struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_bind_hashbucket *head, *head2;
struct inet_bind2_bucket *tb2, *new_tb2;
int l3mdev = inet_sk_bound_l3mdev(sk);
- struct inet_bind_hashbucket *head2;
int port = inet_sk(sk)->inet_num;
struct net *net = sock_net(sk);
+ int bhash;
+
+ if (!inet_csk(sk)->icsk_bind2_hash) {
+ /* Not bind()ed before. */
+ if (reset)
+ inet_reset_saddr(sk);
+ else
+ inet_update_saddr(sk, saddr, family);
+
+ return 0;
+ }
/* Allocate a bind2 bucket ahead of time to avoid permanently putting
* the bhash2 table in an inconsistent state if a new tb2 bucket
* allocation fails.
*/
new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC);
- if (!new_tb2)
+ if (!new_tb2) {
+ if (reset) {
+ /* The (INADDR_ANY, port) bucket might have already
+ * been freed, then we cannot fixup icsk_bind2_hash,
+ * so we give up and unlink sk from bhash/bhash2 not
+ * to leave inconsistency in bhash2.
+ */
+ inet_put_port(sk);
+ inet_reset_saddr(sk);
+ }
+
return -ENOMEM;
+ }
+ bhash = inet_bhashfn(net, port, hinfo->bhash_size);
+ head = &hinfo->bhash[bhash];
head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
- if (prev_saddr) {
- spin_lock_bh(&prev_saddr->lock);
- __sk_del_bind2_node(sk);
- inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep,
- inet_csk(sk)->icsk_bind2_hash);
- spin_unlock_bh(&prev_saddr->lock);
- }
+ /* If we change saddr locklessly, another thread
+ * iterating over bhash might see corrupted address.
+ */
+ spin_lock_bh(&head->lock);
- spin_lock_bh(&head2->lock);
+ spin_lock(&head2->lock);
+ __sk_del_bind2_node(sk);
+ inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash);
+ spin_unlock(&head2->lock);
+
+ if (reset)
+ inet_reset_saddr(sk);
+ else
+ inet_update_saddr(sk, saddr, family);
+
+ head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
+
+ spin_lock(&head2->lock);
tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
if (!tb2) {
tb2 = new_tb2;
@@ -893,26 +939,40 @@ int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct soc
}
sk_add_bind2_node(sk, &tb2->owners);
inet_csk(sk)->icsk_bind2_hash = tb2;
- spin_unlock_bh(&head2->lock);
+ spin_unlock(&head2->lock);
+
+ spin_unlock_bh(&head->lock);
if (tb2 != new_tb2)
kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2);
return 0;
}
+
+int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
+{
+ return __inet_bhash2_update_saddr(sk, saddr, family, false);
+}
EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr);
+void inet_bhash2_reset_saddr(struct sock *sk)
+{
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ __inet_bhash2_update_saddr(sk, NULL, 0, true);
+}
+EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr);
+
/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm
* Note that we use 32bit integers (vs RFC 'short integers')
* because 2^16 is not a multiple of num_ephemeral and this
* property might be used by clever attacker.
+ *
* RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though
- * attacks were since demonstrated, thus we use 65536 instead to really
- * give more isolation and privacy, at the expense of 256kB of kernel
- * memory.
+ * attacks were since demonstrated, thus we use 65536 by default instead
+ * to really give more isolation and privacy, at the expense of 256kB
+ * of kernel memory.
*/
-#define INET_TABLE_PERTURB_SHIFT 16
-#define INET_TABLE_PERTURB_SIZE (1 << INET_TABLE_PERTURB_SHIFT)
+#define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER)
static u32 *table_perturb;
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
@@ -1037,7 +1097,7 @@ ok:
* on low contention the randomness is maximal and on high contention
* it may be inexistent.
*/
- i = max_t(int, i, (prandom_u32() & 7) * 2);
+ i = max_t(int, i, prandom_u32_max(8) * 2);
WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
/* Head lock still held and bh's disabled */
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index fb153569889e..69c00ffdcf3e 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -153,6 +153,7 @@ static void ip_expire(struct timer_list *t)
if (qp->q.flags & INET_FRAG_COMPLETE)
goto out;
+ qp->q.flags |= INET_FRAG_DROP;
ipq_kill(qp);
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
@@ -194,7 +195,7 @@ out:
spin_unlock(&qp->q.lock);
out_rcu_unlock:
rcu_read_unlock();
- kfree_skb(head);
+ kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT);
ipq_put(qp);
}
@@ -254,7 +255,8 @@ static int ip_frag_reinit(struct ipq *qp)
return -ETIMEDOUT;
}
- sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
+ sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments,
+ SKB_DROP_REASON_FRAG_TOO_FAR);
sub_frag_mem_limit(qp->q.fqdir, sum_truesize);
qp->q.flags = 0;
@@ -278,10 +280,14 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
struct net_device *dev;
unsigned int fragsize;
int err = -ENOENT;
+ SKB_DR(reason);
u8 ecn;
- if (qp->q.flags & INET_FRAG_COMPLETE)
+ /* If reassembly is already done, @skb must be a duplicate frag. */
+ if (qp->q.flags & INET_FRAG_COMPLETE) {
+ SKB_DR_SET(reason, DUP_FRAG);
goto err;
+ }
if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
unlikely(ip_frag_too_far(qp)) &&
@@ -382,8 +388,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
insert_error:
if (err == IPFRAG_DUP) {
- kfree_skb(skb);
- return -EINVAL;
+ SKB_DR_SET(reason, DUP_FRAG);
+ err = -EINVAL;
+ goto err;
}
err = -EINVAL;
__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
@@ -391,7 +398,7 @@ discard_qp:
inet_frag_kill(&qp->q);
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
err:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return err;
}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f866d6282b2b..a4ccef3e6935 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -510,7 +510,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
err_free_skb:
kfree_skb(skb);
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
}
static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -592,7 +592,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
err_free_skb:
kfree_skb(skb);
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
}
static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
@@ -663,7 +663,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
free_skb:
kfree_skb(skb);
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
return NETDEV_TX_OK;
}
@@ -717,7 +717,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
free_skb:
kfree_skb(skb);
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
return NETDEV_TX_OK;
}
@@ -745,7 +745,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
free_skb:
kfree_skb(skb);
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
return NETDEV_TX_OK;
}
@@ -1665,7 +1665,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
if (err)
goto out;
- err = rtnl_configure_link(dev, NULL);
+ err = rtnl_configure_link(dev, NULL, 0, NULL);
if (err < 0)
goto out;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 1b512390b3cf..e880ce77322a 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -366,6 +366,11 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
iph->tos, dev);
if (unlikely(err))
goto drop_error;
+ } else {
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+ if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY))
+ IPCB(skb)->flags |= IPSKB_NOPOLICY;
}
#ifdef CONFIG_IP_ROUTE_CLASSID
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 1ae83ad629b2..922c87ef1ab5 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -172,7 +172,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
* Avoid using the hashed IP ident generator.
*/
if (sk->sk_protocol == IPPROTO_TCP)
- iph->id = (__force __be16)prandom_u32();
+ iph->id = (__force __be16)get_random_u16();
else
__ip_select_ident(net, iph, 1);
}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 6e19cad154f5..9f92ae35bb01 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -267,7 +267,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
}
#endif
if (cmsg->cmsg_level == SOL_SOCKET) {
- err = __sock_cmsg_send(sk, msg, cmsg, &ipc->sockc);
+ err = __sock_cmsg_send(sk, cmsg, &ipc->sockc);
if (err)
return err;
continue;
@@ -433,6 +433,7 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
}
kfree_skb(skb);
}
+EXPORT_SYMBOL_GPL(ip_icmp_error);
void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info)
{
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 019f3b0839c5..de90b09dfe78 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -368,23 +368,23 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
#ifdef CONFIG_NET_IPGRE_BROADCAST
if (ipv4_is_multicast(iph->daddr)) {
- tunnel->dev->stats.multicast++;
+ DEV_STATS_INC(tunnel->dev, multicast);
skb->pkt_type = PACKET_BROADCAST;
}
#endif
if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
- tunnel->dev->stats.rx_crc_errors++;
- tunnel->dev->stats.rx_errors++;
+ DEV_STATS_INC(tunnel->dev, rx_crc_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto drop;
}
if (tunnel->parms.i_flags&TUNNEL_SEQ) {
if (!(tpi->flags&TUNNEL_SEQ) ||
(tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
- tunnel->dev->stats.rx_fifo_errors++;
- tunnel->dev->stats.rx_errors++;
+ DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto drop;
}
tunnel->i_seqno = ntohl(tpi->seq) + 1;
@@ -398,8 +398,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
&iph->saddr, iph->tos);
if (err > 1) {
- ++tunnel->dev->stats.rx_frame_errors;
- ++tunnel->dev->stats.rx_errors;
+ DEV_STATS_INC(tunnel->dev, rx_frame_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto drop;
}
}
@@ -581,7 +581,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
if (!rt) {
rt = ip_route_output_key(tunnel->net, &fl4);
if (IS_ERR(rt)) {
- dev->stats.tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
goto tx_error;
}
if (use_cache)
@@ -590,7 +590,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
}
if (rt->dst.dev == dev) {
ip_rt_put(rt);
- dev->stats.collisions++;
+ DEV_STATS_INC(dev, collisions);
goto tx_error;
}
@@ -625,10 +625,10 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
df, !net_eq(tunnel->net, dev_net(dev)));
return;
tx_error:
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
goto kfree;
tx_dropped:
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
kfree:
kfree_skb(skb);
}
@@ -662,7 +662,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
/* NBMA tunnel */
if (!skb_dst(skb)) {
- dev->stats.tx_fifo_errors++;
+ DEV_STATS_INC(dev, tx_fifo_errors);
goto tx_error;
}
@@ -749,7 +749,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
rt = ip_route_output_key(tunnel->net, &fl4);
if (IS_ERR(rt)) {
- dev->stats.tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
goto tx_error;
}
if (use_cache)
@@ -762,7 +762,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
if (rt->dst.dev == dev) {
ip_rt_put(rt);
- dev->stats.collisions++;
+ DEV_STATS_INC(dev, collisions);
goto tx_error;
}
@@ -805,7 +805,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
if (skb_cow_head(skb, dev->needed_headroom)) {
ip_rt_put(rt);
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
kfree_skb(skb);
return;
}
@@ -819,7 +819,7 @@ tx_error_icmp:
dst_link_failure(skb);
#endif
tx_error:
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 8c2bd1d9ddce..53bfd8af6920 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -107,8 +107,8 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
dev = tunnel->dev;
if (err) {
- dev->stats.rx_errors++;
- dev->stats.rx_dropped++;
+ DEV_STATS_INC(dev, rx_errors);
+ DEV_STATS_INC(dev, rx_dropped);
return 0;
}
@@ -183,7 +183,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
if (IS_ERR(rt)) {
- dev->stats.tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
goto tx_error_icmp;
}
dst = &rt->dst;
@@ -198,14 +198,14 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
if (dst->error) {
dst_release(dst);
dst = NULL;
- dev->stats.tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
goto tx_error_icmp;
}
skb_dst_set(skb, dst);
break;
#endif
default:
- dev->stats.tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
goto tx_error_icmp;
}
}
@@ -213,7 +213,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
dst_hold(dst);
dst = xfrm_lookup_route(tunnel->net, dst, fl, NULL, 0);
if (IS_ERR(dst)) {
- dev->stats.tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
goto tx_error_icmp;
}
@@ -221,7 +221,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
goto xmit;
if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) {
- dev->stats.tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
dst_release(dst);
goto tx_error_icmp;
}
@@ -230,7 +230,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
if (tdev == dev) {
dst_release(dst);
- dev->stats.collisions++;
+ DEV_STATS_INC(dev, collisions);
goto tx_error;
}
@@ -267,7 +267,7 @@ xmit:
tx_error_icmp:
dst_link_failure(skb);
tx_error:
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -304,7 +304,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
return vti_xmit(skb, dev, &fl);
tx_err:
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
kfree_skb(skb);
return NETDEV_TX_OK;
}
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 180f9daf5bec..abea77759b7e 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -310,7 +310,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
tx_error:
kfree_skb(skb);
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
return NETDEV_TX_OK;
}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e04544ac4b45..b58df3c1bf7d 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -506,8 +506,8 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
return err;
}
- dev->stats.tx_bytes += skb->len;
- dev->stats.tx_packets++;
+ DEV_STATS_ADD(dev, tx_bytes, skb->len);
+ DEV_STATS_INC(dev, tx_packets);
rcu_read_lock();
/* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
@@ -1839,8 +1839,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
if (vif->flags & VIFF_REGISTER) {
WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
- vif_dev->stats.tx_bytes += skb->len;
- vif_dev->stats.tx_packets++;
+ DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
+ DEV_STATS_INC(vif_dev, tx_packets);
ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
goto out_free;
}
@@ -1898,8 +1898,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
if (vif->flags & VIFF_TUNNEL) {
ip_encap(net, skb, vif->local, vif->remote);
/* FIXME: extra output firewall step used to be here. --RR */
- vif_dev->stats.tx_packets++;
- vif_dev->stats.tx_bytes += skb->len;
+ DEV_STATS_INC(vif_dev, tx_packets);
+ DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
}
IPCB(skb)->flags |= IPSKB_FORWARDED;
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 25ea6ac44db9..7fcfdfd8f9de 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -14,9 +14,6 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
struct nlattr *nla;
int remaining;
- if (!fc_mx)
- return 0;
-
nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) {
int type = nla_type(nla);
u32 val;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index f8e176c77d1c..b3cc416ed292 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -435,7 +435,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
switch (ctinfo) {
case IP_CT_NEW:
- ct->mark = hash;
+ WRITE_ONCE(ct->mark, hash);
break;
case IP_CT_RELATED:
case IP_CT_RELATED_REPLY:
@@ -452,7 +452,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
#ifdef DEBUG
nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
#endif
- pr_debug("hash=%u ct_hash=%u ", hash, ct->mark);
+ pr_debug("hash=%u ct_hash=%u ", hash, READ_ONCE(ct->mark));
if (!clusterip_responsible(cipinfo->config, hash)) {
pr_debug("not responsible\n");
return NF_DROP;
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index ff85db52b2e5..ded5bef02f77 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -78,6 +78,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
flow.flowi4_tos = iph->tos & IPTOS_RT_MASK;
flow.flowi4_scope = RT_SCOPE_UNIVERSE;
flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par));
+ flow.flowi4_uid = sock_net_uid(xt_net(par), NULL);
return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert;
}
diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c
index 0bcd6aee6000..a522c3a3be52 100644
--- a/net/ipv4/netfilter/nft_dup_ipv4.c
+++ b/net/ipv4/netfilter/nft_dup_ipv4.c
@@ -52,7 +52,8 @@ static int nft_dup_ipv4_init(const struct nft_ctx *ctx,
return err;
}
-static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_dup_ipv4_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index e886147eed11..9eee535c64dd 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -65,6 +65,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
struct flowi4 fl4 = {
.flowi4_scope = RT_SCOPE_UNIVERSE,
.flowi4_iif = LOOPBACK_IFINDEX,
+ .flowi4_uid = sock_net_uid(nft_net(pkt), NULL),
};
const struct net_device *oif;
const struct net_device *found;
@@ -137,12 +138,11 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
break;
}
- if (!oif) {
- found = FIB_RES_DEV(res);
+ if (!oif) {
+ found = FIB_RES_DEV(res);
} else {
if (!fib_info_nh_uses_dev(res.fi, oif))
return;
-
found = oif;
}
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 853a75a8fbaf..d8ef05347fd9 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -2534,7 +2534,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
if (!err) {
nh->nh_flags = fib_nh->fib_nh_flags;
fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
- fib_nh->fib_nh_scope);
+ !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1);
} else {
fib_nh_release(net, fib_nh);
}
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index bde333b24837..bb9854c2b7a1 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -138,7 +138,7 @@ next_port:
fail:
spin_unlock(&ping_table.lock);
- return 1;
+ return -EADDRINUSE;
}
EXPORT_SYMBOL_GPL(ping_get_port);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 5386f460bd20..f88daace9de3 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -297,6 +297,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS),
SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS),
SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE),
+ SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 795cbe1de912..cd1fa9f70f1a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3664,7 +3664,7 @@ static __net_init int rt_genid_init(struct net *net)
{
atomic_set(&net->ipv4.rt_genid, 0);
atomic_set(&net->fnhe_genid, 0);
- atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
+ atomic_set(&net->ipv4.dev_addr_genid, get_random_u32());
return 0;
}
@@ -3719,7 +3719,7 @@ int __init ip_rt_init(void)
ip_idents = idents_hash;
- prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
+ get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 9b8a6db7a66b..0d0cc4ef2b85 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -40,6 +40,9 @@ static int one_day_secs = 24 * 3600;
static u32 fib_multipath_hash_fields_all_mask __maybe_unused =
FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
+static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
+static int tcp_plb_max_rounds = 31;
+static int tcp_plb_max_cong_thresh = 256;
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -400,12 +403,36 @@ static int proc_tcp_ehash_entries(struct ctl_table *table, int write,
if (!net_eq(net, &init_net) && !hinfo->pernet)
tcp_ehash_entries *= -1;
+ memset(&tbl, 0, sizeof(tbl));
tbl.data = &tcp_ehash_entries;
tbl.maxlen = sizeof(int);
return proc_dointvec(&tbl, write, buffer, lenp, ppos);
}
+static int proc_udp_hash_entries(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_udp_child_hash_entries);
+ int udp_hash_entries;
+ struct ctl_table tbl;
+
+ udp_hash_entries = net->ipv4.udp_table->mask + 1;
+
+ /* A negative number indicates that the child netns
+ * shares the global udp_table.
+ */
+ if (!net_eq(net, &init_net) && net->ipv4.udp_table == &udp_table)
+ udp_hash_entries *= -1;
+
+ memset(&tbl, 0, sizeof(tbl));
+ tbl.data = &udp_hash_entries;
+ tbl.maxlen = sizeof(int);
+
+ return proc_dointvec(&tbl, write, buffer, lenp, ppos);
+}
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
void *buffer, size_t *lenp,
@@ -1360,6 +1387,21 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = &tcp_child_ehash_entries_max,
},
{
+ .procname = "udp_hash_entries",
+ .data = &init_net.ipv4.sysctl_udp_child_hash_entries,
+ .mode = 0444,
+ .proc_handler = proc_udp_hash_entries,
+ },
+ {
+ .procname = "udp_child_hash_entries",
+ .data = &init_net.ipv4.sysctl_udp_child_hash_entries,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &udp_child_hash_entries_max,
+ },
+ {
.procname = "udp_rmem_min",
.data = &init_net.ipv4.sysctl_udp_rmem_min,
.maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min),
@@ -1384,6 +1426,47 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
},
+ {
+ .procname = "tcp_plb_enabled",
+ .data = &init_net.ipv4.sysctl_tcp_plb_enabled,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "tcp_plb_idle_rehash_rounds",
+ .data = &init_net.ipv4.sysctl_tcp_plb_idle_rehash_rounds,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra2 = &tcp_plb_max_rounds,
+ },
+ {
+ .procname = "tcp_plb_rehash_rounds",
+ .data = &init_net.ipv4.sysctl_tcp_plb_rehash_rounds,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra2 = &tcp_plb_max_rounds,
+ },
+ {
+ .procname = "tcp_plb_suspend_rto_sec",
+ .data = &init_net.ipv4.sysctl_tcp_plb_suspend_rto_sec,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_plb_cong_thresh",
+ .data = &init_net.ipv4.sysctl_tcp_plb_cong_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &tcp_plb_max_cong_thresh,
+ },
{ }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f8232811a5be..24602a5184b0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk)
WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
+ set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
sk_sockets_allocated_inc(sk);
}
EXPORT_SYMBOL(tcp_init_sock);
@@ -3113,8 +3114,7 @@ int tcp_disconnect(struct sock *sk, int flags)
inet->inet_dport = 0;
- if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
- inet_reset_saddr(sk);
+ inet_bhash2_reset_saddr(sk);
sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE);
@@ -3175,6 +3175,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->sacked_out = 0;
tp->tlp_high_seq = 0;
tp->last_oow_ack_time = 0;
+ tp->plb_rehash = 0;
/* There's a bubble in the pipe until at least the first ACK. */
tp->app_limited = ~0U;
tp->rack.mstamp = 0;
@@ -3646,7 +3647,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
case TCP_REPAIR_OPTIONS:
if (!tp->repair)
err = -EINVAL;
- else if (sk->sk_state == TCP_ESTABLISHED)
+ else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent)
err = tcp_repair_options_est(sk, optval, optlen);
else
err = -EPERM;
@@ -3938,6 +3939,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_reord_seen = tp->reord_seen;
info->tcpi_rcv_ooopack = tp->rcv_ooopack;
info->tcpi_snd_wnd = tp->snd_wnd;
+ info->tcpi_rcv_wnd = tp->rcv_wnd;
+ info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash;
info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
unlock_sock_fast(sk, slow);
}
@@ -3972,6 +3975,7 @@ static size_t tcp_opt_stats_get_size(void)
nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */
0;
}
@@ -4048,6 +4052,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
nla_put_u8(stats, TCP_NLA_TTL,
tcp_skb_ttl_or_hop_limit(ack_skb));
+ nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash);
return stats;
}
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index a1626afe87a1..cf9c3e8f7ccb 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -278,7 +278,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
{
bool cork = false, enospc = sk_msg_full(msg);
struct sock *sk_redir;
- u32 tosend, delta = 0;
+ u32 tosend, origsize, sent, delta = 0;
u32 eval = __SK_NONE;
int ret;
@@ -333,10 +333,12 @@ more_data:
cork = true;
psock->cork = NULL;
}
- sk_msg_return(sk, msg, msg->sg.size);
+ sk_msg_return(sk, msg, tosend);
release_sock(sk);
+ origsize = msg->sg.size;
ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
+ sent = origsize - msg->sg.size;
if (eval == __SK_REDIRECT)
sock_put(sk_redir);
@@ -375,7 +377,7 @@ more_data:
msg->sg.data[msg->sg.start].page_link &&
msg->sg.data[msg->sg.start].length) {
if (eval == __SK_REDIRECT)
- sk_mem_charge(sk, msg->sg.size);
+ sk_mem_charge(sk, tosend - sent);
goto more_data;
}
}
@@ -607,7 +609,7 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
} else {
sk->sk_write_space = psock->saved_write_space;
/* Pairs with lockless read in sk_clone_lock() */
- WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+ sock_replace_proto(sk, psock->sk_proto);
}
return 0;
}
@@ -620,7 +622,7 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
}
/* Pairs with lockless read in sk_clone_lock() */
- WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]);
+ sock_replace_proto(sk, &tcp_bpf_prots[family][config]);
return 0;
}
EXPORT_SYMBOL_GPL(tcp_bpf_update_proto);
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 112f28f93693..ba4d98e510e0 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -243,7 +243,7 @@ static bool tcp_cdg_backoff(struct sock *sk, u32 grad)
struct cdg *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- if (prandom_u32() <= nexp_u32(grad * backoff_factor))
+ if (get_random_u32() <= nexp_u32(grad * backoff_factor))
return false;
if (use_ineff) {
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 2a6c0dd665a4..e0a2ca7456ff 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -54,6 +54,7 @@ struct dctcp {
u32 next_seq;
u32 ce_state;
u32 loss_cwnd;
+ struct tcp_plb_state plb;
};
static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
@@ -91,6 +92,8 @@ static void dctcp_init(struct sock *sk)
ca->ce_state = 0;
dctcp_reset(tp, ca);
+ tcp_plb_init(sk, &ca->plb);
+
return;
}
@@ -117,14 +120,28 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags)
/* Expired RTT */
if (!before(tp->snd_una, ca->next_seq)) {
+ u32 delivered = tp->delivered - ca->old_delivered;
u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce;
u32 alpha = ca->dctcp_alpha;
+ u32 ce_ratio = 0;
+
+ if (delivered > 0) {
+ /* dctcp_alpha keeps EWMA of fraction of ECN marked
+ * packets. Because of EWMA smoothing, PLB reaction can
+ * be slow so we use ce_ratio which is an instantaneous
+ * measure of congestion. ce_ratio is the fraction of
+ * ECN marked packets in the previous RTT.
+ */
+ if (delivered_ce > 0)
+ ce_ratio = (delivered_ce << TCP_PLB_SCALE) / delivered;
+ tcp_plb_update_state(sk, &ca->plb, (int)ce_ratio);
+ tcp_plb_check_rehash(sk, &ca->plb);
+ }
/* alpha = (1 - g) * alpha + g * F */
alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
if (delivered_ce) {
- u32 delivered = tp->delivered - ca->old_delivered;
/* If dctcp_shift_g == 1, a 32bit value would overflow
* after 8 M packets.
@@ -172,8 +189,12 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state);
break;
case CA_EVENT_LOSS:
+ tcp_plb_update_state_upon_rto(sk, &ca->plb);
dctcp_react_to_loss(sk);
break;
+ case CA_EVENT_TX_START:
+ tcp_plb_check_rehash(sk, &ca->plb); /* Maybe rehash when inflight is 0 */
+ break;
default:
/* Don't care for the rest. */
break;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bc2ea12221f9..1efacbe948da 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2192,7 +2192,8 @@ void tcp_enter_loss(struct sock *sk)
*/
static bool tcp_check_sack_reneging(struct sock *sk, int flag)
{
- if (flag & FLAG_SACK_RENEGING) {
+ if (flag & FLAG_SACK_RENEGING &&
+ flag & FLAG_SND_UNA_ADVANCED) {
struct tcp_sock *tp = tcp_sk(sk);
unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
msecs_to_jiffies(10));
@@ -4763,8 +4764,8 @@ static void tcp_ofo_queue(struct sock *sk)
}
}
-static bool tcp_prune_ofo_queue(struct sock *sk);
-static int tcp_prune_queue(struct sock *sk);
+static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
+static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
unsigned int size)
@@ -4772,11 +4773,11 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
!sk_rmem_schedule(sk, skb, size)) {
- if (tcp_prune_queue(sk) < 0)
+ if (tcp_prune_queue(sk, skb) < 0)
return -1;
while (!sk_rmem_schedule(sk, skb, size)) {
- if (!tcp_prune_ofo_queue(sk))
+ if (!tcp_prune_ofo_queue(sk, skb))
return -1;
}
}
@@ -5328,6 +5329,8 @@ new_range:
* Clean the out-of-order queue to make room.
* We drop high sequences packets to :
* 1) Let a chance for holes to be filled.
+ * This means we do not drop packets from ooo queue if their sequence
+ * is before incoming packet sequence.
* 2) not add too big latencies if thousands of packets sit there.
* (But if application shrinks SO_RCVBUF, we could still end up
* freeing whole queue here)
@@ -5335,24 +5338,31 @@ new_range:
*
* Return true if queue has shrunk.
*/
-static bool tcp_prune_ofo_queue(struct sock *sk)
+static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct rb_node *node, *prev;
+ bool pruned = false;
int goal;
if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
return false;
- NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
goal = sk->sk_rcvbuf >> 3;
node = &tp->ooo_last_skb->rbnode;
+
do {
+ struct sk_buff *skb = rb_to_skb(node);
+
+ /* If incoming skb would land last in ofo queue, stop pruning. */
+ if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq))
+ break;
+ pruned = true;
prev = rb_prev(node);
rb_erase(node, &tp->out_of_order_queue);
- goal -= rb_to_skb(node)->truesize;
- tcp_drop_reason(sk, rb_to_skb(node),
- SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
+ goal -= skb->truesize;
+ tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
+ tp->ooo_last_skb = rb_to_skb(prev);
if (!prev || goal <= 0) {
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
!tcp_under_memory_pressure(sk))
@@ -5361,16 +5371,18 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
}
node = prev;
} while (node);
- tp->ooo_last_skb = rb_to_skb(prev);
- /* Reset SACK state. A conforming SACK implementation will
- * do the same at a timeout based retransmit. When a connection
- * is in a sad state like this, we care only about integrity
- * of the connection not performance.
- */
- if (tp->rx_opt.sack_ok)
- tcp_sack_reset(&tp->rx_opt);
- return true;
+ if (pruned) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+ /* Reset SACK state. A conforming SACK implementation will
+ * do the same at a timeout based retransmit. When a connection
+ * is in a sad state like this, we care only about integrity
+ * of the connection not performance.
+ */
+ if (tp->rx_opt.sack_ok)
+ tcp_sack_reset(&tp->rx_opt);
+ }
+ return pruned;
}
/* Reduce allocated memory if we can, trying to get
@@ -5380,7 +5392,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
* until the socket owning process reads some of the data
* to stabilize the situation.
*/
-static int tcp_prune_queue(struct sock *sk)
+static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -5407,7 +5419,7 @@ static int tcp_prune_queue(struct sock *sk)
/* Collapsing did not help, destructive actions follow.
* This must not ever occur. */
- tcp_prune_ofo_queue(sk);
+ tcp_prune_ofo_queue(sk, in_skb);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0;
@@ -6829,10 +6841,18 @@ static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
#endif
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
- if (!queue->synflood_warned && syncookies != 2 &&
- xchg(&queue->synflood_warned, 1) == 0)
- net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
- proto, sk->sk_num, msg);
+ if (!READ_ONCE(queue->synflood_warned) && syncookies != 2 &&
+ xchg(&queue->synflood_warned, 1) == 0) {
+ if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) {
+ net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n",
+ proto, inet6_rcv_saddr(sk),
+ sk->sk_num, msg);
+ } else {
+ net_info_ratelimited("%s: Possible SYN flooding on port %pI4:%u. %s.\n",
+ proto, &sk->sk_rcv_saddr,
+ sk->sk_num, msg);
+ }
+ }
return want_cookie;
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6376ad915765..1215fa4c1b9f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -199,15 +199,14 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
- struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct inet_timewait_death_row *tcp_death_row;
- __be32 daddr, nexthop, prev_sk_rcv_saddr;
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct ip_options_rcu *inet_opt;
struct net *net = sock_net(sk);
__be16 orig_sport, orig_dport;
+ __be32 daddr, nexthop;
struct flowi4 *fl4;
struct rtable *rt;
int err;
@@ -251,24 +250,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
if (!inet->inet_saddr) {
- if (inet_csk(sk)->icsk_bind2_hash) {
- prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo,
- sk, net, inet->inet_num);
- prev_sk_rcv_saddr = sk->sk_rcv_saddr;
- }
- inet->inet_saddr = fl4->saddr;
- }
-
- sk_rcv_saddr_set(sk, inet->inet_saddr);
-
- if (prev_addr_hashbucket) {
- err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
+ err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
if (err) {
- inet->inet_saddr = 0;
- sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
ip_rt_put(rt);
return err;
}
+ } else {
+ sk_rcv_saddr_set(sk, inet->inet_saddr);
}
if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
@@ -323,7 +311,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_daddr);
}
- inet->inet_id = prandom_u32();
+ inet->inet_id = get_random_u16();
if (tcp_fastopen_defer_connect(sk, &err))
return err;
@@ -343,6 +331,7 @@ failure:
* if necessary.
*/
tcp_set_state(sk, TCP_CLOSE);
+ inet_bhash2_reset_saddr(sk);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->inet_dport = 0;
@@ -1543,7 +1532,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (inet_opt)
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
- newinet->inet_id = prandom_u32();
+ newinet->inet_id = get_random_u16();
/* Set ToS of the new socket based upon the value of incoming SYN.
* ECT bits are set later in tcp_init_transfer().
@@ -1874,11 +1863,13 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
__skb_push(skb, hdrlen);
no_coalesce:
+ limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
+
/* Only socket owner can try to collapse/prune rx queues
* to reduce memory overhead, so add a little headroom here.
* Few sockets backlog are possibly concurrently non empty.
*/
- limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
+ limit += 64 * 1024;
if (unlikely(sk_add_backlog(sk, skb, limit))) {
bh_unlock_sock(sk);
@@ -2478,7 +2469,6 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
case TCP_SEQ_STATE_LISTENING:
if (st->bucket > hinfo->lhash2_mask)
break;
- st->state = TCP_SEQ_STATE_LISTENING;
rc = listening_get_first(seq);
while (offset-- && rc && bucket == st->bucket)
rc = listening_get_next(seq, rc);
@@ -3216,6 +3206,14 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
atomic_set(&net->ipv4.tfo_active_disable_times, 0);
+ /* Set default values for PLB */
+ net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
+ net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
+ net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
+ net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
+ /* Default congestion threshold for PLB to mark a round is 50% */
+ net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
+
/* Reno is always built in */
if (!net_eq(net, &init_net) &&
bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c69f4d966024..894410dc9293 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1077,15 +1077,15 @@ static void tcp_tasklet_func(struct tasklet_struct *t)
*/
void tcp_release_cb(struct sock *sk)
{
- unsigned long flags, nflags;
+ unsigned long flags = smp_load_acquire(&sk->sk_tsq_flags);
+ unsigned long nflags;
/* perform an atomic operation only if at least one flag is set */
do {
- flags = sk->sk_tsq_flags;
if (!(flags & TCP_DEFERRED_ALL))
return;
nflags = flags & ~TCP_DEFERRED_ALL;
- } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
+ } while (!try_cmpxchg(&sk->sk_tsq_flags, &flags, nflags));
if (flags & TCPF_TSQ_DEFERRED) {
tcp_tsq_write(sk);
@@ -1139,6 +1139,8 @@ void tcp_wfree(struct sk_buff *skb)
struct sock *sk = skb->sk;
struct tcp_sock *tp = tcp_sk(sk);
unsigned long flags, nval, oval;
+ struct tsq_tasklet *tsq;
+ bool empty;
/* Keep one reference on sk_wmem_alloc.
* Will be released by sk_free() from here or tcp_tasklet_func()
@@ -1155,28 +1157,23 @@ void tcp_wfree(struct sk_buff *skb)
if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
goto out;
- for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
- struct tsq_tasklet *tsq;
- bool empty;
-
+ oval = smp_load_acquire(&sk->sk_tsq_flags);
+ do {
if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
goto out;
nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
- nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
- if (nval != oval)
- continue;
+ } while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval));
- /* queue this socket to tasklet queue */
- local_irq_save(flags);
- tsq = this_cpu_ptr(&tsq_tasklet);
- empty = list_empty(&tsq->head);
- list_add(&tp->tsq_node, &tsq->head);
- if (empty)
- tasklet_schedule(&tsq->tasklet);
- local_irq_restore(flags);
- return;
- }
+ /* queue this socket to tasklet queue */
+ local_irq_save(flags);
+ tsq = this_cpu_ptr(&tsq_tasklet);
+ empty = list_empty(&tsq->head);
+ list_add(&tp->tsq_node, &tsq->head);
+ if (empty)
+ tasklet_schedule(&tsq->tasklet);
+ local_irq_restore(flags);
+ return;
out:
sk_free(sk);
}
diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c
new file mode 100644
index 000000000000..bb1a08fda113
--- /dev/null
+++ b/net/ipv4/tcp_plb.c
@@ -0,0 +1,109 @@
+/* Protective Load Balancing (PLB)
+ *
+ * PLB was designed to reduce link load imbalance across datacenter
+ * switches. PLB is a host-based optimization; it leverages congestion
+ * signals from the transport layer to randomly change the path of the
+ * connection experiencing sustained congestion. PLB prefers to repath
+ * after idle periods to minimize packet reordering. It repaths by
+ * changing the IPv6 Flow Label on the packets of a connection, which
+ * datacenter switches include as part of ECMP/WCMP hashing.
+ *
+ * PLB is described in detail in:
+ *
+ * Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu,
+ * Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson,
+ * David Wetherall, Abdul Kabbani:
+ * "PLB: Congestion Signals are Simple and Effective for
+ * Network Load Balancing"
+ * In ACM SIGCOMM 2022, Amsterdam Netherlands.
+ *
+ */
+
+#include <net/tcp.h>
+
+/* Called once per round-trip to update PLB state for a connection. */
+void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb,
+ const int cong_ratio)
+{
+ struct net *net = sock_net(sk);
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
+ return;
+
+ if (cong_ratio >= 0) {
+ if (cong_ratio < READ_ONCE(net->ipv4.sysctl_tcp_plb_cong_thresh))
+ plb->consec_cong_rounds = 0;
+ else if (plb->consec_cong_rounds <
+ READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds))
+ plb->consec_cong_rounds++;
+ }
+}
+EXPORT_SYMBOL_GPL(tcp_plb_update_state);
+
+/* Check whether recent congestion has been persistent enough to warrant
+ * a load balancing decision that switches the connection to another path.
+ */
+void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb)
+{
+ struct net *net = sock_net(sk);
+ u32 max_suspend;
+ bool forced_rehash = false, idle_rehash = false;
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
+ return;
+
+ forced_rehash = plb->consec_cong_rounds >=
+ READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds);
+ /* If sender goes idle then we check whether to rehash. */
+ idle_rehash = READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds) &&
+ !tcp_sk(sk)->packets_out &&
+ plb->consec_cong_rounds >=
+ READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds);
+
+ if (!forced_rehash && !idle_rehash)
+ return;
+
+ /* Note that tcp_jiffies32 can wrap; we detect wraps by checking for
+ * cases where the max suspension end is before the actual suspension
+ * end. We clear pause_until to 0 to indicate there is no recent
+ * RTO event that constrains PLB rehashing.
+ */
+ max_suspend = 2 * READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ;
+ if (plb->pause_until &&
+ (!before(tcp_jiffies32, plb->pause_until) ||
+ before(tcp_jiffies32 + max_suspend, plb->pause_until)))
+ plb->pause_until = 0;
+
+ if (plb->pause_until)
+ return;
+
+ sk_rethink_txhash(sk);
+ plb->consec_cong_rounds = 0;
+ tcp_sk(sk)->plb_rehash++;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH);
+}
+EXPORT_SYMBOL_GPL(tcp_plb_check_rehash);
+
+/* Upon RTO, disallow load balancing for a while, to avoid having load
+ * balancing decisions switch traffic to a black-holed path that was
+ * previously avoided with a sk_rethink_txhash() call at RTO time.
+ */
+void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb)
+{
+ struct net *net = sock_net(sk);
+ u32 pause;
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
+ return;
+
+ pause = READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ;
+ pause += prandom_u32_max(pause);
+ plb->pause_until = tcp_jiffies32 + pause;
+
+ /* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call
+ * that may switch this connection to a path with completely different
+ * congestion characteristics.
+ */
+ plb->consec_cong_rounds = 0;
+}
+EXPORT_SYMBOL_GPL(tcp_plb_update_state_upon_rto);
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 7c27aa629af1..9ae50b1bd844 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -136,6 +136,9 @@ static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops)
if (icsk->icsk_ulp_ops)
goto out_err;
+ if (sk->sk_socket)
+ clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+
err = ulp_ops->init(sk);
if (err)
goto out_err;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8126f67d18b3..9592fe3e444a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -129,7 +129,12 @@ DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc);
#define MAX_UDP_PORTS 65536
-#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
+#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET)
+
+static struct udp_table *udp_get_table_prot(struct sock *sk)
+{
+ return sk->sk_prot->h.udp_table ? : sock_net(sk)->ipv4.udp_table;
+}
static int udp_lib_lport_inuse(struct net *net, __u16 num,
const struct udp_hslot *hslot,
@@ -232,21 +237,21 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
int udp_lib_get_port(struct sock *sk, unsigned short snum,
unsigned int hash2_nulladdr)
{
+ struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2;
- struct udp_table *udptable = sk->sk_prot->h.udp_table;
- int error = 1;
struct net *net = sock_net(sk);
+ int error = -EADDRINUSE;
if (!snum) {
+ DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
+ unsigned short first, last;
int low, high, remaining;
unsigned int rand;
- unsigned short first, last;
- DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
- rand = prandom_u32();
+ rand = get_random_u32();
first = reciprocal_scale(rand, remaining) + low;
/*
* force rand to be an odd multiple of UDP_HTABLE_SIZE
@@ -448,7 +453,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
result = lookup_reuseport(net, sk, skb,
saddr, sport, daddr, hnum);
/* Fall back to scoring if group has connections */
- if (result && !reuseport_has_conns(sk, false))
+ if (result && !reuseport_has_conns(sk))
return result;
result = result ? : sk;
@@ -467,7 +472,7 @@ static struct sock *udp4_lookup_run_bpf(struct net *net,
struct sock *sk, *reuse_sk;
bool no_reuseport;
- if (udptable != &udp_table)
+ if (udptable != net->ipv4.udp_table)
return NULL; /* only UDP is supported */
no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP, saddr, sport,
@@ -548,10 +553,11 @@ struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
__be16 sport, __be16 dport)
{
const struct iphdr *iph = ip_hdr(skb);
+ struct net *net = dev_net(skb->dev);
- return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
+ return __udp4_lib_lookup(net, iph->saddr, sport,
iph->daddr, dport, inet_iif(skb),
- inet_sdif(skb), &udp_table, NULL);
+ inet_sdif(skb), net->ipv4.udp_table, NULL);
}
/* Must be called under rcu_read_lock().
@@ -564,7 +570,7 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
struct sock *sk;
sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
- dif, 0, &udp_table, NULL);
+ dif, 0, net->ipv4.udp_table, NULL);
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
return sk;
@@ -784,7 +790,8 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
if (tunnel) {
/* ...not for tunnels though: we don't have a sending socket */
if (udp_sk(sk)->encap_err_rcv)
- udp_sk(sk)->encap_err_rcv(sk, skb, iph->ihl << 2);
+ udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, info,
+ (u8 *)(uh+1));
goto out;
}
if (!inet->recverr) {
@@ -801,7 +808,7 @@ out:
int udp_err(struct sk_buff *skb, u32 info)
{
- return __udp4_lib_err(skb, info, &udp_table);
+ return __udp4_lib_err(skb, info, dev_net(skb->dev)->ipv4.udp_table);
}
/*
@@ -1448,7 +1455,7 @@ static void udp_rmem_release(struct sock *sk, int size, int partial,
if (likely(partial)) {
up->forward_deficit += size;
size = up->forward_deficit;
- if (size < (sk->sk_rcvbuf >> 2) &&
+ if (size < READ_ONCE(up->forward_threshold) &&
!skb_queue_empty(&up->reader_queue))
return;
} else {
@@ -1622,8 +1629,9 @@ static void udp_destruct_sock(struct sock *sk)
int udp_init_sock(struct sock *sk)
{
- skb_queue_head_init(&udp_sk(sk)->reader_queue);
+ udp_lib_init_sock(sk);
sk->sk_destruct = udp_destruct_sock;
+ set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
return 0;
}
@@ -1997,7 +2005,7 @@ EXPORT_SYMBOL(udp_disconnect);
void udp_lib_unhash(struct sock *sk)
{
if (sk_hashed(sk)) {
- struct udp_table *udptable = sk->sk_prot->h.udp_table;
+ struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2;
hslot = udp_hashslot(udptable, sock_net(sk),
@@ -2028,7 +2036,7 @@ EXPORT_SYMBOL(udp_lib_unhash);
void udp_lib_rehash(struct sock *sk, u16 newhash)
{
if (sk_hashed(sk)) {
- struct udp_table *udptable = sk->sk_prot->h.udp_table;
+ struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2, *nhslot2;
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
@@ -2517,10 +2525,14 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
__be16 rmt_port, __be32 rmt_addr,
int dif, int sdif)
{
- struct sock *sk, *result;
+ struct udp_table *udptable = net->ipv4.udp_table;
unsigned short hnum = ntohs(loc_port);
- unsigned int slot = udp_hashfn(net, hnum, udp_table.mask);
- struct udp_hslot *hslot = &udp_table.hash[slot];
+ struct sock *sk, *result;
+ struct udp_hslot *hslot;
+ unsigned int slot;
+
+ slot = udp_hashfn(net, hnum, udptable->mask);
+ hslot = &udptable->hash[slot];
/* Do not bother scanning a too big list */
if (hslot->count > 10)
@@ -2548,14 +2560,19 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
__be16 rmt_port, __be32 rmt_addr,
int dif, int sdif)
{
- unsigned short hnum = ntohs(loc_port);
- unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
- unsigned int slot2 = hash2 & udp_table.mask;
- struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
+ struct udp_table *udptable = net->ipv4.udp_table;
INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
- const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
+ unsigned short hnum = ntohs(loc_port);
+ unsigned int hash2, slot2;
+ struct udp_hslot *hslot2;
+ __portpair ports;
struct sock *sk;
+ hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
+ slot2 = hash2 & udptable->mask;
+ hslot2 = &udptable->hash2[slot2];
+ ports = INET_COMBINED_PORTS(rmt_port, hnum);
+
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
if (inet_match(net, sk, acookie, ports, dif, sdif))
return sk;
@@ -2635,7 +2652,7 @@ int udp_v4_early_demux(struct sk_buff *skb)
int udp_rcv(struct sk_buff *skb)
{
- return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
+ return __udp4_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP);
}
void udp_destroy_sock(struct sock *sk)
@@ -2671,6 +2688,18 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
int err = 0;
int is_udplite = IS_UDPLITE(sk);
+ if (level == SOL_SOCKET) {
+ err = sk_setsockopt(sk, level, optname, optval, optlen);
+
+ if (optname == SO_RCVBUF || optname == SO_RCVBUFFORCE) {
+ sockopt_lock_sock(sk);
+ /* paired with READ_ONCE in udp_rmem_release() */
+ WRITE_ONCE(up->forward_threshold, sk->sk_rcvbuf >> 2);
+ sockopt_release_sock(sk);
+ }
+ return err;
+ }
+
if (optlen < sizeof(int))
return -EINVAL;
@@ -2784,7 +2813,7 @@ EXPORT_SYMBOL(udp_lib_setsockopt);
int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen)
{
- if (level == SOL_UDP || level == SOL_UDPLITE)
+ if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET)
return udp_lib_setsockopt(sk, level, optname,
optval, optlen,
udp_push_pending_frames);
@@ -2946,7 +2975,7 @@ struct proto udp_prot = {
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
.obj_size = sizeof(struct udp_sock),
- .h.udp_table = &udp_table,
+ .h.udp_table = NULL,
.diag_destroy = udp_abort,
};
EXPORT_SYMBOL(udp_prot);
@@ -2954,21 +2983,30 @@ EXPORT_SYMBOL(udp_prot);
/* ------------------------------------------------------------------------ */
#ifdef CONFIG_PROC_FS
+static struct udp_table *udp_get_table_afinfo(struct udp_seq_afinfo *afinfo,
+ struct net *net)
+{
+ return afinfo->udp_table ? : net->ipv4.udp_table;
+}
+
static struct sock *udp_get_first(struct seq_file *seq, int start)
{
- struct sock *sk;
- struct udp_seq_afinfo *afinfo;
struct udp_iter_state *state = seq->private;
struct net *net = seq_file_net(seq);
+ struct udp_seq_afinfo *afinfo;
+ struct udp_table *udptable;
+ struct sock *sk;
if (state->bpf_seq_afinfo)
afinfo = state->bpf_seq_afinfo;
else
afinfo = pde_data(file_inode(seq->file));
- for (state->bucket = start; state->bucket <= afinfo->udp_table->mask;
+ udptable = udp_get_table_afinfo(afinfo, net);
+
+ for (state->bucket = start; state->bucket <= udptable->mask;
++state->bucket) {
- struct udp_hslot *hslot = &afinfo->udp_table->hash[state->bucket];
+ struct udp_hslot *hslot = &udptable->hash[state->bucket];
if (hlist_empty(&hslot->head))
continue;
@@ -2990,9 +3028,10 @@ found:
static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
{
- struct udp_seq_afinfo *afinfo;
struct udp_iter_state *state = seq->private;
struct net *net = seq_file_net(seq);
+ struct udp_seq_afinfo *afinfo;
+ struct udp_table *udptable;
if (state->bpf_seq_afinfo)
afinfo = state->bpf_seq_afinfo;
@@ -3006,8 +3045,11 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
sk->sk_family != afinfo->family)));
if (!sk) {
- if (state->bucket <= afinfo->udp_table->mask)
- spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);
+ udptable = udp_get_table_afinfo(afinfo, net);
+
+ if (state->bucket <= udptable->mask)
+ spin_unlock_bh(&udptable->hash[state->bucket].lock);
+
return udp_get_first(seq, state->bucket + 1);
}
return sk;
@@ -3048,16 +3090,19 @@ EXPORT_SYMBOL(udp_seq_next);
void udp_seq_stop(struct seq_file *seq, void *v)
{
- struct udp_seq_afinfo *afinfo;
struct udp_iter_state *state = seq->private;
+ struct udp_seq_afinfo *afinfo;
+ struct udp_table *udptable;
if (state->bpf_seq_afinfo)
afinfo = state->bpf_seq_afinfo;
else
afinfo = pde_data(file_inode(seq->file));
- if (state->bucket <= afinfo->udp_table->mask)
- spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);
+ udptable = udp_get_table_afinfo(afinfo, seq_file_net(seq));
+
+ if (state->bucket <= udptable->mask)
+ spin_unlock_bh(&udptable->hash[state->bucket].lock);
}
EXPORT_SYMBOL(udp_seq_stop);
@@ -3170,7 +3215,7 @@ EXPORT_SYMBOL(udp_seq_ops);
static struct udp_seq_afinfo udp4_seq_afinfo = {
.family = AF_INET,
- .udp_table = &udp_table,
+ .udp_table = NULL,
};
static int __net_init udp4_proc_init_net(struct net *net)
@@ -3232,7 +3277,7 @@ void __init udp_table_init(struct udp_table *table, const char *name)
&table->log,
&table->mask,
UDP_HTABLE_SIZE_MIN,
- 64 * 1024);
+ UDP_HTABLE_SIZE_MAX);
table->hash2 = table->hash + (table->mask + 1);
for (i = 0; i <= table->mask; i++) {
@@ -3257,7 +3302,7 @@ u32 udp_flow_hashrnd(void)
}
EXPORT_SYMBOL(udp_flow_hashrnd);
-static int __net_init udp_sysctl_init(struct net *net)
+static void __net_init udp_sysctl_init(struct net *net)
{
net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE;
net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE;
@@ -3265,12 +3310,103 @@ static int __net_init udp_sysctl_init(struct net *net)
#ifdef CONFIG_NET_L3_MASTER_DEV
net->ipv4.sysctl_udp_l3mdev_accept = 0;
#endif
+}
+
+static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries)
+{
+ struct udp_table *udptable;
+ int i;
+
+ udptable = kmalloc(sizeof(*udptable), GFP_KERNEL);
+ if (!udptable)
+ goto out;
+
+ udptable->hash = vmalloc_huge(hash_entries * 2 * sizeof(struct udp_hslot),
+ GFP_KERNEL_ACCOUNT);
+ if (!udptable->hash)
+ goto free_table;
+
+ udptable->hash2 = udptable->hash + hash_entries;
+ udptable->mask = hash_entries - 1;
+ udptable->log = ilog2(hash_entries);
+
+ for (i = 0; i < hash_entries; i++) {
+ INIT_HLIST_HEAD(&udptable->hash[i].head);
+ udptable->hash[i].count = 0;
+ spin_lock_init(&udptable->hash[i].lock);
+
+ INIT_HLIST_HEAD(&udptable->hash2[i].head);
+ udptable->hash2[i].count = 0;
+ spin_lock_init(&udptable->hash2[i].lock);
+ }
+
+ return udptable;
+
+free_table:
+ kfree(udptable);
+out:
+ return NULL;
+}
+
+static void __net_exit udp_pernet_table_free(struct net *net)
+{
+ struct udp_table *udptable = net->ipv4.udp_table;
+
+ if (udptable == &udp_table)
+ return;
+
+ kvfree(udptable->hash);
+ kfree(udptable);
+}
+
+static void __net_init udp_set_table(struct net *net)
+{
+ struct udp_table *udptable;
+ unsigned int hash_entries;
+ struct net *old_net;
+
+ if (net_eq(net, &init_net))
+ goto fallback;
+
+ old_net = current->nsproxy->net_ns;
+ hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries);
+ if (!hash_entries)
+ goto fallback;
+
+ /* Set min to keep the bitmap on stack in udp_lib_get_port() */
+ if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET)
+ hash_entries = UDP_HTABLE_SIZE_MIN_PERNET;
+ else
+ hash_entries = roundup_pow_of_two(hash_entries);
+
+ udptable = udp_pernet_table_alloc(hash_entries);
+ if (udptable) {
+ net->ipv4.udp_table = udptable;
+ } else {
+ pr_warn("Failed to allocate UDP hash table (entries: %u) "
+ "for a netns, fallback to the global one\n",
+ hash_entries);
+fallback:
+ net->ipv4.udp_table = &udp_table;
+ }
+}
+
+static int __net_init udp_pernet_init(struct net *net)
+{
+ udp_sysctl_init(net);
+ udp_set_table(net);
return 0;
}
+static void __net_exit udp_pernet_exit(struct net *net)
+{
+ udp_pernet_table_free(net);
+}
+
static struct pernet_operations __net_initdata udp_sysctl_ops = {
- .init = udp_sysctl_init,
+ .init = udp_pernet_init,
+ .exit = udp_pernet_exit,
};
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
@@ -3288,7 +3424,7 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
return -ENOMEM;
afinfo->family = AF_UNSPEC;
- afinfo->udp_table = &udp_table;
+ afinfo->udp_table = NULL;
st->bpf_seq_afinfo = afinfo;
ret = bpf_iter_init_seq_net(priv_data, aux);
if (ret)
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index ff15918b7bdc..e5dc91d0e079 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -141,14 +141,14 @@ int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
if (restore) {
sk->sk_write_space = psock->saved_write_space;
- WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+ sock_replace_proto(sk, psock->sk_proto);
return 0;
}
if (sk->sk_family == AF_INET6)
udp_bpf_check_v6_needs_rebuild(psock->sk_proto);
- WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]);
+ sock_replace_proto(sk, &udp_bpf_prots[family]);
return 0;
}
EXPORT_SYMBOL_GPL(udp_bpf_update_proto);
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 1ed8c4d78e5c..de3f2d31f510 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -147,13 +147,13 @@ done:
static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r)
{
- udp_dump(&udp_table, skb, cb, r);
+ udp_dump(sock_net(cb->skb->sk)->ipv4.udp_table, skb, cb, r);
}
static int udp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
- return udp_dump_one(&udp_table, cb, req);
+ return udp_dump_one(sock_net(cb->skb->sk)->ipv4.udp_table, cb, req);
}
static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
@@ -225,7 +225,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb,
static int udp_diag_destroy(struct sk_buff *in_skb,
const struct inet_diag_req_v2 *req)
{
- return __udp_diag_destroy(in_skb, req, &udp_table);
+ return __udp_diag_destroy(in_skb, req, sock_net(in_skb->sk)->ipv4.udp_table);
}
static int udplite_diag_destroy(struct sk_buff *in_skb,
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 6d1a4bec2614..aedde65e2268 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -600,10 +600,11 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
__be16 dport)
{
const struct iphdr *iph = skb_gro_network_header(skb);
+ struct net *net = dev_net(skb->dev);
- return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
+ return __udp4_lib_lookup(net, iph->saddr, sport,
iph->daddr, dport, inet_iif(skb),
- inet_sdif(skb), &udp_table, NULL);
+ inet_sdif(skb), net->ipv4.udp_table, NULL);
}
INDIRECT_CALLABLE_SCOPE
diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
index bc3a043a5d5c..029219749785 100644
--- a/net/ipv4/udp_tunnel_nic.c
+++ b/net/ipv4/udp_tunnel_nic.c
@@ -624,6 +624,8 @@ __udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table,
continue;
nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY);
+ if (!nest)
+ return -EMSGSIZE;
if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT,
utn->entries[table][j].port) ||
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 10ce86bf228e..9c3f5202a97b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -104,7 +104,7 @@ static inline u32 cstamp_delta(unsigned long cstamp)
static inline s32 rfc3315_s14_backoff_init(s32 irt)
{
/* multiply 'initial retransmission time' by 0.9 .. 1.1 */
- u64 tmp = (900000 + prandom_u32() % 200001) * (u64)irt;
+ u64 tmp = (900000 + prandom_u32_max(200001)) * (u64)irt;
do_div(tmp, 1000000);
return (s32)tmp;
}
@@ -112,11 +112,11 @@ static inline s32 rfc3315_s14_backoff_init(s32 irt)
static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt)
{
/* multiply 'retransmission timeout' by 1.9 .. 2.1 */
- u64 tmp = (1900000 + prandom_u32() % 200001) * (u64)rt;
+ u64 tmp = (1900000 + prandom_u32_max(200001)) * (u64)rt;
do_div(tmp, 1000000);
if ((s32)tmp > mrt) {
/* multiply 'maximum retransmission time' by 0.9 .. 1.1 */
- tmp = (900000 + prandom_u32() % 200001) * (u64)mrt;
+ tmp = (900000 + prandom_u32_max(200001)) * (u64)mrt;
do_div(tmp, 1000000);
}
return (s32)tmp;
@@ -3967,7 +3967,7 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp)
if (ifp->flags & IFA_F_OPTIMISTIC)
rand_num = 0;
else
- rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1);
+ rand_num = prandom_u32_max(idev->cnf.rtr_solicit_delay ?: 1);
nonce = 0;
if (idev->cnf.enhanced_dad ||
@@ -7214,9 +7214,11 @@ err_reg_dflt:
__addrconf_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL);
err_reg_all:
kfree(dflt);
+ net->ipv6.devconf_dflt = NULL;
#endif
err_alloc_dflt:
kfree(all);
+ net->ipv6.devconf_all = NULL;
err_alloc_all:
kfree(net->ipv6.inet6_addr_lst);
err_alloc_addr:
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 8a22486cf270..17ac45aa7194 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -437,6 +437,7 @@ static void ip6addrlbl_putmsg(struct nlmsghdr *nlh,
{
struct ifaddrlblmsg *ifal = nlmsg_data(nlh);
ifal->ifal_family = AF_INET6;
+ ifal->__ifal_reserved = 0;
ifal->ifal_prefixlen = prefixlen;
ifal->ifal_flags = 0;
ifal->ifal_index = ifindex;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 024191004982..fee9163382c2 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -114,6 +114,7 @@ void inet6_sock_destruct(struct sock *sk)
inet6_cleanup_sock(sk);
inet_sock_destruct(sk);
}
+EXPORT_SYMBOL_GPL(inet6_sock_destruct);
static int inet6_create(struct net *net, struct socket *sock, int protocol,
int kern)
@@ -409,10 +410,10 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
/* Make sure we are allowed to bind here. */
if (snum || !(inet->bind_address_no_port ||
(flags & BIND_FORCE_ADDRESS_NO_PORT))) {
- if (sk->sk_prot->get_port(sk, snum)) {
+ err = sk->sk_prot->get_port(sk, snum);
+ if (err) {
sk->sk_ipv6only = saved_ipv6only;
inet_reset_saddr(sk);
- err = -EADDRINUSE;
goto out;
}
if (!(flags & BIND_FROM_BPF)) {
@@ -489,7 +490,7 @@ int inet6_release(struct socket *sock)
}
EXPORT_SYMBOL(inet6_release);
-void inet6_destroy_sock(struct sock *sk)
+void inet6_cleanup_sock(struct sock *sk)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct sk_buff *skb;
@@ -514,12 +515,6 @@ void inet6_destroy_sock(struct sock *sk)
txopt_put(opt);
}
}
-EXPORT_SYMBOL_GPL(inet6_destroy_sock);
-
-void inet6_cleanup_sock(struct sock *sk)
-{
- inet6_destroy_sock(sk);
-}
EXPORT_SYMBOL_GPL(inet6_cleanup_sock);
/*
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index df665d4e8f0f..7c7155b48f17 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -256,7 +256,7 @@ ipv4_connected:
goto out;
}
- reuseport_has_conns(sk, true);
+ reuseport_has_conns_set(sk);
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
out:
@@ -334,6 +334,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
if (sock_queue_err_skb(sk, skb))
kfree_skb(skb);
}
+EXPORT_SYMBOL_GPL(ipv6_icmp_error);
void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info)
{
@@ -771,7 +772,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
}
if (cmsg->cmsg_level == SOL_SOCKET) {
- err = __sock_cmsg_send(sk, msg, cmsg, &ipc6->sockc);
+ err = __sock_cmsg_send(sk, cmsg, &ipc6->sockc);
if (err)
return err;
continue;
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 97edf461bc72..75c02992c520 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -345,6 +345,9 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features
xo->seq.low += skb_shinfo(skb)->gso_segs;
}
+ if (xo->seq.low < seq)
+ xo->seq.hi++;
+
esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32));
len = skb->len - sizeof(struct ipv6hdr);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 413f66781e50..2438da5ff6da 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -91,13 +91,12 @@ static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
static int fib6_new_sernum(struct net *net)
{
- int new, old;
+ int new, old = atomic_read(&net->ipv6.fib6_sernum);
do {
- old = atomic_read(&net->ipv6.fib6_sernum);
new = old < INT_MAX ? old + 1 : 1;
- } while (atomic_cmpxchg(&net->ipv6.fib6_sernum,
- old, new) != old);
+ } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new));
+
return new;
}
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index ceb85c67ce39..18481eb76a0a 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -220,7 +220,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
spin_lock_bh(&ip6_fl_lock);
if (label == 0) {
for (;;) {
- fl->label = htonl(prandom_u32())&IPV6_FLOWLABEL_MASK;
+ fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK;
if (fl->label) {
lfl = __fl_lookup(net, fl->label);
if (!lfl)
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 02b1b54165e8..89f5f0f3f5d6 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -895,7 +895,6 @@ static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb,
struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- struct net_device_stats *stats = &t->dev->stats;
__be16 payload_protocol;
int ret;
@@ -925,8 +924,8 @@ static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb,
tx_err:
if (!t->parms.collect_md || !IS_ERR(skb_tunnel_info_txcheck(skb)))
- stats->tx_errors++;
- stats->tx_dropped++;
+ DEV_STATS_INC(dev, tx_errors);
+ DEV_STATS_INC(dev, tx_dropped);
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -937,7 +936,6 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
struct ip_tunnel_info *tun_info = NULL;
struct ip6_tnl *t = netdev_priv(dev);
struct dst_entry *dst = skb_dst(skb);
- struct net_device_stats *stats;
bool truncate = false;
int encap_limit = -1;
__u8 dsfield = false;
@@ -1086,10 +1084,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
tx_err:
- stats = &t->dev->stats;
if (!IS_ERR(tun_info))
- stats->tx_errors++;
- stats->tx_dropped++;
+ DEV_STATS_INC(dev, tx_errors);
+ DEV_STATS_INC(dev, tx_dropped);
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -1155,14 +1152,16 @@ static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu,
dev->needed_headroom = dst_len;
if (set_mtu) {
- dev->mtu = rt->dst.dev->mtu - t_hlen;
+ int mtu = rt->dst.dev->mtu - t_hlen;
+
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- dev->mtu -= 8;
+ mtu -= 8;
if (dev->type == ARPHRD_ETHER)
- dev->mtu -= ETH_HLEN;
+ mtu -= ETH_HLEN;
- if (dev->mtu < IPV6_MIN_MTU)
- dev->mtu = IPV6_MIN_MTU;
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ WRITE_ONCE(dev->mtu, mtu);
}
}
ip6_rt_put(rt);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index cc5d5e75b658..47b6607a1370 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -803,8 +803,8 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
(tunnel->parms.i_flags & TUNNEL_CSUM)) ||
((tpi->flags & TUNNEL_CSUM) &&
!(tunnel->parms.i_flags & TUNNEL_CSUM))) {
- tunnel->dev->stats.rx_crc_errors++;
- tunnel->dev->stats.rx_errors++;
+ DEV_STATS_INC(tunnel->dev, rx_crc_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto drop;
}
@@ -812,8 +812,8 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
if (!(tpi->flags & TUNNEL_SEQ) ||
(tunnel->i_seqno &&
(s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
- tunnel->dev->stats.rx_fifo_errors++;
- tunnel->dev->stats.rx_errors++;
+ DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto drop;
}
tunnel->i_seqno = ntohl(tpi->seq) + 1;
@@ -824,8 +824,8 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
/* Warning: All skb pointers will be invalidated! */
if (tunnel->dev->type == ARPHRD_ETHER) {
if (!pskb_may_pull(skb, ETH_HLEN)) {
- tunnel->dev->stats.rx_length_errors++;
- tunnel->dev->stats.rx_errors++;
+ DEV_STATS_INC(tunnel->dev, rx_length_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto drop;
}
@@ -849,8 +849,8 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
&ipv6h->saddr,
ipv6_get_dsfield(ipv6h));
if (err > 1) {
- ++tunnel->dev->stats.rx_frame_errors;
- ++tunnel->dev->stats.rx_errors;
+ DEV_STATS_INC(tunnel->dev, rx_frame_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto drop;
}
}
@@ -1071,7 +1071,6 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
{
struct ip6_tnl *t = netdev_priv(dev);
struct net *net = t->net;
- struct net_device_stats *stats = &t->dev->stats;
struct ipv6hdr *ipv6h;
struct ipv6_tel_txoption opt;
struct dst_entry *dst = NULL, *ndst = NULL;
@@ -1166,7 +1165,7 @@ route_lookup:
tdev = dst->dev;
if (tdev == dev) {
- stats->collisions++;
+ DEV_STATS_INC(dev, collisions);
net_warn_ratelimited("%s: Local routing loop detected!\n",
t->parms.name);
goto tx_err_dst_release;
@@ -1265,7 +1264,7 @@ route_lookup:
ip6tunnel_xmit(NULL, skb, dev);
return 0;
tx_err_link_failure:
- stats->tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
dst_link_failure(skb);
tx_err_dst_release:
dst_release(dst);
@@ -1408,7 +1407,6 @@ static netdev_tx_t
ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- struct net_device_stats *stats = &t->dev->stats;
u8 ipproto;
int ret;
@@ -1438,8 +1436,8 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
tx_err:
- stats->tx_errors++;
- stats->tx_dropped++;
+ DEV_STATS_INC(dev, tx_errors);
+ DEV_STATS_INC(dev, tx_dropped);
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -1450,8 +1448,8 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
struct net_device *tdev = NULL;
struct __ip6_tnl_parm *p = &t->parms;
struct flowi6 *fl6 = &t->fl.u.ip6;
- unsigned int mtu;
int t_hlen;
+ int mtu;
__dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr));
memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
@@ -1498,12 +1496,13 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
dev->hard_header_len = tdev->hard_header_len + t_hlen;
mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU);
- dev->mtu = mtu - t_hlen;
+ mtu = mtu - t_hlen;
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- dev->mtu -= 8;
+ mtu -= 8;
- if (dev->mtu < IPV6_MIN_MTU)
- dev->mtu = IPV6_MIN_MTU;
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ WRITE_ONCE(dev->mtu, mtu);
}
}
}
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 151337d7f67b..10b222865d46 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -317,7 +317,7 @@ static int vti6_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi,
ipv6h = ipv6_hdr(skb);
if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) {
- t->dev->stats.rx_dropped++;
+ DEV_STATS_INC(t->dev, rx_dropped);
rcu_read_unlock();
goto discard;
}
@@ -359,8 +359,8 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err)
dev = t->dev;
if (err) {
- dev->stats.rx_errors++;
- dev->stats.rx_dropped++;
+ DEV_STATS_INC(dev, rx_errors);
+ DEV_STATS_INC(dev, rx_dropped);
return 0;
}
@@ -446,7 +446,6 @@ static int
vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
{
struct ip6_tnl *t = netdev_priv(dev);
- struct net_device_stats *stats = &t->dev->stats;
struct dst_entry *dst = skb_dst(skb);
struct net_device *tdev;
struct xfrm_state *x;
@@ -506,7 +505,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
tdev = dst->dev;
if (tdev == dev) {
- stats->collisions++;
+ DEV_STATS_INC(dev, collisions);
net_warn_ratelimited("%s: Local routing loop detected!\n",
t->parms.name);
goto tx_err_dst_release;
@@ -544,7 +543,7 @@ xmit:
return 0;
tx_err_link_failure:
- stats->tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
dst_link_failure(skb);
tx_err_dst_release:
dst_release(dst);
@@ -555,7 +554,6 @@ static netdev_tx_t
vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- struct net_device_stats *stats = &t->dev->stats;
struct flowi fl;
int ret;
@@ -591,8 +589,8 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
tx_err:
- stats->tx_errors++;
- stats->tx_dropped++;
+ DEV_STATS_INC(dev, tx_errors);
+ DEV_STATS_INC(dev, tx_dropped);
kfree_skb(skb);
return NETDEV_TX_OK;
}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index facdc78a43e5..23e766597f36 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -608,8 +608,8 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
goto tx_err;
- dev->stats.tx_bytes += skb->len;
- dev->stats.tx_packets++;
+ DEV_STATS_ADD(dev, tx_bytes, skb->len);
+ DEV_STATS_INC(dev, tx_packets);
rcu_read_lock();
ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
MRT6MSG_WHOLEPKT);
@@ -618,7 +618,7 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
tx_err:
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -2044,8 +2044,8 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt,
if (vif->flags & MIFF_REGISTER) {
WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
- vif_dev->stats.tx_bytes += skb->len;
- vif_dev->stats.tx_packets++;
+ DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
+ DEV_STATS_INC(vif_dev, tx_packets);
ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT);
goto out_free;
}
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 532f4478c884..9ce51680290b 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -1005,10 +1005,8 @@ unlock:
return retv;
e_inval:
- sockopt_release_sock(sk);
- if (needs_rtnl)
- rtnl_unlock();
- return -EINVAL;
+ retv = -EINVAL;
+ goto unlock;
}
int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 0566ab03ddbe..7860383295d8 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1050,7 +1050,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
/* called with mc_lock */
static void mld_gq_start_work(struct inet6_dev *idev)
{
- unsigned long tv = prandom_u32() % idev->mc_maxdelay;
+ unsigned long tv = prandom_u32_max(idev->mc_maxdelay);
idev->mc_gq_running = 1;
if (!mod_delayed_work(mld_wq, &idev->mc_gq_work, tv + 2))
@@ -1068,7 +1068,7 @@ static void mld_gq_stop_work(struct inet6_dev *idev)
/* called with mc_lock */
static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay)
{
- unsigned long tv = prandom_u32() % delay;
+ unsigned long tv = prandom_u32_max(delay);
if (!mod_delayed_work(mld_wq, &idev->mc_ifc_work, tv + 2))
in6_dev_hold(idev);
@@ -1085,7 +1085,7 @@ static void mld_ifc_stop_work(struct inet6_dev *idev)
/* called with mc_lock */
static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay)
{
- unsigned long tv = prandom_u32() % delay;
+ unsigned long tv = prandom_u32_max(delay);
if (!mod_delayed_work(mld_wq, &idev->mc_dad_work, tv + 2))
in6_dev_hold(idev);
@@ -1130,7 +1130,7 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime)
}
if (delay >= resptime)
- delay = prandom_u32() % resptime;
+ delay = prandom_u32_max(resptime);
if (!mod_delayed_work(mld_wq, &ma->mca_work, delay))
refcount_inc(&ma->mca_refcnt);
@@ -2574,7 +2574,7 @@ static void igmp6_join_group(struct ifmcaddr6 *ma)
igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
- delay = prandom_u32() % unsolicited_report_interval(ma->idev);
+ delay = prandom_u32_max(unsolicited_report_interval(ma->idev));
if (cancel_delayed_work(&ma->mca_work)) {
refcount_dec(&ma->mca_refcnt);
diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
index 69d86b040a6a..a01d9b842bd0 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -40,6 +40,7 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
.flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev),
.flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
.flowi6_proto = iph->nexthdr,
+ .flowi6_uid = sock_net_uid(net, NULL),
.daddr = iph->saddr,
};
int lookup_flags;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 38db0064d661..d13240f13607 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -253,7 +253,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
if (err) {
if (err == IPFRAG_DUP) {
/* No error for duplicates, pretend they got queued. */
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_DUP_FRAG);
return -EINPROGRESS;
}
goto insert_error;
diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c
index 70a405b4006f..c82f3fdd4a65 100644
--- a/net/ipv6/netfilter/nft_dup_ipv6.c
+++ b/net/ipv6/netfilter/nft_dup_ipv6.c
@@ -50,7 +50,8 @@ static int nft_dup_ipv6_init(const struct nft_ctx *ctx,
return err;
}
-static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_dup_ipv6_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_dup_ipv6 *priv = nft_expr_priv(expr);
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
index 91faac610e03..36dc14b34388 100644
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -66,6 +66,7 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
struct flowi6 fl6 = {
.flowi6_iif = LOOPBACK_IFINDEX,
.flowi6_proto = pkt->tprot,
+ .flowi6_uid = sock_net_uid(nft_net(pkt), NULL),
};
u32 ret = 0;
@@ -163,6 +164,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
struct flowi6 fl6 = {
.flowi6_iif = LOOPBACK_IFINDEX,
.flowi6_proto = pkt->tprot,
+ .flowi6_uid = sock_net_uid(nft_net(pkt), NULL),
};
struct rt6_info *rt;
int lookup_flags;
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 2880dc7d9a49..2685c3f15e9d 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -18,7 +18,7 @@ static u32 __ipv6_select_ident(struct net *net,
u32 id;
do {
- id = prandom_u32();
+ id = get_random_u32();
} while (!id);
return id;
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 86c26e48d065..808983bc2ec9 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -23,11 +23,6 @@
#include <linux/bpf-cgroup.h>
#include <net/ping.h>
-static void ping_v6_destroy(struct sock *sk)
-{
- inet6_destroy_sock(sk);
-}
-
/* Compatibility glue so we can support IPv6 when it's compiled as a module */
static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
int *addr_len)
@@ -205,7 +200,6 @@ struct proto pingv6_prot = {
.owner = THIS_MODULE,
.init = ping_init_sock,
.close = ping_close,
- .destroy = ping_v6_destroy,
.pre_connect = ping_v6_pre_connect,
.connect = ip6_datagram_connect_v6_only,
.disconnect = __udp_disconnect,
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 722de9dd0ff7..a06a9f847db5 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1173,8 +1173,6 @@ static void raw6_destroy(struct sock *sk)
lock_sock(sk);
ip6_flush_pending_frames(sk);
release_sock(sk);
-
- inet6_destroy_sock(sk);
}
static int rawv6_init_sk(struct sock *sk)
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index ff866f2a879e..5bc8a28e67f9 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -112,10 +112,14 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
struct sk_buff *prev_tail;
struct net_device *dev;
int err = -ENOENT;
+ SKB_DR(reason);
u8 ecn;
- if (fq->q.flags & INET_FRAG_COMPLETE)
+ /* If reassembly is already done, @skb must be a duplicate frag. */
+ if (fq->q.flags & INET_FRAG_COMPLETE) {
+ SKB_DR_SET(reason, DUP_FRAG);
goto err;
+ }
err = -EINVAL;
offset = ntohs(fhdr->frag_off) & ~0x7;
@@ -226,8 +230,9 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
insert_error:
if (err == IPFRAG_DUP) {
- kfree_skb(skb);
- return -EINVAL;
+ SKB_DR_SET(reason, DUP_FRAG);
+ err = -EINVAL;
+ goto err;
}
err = -EINVAL;
__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
@@ -237,7 +242,7 @@ discard_fq:
__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_REASMFAILS);
err:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return err;
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 69252eb462b2..2f355f0ec32a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -6555,10 +6555,16 @@ static void __net_exit ip6_route_net_exit(struct net *net)
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
- proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
- sizeof(struct ipv6_route_iter));
- proc_create_net_single("rt6_stats", 0444, net->proc_net,
- rt6_stats_seq_show, NULL);
+ if (!proc_create_net("ipv6_route", 0, net->proc_net,
+ &ipv6_route_seq_ops,
+ sizeof(struct ipv6_route_iter)))
+ return -ENOMEM;
+
+ if (!proc_create_net_single("rt6_stats", 0444, net->proc_net,
+ rt6_stats_seq_show, NULL)) {
+ remove_proc_entry("ipv6_route", net->proc_net);
+ return -ENOMEM;
+ }
#endif
return 0;
}
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 8370726ae7bf..487f8e98deaa 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -1644,13 +1644,13 @@ static int put_nla_counters(struct sk_buff *skb, struct seg6_local_lwt *slwt)
pcounters = per_cpu_ptr(slwt->pcpu_counters, i);
do {
- start = u64_stats_fetch_begin_irq(&pcounters->syncp);
+ start = u64_stats_fetch_begin(&pcounters->syncp);
packets = u64_stats_read(&pcounters->packets);
bytes = u64_stats_read(&pcounters->bytes);
errors = u64_stats_read(&pcounters->errors);
- } while (u64_stats_fetch_retry_irq(&pcounters->syncp, start));
+ } while (u64_stats_fetch_retry(&pcounters->syncp, start));
counters.packets += packets;
counters.bytes += bytes;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index d27683e3fc97..70d81bba5093 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -694,7 +694,7 @@ static int ipip6_rcv(struct sk_buff *skb)
skb->dev = tunnel->dev;
if (packet_is_spoofed(skb, iph, tunnel)) {
- tunnel->dev->stats.rx_errors++;
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto out;
}
@@ -714,8 +714,8 @@ static int ipip6_rcv(struct sk_buff *skb)
net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
&iph->saddr, iph->tos);
if (err > 1) {
- ++tunnel->dev->stats.rx_frame_errors;
- ++tunnel->dev->stats.rx_errors;
+ DEV_STATS_INC(tunnel->dev, rx_frame_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto out;
}
}
@@ -942,7 +942,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
if (!rt) {
rt = ip_route_output_flow(tunnel->net, &fl4, NULL);
if (IS_ERR(rt)) {
- dev->stats.tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
goto tx_error_icmp;
}
dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr);
@@ -950,14 +950,14 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
ip_rt_put(rt);
- dev->stats.tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
goto tx_error_icmp;
}
tdev = rt->dst.dev;
if (tdev == dev) {
ip_rt_put(rt);
- dev->stats.collisions++;
+ DEV_STATS_INC(dev, collisions);
goto tx_error;
}
@@ -970,7 +970,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
mtu = dst_mtu(&rt->dst) - t_hlen;
if (mtu < IPV4_MIN_MTU) {
- dev->stats.collisions++;
+ DEV_STATS_INC(dev, collisions);
ip_rt_put(rt);
goto tx_error;
}
@@ -1009,7 +1009,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
if (!new_skb) {
ip_rt_put(rt);
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -1039,7 +1039,7 @@ tx_error_icmp:
dst_link_failure(skb);
tx_error:
kfree_skb(skb);
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
return NETDEV_TX_OK;
}
@@ -1058,7 +1058,7 @@ static netdev_tx_t sit_tunnel_xmit__(struct sk_buff *skb,
return NETDEV_TX_OK;
tx_error:
kfree_skb(skb);
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
return NETDEV_TX_OK;
}
@@ -1087,7 +1087,7 @@ static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
tx_err:
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
kfree_skb(skb);
return NETDEV_TX_OK;
@@ -1124,10 +1124,12 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev)
if (tdev && !netif_is_l3_master(tdev)) {
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+ int mtu;
- dev->mtu = tdev->mtu - t_hlen;
- if (dev->mtu < IPV6_MIN_MTU)
- dev->mtu = IPV6_MIN_MTU;
+ mtu = tdev->mtu - t_hlen;
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ WRITE_ONCE(dev->mtu, mtu);
}
}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2a3f9296df1e..f52b6f271a24 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -292,24 +292,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
if (!saddr) {
- struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
- struct in6_addr prev_v6_rcv_saddr;
-
- if (icsk->icsk_bind2_hash) {
- prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo,
- sk, net, inet->inet_num);
- prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
- }
saddr = &fl6.saddr;
- sk->sk_v6_rcv_saddr = *saddr;
- if (prev_addr_hashbucket) {
- err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
- if (err) {
- sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr;
- goto failure;
- }
- }
+ err = inet_bhash2_update_saddr(sk, saddr, AF_INET6);
+ if (err)
+ goto failure;
}
/* set the source address */
@@ -359,6 +346,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
late_failure:
tcp_set_state(sk, TCP_CLOSE);
+ inet_bhash2_reset_saddr(sk);
failure:
inet->inet_dport = 0;
sk->sk_route_caps = 0;
@@ -1966,12 +1954,6 @@ static int tcp_v6_init_sock(struct sock *sk)
return 0;
}
-static void tcp_v6_destroy_sock(struct sock *sk)
-{
- tcp_v4_destroy_sock(sk);
- inet6_destroy_sock(sk);
-}
-
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCPv6 sock list dumping. */
static void get_openreq6(struct seq_file *seq,
@@ -2164,7 +2146,7 @@ struct proto tcpv6_prot = {
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v6_init_sock,
- .destroy = tcp_v6_destroy_sock,
+ .destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 8d09f0ea5b8c..9fb2f33ee3a7 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -64,8 +64,9 @@ static void udpv6_destruct_sock(struct sock *sk)
int udpv6_init_sock(struct sock *sk)
{
- skb_queue_head_init(&udp_sk(sk)->reader_queue);
+ udp_lib_init_sock(sk);
sk->sk_destruct = udpv6_destruct_sock;
+ set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
return 0;
}
@@ -195,7 +196,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
result = lookup_reuseport(net, sk, skb,
saddr, sport, daddr, hnum);
/* Fall back to scoring if group has connections */
- if (result && !reuseport_has_conns(sk, false))
+ if (result && !reuseport_has_conns(sk))
return result;
result = result ? : sk;
@@ -216,7 +217,7 @@ static inline struct sock *udp6_lookup_run_bpf(struct net *net,
struct sock *sk, *reuse_sk;
bool no_reuseport;
- if (udptable != &udp_table)
+ if (udptable != net->ipv4.udp_table)
return NULL; /* only UDP is supported */
no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_UDP, saddr, sport,
@@ -297,10 +298,11 @@ struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
__be16 sport, __be16 dport)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct net *net = dev_net(skb->dev);
- return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
+ return __udp6_lib_lookup(net, &iph->saddr, sport,
&iph->daddr, dport, inet6_iif(skb),
- inet6_sdif(skb), &udp_table, NULL);
+ inet6_sdif(skb), net->ipv4.udp_table, NULL);
}
/* Must be called under rcu_read_lock().
@@ -313,7 +315,7 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be
struct sock *sk;
sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport,
- dif, 0, &udp_table, NULL);
+ dif, 0, net->ipv4.udp_table, NULL);
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
return sk;
@@ -631,7 +633,8 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
/* Tunnels don't have an application socket: don't pass errors back */
if (tunnel) {
if (udp_sk(sk)->encap_err_rcv)
- udp_sk(sk)->encap_err_rcv(sk, skb, offset);
+ udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest,
+ ntohl(info), (u8 *)(uh+1));
goto out;
}
@@ -687,7 +690,8 @@ static __inline__ int udpv6_err(struct sk_buff *skb,
struct inet6_skb_parm *opt, u8 type,
u8 code, int offset, __be32 info)
{
- return __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
+ return __udp6_lib_err(skb, opt, type, code, offset, info,
+ dev_net(skb->dev)->ipv4.udp_table);
}
static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
@@ -1061,13 +1065,18 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
__be16 rmt_port, const struct in6_addr *rmt_addr,
int dif, int sdif)
{
+ struct udp_table *udptable = net->ipv4.udp_table;
unsigned short hnum = ntohs(loc_port);
- unsigned int hash2 = ipv6_portaddr_hash(net, loc_addr, hnum);
- unsigned int slot2 = hash2 & udp_table.mask;
- struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
- const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
+ unsigned int hash2, slot2;
+ struct udp_hslot *hslot2;
+ __portpair ports;
struct sock *sk;
+ hash2 = ipv6_portaddr_hash(net, loc_addr, hnum);
+ slot2 = hash2 & udptable->mask;
+ hslot2 = &udptable->hash2[slot2];
+ ports = INET_COMBINED_PORTS(rmt_port, hnum);
+
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
if (sk->sk_state == TCP_ESTABLISHED &&
inet6_match(net, sk, rmt_addr, loc_addr, ports, dif, sdif))
@@ -1121,7 +1130,7 @@ void udp_v6_early_demux(struct sk_buff *skb)
INDIRECT_CALLABLE_SCOPE int udpv6_rcv(struct sk_buff *skb)
{
- return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP);
+ return __udp6_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP);
}
/*
@@ -1638,6 +1647,7 @@ do_confirm:
err = 0;
goto out;
}
+EXPORT_SYMBOL(udpv6_sendmsg);
void udpv6_destroy_sock(struct sock *sk)
{
@@ -1661,8 +1671,6 @@ void udpv6_destroy_sock(struct sock *sk)
udp_encap_disable();
}
}
-
- inet6_destroy_sock(sk);
}
/*
@@ -1671,7 +1679,7 @@ void udpv6_destroy_sock(struct sock *sk)
int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen)
{
- if (level == SOL_UDP || level == SOL_UDPLITE)
+ if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET)
return udp_lib_setsockopt(sk, level, optname,
optval, optlen,
udp_v6_push_pending_frames);
@@ -1719,7 +1727,7 @@ EXPORT_SYMBOL(udp6_seq_ops);
static struct udp_seq_afinfo udp6_seq_afinfo = {
.family = AF_INET6,
- .udp_table = &udp_table,
+ .udp_table = NULL,
};
int __net_init udp6_proc_init(struct net *net)
@@ -1769,7 +1777,7 @@ struct proto udpv6_prot = {
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
.obj_size = sizeof(struct udp6_sock),
- .h.udp_table = &udp_table,
+ .h.udp_table = NULL,
.diag_destroy = udp_abort,
};
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 7720d04ed396..e0e10f6bcdc1 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -116,10 +116,11 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
__be16 dport)
{
const struct ipv6hdr *iph = skb_gro_network_header(skb);
+ struct net *net = dev_net(skb->dev);
- return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
+ return __udp6_lib_lookup(net, &iph->saddr, sport,
&iph->daddr, dport, inet6_iif(skb),
- inet6_sdif(skb), &udp_table, NULL);
+ inet6_sdif(skb), net->ipv4.udp_table, NULL);
}
INDIRECT_CALLABLE_SCOPE
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 4a4b0e49ec92..ea435eba3053 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -287,9 +287,13 @@ int __init xfrm6_init(void)
if (ret)
goto out_state;
- register_pernet_subsys(&xfrm6_net_ops);
+ ret = register_pernet_subsys(&xfrm6_net_ops);
+ if (ret)
+ goto out_protocol;
out:
return ret;
+out_protocol:
+ xfrm6_protocol_fini();
out_state:
xfrm6_state_fini();
out_policy:
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 27725464ec08..890a2423f559 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -162,7 +162,8 @@ static void kcm_rcv_ready(struct kcm_sock *kcm)
/* Buffer limit is okay now, add to ready list */
list_add_tail(&kcm->wait_rx_list,
&kcm->mux->kcm_rx_waiters);
- kcm->rx_wait = true;
+ /* paired with lockless reads in kcm_rfree() */
+ WRITE_ONCE(kcm->rx_wait, true);
}
static void kcm_rfree(struct sk_buff *skb)
@@ -178,7 +179,7 @@ static void kcm_rfree(struct sk_buff *skb)
/* For reading rx_wait and rx_psock without holding lock */
smp_mb__after_atomic();
- if (!kcm->rx_wait && !kcm->rx_psock &&
+ if (!READ_ONCE(kcm->rx_wait) && !READ_ONCE(kcm->rx_psock) &&
sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) {
spin_lock_bh(&mux->rx_lock);
kcm_rcv_ready(kcm);
@@ -221,7 +222,7 @@ static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head)
struct sk_buff *skb;
struct kcm_sock *kcm;
- while ((skb = __skb_dequeue(head))) {
+ while ((skb = skb_dequeue(head))) {
/* Reset destructor to avoid calling kcm_rcv_ready */
skb->destructor = sock_rfree;
skb_orphan(skb);
@@ -237,7 +238,8 @@ try_again:
if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
/* Should mean socket buffer full */
list_del(&kcm->wait_rx_list);
- kcm->rx_wait = false;
+ /* paired with lockless reads in kcm_rfree() */
+ WRITE_ONCE(kcm->rx_wait, false);
/* Commit rx_wait to read in kcm_free */
smp_wmb();
@@ -280,10 +282,12 @@ static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
kcm = list_first_entry(&mux->kcm_rx_waiters,
struct kcm_sock, wait_rx_list);
list_del(&kcm->wait_rx_list);
- kcm->rx_wait = false;
+ /* paired with lockless reads in kcm_rfree() */
+ WRITE_ONCE(kcm->rx_wait, false);
psock->rx_kcm = kcm;
- kcm->rx_psock = psock;
+ /* paired with lockless reads in kcm_rfree() */
+ WRITE_ONCE(kcm->rx_psock, psock);
spin_unlock_bh(&mux->rx_lock);
@@ -310,7 +314,8 @@ static void unreserve_rx_kcm(struct kcm_psock *psock,
spin_lock_bh(&mux->rx_lock);
psock->rx_kcm = NULL;
- kcm->rx_psock = NULL;
+ /* paired with lockless reads in kcm_rfree() */
+ WRITE_ONCE(kcm->rx_psock, NULL);
/* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
* kcm_rfree
@@ -834,7 +839,7 @@ static ssize_t kcm_sendpage(struct socket *sock, struct page *page,
}
get_page(page);
- skb_fill_page_desc(skb, i, page, offset, size);
+ skb_fill_page_desc_noacc(skb, i, page, offset, size);
skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
coalesced:
@@ -1080,53 +1085,17 @@ out_error:
return err;
}
-static struct sk_buff *kcm_wait_data(struct sock *sk, int flags,
- long timeo, int *err)
-{
- struct sk_buff *skb;
-
- while (!(skb = skb_peek(&sk->sk_receive_queue))) {
- if (sk->sk_err) {
- *err = sock_error(sk);
- return NULL;
- }
-
- if (sock_flag(sk, SOCK_DONE))
- return NULL;
-
- if ((flags & MSG_DONTWAIT) || !timeo) {
- *err = -EAGAIN;
- return NULL;
- }
-
- sk_wait_data(sk, &timeo, NULL);
-
- /* Handle signals */
- if (signal_pending(current)) {
- *err = sock_intr_errno(timeo);
- return NULL;
- }
- }
-
- return skb;
-}
-
static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
size_t len, int flags)
{
struct sock *sk = sock->sk;
struct kcm_sock *kcm = kcm_sk(sk);
int err = 0;
- long timeo;
struct strp_msg *stm;
int copied = 0;
struct sk_buff *skb;
- timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
-
- lock_sock(sk);
-
- skb = kcm_wait_data(sk, flags, timeo, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto out;
@@ -1157,14 +1126,11 @@ msg_finished:
/* Finished with message */
msg->msg_flags |= MSG_EOR;
KCM_STATS_INCR(kcm->stats.rx_msgs);
- skb_unlink(skb, &sk->sk_receive_queue);
- kfree_skb(skb);
}
}
out:
- release_sock(sk);
-
+ skb_free_datagram(sk, skb);
return copied ? : err;
}
@@ -1174,7 +1140,6 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
{
struct sock *sk = sock->sk;
struct kcm_sock *kcm = kcm_sk(sk);
- long timeo;
struct strp_msg *stm;
int err = 0;
ssize_t copied;
@@ -1182,11 +1147,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
/* Only support splice for SOCKSEQPACKET */
- timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
-
- lock_sock(sk);
-
- skb = kcm_wait_data(sk, flags, timeo, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto err_out;
@@ -1214,13 +1175,11 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
* finish reading the message.
*/
- release_sock(sk);
-
+ skb_free_datagram(sk, skb);
return copied;
err_out:
- release_sock(sk);
-
+ skb_free_datagram(sk, skb);
return err;
}
@@ -1240,7 +1199,8 @@ static void kcm_recv_disable(struct kcm_sock *kcm)
if (!kcm->rx_psock) {
if (kcm->rx_wait) {
list_del(&kcm->wait_rx_list);
- kcm->rx_wait = false;
+ /* paired with lockless reads in kcm_rfree() */
+ WRITE_ONCE(kcm->rx_wait, false);
}
requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
@@ -1793,7 +1753,8 @@ static void kcm_done(struct kcm_sock *kcm)
if (kcm->rx_wait) {
list_del(&kcm->wait_rx_list);
- kcm->rx_wait = false;
+ /* paired with lockless reads in kcm_rfree() */
+ WRITE_ONCE(kcm->rx_wait, false);
}
/* Move any pending receive messages to other kcm sockets */
requeue_rx_msgs(mux, &sk->sk_receive_queue);
diff --git a/net/key/af_key.c b/net/key/af_key.c
index e1d2155605aa..2bdbcec781cd 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -2905,7 +2905,7 @@ static int count_ah_combs(const struct xfrm_tmpl *t)
break;
if (!aalg->pfkey_supported)
continue;
- if (aalg_tmpl_set(t, aalg) && aalg->available)
+ if (aalg_tmpl_set(t, aalg))
sz += sizeof(struct sadb_comb);
}
return sz + sizeof(struct sadb_prop);
@@ -2923,7 +2923,7 @@ static int count_esp_combs(const struct xfrm_tmpl *t)
if (!ealg->pfkey_supported)
continue;
- if (!(ealg_tmpl_set(t, ealg) && ealg->available))
+ if (!(ealg_tmpl_set(t, ealg)))
continue;
for (k = 1; ; k++) {
@@ -2934,16 +2934,17 @@ static int count_esp_combs(const struct xfrm_tmpl *t)
if (!aalg->pfkey_supported)
continue;
- if (aalg_tmpl_set(t, aalg) && aalg->available)
+ if (aalg_tmpl_set(t, aalg))
sz += sizeof(struct sadb_comb);
}
}
return sz + sizeof(struct sadb_prop);
}
-static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
+static int dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
{
struct sadb_prop *p;
+ int sz = 0;
int i;
p = skb_put(skb, sizeof(struct sadb_prop));
@@ -2971,13 +2972,17 @@ static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
c->sadb_comb_soft_addtime = 20*60*60;
c->sadb_comb_hard_usetime = 8*60*60;
c->sadb_comb_soft_usetime = 7*60*60;
+ sz += sizeof(*c);
}
}
+
+ return sz + sizeof(*p);
}
-static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
+static int dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
{
struct sadb_prop *p;
+ int sz = 0;
int i, k;
p = skb_put(skb, sizeof(struct sadb_prop));
@@ -3019,8 +3024,11 @@ static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
c->sadb_comb_soft_addtime = 20*60*60;
c->sadb_comb_hard_usetime = 8*60*60;
c->sadb_comb_soft_usetime = 7*60*60;
+ sz += sizeof(*c);
}
}
+
+ return sz + sizeof(*p);
}
static int key_notify_policy_expire(struct xfrm_policy *xp, const struct km_event *c)
@@ -3150,6 +3158,7 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct
struct sadb_x_sec_ctx *sec_ctx;
struct xfrm_sec_ctx *xfrm_ctx;
int ctx_size = 0;
+ int alg_size = 0;
sockaddr_size = pfkey_sockaddr_size(x->props.family);
if (!sockaddr_size)
@@ -3161,16 +3170,16 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct
sizeof(struct sadb_x_policy);
if (x->id.proto == IPPROTO_AH)
- size += count_ah_combs(t);
+ alg_size = count_ah_combs(t);
else if (x->id.proto == IPPROTO_ESP)
- size += count_esp_combs(t);
+ alg_size = count_esp_combs(t);
if ((xfrm_ctx = x->security)) {
ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len);
size += sizeof(struct sadb_x_sec_ctx) + ctx_size;
}
- skb = alloc_skb(size + 16, GFP_ATOMIC);
+ skb = alloc_skb(size + alg_size + 16, GFP_ATOMIC);
if (skb == NULL)
return -ENOMEM;
@@ -3224,10 +3233,13 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct
pol->sadb_x_policy_priority = xp->priority;
/* Set sadb_comb's. */
+ alg_size = 0;
if (x->id.proto == IPPROTO_AH)
- dump_ah_combs(skb, t);
+ alg_size = dump_ah_combs(skb, t);
else if (x->id.proto == IPPROTO_ESP)
- dump_esp_combs(skb, t);
+ alg_size = dump_esp_combs(skb, t);
+
+ hdr->sadb_msg_len += alg_size / 8;
/* security context */
if (xfrm_ctx) {
@@ -3382,7 +3394,7 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr,
hdr->sadb_msg_len = size / sizeof(uint64_t);
hdr->sadb_msg_errno = 0;
hdr->sadb_msg_reserved = 0;
- hdr->sadb_msg_seq = x->km.seq = get_acqseq();
+ hdr->sadb_msg_seq = x->km.seq;
hdr->sadb_msg_pid = 0;
/* SA */
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 7499c51b1850..9a1415fe3fa7 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1150,8 +1150,10 @@ static void l2tp_tunnel_destruct(struct sock *sk)
}
/* Remove hooks into tunnel socket */
+ write_lock_bh(&sk->sk_callback_lock);
sk->sk_destruct = tunnel->old_sk_destruct;
sk->sk_user_data = NULL;
+ write_unlock_bh(&sk->sk_callback_lock);
/* Call the original destructor */
if (sk->sk_destruct)
@@ -1469,16 +1471,19 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
sock = sockfd_lookup(tunnel->fd, &ret);
if (!sock)
goto err;
-
- ret = l2tp_validate_socket(sock->sk, net, tunnel->encap);
- if (ret < 0)
- goto err_sock;
}
+ sk = sock->sk;
+ write_lock_bh(&sk->sk_callback_lock);
+ ret = l2tp_validate_socket(sk, net, tunnel->encap);
+ if (ret < 0)
+ goto err_inval_sock;
+ rcu_assign_sk_user_data(sk, tunnel);
+ write_unlock_bh(&sk->sk_callback_lock);
+
tunnel->l2tp_net = net;
pn = l2tp_pernet(net);
- sk = sock->sk;
sock_hold(sk);
tunnel->sock = sk;
@@ -1503,8 +1508,6 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
};
setup_udp_tunnel_sock(net, sock, &udp_cfg);
- } else {
- sk->sk_user_data = tunnel;
}
tunnel->old_sk_destruct = sk->sk_destruct;
@@ -1521,6 +1524,11 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
return 0;
err_sock:
+ write_lock_bh(&sk->sk_callback_lock);
+ rcu_assign_sk_user_data(sk, NULL);
+err_inval_sock:
+ write_unlock_bh(&sk->sk_callback_lock);
+
if (tunnel->fd < 0)
sock_release(sock);
else
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 9dbd801ddb98..2478aa60145f 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -257,8 +257,6 @@ static void l2tp_ip6_destroy_sock(struct sock *sk)
if (tunnel)
l2tp_tunnel_delete(tunnel);
-
- inet6_destroy_sock(sk);
}
static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 9414d3bbd65f..c6fa53230450 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -183,34 +183,15 @@ static void ieee80211_add_addbaext(struct ieee80211_sub_if_data *sdata,
const struct ieee80211_addba_ext_ie *req,
u16 buf_size)
{
- struct ieee80211_supported_band *sband;
struct ieee80211_addba_ext_ie *resp;
- const struct ieee80211_sta_he_cap *he_cap;
- u8 frag_level, cap_frag_level;
u8 *pos;
- sband = ieee80211_get_sband(sdata);
- if (!sband)
- return;
- he_cap = ieee80211_get_he_iftype_cap(sband,
- ieee80211_vif_type_p2p(&sdata->vif));
- if (!he_cap)
- return;
-
pos = skb_put_zero(skb, 2 + sizeof(struct ieee80211_addba_ext_ie));
*pos++ = WLAN_EID_ADDBA_EXT;
*pos++ = sizeof(struct ieee80211_addba_ext_ie);
resp = (struct ieee80211_addba_ext_ie *)pos;
resp->data = req->data & IEEE80211_ADDBA_EXT_NO_FRAG;
- frag_level = u32_get_bits(req->data,
- IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK);
- cap_frag_level = u32_get_bits(he_cap->he_cap_elem.mac_cap_info[0],
- IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK);
- if (frag_level > cap_frag_level)
- frag_level = cap_frag_level;
- resp->data |= u8_encode_bits(frag_level,
- IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK);
resp->data |= u8_encode_bits(buf_size >> IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT,
IEEE80211_ADDBA_EXT_BUF_SIZE_MASK);
}
@@ -242,7 +223,7 @@ static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid,
sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
else if (sdata->vif.type == NL80211_IFTYPE_STATION)
- memcpy(mgmt->bssid, sdata->deflink.u.mgd.bssid, ETH_ALEN);
+ memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN);
else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
memcpy(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN);
@@ -297,9 +278,9 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
}
if (!sta->sta.deflink.ht_cap.ht_supported &&
- sta->sdata->vif.bss_conf.chandef.chan->band != NL80211_BAND_6GHZ) {
+ !sta->sta.deflink.he_cap.has_he) {
ht_dbg(sta->sdata,
- "STA %pM erroneously requests BA session on tid %d w/o QoS\n",
+ "STA %pM erroneously requests BA session on tid %d w/o HT\n",
sta->sta.addr, tid);
/* send a response anyway, it's an error case if we get here */
goto end;
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 07c892aa8c73..9c40f8d3bce8 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -82,7 +82,7 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
else if (sdata->vif.type == NL80211_IFTYPE_STATION)
- memcpy(mgmt->bssid, sdata->deflink.u.mgd.bssid, ETH_ALEN);
+ memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN);
else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
memcpy(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN);
diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c
index 2e66598fac79..e8ebd343e2bf 100644
--- a/net/mac80211/airtime.c
+++ b/net/mac80211/airtime.c
@@ -452,6 +452,9 @@ static u32 ieee80211_get_rate_duration(struct ieee80211_hw *hw,
(status->encoding == RX_ENC_HE && streams > 8)))
return 0;
+ if (idx >= MCS_GROUP_RATES)
+ return 0;
+
duration = airtime_mcs_groups[group].duration[idx];
duration <<= airtime_mcs_groups[group].shift;
*overhead = 36 + (streams << 2);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 687b4c878d4a..c848fe04dd44 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -2554,47 +2554,50 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
struct bss_parameters *params)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_link_data *link;
struct ieee80211_supported_band *sband;
u32 changed = 0;
- if (!sdata_dereference(sdata->deflink.u.ap.beacon, sdata))
+ link = ieee80211_link_or_deflink(sdata, params->link_id, true);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+
+ if (!sdata_dereference(link->u.ap.beacon, sdata))
return -ENOENT;
- sband = ieee80211_get_sband(sdata);
+ sband = ieee80211_get_link_sband(link);
if (!sband)
return -EINVAL;
if (params->use_cts_prot >= 0) {
- sdata->vif.bss_conf.use_cts_prot = params->use_cts_prot;
+ link->conf->use_cts_prot = params->use_cts_prot;
changed |= BSS_CHANGED_ERP_CTS_PROT;
}
if (params->use_short_preamble >= 0) {
- sdata->vif.bss_conf.use_short_preamble =
- params->use_short_preamble;
+ link->conf->use_short_preamble = params->use_short_preamble;
changed |= BSS_CHANGED_ERP_PREAMBLE;
}
- if (!sdata->vif.bss_conf.use_short_slot &&
+ if (!link->conf->use_short_slot &&
(sband->band == NL80211_BAND_5GHZ ||
sband->band == NL80211_BAND_6GHZ)) {
- sdata->vif.bss_conf.use_short_slot = true;
+ link->conf->use_short_slot = true;
changed |= BSS_CHANGED_ERP_SLOT;
}
if (params->use_short_slot_time >= 0) {
- sdata->vif.bss_conf.use_short_slot =
- params->use_short_slot_time;
+ link->conf->use_short_slot = params->use_short_slot_time;
changed |= BSS_CHANGED_ERP_SLOT;
}
if (params->basic_rates) {
- ieee80211_parse_bitrates(sdata->vif.bss_conf.chandef.width,
+ ieee80211_parse_bitrates(link->conf->chandef.width,
wiphy->bands[sband->band],
params->basic_rates,
params->basic_rates_len,
- &sdata->vif.bss_conf.basic_rates);
+ &link->conf->basic_rates);
changed |= BSS_CHANGED_BASIC_RATES;
- ieee80211_check_rate_mask(&sdata->deflink);
+ ieee80211_check_rate_mask(link);
}
if (params->ap_isolate >= 0) {
@@ -2606,30 +2609,29 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
}
if (params->ht_opmode >= 0) {
- sdata->vif.bss_conf.ht_operation_mode =
- (u16) params->ht_opmode;
+ link->conf->ht_operation_mode = (u16)params->ht_opmode;
changed |= BSS_CHANGED_HT;
}
if (params->p2p_ctwindow >= 0) {
- sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow &=
+ link->conf->p2p_noa_attr.oppps_ctwindow &=
~IEEE80211_P2P_OPPPS_CTWINDOW_MASK;
- sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |=
+ link->conf->p2p_noa_attr.oppps_ctwindow |=
params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK;
changed |= BSS_CHANGED_P2P_PS;
}
if (params->p2p_opp_ps > 0) {
- sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |=
+ link->conf->p2p_noa_attr.oppps_ctwindow |=
IEEE80211_P2P_OPPPS_ENABLE_BIT;
changed |= BSS_CHANGED_P2P_PS;
} else if (params->p2p_opp_ps == 0) {
- sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow &=
+ link->conf->p2p_noa_attr.oppps_ctwindow &=
~IEEE80211_P2P_OPPPS_ENABLE_BIT;
changed |= BSS_CHANGED_P2P_PS;
}
- ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed);
+ ieee80211_link_info_change_notify(sdata, link, changed);
return 0;
}
@@ -4338,9 +4340,6 @@ static int ieee80211_get_txq_stats(struct wiphy *wiphy,
struct ieee80211_sub_if_data *sdata;
int ret = 0;
- if (!local->ops->wake_tx_queue)
- return 1;
-
spin_lock_bh(&local->fq.lock);
rcu_read_lock();
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 78c7d60e8667..dfb9f55e2685 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -663,9 +663,7 @@ void debugfs_hw_add(struct ieee80211_local *local)
DEBUGFS_ADD_MODE(force_tx_status, 0600);
DEBUGFS_ADD_MODE(aql_enable, 0600);
DEBUGFS_ADD(aql_pending);
-
- if (local->ops->wake_tx_queue)
- DEBUGFS_ADD_MODE(aqm, 0600);
+ DEBUGFS_ADD_MODE(aqm, 0600);
DEBUGFS_ADD_MODE(airtime_flags, 0600);
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 5b014786fd2d..c87e1137e5da 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -677,8 +677,7 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata)
DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz);
DEBUGFS_ADD(hw_queues);
- if (sdata->local->ops->wake_tx_queue &&
- sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
+ if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
sdata->vif.type != NL80211_IFTYPE_NAN)
DEBUGFS_ADD(aqm);
}
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index d3397c1248d3..7a3d7893e19d 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -5,7 +5,7 @@
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright(c) 2016 Intel Deutschland GmbH
- * Copyright (C) 2018 - 2021 Intel Corporation
+ * Copyright (C) 2018 - 2022 Intel Corporation
*/
#include <linux/debugfs.h>
@@ -435,8 +435,29 @@ static ssize_t sta_agg_status_write(struct file *file, const char __user *userbu
}
STA_OPS_RW(agg_status);
-static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf,
- size_t count, loff_t *ppos)
+/* link sta attributes */
+#define LINK_STA_OPS(name) \
+static const struct file_operations link_sta_ ##name## _ops = { \
+ .read = link_sta_##name##_read, \
+ .open = simple_open, \
+ .llseek = generic_file_llseek, \
+}
+
+static ssize_t link_sta_addr_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct link_sta_info *link_sta = file->private_data;
+ u8 mac[3 * ETH_ALEN + 1];
+
+ snprintf(mac, sizeof(mac), "%pM\n", link_sta->pub->addr);
+
+ return simple_read_from_buffer(userbuf, count, ppos, mac, 3 * ETH_ALEN);
+}
+
+LINK_STA_OPS(addr);
+
+static ssize_t link_sta_ht_capa_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
{
#define PRINT_HT_CAP(_cond, _str) \
do { \
@@ -446,8 +467,8 @@ static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf,
char *buf, *p;
int i;
ssize_t bufsz = 512;
- struct sta_info *sta = file->private_data;
- struct ieee80211_sta_ht_cap *htc = &sta->sta.deflink.ht_cap;
+ struct link_sta_info *link_sta = file->private_data;
+ struct ieee80211_sta_ht_cap *htc = &link_sta->pub->ht_cap;
ssize_t ret;
buf = kzalloc(bufsz, GFP_KERNEL);
@@ -524,14 +545,14 @@ static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf,
kfree(buf);
return ret;
}
-STA_OPS(ht_capa);
+LINK_STA_OPS(ht_capa);
-static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf,
- size_t count, loff_t *ppos)
+static ssize_t link_sta_vht_capa_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
{
char *buf, *p;
- struct sta_info *sta = file->private_data;
- struct ieee80211_sta_vht_cap *vhtc = &sta->sta.deflink.vht_cap;
+ struct link_sta_info *link_sta = file->private_data;
+ struct ieee80211_sta_vht_cap *vhtc = &link_sta->pub->vht_cap;
ssize_t ret;
ssize_t bufsz = 512;
@@ -638,15 +659,15 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf,
kfree(buf);
return ret;
}
-STA_OPS(vht_capa);
+LINK_STA_OPS(vht_capa);
-static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf,
- size_t count, loff_t *ppos)
+static ssize_t link_sta_he_capa_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
{
char *buf, *p;
size_t buf_sz = PAGE_SIZE;
- struct sta_info *sta = file->private_data;
- struct ieee80211_sta_he_cap *hec = &sta->sta.deflink.he_cap;
+ struct link_sta_info *link_sta = file->private_data;
+ struct ieee80211_sta_he_cap *hec = &link_sta->pub->he_cap;
struct ieee80211_he_mcs_nss_supp *nss = &hec->he_mcs_nss_supp;
u8 ppe_size;
u8 *cap;
@@ -1011,7 +1032,7 @@ out:
kfree(buf);
return ret;
}
-STA_OPS(he_capa);
+LINK_STA_OPS(he_capa);
#define DEBUGFS_ADD(name) \
debugfs_create_file(#name, 0400, \
@@ -1048,18 +1069,11 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
DEBUGFS_ADD(num_ps_buf_frames);
DEBUGFS_ADD(last_seq_ctrl);
DEBUGFS_ADD(agg_status);
- DEBUGFS_ADD(ht_capa);
- DEBUGFS_ADD(vht_capa);
- DEBUGFS_ADD(he_capa);
-
- DEBUGFS_ADD_COUNTER(rx_duplicates, deflink.rx_stats.num_duplicates);
- DEBUGFS_ADD_COUNTER(rx_fragments, deflink.rx_stats.fragments);
+ /* FIXME: Kept here as the statistics are only done on the deflink */
DEBUGFS_ADD_COUNTER(tx_filtered, deflink.status_stats.filtered);
- if (local->ops->wake_tx_queue) {
- DEBUGFS_ADD(aqm);
- DEBUGFS_ADD(airtime);
- }
+ DEBUGFS_ADD(aqm);
+ DEBUGFS_ADD(airtime);
if (wiphy_ext_feature_isset(local->hw.wiphy,
NL80211_EXT_FEATURE_AQL))
@@ -1076,3 +1090,85 @@ void ieee80211_sta_debugfs_remove(struct sta_info *sta)
debugfs_remove_recursive(sta->debugfs_dir);
sta->debugfs_dir = NULL;
}
+
+#undef DEBUGFS_ADD
+#undef DEBUGFS_ADD_COUNTER
+
+#define DEBUGFS_ADD(name) \
+ debugfs_create_file(#name, 0400, \
+ link_sta->debugfs_dir, link_sta, &link_sta_ ##name## _ops)
+#define DEBUGFS_ADD_COUNTER(name, field) \
+ debugfs_create_ulong(#name, 0400, link_sta->debugfs_dir, &link_sta->field)
+
+void ieee80211_link_sta_debugfs_add(struct link_sta_info *link_sta)
+{
+ if (WARN_ON(!link_sta->sta->debugfs_dir))
+ return;
+
+ /* For non-MLO, leave the files in the main directory. */
+ if (link_sta->sta->sta.valid_links) {
+ char link_dir_name[10];
+
+ snprintf(link_dir_name, sizeof(link_dir_name),
+ "link-%d", link_sta->link_id);
+
+ link_sta->debugfs_dir =
+ debugfs_create_dir(link_dir_name,
+ link_sta->sta->debugfs_dir);
+
+ DEBUGFS_ADD(addr);
+ } else {
+ if (WARN_ON(link_sta != &link_sta->sta->deflink))
+ return;
+
+ link_sta->debugfs_dir = link_sta->sta->debugfs_dir;
+ }
+
+ DEBUGFS_ADD(ht_capa);
+ DEBUGFS_ADD(vht_capa);
+ DEBUGFS_ADD(he_capa);
+
+ DEBUGFS_ADD_COUNTER(rx_duplicates, rx_stats.num_duplicates);
+ DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments);
+}
+
+void ieee80211_link_sta_debugfs_remove(struct link_sta_info *link_sta)
+{
+ if (!link_sta->debugfs_dir || !link_sta->sta->debugfs_dir) {
+ link_sta->debugfs_dir = NULL;
+ return;
+ }
+
+ if (link_sta->debugfs_dir == link_sta->sta->debugfs_dir) {
+ WARN_ON(link_sta != &link_sta->sta->deflink);
+ link_sta->sta->debugfs_dir = NULL;
+ return;
+ }
+
+ debugfs_remove_recursive(link_sta->debugfs_dir);
+ link_sta->debugfs_dir = NULL;
+}
+
+void ieee80211_link_sta_debugfs_drv_add(struct link_sta_info *link_sta)
+{
+ if (WARN_ON(!link_sta->debugfs_dir))
+ return;
+
+ drv_link_sta_add_debugfs(link_sta->sta->local, link_sta->sta->sdata,
+ link_sta->pub, link_sta->debugfs_dir);
+}
+
+void ieee80211_link_sta_debugfs_drv_remove(struct link_sta_info *link_sta)
+{
+ if (!link_sta->debugfs_dir)
+ return;
+
+ if (WARN_ON(link_sta->debugfs_dir == link_sta->sta->debugfs_dir))
+ return;
+
+ /* Recreate the directory excluding the driver data */
+ debugfs_remove_recursive(link_sta->debugfs_dir);
+ link_sta->debugfs_dir = NULL;
+
+ ieee80211_link_sta_debugfs_add(link_sta);
+}
diff --git a/net/mac80211/debugfs_sta.h b/net/mac80211/debugfs_sta.h
index d2e7c27ad6d1..cde8148bdb18 100644
--- a/net/mac80211/debugfs_sta.h
+++ b/net/mac80211/debugfs_sta.h
@@ -7,9 +7,21 @@
#ifdef CONFIG_MAC80211_DEBUGFS
void ieee80211_sta_debugfs_add(struct sta_info *sta);
void ieee80211_sta_debugfs_remove(struct sta_info *sta);
+
+void ieee80211_link_sta_debugfs_add(struct link_sta_info *link_sta);
+void ieee80211_link_sta_debugfs_remove(struct link_sta_info *link_sta);
+
+void ieee80211_link_sta_debugfs_drv_add(struct link_sta_info *link_sta);
+void ieee80211_link_sta_debugfs_drv_remove(struct link_sta_info *link_sta);
#else
static inline void ieee80211_sta_debugfs_add(struct sta_info *sta) {}
static inline void ieee80211_sta_debugfs_remove(struct sta_info *sta) {}
+
+static inline void ieee80211_link_sta_debugfs_add(struct link_sta_info *link_sta) {}
+static inline void ieee80211_link_sta_debugfs_remove(struct link_sta_info *link_sta) {}
+
+static inline void ieee80211_link_sta_debugfs_drv_add(struct link_sta_info *link_sta) {}
+static inline void ieee80211_link_sta_debugfs_drv_remove(struct link_sta_info *link_sta) {}
#endif
#endif /* __MAC80211_DEBUGFS_STA_H */
diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c
index 5392ffa18270..d737db4e07e2 100644
--- a/net/mac80211/driver-ops.c
+++ b/net/mac80211/driver-ops.c
@@ -7,6 +7,7 @@
#include "ieee80211_i.h"
#include "trace.h"
#include "driver-ops.h"
+#include "debugfs_sta.h"
int drv_start(struct ieee80211_local *local)
{
@@ -497,6 +498,11 @@ int drv_change_sta_links(struct ieee80211_local *local,
struct ieee80211_sta *sta,
u16 old_links, u16 new_links)
{
+ struct sta_info *info = container_of(sta, struct sta_info, sta);
+ struct link_sta_info *link_sta;
+ unsigned long links_to_add;
+ unsigned long links_to_rem;
+ unsigned int link_id;
int ret = -EOPNOTSUPP;
might_sleep();
@@ -510,11 +516,30 @@ int drv_change_sta_links(struct ieee80211_local *local,
if (old_links == new_links)
return 0;
+ links_to_add = ~old_links & new_links;
+ links_to_rem = old_links & ~new_links;
+
+ for_each_set_bit(link_id, &links_to_rem, IEEE80211_MLD_MAX_NUM_LINKS) {
+ link_sta = rcu_dereference_protected(info->link[link_id],
+ lockdep_is_held(&local->sta_mtx));
+
+ ieee80211_link_sta_debugfs_drv_remove(link_sta);
+ }
+
trace_drv_change_sta_links(local, sdata, sta, old_links, new_links);
if (local->ops->change_sta_links)
ret = local->ops->change_sta_links(&local->hw, &sdata->vif, sta,
old_links, new_links);
trace_drv_return_int(local, ret);
- return ret;
+ if (ret)
+ return ret;
+
+ for_each_set_bit(link_id, &links_to_add, IEEE80211_MLD_MAX_NUM_LINKS) {
+ link_sta = rcu_dereference_protected(info->link[link_id],
+ lockdep_is_held(&local->sta_mtx));
+ ieee80211_link_sta_debugfs_drv_add(link_sta);
+ }
+
+ return 0;
}
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 81e40b0a3b16..809bad53e15b 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -480,6 +480,22 @@ static inline void drv_sta_add_debugfs(struct ieee80211_local *local,
local->ops->sta_add_debugfs(&local->hw, &sdata->vif,
sta, dir);
}
+
+static inline void drv_link_sta_add_debugfs(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_link_sta *link_sta,
+ struct dentry *dir)
+{
+ might_sleep();
+
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ if (local->ops->link_sta_add_debugfs)
+ local->ops->link_sta_add_debugfs(&local->hw, &sdata->vif,
+ link_sta, dir);
+}
#endif
static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index a842f2e1c230..63ff0d2524b6 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -390,6 +390,7 @@ struct ieee80211_mgd_auth_data {
bool done, waiting;
bool peer_confirmed;
bool timeout_started;
+ int link_id;
u8 ap_addr[ETH_ALEN] __aligned(2);
@@ -412,6 +413,8 @@ struct ieee80211_mgd_assoc_data {
u8 *elems; /* pointing to inside ie[] below */
ieee80211_conn_flags_t conn_flags;
+
+ u16 status;
} link[IEEE80211_MLD_MAX_NUM_LINKS];
u8 ap_addr[ETH_ALEN] __aligned(2);
@@ -1707,6 +1710,17 @@ struct ieee802_11_elems {
u8 tx_pwr_env_num;
u8 eht_cap_len;
+ /* multi-link element can be de-fragmented and thus u8 is not sufficient */
+ size_t multi_link_len;
+
+ /*
+ * store the per station profile pointer and length in case that the
+ * parsing also handled Multi-Link element parsing for a specific link
+ * ID.
+ */
+ struct ieee80211_mle_per_sta_profile *prof;
+ size_t sta_prof_len;
+
/* whether a parse error occurred while retrieving these elements */
bool parse_error;
@@ -2205,9 +2219,13 @@ static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata,
* represent a non-transmitting BSS in which case the data
* for that non-transmitting BSS is returned
* @link_id: the link ID to parse elements for, if a STA profile
- * is present in the multi-link element, or -1 to ignore
+ * is present in the multi-link element, or -1 to ignore;
+ * note that the code currently assumes parsing an association
+ * (or re-association) response frame if this is given
* @from_ap: frame is received from an AP (currently used only
* for EHT capabilities parsing)
+ * @scratch_len: if non zero, specifies the requested length of the scratch
+ * buffer; otherwise, 'len' is used.
*/
struct ieee80211_elems_parse_params {
const u8 *start;
@@ -2218,6 +2236,7 @@ struct ieee80211_elems_parse_params {
struct cfg80211_bss *bss;
int link_id;
bool from_ap;
+ size_t scratch_len;
};
struct ieee802_11_elems *
@@ -2288,7 +2307,6 @@ void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue,
enum queue_stop_reason reason,
bool refcounted);
-void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue);
void ieee80211_add_pending_skb(struct ieee80211_local *local,
struct sk_buff *skb);
void ieee80211_add_pending_skbs(struct ieee80211_local *local,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index dd9ac1f7d2ea..7c4ce716c939 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -458,12 +458,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
if (cancel_scan)
ieee80211_scan_cancel(local);
- /*
- * Stop TX on this interface first.
- */
- if (!local->ops->wake_tx_queue && sdata->dev)
- netif_tx_stop_all_queues(sdata->dev);
-
ieee80211_roc_purge(local, sdata);
switch (sdata->vif.type) {
@@ -811,13 +805,6 @@ static void ieee80211_uninit(struct net_device *dev)
ieee80211_teardown_sdata(IEEE80211_DEV_TO_SUB_IF(dev));
}
-static u16 ieee80211_netdev_select_queue(struct net_device *dev,
- struct sk_buff *skb,
- struct net_device *sb_dev)
-{
- return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
-}
-
static void
ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
@@ -831,7 +818,6 @@ static const struct net_device_ops ieee80211_dataif_ops = {
.ndo_start_xmit = ieee80211_subif_start_xmit,
.ndo_set_rx_mode = ieee80211_set_multicast_list,
.ndo_set_mac_address = ieee80211_change_mac,
- .ndo_select_queue = ieee80211_netdev_select_queue,
.ndo_get_stats64 = ieee80211_get_stats64,
};
@@ -939,7 +925,6 @@ static const struct net_device_ops ieee80211_dataif_8023_ops = {
.ndo_start_xmit = ieee80211_subif_start_xmit_8023,
.ndo_set_rx_mode = ieee80211_set_multicast_list,
.ndo_set_mac_address = ieee80211_change_mac,
- .ndo_select_queue = ieee80211_netdev_select_queue,
.ndo_get_stats64 = ieee80211_get_stats64,
.ndo_fill_forward_path = ieee80211_netdev_fill_forward_path,
};
@@ -1441,35 +1426,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
ieee80211_recalc_ps(local);
- if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
- sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
- local->ops->wake_tx_queue) {
- /* XXX: for AP_VLAN, actually track AP queues */
- if (dev)
- netif_tx_start_all_queues(dev);
- } else if (dev) {
- unsigned long flags;
- int n_acs = IEEE80211_NUM_ACS;
- int ac;
-
- if (local->hw.queues < IEEE80211_NUM_ACS)
- n_acs = 1;
-
- spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
- if (sdata->vif.cab_queue == IEEE80211_INVAL_HW_QUEUE ||
- (local->queue_stop_reasons[sdata->vif.cab_queue] == 0 &&
- skb_queue_empty(&local->pending[sdata->vif.cab_queue]))) {
- for (ac = 0; ac < n_acs; ac++) {
- int ac_queue = sdata->vif.hw_queue[ac];
-
- if (local->queue_stop_reasons[ac_queue] == 0 &&
- skb_queue_empty(&local->pending[ac_queue]))
- netif_start_subqueue(dev, ac);
- }
- }
- spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
- }
-
set_bit(SDATA_STATE_RUNNING, &sdata->state);
return 0;
@@ -1499,17 +1455,12 @@ static void ieee80211_if_setup(struct net_device *dev)
{
ether_setup(dev);
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->priv_flags |= IFF_NO_QUEUE;
dev->netdev_ops = &ieee80211_dataif_ops;
dev->needs_free_netdev = true;
dev->priv_destructor = ieee80211_if_free;
}
-static void ieee80211_if_setup_no_queue(struct net_device *dev)
-{
- ieee80211_if_setup(dev);
- dev->priv_flags |= IFF_NO_QUEUE;
-}
-
static void ieee80211_iface_process_skb(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb)
@@ -2094,9 +2045,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
struct net_device *ndev = NULL;
struct ieee80211_sub_if_data *sdata = NULL;
struct txq_info *txqi;
- void (*if_setup)(struct net_device *dev);
int ret, i;
- int txqs = 1;
ASSERT_RTNL();
@@ -2119,30 +2068,18 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
sizeof(void *));
int txq_size = 0;
- if (local->ops->wake_tx_queue &&
- type != NL80211_IFTYPE_AP_VLAN &&
+ if (type != NL80211_IFTYPE_AP_VLAN &&
(type != NL80211_IFTYPE_MONITOR ||
(params->flags & MONITOR_FLAG_ACTIVE)))
txq_size += sizeof(struct txq_info) +
local->hw.txq_data_size;
- if (local->ops->wake_tx_queue) {
- if_setup = ieee80211_if_setup_no_queue;
- } else {
- if_setup = ieee80211_if_setup;
- if (local->hw.queues >= IEEE80211_NUM_ACS)
- txqs = IEEE80211_NUM_ACS;
- }
-
ndev = alloc_netdev_mqs(size + txq_size,
name, name_assign_type,
- if_setup, txqs, 1);
+ ieee80211_if_setup, 1, 1);
if (!ndev)
return -ENOMEM;
- if (!local->ops->wake_tx_queue && local->hw.wiphy->tx_queue_len)
- ndev->tx_queue_len = local->hw.wiphy->tx_queue_len;
-
dev_net_set(ndev, wiphy_net(local->hw.wiphy));
ndev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
diff --git a/net/mac80211/link.c b/net/mac80211/link.c
index e309708abae8..d1f5a9f7c647 100644
--- a/net/mac80211/link.c
+++ b/net/mac80211/link.c
@@ -357,6 +357,11 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata,
list_for_each_entry(sta, &local->sta_list, list) {
if (sdata != sta->sdata)
continue;
+
+ /* this is very temporary, but do it anyway */
+ __ieee80211_sta_recalc_aggregates(sta,
+ old_active | active_links);
+
ret = drv_change_sta_links(local, sdata, &sta->sta,
old_active,
old_active | active_links);
@@ -369,10 +374,22 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata,
list_for_each_entry(sta, &local->sta_list, list) {
if (sdata != sta->sdata)
continue;
+
+ __ieee80211_sta_recalc_aggregates(sta, active_links);
+
ret = drv_change_sta_links(local, sdata, &sta->sta,
old_active | active_links,
active_links);
WARN_ON_ONCE(ret);
+
+ /*
+ * Do it again, just in case - the driver might very
+ * well have called ieee80211_sta_recalc_aggregates()
+ * from there when filling in the new links, which
+ * would set it wrong since the vif's active links are
+ * not switched yet...
+ */
+ __ieee80211_sta_recalc_aggregates(sta, active_links);
}
for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) {
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 46f3eddc2388..846528850612 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -630,7 +630,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
if (WARN_ON(!ops->tx || !ops->start || !ops->stop || !ops->config ||
!ops->add_interface || !ops->remove_interface ||
- !ops->configure_filter))
+ !ops->configure_filter || !ops->wake_tx_queue))
return NULL;
if (WARN_ON(ops->sta_state && (ops->sta_add || ops->sta_remove)))
@@ -719,9 +719,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
if (!ops->set_key)
wiphy->flags |= WIPHY_FLAG_IBSS_RSN;
- if (ops->wake_tx_queue)
- wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_TXQS);
-
+ wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_TXQS);
wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_RRM);
wiphy->bss_priv_size = sizeof(struct ieee80211_bss);
@@ -834,10 +832,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
atomic_set(&local->agg_queue_stop[i], 0);
}
tasklet_setup(&local->tx_pending_tasklet, ieee80211_tx_pending);
-
- if (ops->wake_tx_queue)
- tasklet_setup(&local->wake_txqs_tasklet, ieee80211_wake_txqs);
-
+ tasklet_setup(&local->wake_txqs_tasklet, ieee80211_wake_txqs);
tasklet_setup(&local->tasklet, ieee80211_tasklet_handler);
skb_queue_head_init(&local->skb_queue);
@@ -1087,6 +1082,16 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
channels += sband->n_channels;
+ /*
+ * Due to the way the aggregation code handles this and it
+ * being an HT capability, we can't really support delayed
+ * BA in MLO (yet).
+ */
+ if (WARN_ON(sband->ht_cap.ht_supported &&
+ (sband->ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA) &&
+ hw->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO))
+ return -EINVAL;
+
if (max_bitrates < sband->n_bitrates)
max_bitrates = sband->n_bitrates;
supp_ht = supp_ht || sband->ht_cap.ht_supported;
@@ -1155,6 +1160,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
if (!local->int_scan_req)
return -ENOMEM;
+ eth_broadcast_addr(local->int_scan_req->bssid);
+
for (band = 0; band < NUM_NL80211_BANDS; band++) {
if (!local->hw.wiphy->bands[band])
continue;
@@ -1439,8 +1446,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
ieee80211_led_exit(local);
destroy_workqueue(local->workqueue);
fail_workqueue:
- if (local->wiphy_ciphers_allocated)
+ if (local->wiphy_ciphers_allocated) {
kfree(local->hw.wiphy->cipher_suites);
+ local->wiphy_ciphers_allocated = false;
+ }
kfree(local->int_scan_req);
return result;
}
@@ -1508,8 +1517,10 @@ void ieee80211_free_hw(struct ieee80211_hw *hw)
mutex_destroy(&local->iflist_mtx);
mutex_destroy(&local->mtx);
- if (local->wiphy_ciphers_allocated)
+ if (local->wiphy_ciphers_allocated) {
kfree(local->hw.wiphy->cipher_suites);
+ local->wiphy_ciphers_allocated = false;
+ }
idr_for_each(&local->ack_status_frames,
ieee80211_free_ack_frame, NULL);
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index acc1c299f1ae..69d5e1ec6ede 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -710,7 +710,7 @@ int mesh_path_send_to_gates(struct mesh_path *mpath)
void mesh_path_discard_frame(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb)
{
- kfree_skb(skb);
+ ieee80211_free_txskb(&sdata->local->hw, skb);
sdata->u.mesh.mshstats.dropped_frames_no_route++;
}
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index d8484cd870de..a804e0220ed7 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2717,18 +2717,10 @@ static u32 ieee80211_link_set_associated(struct ieee80211_link_data *link,
}
if (link->u.mgd.have_beacon) {
- /*
- * If the AP is buggy we may get here with no DTIM period
- * known, so assume it's 1 which is the only safe assumption
- * in that case, although if the TIM IE is broken powersave
- * probably just won't work at all.
- */
- bss_conf->dtim_period = link->u.mgd.dtim_period ?: 1;
bss_conf->beacon_rate = bss->beacon_rate;
changed |= BSS_CHANGED_BEACON_INFO;
} else {
bss_conf->beacon_rate = NULL;
- bss_conf->dtim_period = 0;
}
/* Tell the driver to monitor connection quality (if supported) */
@@ -2754,7 +2746,8 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
struct cfg80211_bss *cbss = assoc_data->link[link_id].bss;
struct ieee80211_link_data *link;
- if (!cbss)
+ if (!cbss ||
+ assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS)
continue;
link = sdata_dereference(sdata->link[link_id], sdata);
@@ -2782,7 +2775,8 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
struct ieee80211_link_data *link;
struct cfg80211_bss *cbss = assoc_data->link[link_id].bss;
- if (!cbss)
+ if (!cbss ||
+ assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS)
continue;
link = sdata_dereference(sdata->link[link_id], sdata);
@@ -3868,9 +3862,15 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband,
}
}
-static bool ieee80211_twt_req_supported(const struct link_sta_info *link_sta,
+static bool ieee80211_twt_req_supported(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_supported_band *sband,
+ const struct link_sta_info *link_sta,
const struct ieee802_11_elems *elems)
{
+ const struct ieee80211_sta_he_cap *own_he_cap =
+ ieee80211_get_he_iftype_cap(sband,
+ ieee80211_vif_type_p2p(&sdata->vif));
+
if (elems->ext_capab_len < 10)
return false;
@@ -3878,14 +3878,19 @@ static bool ieee80211_twt_req_supported(const struct link_sta_info *link_sta,
return false;
return link_sta->pub->he_cap.he_cap_elem.mac_cap_info[0] &
- IEEE80211_HE_MAC_CAP0_TWT_RES;
+ IEEE80211_HE_MAC_CAP0_TWT_RES &&
+ own_he_cap &&
+ (own_he_cap->he_cap_elem.mac_cap_info[0] &
+ IEEE80211_HE_MAC_CAP0_TWT_REQ);
}
-static int ieee80211_recalc_twt_req(struct ieee80211_link_data *link,
+static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_supported_band *sband,
+ struct ieee80211_link_data *link,
struct link_sta_info *link_sta,
struct ieee802_11_elems *elems)
{
- bool twt = ieee80211_twt_req_supported(link_sta, elems);
+ bool twt = ieee80211_twt_req_supported(sdata, sband, link_sta, elems);
if (link->conf->twt_requester != twt) {
link->conf->twt_requester = twt;
@@ -3923,11 +3928,12 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link,
struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data;
struct ieee80211_bss_conf *bss_conf = link->conf;
struct ieee80211_local *local = sdata->local;
+ unsigned int link_id = link->link_id;
struct ieee80211_elems_parse_params parse_params = {
.start = elem_start,
.len = elem_len,
.bss = cbss,
- .link_id = link == &sdata->deflink ? -1 : link->link_id,
+ .link_id = link_id == assoc_data->assoc_link_id ? -1 : link_id,
.from_ap = true,
};
bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ;
@@ -3942,8 +3948,35 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link,
if (!elems)
return false;
- /* FIXME: use from STA profile element after parsing that */
- capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info);
+ if (link_id == assoc_data->assoc_link_id) {
+ capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info);
+
+ /*
+ * we should not get to this flow unless the association was
+ * successful, so set the status directly to success
+ */
+ assoc_data->link[link_id].status = WLAN_STATUS_SUCCESS;
+ } else if (!elems->prof) {
+ ret = false;
+ goto out;
+ } else {
+ const u8 *ptr = elems->prof->variable +
+ elems->prof->sta_info_len - 1;
+
+ /*
+ * During parsing, we validated that these fields exist,
+ * otherwise elems->prof would have been set to NULL.
+ */
+ capab_info = get_unaligned_le16(ptr);
+ assoc_data->link[link_id].status = get_unaligned_le16(ptr + 2);
+
+ if (assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) {
+ link_info(link, "association response status code=%u\n",
+ assoc_data->link[link_id].status);
+ ret = true;
+ goto out;
+ }
+ }
if (!is_s1g && !elems->supp_rates) {
sdata_info(sdata, "no SuppRates element in AssocResp\n");
@@ -4099,7 +4132,8 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link,
else
bss_conf->twt_protected = false;
- *changed |= ieee80211_recalc_twt_req(link, link_sta, elems);
+ *changed |= ieee80211_recalc_twt_req(sdata, sband, link,
+ link_sta, elems);
if (elems->eht_operation && elems->eht_cap &&
!(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_EHT)) {
@@ -4864,6 +4898,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
unsigned int link_id;
struct sta_info *sta;
u64 changed[IEEE80211_MLD_MAX_NUM_LINKS] = {};
+ u16 valid_links = 0;
int err;
mutex_lock(&sdata->local->sta_mtx);
@@ -4876,8 +4911,6 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
goto out_err;
if (sdata->vif.valid_links) {
- u16 valid_links = 0;
-
for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
if (!assoc_data->link[link_id].bss)
continue;
@@ -4894,10 +4927,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
}
for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
+ struct cfg80211_bss *cbss = assoc_data->link[link_id].bss;
struct ieee80211_link_data *link;
struct link_sta_info *link_sta;
- if (!assoc_data->link[link_id].bss)
+ if (!cbss)
continue;
link = sdata_dereference(sdata->link[link_id], sdata);
@@ -4906,28 +4940,36 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
if (sdata->vif.valid_links)
link_info(link,
- "local address %pM, AP link address %pM\n",
+ "local address %pM, AP link address %pM%s\n",
link->conf->addr,
- assoc_data->link[link_id].bss->bssid);
+ assoc_data->link[link_id].bss->bssid,
+ link_id == assoc_data->assoc_link_id ?
+ " (assoc)" : "");
link_sta = rcu_dereference_protected(sta->link[link_id],
lockdep_is_held(&local->sta_mtx));
if (WARN_ON(!link_sta))
goto out_err;
- if (link_id != assoc_data->assoc_link_id) {
- struct cfg80211_bss *cbss = assoc_data->link[link_id].bss;
+ if (!link->u.mgd.have_beacon) {
const struct cfg80211_bss_ies *ies;
rcu_read_lock();
- ies = rcu_dereference(cbss->ies);
+ ies = rcu_dereference(cbss->beacon_ies);
+ if (ies)
+ link->u.mgd.have_beacon = true;
+ else
+ ies = rcu_dereference(cbss->ies);
ieee80211_get_dtim(ies,
&link->conf->sync_dtim_count,
&link->u.mgd.dtim_period);
- link->conf->dtim_period = link->u.mgd.dtim_period ?: 1;
link->conf->beacon_int = cbss->beacon_interval;
rcu_read_unlock();
+ }
+
+ link->conf->dtim_period = link->u.mgd.dtim_period ?: 1;
+ if (link_id != assoc_data->assoc_link_id) {
err = ieee80211_prep_channel(sdata, link, cbss,
&link->u.mgd.conn_flags);
if (err) {
@@ -4947,6 +4989,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
&changed[link_id]))
goto out_err;
+ if (assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) {
+ valid_links &= ~BIT(link_id);
+ ieee80211_sta_remove_link(sta, link_id);
+ continue;
+ }
+
if (link_id != assoc_data->assoc_link_id) {
err = ieee80211_sta_activate_link(sta, link_id);
if (err)
@@ -4954,6 +5002,9 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
}
}
+ /* links might have changed due to rejected ones, set them again */
+ ieee80211_vif_set_links(sdata, valid_links);
+
rate_control_rate_init(sta);
if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) {
@@ -5033,6 +5084,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
struct cfg80211_rx_assoc_resp resp = {
.uapsd_queues = -1,
};
+ u8 ap_mld_addr[ETH_ALEN] __aligned(2);
unsigned int link_id;
sdata_assert_lock(sdata);
@@ -5187,10 +5239,13 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
link = sdata_dereference(sdata->link[link_id], sdata);
if (!link)
continue;
+
if (!assoc_data->link[link_id].bss)
continue;
+
resp.links[link_id].bss = assoc_data->link[link_id].bss;
resp.links[link_id].addr = link->conf->addr;
+ resp.links[link_id].status = assoc_data->link[link_id].status;
/* get uapsd queues configuration - same for all links */
resp.uapsd_queues = 0;
@@ -5199,6 +5254,11 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
resp.uapsd_queues |= ieee80211_ac_to_qos_mask[ac];
}
+ if (sdata->vif.valid_links) {
+ ether_addr_copy(ap_mld_addr, sdata->vif.cfg.ap_addr);
+ resp.ap_mld_addr = ap_mld_addr;
+ }
+
ieee80211_destroy_assoc_data(sdata,
status_code == WLAN_STATUS_SUCCESS ?
ASSOC_SUCCESS :
@@ -5208,8 +5268,6 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
resp.len = len;
resp.req_ies = ifmgd->assoc_req_ies;
resp.req_ies_len = ifmgd->assoc_req_ies_len;
- if (sdata->vif.valid_links)
- resp.ap_mld_addr = sdata->vif.cfg.ap_addr;
cfg80211_rx_assoc_resp(sdata->dev, &resp);
notify_driver:
drv_mgd_complete_tx(sdata->local, sdata, &info);
@@ -5432,6 +5490,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link,
struct ieee802_11_elems *elems;
struct ieee80211_local *local = sdata->local;
struct ieee80211_chanctx_conf *chanctx_conf;
+ struct ieee80211_supported_band *sband;
struct ieee80211_channel *chan;
struct link_sta_info *link_sta;
struct sta_info *sta;
@@ -5694,7 +5753,12 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link,
goto free;
}
- changed |= ieee80211_recalc_twt_req(link, link_sta, elems);
+ if (WARN_ON(!link->conf->chandef.chan))
+ goto free;
+
+ sband = local->hw.wiphy->bands[link->conf->chandef.chan->band];
+
+ changed |= ieee80211_recalc_twt_req(sdata, sband, link, link_sta, elems);
if (ieee80211_config_bw(link, elems->ht_cap_elem,
elems->vht_cap_elem, elems->ht_operation,
@@ -6640,6 +6704,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
req->ap_mld_addr ?: req->bss->bssid,
ETH_ALEN);
auth_data->bss = req->bss;
+ auth_data->link_id = req->link_id;
if (req->auth_data_len >= 4) {
if (req->auth_type == NL80211_AUTHTYPE_SAE) {
@@ -6658,7 +6723,8 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
* removal and re-addition of the STA entry in
* ieee80211_prep_connection().
*/
- cont_auth = ifmgd->auth_data && req->bss == ifmgd->auth_data->bss;
+ cont_auth = ifmgd->auth_data && req->bss == ifmgd->auth_data->bss &&
+ ifmgd->auth_data->link_id == req->link_id;
if (req->ie && req->ie_len) {
memcpy(&auth_data->data[auth_data->data_len],
@@ -6982,7 +7048,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
/* keep sta info, bssid if matching */
match = ether_addr_equal(ifmgd->auth_data->ap_addr,
- assoc_data->ap_addr);
+ assoc_data->ap_addr) &&
+ ifmgd->auth_data->link_id == req->link_id;
ieee80211_destroy_auth_data(sdata, match);
}
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 7f3f5f51081d..762346598338 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -1963,9 +1963,6 @@ minstrel_ht_alloc(struct ieee80211_hw *hw)
/* safe default, does not necessarily have to match hw properties */
mp->max_retry = 7;
- if (hw->max_rates >= 4)
- mp->has_mrr = true;
-
mp->hw = hw;
mp->update_interval = HZ / 20;
@@ -2036,7 +2033,7 @@ static void __init init_sample_table(void)
memset(sample_table, 0xff, sizeof(sample_table));
for (col = 0; col < SAMPLE_COLUMNS; col++) {
- prandom_bytes(rnd, sizeof(rnd));
+ get_random_bytes(rnd, sizeof(rnd));
for (i = 0; i < MCS_GROUP_RATES; i++) {
new_idx = (i + rnd[i]) % MCS_GROUP_RATES;
while (sample_table[col][new_idx] != 0xff)
diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h
index 1766ff0c78d3..4be0401f7721 100644
--- a/net/mac80211/rc80211_minstrel_ht.h
+++ b/net/mac80211/rc80211_minstrel_ht.h
@@ -74,7 +74,6 @@
struct minstrel_priv {
struct ieee80211_hw *hw;
- bool has_mrr;
unsigned int cw_min;
unsigned int cw_max;
unsigned int max_retry;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index f99416d2e144..c28c6fbf786e 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1571,9 +1571,6 @@ static void sta_ps_start(struct sta_info *sta)
ieee80211_clear_fast_xmit(sta);
- if (!sta->sta.txq[0])
- return;
-
for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) {
struct ieee80211_txq *txq = sta->sta.txq[tid];
struct txq_info *txqi = to_txq_info(txq);
diff --git a/net/mac80211/s1g.c b/net/mac80211/s1g.c
index 8ca7d45d6daa..c1f964e9991c 100644
--- a/net/mac80211/s1g.c
+++ b/net/mac80211/s1g.c
@@ -112,6 +112,9 @@ ieee80211_s1g_rx_twt_setup(struct ieee80211_sub_if_data *sdata,
goto out;
}
+ /* TWT Information not supported yet */
+ twt->control |= IEEE80211_TWT_CONTROL_RX_DISABLED;
+
drv_add_twt_setup(sdata->local, sdata, &sta->sta, twt);
out:
ieee80211_s1g_send_twt_setup(sdata, mgmt->sa, sdata->vif.addr, twt);
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 0e8c4f48c36d..dc3cdee51e66 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -641,7 +641,7 @@ static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata,
if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) {
struct ieee80211_hdr *hdr = (void *)skb->data;
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
- u16 sn = get_random_u32();
+ u16 sn = get_random_u16();
info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO;
hdr->seq_ctrl =
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index cebfd148bb40..04e0f132b1d9 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -140,17 +140,15 @@ static void __cleanup_single_sta(struct sta_info *sta)
atomic_dec(&ps->num_sta_ps);
}
- if (sta->sta.txq[0]) {
- for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
- struct txq_info *txqi;
+ for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
+ struct txq_info *txqi;
- if (!sta->sta.txq[i])
- continue;
+ if (!sta->sta.txq[i])
+ continue;
- txqi = to_txq_info(sta->sta.txq[i]);
+ txqi = to_txq_info(sta->sta.txq[i]);
- ieee80211_txq_purge(local, txqi);
- }
+ ieee80211_txq_purge(local, txqi);
}
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
@@ -366,6 +364,9 @@ static void sta_remove_link(struct sta_info *sta, unsigned int link_id,
if (unhash)
link_sta_info_hash_del(sta->local, link_sta);
+ if (test_sta_flag(sta, WLAN_STA_INSERTED))
+ ieee80211_link_sta_debugfs_remove(link_sta);
+
if (link_sta != &sta->deflink)
alloc = container_of(link_sta, typeof(*alloc), info);
@@ -425,8 +426,7 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
sta_dbg(sta->sdata, "Destroyed STA %pM\n", sta->sta.addr);
- if (sta->sta.txq[0])
- kfree(to_txq_info(sta->sta.txq[0]));
+ kfree(to_txq_info(sta->sta.txq[0]));
kfree(rcu_dereference_raw(sta->sta.rates));
#ifdef CONFIG_MAC80211_MESH
kfree(sta->mesh);
@@ -511,6 +511,7 @@ static void sta_info_add_link(struct sta_info *sta,
link_info->sta = sta;
link_info->link_id = link_id;
link_info->pub = link_sta;
+ link_info->pub->sta = &sta->sta;
link_sta->link_id = link_id;
rcu_assign_pointer(sta->link[link_id], link_info);
rcu_assign_pointer(sta->sta.link[link_id], link_sta);
@@ -527,6 +528,8 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata,
struct ieee80211_local *local = sdata->local;
struct ieee80211_hw *hw = &local->hw;
struct sta_info *sta;
+ void *txq_data;
+ int size;
int i;
sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp);
@@ -596,21 +599,18 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata,
sta->last_connected = ktime_get_seconds();
- if (local->ops->wake_tx_queue) {
- void *txq_data;
- int size = sizeof(struct txq_info) +
- ALIGN(hw->txq_data_size, sizeof(void *));
+ size = sizeof(struct txq_info) +
+ ALIGN(hw->txq_data_size, sizeof(void *));
- txq_data = kcalloc(ARRAY_SIZE(sta->sta.txq), size, gfp);
- if (!txq_data)
- goto free;
+ txq_data = kcalloc(ARRAY_SIZE(sta->sta.txq), size, gfp);
+ if (!txq_data)
+ goto free;
- for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
- struct txq_info *txq = txq_data + i * size;
+ for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
+ struct txq_info *txq = txq_data + i * size;
- /* might not do anything for the bufferable MMPDU TXQ */
- ieee80211_txq_init(sdata, sta, txq, i);
- }
+ /* might not do anything for the (bufferable) MMPDU TXQ */
+ ieee80211_txq_init(sdata, sta, txq, i);
}
if (sta_prepare_rate_control(local, sta, gfp))
@@ -684,8 +684,7 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata,
return sta;
free_txq:
- if (sta->sta.txq[0])
- kfree(to_txq_info(sta->sta.txq[0]));
+ kfree(to_txq_info(sta->sta.txq[0]));
free:
sta_info_free_link(&sta->deflink);
#ifdef CONFIG_MAC80211_MESH
@@ -874,6 +873,26 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
ieee80211_sta_debugfs_add(sta);
rate_control_add_sta_debugfs(sta);
+ if (sta->sta.valid_links) {
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(sta->link); i++) {
+ struct link_sta_info *link_sta;
+
+ link_sta = rcu_dereference_protected(sta->link[i],
+ lockdep_is_held(&local->sta_mtx));
+
+ if (!link_sta)
+ continue;
+
+ ieee80211_link_sta_debugfs_add(link_sta);
+ if (sdata->vif.active_links & BIT(i))
+ ieee80211_link_sta_debugfs_drv_add(link_sta);
+ }
+ } else {
+ ieee80211_link_sta_debugfs_add(&sta->deflink);
+ ieee80211_link_sta_debugfs_drv_add(&sta->deflink);
+ }
sinfo->generation = local->sta_generation;
cfg80211_new_sta(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL);
@@ -1958,9 +1977,6 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
* TIM recalculation.
*/
- if (!sta->sta.txq[0])
- return;
-
for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
if (!sta->sta.txq[tid] ||
!(driver_release_tids & BIT(tid)) ||
@@ -2127,22 +2143,30 @@ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid,
}
EXPORT_SYMBOL(ieee80211_sta_register_airtime);
-void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta)
+void __ieee80211_sta_recalc_aggregates(struct sta_info *sta, u16 active_links)
{
- struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
- struct ieee80211_link_sta *link_sta;
- int link_id, i;
bool first = true;
+ int link_id;
- if (!pubsta->valid_links || !pubsta->mlo) {
- pubsta->cur = &pubsta->deflink.agg;
+ if (!sta->sta.valid_links || !sta->sta.mlo) {
+ sta->sta.cur = &sta->sta.deflink.agg;
return;
}
rcu_read_lock();
- for_each_sta_active_link(&sta->sdata->vif, pubsta, link_sta, link_id) {
+ for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) {
+ struct ieee80211_link_sta *link_sta;
+ int i;
+
+ if (!(active_links & BIT(link_id)))
+ continue;
+
+ link_sta = rcu_dereference(sta->sta.link[link_id]);
+ if (!link_sta)
+ continue;
+
if (first) {
- sta->cur = pubsta->deflink.agg;
+ sta->cur = sta->sta.deflink.agg;
first = false;
continue;
}
@@ -2161,7 +2185,14 @@ void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta)
}
rcu_read_unlock();
- pubsta->cur = &sta->cur;
+ sta->sta.cur = &sta->cur;
+}
+
+void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta)
+{
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+
+ __ieee80211_sta_recalc_aggregates(sta, sta->sdata->vif.active_links);
}
EXPORT_SYMBOL(ieee80211_sta_recalc_aggregates);
@@ -2396,9 +2427,9 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats,
u64 value;
do {
- start = u64_stats_fetch_begin_irq(&rxstats->syncp);
+ start = u64_stats_fetch_begin(&rxstats->syncp);
value = rxstats->msdu[tid];
- } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start));
+ } while (u64_stats_fetch_retry(&rxstats->syncp, start));
return value;
}
@@ -2445,7 +2476,7 @@ static void sta_set_tidstats(struct sta_info *sta,
tidstats->tx_msdu_failed = sta->deflink.status_stats.msdu_failed[tid];
}
- if (local->ops->wake_tx_queue && tid < IEEE80211_NUM_TIDS) {
+ if (tid < IEEE80211_NUM_TIDS) {
spin_lock_bh(&local->fq.lock);
rcu_read_lock();
@@ -2464,9 +2495,9 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats)
u64 value;
do {
- start = u64_stats_fetch_begin_irq(&rxstats->syncp);
+ start = u64_stats_fetch_begin(&rxstats->syncp);
value = rxstats->bytes;
- } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start));
+ } while (u64_stats_fetch_retry(&rxstats->syncp, start));
return value;
}
@@ -2773,9 +2804,6 @@ unsigned long ieee80211_sta_last_active(struct sta_info *sta)
static void sta_update_codel_params(struct sta_info *sta, u32 thr)
{
- if (!sta->sdata->local->ops->wake_tx_queue)
- return;
-
if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) {
sta->cparams.target = MS2TIME(50);
sta->cparams.interval = MS2TIME(300);
@@ -2823,6 +2851,8 @@ int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id)
sta_info_add_link(sta, link_id, &alloc->info, &alloc->sta);
+ ieee80211_link_sta_debugfs_add(&alloc->info);
+
return 0;
}
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 2517ea714dc4..69820b551668 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -513,6 +513,7 @@ struct ieee80211_fragment_cache {
* @status_stats.avg_ack_signal: average ACK signal
* @cur_max_bandwidth: maximum bandwidth to use for TX to the station,
* taken from HT/VHT capabilities or VHT operating mode notification
+ * @debugfs_dir: debug filesystem directory dentry
* @pub: public (driver visible) link STA data
* TODO Move other link params from sta_info as required for MLD operation
*/
@@ -560,6 +561,10 @@ struct link_sta_info {
enum ieee80211_sta_rx_bandwidth cur_max_bandwidth;
+#ifdef CONFIG_MAC80211_DEBUGFS
+ struct dentry *debugfs_dir;
+#endif
+
struct ieee80211_link_sta *pub;
};
@@ -922,6 +927,8 @@ void ieee80211_sta_set_max_amsdu_subframes(struct sta_info *sta,
const u8 *ext_capab,
unsigned int ext_capab_len);
+void __ieee80211_sta_recalc_aggregates(struct sta_info *sta, u16 active_links);
+
enum sta_stats_type {
STA_STATS_RATE_TYPE_INVALID = 0,
STA_STATS_RATE_TYPE_LEGACY,
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index f4b4d25eef95..b255f3b5bf01 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -1016,7 +1016,6 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev,
skb->priority = 256 + 5;
break;
}
- skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, skb));
/*
* Set the WLAN_TDLS_TEARDOWN flag to indicate a teardown in progress.
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index a364148149f9..165ac0711d71 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1599,9 +1599,6 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local)
bool supp_vht = false;
enum nl80211_band band;
- if (!local->ops->wake_tx_queue)
- return 0;
-
ret = fq_init(fq, 4096);
if (ret)
return ret;
@@ -1649,9 +1646,6 @@ void ieee80211_txq_teardown_flows(struct ieee80211_local *local)
{
struct fq *fq = &local->fq;
- if (!local->ops->wake_tx_queue)
- return;
-
kfree(local->cvars);
local->cvars = NULL;
@@ -1668,8 +1662,7 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local,
struct ieee80211_vif *vif;
struct txq_info *txqi;
- if (!local->ops->wake_tx_queue ||
- sdata->vif.type == NL80211_IFTYPE_MONITOR)
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR)
return false;
if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
@@ -2973,7 +2966,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
if (pre_conf_link_id != link_id &&
link_id != IEEE80211_LINK_UNSPECIFIED) {
-#ifdef CPTCFG_MAC80211_VERBOSE_DEBUG
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
net_info_ratelimited("%s: dropped frame to %pM with bad link ID request (%d vs. %d)\n",
sdata->name, hdr.addr1,
pre_conf_link_id, link_id);
@@ -4184,12 +4177,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
if (IS_ERR(sta))
sta = NULL;
- if (local->ops->wake_tx_queue) {
- u16 queue = __ieee80211_select_queue(sdata, sta, skb);
- skb_set_queue_mapping(skb, queue);
- skb_get_hash(skb);
- }
-
+ skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, sta, skb));
ieee80211_aggr_check(sdata, sta, skb);
sk_pacing_shift_update(skb->sk, sdata->local->hw.tx_sk_pacing_shift);
@@ -4418,6 +4406,11 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
if (likely(!is_multicast_ether_addr(eth->h_dest)))
goto normal;
+ if (unlikely(!ieee80211_sdata_running(sdata))) {
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+
if (unlikely(ieee80211_multicast_to_unicast(skb, dev))) {
struct sk_buff_head queue;
@@ -4495,11 +4488,7 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata,
struct tid_ampdu_tx *tid_tx;
u8 tid;
- if (local->ops->wake_tx_queue) {
- u16 queue = __ieee80211_select_queue(sdata, sta, skb);
- skb_set_queue_mapping(skb, queue);
- skb_get_hash(skb);
- }
+ skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, sta, skb));
if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning)) &&
test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state))
@@ -4753,9 +4742,6 @@ void ieee80211_tx_pending(struct tasklet_struct *t)
if (!txok)
break;
}
-
- if (skb_queue_empty(&local->pending[i]))
- ieee80211_propagate_queue_wake(local, i);
}
spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
@@ -5948,10 +5934,9 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
}
if (!IS_ERR(sta)) {
- u16 queue = __ieee80211_select_queue(sdata, sta, skb);
+ u16 queue = ieee80211_select_queue(sdata, sta, skb);
skb_set_queue_mapping(skb, queue);
- skb_get_hash(skb);
/*
* for MLO STA, the SA should be the AP MLD address, but
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index b512cb37aafb..6f5407038459 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -288,6 +288,52 @@ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw,
}
EXPORT_SYMBOL(ieee80211_ctstoself_duration);
+/*
+ * Push the frames of @queue to the driver: frames are taken one at a time
+ * with ieee80211_tx_dequeue() and handed to drv_tx() until either the txq
+ * yields no more frames or the hardware queue that @sdata maps the txq's AC
+ * to has a stop reason set.
+ */
+static void wake_tx_push_queue(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_txq *queue)
+{
+ int q = sdata->vif.hw_queue[queue->ac];
+ struct ieee80211_tx_control control = {
+ .sta = queue->sta,
+ };
+ struct sk_buff *skb;
+ unsigned long flags;
+ bool q_stopped;
+
+ while (1) {
+ /* re-read the stop reasons under the lock before every frame */
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ q_stopped = local->queue_stop_reasons[q];
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+
+ if (q_stopped)
+ break;
+
+ skb = ieee80211_tx_dequeue(&local->hw, queue);
+ if (!skb)
+ break;
+
+ drv_tx(local, &control, skb);
+ }
+}
+
+/*
+ * wake_tx_queue handler for drivers not implementing a custom one.
+ *
+ * Runs one scheduling round for the AC of @txq: each txq handed out by
+ * ieee80211_next_txq() for that AC is pushed to the driver through
+ * wake_tx_push_queue() and then given back with ieee80211_return_txq().
+ *
+ * NOTE(review): the sdata passed down is derived from @txq->vif, not from
+ * each scheduled queue - confirm this is intended if the scheduler can hand
+ * out txqs that belong to another interface.
+ */
+void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw,
+ struct ieee80211_txq *txq)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif);
+ struct ieee80211_txq *queue;
+
+ /* Use ieee80211_next_txq() for airtime fairness accounting */
+ ieee80211_txq_schedule_start(hw, txq->ac);
+ while ((queue = ieee80211_next_txq(hw, txq->ac))) {
+ wake_tx_push_queue(local, sdata, queue);
+ ieee80211_return_txq(hw, queue, false);
+ }
+ ieee80211_txq_schedule_end(hw, txq->ac);
+}
+EXPORT_SYMBOL(ieee80211_handle_wake_tx_queue);
+
static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac)
{
struct ieee80211_local *local = sdata->local;
@@ -400,39 +446,6 @@ void ieee80211_wake_txqs(struct tasklet_struct *t)
spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
}
-void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue)
-{
- struct ieee80211_sub_if_data *sdata;
- int n_acs = IEEE80211_NUM_ACS;
-
- if (local->ops->wake_tx_queue)
- return;
-
- if (local->hw.queues < IEEE80211_NUM_ACS)
- n_acs = 1;
-
- list_for_each_entry_rcu(sdata, &local->interfaces, list) {
- int ac;
-
- if (!sdata->dev)
- continue;
-
- if (sdata->vif.cab_queue != IEEE80211_INVAL_HW_QUEUE &&
- local->queue_stop_reasons[sdata->vif.cab_queue] != 0)
- continue;
-
- for (ac = 0; ac < n_acs; ac++) {
- int ac_queue = sdata->vif.hw_queue[ac];
-
- if (ac_queue == queue ||
- (sdata->vif.cab_queue == queue &&
- local->queue_stop_reasons[ac_queue] == 0 &&
- skb_queue_empty(&local->pending[ac_queue])))
- netif_wake_subqueue(sdata->dev, ac);
- }
- }
-}
-
static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
enum queue_stop_reason reason,
bool refcounted,
@@ -463,11 +476,7 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
/* someone still has this queue stopped */
return;
- if (skb_queue_empty(&local->pending[queue])) {
- rcu_read_lock();
- ieee80211_propagate_queue_wake(local, queue);
- rcu_read_unlock();
- } else
+ if (!skb_queue_empty(&local->pending[queue]))
tasklet_schedule(&local->tx_pending_tasklet);
/*
@@ -477,12 +486,10 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
* release someone's lock, but it is fine because all the callers of
* __ieee80211_wake_queue call it right before releasing the lock.
*/
- if (local->ops->wake_tx_queue) {
- if (reason == IEEE80211_QUEUE_STOP_REASON_DRIVER)
- tasklet_schedule(&local->wake_txqs_tasklet);
- else
- _ieee80211_wake_txqs(local, flags);
- }
+ if (reason == IEEE80211_QUEUE_STOP_REASON_DRIVER)
+ tasklet_schedule(&local->wake_txqs_tasklet);
+ else
+ _ieee80211_wake_txqs(local, flags);
}
void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
@@ -539,10 +546,6 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
for (ac = 0; ac < n_acs; ac++) {
if (sdata->vif.hw_queue[ac] == queue ||
sdata->vif.cab_queue == queue) {
- if (!local->ops->wake_tx_queue) {
- netif_stop_subqueue(sdata->dev, ac);
- continue;
- }
spin_lock(&local->fq.lock);
sdata->vif.txqs_stopped[ac] = true;
spin_unlock(&local->fq.lock);
@@ -1026,8 +1029,10 @@ ieee80211_parse_extension_element(u32 *crc,
elems->eht_operation = data;
break;
case WLAN_EID_EXT_EHT_MULTI_LINK:
- if (ieee80211_mle_size_ok(data, len))
+ if (ieee80211_mle_size_ok(data, len)) {
elems->multi_link = (void *)data;
+ elems->multi_link_len = len;
+ }
break;
}
}
@@ -1499,6 +1504,145 @@ static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len,
return found ? profile_len : 0;
}
+/*
+ * Reassemble a possibly fragmented element into the elems scratch buffer.
+ *
+ * @elems: parse state; data is appended at elems->scratch_pos, which is
+ *         advanced past everything that gets copied
+ * @elem_ptr: in: data of the leading element; out: start of the reassembled
+ *            copy inside the scratch buffer
+ * @len: in: data length of the leading element; out: grown by the data
+ *       length of every fragment that was appended
+ * @total_len: number of bytes available starting at *elem_ptr, i.e. the
+ *             leading data plus whatever follows it
+ * @frag_id: element ID that marks the follow-up fragment elements
+ *
+ * Nothing is touched unless *elem_ptr is non-NULL and *len is 254 or 255.
+ */
+static void ieee80211_defragment_element(struct ieee802_11_elems *elems,
+ void **elem_ptr, size_t *len,
+ size_t total_len, u8 frag_id)
+{
+ u8 *data = *elem_ptr, *pos, *start;
+ const struct element *elem;
+
+ /*
+ * Since 'data' points to the data of the element, not the element
+ * itself, allow 254 in case it was an extended element where the
+ * extended ID isn't part of the data we see here and thus not part of
+ * 'len' either.
+ */
+ if (!data || (*len != 254 && *len != 255))
+ return;
+
+ start = elems->scratch_pos;
+
+ /* the leading part must fit into what is left of the scratch buffer */
+ if (WARN_ON(*len > (elems->scratch + elems->scratch_len -
+ elems->scratch_pos)))
+ return;
+
+ memcpy(elems->scratch_pos, data, *len);
+ elems->scratch_pos += *len;
+
+ /* walk the elements following the leading data, stop at the first non-fragment */
+ pos = data + *len;
+ total_len -= *len;
+ for_each_element(elem, pos, total_len) {
+ if (elem->id != frag_id)
+ break;
+
+ /*
+ * NOTE(review): returning here leaves scratch_pos (and *len, if
+ * earlier fragments were appended) already advanced while
+ * *elem_ptr still points at the original data - confirm callers
+ * tolerate that.
+ */
+ if (WARN_ON(elem->datalen >
+ (elems->scratch + elems->scratch_len -
+ elems->scratch_pos)))
+ return;
+
+ memcpy(elems->scratch_pos, elem->data, elem->datalen);
+ elems->scratch_pos += elem->datalen;
+
+ *len += elem->datalen;
+ }
+
+ *elem_ptr = start;
+}
+
+/*
+ * Look up the per-STA profile for @link_id in the basic multi-link element
+ * recorded in @elems and store it, together with its length, in elems->prof
+ * and elems->sta_prof_len; the stored profile is then run through
+ * ieee80211_defragment_element().
+ *
+ * The search ends without a result if a per-STA profile subelement fails the
+ * size check, or if the profile matching @link_id is not a complete profile.
+ */
+static void ieee80211_mle_get_sta_prof(struct ieee802_11_elems *elems,
+ u8 link_id)
+{
+ const struct ieee80211_multi_link_elem *ml = elems->multi_link;
+ size_t ml_len = elems->multi_link_len;
+ const struct element *sub;
+
+ if (!ml || !ml_len)
+ return;
+
+ /* only the basic variant of the multi-link element is handled */
+ if (le16_get_bits(ml->control, IEEE80211_ML_CONTROL_TYPE) !=
+ IEEE80211_ML_CONTROL_TYPE_BASIC)
+ return;
+
+ for_each_mle_subelement(sub, (u8 *)ml, ml_len) {
+ struct ieee80211_mle_per_sta_profile *prof = (void *)sub->data;
+ u16 control;
+
+ if (sub->id != IEEE80211_MLE_SUBELEM_PER_STA_PROFILE)
+ continue;
+
+ if (!ieee80211_mle_sta_prof_size_ok(sub->data, sub->datalen))
+ return;
+
+ control = le16_to_cpu(prof->control);
+
+ if (link_id != u16_get_bits(control,
+ IEEE80211_MLE_STA_CONTROL_LINK_ID))
+ continue;
+
+ if (!(control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE))
+ return;
+
+ elems->prof = prof;
+ elems->sta_prof_len = sub->datalen;
+
+ /*
+ * the sub element can be fragmented; the length passed is what
+ * remains of the multi-link element from the subelement data on
+ */
+ ieee80211_defragment_element(elems, (void **)&elems->prof,
+ &elems->sta_prof_len,
+ ml_len - (sub->data - (u8 *)ml),
+ IEEE80211_MLE_SUBELEM_FRAGMENT);
+ return;
+ }
+}
+
+/*
+ * Parse the elements carried in the per-STA profile of link
+ * params->link_id by running _ieee802_11_parse_elems_full() over them with
+ * the same @elems. Does nothing if params->link_id is -1 or if no usable
+ * profile is found.
+ */
+static void ieee80211_mle_parse_link(struct ieee802_11_elems *elems,
+ struct ieee80211_elems_parse_params *params)
+{
+ struct ieee80211_mle_per_sta_profile *prof;
+ struct ieee80211_elems_parse_params sub = {
+ .action = params->action,
+ .from_ap = params->from_ap,
+ .link_id = -1,
+ };
+ const struct element *non_inherit = NULL;
+ const u8 *end;
+
+ if (params->link_id == -1)
+ return;
+
+ /*
+ * If no multi-link element was recorded, the length argument computed
+ * below is meaningless, but the callee returns on a NULL data pointer
+ * before using it.
+ */
+ ieee80211_defragment_element(elems, (void **)&elems->multi_link,
+ &elems->multi_link_len,
+ elems->total_len - ((u8 *)elems->multi_link -
+ elems->ie_start),
+ WLAN_EID_FRAGMENT);
+
+ ieee80211_mle_get_sta_prof(elems, params->link_id);
+ prof = elems->prof;
+
+ if (!prof)
+ return;
+
+ /* check if we have the 4 bytes for the fixed part in assoc response */
+ if (elems->sta_prof_len < sizeof(*prof) + prof->sta_info_len - 1 + 4) {
+ elems->prof = NULL;
+ elems->sta_prof_len = 0;
+ return;
+ }
+
+ /*
+ * Skip the capability information and the status code that are expected
+ * as part of the station profile in association response frames. Note
+ * the -1 is because the 'sta_info_len' field is accounted for as part of
+ * the per-STA profile, but is not part of the 'u8 variable[]' portion.
+ */
+ sub.start = prof->variable + prof->sta_info_len - 1 + 4;
+ end = (const u8 *)prof + elems->sta_prof_len;
+ sub.len = end - sub.start;
+
+ non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE,
+ sub.start, sub.len);
+ _ieee802_11_parse_elems_full(&sub, elems, non_inherit);
+}
+
struct ieee802_11_elems *
ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params)
{
@@ -1506,7 +1650,7 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params)
const struct element *non_inherit = NULL;
u8 *nontransmitted_profile;
int nontransmitted_profile_len = 0;
- size_t scratch_len = params->len;
+ size_t scratch_len = params->scratch_len ?: 3 * params->len;
elems = kzalloc(sizeof(*elems) + scratch_len, GFP_ATOMIC);
if (!elems)
@@ -1541,6 +1685,8 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params)
_ieee802_11_parse_elems_full(&sub, elems, NULL);
}
+ ieee80211_mle_parse_link(elems, params);
+
if (elems->tim && !elems->parse_error) {
const struct ieee80211_tim_ie *tim_ie = elems->tim;
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index ecc1de2e68a5..a12c63638680 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -122,6 +122,9 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata,
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
u8 *p;
+ /* Ensure hash is set prior to potential SW encryption */
+ skb_get_hash(skb);
+
if ((info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER) ||
local->hw.queues < IEEE80211_NUM_ACS)
return 0;
@@ -141,12 +144,15 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata,
return ieee80211_downgrade_queue(sdata, NULL, skb);
}
-u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
- struct sta_info *sta, struct sk_buff *skb)
+u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta, struct sk_buff *skb)
{
struct mac80211_qos_map *qos_map;
bool qos;
+ /* Ensure hash is set prior to potential SW encryption */
+ skb_get_hash(skb);
+
/* all mesh/ocb stations are required to support WME */
if (sta && (sdata->vif.type == NL80211_IFTYPE_MESH_POINT ||
sdata->vif.type == NL80211_IFTYPE_OCB))
@@ -176,59 +182,6 @@ u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
return ieee80211_downgrade_queue(sdata, sta, skb);
}
-
-/* Indicate which queue to use. */
-u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
- struct sk_buff *skb)
-{
- struct ieee80211_local *local = sdata->local;
- struct sta_info *sta = NULL;
- const u8 *ra = NULL;
- u16 ret;
-
- /* when using iTXQ, we can do this later */
- if (local->ops->wake_tx_queue)
- return 0;
-
- if (local->hw.queues < IEEE80211_NUM_ACS || skb->len < 6) {
- skb->priority = 0; /* required for correct WPA/11i MIC */
- return 0;
- }
-
- rcu_read_lock();
- switch (sdata->vif.type) {
- case NL80211_IFTYPE_AP_VLAN:
- sta = rcu_dereference(sdata->u.vlan.sta);
- if (sta)
- break;
- fallthrough;
- case NL80211_IFTYPE_AP:
- ra = skb->data;
- break;
- case NL80211_IFTYPE_STATION:
- /* might be a TDLS station */
- sta = sta_info_get(sdata, skb->data);
- if (sta)
- break;
-
- ra = sdata->deflink.u.mgd.bssid;
- break;
- case NL80211_IFTYPE_ADHOC:
- ra = skb->data;
- break;
- default:
- break;
- }
-
- if (!sta && ra && !is_multicast_ether_addr(ra))
- sta = sta_info_get(sdata, ra);
-
- ret = __ieee80211_select_queue(sdata, sta, skb);
-
- rcu_read_unlock();
- return ret;
-}
-
/**
* ieee80211_set_qos_hdr - Fill in the QoS header if there is one.
*
diff --git a/net/mac80211/wme.h b/net/mac80211/wme.h
index 2e3dec0b6087..81f0039527a9 100644
--- a/net/mac80211/wme.h
+++ b/net/mac80211/wme.h
@@ -13,10 +13,8 @@
u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb,
struct ieee80211_hdr *hdr);
-u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
- struct sta_info *sta, struct sk_buff *skb);
u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
- struct sk_buff *skb);
+ struct sta_info *sta, struct sk_buff *skb);
void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb);
diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c
index 1e4a9f74ed43..dc2d918fac68 100644
--- a/net/mac802154/cfg.c
+++ b/net/mac802154/cfg.c
@@ -46,7 +46,7 @@ static int ieee802154_suspend(struct wpan_phy *wpan_phy)
if (!local->open_count)
goto suspend;
- ieee802154_stop_queue(&local->hw);
+ ieee802154_sync_and_hold_queue(local);
synchronize_net();
/* stop hardware - this must stop RX */
@@ -67,12 +67,12 @@ static int ieee802154_resume(struct wpan_phy *wpan_phy)
goto wake_up;
/* restart hardware */
- ret = drv_start(local);
+ ret = drv_start(local, local->phy->filtering, &local->addr_filt);
if (ret)
return ret;
wake_up:
- ieee802154_wake_queue(&local->hw);
+ ieee802154_release_queue(local);
local->suspended = false;
return 0;
}
diff --git a/net/mac802154/driver-ops.h b/net/mac802154/driver-ops.h
index d23f0db98015..a7af3f0ddb3e 100644
--- a/net/mac802154/driver-ops.h
+++ b/net/mac802154/driver-ops.h
@@ -24,203 +24,290 @@ drv_xmit_sync(struct ieee802154_local *local, struct sk_buff *skb)
return local->ops->xmit_sync(&local->hw, skb);
}
-static inline int drv_start(struct ieee802154_local *local)
+static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id)
{
+ struct ieee802154_hw_addr_filt filt;
int ret;
might_sleep();
- trace_802154_drv_start(local);
- local->started = true;
- smp_mb();
- ret = local->ops->start(&local->hw);
+ if (!local->ops->set_hw_addr_filt) {
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+ }
+
+ filt.pan_id = pan_id;
+
+ trace_802154_drv_set_pan_id(local, pan_id);
+ ret = local->ops->set_hw_addr_filt(&local->hw, &filt,
+ IEEE802154_AFILT_PANID_CHANGED);
trace_802154_drv_return_int(local, ret);
return ret;
}
-static inline void drv_stop(struct ieee802154_local *local)
+static inline int
+drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr)
{
- might_sleep();
+ struct ieee802154_hw_addr_filt filt;
+ int ret;
- trace_802154_drv_stop(local);
- local->ops->stop(&local->hw);
- trace_802154_drv_return_void(local);
+ might_sleep();
- /* sync away all work on the tasklet before clearing started */
- tasklet_disable(&local->tasklet);
- tasklet_enable(&local->tasklet);
+ if (!local->ops->set_hw_addr_filt) {
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+ }
- barrier();
+ filt.ieee_addr = extended_addr;
- local->started = false;
+ trace_802154_drv_set_extended_addr(local, extended_addr);
+ ret = local->ops->set_hw_addr_filt(&local->hw, &filt,
+ IEEE802154_AFILT_IEEEADDR_CHANGED);
+ trace_802154_drv_return_int(local, ret);
+ return ret;
}
static inline int
-drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel)
+drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr)
{
+ struct ieee802154_hw_addr_filt filt;
int ret;
might_sleep();
- trace_802154_drv_set_channel(local, page, channel);
- ret = local->ops->set_channel(&local->hw, page, channel);
+ if (!local->ops->set_hw_addr_filt) {
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+ }
+
+ filt.short_addr = short_addr;
+
+ trace_802154_drv_set_short_addr(local, short_addr);
+ ret = local->ops->set_hw_addr_filt(&local->hw, &filt,
+ IEEE802154_AFILT_SADDR_CHANGED);
trace_802154_drv_return_int(local, ret);
return ret;
}
-static inline int drv_set_tx_power(struct ieee802154_local *local, s32 mbm)
+static inline int
+drv_set_pan_coord(struct ieee802154_local *local, bool is_coord)
{
+ struct ieee802154_hw_addr_filt filt;
int ret;
might_sleep();
- if (!local->ops->set_txpower) {
+ if (!local->ops->set_hw_addr_filt) {
WARN_ON(1);
return -EOPNOTSUPP;
}
- trace_802154_drv_set_tx_power(local, mbm);
- ret = local->ops->set_txpower(&local->hw, mbm);
+ filt.pan_coord = is_coord;
+
+ trace_802154_drv_set_pan_coord(local, is_coord);
+ ret = local->ops->set_hw_addr_filt(&local->hw, &filt,
+ IEEE802154_AFILT_PANC_CHANGED);
trace_802154_drv_return_int(local, ret);
return ret;
}
-static inline int drv_set_cca_mode(struct ieee802154_local *local,
- const struct wpan_phy_cca *cca)
+static inline int
+drv_set_promiscuous_mode(struct ieee802154_local *local, bool on)
{
int ret;
might_sleep();
- if (!local->ops->set_cca_mode) {
+ if (!local->ops->set_promiscuous_mode) {
WARN_ON(1);
return -EOPNOTSUPP;
}
- trace_802154_drv_set_cca_mode(local, cca);
- ret = local->ops->set_cca_mode(&local->hw, cca);
+ trace_802154_drv_set_promiscuous_mode(local, on);
+ ret = local->ops->set_promiscuous_mode(&local->hw, on);
trace_802154_drv_return_int(local, ret);
return ret;
}
-static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode)
+static inline int drv_start(struct ieee802154_local *local,
+ enum ieee802154_filtering_level level,
+ const struct ieee802154_hw_addr_filt *addr_filt)
{
int ret;
might_sleep();
- if (!local->ops->set_lbt) {
+ /* setup receive mode parameters e.g. address mode */
+ if (local->hw.flags & IEEE802154_HW_AFILT) {
+ ret = drv_set_pan_id(local, addr_filt->pan_id);
+ if (ret < 0)
+ return ret;
+
+ ret = drv_set_short_addr(local, addr_filt->short_addr);
+ if (ret < 0)
+ return ret;
+
+ ret = drv_set_extended_addr(local, addr_filt->ieee_addr);
+ if (ret < 0)
+ return ret;
+ }
+
+ switch (level) {
+ case IEEE802154_FILTERING_NONE:
+ fallthrough;
+ case IEEE802154_FILTERING_1_FCS:
+ fallthrough;
+ case IEEE802154_FILTERING_2_PROMISCUOUS:
+ /* TODO: Requires a different receive mode setup e.g.
+ * at86rf233 hardware.
+ */
+ fallthrough;
+ case IEEE802154_FILTERING_3_SCAN:
+ if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) {
+ ret = drv_set_promiscuous_mode(local, true);
+ if (ret < 0)
+ return ret;
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ /* In practice other filtering levels can be requested, but as
+ * for now most hardware/drivers only support
+ * IEEE802154_FILTERING_NONE, we fallback to this actual
+ * filtering level in hardware and make our own additional
+ * filtering in mac802154 receive path.
+ *
+ * TODO: Move this logic to the device drivers as hardware may
+ * support higher-level filters. Hardware may also require a
+ * different order in which registers are set, which could currently
+ * be buggy, so all receive parameters need to be moved to the
+ * start() callback to let the driver enter the mode before
+ * it turns on receive handling.
+ */
+ local->phy->filtering = IEEE802154_FILTERING_NONE;
+ break;
+ case IEEE802154_FILTERING_4_FRAME_FIELDS:
+ /* Do not error out if IEEE802154_HW_PROMISCUOUS because we
+ * expect the hardware to operate at the level
+ * IEEE802154_FILTERING_4_FRAME_FIELDS anyway.
+ */
+ if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) {
+ ret = drv_set_promiscuous_mode(local, false);
+ if (ret < 0)
+ return ret;
+ }
+
+ local->phy->filtering = IEEE802154_FILTERING_4_FRAME_FIELDS;
+ break;
+ default:
WARN_ON(1);
- return -EOPNOTSUPP;
+ return -EINVAL;
}
- trace_802154_drv_set_lbt_mode(local, mode);
- ret = local->ops->set_lbt(&local->hw, mode);
+ trace_802154_drv_start(local);
+ local->started = true;
+ smp_mb();
+ ret = local->ops->start(&local->hw);
trace_802154_drv_return_int(local, ret);
return ret;
}
+static inline void drv_stop(struct ieee802154_local *local)
+{
+ might_sleep();
+
+ trace_802154_drv_stop(local);
+ local->ops->stop(&local->hw);
+ trace_802154_drv_return_void(local);
+
+ /* sync away all work on the tasklet before clearing started */
+ tasklet_disable(&local->tasklet);
+ tasklet_enable(&local->tasklet);
+
+ barrier();
+
+ local->started = false;
+}
+
static inline int
-drv_set_cca_ed_level(struct ieee802154_local *local, s32 mbm)
+drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel)
{
int ret;
might_sleep();
- if (!local->ops->set_cca_ed_level) {
- WARN_ON(1);
- return -EOPNOTSUPP;
- }
-
- trace_802154_drv_set_cca_ed_level(local, mbm);
- ret = local->ops->set_cca_ed_level(&local->hw, mbm);
+ trace_802154_drv_set_channel(local, page, channel);
+ ret = local->ops->set_channel(&local->hw, page, channel);
trace_802154_drv_return_int(local, ret);
return ret;
}
-static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id)
+static inline int drv_set_tx_power(struct ieee802154_local *local, s32 mbm)
{
- struct ieee802154_hw_addr_filt filt;
int ret;
might_sleep();
- if (!local->ops->set_hw_addr_filt) {
+ if (!local->ops->set_txpower) {
WARN_ON(1);
return -EOPNOTSUPP;
}
- filt.pan_id = pan_id;
-
- trace_802154_drv_set_pan_id(local, pan_id);
- ret = local->ops->set_hw_addr_filt(&local->hw, &filt,
- IEEE802154_AFILT_PANID_CHANGED);
+ trace_802154_drv_set_tx_power(local, mbm);
+ ret = local->ops->set_txpower(&local->hw, mbm);
trace_802154_drv_return_int(local, ret);
return ret;
}
-static inline int
-drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr)
+static inline int drv_set_cca_mode(struct ieee802154_local *local,
+ const struct wpan_phy_cca *cca)
{
- struct ieee802154_hw_addr_filt filt;
int ret;
might_sleep();
- if (!local->ops->set_hw_addr_filt) {
+ if (!local->ops->set_cca_mode) {
WARN_ON(1);
return -EOPNOTSUPP;
}
- filt.ieee_addr = extended_addr;
-
- trace_802154_drv_set_extended_addr(local, extended_addr);
- ret = local->ops->set_hw_addr_filt(&local->hw, &filt,
- IEEE802154_AFILT_IEEEADDR_CHANGED);
+ trace_802154_drv_set_cca_mode(local, cca);
+ ret = local->ops->set_cca_mode(&local->hw, cca);
trace_802154_drv_return_int(local, ret);
return ret;
}
-static inline int
-drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr)
+static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode)
{
- struct ieee802154_hw_addr_filt filt;
int ret;
might_sleep();
- if (!local->ops->set_hw_addr_filt) {
+ if (!local->ops->set_lbt) {
WARN_ON(1);
return -EOPNOTSUPP;
}
- filt.short_addr = short_addr;
-
- trace_802154_drv_set_short_addr(local, short_addr);
- ret = local->ops->set_hw_addr_filt(&local->hw, &filt,
- IEEE802154_AFILT_SADDR_CHANGED);
+ trace_802154_drv_set_lbt_mode(local, mode);
+ ret = local->ops->set_lbt(&local->hw, mode);
trace_802154_drv_return_int(local, ret);
return ret;
}
static inline int
-drv_set_pan_coord(struct ieee802154_local *local, bool is_coord)
+drv_set_cca_ed_level(struct ieee802154_local *local, s32 mbm)
{
- struct ieee802154_hw_addr_filt filt;
int ret;
might_sleep();
- if (!local->ops->set_hw_addr_filt) {
+ if (!local->ops->set_cca_ed_level) {
WARN_ON(1);
return -EOPNOTSUPP;
}
- filt.pan_coord = is_coord;
-
- trace_802154_drv_set_pan_coord(local, is_coord);
- ret = local->ops->set_hw_addr_filt(&local->hw, &filt,
- IEEE802154_AFILT_PANC_CHANGED);
+ trace_802154_drv_set_cca_ed_level(local, mbm);
+ ret = local->ops->set_cca_ed_level(&local->hw, mbm);
trace_802154_drv_return_int(local, ret);
return ret;
}
@@ -264,22 +351,4 @@ drv_set_max_frame_retries(struct ieee802154_local *local, s8 max_frame_retries)
return ret;
}
-static inline int
-drv_set_promiscuous_mode(struct ieee802154_local *local, bool on)
-{
- int ret;
-
- might_sleep();
-
- if (!local->ops->set_promiscuous_mode) {
- WARN_ON(1);
- return -EOPNOTSUPP;
- }
-
- trace_802154_drv_set_promiscuous_mode(local, on);
- ret = local->ops->set_promiscuous_mode(&local->hw, on);
- trace_802154_drv_return_int(local, ret);
- return ret;
-}
-
#endif /* __MAC802154_DRIVER_OPS */
diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h
index 1381e6a5e180..509e0172fe82 100644
--- a/net/mac802154/ieee802154_i.h
+++ b/net/mac802154/ieee802154_i.h
@@ -26,6 +26,8 @@ struct ieee802154_local {
struct ieee802154_hw hw;
const struct ieee802154_ops *ops;
+ /* hardware address filter */
+ struct ieee802154_hw_addr_filt addr_filt;
/* ieee802154 phy */
struct wpan_phy *phy;
@@ -55,7 +57,7 @@ struct ieee802154_local {
struct sk_buff_head skb_queue;
struct sk_buff *tx_skb;
- struct work_struct tx_work;
+ struct work_struct sync_tx_work;
/* A negative Linux error code or a null/positive MLME error status */
int tx_result;
};
@@ -82,6 +84,16 @@ struct ieee802154_sub_if_data {
struct ieee802154_local *local;
struct net_device *dev;
+ /* Each interface starts and works in nominal state at a given filtering
+ * level given by iface_default_filtering, which is set once and for all at
+ * interface creation and should not evolve over time. For some MAC
+ * operations however, the filtering level may change temporarily, as
+ * reflected in the required_filtering field. The actual filtering at
+ * the PHY level may be different and is shown in struct wpan_phy.
+ */
+ enum ieee802154_filtering_level iface_default_filtering;
+ enum ieee802154_filtering_level required_filtering;
+
unsigned long state;
char name[IFNAMSIZ];
@@ -123,13 +135,53 @@ ieee802154_sdata_running(struct ieee802154_sub_if_data *sdata)
extern struct ieee802154_mlme_ops mac802154_mlme_wpan;
void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb);
-void ieee802154_xmit_worker(struct work_struct *work);
+void ieee802154_xmit_sync_worker(struct work_struct *work);
+int ieee802154_sync_and_hold_queue(struct ieee802154_local *local);
+int ieee802154_mlme_op_pre(struct ieee802154_local *local);
+int ieee802154_mlme_tx(struct ieee802154_local *local,
+ struct ieee802154_sub_if_data *sdata,
+ struct sk_buff *skb);
+void ieee802154_mlme_op_post(struct ieee802154_local *local);
+int ieee802154_mlme_tx_one(struct ieee802154_local *local,
+ struct ieee802154_sub_if_data *sdata,
+ struct sk_buff *skb);
netdev_tx_t
ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev);
netdev_tx_t
ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev);
enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer);
+/**
+ * ieee802154_hold_queue - hold ieee802154 queue
+ * @local: main mac object
+ *
+ * Hold a queue by incrementing an atomic counter and requesting the netif
+ * queues to be stopped. The queues cannot be woken up while the counter has not
+ * been reset with as many ieee802154_release_queue() calls as needed.
+ */
+void ieee802154_hold_queue(struct ieee802154_local *local);
+
+/**
+ * ieee802154_release_queue - release ieee802154 queue
+ * @local: main mac object
+ *
+ * Release a queue which is held by decrementing an atomic counter and wake it
+ * up only if the counter reaches 0.
+ */
+void ieee802154_release_queue(struct ieee802154_local *local);
+
+/**
+ * ieee802154_disable_queue - disable ieee802154 queue
+ * @local: main mac object
+ *
+ * When trying to sync the Tx queue, we cannot just stop the queue
+ * (which is basically a bit being set without proper lock handling)
+ * because it would be racy. We actually need to call netif_tx_disable()
+ * instead, which is done by this helper. Restarting the queue can
+ * however still be done with a regular wake call.
+ */
+void ieee802154_disable_queue(struct ieee802154_local *local);
+
/* MIB callbacks */
void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan);
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index 500ed1b81250..d9b50884d34e 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -147,25 +147,12 @@ static int ieee802154_setup_hw(struct ieee802154_sub_if_data *sdata)
struct wpan_dev *wpan_dev = &sdata->wpan_dev;
int ret;
- if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) {
- ret = drv_set_promiscuous_mode(local,
- wpan_dev->promiscuous_mode);
- if (ret < 0)
- return ret;
- }
+ sdata->required_filtering = sdata->iface_default_filtering;
if (local->hw.flags & IEEE802154_HW_AFILT) {
- ret = drv_set_pan_id(local, wpan_dev->pan_id);
- if (ret < 0)
- return ret;
-
- ret = drv_set_extended_addr(local, wpan_dev->extended_addr);
- if (ret < 0)
- return ret;
-
- ret = drv_set_short_addr(local, wpan_dev->short_addr);
- if (ret < 0)
- return ret;
+ local->addr_filt.pan_id = wpan_dev->pan_id;
+ local->addr_filt.ieee_addr = wpan_dev->extended_addr;
+ local->addr_filt.short_addr = wpan_dev->short_addr;
}
if (local->hw.flags & IEEE802154_HW_LBT) {
@@ -206,7 +193,8 @@ static int mac802154_slave_open(struct net_device *dev)
if (res)
goto err;
- res = drv_start(local);
+ res = drv_start(local, sdata->required_filtering,
+ &local->addr_filt);
if (res)
goto err;
}
@@ -223,15 +211,16 @@ err:
static int
ieee802154_check_mac_settings(struct ieee802154_local *local,
- struct wpan_dev *wpan_dev,
- struct wpan_dev *nwpan_dev)
+ struct ieee802154_sub_if_data *sdata,
+ struct ieee802154_sub_if_data *nsdata)
{
+ struct wpan_dev *nwpan_dev = &nsdata->wpan_dev;
+ struct wpan_dev *wpan_dev = &sdata->wpan_dev;
+
ASSERT_RTNL();
- if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) {
- if (wpan_dev->promiscuous_mode != nwpan_dev->promiscuous_mode)
- return -EBUSY;
- }
+ if (sdata->iface_default_filtering != nsdata->iface_default_filtering)
+ return -EBUSY;
if (local->hw.flags & IEEE802154_HW_AFILT) {
if (wpan_dev->pan_id != nwpan_dev->pan_id ||
@@ -285,8 +274,7 @@ ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata,
/* check all phy mac sublayer settings are the same.
* We have only one phy, different values makes trouble.
*/
- ret = ieee802154_check_mac_settings(local, wpan_dev,
- &nsdata->wpan_dev);
+ ret = ieee802154_check_mac_settings(local, sdata, nsdata);
if (ret < 0)
return ret;
}
@@ -586,7 +574,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
sdata->dev->priv_destructor = mac802154_wpan_free;
sdata->dev->netdev_ops = &mac802154_wpan_ops;
sdata->dev->ml_priv = &mac802154_mlme_wpan;
- wpan_dev->promiscuous_mode = false;
+ sdata->iface_default_filtering = IEEE802154_FILTERING_4_FRAME_FIELDS;
wpan_dev->header_ops = &ieee802154_header_ops;
mutex_init(&sdata->sec_mtx);
@@ -600,7 +588,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
case NL802154_IFTYPE_MONITOR:
sdata->dev->needs_free_netdev = true;
sdata->dev->netdev_ops = &mac802154_monitor_ops;
- wpan_dev->promiscuous_mode = true;
+ sdata->iface_default_filtering = IEEE802154_FILTERING_NONE;
break;
default:
BUG();
diff --git a/net/mac802154/main.c b/net/mac802154/main.c
index bd7bdb1219dd..40fab08df24b 100644
--- a/net/mac802154/main.c
+++ b/net/mac802154/main.c
@@ -95,7 +95,7 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops)
skb_queue_head_init(&local->skb_queue);
- INIT_WORK(&local->tx_work, ieee802154_xmit_worker);
+ INIT_WORK(&local->sync_tx_work, ieee802154_xmit_sync_worker);
/* init supported flags with 802.15.4 default ranges */
phy->supported.max_minbe = 8;
diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c
index c439125ef2b9..0724aac8f48c 100644
--- a/net/mac802154/rx.c
+++ b/net/mac802154/rx.c
@@ -34,6 +34,7 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata,
struct sk_buff *skb, const struct ieee802154_hdr *hdr)
{
struct wpan_dev *wpan_dev = &sdata->wpan_dev;
+ struct wpan_phy *wpan_phy = sdata->local->hw.phy;
__le16 span, sshort;
int rc;
@@ -42,6 +43,17 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata,
span = wpan_dev->pan_id;
sshort = wpan_dev->short_addr;
+ /* Level 3 filtering: Only beacons are accepted during scans */
+ if (sdata->required_filtering == IEEE802154_FILTERING_3_SCAN &&
+ sdata->required_filtering > wpan_phy->filtering) {
+ if (mac_cb(skb)->type != IEEE802154_FC_TYPE_BEACON) {
+ dev_dbg(&sdata->dev->dev,
+ "drop non-beacon frame (0x%x) during scan\n",
+ mac_cb(skb)->type);
+ goto fail;
+ }
+ }
+
switch (mac_cb(skb)->dest.mode) {
case IEEE802154_ADDR_NONE:
if (hdr->source.mode != IEEE802154_ADDR_NONE)
@@ -114,8 +126,10 @@ fail:
static void
ieee802154_print_addr(const char *name, const struct ieee802154_addr *addr)
{
- if (addr->mode == IEEE802154_ADDR_NONE)
+ if (addr->mode == IEEE802154_ADDR_NONE) {
pr_debug("%s not present\n", name);
+ return;
+ }
pr_debug("%s PAN ID: %04x\n", name, le16_to_cpu(addr->pan_id));
if (addr->mode == IEEE802154_ADDR_SHORT) {
@@ -132,7 +146,7 @@ static int
ieee802154_parse_frame_start(struct sk_buff *skb, struct ieee802154_hdr *hdr)
{
int hlen;
- struct ieee802154_mac_cb *cb = mac_cb_init(skb);
+ struct ieee802154_mac_cb *cb = mac_cb(skb);
skb_reset_mac_header(skb);
@@ -209,6 +223,13 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local,
if (!ieee802154_sdata_running(sdata))
continue;
+ /* Do not deliver packets received on interfaces expecting
+ * AACK=1 if the address filters were disabled.
+ */
+ if (local->hw.phy->filtering < IEEE802154_FILTERING_4_FRAME_FIELDS &&
+ sdata->required_filtering == IEEE802154_FILTERING_4_FRAME_FIELDS)
+ continue;
+
ieee802154_subif_frame(sdata, skb, &hdr);
skb = NULL;
break;
@@ -268,10 +289,8 @@ void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb)
ieee802154_monitors_rx(local, skb);
- /* Check if transceiver doesn't validate the checksum.
- * If not we validate the checksum here.
- */
- if (local->hw.flags & IEEE802154_HW_RX_DROP_BAD_CKSUM) {
+ /* Level 1 filtering: Check the FCS by software when relevant */
+ if (local->hw.phy->filtering == IEEE802154_FILTERING_NONE) {
crc = crc_ccitt(0, skb->data, skb->len);
if (crc) {
rcu_read_unlock();
@@ -294,8 +313,9 @@ void
ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi)
{
struct ieee802154_local *local = hw_to_local(hw);
+ struct ieee802154_mac_cb *cb = mac_cb_init(skb);
- mac_cb(skb)->lqi = lqi;
+ cb->lqi = lqi;
skb->pkt_type = IEEE802154_RX_MSG;
skb_queue_tail(&local->skb_queue, skb);
tasklet_schedule(&local->tasklet);
diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c
index c829e4a75325..9d8d43cf1e64 100644
--- a/net/mac802154/tx.c
+++ b/net/mac802154/tx.c
@@ -22,10 +22,10 @@
#include "ieee802154_i.h"
#include "driver-ops.h"
-void ieee802154_xmit_worker(struct work_struct *work)
+void ieee802154_xmit_sync_worker(struct work_struct *work)
{
struct ieee802154_local *local =
- container_of(work, struct ieee802154_local, tx_work);
+ container_of(work, struct ieee802154_local, sync_tx_work);
struct sk_buff *skb = local->tx_skb;
struct net_device *dev = skb->dev;
int res;
@@ -43,7 +43,9 @@ void ieee802154_xmit_worker(struct work_struct *work)
err_tx:
/* Restart the netif queue on each sub_if_data object. */
- ieee802154_wake_queue(&local->hw);
+ ieee802154_release_queue(local);
+ if (atomic_dec_and_test(&local->phy->ongoing_txs))
+ wake_up(&local->phy->sync_txq);
kfree_skb(skb);
netdev_dbg(dev, "transmission failed\n");
}
@@ -65,7 +67,7 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb)
consume_skb(skb);
skb = nskb;
} else {
- goto err_tx;
+ goto err_free_skb;
}
}
@@ -74,32 +76,134 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb)
}
/* Stop the netif queue on each sub_if_data object. */
- ieee802154_stop_queue(&local->hw);
+ ieee802154_hold_queue(local);
+ atomic_inc(&local->phy->ongoing_txs);
- /* async is priority, otherwise sync is fallback */
+ /* Drivers should preferably implement the async callback. In some rare
+ * cases they only provide a sync callback which we will use as a
+ * fallback.
+ */
if (local->ops->xmit_async) {
unsigned int len = skb->len;
ret = drv_xmit_async(local, skb);
- if (ret) {
- ieee802154_wake_queue(&local->hw);
- goto err_tx;
- }
+ if (ret)
+ goto err_wake_netif_queue;
dev->stats.tx_packets++;
dev->stats.tx_bytes += len;
} else {
local->tx_skb = skb;
- queue_work(local->workqueue, &local->tx_work);
+ queue_work(local->workqueue, &local->sync_tx_work);
}
return NETDEV_TX_OK;
-err_tx:
+err_wake_netif_queue:
+ ieee802154_release_queue(local);
+ if (atomic_dec_and_test(&local->phy->ongoing_txs))
+ wake_up(&local->phy->sync_txq);
+err_free_skb:
kfree_skb(skb);
return NETDEV_TX_OK;
}
+static int ieee802154_sync_queue(struct ieee802154_local *local)
+{
+ int ret;
+
+ ieee802154_hold_queue(local);
+ ieee802154_disable_queue(local);
+ wait_event(local->phy->sync_txq, !atomic_read(&local->phy->ongoing_txs));
+ ret = local->tx_result;
+ ieee802154_release_queue(local);
+
+ return ret;
+}
+
+int ieee802154_sync_and_hold_queue(struct ieee802154_local *local)
+{
+ int ret;
+
+ ieee802154_hold_queue(local);
+ ret = ieee802154_sync_queue(local);
+ set_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags);
+
+ return ret;
+}
+
+int ieee802154_mlme_op_pre(struct ieee802154_local *local)
+{
+ return ieee802154_sync_and_hold_queue(local);
+}
+
+int ieee802154_mlme_tx(struct ieee802154_local *local,
+ struct ieee802154_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ /* Avoid possible calls to ->ndo_stop() when we asynchronously perform
+ * MLME transmissions.
+ */
+ rtnl_lock();
+
+ /* Ensure the device was not stopped, otherwise error out */
+ if (!local->open_count) {
+ rtnl_unlock();
+ return -ENETDOWN;
+ }
+
+ /* Warn if the ieee802154 core thinks MLME frames can be sent while the
+ * net interface expects this cannot happen.
+ */
+ if (WARN_ON_ONCE(!netif_running(sdata->dev))) {
+ rtnl_unlock();
+ return -ENETDOWN;
+ }
+
+ ieee802154_tx(local, skb);
+ ret = ieee802154_sync_queue(local);
+
+ rtnl_unlock();
+
+ return ret;
+}
+
+void ieee802154_mlme_op_post(struct ieee802154_local *local)
+{
+ ieee802154_release_queue(local);
+}
+
+int ieee802154_mlme_tx_one(struct ieee802154_local *local,
+ struct ieee802154_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ ieee802154_mlme_op_pre(local);
+ ret = ieee802154_mlme_tx(local, sdata, skb);
+ ieee802154_mlme_op_post(local);
+
+ return ret;
+}
+
+static bool ieee802154_queue_is_stopped(struct ieee802154_local *local)
+{
+ return test_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags);
+}
+
+static netdev_tx_t
+ieee802154_hot_tx(struct ieee802154_local *local, struct sk_buff *skb)
+{
+ /* Warn if the net interface tries to transmit frames while the
+ * ieee802154 core assumes the queue is stopped.
+ */
+ WARN_ON_ONCE(ieee802154_queue_is_stopped(local));
+
+ return ieee802154_tx(local, skb);
+}
+
netdev_tx_t
ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
@@ -107,7 +211,7 @@ ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev)
skb->skb_iif = dev->ifindex;
- return ieee802154_tx(sdata->local, skb);
+ return ieee802154_hot_tx(sdata->local, skb);
}
netdev_tx_t
@@ -129,5 +233,5 @@ ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev)
skb->skb_iif = dev->ifindex;
- return ieee802154_tx(sdata->local, skb);
+ return ieee802154_hot_tx(sdata->local, skb);
}
diff --git a/net/mac802154/util.c b/net/mac802154/util.c
index 9f024d85563b..ebc9a8521765 100644
--- a/net/mac802154/util.c
+++ b/net/mac802154/util.c
@@ -13,12 +13,23 @@
/* privid for wpan_phys to determine whether they belong to us or not */
const void *const mac802154_wpan_phy_privid = &mac802154_wpan_phy_privid;
-void ieee802154_wake_queue(struct ieee802154_hw *hw)
+/**
+ * ieee802154_wake_queue - wake ieee802154 queue
+ * @hw: main hardware object
+ *
+ * Transceivers usually have either one transmit framebuffer or one framebuffer
+ * for both transmitting and receiving. Hence, the core currently only handles
+ * one frame at a time for each phy, which means we had to stop the queue to
+ * avoid new skbs arriving during the transmission. The queue then needs to be
+ * woken up after the operation.
+ */
+static void ieee802154_wake_queue(struct ieee802154_hw *hw)
{
struct ieee802154_local *local = hw_to_local(hw);
struct ieee802154_sub_if_data *sdata;
rcu_read_lock();
+ clear_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags);
list_for_each_entry_rcu(sdata, &local->interfaces, list) {
if (!sdata->dev)
continue;
@@ -27,9 +38,18 @@ void ieee802154_wake_queue(struct ieee802154_hw *hw)
}
rcu_read_unlock();
}
-EXPORT_SYMBOL(ieee802154_wake_queue);
-void ieee802154_stop_queue(struct ieee802154_hw *hw)
+/**
+ * ieee802154_stop_queue - stop ieee802154 queue
+ * @hw: main hardware object
+ *
+ * Transceivers usually have either one transmit framebuffer or one framebuffer
+ * for both transmitting and receiving. Hence, the core currently only handles
+ * one frame at a time for each phy, which means we need to tell upper layers to
+ * stop giving us new skbs while we are busy with the transmitted one. The queue
+ * must then be stopped before transmitting.
+ */
+static void ieee802154_stop_queue(struct ieee802154_hw *hw)
{
struct ieee802154_local *local = hw_to_local(hw);
struct ieee802154_sub_if_data *sdata;
@@ -43,14 +63,47 @@ void ieee802154_stop_queue(struct ieee802154_hw *hw)
}
rcu_read_unlock();
}
-EXPORT_SYMBOL(ieee802154_stop_queue);
+
+void ieee802154_hold_queue(struct ieee802154_local *local)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&local->phy->queue_lock, flags);
+ if (!atomic_fetch_inc(&local->phy->hold_txs))
+ ieee802154_stop_queue(&local->hw);
+ spin_unlock_irqrestore(&local->phy->queue_lock, flags);
+}
+
+void ieee802154_release_queue(struct ieee802154_local *local)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&local->phy->queue_lock, flags);
+ if (atomic_dec_and_test(&local->phy->hold_txs))
+ ieee802154_wake_queue(&local->hw);
+ spin_unlock_irqrestore(&local->phy->queue_lock, flags);
+}
+
+void ieee802154_disable_queue(struct ieee802154_local *local)
+{
+ struct ieee802154_sub_if_data *sdata;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (!sdata->dev)
+ continue;
+
+ netif_tx_disable(sdata->dev);
+ }
+ rcu_read_unlock();
+}
enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer)
{
struct ieee802154_local *local =
container_of(timer, struct ieee802154_local, ifs_timer);
- ieee802154_wake_queue(&local->hw);
+ ieee802154_release_queue(local);
return HRTIMER_NORESTART;
}
@@ -84,10 +137,12 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb,
hw->phy->sifs_period * NSEC_PER_USEC,
HRTIMER_MODE_REL);
} else {
- ieee802154_wake_queue(hw);
+ ieee802154_release_queue(local);
}
dev_consume_skb_any(skb);
+ if (atomic_dec_and_test(&hw->phy->ongoing_txs))
+ wake_up(&hw->phy->sync_txq);
}
EXPORT_SYMBOL(ieee802154_xmit_complete);
@@ -97,8 +152,10 @@ void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb,
struct ieee802154_local *local = hw_to_local(hw);
local->tx_result = reason;
- ieee802154_wake_queue(hw);
+ ieee802154_release_queue(local);
dev_kfree_skb_any(skb);
+ if (atomic_dec_and_test(&hw->phy->ongoing_txs))
+ wake_up(&hw->phy->sync_txq);
}
EXPORT_SYMBOL(ieee802154_xmit_error);
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index b6b5e496fa40..fc9e728b6333 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -665,12 +665,14 @@ static __init int mctp_init(void)
rc = mctp_neigh_init();
if (rc)
- goto err_unreg_proto;
+ goto err_unreg_routes;
mctp_device_init();
return 0;
+err_unreg_routes:
+ mctp_routes_exit();
err_unreg_proto:
proto_unregister(&mctp_proto);
err_unreg_sock:
diff --git a/net/mctp/route.c b/net/mctp/route.c
index 2155f15a074c..f9a80b82dc51 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -1400,7 +1400,7 @@ int __init mctp_routes_init(void)
return register_pernet_subsys(&mctp_net_ops);
}
-void __exit mctp_routes_exit(void)
+void mctp_routes_exit(void)
{
unregister_pernet_subsys(&mctp_net_ops);
rtnl_unregister(PF_MCTP, RTM_DELROUTE);
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index b52afe316dc4..35b5f806fdda 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1079,9 +1079,9 @@ static void mpls_get_stats(struct mpls_dev *mdev,
p = per_cpu_ptr(mdev->stats, i);
do {
- start = u64_stats_fetch_begin_irq(&p->syncp);
+ start = u64_stats_fetch_begin(&p->syncp);
local = p->stats;
- } while (u64_stats_fetch_retry_irq(&p->syncp, start));
+ } while (u64_stats_fetch_retry(&p->syncp, start));
stats->rx_packets += local.rx_packets;
stats->rx_bytes += local.rx_bytes;
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
index 6e7df47c9584..a3829ce548f9 100644
--- a/net/mptcp/Makefile
+++ b/net/mptcp/Makefile
@@ -2,7 +2,7 @@
obj-$(CONFIG_MPTCP) += mptcp.o
mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
- mib.o pm_netlink.o sockopt.o pm_userspace.o
+ mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c
new file mode 100644
index 000000000000..d237d142171c
--- /dev/null
+++ b/net/mptcp/fastopen.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+/* MPTCP Fast Open Mechanism
+ *
+ * Copyright (c) 2021-2022, Dmytro SHYTYI
+ */
+
+#include "protocol.h"
+
+void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow,
+ struct request_sock *req)
+{
+ struct sock *ssk = subflow->tcp_sock;
+ struct sock *sk = subflow->conn;
+ struct sk_buff *skb;
+ struct tcp_sock *tp;
+
+ tp = tcp_sk(ssk);
+
+ subflow->is_mptfo = 1;
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+ if (WARN_ON_ONCE(!skb))
+ return;
+
+ /* dequeue the skb from sk receive queue */
+ __skb_unlink(skb, &ssk->sk_receive_queue);
+ skb_ext_reset(skb);
+ skb_orphan(skb);
+
+ /* We copy the fastopen data, but that doesn't belong to the mptcp sequence
+ * space, need to offset it in the subflow sequence, see mptcp_subflow_get_map_offset()
+ */
+ tp->copied_seq += skb->len;
+ subflow->ssn_offset += skb->len;
+
+ /* initialize a dummy sequence number, we will update it at MPC
+ * completion, if needed
+ */
+ MPTCP_SKB_CB(skb)->map_seq = -skb->len;
+ MPTCP_SKB_CB(skb)->end_seq = 0;
+ MPTCP_SKB_CB(skb)->offset = 0;
+ MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
+
+ mptcp_data_lock(sk);
+
+ mptcp_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+
+ sk->sk_data_ready(sk);
+
+ mptcp_data_unlock(sk);
+}
+
+void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
+ const struct mptcp_options_received *mp_opt)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct sk_buff *skb;
+
+ mptcp_data_lock(sk);
+ skb = skb_peek_tail(&sk->sk_receive_queue);
+ if (skb) {
+ WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq);
+ pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx", sk,
+ MPTCP_SKB_CB(skb)->map_seq, MPTCP_SKB_CB(skb)->map_seq + msk->ack_seq,
+ MPTCP_SKB_CB(skb)->end_seq, MPTCP_SKB_CB(skb)->end_seq + msk->ack_seq);
+ MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq;
+ MPTCP_SKB_CB(skb)->end_seq += msk->ack_seq;
+ }
+
+ pr_debug("msk=%p ack_seq=%llx", msk, msk->ack_seq);
+ mptcp_data_unlock(sk);
+}
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 30d289044e71..5ded85e2c374 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -26,6 +26,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
{
u8 subtype = *ptr >> 4;
int expected_opsize;
+ u16 subopt;
u8 version;
u8 flags;
u8 i;
@@ -38,11 +39,15 @@ static void mptcp_parse_option(const struct sk_buff *skb,
expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA;
else
expected_opsize = TCPOLEN_MPTCP_MPC_ACK;
+ subopt = OPTION_MPTCP_MPC_ACK;
} else {
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) {
expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK;
- else
+ subopt = OPTION_MPTCP_MPC_SYNACK;
+ } else {
expected_opsize = TCPOLEN_MPTCP_MPC_SYN;
+ subopt = OPTION_MPTCP_MPC_SYN;
+ }
}
/* Cfr RFC 8684 Section 3.3.0:
@@ -85,7 +90,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->deny_join_id0 = !!(flags & MPTCP_CAP_DENY_JOIN_ID0);
- mp_opt->suboptions |= OPTIONS_MPTCP_MPC;
+ mp_opt->suboptions |= subopt;
if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) {
mp_opt->sndr_key = get_unaligned_be64(ptr);
ptr += 8;
@@ -934,7 +939,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) &&
!subflow->request_join)
tcp_send_ack(ssk);
- goto fully_established;
+ goto check_notify;
}
/* we must process OoO packets before the first subflow is fully
@@ -945,17 +950,20 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) {
if (subflow->mp_join)
goto reset;
+ if (subflow->is_mptfo && mp_opt->suboptions & OPTION_MPTCP_MPC_ACK)
+ goto set_fully_established;
return subflow->mp_capable;
}
- if (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) ||
- ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo)) {
+ if (subflow->remote_key_valid &&
+ (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) ||
+ ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo))) {
/* subflows are fully established as soon as we get any
* additional ack, including ADD_ADDR.
*/
subflow->fully_established = 1;
WRITE_ONCE(msk->fully_established, true);
- goto fully_established;
+ goto check_notify;
}
/* If the first established packet does not contain MP_CAPABLE + data
@@ -974,11 +982,12 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
if (mp_opt->deny_join_id0)
WRITE_ONCE(msk->pm.remote_deny_join_id0, true);
+set_fully_established:
if (unlikely(!READ_ONCE(msk->pm.server_side)))
pr_warn_once("bogus mpc option on established client sk");
mptcp_subflow_fully_established(subflow, mp_opt);
-fully_established:
+check_notify:
/* if the subflow is not already linked into the conn_list, we can't
* notify the PM: this subflow is still on the listener queue
* and the PM possibly acquiring the subflow lock could race with
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 9813ed0fde9b..d66fbd558263 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -912,10 +912,14 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
*/
if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID)
pernet->next_id = 1;
- if (pernet->addrs >= MPTCP_PM_ADDR_MAX)
+ if (pernet->addrs >= MPTCP_PM_ADDR_MAX) {
+ ret = -ERANGE;
goto out;
- if (test_bit(entry->addr.id, pernet->id_bitmap))
+ }
+ if (test_bit(entry->addr.id, pernet->id_bitmap)) {
+ ret = -EBUSY;
goto out;
+ }
/* do not insert duplicate address, differentiate on port only
* singled addresses
@@ -929,8 +933,10 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
* endpoint is an implicit one and the user-space
* did not provide an endpoint id
*/
- if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT))
+ if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) {
+ ret = -EEXIST;
goto out;
+ }
if (entry->addr.id)
goto out;
@@ -1003,16 +1009,12 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
return err;
msk = mptcp_sk(entry->lsk->sk);
- if (!msk) {
- err = -EINVAL;
- goto out;
- }
+ if (!msk)
+ return -EINVAL;
ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
- err = -EINVAL;
- goto out;
- }
+ if (!ssock)
+ return -EINVAL;
mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -1020,22 +1022,14 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
addrlen = sizeof(struct sockaddr_in6);
#endif
err = kernel_bind(ssock, (struct sockaddr *)&addr, addrlen);
- if (err) {
- pr_warn("kernel_bind error, err=%d", err);
- goto out;
- }
+ if (err)
+ return err;
err = kernel_listen(ssock, backlog);
- if (err) {
- pr_warn("kernel_listen error, err=%d", err);
- goto out;
- }
+ if (err)
+ return err;
return 0;
-
-out:
- sock_release(entry->lsk);
- return err;
}
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
@@ -1327,7 +1321,7 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
}
- entry = kmalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT);
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT);
if (!entry) {
GENL_SET_ERR_MSG(info, "can't allocate addr");
return -ENOMEM;
@@ -1337,23 +1331,22 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
if (entry->addr.port) {
ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry);
if (ret) {
- GENL_SET_ERR_MSG(info, "create listen socket error");
- kfree(entry);
- return ret;
+ GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret);
+ goto out_free;
}
}
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
if (ret < 0) {
- GENL_SET_ERR_MSG(info, "too many addresses or duplicate one");
- if (entry->lsk)
- sock_release(entry->lsk);
- kfree(entry);
- return ret;
+ GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret);
+ goto out_free;
}
mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk));
-
return 0;
+
+out_free:
+ __mptcp_pm_release_addr_entry(entry);
+ return ret;
}
int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id,
diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c
index 9e82250cbb70..5cb65f0928f4 100644
--- a/net/mptcp/pm_userspace.c
+++ b/net/mptcp/pm_userspace.c
@@ -291,7 +291,7 @@ int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info)
goto create_err;
}
- sk = &msk->sk.icsk_inet.sk;
+ sk = (struct sock *)msk;
lock_sock(sk);
err = __mptcp_subflow_connect(sk, &addr_l, &addr_r);
@@ -403,7 +403,7 @@ int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info)
goto destroy_err;
}
- sk = &msk->sk.icsk_inet.sk;
+ sk = (struct sock *)msk;
lock_sock(sk);
ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r);
if (ssk) {
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index f599ad44ed24..b0d387be500a 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -36,15 +36,6 @@ struct mptcp6_sock {
};
#endif
-struct mptcp_skb_cb {
- u64 map_seq;
- u64 end_seq;
- u32 offset;
- u8 has_rxtstamp:1;
-};
-
-#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
-
enum {
MPTCP_CMSG_TS = BIT(0),
MPTCP_CMSG_INQ = BIT(1),
@@ -200,7 +191,7 @@ static void mptcp_rfree(struct sk_buff *skb)
mptcp_rmem_uncharge(sk, len);
}
-static void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
+void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
skb_orphan(skb);
skb->sk = sk;
@@ -1602,7 +1593,7 @@ out:
__mptcp_check_send_data_fin(sk);
}
-static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
+static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_sendmsg_info info = {
@@ -1611,7 +1602,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
struct mptcp_data_frag *dfrag;
struct sock *xmit_ssk;
int len, copied = 0;
- bool first = true;
info.flags = 0;
while ((dfrag = mptcp_send_head(sk))) {
@@ -1621,11 +1611,10 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
while (len > 0) {
int ret = 0;
- /* the caller already invoked the packet scheduler,
- * check for a different subflow usage only after
+ /* check for a different subflow usage only after
* spooling the first chunk of data
*/
- xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk));
+ xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk);
if (!xmit_ssk)
goto out;
if (xmit_ssk != ssk) {
@@ -1673,6 +1662,37 @@ static void mptcp_set_nospace(struct sock *sk)
set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags);
}
+static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msghdr *msg,
+ size_t len, int *copied_syn)
+{
+ unsigned int saved_flags = msg->msg_flags;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ int ret;
+
+ lock_sock(ssk);
+ msg->msg_flags |= MSG_DONTWAIT;
+ msk->connect_flags = O_NONBLOCK;
+ msk->is_sendmsg = 1;
+ ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL);
+ msk->is_sendmsg = 0;
+ msg->msg_flags = saved_flags;
+ release_sock(ssk);
+
+ /* do the blocking bits of inet_stream_connect outside the ssk socket lock */
+ if (ret == -EINPROGRESS && !(msg->msg_flags & MSG_DONTWAIT)) {
+ ret = __inet_stream_connect(sk->sk_socket, msg->msg_name,
+ msg->msg_namelen, msg->msg_flags, 1);
+
+ /* Keep the same behaviour of plain TCP: zero the copied bytes in
+ * case of any error, except timeout or signal
+ */
+ if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR)
+ *copied_syn = 0;
+ }
+
+ return ret;
+}
+
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1682,34 +1702,22 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int ret = 0;
long timeo;
- /* we don't support FASTOPEN yet */
- if (msg->msg_flags & MSG_FASTOPEN)
- return -EOPNOTSUPP;
-
/* silently ignore everything else */
- msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL;
+ msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_FASTOPEN;
lock_sock(sk);
ssock = __mptcp_nmpc_socket(msk);
- if (unlikely(ssock && inet_sk(ssock->sk)->defer_connect)) {
- struct sock *ssk = ssock->sk;
+ if (unlikely(ssock && (inet_sk(ssock->sk)->defer_connect ||
+ msg->msg_flags & MSG_FASTOPEN))) {
int copied_syn = 0;
- lock_sock(ssk);
-
- ret = tcp_sendmsg_fastopen(ssk, msg, &copied_syn, len, NULL);
+ ret = mptcp_sendmsg_fastopen(sk, ssock->sk, msg, len, &copied_syn);
copied += copied_syn;
- if (ret == -EINPROGRESS && copied_syn > 0) {
- /* reflect the new state on the MPTCP socket */
- inet_sk_state_store(sk, inet_sk_state_load(ssk));
- release_sock(ssk);
+ if (ret == -EINPROGRESS && copied_syn > 0)
goto out;
- } else if (ret) {
- release_sock(ssk);
+ else if (ret)
goto do_error;
- }
- release_sock(ssk);
}
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
@@ -2253,7 +2261,7 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
struct mptcp_data_frag *cur, *rtx_head;
struct mptcp_sock *msk = mptcp_sk(sk);
- if (__mptcp_check_fallback(mptcp_sk(sk)))
+ if (__mptcp_check_fallback(msk))
return false;
if (tcp_rtx_and_write_queues_empty(sk))
@@ -2332,12 +2340,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
goto out;
}
- /* if we are invoked by the msk cleanup code, the subflow is
- * already orphaned
- */
- if (ssk->sk_socket)
- sock_orphan(ssk);
-
+ sock_orphan(ssk);
subflow->disposable = 1;
/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
@@ -2434,7 +2437,7 @@ static bool mptcp_check_close_timeout(const struct sock *sk)
static void mptcp_check_fastclose(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow, *tmp;
- struct sock *sk = &msk->sk.icsk_inet.sk;
+ struct sock *sk = (struct sock *)msk;
if (likely(!READ_ONCE(msk->rcv_fastclose)))
return;
@@ -2596,7 +2599,7 @@ static void mptcp_do_fastclose(struct sock *sk)
static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
- struct sock *sk = &msk->sk.icsk_inet.sk;
+ struct sock *sk = (struct sock *)msk;
unsigned long fail_tout;
int state;
@@ -2708,6 +2711,8 @@ static int mptcp_init_sock(struct sock *sk)
if (ret)
return ret;
+ set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
+
/* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will
* propagate the correct value
*/
@@ -2918,14 +2923,18 @@ cleanup:
if (ssk == msk->first)
subflow->fail_tout = 0;
- sock_orphan(ssk);
+ /* detach from the parent socket, but allow data_ready to
+ * push incoming data into the mptcp stack, to properly ack it
+ */
+ ssk->sk_socket = NULL;
+ ssk->sk_wq = NULL;
unlock_sock_fast(ssk, slow);
}
sock_orphan(sk);
sock_hold(sk);
pr_debug("msk=%p state=%d", sk, sk->sk_state);
- if (mptcp_sk(sk)->token)
+ if (msk->token)
mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
if (sk->sk_state == TCP_CLOSE) {
@@ -2952,7 +2961,7 @@ static void mptcp_close(struct sock *sk, long timeout)
sock_put(sk);
}
-static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
+void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
@@ -2984,8 +2993,8 @@ static int mptcp_disconnect(struct sock *sk, int flags)
mptcp_stop_timer(sk);
sk_stop_timer(sk, &sk->sk_timer);
- if (mptcp_sk(sk)->token)
- mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
+ if (msk->token)
+ mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
/* msk->subflow is still intact, the following will not free the first
* subflow
@@ -3027,7 +3036,6 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
struct mptcp_sock *msk;
- u64 ack_seq;
if (!nsk)
return NULL;
@@ -3053,15 +3061,6 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
- if (mp_opt->suboptions & OPTIONS_MPTCP_MPC) {
- msk->can_ack = true;
- msk->remote_key = mp_opt->sndr_key;
- mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
- ack_seq++;
- WRITE_ONCE(msk->ack_seq, ack_seq);
- atomic64_set(&msk->rcv_wnd_sent, ack_seq);
- }
-
sock_reset_flag(nsk, SOCK_RCU_FREE);
/* will be fully established after successful MPC subflow creation */
inet_sk_state_store(nsk, TCP_SYN_RECV);
@@ -3196,16 +3195,10 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
if (!mptcp_send_head(sk))
return;
- if (!sock_owned_by_user(sk)) {
- struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk));
-
- if (xmit_ssk == ssk)
- __mptcp_subflow_push_pending(sk, ssk);
- else if (xmit_ssk)
- mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), MPTCP_DELEGATE_SEND);
- } else {
+ if (!sock_owned_by_user(sk))
+ __mptcp_subflow_push_pending(sk, ssk, false);
+ else
__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
- }
}
#define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \
@@ -3296,7 +3289,7 @@ void mptcp_subflow_process_delegated(struct sock *ssk)
if (test_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status)) {
mptcp_data_lock(sk);
if (!sock_owned_by_user(sk))
- __mptcp_subflow_push_pending(sk, ssk);
+ __mptcp_subflow_push_pending(sk, ssk, true);
else
__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
mptcp_data_unlock(sk);
@@ -3340,7 +3333,6 @@ void mptcp_finish_connect(struct sock *ssk)
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk;
struct sock *sk;
- u64 ack_seq;
subflow = mptcp_subflow_ctx(ssk);
sk = subflow->conn;
@@ -3348,22 +3340,16 @@ void mptcp_finish_connect(struct sock *ssk)
pr_debug("msk=%p, token=%u", sk, subflow->token);
- mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
- ack_seq++;
- subflow->map_seq = ack_seq;
+ subflow->map_seq = subflow->iasn;
subflow->map_subflow_seq = 1;
/* the socket is not connected yet, no msk/subflow ops can access/race
* accessing the field below
*/
- WRITE_ONCE(msk->remote_key, subflow->remote_key);
WRITE_ONCE(msk->local_key, subflow->local_key);
WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
WRITE_ONCE(msk->snd_nxt, msk->write_seq);
- WRITE_ONCE(msk->ack_seq, ack_seq);
- WRITE_ONCE(msk->can_ack, 1);
WRITE_ONCE(msk->snd_una, msk->write_seq);
- atomic64_set(&msk->rcv_wnd_sent, ack_seq);
mptcp_pm_new_connection(msk, ssk, 0);
@@ -3507,10 +3493,73 @@ static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return put_user(answ, (int __user *)arg);
}
+static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow)
+{
+ subflow->request_mptcp = 0;
+ __mptcp_do_fallback(msk);
+}
+
+static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct socket *ssock;
+ int err = -EINVAL;
+
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock)
+ return -EINVAL;
+
+ mptcp_token_destroy(msk);
+ inet_sk_state_store(sk, TCP_SYN_SENT);
+ subflow = mptcp_subflow_ctx(ssock->sk);
+#ifdef CONFIG_TCP_MD5SIG
+ /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
+ * TCP option space.
+ */
+ if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
+ mptcp_subflow_early_fallback(msk, subflow);
+#endif
+ if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) {
+ MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT);
+ mptcp_subflow_early_fallback(msk, subflow);
+ }
+ if (likely(!__mptcp_check_fallback(msk)))
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE);
+
+ /* if reaching here via the fastopen/sendmsg path, the caller already
+ * acquired the subflow socket lock, too.
+ */
+ if (msk->is_sendmsg)
+ err = __inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags, 1);
+ else
+ err = inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags);
+ inet_sk(sk)->defer_connect = inet_sk(ssock->sk)->defer_connect;
+
+ /* on successful connect, the msk state will be moved to established by
+ * subflow_finish_connect()
+ */
+ if (unlikely(err && err != -EINPROGRESS)) {
+ inet_sk_state_store(sk, inet_sk_state_load(ssock->sk));
+ return err;
+ }
+
+ mptcp_copy_inaddrs(sk, ssock->sk);
+
+ /* non-blocking connect, mptcp-level inet_stream_connect will error out
+ * without changing the socket state, update it here.
+ */
+ if (err == -EINPROGRESS)
+ sk->sk_socket->state = ssock->state;
+ return err;
+}
+
static struct proto mptcp_prot = {
.name = "MPTCP",
.owner = THIS_MODULE,
.init = mptcp_init_sock,
+ .connect = mptcp_connect,
.disconnect = mptcp_disconnect,
.close = mptcp_close,
.accept = mptcp_accept,
@@ -3562,78 +3611,16 @@ unlock:
return err;
}
-static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
- struct mptcp_subflow_context *subflow)
-{
- subflow->request_mptcp = 0;
- __mptcp_do_fallback(msk);
-}
-
static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
- struct mptcp_sock *msk = mptcp_sk(sock->sk);
- struct mptcp_subflow_context *subflow;
- struct socket *ssock;
- int err = -EINVAL;
+ int ret;
lock_sock(sock->sk);
- if (uaddr) {
- if (addr_len < sizeof(uaddr->sa_family))
- goto unlock;
-
- if (uaddr->sa_family == AF_UNSPEC) {
- err = mptcp_disconnect(sock->sk, flags);
- sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
- goto unlock;
- }
- }
-
- if (sock->state != SS_UNCONNECTED && msk->subflow) {
- /* pending connection or invalid state, let existing subflow
- * cope with that
- */
- ssock = msk->subflow;
- goto do_connect;
- }
-
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock)
- goto unlock;
-
- mptcp_token_destroy(msk);
- inet_sk_state_store(sock->sk, TCP_SYN_SENT);
- subflow = mptcp_subflow_ctx(ssock->sk);
-#ifdef CONFIG_TCP_MD5SIG
- /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
- * TCP option space.
- */
- if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
- mptcp_subflow_early_fallback(msk, subflow);
-#endif
- if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) {
- MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT);
- mptcp_subflow_early_fallback(msk, subflow);
- }
- if (likely(!__mptcp_check_fallback(msk)))
- MPTCP_INC_STATS(sock_net(sock->sk), MPTCP_MIB_MPCAPABLEACTIVE);
-
-do_connect:
- err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
- inet_sk(sock->sk)->defer_connect = inet_sk(ssock->sk)->defer_connect;
- sock->state = ssock->state;
-
- /* on successful connect, the msk state will be moved to established by
- * subflow_finish_connect()
- */
- if (!err || err == -EINPROGRESS)
- mptcp_copy_inaddrs(sock->sk, ssock->sk);
- else
- inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
-
-unlock:
+ mptcp_sk(sock->sk)->connect_flags = flags;
+ ret = __inet_stream_connect(sock, uaddr, addr_len, flags, 0);
release_sock(sock->sk);
- return err;
+ return ret;
}
static int mptcp_listen(struct socket *sock, int backlog)
@@ -3684,6 +3671,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
struct mptcp_subflow_context *subflow;
struct sock *newsk = newsock->sk;
+ set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags);
+
lock_sock(newsk);
/* PM/worker can now acquire the first subflow socket
@@ -3699,7 +3688,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
if (mptcp_is_fully_established(newsk))
mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL);
- mptcp_copy_inaddrs(newsk, msk->first);
mptcp_rcv_space_init(msk, msk->first);
mptcp_propagate_sndbuf(newsk, msk->first);
@@ -3898,12 +3886,6 @@ static const struct proto_ops mptcp_v6_stream_ops = {
static struct proto mptcp_v6_prot;
-static void mptcp_v6_destroy(struct sock *sk)
-{
- mptcp_destroy(sk);
- inet6_destroy_sock(sk);
-}
-
static struct inet_protosw mptcp_v6_protosw = {
.type = SOCK_STREAM,
.protocol = IPPROTO_MPTCP,
@@ -3919,7 +3901,6 @@ int __init mptcp_proto_v6_init(void)
mptcp_v6_prot = mptcp_prot;
strcpy(mptcp_v6_prot.name, "MPTCPv6");
mptcp_v6_prot.slab = NULL;
- mptcp_v6_prot.destroy = mptcp_v6_destroy;
mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);
err = proto_register(&mptcp_v6_prot, 1);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index c0b5b4628f65..8b4379a2cd85 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -126,6 +126,15 @@
#define MPTCP_CONNECTED 6
#define MPTCP_RESET_SCHEDULER 7
+struct mptcp_skb_cb {
+ u64 map_seq;
+ u64 end_seq;
+ u32 offset;
+ u8 has_rxtstamp:1;
+};
+
+#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
+
static inline bool before64(__u64 seq1, __u64 seq2)
{
return (__s64)(seq1 - seq2) < 0;
@@ -285,7 +294,9 @@ struct mptcp_sock {
u8 mpc_endpoint_id;
u8 recvmsg_inq:1,
cork:1,
- nodelay:1;
+ nodelay:1,
+ is_sendmsg:1;
+ int connect_flags;
struct work_struct work;
struct sk_buff *ooo_last_skb;
struct rb_root out_of_order_queue;
@@ -465,17 +476,22 @@ struct mptcp_subflow_context {
send_fastclose : 1,
send_infinite_map : 1,
rx_eof : 1,
- can_ack : 1, /* only after processing the remote a key */
+ remote_key_valid : 1, /* received the peer key */
disposable : 1, /* ctx can be free at ulp release time */
stale : 1, /* unable to snd/rcv data, do not use for xmit */
local_id_valid : 1, /* local_id is correctly initialized */
- valid_csum_seen : 1; /* at least one csum validated */
+ valid_csum_seen : 1, /* at least one csum validated */
+ is_mptfo : 1, /* subflow is doing TFO */
+ __unused : 8;
enum mptcp_data_avail data_avail;
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
u32 remote_token;
- u8 hmac[MPTCPOPT_HMAC_LEN];
+ union {
+ u8 hmac[MPTCPOPT_HMAC_LEN]; /* MPJ subflow only */
+ u64 iasn; /* initial ack sequence number, MPC subflows only */
+ };
u8 local_id;
u8 remote_id;
u8 reset_seen:1;
@@ -599,8 +615,9 @@ int mptcp_is_checksum_enabled(const struct net *net);
int mptcp_allow_join_id0(const struct net *net);
unsigned int mptcp_stale_loss_cnt(const struct net *net);
int mptcp_get_pm_type(const struct net *net);
+void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
- struct mptcp_options_received *mp_opt);
+ const struct mptcp_options_received *mp_opt);
bool __mptcp_retransmit_pending_data(struct sock *sk);
void mptcp_check_and_set_pending(struct sock *sk);
void __mptcp_push_pending(struct sock *sk, unsigned int flags);
@@ -616,6 +633,7 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent);
struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
bool __mptcp_close(struct sock *sk, long timeout);
void mptcp_cancel_work(struct sock *sk);
+void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk);
bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
const struct mptcp_addr_info *b, bool use_port);
@@ -823,6 +841,11 @@ void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_
void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id);
bool mptcp_userspace_pm_active(const struct mptcp_sock *msk);
+void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
+ const struct mptcp_options_received *mp_opt);
+void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow,
+ struct request_sock *req);
+
static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk)
{
return READ_ONCE(msk->pm.addr_signal) &
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index c7cb68c725b2..a47423ebb33a 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -559,7 +559,10 @@ static bool mptcp_supported_sockopt(int level, int optname)
case TCP_NOTSENT_LOWAT:
case TCP_TX_DELAY:
case TCP_INQ:
+ case TCP_FASTOPEN:
case TCP_FASTOPEN_CONNECT:
+ case TCP_FASTOPEN_KEY:
+ case TCP_FASTOPEN_NO_COOKIE:
return true;
}
@@ -568,9 +571,6 @@ static bool mptcp_supported_sockopt(int level, int optname)
/* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS,
* TCP_REPAIR_WINDOW are not supported, better avoid this mess
*/
- /* TCP_FASTOPEN_KEY, TCP_FASTOPEN, TCP_FASTOPEN_NO_COOKIE,
- * are not supported fastopen is currently unsupported
- */
}
return false;
}
@@ -757,29 +757,17 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname,
return -EOPNOTSUPP;
}
-static int mptcp_setsockopt_sol_tcp_defer(struct mptcp_sock *msk, sockptr_t optval,
- unsigned int optlen)
-{
- struct socket *listener;
-
- listener = __mptcp_nmpc_socket(msk);
- if (!listener)
- return 0; /* TCP_DEFER_ACCEPT does not fail */
-
- return tcp_setsockopt(listener->sk, SOL_TCP, TCP_DEFER_ACCEPT, optval, optlen);
-}
-
-static int mptcp_setsockopt_sol_tcp_fastopen_connect(struct mptcp_sock *msk, sockptr_t optval,
- unsigned int optlen)
+static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct socket *sock;
- /* Limit to first subflow */
+ /* Limit to first subflow, before the connection establishment */
sock = __mptcp_nmpc_socket(msk);
if (!sock)
return -EINVAL;
- return tcp_setsockopt(sock->sk, SOL_TCP, TCP_FASTOPEN_CONNECT, optval, optlen);
+ return tcp_setsockopt(sock->sk, level, optname, optval, optlen);
}
static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
@@ -809,9 +797,15 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
case TCP_NODELAY:
return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen);
case TCP_DEFER_ACCEPT:
- return mptcp_setsockopt_sol_tcp_defer(msk, optval, optlen);
+ /* See tcp.c: TCP_DEFER_ACCEPT does not fail */
+ mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen);
+ return 0;
+ case TCP_FASTOPEN:
case TCP_FASTOPEN_CONNECT:
- return mptcp_setsockopt_sol_tcp_fastopen_connect(msk, optval, optlen);
+ case TCP_FASTOPEN_KEY:
+ case TCP_FASTOPEN_NO_COOKIE:
+ return mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname,
+ optval, optlen);
}
return -EOPNOTSUPP;
@@ -994,7 +988,7 @@ static int mptcp_getsockopt_tcpinfo(struct mptcp_sock *msk, char __user *optval,
int __user *optlen)
{
struct mptcp_subflow_context *subflow;
- struct sock *sk = &msk->sk.icsk_inet.sk;
+ struct sock *sk = (struct sock *)msk;
unsigned int sfcount = 0, copied = 0;
struct mptcp_subflow_data sfd;
char __user *infoptr;
@@ -1085,8 +1079,8 @@ static void mptcp_get_sub_addrs(const struct sock *sk, struct mptcp_subflow_addr
static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *optval,
int __user *optlen)
{
- struct sock *sk = &msk->sk.icsk_inet.sk;
struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
unsigned int sfcount = 0, copied = 0;
struct mptcp_subflow_data sfd;
char __user *addrptr;
@@ -1173,7 +1167,10 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
case TCP_INFO:
case TCP_CC_INFO:
case TCP_DEFER_ACCEPT:
+ case TCP_FASTOPEN:
case TCP_FASTOPEN_CONNECT:
+ case TCP_FASTOPEN_KEY:
+ case TCP_FASTOPEN_NO_COOKIE:
return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname,
optval, optlen);
case TCP_INQ:
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 07dd23d0fe04..29904303f5c2 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -307,7 +307,48 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
return NULL;
}
+static void subflow_prep_synack(const struct sock *sk, struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ enum tcp_synack_type synack_type)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ /* clear tstamp_ok, as needed depending on cookie */
+ if (foc && foc->len > -1)
+ ireq->tstamp_ok = 0;
+
+ if (synack_type == TCP_SYNACK_FASTOPEN)
+ mptcp_fastopen_subflow_synack_set_params(subflow, req);
+}
+
+static int subflow_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
+ struct flowi *fl,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
+{
+ subflow_prep_synack(sk, req, foc, synack_type);
+
+ return tcp_request_sock_ipv4_ops.send_synack(sk, dst, fl, req, foc,
+ synack_type, syn_skb);
+}
+
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
+ struct flowi *fl,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
+{
+ subflow_prep_synack(sk, req, foc, synack_type);
+
+ return tcp_request_sock_ipv6_ops.send_synack(sk, dst, fl, req, foc,
+ synack_type, syn_skb);
+}
+
static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
struct sk_buff *skb,
struct flowi *fl,
@@ -392,11 +433,33 @@ static void mptcp_set_connected(struct sock *sk)
mptcp_data_unlock(sk);
}
+static void subflow_set_remote_key(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow,
+ const struct mptcp_options_received *mp_opt)
+{
+ /* active MPC subflow will reach here multiple times:
+ * at subflow_finish_connect() time and at 4th ack time
+ */
+ if (subflow->remote_key_valid)
+ return;
+
+ subflow->remote_key_valid = 1;
+ subflow->remote_key = mp_opt->sndr_key;
+ mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn);
+ subflow->iasn++;
+
+ WRITE_ONCE(msk->remote_key, subflow->remote_key);
+ WRITE_ONCE(msk->ack_seq, subflow->iasn);
+ WRITE_ONCE(msk->can_ack, true);
+ atomic64_set(&msk->rcv_wnd_sent, subflow->iasn);
+}
+
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_options_received mp_opt;
struct sock *parent = subflow->conn;
+ struct mptcp_sock *msk;
subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
@@ -404,6 +467,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
if (subflow->conn_finished)
return;
+ msk = mptcp_sk(parent);
mptcp_propagate_sndbuf(parent, sk);
subflow->rel_write_seq = 1;
subflow->conn_finished = 1;
@@ -416,19 +480,16 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
MPTCP_INC_STATS(sock_net(sk),
MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
mptcp_do_fallback(sk);
- pr_fallback(mptcp_sk(subflow->conn));
+ pr_fallback(msk);
goto fallback;
}
if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD)
- WRITE_ONCE(mptcp_sk(parent)->csum_enabled, true);
+ WRITE_ONCE(msk->csum_enabled, true);
if (mp_opt.deny_join_id0)
- WRITE_ONCE(mptcp_sk(parent)->pm.remote_deny_join_id0, true);
+ WRITE_ONCE(msk->pm.remote_deny_join_id0, true);
subflow->mp_capable = 1;
- subflow->can_ack = 1;
- subflow->remote_key = mp_opt.sndr_key;
- pr_debug("subflow=%p, remote_key=%llu", subflow,
- subflow->remote_key);
+ subflow_set_remote_key(msk, subflow, &mp_opt);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK);
mptcp_finish_connect(sk);
mptcp_set_connected(parent);
@@ -466,7 +527,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->mp_join = 1;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
- if (subflow_use_different_dport(mptcp_sk(parent), sk)) {
+ if (subflow_use_different_dport(msk, sk)) {
pr_debug("synack inet_dport=%d %d",
ntohs(inet_sk(sk)->inet_dport),
ntohs(inet_sk(parent)->inet_dport));
@@ -474,7 +535,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
}
} else if (mptcp_check_fallback(sk)) {
fallback:
- mptcp_rcv_space_init(mptcp_sk(parent), sk);
+ mptcp_rcv_space_init(msk, sk);
mptcp_set_connected(parent);
}
return;
@@ -637,14 +698,16 @@ static void subflow_drop_ctx(struct sock *ssk)
}
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
- struct mptcp_options_received *mp_opt)
+ const struct mptcp_options_received *mp_opt)
{
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
- subflow->remote_key = mp_opt->sndr_key;
+ subflow_set_remote_key(msk, subflow, mp_opt);
subflow->fully_established = 1;
- subflow->can_ack = 1;
WRITE_ONCE(msk->fully_established, true);
+
+ if (subflow->is_mptfo)
+ mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt);
}
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
@@ -723,6 +786,8 @@ create_child:
goto dispose_child;
}
+ if (new_msk)
+ mptcp_copy_inaddrs(new_msk, child);
subflow_drop_ctx(child);
goto out;
}
@@ -750,10 +815,15 @@ create_child:
ctx->conn = new_msk;
new_msk = NULL;
+ /* set msk addresses early to ensure mptcp_pm_get_local_id()
+ * uses the correct data
+ */
+ mptcp_copy_inaddrs(ctx->conn, child);
+
/* with OoO packets we can reach here without ingress
* mpc option
*/
- if (mp_opt.suboptions & OPTIONS_MPTCP_MPC)
+ if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK)
mptcp_subflow_fully_established(ctx, &mp_opt);
} else if (ctx->mp_join) {
struct mptcp_sock *owner;
@@ -1191,16 +1261,8 @@ static bool subflow_check_data_avail(struct sock *ssk)
if (WARN_ON_ONCE(!skb))
goto no_data;
- /* if msk lacks the remote key, this subflow must provide an
- * MP_CAPABLE-based mapping
- */
- if (unlikely(!READ_ONCE(msk->can_ack))) {
- if (!subflow->mpc_map)
- goto fallback;
- WRITE_ONCE(msk->remote_key, subflow->remote_key);
- WRITE_ONCE(msk->ack_seq, subflow->map_seq);
- WRITE_ONCE(msk->can_ack, true);
- }
+ if (unlikely(!READ_ONCE(msk->can_ack)))
+ goto fallback;
old_ack = READ_ONCE(msk->ack_seq);
ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
@@ -1473,6 +1535,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id,
&flags, &ifindex);
+ subflow->remote_key_valid = 1;
subflow->remote_key = msk->remote_key;
subflow->local_key = msk->local_key;
subflow->token = msk->token;
@@ -1595,7 +1658,9 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
/* kernel sockets do not by default acquire net ref, but TCP timer
* needs it.
+ * Update ns_tracker to current stack trace and refcounted tracker.
*/
+ __netns_tracker_free(net, &sf->sk->ns_tracker, false);
sf->sk->sk_net_refcnt = 1;
get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL);
sock_inuse_add(net, 1);
@@ -1738,16 +1803,16 @@ void mptcp_subflow_queue_clean(struct sock *listener_ssk)
for (msk = head; msk; msk = next) {
struct sock *sk = (struct sock *)msk;
- bool slow, do_cancel_work;
+ bool do_cancel_work;
sock_hold(sk);
- slow = lock_sock_fast_nested(sk);
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
next = msk->dl_next;
msk->first = NULL;
msk->dl_next = NULL;
do_cancel_work = __mptcp_close(sk, 0);
- unlock_sock_fast(sk, slow);
+ release_sock(sk);
if (do_cancel_work)
mptcp_cancel_work(sk);
sock_put(sk);
@@ -1864,6 +1929,7 @@ static void subflow_ulp_clone(const struct request_sock *req,
new_ctx->ssn_offset = subflow_req->ssn_offset;
new_ctx->mp_join = 1;
new_ctx->fully_established = 1;
+ new_ctx->remote_key_valid = 1;
new_ctx->backup = subflow_req->backup;
new_ctx->remote_id = subflow_req->remote_id;
new_ctx->token = subflow_req->token;
@@ -1920,6 +1986,7 @@ void __init mptcp_subflow_init(void)
subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;
+ subflow_request_sock_ipv4_ops.send_synack = subflow_v4_send_synack;
subflow_specific = ipv4_specific;
subflow_specific.conn_request = subflow_v4_conn_request;
@@ -1933,6 +2000,7 @@ void __init mptcp_subflow_init(void)
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;
+ subflow_request_sock_ipv6_ops.send_synack = subflow_v6_send_synack;
subflow_v6_specific = ipv6_specific;
subflow_v6_specific.conn_request = subflow_v6_conn_request;
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
index f52ee7b26aed..65430f314a68 100644
--- a/net/mptcp/token.c
+++ b/net/mptcp/token.c
@@ -287,8 +287,8 @@ EXPORT_SYMBOL_GPL(mptcp_token_get_sock);
* This function returns the first mptcp connection structure found inside the
* token container starting from the specified position, or NULL.
*
- * On successful iteration, the iterator is move to the next position and the
- * the acquires a reference to the returned socket.
+ * On successful iteration, the iterator is moved to the next position and
+ * a reference to the returned socket is acquired.
*/
struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot,
long *s_num)
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 4b8d04640ff3..0846bd75b1da 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -568,12 +568,6 @@ config NFT_TUNNEL
This option adds the "tunnel" expression that you can use to set
tunneling policies.
-config NFT_OBJREF
- tristate "Netfilter nf_tables stateful object reference module"
- help
- This option adds the "objref" expression that allows you to refer to
- stateful objects, such as counters and quotas.
-
config NFT_QUEUE
depends on NETFILTER_NETLINK_QUEUE
tristate "Netfilter nf_tables queue module"
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 0f060d100880..1d4db1943936 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -86,7 +86,8 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o nft_last.o \
- nft_counter.o nft_chain_route.o nf_tables_offload.o \
+ nft_counter.o nft_objref.o nft_inner.o \
+ nft_chain_route.o nf_tables_offload.o \
nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \
nft_set_pipapo.o
@@ -104,7 +105,6 @@ obj-$(CONFIG_NFT_CT) += nft_ct.o
obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o
obj-$(CONFIG_NFT_LIMIT) += nft_limit.o
obj-$(CONFIG_NFT_NAT) += nft_nat.o
-obj-$(CONFIG_NFT_OBJREF) += nft_objref.o
obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
obj-$(CONFIG_NFT_QUOTA) += nft_quota.o
obj-$(CONFIG_NFT_REJECT) += nft_reject.o
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 6e391308431d..7499192af586 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -42,31 +42,8 @@
#define AHASH_MAX_SIZE (6 * AHASH_INIT_SIZE)
/* Max muber of elements in the array block when tuned */
#define AHASH_MAX_TUNED 64
-
#define AHASH_MAX(h) ((h)->bucketsize)
-/* Max number of elements can be tuned */
-#ifdef IP_SET_HASH_WITH_MULTI
-static u8
-tune_bucketsize(u8 curr, u32 multi)
-{
- u32 n;
-
- if (multi < curr)
- return curr;
-
- n = curr + AHASH_INIT_SIZE;
- /* Currently, at listing one hash bucket must fit into a message.
- * Therefore we have a hard limit here.
- */
- return n > curr && n <= AHASH_MAX_TUNED ? n : curr;
-}
-#define TUNE_BUCKETSIZE(h, multi) \
- ((h)->bucketsize = tune_bucketsize((h)->bucketsize, multi))
-#else
-#define TUNE_BUCKETSIZE(h, multi)
-#endif
-
/* A hash bucket */
struct hbucket {
struct rcu_head rcu; /* for call_rcu */
@@ -936,7 +913,12 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
goto set_full;
/* Create a new slot */
if (n->pos >= n->size) {
- TUNE_BUCKETSIZE(h, multi);
+#ifdef IP_SET_HASH_WITH_MULTI
+ if (h->bucketsize >= AHASH_MAX_TUNED)
+ goto set_full;
+ else if (h->bucketsize <= multi)
+ h->bucketsize += AHASH_INIT_SIZE;
+#endif
if (n->size >= AHASH_MAX(h)) {
/* Trigger rehashing */
mtype_data_next(&h->next, d);
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index dd30c03d5a23..75d556d71652 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -151,18 +151,16 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
if (((u64)ip_to - ip + 1) >> (32 - h->netmask) > IPSET_MAX_RANGE)
return -ERANGE;
- if (retried) {
+ if (retried)
ip = ntohl(h->next.ip);
- e.ip = htonl(ip);
- }
for (; ip <= ip_to;) {
+ e.ip = htonl(ip);
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
ip += hosts;
- e.ip = htonl(ip);
- if (e.ip == 0)
+ if (ip == 0)
return 0;
ret = 0;
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index f9b16f2b2219..fdacbc3c15be 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -599,13 +599,19 @@ static const struct seq_operations ip_vs_app_seq_ops = {
int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs)
{
INIT_LIST_HEAD(&ipvs->app_list);
- proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_seq_ops,
- sizeof(struct seq_net_private));
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_net("ip_vs_app", 0, ipvs->net->proc_net,
+ &ip_vs_app_seq_ops,
+ sizeof(struct seq_net_private)))
+ return -ENOMEM;
+#endif
return 0;
}
void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs)
{
unregister_ip_vs_app(ipvs, NULL /* all */);
+#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_app", ipvs->net->proc_net);
+#endif
}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index fb67f1ca2495..13534e02346c 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1265,8 +1265,8 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
* The drop rate array needs tuning for real environments.
* Called from timer bh only => no locking
*/
- static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
- static char todrop_counter[9] = {0};
+ static const signed char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+ static signed char todrop_counter[9] = {0};
int i;
/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
@@ -1308,7 +1308,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
* Randomly scan 1/32 of the whole table every second
*/
for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
- unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;
+ unsigned int hash = get_random_u32() & ip_vs_conn_tab_mask;
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->ipvs != ipvs)
@@ -1447,20 +1447,36 @@ int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs)
{
atomic_set(&ipvs->conn_count, 0);
- proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
- &ip_vs_conn_seq_ops, sizeof(struct ip_vs_iter_state));
- proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
- &ip_vs_conn_sync_seq_ops,
- sizeof(struct ip_vs_iter_state));
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
+ &ip_vs_conn_seq_ops,
+ sizeof(struct ip_vs_iter_state)))
+ goto err_conn;
+
+ if (!proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
+ &ip_vs_conn_sync_seq_ops,
+ sizeof(struct ip_vs_iter_state)))
+ goto err_conn_sync;
+#endif
+
return 0;
+
+#ifdef CONFIG_PROC_FS
+err_conn_sync:
+ remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
+err_conn:
+ return -ENOMEM;
+#endif
}
void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
{
/* flush all the connection entries first */
ip_vs_conn_flush(ipvs);
+#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net);
+#endif
}
int __init ip_vs_conn_init(void)
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 988222fff9f0..4d62059a6021 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2296,13 +2296,13 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
u64 conns, inpkts, outpkts, inbytes, outbytes;
do {
- start = u64_stats_fetch_begin_irq(&u->syncp);
+ start = u64_stats_fetch_begin(&u->syncp);
conns = u->cnt.conns;
inpkts = u->cnt.inpkts;
outpkts = u->cnt.outpkts;
inbytes = u->cnt.inbytes;
outbytes = u->cnt.outbytes;
- } while (u64_stats_fetch_retry_irq(&u->syncp, start));
+ } while (u64_stats_fetch_retry(&u->syncp, start));
seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
i, (u64)conns, (u64)inpkts,
diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c
index acb55d8393ef..f2579fc9c75b 100644
--- a/net/netfilter/ipvs/ip_vs_twos.c
+++ b/net/netfilter/ipvs/ip_vs_twos.c
@@ -71,8 +71,8 @@ static struct ip_vs_dest *ip_vs_twos_schedule(struct ip_vs_service *svc,
* from 0 to total_weight
*/
total_weight += 1;
- rweight1 = prandom_u32() % total_weight;
- rweight2 = prandom_u32() % total_weight;
+ rweight1 = prandom_u32_max(total_weight);
+ rweight2 = prandom_u32_max(total_weight);
/* Pick two weighted servers */
list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 8639e7efd0e2..24002bc61e07 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -191,19 +191,16 @@ BTF_ID(struct, nf_conn___init)
/* Check writes into `struct nf_conn` */
static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
- const struct btf *btf,
- const struct btf_type *t, int off,
- int size, enum bpf_access_type atype,
- u32 *next_btf_id,
- enum bpf_type_flag *flag)
+ const struct bpf_reg_state *reg,
+ int off, int size, enum bpf_access_type atype,
+ u32 *next_btf_id, enum bpf_type_flag *flag)
{
- const struct btf_type *ncit;
- const struct btf_type *nct;
+ const struct btf_type *ncit, *nct, *t;
size_t end;
- ncit = btf_type_by_id(btf, btf_nf_conn_ids[1]);
- nct = btf_type_by_id(btf, btf_nf_conn_ids[0]);
-
+ ncit = btf_type_by_id(reg->btf, btf_nf_conn_ids[1]);
+ nct = btf_type_by_id(reg->btf, btf_nf_conn_ids[0]);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
if (t != nct && t != ncit) {
bpf_log(log, "only read is supported\n");
return -EACCES;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index f97bda06d2a9..b96338b4bf36 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -211,28 +211,24 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
unsigned int zoneid,
const struct net *net)
{
- struct {
- struct nf_conntrack_man src;
- union nf_inet_addr dst_addr;
- unsigned int zone;
- u32 net_mix;
- u16 dport;
- u16 proto;
- } __aligned(SIPHASH_ALIGNMENT) combined;
+ u64 a, b, c, d;
get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
- memset(&combined, 0, sizeof(combined));
+ /* The direction must be ignored, handle usable tuplehash members manually */
+ a = (u64)tuple->src.u3.all[0] << 32 | tuple->src.u3.all[3];
+ b = (u64)tuple->dst.u3.all[0] << 32 | tuple->dst.u3.all[3];
- /* The direction must be ignored, so handle usable members manually. */
- combined.src = tuple->src;
- combined.dst_addr = tuple->dst.u3;
- combined.zone = zoneid;
- combined.net_mix = net_hash_mix(net);
- combined.dport = (__force __u16)tuple->dst.u.all;
- combined.proto = tuple->dst.protonum;
+ c = (__force u64)tuple->src.u.all << 32 | (__force u64)tuple->dst.u.all << 16;
+ c |= tuple->dst.protonum;
- return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd);
+ d = (u64)zoneid << 32 | net_hash_mix(net);
+
+ /* IPv4: u3.all[1,2,3] == 0 */
+ c ^= (u64)tuple->src.u3.all[1] << 32 | tuple->src.u3.all[2];
+ d += (u64)tuple->dst.u3.all[1] << 32 | tuple->dst.u3.all[2];
+
+ return (u32)siphash_4u64(a, b, c, d, &nf_conntrack_hash_rnd);
}
static u32 scale_hash(u32 hash)
@@ -1781,7 +1777,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
}
#ifdef CONFIG_NF_CONNTRACK_MARK
- ct->mark = exp->master->mark;
+ ct->mark = READ_ONCE(exp->master->mark);
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
ct->secmark = exp->master->secmark;
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index ff737a76052e..48ea6d0264b5 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -26,7 +26,9 @@
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_log.h>
+#include <net/ip.h>
static DEFINE_MUTEX(nf_ct_helper_mutex);
struct hlist_head *nf_ct_helper_hash __read_mostly;
@@ -240,6 +242,104 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
}
EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper);
+/* 'skb' should already be pulled to nh_ofs. */
+int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, u16 proto)
+{
+ const struct nf_conntrack_helper *helper;
+ const struct nf_conn_help *help;
+ unsigned int protoff;
+ int err;
+
+ if (ctinfo == IP_CT_RELATED_REPLY)
+ return NF_ACCEPT;
+
+ help = nfct_help(ct);
+ if (!help)
+ return NF_ACCEPT;
+
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT;
+
+ if (helper->tuple.src.l3num != NFPROTO_UNSPEC &&
+ helper->tuple.src.l3num != proto)
+ return NF_ACCEPT;
+
+ switch (proto) {
+ case NFPROTO_IPV4:
+ protoff = ip_hdrlen(skb);
+ proto = ip_hdr(skb)->protocol;
+ break;
+ case NFPROTO_IPV6: {
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ __be16 frag_off;
+ int ofs;
+
+ ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
+ pr_debug("proto header not found\n");
+ return NF_ACCEPT;
+ }
+ protoff = ofs;
+ proto = nexthdr;
+ break;
+ }
+ default:
+ WARN_ONCE(1, "helper invoked on non-IP family!");
+ return NF_DROP;
+ }
+
+ if (helper->tuple.dst.protonum != proto)
+ return NF_ACCEPT;
+
+ err = helper->help(skb, protoff, ct, ctinfo);
+ if (err != NF_ACCEPT)
+ return err;
+
+ /* Adjust seqs after helper. This is needed due to some helpers (e.g.,
+ * FTP with NAT) adjusting the TCP payload size when mangling IP
+ * addresses and/or port numbers in the text-based control connection.
+ */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
+ return NF_DROP;
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper);
+
+int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family,
+ u8 proto, bool nat, struct nf_conntrack_helper **hp)
+{
+ struct nf_conntrack_helper *helper;
+ struct nf_conn_help *help;
+ int ret = 0;
+
+ helper = nf_conntrack_helper_try_module_get(name, family, proto);
+ if (!helper)
+ return -EINVAL;
+
+ help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
+ if (!help) {
+ nf_conntrack_helper_put(helper);
+ return -ENOMEM;
+ }
+#if IS_ENABLED(CONFIG_NF_NAT)
+ if (nat) {
+ ret = nf_nat_helper_try_module_get(name, family, proto);
+ if (ret) {
+ nf_conntrack_helper_put(helper);
+ return ret;
+ }
+ }
+#endif
+ rcu_assign_pointer(help->helper, helper);
+ *hp = helper;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_add_helper);
+
/* appropriate ct lock protecting must be taken by caller */
static int unhelp(struct nf_conn *ct, void *me)
{
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 7562b215b932..d71150a40fb0 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -328,9 +328,9 @@ nla_put_failure:
}
#ifdef CONFIG_NF_CONNTRACK_MARK
-static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_mark(struct sk_buff *skb, u32 mark)
{
- if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark)))
+ if (nla_put_be32(skb, CTA_MARK, htonl(mark)))
goto nla_put_failure;
return 0;
@@ -543,7 +543,7 @@ static int ctnetlink_dump_extinfo(struct sk_buff *skb,
static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct)
{
if (ctnetlink_dump_status(skb, ct) < 0 ||
- ctnetlink_dump_mark(skb, ct) < 0 ||
+ ctnetlink_dump_mark(skb, READ_ONCE(ct->mark)) < 0 ||
ctnetlink_dump_secctx(skb, ct) < 0 ||
ctnetlink_dump_id(skb, ct) < 0 ||
ctnetlink_dump_use(skb, ct) < 0 ||
@@ -722,6 +722,7 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
struct sk_buff *skb;
unsigned int type;
unsigned int flags = 0, group;
+ u32 mark;
int err;
if (events & (1 << IPCT_DESTROY)) {
@@ -826,8 +827,9 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
}
#ifdef CONFIG_NF_CONNTRACK_MARK
- if ((events & (1 << IPCT_MARK) || ct->mark)
- && ctnetlink_dump_mark(skb, ct) < 0)
+ mark = READ_ONCE(ct->mark);
+ if ((events & (1 << IPCT_MARK) || mark) &&
+ ctnetlink_dump_mark(skb, mark) < 0)
goto nla_put_failure;
#endif
nlmsg_end(skb, nlh);
@@ -1154,7 +1156,7 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
}
#ifdef CONFIG_NF_CONNTRACK_MARK
- if ((ct->mark & filter->mark.mask) != filter->mark.val)
+ if ((READ_ONCE(ct->mark) & filter->mark.mask) != filter->mark.val)
goto ignore_entry;
#endif
status = (u32)READ_ONCE(ct->status);
@@ -2002,9 +2004,9 @@ static void ctnetlink_change_mark(struct nf_conn *ct,
mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
mark = ntohl(nla_get_be32(cda[CTA_MARK]));
- newmark = (ct->mark & mask) ^ mark;
- if (newmark != ct->mark)
- ct->mark = newmark;
+ newmark = (READ_ONCE(ct->mark) & mask) ^ mark;
+ if (newmark != READ_ONCE(ct->mark))
+ WRITE_ONCE(ct->mark, newmark);
}
#endif
@@ -2669,6 +2671,7 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
{
const struct nf_conntrack_zone *zone;
struct nlattr *nest_parms;
+ u32 mark;
zone = nf_ct_zone(ct);
@@ -2730,7 +2733,8 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
goto nla_put_failure;
#ifdef CONFIG_NF_CONNTRACK_MARK
- if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0)
+ mark = READ_ONCE(ct->mark);
+ if (mark && ctnetlink_dump_mark(skb, mark) < 0)
goto nla_put_failure;
#endif
if (ctnetlink_dump_labels(skb, ct) < 0)
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 4ffe84c5a82c..bca839ab1ae8 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -366,7 +366,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
goto release;
#if defined(CONFIG_NF_CONNTRACK_MARK)
- seq_printf(s, "mark=%u ", ct->mark);
+ seq_printf(s, "mark=%u ", READ_ONCE(ct->mark));
#endif
ct_show_secctx(s, ct);
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index b04645ced89b..00b522890d77 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -1098,6 +1098,7 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
struct flow_block_cb *block_cb, *next;
int err = 0;
+ down_write(&flowtable->flow_block_lock);
switch (cmd) {
case FLOW_BLOCK_BIND:
list_splice(&bo->cb_list, &flowtable->flow_block.cb_list);
@@ -1112,6 +1113,7 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
WARN_ON_ONCE(1);
err = -EOPNOTSUPP;
}
+ up_write(&flowtable->flow_block_lock);
return err;
}
@@ -1168,7 +1170,9 @@ static int nf_flow_table_offload_cmd(struct flow_block_offload *bo,
nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable,
extack);
+ down_write(&flowtable->flow_block_lock);
err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo);
+ up_write(&flowtable->flow_block_lock);
if (err < 0)
return err;
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index d8e6380f6337..e29e4ccb5c5a 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -468,7 +468,7 @@ find_free_id:
if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
else
- off = prandom_u32();
+ off = get_random_u16();
attempts = range_size;
if (attempts > max_attempts)
@@ -490,7 +490,7 @@ another_round:
if (attempts >= range_size || attempts < 16)
return;
attempts /= 2;
- off = prandom_u32();
+ off = get_random_u16();
goto another_round;
}
@@ -1152,7 +1152,16 @@ static int __init nf_nat_init(void)
WARN_ON(nf_nat_hook != NULL);
RCU_INIT_POINTER(nf_nat_hook, &nat_hook);
- return register_nf_nat_bpf();
+ ret = register_nf_nat_bpf();
+ if (ret < 0) {
+ RCU_INIT_POINTER(nf_nat_hook, NULL);
+ nf_ct_helper_expectfn_unregister(&follow_master_nat);
+ synchronize_net();
+ unregister_pernet_subsys(&nat_net_ops);
+ kvfree(nf_nat_bysource);
+ }
+
+ return ret;
}
static void __exit nf_nat_cleanup(void)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index a0653a8dfa82..6269b0d9977c 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1534,10 +1534,10 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
for_each_possible_cpu(cpu) {
cpu_stats = per_cpu_ptr(stats, cpu);
do {
- seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ seq = u64_stats_fetch_begin(&cpu_stats->syncp);
pkts = cpu_stats->pkts;
bytes = cpu_stats->bytes;
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, seq));
total.pkts += pkts;
total.bytes += bytes;
}
@@ -2759,7 +2759,7 @@ static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
};
static int nf_tables_fill_expr_info(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name))
goto nla_put_failure;
@@ -2769,7 +2769,7 @@ static int nf_tables_fill_expr_info(struct sk_buff *skb,
NFTA_EXPR_DATA);
if (data == NULL)
goto nla_put_failure;
- if (expr->ops->dump(skb, expr) < 0)
+ if (expr->ops->dump(skb, expr, reset) < 0)
goto nla_put_failure;
nla_nest_end(skb, data);
}
@@ -2781,14 +2781,14 @@ nla_put_failure:
};
int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
struct nlattr *nest;
nest = nla_nest_start_noflag(skb, attr);
if (!nest)
goto nla_put_failure;
- if (nf_tables_fill_expr_info(skb, expr) < 0)
+ if (nf_tables_fill_expr_info(skb, expr, reset) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
return 0;
@@ -2857,6 +2857,43 @@ err1:
return err;
}
+int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla,
+ struct nft_expr_info *info)
+{
+ struct nlattr *tb[NFTA_EXPR_MAX + 1];
+ const struct nft_expr_type *type;
+ int err;
+
+ err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla,
+ nft_expr_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_EXPR_DATA])
+ return -EINVAL;
+
+ type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]);
+ if (IS_ERR(type))
+ return PTR_ERR(type);
+
+ if (!type->inner_ops)
+ return -EOPNOTSUPP;
+
+ err = nla_parse_nested_deprecated(info->tb, type->maxattr,
+ tb[NFTA_EXPR_DATA],
+ type->policy, NULL);
+ if (err < 0)
+ goto err_nla_parse;
+
+ info->attr = nla;
+ info->ops = type->inner_ops;
+
+ return 0;
+
+err_nla_parse:
+ return err;
+}
+
static int nf_tables_newexpr(const struct nft_ctx *ctx,
const struct nft_expr_info *expr_info,
struct nft_expr *expr)
@@ -2997,7 +3034,8 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
u32 flags, int family,
const struct nft_table *table,
const struct nft_chain *chain,
- const struct nft_rule *rule, u64 handle)
+ const struct nft_rule *rule, u64 handle,
+ bool reset)
{
struct nlmsghdr *nlh;
const struct nft_expr *expr, *next;
@@ -3030,7 +3068,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
if (list == NULL)
goto nla_put_failure;
nft_rule_for_each_expr(expr, next, rule) {
- if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0)
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0)
goto nla_put_failure;
}
nla_nest_end(skb, list);
@@ -3081,7 +3119,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx,
err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq,
event, flags, ctx->family, ctx->table,
- ctx->chain, rule, handle);
+ ctx->chain, rule, handle, false);
if (err < 0) {
kfree_skb(skb);
goto err;
@@ -3102,7 +3140,8 @@ static int __nf_tables_dump_rules(struct sk_buff *skb,
unsigned int *idx,
struct netlink_callback *cb,
const struct nft_table *table,
- const struct nft_chain *chain)
+ const struct nft_chain *chain,
+ bool reset)
{
struct net *net = sock_net(skb->sk);
const struct nft_rule *rule, *prule;
@@ -3129,7 +3168,7 @@ static int __nf_tables_dump_rules(struct sk_buff *skb,
NFT_MSG_NEWRULE,
NLM_F_MULTI | NLM_F_APPEND,
table->family,
- table, chain, rule, handle) < 0)
+ table, chain, rule, handle, reset) < 0)
return 1;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -3152,6 +3191,10 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct nftables_pernet *nft_net;
+ bool reset = false;
+
+ if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET)
+ reset = true;
rcu_read_lock();
nft_net = nft_pernet(net);
@@ -3176,14 +3219,15 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
if (!nft_is_active(net, chain))
continue;
__nf_tables_dump_rules(skb, &idx,
- cb, table, chain);
+ cb, table, chain, reset);
break;
}
goto done;
}
list_for_each_entry_rcu(chain, &table->chains, list) {
- if (__nf_tables_dump_rules(skb, &idx, cb, table, chain))
+ if (__nf_tables_dump_rules(skb, &idx,
+ cb, table, chain, reset))
goto done;
}
@@ -3254,6 +3298,7 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
struct net *net = info->net;
struct nft_table *table;
struct sk_buff *skb2;
+ bool reset = false;
int err;
if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
@@ -3290,9 +3335,12 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
if (!skb2)
return -ENOMEM;
+ if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET)
+ reset = true;
+
err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid,
info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
- family, table, chain, rule, 0);
+ family, table, chain, rule, 0, reset);
if (err < 0)
goto err_fill_rule_info;
@@ -4067,7 +4115,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
if (set->num_exprs == 1) {
nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR);
- if (nf_tables_fill_expr_info(skb, set->exprs[0]) < 0)
+ if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
@@ -4078,7 +4126,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
for (i = 0; i < set->num_exprs; i++) {
if (nft_expr_dump(skb, NFTA_LIST_ELEM,
- set->exprs[i]) < 0)
+ set->exprs[i], false) < 0)
goto nla_put_failure;
}
nla_nest_end(skb, nest);
@@ -4909,7 +4957,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb,
if (num_exprs == 1) {
expr = nft_setelem_expr_at(elem_expr, 0);
- if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr) < 0)
+ if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, false) < 0)
return -1;
return 0;
@@ -4920,7 +4968,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb,
nft_setelem_expr_foreach(expr, elem_expr, size) {
expr = nft_setelem_expr_at(elem_expr, size);
- if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0)
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, false) < 0)
goto nla_put_failure;
}
nla_nest_end(skb, nest);
@@ -5865,8 +5913,9 @@ static bool nft_setelem_valid_key_end(const struct nft_set *set,
(NFT_SET_CONCAT | NFT_SET_INTERVAL)) {
if (flags & NFT_SET_ELEM_INTERVAL_END)
return false;
- if (!nla[NFTA_SET_ELEM_KEY_END] &&
- !(flags & NFT_SET_ELEM_CATCHALL))
+
+ if (nla[NFTA_SET_ELEM_KEY_END] &&
+ flags & NFT_SET_ELEM_CATCHALL)
return false;
} else {
if (nla[NFTA_SET_ELEM_KEY_END])
@@ -5957,7 +6006,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
&timeout);
if (err)
return err;
- } else if (set->flags & NFT_SET_TIMEOUT) {
+ } else if (set->flags & NFT_SET_TIMEOUT &&
+ !(flags & NFT_SET_ELEM_INTERVAL_END)) {
timeout = set->timeout;
}
@@ -6023,7 +6073,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
err = -EOPNOTSUPP;
goto err_set_elem_expr;
}
- } else if (set->num_exprs > 0) {
+ } else if (set->num_exprs > 0 &&
+ !(flags & NFT_SET_ELEM_INTERVAL_END)) {
err = nft_set_elem_expr_clone(ctx, set, expr_array);
if (err < 0)
goto err_set_elem_expr_clone;
@@ -8273,6 +8324,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
+ [NFT_MSG_GETRULE_RESET] = {
+ .call = nf_tables_getrule,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFTA_RULE_MAX,
+ .policy = nft_rule_policy,
+ },
[NFT_MSG_DELRULE] = {
.call = nf_tables_delrule,
.type = NFNL_CB_BATCH,
@@ -8464,9 +8521,6 @@ static void nft_commit_release(struct nft_trans *trans)
nf_tables_chain_destroy(&trans->ctx);
break;
case NFT_MSG_DELRULE:
- if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
- nft_flow_rule_destroy(nft_trans_flow_rule(trans));
-
nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
break;
case NFT_MSG_DELSET:
@@ -8972,6 +9026,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nft_rule_expr_deactivate(&trans->ctx,
nft_trans_rule(trans),
NFT_TRANS_COMMIT);
+
+ if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
break;
case NFT_MSG_NEWSET:
nft_clear(net, nft_trans_set(trans));
@@ -10029,6 +10086,8 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event,
nft_net = nft_pernet(net);
deleted = 0;
mutex_lock(&nft_net->commit_mutex);
+ if (!list_empty(&nf_tables_destroy_list))
+ rcu_barrier();
again:
list_for_each_entry(table, &nft_net->tables, list) {
if (nft_table_has_owner(table) &&
@@ -10087,7 +10146,8 @@ static void __net_exit nf_tables_exit_net(struct net *net)
struct nftables_pernet *nft_net = nft_pernet(net);
mutex_lock(&nft_net->commit_mutex);
- if (!list_empty(&nft_net->commit_list))
+ if (!list_empty(&nft_net->commit_list) ||
+ !list_empty(&nft_net->module_list))
__nf_tables_abort(net, NFNL_ABORT_NONE);
__nft_release_tables(net);
mutex_unlock(&nft_net->commit_mutex);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index cee3e4e905ec..709a736c301c 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -340,6 +340,8 @@ static struct nft_expr_type *nft_basic_types[] = {
&nft_exthdr_type,
&nft_last_type,
&nft_counter_type,
+ &nft_objref_type,
+ &nft_inner_type,
};
static struct nft_object_type *nft_basic_objects[] = {
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 9c44518cb70f..6d18fb346868 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -294,6 +294,7 @@ replay:
nfnl_lock(subsys_id);
if (nfnl_dereference_protected(subsys_id) != ss ||
nfnetlink_find_client(type, ss) != nc) {
+ nfnl_unlock(subsys_id);
err = -EAGAIN;
break;
}
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index e6e402b247d0..84eae7cabc67 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -232,7 +232,8 @@ static int nft_bitwise_dump_shift(struct sk_buff *skb,
return 0;
}
-static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_bitwise_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_bitwise *priv = nft_expr_priv(expr);
int err = 0;
@@ -393,7 +394,8 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx,
}
static int
-nft_bitwise_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+nft_bitwise_fast_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
struct nft_data data;
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index f952a80275a8..b66647a5a171 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -148,7 +148,8 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
priv->len);
}
-static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_byteorder_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_byteorder *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 963cf831799c..6eb21a4f5698 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -92,7 +92,8 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return 0;
}
-static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_cmp_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_cmp_expr *priv = nft_expr_priv(expr);
@@ -253,7 +254,8 @@ static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx,
return __nft_cmp_offload(ctx, flow, &cmp);
}
-static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_cmp_fast_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ;
@@ -347,7 +349,8 @@ static int nft_cmp16_fast_offload(struct nft_offload_ctx *ctx,
return __nft_cmp_offload(ctx, flow, &cmp);
}
-static int nft_cmp16_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_cmp16_fast_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ;
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index c16172427622..5284cd2ad532 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -324,7 +324,8 @@ static int nft_extension_dump_info(struct sk_buff *skb, int attr,
return 0;
}
-static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_target_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct xt_target *target = expr->ops->data;
void *info = nft_expr_priv(expr);
@@ -572,12 +573,14 @@ nla_put_failure:
return -1;
}
-static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_match_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
return __nft_match_dump(skb, expr, nft_expr_priv(expr));
}
-static int nft_match_large_dump(struct sk_buff *skb, const struct nft_expr *e)
+static int nft_match_large_dump(struct sk_buff *skb,
+ const struct nft_expr *e, bool reset)
{
struct nft_xt_match_priv *priv = nft_expr_priv(e);
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index d657f999a11b..de9d1980df69 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -185,7 +185,8 @@ static void nft_connlimit_eval(const struct nft_expr *expr,
nft_connlimit_do_eval(priv, regs, pkt, NULL);
}
-static int nft_connlimit_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_connlimit_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_connlimit *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index f4d3573e8782..dccc68a5135a 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -201,11 +201,12 @@ void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs,
nft_counter_do_eval(priv, regs, pkt);
}
-static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_counter_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
- return nft_counter_do_dump(skb, priv, false);
+ return nft_counter_do_dump(skb, priv, reset);
}
static int nft_counter_init(const struct nft_ctx *ctx,
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index a3f01f209a53..c68e2151defe 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -98,7 +98,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
return;
#ifdef CONFIG_NF_CONNTRACK_MARK
case NFT_CT_MARK:
- *dest = ct->mark;
+ *dest = READ_ONCE(ct->mark);
return;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
@@ -297,8 +297,8 @@ static void nft_ct_set_eval(const struct nft_expr *expr,
switch (priv->key) {
#ifdef CONFIG_NF_CONNTRACK_MARK
case NFT_CT_MARK:
- if (ct->mark != value) {
- ct->mark = value;
+ if (READ_ONCE(ct->mark) != value) {
+ WRITE_ONCE(ct->mark, value);
nf_conntrack_event_cache(IPCT_MARK, ct);
}
break;
@@ -641,7 +641,8 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx,
nf_ct_netns_put(ctx->net, ctx->family);
}
-static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_ct_get_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_ct *priv = nft_expr_priv(expr);
@@ -703,7 +704,8 @@ static bool nft_ct_get_reduce(struct nft_regs_track *track,
return nft_expr_reduce_bitwise(track, expr);
}
-static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_ct_set_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_ct *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c
index 63507402716d..e5739a59ebf1 100644
--- a/net/netfilter/nft_dup_netdev.c
+++ b/net/netfilter/nft_dup_netdev.c
@@ -44,7 +44,8 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx,
sizeof(int));
}
-static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_dup_netdev_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_dup_netdev *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 6983e6ddeef9..274579b1696e 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -357,7 +357,8 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx,
nf_tables_destroy_set(ctx, priv->set);
}
-static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_dynset_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_dynset *priv = nft_expr_priv(expr);
u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0;
@@ -379,7 +380,7 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
if (priv->set->num_exprs == 0) {
if (priv->num_exprs == 1) {
if (nft_expr_dump(skb, NFTA_DYNSET_EXPR,
- priv->expr_array[0]))
+ priv->expr_array[0], reset))
goto nla_put_failure;
} else if (priv->num_exprs > 1) {
struct nlattr *nest;
@@ -390,7 +391,7 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
for (i = 0; i < priv->num_exprs; i++) {
if (nft_expr_dump(skb, NFTA_LIST_ELEM,
- priv->expr_array[i]))
+ priv->expr_array[i], reset))
goto nla_put_failure;
}
nla_nest_end(skb, nest);
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index a67ea9c3ae57..a54a7f772cec 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -13,7 +13,6 @@
#include <linux/sctp.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
-#include <net/sctp/sctp.h>
#include <net/tcp.h>
struct nft_exthdr {
@@ -576,7 +575,8 @@ nla_put_failure:
return -1;
}
-static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_exthdr_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_exthdr *priv = nft_expr_priv(expr);
@@ -586,7 +586,8 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
return nft_exthdr_dump_common(skb, priv);
}
-static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_exthdr_dump_set(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_exthdr *priv = nft_expr_priv(expr);
@@ -596,7 +597,8 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
return nft_exthdr_dump_common(skb, priv);
}
-static int nft_exthdr_dump_strip(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_exthdr_dump_strip(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_exthdr *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
index 1f12d7ade606..6e049fd48760 100644
--- a/net/netfilter/nft_fib.c
+++ b/net/netfilter/nft_fib.c
@@ -118,7 +118,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
}
EXPORT_SYMBOL_GPL(nft_fib_init);
-int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr)
+int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset)
{
const struct nft_fib *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index a25c88bc8b75..e860d8fe0e5e 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -433,7 +433,8 @@ static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
nf_ct_netns_put(ctx->net, ctx->family);
}
-static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_flow_offload_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_flow_offload *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 7c5876dc9ff2..7b9d4d1bd17c 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -56,7 +56,8 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
sizeof(int));
}
-static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_fwd_netdev_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_fwd_netdev *priv = nft_expr_priv(expr);
@@ -186,7 +187,8 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
addr_len);
}
-static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_fwd_neigh_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_fwd_neigh *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index e5631e88b285..ee8d487b69c0 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -139,7 +139,7 @@ static int nft_symhash_init(const struct nft_ctx *ctx,
}
static int nft_jhash_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_jhash *priv = nft_expr_priv(expr);
@@ -176,7 +176,7 @@ static bool nft_jhash_reduce(struct nft_regs_track *track,
}
static int nft_symhash_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_symhash *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 5f28b21abc7d..c9d2f7c29f53 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -147,7 +147,8 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx,
}
}
-static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_immediate_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
new file mode 100644
index 000000000000..28e2873ba24e
--- /dev/null
+++ b/net/netfilter/nft_inner.c
@@ -0,0 +1,385 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022 Pablo Neira Ayuso <pablo@netfilter.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/if_vlan.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_meta.h>
+#include <net/netfilter/nf_tables_offload.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <net/gre.h>
+#include <net/geneve.h>
+#include <net/ip.h>
+#include <linux/icmpv6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx);
+
+/* Same layout as nft_expr but it embeds the private expression data area.
+ *
+ * The union reserves room for the private data of the embedded expression
+ * (payload or meta). Code in this file casts a pointer to this structure to
+ * struct nft_expr * before handing it to the payload/meta helpers.
+ */
+struct __nft_expr {
+ const struct nft_expr_ops *ops;
+ union {
+ struct nft_payload payload;
+ struct nft_meta meta;
+ } __attribute__((aligned(__alignof__(u64))));
+};
+
+/* Kind of expression embedded in struct nft_inner (see nft_inner::expr_type). */
+enum {
+ NFT_INNER_EXPR_PAYLOAD,
+ NFT_INNER_EXPR_META,
+};
+
+/* Private data of the inner expression, filled in by nft_inner_init(). */
+struct nft_inner {
+ /* NFT_INNER_* flag bits; validated against NFT_INNER_MASK at init. */
+ u8 flags;
+ /* Tunnel header size added to the offset in nft_inner_parse_tunhdr(). */
+ u8 hdrsize;
+ /* Tunnel type, e.g. NFT_INNER_GENEVE. */
+ u8 type;
+ /* NFT_INNER_EXPR_PAYLOAD or NFT_INNER_EXPR_META. */
+ u8 expr_type;
+
+ /* Embedded payload/meta expression evaluated on the inner headers. */
+ struct __nft_expr expr;
+};
+
+/* Locate the inner link-layer, network and transport headers, starting at
+ * offset @off into the skb, and record the offsets found in @ctx.
+ *
+ * With NFT_INNER_LL set in priv->flags an Ethernet header (plain, or 802.1Q
+ * tagged) is expected at @off; otherwise the IP version field found at @off
+ * selects IPv4 or IPv6.
+ *
+ * Returns 0 on success, -1 if a header cannot be read from the skb or the
+ * protocol found is not handled here.
+ */
+static int nft_inner_parse_l2l3(const struct nft_inner *priv,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *ctx, u32 off)
+{
+ __be16 llproto, outer_llproto;
+ u32 nhoff, thoff;
+
+ if (priv->flags & NFT_INNER_LL) {
+ struct vlan_ethhdr *veth, _veth;
+ struct ethhdr *eth, _eth;
+ u32 hdrsize;
+
+ eth = skb_header_pointer(pkt->skb, off, sizeof(_eth), &_eth);
+ if (!eth)
+ return -1;
+
+ switch (eth->h_proto) {
+ case htons(ETH_P_IP):
+ case htons(ETH_P_IPV6):
+ llproto = eth->h_proto;
+ hdrsize = sizeof(_eth);
+ break;
+ case htons(ETH_P_8021Q):
+ /* Re-read at the same offset with the larger VLAN header. */
+ veth = skb_header_pointer(pkt->skb, off, sizeof(_veth), &_veth);
+ if (!veth)
+ return -1;
+
+ outer_llproto = veth->h_vlan_encapsulated_proto;
+ llproto = veth->h_vlan_proto;
+ hdrsize = sizeof(_veth);
+ break;
+ default:
+ return -1;
+ }
+
+ ctx->inner_lloff = off;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_LL;
+ off += hdrsize;
+ } else {
+ struct iphdr *iph;
+ u32 _version;
+
+ /* Only the version field is inspected, so four bytes suffice. */
+ iph = skb_header_pointer(pkt->skb, off, sizeof(_version), &_version);
+ if (!iph)
+ return -1;
+
+ switch (iph->version) {
+ case 4:
+ llproto = htons(ETH_P_IP);
+ break;
+ case 6:
+ llproto = htons(ETH_P_IPV6);
+ break;
+ default:
+ return -1;
+ }
+ }
+
+ /* For VLAN frames ctx->llproto keeps the 802.1Q ethertype, while the
+ * encapsulated protocol selects the network header parser below.
+ */
+ ctx->llproto = llproto;
+ if (llproto == htons(ETH_P_8021Q))
+ llproto = outer_llproto;
+
+ nhoff = off;
+
+ switch (llproto) {
+ case htons(ETH_P_IP): {
+ struct iphdr *iph, _iph;
+
+ iph = skb_header_pointer(pkt->skb, nhoff, sizeof(_iph), &_iph);
+ if (!iph)
+ return -1;
+
+ if (iph->ihl < 5 || iph->version != 4)
+ return -1;
+
+ ctx->inner_nhoff = nhoff;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH;
+
+ thoff = nhoff + (iph->ihl * 4);
+ /* The transport header is only recorded when the fragment
+ * offset is zero.
+ */
+ if ((ntohs(iph->frag_off) & IP_OFFSET) == 0) {
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
+ ctx->inner_thoff = thoff;
+ ctx->l4proto = iph->protocol;
+ }
+ }
+ break;
+ case htons(ETH_P_IPV6): {
+ struct ipv6hdr *ip6h, _ip6h;
+ int fh_flags = IP6_FH_F_AUTH;
+ unsigned short fragoff;
+ int l4proto;
+
+ ip6h = skb_header_pointer(pkt->skb, nhoff, sizeof(_ip6h), &_ip6h);
+ if (!ip6h)
+ return -1;
+
+ if (ip6h->version != 6)
+ return -1;
+
+ ctx->inner_nhoff = nhoff;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH;
+
+ thoff = nhoff;
+ l4proto = ipv6_find_hdr(pkt->skb, &thoff, -1, &fragoff, &fh_flags);
+ if (l4proto < 0 || thoff > U16_MAX)
+ return -1;
+
+ if (fragoff == 0) {
+ /* NOTE(review): the thoff computed by ipv6_find_hdr()
+ * is overwritten here with the end of the fixed IPv6
+ * header, so extension headers are not skipped -
+ * confirm this is intended.
+ */
+ thoff = nhoff + sizeof(_ip6h);
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
+ ctx->inner_thoff = thoff;
+ ctx->l4proto = l4proto;
+ }
+ }
+ break;
+ default:
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Record the tunnel header offset in @ctx and move *@off past the tunnel
+ * header.
+ *
+ * GRE: the tunnel header is at pkt->thoff and *@off is left untouched.
+ * UDP: the tunnel header is at *@off, which is then advanced by
+ * priv->hdrsize and, for geneve, by the option length as well.
+ *
+ * Returns 0 on success, -1 for any other transport protocol or if the
+ * geneve header cannot be read.
+ */
+static int nft_inner_parse_tunhdr(const struct nft_inner *priv,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *ctx, u32 *off)
+{
+ if (pkt->tprot == IPPROTO_GRE) {
+ ctx->inner_tunoff = pkt->thoff;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN;
+ return 0;
+ }
+
+ if (pkt->tprot != IPPROTO_UDP)
+ return -1;
+
+ ctx->inner_tunoff = *off;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN;
+ *off += priv->hdrsize;
+
+ switch (priv->type) {
+ case NFT_INNER_GENEVE: {
+ struct genevehdr *gnvh, _gnvh;
+
+ gnvh = skb_header_pointer(pkt->skb, pkt->inneroff,
+ sizeof(_gnvh), &_gnvh);
+ if (!gnvh)
+ return -1;
+
+ /* opt_len is scaled by 4 to obtain a byte count. */
+ *off += gnvh->opt_len * 4;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+/* Build a tunnel context for @pkt according to priv->flags and publish it
+ * in @tun_ctx.
+ *
+ * Parsing is done on a zeroed local context which is copied to @tun_ctx only
+ * on success; pkt->flags then gains NFT_PKTINFO_INNER_FULL. On failure
+ * (-1) neither @tun_ctx nor pkt->flags is modified by this function.
+ */
+static int nft_inner_parse(const struct nft_inner *priv,
+ struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ struct nft_inner_tun_ctx ctx = {};
+ u32 off = pkt->inneroff;
+
+ if (priv->flags & NFT_INNER_HDRSIZE &&
+ nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0)
+ return -1;
+
+ if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
+ if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0)
+ return -1;
+ } else if (priv->flags & NFT_INNER_TH) {
+ /* No L2/L3 requested: the transport header starts at @off. */
+ ctx.inner_thoff = off;
+ ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH;
+ }
+
+ *tun_ctx = ctx;
+ tun_ctx->type = priv->type;
+ pkt->flags |= NFT_PKTINFO_INNER_FULL;
+
+ return 0;
+}
+
+/* Tell whether the tunnel context must be (re)built for this packet: either
+ * it has not been fully parsed yet, or the cached context was built for a
+ * different tunnel type than the one this expression asks for.
+ */
+static bool nft_inner_parse_needed(const struct nft_inner *priv,
+ const struct nft_pktinfo *pkt,
+ const struct nft_inner_tun_ctx *tun_ctx)
+{
+ bool parsed = pkt->flags & NFT_PKTINFO_INNER_FULL;
+
+ return !parsed || priv->type != tun_ctx->type;
+}
+
+/* Evaluate the inner expression: compute the inner offset, (re)build the
+ * per-CPU tunnel context when needed, then run the embedded payload or meta
+ * expression against it. Any failure sets the verdict to NFT_BREAK.
+ */
+static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_inner_tun_ctx *tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
+ const struct nft_inner *priv = nft_expr_priv(expr);
+
+ if (nft_payload_inner_offset(pkt) < 0)
+ goto err;
+
+ /* nft_inner_parse() updates pkt->flags, hence the const cast. */
+ if (nft_inner_parse_needed(priv, pkt, tun_ctx) &&
+ nft_inner_parse(priv, (struct nft_pktinfo *)pkt, tun_ctx) < 0)
+ goto err;
+
+ switch (priv->expr_type) {
+ case NFT_INNER_EXPR_PAYLOAD:
+ nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx);
+ break;
+ case NFT_INNER_EXPR_META:
+ nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ goto err;
+ }
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+/* Netlink attribute policy for the NFTA_INNER_* attributes. */
+static const struct nla_policy nft_inner_policy[NFTA_INNER_MAX + 1] = {
+ [NFTA_INNER_NUM] = { .type = NLA_U32 },
+ [NFTA_INNER_FLAGS] = { .type = NLA_U32 },
+ [NFTA_INNER_HDRSIZE] = { .type = NLA_U32 },
+ [NFTA_INNER_TYPE] = { .type = NLA_U32 },
+ [NFTA_INNER_EXPR] = { .type = NLA_NESTED },
+};
+
+/* Filled in by nft_expr_inner_parse(); nft_inner_init() reads the ops and
+ * the attribute table (tb) of the embedded expression from it.
+ */
+struct nft_expr_info {
+ const struct nft_expr_ops *ops;
+ const struct nlattr *attr;
+ struct nlattr *tb[NFT_EXPR_MAXATTR + 1];
+};
+
+/* Initialise the inner expression from its netlink attributes.
+ *
+ * NFTA_INNER_FLAGS, NFTA_INNER_NUM, NFTA_INNER_HDRSIZE, NFTA_INNER_TYPE and
+ * NFTA_INNER_EXPR are all mandatory: each of them is dereferenced below, so
+ * a missing one must be rejected up front.
+ *
+ * Returns 0 on success, -EINVAL for missing attributes, an out of range
+ * type or an embedded expression other than payload/meta, -EOPNOTSUPP for
+ * unknown flags, a non-zero num or an unsupported header size, or the error
+ * returned while parsing/initialising the embedded expression.
+ */
+static int nft_inner_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_inner *priv = nft_expr_priv(expr);
+ u32 flags, hdrsize, type, num;
+ struct nft_expr_info expr_info;
+ int err;
+
+ if (!tb[NFTA_INNER_FLAGS] ||
+ !tb[NFTA_INNER_NUM] ||
+ !tb[NFTA_INNER_HDRSIZE] ||
+ !tb[NFTA_INNER_TYPE] ||
+ !tb[NFTA_INNER_EXPR])
+ return -EINVAL;
+
+ flags = ntohl(nla_get_be32(tb[NFTA_INNER_FLAGS]));
+ if (flags & ~NFT_INNER_MASK)
+ return -EOPNOTSUPP;
+
+ /* Only num 0 is accepted. */
+ num = ntohl(nla_get_be32(tb[NFTA_INNER_NUM]));
+ if (num != 0)
+ return -EOPNOTSUPP;
+
+ hdrsize = ntohl(nla_get_be32(tb[NFTA_INNER_HDRSIZE]));
+ type = ntohl(nla_get_be32(tb[NFTA_INNER_TYPE]));
+
+ /* priv->type is a u8. */
+ if (type > U8_MAX)
+ return -EINVAL;
+
+ if (flags & NFT_INNER_HDRSIZE) {
+ if (hdrsize == 0 || hdrsize > 64)
+ return -EOPNOTSUPP;
+ }
+
+ priv->flags = flags;
+ priv->hdrsize = hdrsize;
+ priv->type = type;
+
+ err = nft_expr_inner_parse(ctx, tb[NFTA_INNER_EXPR], &expr_info);
+ if (err < 0)
+ return err;
+
+ priv->expr.ops = expr_info.ops;
+
+ /* Remember which eval helper nft_inner_eval() has to call. */
+ if (!strcmp(expr_info.ops->type->name, "payload"))
+ priv->expr_type = NFT_INNER_EXPR_PAYLOAD;
+ else if (!strcmp(expr_info.ops->type->name, "meta"))
+ priv->expr_type = NFT_INNER_EXPR_META;
+ else
+ return -EINVAL;
+
+ err = expr_info.ops->init(ctx, (struct nft_expr *)&priv->expr,
+ (const struct nlattr * const*)expr_info.tb);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+/* Dump the inner expression to @skb: num (always 0), type, flags and header
+ * size first, then the embedded expression nested under NFTA_INNER_EXPR.
+ * Returns 0 on success, -1 if any attribute cannot be added.
+ */
+static int nft_inner_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ const struct nft_inner *inner = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_INNER_NUM, htonl(0)))
+ return -1;
+ if (nla_put_be32(skb, NFTA_INNER_TYPE, htonl(inner->type)))
+ return -1;
+ if (nla_put_be32(skb, NFTA_INNER_FLAGS, htonl(inner->flags)))
+ return -1;
+ if (nla_put_be32(skb, NFTA_INNER_HDRSIZE, htonl(inner->hdrsize)))
+ return -1;
+
+ if (nft_expr_dump(skb, NFTA_INNER_EXPR,
+ (struct nft_expr *)&inner->expr, reset) < 0)
+ return -1;
+
+ return 0;
+}
+
+/* Expression operations of the inner expression. */
+static const struct nft_expr_ops nft_inner_ops = {
+ .type = &nft_inner_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_inner)),
+ .eval = nft_inner_eval,
+ .init = nft_inner_init,
+ .dump = nft_inner_dump,
+};
+
+/* Expression type descriptor for "inner"; not static, so it is visible
+ * outside this file.
+ */
+struct nft_expr_type nft_inner_type __read_mostly = {
+ .name = "inner",
+ .ops = &nft_inner_ops,
+ .policy = nft_inner_policy,
+ .maxattr = NFTA_INNER_MAX,
+ .owner = THIS_MODULE,
+};
diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c
index bb15a55dad5c..7f2bda6641bd 100644
--- a/net/netfilter/nft_last.c
+++ b/net/netfilter/nft_last.c
@@ -65,7 +65,8 @@ static void nft_last_eval(const struct nft_expr *expr,
WRITE_ONCE(last->set, 1);
}
-static int nft_last_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_last_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_last_priv *priv = nft_expr_priv(expr);
struct nft_last *last = priv->last;
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index 981addb2d051..145dc62c6247 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -193,7 +193,8 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx,
return 0;
}
-static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_limit_pkts_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
@@ -251,7 +252,7 @@ static int nft_limit_bytes_init(const struct nft_ctx *ctx,
}
static int nft_limit_bytes_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_limit_priv *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 0e13c003f0c1..5defe6e4fd98 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -241,7 +241,8 @@ static void nft_log_destroy(const struct nft_ctx *ctx,
nf_logger_put(ctx->family, li->type);
}
-static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_log_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_log *priv = nft_expr_priv(expr);
const struct nf_loginfo *li = &priv->loginfo;
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index dfae12759c7c..cae5a6724163 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -178,7 +178,8 @@ static void nft_lookup_destroy(const struct nft_ctx *ctx,
nf_tables_destroy_set(ctx, priv->set);
}
-static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_lookup_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_lookup *priv = nft_expr_priv(expr);
u32 flags = priv->invert ? NFT_LOOKUP_F_INV : 0;
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 2a0adc497bbb..e55e455275c4 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -73,7 +73,8 @@ static int nft_masq_init(const struct nft_ctx *ctx,
return nf_ct_netns_get(ctx->net, ctx->family);
}
-static int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_masq_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_masq *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 55d2d49c3425..e384e0de7a54 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -669,7 +669,7 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
EXPORT_SYMBOL_GPL(nft_meta_set_init);
int nft_meta_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_meta *priv = nft_expr_priv(expr);
@@ -684,7 +684,8 @@ nla_put_failure:
}
EXPORT_SYMBOL_GPL(nft_meta_get_dump);
-int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+int nft_meta_set_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_meta *priv = nft_expr_priv(expr);
@@ -831,9 +832,71 @@ nft_meta_select_ops(const struct nft_ctx *ctx,
return ERR_PTR(-EINVAL);
}
+/* Initialise a meta expression embedded in the inner expression.
+ *
+ * NFTA_META_KEY and NFTA_META_DREG are mandatory; both are dereferenced
+ * below, so reject the request when either one is absent. Only the
+ * NFT_META_PROTOCOL and NFT_META_L4PROTO keys are supported here.
+ *
+ * Returns 0 on success, -EINVAL for missing attributes, -EOPNOTSUPP for an
+ * unsupported key, or the error from nft_parse_register_store().
+ */
+static int nft_meta_inner_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_meta *priv = nft_expr_priv(expr);
+ unsigned int len;
+
+ if (!tb[NFTA_META_KEY] || !tb[NFTA_META_DREG])
+ return -EINVAL;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+ switch (priv->key) {
+ case NFT_META_PROTOCOL:
+ len = sizeof(u16);
+ break;
+ case NFT_META_L4PROTO:
+ len = sizeof(u32);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ priv->len = len;
+
+ return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
+}
+
+/* Store inner-header metadata taken from @tun_ctx into the destination
+ * register: the link-layer protocol for NFT_META_PROTOCOL, or the layer 4
+ * protocol for NFT_META_L4PROTO. The latter requires that a transport
+ * header was found (NFT_PAYLOAD_CTX_INNER_TH); otherwise, and for any other
+ * key, the verdict is set to NFT_BREAK.
+ */
+void nft_meta_inner_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+
+ switch (priv->key) {
+ case NFT_META_PROTOCOL:
+ nft_reg_store16(dest, (__force u16)tun_ctx->llproto);
+ break;
+ case NFT_META_L4PROTO:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH))
+ goto err;
+
+ nft_reg_store8(dest, tun_ctx->l4proto);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ goto err;
+ }
+ return;
+
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+EXPORT_SYMBOL_GPL(nft_meta_inner_eval);
+
+/* Ops used when meta is embedded in the inner expression; referenced from
+ * nft_meta_type.inner_ops. No .eval callback is set here.
+ */
+static const struct nft_expr_ops nft_meta_inner_ops = {
+ .type = &nft_meta_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+ .init = nft_meta_inner_init,
+ .dump = nft_meta_get_dump,
+ /* direct call to nft_meta_inner_eval(). */
+};
+
struct nft_expr_type nft_meta_type __read_mostly = {
.name = "meta",
.select_ops = nft_meta_select_ops,
+ .inner_ops = &nft_meta_inner_ops,
.policy = nft_meta_policy,
.maxattr = NFTA_META_MAX,
.owner = THIS_MODULE,
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index e5fd6995e4bf..047999150390 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -255,7 +255,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return nf_ct_netns_get(ctx->net, family);
}
-static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_nat_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_nat *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 45d3dc9e96f2..7d29db7c2ac0 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -112,7 +112,8 @@ nla_put_failure:
return -1;
}
-static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_ng_inc_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_ng_inc *priv = nft_expr_priv(expr);
@@ -168,7 +169,8 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
NULL, NFT_DATA_VALUE, sizeof(u32));
}
-static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_ng_random_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_ng_random *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index 5d8d91b3904d..7b01aa2ef653 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -47,7 +47,8 @@ static int nft_objref_init(const struct nft_ctx *ctx,
return 0;
}
-static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_objref_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_object *obj = nft_objref_priv(expr);
@@ -82,7 +83,6 @@ static void nft_objref_activate(const struct nft_ctx *ctx,
obj->use++;
}
-static struct nft_expr_type nft_objref_type;
static const struct nft_expr_ops nft_objref_ops = {
.type = &nft_objref_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_object *)),
@@ -156,7 +156,8 @@ static int nft_objref_map_init(const struct nft_ctx *ctx,
return 0;
}
-static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_objref_map_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_objref_map *priv = nft_expr_priv(expr);
@@ -195,7 +196,6 @@ static void nft_objref_map_destroy(const struct nft_ctx *ctx,
nf_tables_destroy_set(ctx, priv->set);
}
-static struct nft_expr_type nft_objref_type;
static const struct nft_expr_ops nft_objref_map_ops = {
.type = &nft_objref_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)),
@@ -233,28 +233,10 @@ static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = {
[NFTA_OBJREF_SET_ID] = { .type = NLA_U32 },
};
-static struct nft_expr_type nft_objref_type __read_mostly = {
+struct nft_expr_type nft_objref_type __read_mostly = {
.name = "objref",
.select_ops = nft_objref_select_ops,
.policy = nft_objref_policy,
.maxattr = NFTA_OBJREF_MAX,
.owner = THIS_MODULE,
};
-
-static int __init nft_objref_module_init(void)
-{
- return nft_register_expr(&nft_objref_type);
-}
-
-static void __exit nft_objref_module_exit(void)
-{
- nft_unregister_expr(&nft_objref_type);
-}
-
-module_init(nft_objref_module_init);
-module_exit(nft_objref_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_ALIAS_NFT_EXPR("objref");
-MODULE_DESCRIPTION("nftables stateful object reference module");
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index adacf95b6e2b..70820c66b591 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -92,7 +92,8 @@ static int nft_osf_init(const struct nft_ctx *ctx,
return 0;
}
-static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_osf_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_osf *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 088244f9d838..17b418a5a593 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -19,6 +19,7 @@
/* For layer 4 checksum field offset. */
#include <linux/tcp.h>
#include <linux/udp.h>
+#include <net/gre.h>
#include <linux/icmpv6.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
@@ -100,6 +101,41 @@ static int __nft_payload_inner_offset(struct nft_pktinfo *pkt)
pkt->inneroff = thoff + __tcp_hdrlen(th);
}
break;
+ case IPPROTO_GRE: {
+ u32 offset = sizeof(struct gre_base_hdr);
+ struct gre_base_hdr *gre, _gre;
+ __be16 version;
+
+ gre = skb_header_pointer(pkt->skb, thoff, sizeof(_gre), &_gre);
+ if (!gre)
+ return -1;
+
+ version = gre->flags & GRE_VERSION;
+ switch (version) {
+ case GRE_VERSION_0:
+ if (gre->flags & GRE_ROUTING)
+ return -1;
+
+ if (gre->flags & GRE_CSUM) {
+ offset += sizeof_field(struct gre_full_hdr, csum) +
+ sizeof_field(struct gre_full_hdr, reserved1);
+ }
+ if (gre->flags & GRE_KEY)
+ offset += sizeof_field(struct gre_full_hdr, key);
+
+ if (gre->flags & GRE_SEQ)
+ offset += sizeof_field(struct gre_full_hdr, seq);
+ break;
+ default:
+ return -1;
+ }
+
+ pkt->inneroff = thoff + offset;
+ }
+ break;
+ case IPPROTO_IPIP:
+ pkt->inneroff = thoff;
+ break;
default:
return -1;
}
@@ -109,7 +145,7 @@ static int __nft_payload_inner_offset(struct nft_pktinfo *pkt)
return 0;
}
-static int nft_payload_inner_offset(const struct nft_pktinfo *pkt)
+int nft_payload_inner_offset(const struct nft_pktinfo *pkt)
{
if (!(pkt->flags & NFT_PKTINFO_INNER) &&
__nft_payload_inner_offset((struct nft_pktinfo *)pkt) < 0)
@@ -173,10 +209,10 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = {
[NFTA_PAYLOAD_SREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_DREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_BASE] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX_BE(NLA_U32, 255),
- [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX_BE(NLA_U32, 255),
+ [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
+ [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX_BE(NLA_U32, 255),
+ [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_PAYLOAD_CSUM_FLAGS] = { .type = NLA_U32 },
};
@@ -195,7 +231,8 @@ static int nft_payload_init(const struct nft_ctx *ctx,
priv->len);
}
-static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_payload_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_payload *priv = nft_expr_priv(expr);
@@ -552,6 +589,92 @@ const struct nft_expr_ops nft_payload_fast_ops = {
.offload = nft_payload_offload,
};
+/* Copy priv->len bytes of inner packet data into the destination register.
+ *
+ * The copy starts at the offset recorded in @tun_ctx for the header selected
+ * by priv->base, plus priv->offset. If that header was not found while
+ * building @tun_ctx, or the copy fails, the verdict is set to NFT_BREAK.
+ */
+void nft_payload_inner_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+ const struct sk_buff *skb = pkt->skb;
+ u32 *dest = &regs->data[priv->dreg];
+ int offset;
+
+ /* Zero the last register word first when len is not a multiple of the
+ * register size, so the bytes past the copied data are zero.
+ */
+ if (priv->len % NFT_REG32_SIZE)
+ dest[priv->len / NFT_REG32_SIZE] = 0;
+
+ switch (priv->base) {
+ case NFT_PAYLOAD_TUN_HEADER:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TUN))
+ goto err;
+
+ offset = tun_ctx->inner_tunoff;
+ break;
+ case NFT_PAYLOAD_LL_HEADER:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_LL))
+ goto err;
+
+ offset = tun_ctx->inner_lloff;
+ break;
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_NH))
+ goto err;
+
+ offset = tun_ctx->inner_nhoff;
+ break;
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH))
+ goto err;
+
+ offset = tun_ctx->inner_thoff;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ goto err;
+ }
+ offset += priv->offset;
+
+ if (skb_copy_bits(skb, offset, dest, priv->len) < 0)
+ goto err;
+
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+/* Initialise a payload expression embedded in the inner expression.
+ *
+ * NFTA_PAYLOAD_BASE, NFTA_PAYLOAD_OFFSET, NFTA_PAYLOAD_LEN and
+ * NFTA_PAYLOAD_DREG are mandatory; each is dereferenced below, so reject
+ * the request when any of them is absent. Only the tunnel, link-layer,
+ * network and transport header bases are supported here.
+ *
+ * Returns 0 on success, -EINVAL for missing attributes, -EOPNOTSUPP for an
+ * unsupported base, or the error from nft_parse_register_store().
+ */
+static int nft_payload_inner_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_payload *priv = nft_expr_priv(expr);
+ u32 base;
+
+ if (!tb[NFTA_PAYLOAD_BASE] || !tb[NFTA_PAYLOAD_OFFSET] ||
+ !tb[NFTA_PAYLOAD_LEN] || !tb[NFTA_PAYLOAD_DREG])
+ return -EINVAL;
+
+ base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
+ switch (base) {
+ case NFT_PAYLOAD_TUN_HEADER:
+ case NFT_PAYLOAD_LL_HEADER:
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->base = base;
+ priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
+ priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+
+ return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
+}
+
+/* Ops used when payload is embedded in the inner expression; referenced
+ * from nft_payload_type.inner_ops. No .eval callback is set here.
+ */
+static const struct nft_expr_ops nft_payload_inner_ops = {
+ .type = &nft_payload_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)),
+ .init = nft_payload_inner_init,
+ .dump = nft_payload_dump,
+ /* direct call to nft_payload_inner_eval(). */
+};
+
static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum)
{
*sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum));
@@ -665,6 +788,16 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src,
return 0;
}
+/* Private data of the payload set expression (see nft_payload_set_dump()). */
+struct nft_payload_set {
+ enum nft_payload_bases base:8;
+ u8 offset;
+ u8 len;
+ u8 sreg;
+ u8 csum_type;
+ u8 csum_offset;
+ u8 csum_flags;
+};
+
static void nft_payload_set_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -787,7 +920,8 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
priv->len);
}
-static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_payload_set_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_payload_set *priv = nft_expr_priv(expr);
@@ -885,6 +1019,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
struct nft_expr_type nft_payload_type __read_mostly = {
.name = "payload",
.select_ops = nft_payload_select_ops,
+ .inner_ops = &nft_payload_inner_ops,
.policy = nft_payload_policy,
.maxattr = NFTA_PAYLOAD_MAX,
.owner = THIS_MODULE,
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index da29e92c03e2..b2b8127c8d43 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -152,7 +152,8 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx,
return 0;
}
-static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_queue_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_queue *priv = nft_expr_priv(expr);
@@ -168,7 +169,8 @@ nla_put_failure:
}
static int
-nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr)
+nft_queue_sreg_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_queue *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index e6b0df68feea..123578e28917 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -217,11 +217,12 @@ static int nft_quota_init(const struct nft_ctx *ctx,
return nft_quota_do_init(tb, priv);
}
-static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_quota_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_quota *priv = nft_expr_priv(expr);
- return nft_quota_do_dump(skb, priv, false);
+ return nft_quota_do_dump(skb, priv, reset);
}
static void nft_quota_destroy(const struct nft_ctx *ctx,
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
index 832f0d725a9e..0566d6aaf1e5 100644
--- a/net/netfilter/nft_range.c
+++ b/net/netfilter/nft_range.c
@@ -111,7 +111,8 @@ err1:
return err;
}
-static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_range_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_range_expr *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index 5086adfe731c..5f7739987559 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -75,7 +75,8 @@ static int nft_redir_init(const struct nft_ctx *ctx,
return nf_ct_netns_get(ctx->net, ctx->family);
}
-static int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_redir_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_redir *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index 927ff8459bd9..f2addc844dd2 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -69,7 +69,8 @@ int nft_reject_init(const struct nft_ctx *ctx,
}
EXPORT_SYMBOL_GPL(nft_reject_init);
-int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
+int nft_reject_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_reject *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index 71931ec91721..5990fdd7b3cc 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -146,7 +146,7 @@ static int nft_rt_get_init(const struct nft_ctx *ctx,
}
static int nft_rt_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_rt *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 49a5348a6a14..85f8df87efda 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -199,7 +199,7 @@ static int nft_socket_init(const struct nft_ctx *ctx,
}
static int nft_socket_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_socket *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c
index 6cf9a04fbfe2..13da882669a4 100644
--- a/net/netfilter/nft_synproxy.c
+++ b/net/netfilter/nft_synproxy.c
@@ -272,7 +272,8 @@ static void nft_synproxy_destroy(const struct nft_ctx *ctx,
nft_synproxy_do_destroy(ctx);
}
-static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_synproxy_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_synproxy *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
index 62da25ad264b..ea83f661417e 100644
--- a/net/netfilter/nft_tproxy.c
+++ b/net/netfilter/nft_tproxy.c
@@ -294,7 +294,7 @@ static void nft_tproxy_destroy(const struct nft_ctx *ctx,
}
static int nft_tproxy_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_tproxy *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 983ade4be3b3..b059aa541798 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -108,7 +108,7 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx,
}
static int nft_tunnel_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_tunnel *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
index 1c5343c936a8..c88fd078a9ae 100644
--- a/net/netfilter/nft_xfrm.c
+++ b/net/netfilter/nft_xfrm.c
@@ -212,7 +212,7 @@ static void nft_xfrm_get_eval(const struct nft_expr *expr,
}
static int nft_xfrm_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_xfrm *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index e5ebc0810675..ad3c033db64e 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -30,6 +30,7 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info)
u_int32_t new_targetmark;
struct nf_conn *ct;
u_int32_t newmark;
+ u_int32_t oldmark;
ct = nf_ct_get(skb, &ctinfo);
if (ct == NULL)
@@ -37,14 +38,15 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info)
switch (info->mode) {
case XT_CONNMARK_SET:
- newmark = (ct->mark & ~info->ctmask) ^ info->ctmark;
+ oldmark = READ_ONCE(ct->mark);
+ newmark = (oldmark & ~info->ctmask) ^ info->ctmark;
if (info->shift_dir == D_SHIFT_RIGHT)
newmark >>= info->shift_bits;
else
newmark <<= info->shift_bits;
- if (ct->mark != newmark) {
- ct->mark = newmark;
+ if (READ_ONCE(ct->mark) != newmark) {
+ WRITE_ONCE(ct->mark, newmark);
nf_conntrack_event_cache(IPCT_MARK, ct);
}
break;
@@ -55,15 +57,15 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info)
else
new_targetmark <<= info->shift_bits;
- newmark = (ct->mark & ~info->ctmask) ^
+ newmark = (READ_ONCE(ct->mark) & ~info->ctmask) ^
new_targetmark;
- if (ct->mark != newmark) {
- ct->mark = newmark;
+ if (READ_ONCE(ct->mark) != newmark) {
+ WRITE_ONCE(ct->mark, newmark);
nf_conntrack_event_cache(IPCT_MARK, ct);
}
break;
case XT_CONNMARK_RESTORE:
- new_targetmark = (ct->mark & info->ctmask);
+ new_targetmark = (READ_ONCE(ct->mark) & info->ctmask);
if (info->shift_dir == D_SHIFT_RIGHT)
new_targetmark >>= info->shift_bits;
else
@@ -126,7 +128,7 @@ connmark_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (ct == NULL)
return false;
- return ((ct->mark & info->mask) == info->mark) ^ info->invert;
+ return ((READ_ONCE(ct->mark) & info->mask) == info->mark) ^ info->invert;
}
static int connmark_mt_check(const struct xt_mtchk_param *par)
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index 680015ba7cb6..e8961094a282 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -4,7 +4,6 @@
#include <linux/skbuff.h>
#include <net/ip.h>
#include <net/ipv6.h>
-#include <net/sctp/sctp.h>
#include <linux/sctp.h>
#include <linux/netfilter/x_tables.h>
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index 203e24ae472c..b26c1dcfc27b 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -34,7 +34,7 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)
switch (info->mode) {
case XT_STATISTIC_MODE_RANDOM:
- if ((prandom_u32() & 0x7FFFFFFF) < info->u.random.probability)
+ if ((get_random_u32() & 0x7FFFFFFF) < info->u.random.probability)
ret = !ret;
break;
case XT_STATISTIC_MODE_NTH:
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index a662e8a5ff84..d73091f6bb0f 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -812,6 +812,17 @@ static int netlink_release(struct socket *sock)
}
sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
+
+ /* Because struct net might disappear soon, do not keep a pointer. */
+ if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) {
+ __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
+ /* Because of deferred_put_nlk_sk and use of work queue,
+ * it is possible netns will be freed before this socket.
+ */
+ sock_net_set(sk, &init_net);
+ __netns_tracker_alloc(&init_net, &sk->ns_tracker,
+ false, GFP_KERNEL);
+ }
call_rcu(&nlk->rcu, deferred_put_nlk_sk);
return 0;
}
@@ -2488,19 +2499,24 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
flags |= NLM_F_ACK_TLVS;
skb = nlmsg_new(payload + tlvlen, GFP_KERNEL);
- if (!skb) {
- NETLINK_CB(in_skb).sk->sk_err = ENOBUFS;
- sk_error_report(NETLINK_CB(in_skb).sk);
- return;
- }
+ if (!skb)
+ goto err_skb;
rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
- NLMSG_ERROR, payload, flags);
+ NLMSG_ERROR, sizeof(*errmsg), flags);
+ if (!rep)
+ goto err_bad_put;
errmsg = nlmsg_data(rep);
errmsg->error = err;
- unsafe_memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg)
- ? nlh->nlmsg_len : sizeof(*nlh),
- /* Bounds checked by the skb layer. */);
+ errmsg->msg = *nlh;
+
+ if (!(flags & NLM_F_CAPPED)) {
+ if (!nlmsg_append(skb, nlmsg_len(nlh)))
+ goto err_bad_put;
+
+ memcpy(nlmsg_data(&errmsg->msg), nlmsg_data(nlh),
+ nlmsg_len(nlh));
+ }
if (tlvlen)
netlink_ack_tlv_fill(in_skb, skb, nlh, err, extack);
@@ -2508,6 +2524,14 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
nlmsg_end(skb, rep);
nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid);
+
+ return;
+
+err_bad_put:
+ nlmsg_free(skb);
+err_skb:
+ NETLINK_CB(in_skb).sk->sk_err = ENOBUFS;
+ sk_error_report(NETLINK_CB(in_skb).sk);
}
EXPORT_SYMBOL(netlink_ack);
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 39b7c00e4cef..600993c80050 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -78,10 +78,40 @@ static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) |
static unsigned long *mc_groups = &mc_group_start;
static unsigned long mc_groups_longs = 1;
+/* We need the last attribute with non-zero ID therefore a 2-entry array */
+static struct nla_policy genl_policy_reject_all[] = {
+ { .type = NLA_REJECT },
+ { .type = NLA_REJECT },
+};
+
static int genl_ctrl_event(int event, const struct genl_family *family,
const struct genl_multicast_group *grp,
int grp_id);
+static void
+genl_op_fill_in_reject_policy(const struct genl_family *family,
+ struct genl_ops *op)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(genl_policy_reject_all) - 1 != 1);
+
+ if (op->policy || op->cmd < family->resv_start_op)
+ return;
+
+ op->policy = genl_policy_reject_all;
+ op->maxattr = 1;
+}
+
+static void
+genl_op_fill_in_reject_policy_split(const struct genl_family *family,
+ struct genl_split_ops *op)
+{
+ if (op->policy)
+ return;
+
+ op->policy = genl_policy_reject_all;
+ op->maxattr = 1;
+}
+
static const struct genl_family *genl_family_find_byid(unsigned int id)
{
return idr_find(&genl_fam_idr, id);
@@ -99,10 +129,15 @@ static const struct genl_family *genl_family_find_byname(char *name)
return NULL;
}
-static int genl_get_cmd_cnt(const struct genl_family *family)
-{
- return family->n_ops + family->n_small_ops;
-}
+struct genl_op_iter {
+ const struct genl_family *family;
+ struct genl_split_ops doit;
+ struct genl_split_ops dumpit;
+ int cmd_idx;
+ int entry_idx;
+ u32 cmd;
+ u8 flags;
+};
static void genl_op_from_full(const struct genl_family *family,
unsigned int i, struct genl_ops *op)
@@ -113,6 +148,8 @@ static void genl_op_from_full(const struct genl_family *family,
op->maxattr = family->maxattr;
if (!op->policy)
op->policy = family->policy;
+
+ genl_op_fill_in_reject_policy(family, op);
}
static int genl_get_cmd_full(u32 cmd, const struct genl_family *family,
@@ -142,6 +179,8 @@ static void genl_op_from_small(const struct genl_family *family,
op->maxattr = family->maxattr;
op->policy = family->policy;
+
+ genl_op_fill_in_reject_policy(family, op);
}
static int genl_get_cmd_small(u32 cmd, const struct genl_family *family,
@@ -158,24 +197,187 @@ static int genl_get_cmd_small(u32 cmd, const struct genl_family *family,
return -ENOENT;
}
-static int genl_get_cmd(u32 cmd, const struct genl_family *family,
- struct genl_ops *op)
+static void genl_op_from_split(struct genl_op_iter *iter)
{
- if (!genl_get_cmd_full(cmd, family, op))
- return 0;
- return genl_get_cmd_small(cmd, family, op);
+ const struct genl_family *family = iter->family;
+ int i, cnt = 0;
+
+ i = iter->entry_idx - family->n_ops - family->n_small_ops;
+
+ if (family->split_ops[i + cnt].flags & GENL_CMD_CAP_DO) {
+ iter->doit = family->split_ops[i + cnt];
+ genl_op_fill_in_reject_policy_split(family, &iter->doit);
+ cnt++;
+ } else {
+ memset(&iter->doit, 0, sizeof(iter->doit));
+ }
+
+ if (i + cnt < family->n_split_ops &&
+ family->split_ops[i + cnt].flags & GENL_CMD_CAP_DUMP) {
+ iter->dumpit = family->split_ops[i + cnt];
+ genl_op_fill_in_reject_policy_split(family, &iter->dumpit);
+ cnt++;
+ } else {
+ memset(&iter->dumpit, 0, sizeof(iter->dumpit));
+ }
+
+ WARN_ON(!cnt);
+ iter->entry_idx += cnt;
}
-static void genl_get_cmd_by_index(unsigned int i,
- const struct genl_family *family,
- struct genl_ops *op)
+static int
+genl_get_cmd_split(u32 cmd, u8 flag, const struct genl_family *family,
+ struct genl_split_ops *op)
{
- if (i < family->n_ops)
- genl_op_from_full(family, i, op);
- else if (i < family->n_ops + family->n_small_ops)
- genl_op_from_small(family, i - family->n_ops, op);
- else
- WARN_ON_ONCE(1);
+ int i;
+
+ for (i = 0; i < family->n_split_ops; i++)
+ if (family->split_ops[i].cmd == cmd &&
+ family->split_ops[i].flags & flag) {
+ *op = family->split_ops[i];
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static int
+genl_cmd_full_to_split(struct genl_split_ops *op,
+ const struct genl_family *family,
+ const struct genl_ops *full, u8 flags)
+{
+ if ((flags & GENL_CMD_CAP_DO && !full->doit) ||
+ (flags & GENL_CMD_CAP_DUMP && !full->dumpit)) {
+ memset(op, 0, sizeof(*op));
+ return -ENOENT;
+ }
+
+ if (flags & GENL_CMD_CAP_DUMP) {
+ op->start = full->start;
+ op->dumpit = full->dumpit;
+ op->done = full->done;
+ } else {
+ op->pre_doit = family->pre_doit;
+ op->doit = full->doit;
+ op->post_doit = family->post_doit;
+ }
+
+ if (flags & GENL_CMD_CAP_DUMP &&
+ full->validate & GENL_DONT_VALIDATE_DUMP) {
+ op->policy = NULL;
+ op->maxattr = 0;
+ } else {
+ op->policy = full->policy;
+ op->maxattr = full->maxattr;
+ }
+
+ op->cmd = full->cmd;
+ op->internal_flags = full->internal_flags;
+ op->flags = full->flags;
+ op->validate = full->validate;
+
+ /* Make sure flags include the GENL_CMD_CAP_DO / GENL_CMD_CAP_DUMP */
+ op->flags |= flags;
+
+ return 0;
+}
+
+/* Must make sure that op is initialized to 0 on failure */
+static int
+genl_get_cmd(u32 cmd, u8 flags, const struct genl_family *family,
+ struct genl_split_ops *op)
+{
+ struct genl_ops full;
+ int err;
+
+ err = genl_get_cmd_full(cmd, family, &full);
+ if (err == -ENOENT)
+ err = genl_get_cmd_small(cmd, family, &full);
+ /* Found one of legacy forms */
+ if (err == 0)
+ return genl_cmd_full_to_split(op, family, &full, flags);
+
+ err = genl_get_cmd_split(cmd, flags, family, op);
+ if (err)
+ memset(op, 0, sizeof(*op));
+ return err;
+}
+
+/* For policy dumping only, get ops of both do and dump.
+ * Fail if both are missing, genl_get_cmd() will zero-init in case of failure.
+ */
+static int
+genl_get_cmd_both(u32 cmd, const struct genl_family *family,
+ struct genl_split_ops *doit, struct genl_split_ops *dumpit)
+{
+ int err1, err2;
+
+ err1 = genl_get_cmd(cmd, GENL_CMD_CAP_DO, family, doit);
+ err2 = genl_get_cmd(cmd, GENL_CMD_CAP_DUMP, family, dumpit);
+
+ return err1 && err2 ? -ENOENT : 0;
+}
+
+static bool
+genl_op_iter_init(const struct genl_family *family, struct genl_op_iter *iter)
+{
+ iter->family = family;
+ iter->cmd_idx = 0;
+ iter->entry_idx = 0;
+
+ iter->flags = 0;
+
+ return iter->family->n_ops +
+ iter->family->n_small_ops +
+ iter->family->n_split_ops;
+}
+
+static bool genl_op_iter_next(struct genl_op_iter *iter)
+{
+ const struct genl_family *family = iter->family;
+ bool legacy_op = true;
+ struct genl_ops op;
+
+ if (iter->entry_idx < family->n_ops) {
+ genl_op_from_full(family, iter->entry_idx, &op);
+ } else if (iter->entry_idx < family->n_ops + family->n_small_ops) {
+ genl_op_from_small(family, iter->entry_idx - family->n_ops,
+ &op);
+ } else if (iter->entry_idx <
+ family->n_ops + family->n_small_ops + family->n_split_ops) {
+ legacy_op = false;
+ /* updates entry_idx */
+ genl_op_from_split(iter);
+ } else {
+ return false;
+ }
+
+ iter->cmd_idx++;
+
+ if (legacy_op) {
+ iter->entry_idx++;
+
+ genl_cmd_full_to_split(&iter->doit, family,
+ &op, GENL_CMD_CAP_DO);
+ genl_cmd_full_to_split(&iter->dumpit, family,
+ &op, GENL_CMD_CAP_DUMP);
+ }
+
+ iter->cmd = iter->doit.cmd | iter->dumpit.cmd;
+ iter->flags = iter->doit.flags | iter->dumpit.flags;
+
+ return true;
+}
+
+static void
+genl_op_iter_copy(struct genl_op_iter *dst, struct genl_op_iter *src)
+{
+ *dst = *src;
+}
+
+static unsigned int genl_op_iter_idx(struct genl_op_iter *iter)
+{
+ return iter->cmd_idx;
}
static int genl_allocate_reserve_groups(int n_groups, int *first_id)
@@ -343,29 +545,72 @@ static void genl_unregister_mc_groups(const struct genl_family *family)
}
}
+static bool genl_split_op_check(const struct genl_split_ops *op)
+{
+ if (WARN_ON(hweight8(op->flags & (GENL_CMD_CAP_DO |
+ GENL_CMD_CAP_DUMP)) != 1))
+ return true;
+ return false;
+}
+
static int genl_validate_ops(const struct genl_family *family)
{
- int i, j;
+ struct genl_op_iter i, j;
+ unsigned int s;
if (WARN_ON(family->n_ops && !family->ops) ||
- WARN_ON(family->n_small_ops && !family->small_ops))
+ WARN_ON(family->n_small_ops && !family->small_ops) ||
+ WARN_ON(family->n_split_ops && !family->split_ops))
return -EINVAL;
- for (i = 0; i < genl_get_cmd_cnt(family); i++) {
- struct genl_ops op;
+ for (genl_op_iter_init(family, &i); genl_op_iter_next(&i); ) {
+ if (!(i.flags & (GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP)))
+ return -EINVAL;
- genl_get_cmd_by_index(i, family, &op);
- if (op.dumpit == NULL && op.doit == NULL)
+ if (WARN_ON(i.cmd >= family->resv_start_op &&
+ (i.doit.validate || i.dumpit.validate)))
return -EINVAL;
- for (j = i + 1; j < genl_get_cmd_cnt(family); j++) {
- struct genl_ops op2;
- genl_get_cmd_by_index(j, family, &op2);
- if (op.cmd == op2.cmd)
+ genl_op_iter_copy(&j, &i);
+ while (genl_op_iter_next(&j)) {
+ if (i.cmd == j.cmd)
return -EINVAL;
}
}
+ if (family->n_split_ops) {
+ if (genl_split_op_check(&family->split_ops[0]))
+ return -EINVAL;
+ }
+
+ for (s = 1; s < family->n_split_ops; s++) {
+ const struct genl_split_ops *a, *b;
+
+ a = &family->split_ops[s - 1];
+ b = &family->split_ops[s];
+
+ if (genl_split_op_check(b))
+ return -EINVAL;
+
+ /* Check sort order */
+ if (a->cmd < b->cmd)
+ continue;
+
+ if (a->internal_flags != b->internal_flags ||
+ ((a->flags ^ b->flags) & ~(GENL_CMD_CAP_DO |
+ GENL_CMD_CAP_DUMP))) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ if ((a->flags & GENL_CMD_CAP_DO) &&
+ (b->flags & GENL_CMD_CAP_DUMP))
+ continue;
+
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
return 0;
}
@@ -519,7 +764,7 @@ static struct nlattr **
genl_family_rcv_msg_attrs_parse(const struct genl_family *family,
struct nlmsghdr *nlh,
struct netlink_ext_ack *extack,
- const struct genl_ops *ops,
+ const struct genl_split_ops *ops,
int hdrlen,
enum genl_validate_flags no_strict_flag)
{
@@ -555,22 +800,21 @@ struct genl_start_context {
const struct genl_family *family;
struct nlmsghdr *nlh;
struct netlink_ext_ack *extack;
- const struct genl_ops *ops;
+ const struct genl_split_ops *ops;
int hdrlen;
};
static int genl_start(struct netlink_callback *cb)
{
struct genl_start_context *ctx = cb->data;
- const struct genl_ops *ops = ctx->ops;
+ const struct genl_split_ops *ops;
struct genl_dumpit_info *info;
struct nlattr **attrs = NULL;
int rc = 0;
- if (ops->validate & GENL_DONT_VALIDATE_DUMP)
- goto no_attrs;
-
- if (ctx->nlh->nlmsg_len < nlmsg_msg_size(ctx->hdrlen))
+ ops = ctx->ops;
+ if (!(ops->validate & GENL_DONT_VALIDATE_DUMP) &&
+ ctx->nlh->nlmsg_len < nlmsg_msg_size(ctx->hdrlen))
return -EINVAL;
attrs = genl_family_rcv_msg_attrs_parse(ctx->family, ctx->nlh, ctx->extack,
@@ -579,7 +823,6 @@ static int genl_start(struct netlink_callback *cb)
if (IS_ERR(attrs))
return PTR_ERR(attrs);
-no_attrs:
info = genl_dumpit_info_alloc();
if (!info) {
genl_family_rcv_msg_attrs_free(attrs);
@@ -608,7 +851,7 @@ no_attrs:
static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
- const struct genl_ops *ops = &genl_dumpit_info(cb)->op;
+ const struct genl_split_ops *ops = &genl_dumpit_info(cb)->op;
int rc;
genl_lock();
@@ -620,7 +863,7 @@ static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
static int genl_lock_done(struct netlink_callback *cb)
{
const struct genl_dumpit_info *info = genl_dumpit_info(cb);
- const struct genl_ops *ops = &info->op;
+ const struct genl_split_ops *ops = &info->op;
int rc = 0;
if (ops->done) {
@@ -636,7 +879,7 @@ static int genl_lock_done(struct netlink_callback *cb)
static int genl_parallel_done(struct netlink_callback *cb)
{
const struct genl_dumpit_info *info = genl_dumpit_info(cb);
- const struct genl_ops *ops = &info->op;
+ const struct genl_split_ops *ops = &info->op;
int rc = 0;
if (ops->done)
@@ -650,15 +893,12 @@ static int genl_family_rcv_msg_dumpit(const struct genl_family *family,
struct sk_buff *skb,
struct nlmsghdr *nlh,
struct netlink_ext_ack *extack,
- const struct genl_ops *ops,
+ const struct genl_split_ops *ops,
int hdrlen, struct net *net)
{
struct genl_start_context ctx;
int err;
- if (!ops->dumpit)
- return -EOPNOTSUPP;
-
ctx.family = family;
ctx.nlh = nlh;
ctx.extack = extack;
@@ -696,16 +936,13 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family,
struct sk_buff *skb,
struct nlmsghdr *nlh,
struct netlink_ext_ack *extack,
- const struct genl_ops *ops,
+ const struct genl_split_ops *ops,
int hdrlen, struct net *net)
{
struct nlattr **attrbuf;
struct genl_info info;
int err;
- if (!ops->doit)
- return -EOPNOTSUPP;
-
attrbuf = genl_family_rcv_msg_attrs_parse(family, nlh, extack,
ops, hdrlen,
GENL_DONT_VALIDATE_STRICT);
@@ -722,16 +959,16 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family,
genl_info_net_set(&info, net);
memset(&info.user_ptr, 0, sizeof(info.user_ptr));
- if (family->pre_doit) {
- err = family->pre_doit(ops, skb, &info);
+ if (ops->pre_doit) {
+ err = ops->pre_doit(ops, skb, &info);
if (err)
goto out;
}
err = ops->doit(skb, &info);
- if (family->post_doit)
- family->post_doit(ops, skb, &info);
+ if (ops->post_doit)
+ ops->post_doit(ops, skb, &info);
out:
genl_family_rcv_msg_attrs_free(attrbuf);
@@ -776,8 +1013,9 @@ static int genl_family_rcv_msg(const struct genl_family *family,
{
struct net *net = sock_net(skb->sk);
struct genlmsghdr *hdr = nlmsg_data(nlh);
- struct genl_ops op;
+ struct genl_split_ops op;
int hdrlen;
+ u8 flags;
/* this family doesn't exist in this netns */
if (!family->netnsok && !net_eq(net, &init_net))
@@ -790,7 +1028,9 @@ static int genl_family_rcv_msg(const struct genl_family *family,
if (genl_header_check(family, nlh, hdr, extack))
return -EINVAL;
- if (genl_get_cmd(hdr->cmd, family, &op))
+ flags = (nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP ?
+ GENL_CMD_CAP_DUMP : GENL_CMD_CAP_DO;
+ if (genl_get_cmd(hdr->cmd, flags, family, &op))
return -EOPNOTSUPP;
if ((op.flags & GENL_ADMIN_PERM) &&
@@ -801,7 +1041,7 @@ static int genl_family_rcv_msg(const struct genl_family *family,
!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP)
+ if (flags & GENL_CMD_CAP_DUMP)
return genl_family_rcv_msg_dumpit(family, skb, nlh, extack,
&op, hdrlen, net);
else
@@ -846,6 +1086,7 @@ static struct genl_family genl_ctrl;
static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq,
u32 flags, struct sk_buff *skb, u8 cmd)
{
+ struct genl_op_iter i;
void *hdr;
hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd);
@@ -859,33 +1100,26 @@ static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq,
nla_put_u32(skb, CTRL_ATTR_MAXATTR, family->maxattr))
goto nla_put_failure;
- if (genl_get_cmd_cnt(family)) {
+ if (genl_op_iter_init(family, &i)) {
struct nlattr *nla_ops;
- int i;
nla_ops = nla_nest_start_noflag(skb, CTRL_ATTR_OPS);
if (nla_ops == NULL)
goto nla_put_failure;
- for (i = 0; i < genl_get_cmd_cnt(family); i++) {
+ while (genl_op_iter_next(&i)) {
struct nlattr *nest;
- struct genl_ops op;
u32 op_flags;
- genl_get_cmd_by_index(i, family, &op);
- op_flags = op.flags;
- if (op.dumpit)
- op_flags |= GENL_CMD_CAP_DUMP;
- if (op.doit)
- op_flags |= GENL_CMD_CAP_DO;
- if (op.policy)
+ op_flags = i.flags;
+ if (i.doit.policy || i.dumpit.policy)
op_flags |= GENL_CMD_CAP_HASPOL;
- nest = nla_nest_start_noflag(skb, i + 1);
+ nest = nla_nest_start_noflag(skb, genl_op_iter_idx(&i));
if (nest == NULL)
goto nla_put_failure;
- if (nla_put_u32(skb, CTRL_ATTR_OP_ID, op.cmd) ||
+ if (nla_put_u32(skb, CTRL_ATTR_OP_ID, i.cmd) ||
nla_put_u32(skb, CTRL_ATTR_OP_FLAGS, op_flags))
goto nla_put_failure;
@@ -1138,10 +1372,10 @@ static int genl_ctrl_event(int event, const struct genl_family *family,
struct ctrl_dump_policy_ctx {
struct netlink_policy_dump_state *state;
const struct genl_family *rt;
- unsigned int opidx;
+ struct genl_op_iter *op_iter;
u32 op;
u16 fam_id;
- u8 policies:1,
+ u8 dump_map:1,
single_op:1;
};
@@ -1158,8 +1392,8 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb)
struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx;
struct nlattr **tb = info->attrs;
const struct genl_family *rt;
- struct genl_ops op;
- int err, i;
+ struct genl_op_iter i;
+ int err;
BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
@@ -1183,40 +1417,73 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb)
ctx->rt = rt;
if (tb[CTRL_ATTR_OP]) {
+ struct genl_split_ops doit, dump;
+
ctx->single_op = true;
ctx->op = nla_get_u32(tb[CTRL_ATTR_OP]);
- err = genl_get_cmd(ctx->op, rt, &op);
+ err = genl_get_cmd_both(ctx->op, rt, &doit, &dump);
if (err) {
NL_SET_BAD_ATTR(cb->extack, tb[CTRL_ATTR_OP]);
return err;
}
- if (!op.policy)
+ if (doit.policy) {
+ err = netlink_policy_dump_add_policy(&ctx->state,
+ doit.policy,
+ doit.maxattr);
+ if (err)
+ goto err_free_state;
+ }
+ if (dump.policy) {
+ err = netlink_policy_dump_add_policy(&ctx->state,
+ dump.policy,
+ dump.maxattr);
+ if (err)
+ goto err_free_state;
+ }
+
+ if (!ctx->state)
return -ENODATA;
- return netlink_policy_dump_add_policy(&ctx->state, op.policy,
- op.maxattr);
+ ctx->dump_map = 1;
+ return 0;
}
- for (i = 0; i < genl_get_cmd_cnt(rt); i++) {
- genl_get_cmd_by_index(i, rt, &op);
+ ctx->op_iter = kmalloc(sizeof(*ctx->op_iter), GFP_KERNEL);
+ if (!ctx->op_iter)
+ return -ENOMEM;
+
+ genl_op_iter_init(rt, ctx->op_iter);
+ ctx->dump_map = genl_op_iter_next(ctx->op_iter);
- if (op.policy) {
+ for (genl_op_iter_init(rt, &i); genl_op_iter_next(&i); ) {
+ if (i.doit.policy) {
+ err = netlink_policy_dump_add_policy(&ctx->state,
+ i.doit.policy,
+ i.doit.maxattr);
+ if (err)
+ goto err_free_state;
+ }
+ if (i.dumpit.policy) {
err = netlink_policy_dump_add_policy(&ctx->state,
- op.policy,
- op.maxattr);
+ i.dumpit.policy,
+ i.dumpit.maxattr);
if (err)
goto err_free_state;
}
}
- if (!ctx->state)
- return -ENODATA;
+ if (!ctx->state) {
+ err = -ENODATA;
+ goto err_free_op_iter;
+ }
return 0;
err_free_state:
netlink_policy_dump_free(ctx->state);
+err_free_op_iter:
+ kfree(ctx->op_iter);
return err;
}
@@ -1240,7 +1507,8 @@ static void *ctrl_dumppolicy_prep(struct sk_buff *skb,
static int ctrl_dumppolicy_put_op(struct sk_buff *skb,
struct netlink_callback *cb,
- struct genl_ops *op)
+ struct genl_split_ops *doit,
+ struct genl_split_ops *dumpit)
{
struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx;
struct nlattr *nest_pol, *nest_op;
@@ -1248,10 +1516,7 @@ static int ctrl_dumppolicy_put_op(struct sk_buff *skb,
int idx;
/* skip if we have nothing to show */
- if (!op->policy)
- return 0;
- if (!op->doit &&
- (!op->dumpit || op->validate & GENL_DONT_VALIDATE_DUMP))
+ if (!doit->policy && !dumpit->policy)
return 0;
hdr = ctrl_dumppolicy_prep(skb, cb);
@@ -1262,21 +1527,26 @@ static int ctrl_dumppolicy_put_op(struct sk_buff *skb,
if (!nest_pol)
goto err;
- nest_op = nla_nest_start(skb, op->cmd);
+ nest_op = nla_nest_start(skb, doit->cmd);
if (!nest_op)
goto err;
- /* for now both do/dump are always the same */
- idx = netlink_policy_dump_get_policy_idx(ctx->state,
- op->policy,
- op->maxattr);
+ if (doit->policy) {
+ idx = netlink_policy_dump_get_policy_idx(ctx->state,
+ doit->policy,
+ doit->maxattr);
- if (op->doit && nla_put_u32(skb, CTRL_ATTR_POLICY_DO, idx))
- goto err;
+ if (nla_put_u32(skb, CTRL_ATTR_POLICY_DO, idx))
+ goto err;
+ }
+ if (dumpit->policy) {
+ idx = netlink_policy_dump_get_policy_idx(ctx->state,
+ dumpit->policy,
+ dumpit->maxattr);
- if (op->dumpit && !(op->validate & GENL_DONT_VALIDATE_DUMP) &&
- nla_put_u32(skb, CTRL_ATTR_POLICY_DUMP, idx))
- goto err;
+ if (nla_put_u32(skb, CTRL_ATTR_POLICY_DUMP, idx))
+ goto err;
+ }
nla_nest_end(skb, nest_op);
nla_nest_end(skb, nest_pol);
@@ -1293,31 +1563,29 @@ static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb)
struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx;
void *hdr;
- if (!ctx->policies) {
- while (ctx->opidx < genl_get_cmd_cnt(ctx->rt)) {
- struct genl_ops op;
+ if (ctx->dump_map) {
+ if (ctx->single_op) {
+ struct genl_split_ops doit, dumpit;
- if (ctx->single_op) {
- int err;
+ if (WARN_ON(genl_get_cmd_both(ctx->op, ctx->rt,
+ &doit, &dumpit)))
+ return -ENOENT;
- err = genl_get_cmd(ctx->op, ctx->rt, &op);
- if (WARN_ON(err))
- return skb->len;
+ if (ctrl_dumppolicy_put_op(skb, cb, &doit, &dumpit))
+ return skb->len;
- /* break out of the loop after this one */
- ctx->opidx = genl_get_cmd_cnt(ctx->rt);
- } else {
- genl_get_cmd_by_index(ctx->opidx, ctx->rt, &op);
- }
+ /* done with the per-op policy index list */
+ ctx->dump_map = 0;
+ }
- if (ctrl_dumppolicy_put_op(skb, cb, &op))
+ while (ctx->dump_map) {
+ if (ctrl_dumppolicy_put_op(skb, cb,
+ &ctx->op_iter->doit,
+ &ctx->op_iter->dumpit))
return skb->len;
- ctx->opidx++;
+ ctx->dump_map = genl_op_iter_next(ctx->op_iter);
}
-
- /* completed with the per-op policy index list */
- ctx->policies = true;
}
while (netlink_policy_dump_loop(ctx->state)) {
@@ -1350,18 +1618,27 @@ static int ctrl_dumppolicy_done(struct netlink_callback *cb)
{
struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx;
+ kfree(ctx->op_iter);
netlink_policy_dump_free(ctx->state);
return 0;
}
-static const struct genl_ops genl_ctrl_ops[] = {
+static const struct genl_split_ops genl_ctrl_ops[] = {
{
.cmd = CTRL_CMD_GETFAMILY,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .validate = GENL_DONT_VALIDATE_STRICT,
.policy = ctrl_policy_family,
.maxattr = ARRAY_SIZE(ctrl_policy_family) - 1,
.doit = ctrl_getfamily,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = CTRL_CMD_GETFAMILY,
+ .validate = GENL_DONT_VALIDATE_DUMP,
+ .policy = ctrl_policy_family,
+ .maxattr = ARRAY_SIZE(ctrl_policy_family) - 1,
.dumpit = ctrl_dumpfamily,
+ .flags = GENL_CMD_CAP_DUMP,
},
{
.cmd = CTRL_CMD_GETPOLICY,
@@ -1370,6 +1647,7 @@ static const struct genl_ops genl_ctrl_ops[] = {
.start = ctrl_dumppolicy_start,
.dumpit = ctrl_dumppolicy,
.done = ctrl_dumppolicy_done,
+ .flags = GENL_CMD_CAP_DUMP,
},
};
@@ -1379,8 +1657,8 @@ static const struct genl_multicast_group genl_ctrl_groups[] = {
static struct genl_family genl_ctrl __ro_after_init = {
.module = THIS_MODULE,
- .ops = genl_ctrl_ops,
- .n_ops = ARRAY_SIZE(genl_ctrl_ops),
+ .split_ops = genl_ctrl_ops,
+ .n_split_ops = ARRAY_SIZE(genl_ctrl_ops),
.resv_start_op = CTRL_CMD_GETPOLICY + 1,
.mcgrps = genl_ctrl_groups,
.n_mcgrps = ARRAY_SIZE(genl_ctrl_groups),
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index 6a193cce2a75..fff755dde30d 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -24,6 +24,7 @@
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/skbuff.h>
+#include <linux/kcov.h>
#include "../nfc.h"
#include <net/nfc/nci.h>
@@ -542,7 +543,7 @@ static int nci_open_device(struct nci_dev *ndev)
skb_queue_purge(&ndev->tx_q);
ndev->ops->close(ndev);
- ndev->flags = 0;
+ ndev->flags &= BIT(NCI_UNREG);
}
done:
@@ -1472,6 +1473,7 @@ static void nci_tx_work(struct work_struct *work)
skb = skb_dequeue(&ndev->tx_q);
if (!skb)
return;
+ kcov_remote_start_common(skb_get_kcov_handle(skb));
/* Check if data flow control is used */
if (atomic_read(&conn_info->credits_cnt) !=
@@ -1487,6 +1489,7 @@ static void nci_tx_work(struct work_struct *work)
mod_timer(&ndev->data_timer,
jiffies + msecs_to_jiffies(NCI_DATA_TIMEOUT));
+ kcov_remote_stop();
}
}
@@ -1497,7 +1500,8 @@ static void nci_rx_work(struct work_struct *work)
struct nci_dev *ndev = container_of(work, struct nci_dev, rx_work);
struct sk_buff *skb;
- while ((skb = skb_dequeue(&ndev->rx_q))) {
+ for (; (skb = skb_dequeue(&ndev->rx_q)); kcov_remote_stop()) {
+ kcov_remote_start_common(skb_get_kcov_handle(skb));
/* Send copy to sniffer */
nfc_send_to_raw_sock(ndev->nfc_dev, skb,
@@ -1551,6 +1555,7 @@ static void nci_cmd_work(struct work_struct *work)
if (!skb)
return;
+ kcov_remote_start_common(skb_get_kcov_handle(skb));
atomic_dec(&ndev->cmd_cnt);
pr_debug("NCI TX: MT=cmd, PBF=%d, GID=0x%x, OID=0x%x, plen=%d\n",
@@ -1563,6 +1568,7 @@ static void nci_cmd_work(struct work_struct *work)
mod_timer(&ndev->cmd_timer,
jiffies + msecs_to_jiffies(NCI_CMD_TIMEOUT));
+ kcov_remote_stop();
}
}
diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c
index aa5e712adf07..3d36ea5701f0 100644
--- a/net/nfc/nci/data.c
+++ b/net/nfc/nci/data.c
@@ -279,8 +279,10 @@ void nci_rx_data_packet(struct nci_dev *ndev, struct sk_buff *skb)
nci_plen(skb->data));
conn_info = nci_get_conn_info_by_conn_id(ndev, nci_conn_id(skb->data));
- if (!conn_info)
+ if (!conn_info) {
+ kfree_skb(skb);
return;
+ }
/* strip the nci data header */
skb_pull(skb, NCI_DATA_HDR_SIZE);
diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c
index 78c4b6addf15..de175318a3a0 100644
--- a/net/nfc/nci/hci.c
+++ b/net/nfc/nci/hci.c
@@ -14,6 +14,7 @@
#include <net/nfc/nci.h>
#include <net/nfc/nci_core.h>
#include <linux/nfc.h>
+#include <linux/kcov.h>
struct nci_data {
u8 conn_id;
@@ -409,7 +410,8 @@ static void nci_hci_msg_rx_work(struct work_struct *work)
const struct nci_hcp_message *message;
u8 pipe, type, instruction;
- while ((skb = skb_dequeue(&hdev->msg_rx_queue)) != NULL) {
+ for (; (skb = skb_dequeue(&hdev->msg_rx_queue)); kcov_remote_stop()) {
+ kcov_remote_start_common(skb_get_kcov_handle(skb));
pipe = NCI_HCP_MSG_GET_PIPE(skb->data[0]);
skb_pull(skb, NCI_HCI_HCP_PACKET_HEADER_LEN);
message = (struct nci_hcp_message *)skb->data;
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index 8dd569765f96..5125392bb68e 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -12,6 +12,7 @@
#include <net/tcp_states.h>
#include <linux/nfc.h>
#include <linux/export.h>
+#include <linux/kcov.h>
#include "nfc.h"
@@ -189,6 +190,7 @@ static void rawsock_tx_work(struct work_struct *work)
}
skb = skb_dequeue(&sk->sk_write_queue);
+ kcov_remote_start_common(skb_get_kcov_handle(skb));
sock_hold(sk);
rc = nfc_data_exchange(dev, target_idx, skb,
@@ -197,6 +199,7 @@ static void rawsock_tx_work(struct work_struct *work)
rawsock_report_error(sk, rc);
sock_put(sk);
}
+ kcov_remote_stop();
}
static int rawsock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 868db4669a29..ca3ebfdb3023 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1033,7 +1033,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
actions = nla_next(sample_arg, &rem);
if ((arg->probability != U32_MAX) &&
- (!arg->probability || prandom_u32() > arg->probability)) {
+ (!arg->probability || get_random_u32() > arg->probability)) {
if (last)
consume_skb(skb);
return 0;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index c7b10234cf7c..d78f0fc4337d 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -152,7 +152,7 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
static u32 ovs_ct_get_mark(const struct nf_conn *ct)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
- return ct ? ct->mark : 0;
+ return ct ? READ_ONCE(ct->mark) : 0;
#else
return 0;
#endif
@@ -340,9 +340,9 @@ static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key,
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
u32 new_mark;
- new_mark = ct_mark | (ct->mark & ~(mask));
- if (ct->mark != new_mark) {
- ct->mark = new_mark;
+ new_mark = ct_mark | (READ_ONCE(ct->mark) & ~(mask));
+ if (READ_ONCE(ct->mark) != new_mark) {
+ WRITE_ONCE(ct->mark, new_mark);
if (nf_ct_is_confirmed(ct))
nf_conntrack_event_cache(IPCT_MARK, ct);
key->ct.mark = new_mark;
@@ -434,65 +434,6 @@ static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key,
return 0;
}
-/* 'skb' should already be pulled to nh_ofs. */
-static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
-{
- const struct nf_conntrack_helper *helper;
- const struct nf_conn_help *help;
- enum ip_conntrack_info ctinfo;
- unsigned int protoff;
- struct nf_conn *ct;
- int err;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- return NF_ACCEPT;
-
- help = nfct_help(ct);
- if (!help)
- return NF_ACCEPT;
-
- helper = rcu_dereference(help->helper);
- if (!helper)
- return NF_ACCEPT;
-
- switch (proto) {
- case NFPROTO_IPV4:
- protoff = ip_hdrlen(skb);
- break;
- case NFPROTO_IPV6: {
- u8 nexthdr = ipv6_hdr(skb)->nexthdr;
- __be16 frag_off;
- int ofs;
-
- ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
- &frag_off);
- if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
- pr_debug("proto header not found\n");
- return NF_ACCEPT;
- }
- protoff = ofs;
- break;
- }
- default:
- WARN_ONCE(1, "helper invoked on non-IP family!");
- return NF_DROP;
- }
-
- err = helper->help(skb, protoff, ct, ctinfo);
- if (err != NF_ACCEPT)
- return err;
-
- /* Adjust seqs after helper. This is needed due to some helpers (e.g.,
- * FTP with NAT) adusting the TCP payload size when mangling IP
- * addresses and/or port numbers in the text-based control connection.
- */
- if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
- !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
- return NF_DROP;
- return NF_ACCEPT;
-}
-
/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
* value if 'skb' is freed.
*/
@@ -1038,7 +979,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
*/
if ((nf_ct_is_confirmed(ct) ? !cached || add_helper :
info->commit) &&
- ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
+ nf_ct_helper(skb, ct, ctinfo, info->family) != NF_ACCEPT) {
return -EINVAL;
}
@@ -1350,43 +1291,6 @@ int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key)
return 0;
}
-static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
- const struct sw_flow_key *key, bool log)
-{
- struct nf_conntrack_helper *helper;
- struct nf_conn_help *help;
- int ret = 0;
-
- helper = nf_conntrack_helper_try_module_get(name, info->family,
- key->ip.proto);
- if (!helper) {
- OVS_NLERR(log, "Unknown helper \"%s\"", name);
- return -EINVAL;
- }
-
- help = nf_ct_helper_ext_add(info->ct, GFP_KERNEL);
- if (!help) {
- nf_conntrack_helper_put(helper);
- return -ENOMEM;
- }
-
-#if IS_ENABLED(CONFIG_NF_NAT)
- if (info->nat) {
- ret = nf_nat_helper_try_module_get(name, info->family,
- key->ip.proto);
- if (ret) {
- nf_conntrack_helper_put(helper);
- OVS_NLERR(log, "Failed to load \"%s\" NAT helper, error: %d",
- name, ret);
- return ret;
- }
- }
-#endif
- rcu_assign_pointer(help->helper, helper);
- info->helper = helper;
- return ret;
-}
-
#if IS_ENABLED(CONFIG_NF_NAT)
static int parse_nat(const struct nlattr *attr,
struct ovs_conntrack_info *info, bool log)
@@ -1720,9 +1624,12 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
}
if (helper) {
- err = ovs_ct_add_helper(&ct_info, helper, key, log);
- if (err)
+ err = nf_ct_add_helper(ct_info.ct, helper, ct_info.family,
+ key->ip.proto, ct_info.nat, &ct_info.helper);
+ if (err) {
+ OVS_NLERR(log, "Failed to add %s helper %d", helper, err);
goto err_free_ct;
+ }
}
err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info,
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index c8a9075ddd0a..861dfb8daf4a 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -716,9 +716,9 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
percpu_stats = per_cpu_ptr(dp->stats_percpu, i);
do {
- start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
+ start = u64_stats_fetch_begin(&percpu_stats->syncp);
local_stats = *percpu_stats;
- } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));
+ } while (u64_stats_fetch_retry(&percpu_stats->syncp, start));
stats->n_hit += local_stats.n_hit;
stats->n_missed += local_stats.n_missed;
@@ -1616,7 +1616,8 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb,
if (IS_ERR(dp))
return;
- WARN(dp->user_features, "Dropping previously announced user features\n");
+ pr_warn("%s: Dropping previously announced user features\n",
+ ovs_dp_name(dp));
dp->user_features = 0;
}
@@ -2543,6 +2544,7 @@ struct genl_family dp_vport_genl_family __ro_after_init = {
.parallel_ops = true,
.small_ops = dp_vport_genl_ops,
.n_small_ops = ARRAY_SIZE(dp_vport_genl_ops),
+ .resv_start_op = OVS_VPORT_CMD_SET + 1,
.mcgrps = &ovs_dp_vport_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 4a07ab094a84..ead5418c126e 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2309,7 +2309,7 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size)
WARN_ON_ONCE(size > MAX_ACTIONS_BUFSIZE);
- sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL);
+ sfa = kmalloc(kmalloc_size_roundup(sizeof(*sfa) + size), GFP_KERNEL);
if (!sfa)
return ERR_PTR(-ENOMEM);
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index d4a2db0b2299..0a0e4c283f02 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -205,9 +205,9 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma)
stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
do {
- start = u64_stats_fetch_begin_irq(&stats->syncp);
+ start = u64_stats_fetch_begin(&stats->syncp);
counter = stats->usage_cntrs[i];
- } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+ } while (u64_stats_fetch_retry(&stats->syncp, start));
ma->masks_usage_zero_cntr[i] += counter;
}
@@ -1136,10 +1136,9 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
do {
- start = u64_stats_fetch_begin_irq(&stats->syncp);
+ start = u64_stats_fetch_begin(&stats->syncp);
counter = stats->usage_cntrs[i];
- } while (u64_stats_fetch_retry_irq(&stats->syncp,
- start));
+ } while (u64_stats_fetch_retry(&stats->syncp, start));
masks_and_count[i].counter += counter;
}
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 89a8e1501809..b10e1602c6b1 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -91,7 +91,7 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms)
err = dev_change_flags(dev, dev->flags | IFF_UP, NULL);
if (err < 0) {
- rtnl_delete_link(dev);
+ rtnl_delete_link(dev, 0, NULL);
rtnl_unlock();
ovs_vport_free(vport);
goto error;
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index e6b5e76a962a..4014c9b5eb79 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -57,7 +57,7 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms)
err = dev_change_flags(dev, dev->flags | IFF_UP, NULL);
if (err < 0) {
- rtnl_delete_link(dev);
+ rtnl_delete_link(dev, 0, NULL);
rtnl_unlock();
ovs_vport_free(vport);
return ERR_PTR(err);
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 2f61d5bdce1a..903537a5da22 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -172,7 +172,7 @@ void ovs_netdev_tunnel_destroy(struct vport *vport)
* if it's not already shutting down.
*/
if (vport->dev->reg_state == NETREG_REGISTERED)
- rtnl_delete_link(vport->dev);
+ rtnl_delete_link(vport->dev, 0, NULL);
netdev_put(vport->dev, &vport->dev_tracker);
vport->dev = NULL;
rtnl_unlock();
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 188e9c1360a1..0b881b043bcf 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -120,7 +120,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
err = dev_change_flags(dev, dev->flags | IFF_UP, NULL);
if (err < 0) {
- rtnl_delete_link(dev);
+ rtnl_delete_link(dev, 0, NULL);
rtnl_unlock();
ovs_vport_free(vport);
goto error;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d3f6db350de7..41c4ccc3a5d6 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1350,7 +1350,7 @@ static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
if (READ_ONCE(history[i]) == rxhash)
count++;
- victim = prandom_u32() % ROLLOVER_HLEN;
+ victim = prandom_u32_max(ROLLOVER_HLEN);
/* Avoid dirtying the cache line if possible */
if (READ_ONCE(history[victim]) != rxhash)
@@ -1777,6 +1777,7 @@ static int fanout_add(struct sock *sk, struct fanout_args *args)
match->prot_hook.af_packet_net = read_pnet(&match->net);
match->prot_hook.id_match = match_fanout_group;
match->max_num_members = args->max_num_members;
+ match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING;
list_add(&match->list, &fanout_list);
}
err = -EINVAL;
@@ -2293,8 +2294,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
if (skb->ip_summed == CHECKSUM_PARTIAL)
status |= TP_STATUS_CSUMNOTREADY;
else if (skb->pkt_type != PACKET_OUTGOING &&
- (skb->ip_summed == CHECKSUM_COMPLETE ||
- skb_csum_unnecessary(skb)))
+ skb_csum_unnecessary(skb))
status |= TP_STATUS_CSUM_VALID;
if (snaplen > res)
@@ -3277,7 +3277,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
int addr_len)
{
struct sock *sk = sock->sk;
- char name[sizeof(uaddr->sa_data) + 1];
+ char name[sizeof(uaddr->sa_data_min) + 1];
/*
* Check legality
@@ -3288,8 +3288,8 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
/* uaddr->sa_data comes from the userspace, it's not guaranteed to be
* zero-terminated.
*/
- memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
- name[sizeof(uaddr->sa_data)] = 0;
+ memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min));
+ name[sizeof(uaddr->sa_data_min)] = 0;
return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
}
@@ -3520,8 +3520,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (skb->ip_summed == CHECKSUM_PARTIAL)
aux.tp_status |= TP_STATUS_CSUMNOTREADY;
else if (skb->pkt_type != PACKET_OUTGOING &&
- (skb->ip_summed == CHECKSUM_COMPLETE ||
- skb_csum_unnecessary(skb)))
+ skb_csum_unnecessary(skb))
aux.tp_status |= TP_STATUS_CSUM_VALID;
aux.tp_len = origlen;
@@ -3561,11 +3560,11 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
return -EOPNOTSUPP;
uaddr->sa_family = AF_PACKET;
- memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
+ memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min));
rcu_read_lock();
dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
if (dev)
- strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
+ strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min));
rcu_read_unlock();
return sizeof(*uaddr);
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 5b5fb4ca8d3e..97a29172a8ee 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -104,7 +104,7 @@ static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr,
return -EINVAL;
last = rover;
} else {
- rover = max_t(u16, prandom_u32(), 2);
+ rover = max_t(u16, get_random_u16(), 2);
last = rover - 1;
}
diff --git a/net/rds/message.c b/net/rds/message.c
index 44dbc612ef54..b47e4f0a1639 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -366,7 +366,6 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
struct scatterlist *sg;
int ret = 0;
int length = iov_iter_count(from);
- int total_copied = 0;
struct rds_msg_zcopy_info *info;
rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
@@ -404,7 +403,6 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
ret = -EFAULT;
goto err;
}
- total_copied += copied;
length -= copied;
sg_set_page(sg, pages, copied, start);
rm->data.op_nents++;
diff --git a/net/rds/send.c b/net/rds/send.c
index 0c5504068e3c..5e57a1581dc6 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1114,7 +1114,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
struct rds_conn_path *cpath;
struct in6_addr daddr;
__u32 scope_id = 0;
- size_t total_payload_len = payload_len, rdma_payload_len = 0;
+ size_t rdma_payload_len = 0;
bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) &&
sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
int num_sgs = DIV_ROUND_UP(payload_len, PAGE_SIZE);
@@ -1243,7 +1243,6 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
if (ret)
goto out;
- total_payload_len += rdma_payload_len;
if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) {
ret = -EMSGSIZE;
goto out;
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 4444fd82b66d..c5b86066ff66 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -503,6 +503,9 @@ bool rds_tcp_tune(struct socket *sock)
release_sock(sk);
return false;
}
+ /* Update ns_tracker to current stack trace and refcounted tracker */
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+
sk->sk_net_refcnt = 1;
netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL);
sock_inuse_add(net, 1);
diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c
index 8b96a56d3a49..0f77ae8ef944 100644
--- a/net/rose/rose_link.c
+++ b/net/rose/rose_link.c
@@ -236,6 +236,9 @@ void rose_transmit_clear_request(struct rose_neigh *neigh, unsigned int lci, uns
unsigned char *dptr;
int len;
+ if (!neigh->dev)
+ return;
+
len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 3;
if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL)
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index b11281bed2a4..fdeba488fc6e 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -30,6 +30,7 @@ rxrpc-y := \
sendmsg.o \
server_key.o \
skbuff.o \
+ txbuf.o \
utils.o
rxrpc-$(CONFIG_PROC_FS) += proc.o
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index ceba28e9dce6..aacdd96a9886 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -39,7 +39,7 @@ atomic_t rxrpc_debug_id;
EXPORT_SYMBOL(rxrpc_debug_id);
/* count of skbs currently in use */
-atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs;
+atomic_t rxrpc_n_rx_skbs;
struct workqueue_struct *rxrpc_workqueue;
@@ -93,12 +93,11 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
srx->transport_len > len)
return -EINVAL;
- if (srx->transport.family != rx->family &&
- srx->transport.family == AF_INET && rx->family != AF_INET6)
- return -EAFNOSUPPORT;
-
switch (srx->transport.family) {
case AF_INET:
+ if (rx->family != AF_INET &&
+ rx->family != AF_INET6)
+ return -EAFNOSUPPORT;
if (srx->transport_len < sizeof(struct sockaddr_in))
return -EINVAL;
tail = offsetof(struct sockaddr_rxrpc, transport.sin.__pad);
@@ -106,6 +105,8 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
#ifdef CONFIG_AF_RXRPC_IPV6
case AF_INET6:
+ if (rx->family != AF_INET6)
+ return -EAFNOSUPPORT;
if (srx->transport_len < sizeof(struct sockaddr_in6))
return -EINVAL;
tail = offsetof(struct sockaddr_rxrpc, transport) +
@@ -979,7 +980,7 @@ static int __init af_rxrpc_init(void)
goto error_call_jar;
}
- rxrpc_workqueue = alloc_workqueue("krxrpcd", 0, 1);
+ rxrpc_workqueue = alloc_workqueue("krxrpcd", WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
if (!rxrpc_workqueue) {
pr_notice("Failed to allocate work queue\n");
goto error_work_queue;
@@ -1059,7 +1060,6 @@ static void __exit af_rxrpc_exit(void)
sock_unregister(PF_RXRPC);
proto_unregister(&rxrpc_proto);
unregister_pernet_device(&rxrpc_net_ops);
- ASSERTCMP(atomic_read(&rxrpc_n_tx_skbs), ==, 0);
ASSERTCMP(atomic_read(&rxrpc_n_rx_skbs), ==, 0);
/* Make sure the local and peer records pinned by any dying connections
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 1ad0ec5afb50..f5c538ce3e23 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -29,6 +29,7 @@ struct rxrpc_crypt {
struct key_preparsed_payload;
struct rxrpc_connection;
+struct rxrpc_txbuf;
/*
* Mark applied to socket buffers in skb->mark. skb->priority is used
@@ -93,6 +94,22 @@ struct rxrpc_net {
struct list_head peer_keepalive_new;
struct timer_list peer_keepalive_timer;
struct work_struct peer_keepalive_work;
+
+ atomic_t stat_tx_data;
+ atomic_t stat_tx_data_retrans;
+ atomic_t stat_tx_data_send;
+ atomic_t stat_tx_data_send_frag;
+ atomic_t stat_rx_data;
+ atomic_t stat_rx_data_reqack;
+ atomic_t stat_rx_data_jumbo;
+
+ atomic_t stat_tx_ack_fill;
+ atomic_t stat_tx_ack_send;
+ atomic_t stat_tx_ack_skip;
+ atomic_t stat_tx_acks[256];
+ atomic_t stat_rx_acks[256];
+
+ atomic_t stat_why_req_ack[8];
};
/*
@@ -178,20 +195,12 @@ struct rxrpc_host_header {
* - max 48 bytes (struct sk_buff::cb)
*/
struct rxrpc_skb_priv {
- atomic_t nr_ring_pins; /* Number of rxtx ring pins */
- u8 nr_subpackets; /* Number of subpackets */
- u8 rx_flags; /* Received packet flags */
-#define RXRPC_SKB_INCL_LAST 0x01 /* - Includes last packet */
-#define RXRPC_SKB_TX_BUFFER 0x02 /* - Is transmit buffer */
- union {
- int remain; /* amount of space remaining for next write */
-
- /* List of requested ACKs on subpackets */
- unsigned long rx_req_ack[(RXRPC_MAX_NR_JUMBO + BITS_PER_LONG - 1) /
- BITS_PER_LONG];
- };
+ u16 offset; /* Offset of data */
+ u16 len; /* Length of data */
+ u8 flags;
+#define RXRPC_RX_VERIFIED 0x01
- struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */
+ struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */
};
#define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb)
@@ -233,19 +242,14 @@ struct rxrpc_security {
size_t *, size_t *, size_t *);
/* impose security on a packet */
- int (*secure_packet)(struct rxrpc_call *, struct sk_buff *, size_t);
+ int (*secure_packet)(struct rxrpc_call *, struct rxrpc_txbuf *);
/* verify the security on a received packet */
- int (*verify_packet)(struct rxrpc_call *, struct sk_buff *,
- unsigned int, unsigned int, rxrpc_seq_t, u16);
+ int (*verify_packet)(struct rxrpc_call *, struct sk_buff *);
/* Free crypto request on a call */
void (*free_call_crypto)(struct rxrpc_call *);
- /* Locate the data in a received packet that has been verified. */
- void (*locate_data)(struct rxrpc_call *, struct sk_buff *,
- unsigned int *, unsigned int *);
-
/* issue a challenge */
int (*issue_challenge)(struct rxrpc_connection *);
@@ -276,6 +280,8 @@ struct rxrpc_local {
struct hlist_node link;
struct socket *socket; /* my UDP socket */
struct work_struct processor;
+ struct list_head ack_tx_queue; /* List of ACKs that need sending */
+ spinlock_t ack_tx_lock; /* ACK list lock */
struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */
struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */
struct sk_buff_head reject_queue; /* packets awaiting rejection */
@@ -326,7 +332,7 @@ struct rxrpc_peer {
u32 rto_j; /* Retransmission timeout in jiffies */
u8 backoff; /* Backoff timeout */
- u8 cong_cwnd; /* Congestion window size */
+ u8 cong_ssthresh; /* Congestion slow-start threshold */
};
/*
@@ -399,6 +405,7 @@ enum rxrpc_conn_proto_state {
struct rxrpc_bundle {
struct rxrpc_conn_parameters params;
refcount_t ref;
+ atomic_t active; /* Number of active users */
unsigned int debug_id;
bool try_upgrade; /* True if the bundle is attempting upgrade */
bool alloc_conn; /* True if someone's getting a conn */
@@ -490,6 +497,7 @@ enum rxrpc_call_flag {
RXRPC_CALL_EXPOSED, /* The call was exposed to the world */
RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */
RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */
+ RXRPC_CALL_TX_ALL_ACKED, /* Last packet has been hard-acked */
RXRPC_CALL_SEND_PING, /* A ping will need to be sent */
RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */
RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */
@@ -498,16 +506,16 @@ enum rxrpc_call_flag {
RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */
RXRPC_CALL_KERNEL, /* The call was made by the kernel */
RXRPC_CALL_UPGRADE, /* Service upgrade was requested for the call */
+ RXRPC_CALL_DELAY_ACK_PENDING, /* DELAY ACK generation is pending */
+ RXRPC_CALL_IDLE_ACK_PENDING, /* IDLE ACK generation is pending */
};
/*
* Events that can be raised on a call.
*/
enum rxrpc_call_event {
- RXRPC_CALL_EV_ACK, /* need to generate ACK */
RXRPC_CALL_EV_ABORT, /* need to generate abort */
RXRPC_CALL_EV_RESEND, /* Tx resend required */
- RXRPC_CALL_EV_PING, /* Ping send required */
RXRPC_CALL_EV_EXPIRED, /* Expiry occurred */
RXRPC_CALL_EV_ACK_LOST, /* ACK may be lost, send ping */
};
@@ -566,7 +574,7 @@ struct rxrpc_call {
struct rxrpc_net *rxnet; /* Network namespace to which call belongs */
const struct rxrpc_security *security; /* applied security module */
struct mutex user_mutex; /* User access mutex */
- unsigned long ack_at; /* When deferred ACK needs to happen */
+ unsigned long delay_ack_at; /* When DELAY ACK needs to happen */
unsigned long ack_lost_at; /* When ACK is figured as lost */
unsigned long resend_at; /* When next resend needs to happen */
unsigned long ping_at; /* When next to send a ping */
@@ -576,7 +584,6 @@ struct rxrpc_call {
unsigned long expect_term_by; /* When we expect call termination by */
u32 next_rx_timo; /* Timeout for next Rx packet (jif) */
u32 next_req_timo; /* Timeout for next Rx request packet (jif) */
- struct skcipher_request *cipher_req; /* Packet cipher request buffer */
struct timer_list timer; /* Combined event timer */
struct work_struct processor; /* Event processor */
rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */
@@ -587,14 +594,12 @@ struct rxrpc_call {
struct list_head recvmsg_link; /* Link in rx->recvmsg_q */
struct list_head sock_link; /* Link in rx->sock_calls */
struct rb_node sock_node; /* Node in rx->calls */
- struct sk_buff *tx_pending; /* Tx socket buffer being filled */
+ struct rxrpc_txbuf *tx_pending; /* Tx buffer being filled */
wait_queue_head_t waitq; /* Wait queue for channel or Tx */
s64 tx_total_len; /* Total length left to be transmitted (or -1) */
- __be32 crypto_buf[2]; /* Temporary packet crypto buffer */
unsigned long user_call_ID; /* user-defined call ID */
unsigned long flags;
unsigned long events;
- spinlock_t lock;
spinlock_t notify_lock; /* Kernel notification lock */
rwlock_t state_lock; /* lock for state transition */
u32 abort_code; /* Local/remote abort code */
@@ -610,37 +615,27 @@ struct rxrpc_call {
int debug_id; /* debug ID for printks */
unsigned short rx_pkt_offset; /* Current recvmsg packet offset */
unsigned short rx_pkt_len; /* Current recvmsg packet len */
- bool rx_pkt_last; /* Current recvmsg packet is last */
-
- /* Rx/Tx circular buffer, depending on phase.
- *
- * In the Rx phase, packets are annotated with 0 or the number of the
- * segment of a jumbo packet each buffer refers to. There can be up to
- * 47 segments in a maximum-size UDP packet.
- *
- * In the Tx phase, packets are annotated with which buffers have been
- * acked.
- */
-#define RXRPC_RXTX_BUFF_SIZE 64
-#define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1)
-#define RXRPC_INIT_RX_WINDOW_SIZE 63
- struct sk_buff **rxtx_buffer;
- u8 *rxtx_annotations;
-#define RXRPC_TX_ANNO_ACK 0
-#define RXRPC_TX_ANNO_UNACK 1
-#define RXRPC_TX_ANNO_NAK 2
-#define RXRPC_TX_ANNO_RETRANS 3
-#define RXRPC_TX_ANNO_MASK 0x03
-#define RXRPC_TX_ANNO_LAST 0x04
-#define RXRPC_TX_ANNO_RESENT 0x08
-
-#define RXRPC_RX_ANNO_SUBPACKET 0x3f /* Subpacket number in jumbogram */
-#define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */
- rxrpc_seq_t tx_hard_ack; /* Dead slot in buffer; the first transmitted but
- * not hard-ACK'd packet follows this.
- */
+
+ /* Transmitted data tracking. */
+ spinlock_t tx_lock; /* Transmit queue lock */
+ struct list_head tx_buffer; /* Buffer of transmissible packets */
+ rxrpc_seq_t tx_bottom; /* First packet in buffer */
+ rxrpc_seq_t tx_transmitted; /* Highest packet transmitted */
rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */
u16 tx_backoff; /* Delay to insert due to Tx failure */
+ u8 tx_winsize; /* Maximum size of Tx window */
+#define RXRPC_TX_MAX_WINDOW 128
+ ktime_t tx_last_sent; /* Last time a transmission occurred */
+
+ /* Received data tracking */
+ struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */
+ struct sk_buff_head rx_oos_queue; /* Queue of out of sequence packets */
+
+ rxrpc_seq_t rx_highest_seq; /* Highest sequence number received */
+ rxrpc_seq_t rx_consumed; /* Highest packet consumed */
+ rxrpc_serial_t rx_serial; /* Highest serial received for this call */
+ u8 rx_winsize; /* Size of Rx window */
+ spinlock_t input_lock; /* Lock for packet input to this call */
/* TCP-style slow-start congestion control [RFC5681]. Since the SMSS
* is fixed, we keep these numbers in terms of segments (ie. DATA
@@ -655,25 +650,17 @@ struct rxrpc_call {
u8 cong_cumul_acks; /* Cumulative ACK count */
ktime_t cong_tstamp; /* Last time cwnd was changed */
- rxrpc_seq_t rx_hard_ack; /* Dead slot in buffer; the first received but not
- * consumed packet follows this.
- */
- rxrpc_seq_t rx_top; /* Highest Rx slot allocated. */
- rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */
- rxrpc_serial_t rx_serial; /* Highest serial received for this call */
- u8 rx_winsize; /* Size of Rx window */
- u8 tx_winsize; /* Maximum size of Tx window */
- bool tx_phase; /* T if transmission phase, F if receive phase */
- u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */
-
- spinlock_t input_lock; /* Lock for packet input to this call */
-
/* Receive-phase ACK management (ACKs we send). */
u8 ackr_reason; /* reason to ACK */
rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */
- rxrpc_seq_t ackr_highest_seq; /* Higest sequence number received */
+ atomic64_t ackr_window; /* Base (in LSW) and top (in MSW) of SACK window */
atomic_t ackr_nr_unacked; /* Number of unacked packets */
atomic_t ackr_nr_consumed; /* Number of packets needing hard ACK */
+ struct {
+#define RXRPC_SACK_SIZE 256
+ /* SACK table for soft-acked packets */
+ u8 ackr_sack_table[RXRPC_SACK_SIZE];
+ } __aligned(8);
/* RTT management */
rxrpc_serial_t rtt_serial[4]; /* Serial number of DATA or PING sent */
@@ -687,21 +674,24 @@ struct rxrpc_call {
ktime_t acks_latest_ts; /* Timestamp of latest ACK received */
rxrpc_seq_t acks_first_seq; /* first sequence number received */
rxrpc_seq_t acks_prev_seq; /* Highest previousPacket received */
+ rxrpc_seq_t acks_hard_ack; /* Latest hard-ack point */
rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */
rxrpc_seq_t acks_lost_top; /* tx_top at the time lost-ack ping sent */
rxrpc_serial_t acks_lost_ping; /* Serial number of probe ACK */
+ rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */
+ struct sk_buff *acks_soft_tbl; /* The last ACK packet with NAKs in it */
+ spinlock_t acks_ack_lock; /* Access to ->acks_last_ack */
};
/*
* Summary of a new ACK and the changes it made to the Tx buffer packet states.
*/
struct rxrpc_ack_summary {
+ u16 nr_acks; /* Number of ACKs in packet */
+ u16 nr_new_acks; /* Number of new ACKs in packet */
+ u16 nr_rot_new_acks; /* Number of rotated new ACKs */
u8 ack_reason;
- u8 nr_acks; /* Number of ACKs in packet */
- u8 nr_nacks; /* Number of NACKs in packet */
- u8 nr_new_acks; /* Number of new ACKs in packet */
- u8 nr_new_nacks; /* Number of new NACKs in packet */
- u8 nr_rot_new_acks; /* Number of rotated new ACKs */
+ bool saw_nacks; /* Saw NACKs in packet */
bool new_low_nack; /* T if new low NACK found */
bool retrans_timeo; /* T if reTx due to timeout happened */
u8 flight_size; /* Number of unreceived transmissions */
@@ -744,12 +734,58 @@ struct rxrpc_send_params {
bool upgrade; /* If the connection is upgradeable */
};
+/*
+ * Buffer of data to be output as a packet.
+ */
+struct rxrpc_txbuf {
+ struct rcu_head rcu;
+ struct list_head call_link; /* Link in call->tx_queue */
+ struct list_head tx_link; /* Link in live Enc queue or Tx queue */
+ struct rxrpc_call *call; /* Call to which this buffer belongs */
+ ktime_t last_sent; /* Time at which last transmitted */
+ refcount_t ref;
+ rxrpc_seq_t seq; /* Sequence number of this packet */
+ unsigned int call_debug_id;
+ unsigned int debug_id;
+ unsigned int len; /* Amount of data in buffer */
+ unsigned int space; /* Remaining data space */
+ unsigned int offset; /* Offset of fill point */
+ unsigned long flags;
+#define RXRPC_TXBUF_LAST 0 /* Set if last packet in Tx phase */
+#define RXRPC_TXBUF_RESENT 1 /* Set if has been resent */
+ u8 /*enum rxrpc_propose_ack_trace*/ ack_why; /* If ack, why */
+ struct {
+ /* The packet for encrypting and DMA'ing. We align it such
+ * that data[] aligns correctly for any crypto blocksize.
+ */
+ u8 pad[64 - sizeof(struct rxrpc_wire_header)];
+ struct rxrpc_wire_header wire; /* Network-ready header */
+ union {
+ u8 data[RXRPC_JUMBO_DATALEN]; /* Data packet */
+ struct {
+ struct rxrpc_ackpacket ack;
+ u8 acks[0];
+ };
+ };
+ } __aligned(64);
+};
+
+static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb)
+{
+ return txb->wire.flags & RXRPC_CLIENT_INITIATED;
+}
+
+static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb)
+{
+ return !rxrpc_sending_to_server(txb);
+}
+
#include <trace/events/rxrpc.h>
/*
* af_rxrpc.c
*/
-extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs;
+extern atomic_t rxrpc_n_rx_skbs;
extern struct workqueue_struct *rxrpc_workqueue;
/*
@@ -766,8 +802,12 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long);
/*
* call_event.c
*/
-void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool, bool,
- enum rxrpc_propose_ack_trace);
+void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial,
+ enum rxrpc_propose_ack_trace why);
+void rxrpc_send_ACK(struct rxrpc_call *, u8, rxrpc_serial_t, enum rxrpc_propose_ack_trace);
+void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t,
+ enum rxrpc_propose_ack_trace);
+void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *);
void rxrpc_process_call(struct work_struct *);
void rxrpc_reduce_call_timer(struct rxrpc_call *call,
@@ -949,15 +989,12 @@ static inline bool __rxrpc_use_local(struct rxrpc_local *local)
* misc.c
*/
extern unsigned int rxrpc_max_backlog __read_mostly;
-extern unsigned long rxrpc_requested_ack_delay;
extern unsigned long rxrpc_soft_ack_delay;
extern unsigned long rxrpc_idle_ack_delay;
extern unsigned int rxrpc_rx_window_size;
extern unsigned int rxrpc_rx_mtu;
extern unsigned int rxrpc_rx_jumbo_max;
-extern const s8 rxrpc_ack_priority[];
-
/*
* net_ns.c
*/
@@ -972,16 +1009,15 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net)
/*
* output.c
*/
-int rxrpc_send_ack_packet(struct rxrpc_call *, bool, rxrpc_serial_t *);
+void rxrpc_transmit_ack_packets(struct rxrpc_local *);
int rxrpc_send_abort_packet(struct rxrpc_call *);
-int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool);
+int rxrpc_send_data_packet(struct rxrpc_call *, struct rxrpc_txbuf *);
void rxrpc_reject_packets(struct rxrpc_local *);
void rxrpc_send_keepalive(struct rxrpc_peer *);
/*
* peer_event.c
*/
-void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset);
void rxrpc_error_report(struct sock *);
void rxrpc_peer_keepalive_worker(struct work_struct *);
@@ -1092,6 +1128,15 @@ void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_purge_queue(struct sk_buff_head *);
/*
+ * stats.c
+ */
+int rxrpc_stats_show(struct seq_file *seq, void *v);
+int rxrpc_stats_clear(struct file *file, char *buf, size_t size);
+
+#define rxrpc_inc_stat(rxnet, s) atomic_inc(&(rxnet)->s)
+#define rxrpc_dec_stat(rxnet, s) atomic_dec(&(rxnet)->s)
+
+/*
* sysctl.c
*/
#ifdef CONFIG_SYSCTL
@@ -1103,6 +1148,16 @@ static inline void rxrpc_sysctl_exit(void) {}
#endif
/*
+ * txbuf.c
+ */
+extern atomic_t rxrpc_nr_txbuf;
+struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type,
+ gfp_t gfp);
+void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what);
+void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what);
+void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what);
+
+/*
* utils.c
*/
int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *);
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 99e10eea3732..48790ee77019 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -248,9 +248,8 @@ static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb)
if (call->peer->rtt_count < 3 ||
ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now))
- rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial,
- true, true,
- rxrpc_propose_ack_ping_for_params);
+ rxrpc_send_ACK(call, RXRPC_ACK_PING, sp->hdr.serial,
+ rxrpc_propose_ack_ping_for_params);
}
/*
@@ -325,7 +324,8 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
call->security = conn->security;
call->security_ix = conn->security_ix;
call->peer = rxrpc_get_peer(conn->params.peer);
- call->cong_cwnd = call->peer->cong_cwnd;
+ call->cong_ssthresh = call->peer->cong_ssthresh;
+ call->tx_last_sent = ktime_get_real();
return call;
}
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 2a93e7b5fbd0..1e21a708390e 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -20,127 +20,103 @@
/*
* Propose a PING ACK be sent.
*/
-static void rxrpc_propose_ping(struct rxrpc_call *call,
- bool immediate, bool background)
+void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial,
+ enum rxrpc_propose_ack_trace why)
{
- if (immediate) {
- if (background &&
- !test_and_set_bit(RXRPC_CALL_EV_PING, &call->events))
- rxrpc_queue_call(call);
- } else {
- unsigned long now = jiffies;
- unsigned long ping_at = now + rxrpc_idle_ack_delay;
-
- if (time_before(ping_at, call->ping_at)) {
- WRITE_ONCE(call->ping_at, ping_at);
- rxrpc_reduce_call_timer(call, ping_at, now,
- rxrpc_timer_set_for_ping);
- }
+ unsigned long now = jiffies;
+ unsigned long ping_at = now + rxrpc_idle_ack_delay;
+
+ if (time_before(ping_at, call->ping_at)) {
+ WRITE_ONCE(call->ping_at, ping_at);
+ rxrpc_reduce_call_timer(call, ping_at, now,
+ rxrpc_timer_set_for_ping);
+ trace_rxrpc_propose_ack(call, why, RXRPC_ACK_PING, serial);
}
}
/*
- * propose an ACK be sent
+ * Propose a DELAY ACK be sent in the future.
*/
-static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
- u32 serial, bool immediate, bool background,
- enum rxrpc_propose_ack_trace why)
+void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial,
+ enum rxrpc_propose_ack_trace why)
{
- enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use;
unsigned long expiry = rxrpc_soft_ack_delay;
- s8 prior = rxrpc_ack_priority[ack_reason];
-
- /* Pings are handled specially because we don't want to accidentally
- * lose a ping response by subsuming it into a ping.
- */
- if (ack_reason == RXRPC_ACK_PING) {
- rxrpc_propose_ping(call, immediate, background);
- goto trace;
+ unsigned long now = jiffies, ack_at;
+
+ call->ackr_serial = serial;
+
+ if (rxrpc_soft_ack_delay < expiry)
+ expiry = rxrpc_soft_ack_delay;
+ if (call->peer->srtt_us != 0)
+ ack_at = usecs_to_jiffies(call->peer->srtt_us >> 3);
+ else
+ ack_at = expiry;
+
+ ack_at += READ_ONCE(call->tx_backoff);
+ ack_at += now;
+ if (time_before(ack_at, call->delay_ack_at)) {
+ WRITE_ONCE(call->delay_ack_at, ack_at);
+ rxrpc_reduce_call_timer(call, ack_at, now,
+ rxrpc_timer_set_for_ack);
}
- /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial
- * numbers, but we don't alter the timeout.
- */
- _debug("prior %u %u vs %u %u",
- ack_reason, prior,
- call->ackr_reason, rxrpc_ack_priority[call->ackr_reason]);
- if (ack_reason == call->ackr_reason) {
- if (RXRPC_ACK_UPDATEABLE & (1 << ack_reason)) {
- outcome = rxrpc_propose_ack_update;
- call->ackr_serial = serial;
- }
- if (!immediate)
- goto trace;
- } else if (prior > rxrpc_ack_priority[call->ackr_reason]) {
- call->ackr_reason = ack_reason;
- call->ackr_serial = serial;
- } else {
- outcome = rxrpc_propose_ack_subsume;
+ trace_rxrpc_propose_ack(call, why, RXRPC_ACK_DELAY, serial);
+}
+
+/*
+ * Queue an ACK for immediate transmission.
+ */
+void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason,
+ rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why)
+{
+ struct rxrpc_local *local = call->conn->params.local;
+ struct rxrpc_txbuf *txb;
+
+ if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
+ return;
+ if (ack_reason == RXRPC_ACK_DELAY &&
+ test_and_set_bit(RXRPC_CALL_DELAY_ACK_PENDING, &call->flags)) {
+ trace_rxrpc_drop_ack(call, why, ack_reason, serial, false);
+ return;
}
- switch (ack_reason) {
- case RXRPC_ACK_REQUESTED:
- if (rxrpc_requested_ack_delay < expiry)
- expiry = rxrpc_requested_ack_delay;
- if (serial == 1)
- immediate = false;
- break;
-
- case RXRPC_ACK_DELAY:
- if (rxrpc_soft_ack_delay < expiry)
- expiry = rxrpc_soft_ack_delay;
- break;
-
- case RXRPC_ACK_IDLE:
- if (rxrpc_idle_ack_delay < expiry)
- expiry = rxrpc_idle_ack_delay;
- break;
-
- default:
- immediate = true;
- break;
+ rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]);
+
+ txb = rxrpc_alloc_txbuf(call, RXRPC_PACKET_TYPE_ACK,
+ in_softirq() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS);
+ if (!txb) {
+ kleave(" = -ENOMEM");
+ return;
}
- if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) {
- _debug("already scheduled");
- } else if (immediate || expiry == 0) {
- _debug("immediate ACK %lx", call->events);
- if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events) &&
- background)
- rxrpc_queue_call(call);
- } else {
- unsigned long now = jiffies, ack_at;
-
- if (call->peer->srtt_us != 0)
- ack_at = usecs_to_jiffies(call->peer->srtt_us >> 3);
- else
- ack_at = expiry;
-
- ack_at += READ_ONCE(call->tx_backoff);
- ack_at += now;
- if (time_before(ack_at, call->ack_at)) {
- WRITE_ONCE(call->ack_at, ack_at);
- rxrpc_reduce_call_timer(call, ack_at, now,
- rxrpc_timer_set_for_ack);
- }
+ txb->ack_why = why;
+ txb->wire.seq = 0;
+ txb->wire.type = RXRPC_PACKET_TYPE_ACK;
+ txb->wire.flags |= RXRPC_SLOW_START_OK;
+ txb->ack.bufferSpace = 0;
+ txb->ack.maxSkew = 0;
+ txb->ack.firstPacket = 0;
+ txb->ack.previousPacket = 0;
+ txb->ack.serial = htonl(serial);
+ txb->ack.reason = ack_reason;
+ txb->ack.nAcks = 0;
+
+ if (!rxrpc_try_get_call(call, rxrpc_call_got)) {
+ rxrpc_put_txbuf(txb, rxrpc_txbuf_put_nomem);
+ return;
}
-trace:
- trace_rxrpc_propose_ack(call, why, ack_reason, serial, immediate,
- background, outcome);
-}
+ spin_lock_bh(&local->ack_tx_lock);
+ list_add_tail(&txb->tx_link, &local->ack_tx_queue);
+ spin_unlock_bh(&local->ack_tx_lock);
+ trace_rxrpc_send_ack(call, why, ack_reason, serial);
-/*
- * propose an ACK be sent, locking the call structure
- */
-void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
- u32 serial, bool immediate, bool background,
- enum rxrpc_propose_ack_trace why)
-{
- spin_lock_bh(&call->lock);
- __rxrpc_propose_ACK(call, ack_reason, serial,
- immediate, background, why);
- spin_unlock_bh(&call->lock);
+ if (in_task()) {
+ rxrpc_transmit_ack_packets(call->peer->local);
+ } else {
+ rxrpc_get_local(local);
+ rxrpc_queue_local(local);
+ }
}
/*
@@ -156,62 +132,131 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call)
*/
static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
{
- struct sk_buff *skb;
+ struct rxrpc_ackpacket *ack = NULL;
+ struct rxrpc_txbuf *txb;
+ struct sk_buff *ack_skb = NULL;
unsigned long resend_at;
- rxrpc_seq_t cursor, seq, top;
+ rxrpc_seq_t transmitted = READ_ONCE(call->tx_transmitted);
ktime_t now, max_age, oldest, ack_ts;
- int ix;
- u8 annotation, anno_type, retrans = 0, unacked = 0;
+ bool unacked = false;
+ unsigned int i;
+ LIST_HEAD(retrans_queue);
- _enter("{%d,%d}", call->tx_hard_ack, call->tx_top);
+ _enter("{%d,%d}", call->acks_hard_ack, call->tx_top);
now = ktime_get_real();
max_age = ktime_sub_us(now, jiffies_to_usecs(call->peer->rto_j));
+ oldest = now;
+
+ /* See if there's an ACK saved with a soft-ACK table in it. */
+ if (call->acks_soft_tbl) {
+ spin_lock_bh(&call->acks_ack_lock);
+ ack_skb = call->acks_soft_tbl;
+ if (ack_skb) {
+ rxrpc_get_skb(ack_skb, rxrpc_skb_ack);
+ ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header);
+ }
+ spin_unlock_bh(&call->acks_ack_lock);
+ }
- spin_lock_bh(&call->lock);
+ if (list_empty(&call->tx_buffer))
+ goto no_resend;
- cursor = call->tx_hard_ack;
- top = call->tx_top;
- ASSERT(before_eq(cursor, top));
- if (cursor == top)
- goto out_unlock;
+ spin_lock(&call->tx_lock);
- /* Scan the packet list without dropping the lock and decide which of
- * the packets in the Tx buffer we're going to resend and what the new
- * resend timeout will be.
- */
- trace_rxrpc_resend(call, (cursor + 1) & RXRPC_RXTX_BUFF_MASK);
- oldest = now;
- for (seq = cursor + 1; before_eq(seq, top); seq++) {
- ix = seq & RXRPC_RXTX_BUFF_MASK;
- annotation = call->rxtx_annotations[ix];
- anno_type = annotation & RXRPC_TX_ANNO_MASK;
- annotation &= ~RXRPC_TX_ANNO_MASK;
- if (anno_type == RXRPC_TX_ANNO_ACK)
- continue;
+ if (list_empty(&call->tx_buffer))
+ goto no_further_resend;
+
+ trace_rxrpc_resend(call);
+ txb = list_first_entry(&call->tx_buffer, struct rxrpc_txbuf, call_link);
- skb = call->rxtx_buffer[ix];
- rxrpc_see_skb(skb, rxrpc_skb_seen);
+ /* Scan the soft ACK table without dropping the lock and resend any
+ * explicitly NAK'd packets.
+ */
+ if (ack) {
+ for (i = 0; i < ack->nAcks; i++) {
+ rxrpc_seq_t seq;
- if (anno_type == RXRPC_TX_ANNO_UNACK) {
- if (ktime_after(skb->tstamp, max_age)) {
- if (ktime_before(skb->tstamp, oldest))
- oldest = skb->tstamp;
+ if (ack->acks[i] & 1)
continue;
+ seq = ntohl(ack->firstPacket) + i;
+ if (after(txb->seq, transmitted))
+ break;
+ if (after(txb->seq, seq))
+ continue; /* A new hard ACK probably came in */
+ list_for_each_entry_from(txb, &call->tx_buffer, call_link) {
+ if (txb->seq == seq)
+ goto found_txb;
+ }
+ goto no_further_resend;
+
+ found_txb:
+ if (after(ntohl(txb->wire.serial), call->acks_highest_serial))
+ continue; /* Ack point not yet reached */
+
+ rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked);
+
+ if (list_empty(&txb->tx_link)) {
+ rxrpc_get_txbuf(txb, rxrpc_txbuf_get_retrans);
+ rxrpc_get_call(call, rxrpc_call_got_tx);
+ list_add_tail(&txb->tx_link, &retrans_queue);
+ set_bit(RXRPC_TXBUF_RESENT, &txb->flags);
}
- if (!(annotation & RXRPC_TX_ANNO_RESENT))
- unacked++;
+
+ trace_rxrpc_retransmit(call, txb->seq,
+ ktime_to_ns(ktime_sub(txb->last_sent,
+ max_age)));
+
+ if (list_is_last(&txb->call_link, &call->tx_buffer))
+ goto no_further_resend;
+ txb = list_next_entry(txb, call_link);
+ }
+ }
+
+ /* Fast-forward through the Tx queue to the point the peer says it has
+ * seen. Anything between the soft-ACK table and that point will get
+ * ACK'd or NACK'd in due course, so don't worry about it here; here we
+ * need to consider retransmitting anything beyond that point.
+ *
+ * Note that ACK for a packet can beat the update of tx_transmitted.
+ */
+ if (after_eq(READ_ONCE(call->acks_prev_seq), READ_ONCE(call->tx_transmitted)))
+ goto no_further_resend;
+
+ list_for_each_entry_from(txb, &call->tx_buffer, call_link) {
+ if (before_eq(txb->seq, READ_ONCE(call->acks_prev_seq)))
+ continue;
+ if (after(txb->seq, READ_ONCE(call->tx_transmitted)))
+ break; /* Not transmitted yet */
+
+ if (ack && ack->reason == RXRPC_ACK_PING_RESPONSE &&
+ before(ntohl(txb->wire.serial), ntohl(ack->serial)))
+ goto do_resend; /* Wasn't accounted for by a more recent ping. */
+
+ if (ktime_after(txb->last_sent, max_age)) {
+ if (ktime_before(txb->last_sent, oldest))
+ oldest = txb->last_sent;
+ continue;
}
- /* Okay, we need to retransmit a packet. */
- call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation;
- retrans++;
- trace_rxrpc_retransmit(call, seq, annotation | anno_type,
- ktime_to_ns(ktime_sub(skb->tstamp, max_age)));
+ do_resend:
+ unacked = true;
+ if (list_empty(&txb->tx_link)) {
+ rxrpc_get_txbuf(txb, rxrpc_txbuf_get_retrans);
+ list_add_tail(&txb->tx_link, &retrans_queue);
+ set_bit(RXRPC_TXBUF_RESENT, &txb->flags);
+ rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans);
+ }
}
+no_further_resend:
+ spin_unlock(&call->tx_lock);
+no_resend:
+ rxrpc_free_skb(ack_skb, rxrpc_skb_freed);
+
resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest)));
- resend_at += jiffies + rxrpc_get_rto_backoff(call->peer, retrans);
+ resend_at += jiffies + rxrpc_get_rto_backoff(call->peer,
+ !list_empty(&retrans_queue));
WRITE_ONCE(call->resend_at, resend_at);
if (unacked)
@@ -221,62 +266,28 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
* that an ACK got lost somewhere. Send a ping to find out instead of
* retransmitting data.
*/
- if (!retrans) {
+ if (list_empty(&retrans_queue)) {
rxrpc_reduce_call_timer(call, resend_at, now_j,
rxrpc_timer_set_for_resend);
- spin_unlock_bh(&call->lock);
ack_ts = ktime_sub(now, call->acks_latest_ts);
if (ktime_to_us(ack_ts) < (call->peer->srtt_us >> 3))
goto out;
- rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, false,
- rxrpc_propose_ack_ping_for_lost_ack);
- rxrpc_send_ack_packet(call, true, NULL);
+ rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
+ rxrpc_propose_ack_ping_for_lost_ack);
goto out;
}
- /* Now go through the Tx window and perform the retransmissions. We
- * have to drop the lock for each send. If an ACK comes in whilst the
- * lock is dropped, it may clear some of the retransmission markers for
- * packets that it soft-ACKs.
- */
- for (seq = cursor + 1; before_eq(seq, top); seq++) {
- ix = seq & RXRPC_RXTX_BUFF_MASK;
- annotation = call->rxtx_annotations[ix];
- anno_type = annotation & RXRPC_TX_ANNO_MASK;
- if (anno_type != RXRPC_TX_ANNO_RETRANS)
- continue;
-
- /* We need to reset the retransmission state, but we need to do
- * so before we drop the lock as a new ACK/NAK may come in and
- * confuse things
- */
- annotation &= ~RXRPC_TX_ANNO_MASK;
- annotation |= RXRPC_TX_ANNO_UNACK | RXRPC_TX_ANNO_RESENT;
- call->rxtx_annotations[ix] = annotation;
-
- skb = call->rxtx_buffer[ix];
- if (!skb)
- continue;
-
- rxrpc_get_skb(skb, rxrpc_skb_got);
- spin_unlock_bh(&call->lock);
-
- if (rxrpc_send_data_packet(call, skb, true) < 0) {
- rxrpc_free_skb(skb, rxrpc_skb_freed);
- return;
- }
+ while ((txb = list_first_entry_or_null(&retrans_queue,
+ struct rxrpc_txbuf, tx_link))) {
+ list_del_init(&txb->tx_link);
+ rxrpc_send_data_packet(call, txb);
+ rxrpc_put_txbuf(txb, rxrpc_txbuf_put_trans);
- if (rxrpc_is_client_call(call))
- rxrpc_expose_client_call(call);
-
- rxrpc_free_skb(skb, rxrpc_skb_freed);
- spin_lock_bh(&call->lock);
- if (after(call->tx_hard_ack, seq))
- seq = call->tx_hard_ack;
+ trace_rxrpc_retransmit(call, txb->seq,
+ ktime_to_ns(ktime_sub(txb->last_sent,
+ max_age)));
}
-out_unlock:
- spin_unlock_bh(&call->lock);
out:
_leave("");
}
@@ -288,9 +299,9 @@ void rxrpc_process_call(struct work_struct *work)
{
struct rxrpc_call *call =
container_of(work, struct rxrpc_call, processor);
- rxrpc_serial_t *send_ack;
unsigned long now, next, t;
unsigned int iterations = 0;
+ rxrpc_serial_t ackr_serial;
rxrpc_see_call(call);
@@ -309,6 +320,9 @@ recheck_state:
goto recheck_state;
}
+ if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom)
+ rxrpc_shrink_call_tx_buffer(call);
+
if (call->state == RXRPC_CALL_COMPLETE) {
rxrpc_delete_call_timer(call);
goto out_put;
@@ -335,11 +349,13 @@ recheck_state:
set_bit(RXRPC_CALL_EV_EXPIRED, &call->events);
}
- t = READ_ONCE(call->ack_at);
+ t = READ_ONCE(call->delay_ack_at);
if (time_after_eq(now, t)) {
trace_rxrpc_timer(call, rxrpc_timer_exp_ack, now);
- cmpxchg(&call->ack_at, t, now + MAX_JIFFY_OFFSET);
- set_bit(RXRPC_CALL_EV_ACK, &call->events);
+ cmpxchg(&call->delay_ack_at, t, now + MAX_JIFFY_OFFSET);
+ ackr_serial = xchg(&call->ackr_serial, 0);
+ rxrpc_send_ACK(call, RXRPC_ACK_DELAY, ackr_serial,
+ rxrpc_propose_ack_ping_for_lost_ack);
}
t = READ_ONCE(call->ack_lost_at);
@@ -353,16 +369,16 @@ recheck_state:
if (time_after_eq(now, t)) {
trace_rxrpc_timer(call, rxrpc_timer_exp_keepalive, now);
cmpxchg(&call->keepalive_at, t, now + MAX_JIFFY_OFFSET);
- rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, true,
- rxrpc_propose_ack_ping_for_keepalive);
- set_bit(RXRPC_CALL_EV_PING, &call->events);
+ rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
+ rxrpc_propose_ack_ping_for_keepalive);
}
t = READ_ONCE(call->ping_at);
if (time_after_eq(now, t)) {
trace_rxrpc_timer(call, rxrpc_timer_exp_ping, now);
cmpxchg(&call->ping_at, t, now + MAX_JIFFY_OFFSET);
- set_bit(RXRPC_CALL_EV_PING, &call->events);
+ rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
+ rxrpc_propose_ack_ping_for_keepalive);
}
t = READ_ONCE(call->resend_at);
@@ -385,25 +401,10 @@ recheck_state:
goto recheck_state;
}
- send_ack = NULL;
if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) {
call->acks_lost_top = call->tx_top;
- rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, false,
- rxrpc_propose_ack_ping_for_lost_ack);
- send_ack = &call->acks_lost_ping;
- }
-
- if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) ||
- send_ack) {
- if (call->ackr_reason) {
- rxrpc_send_ack_packet(call, false, send_ack);
- goto recheck_state;
- }
- }
-
- if (test_and_clear_bit(RXRPC_CALL_EV_PING, &call->events)) {
- rxrpc_send_ack_packet(call, true, NULL);
- goto recheck_state;
+ rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
+ rxrpc_propose_ack_ping_for_lost_ack);
}
if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events) &&
@@ -419,7 +420,7 @@ recheck_state:
set(call->expect_req_by);
set(call->expect_term_by);
- set(call->ack_at);
+ set(call->delay_ack_at);
set(call->ack_lost_at);
set(call->resend_at);
set(call->keepalive_at);
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 6401cdf7a624..1befe22cd301 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -52,7 +52,7 @@ static void rxrpc_call_timer_expired(struct timer_list *t)
_enter("%d", call->debug_id);
if (call->state < RXRPC_CALL_COMPLETE) {
- trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies);
+ trace_rxrpc_timer_expired(call, jiffies);
__rxrpc_queue_call(call);
} else {
rxrpc_put_call(call, rxrpc_call_put);
@@ -129,16 +129,6 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
if (!call)
return NULL;
- call->rxtx_buffer = kcalloc(RXRPC_RXTX_BUFF_SIZE,
- sizeof(struct sk_buff *),
- gfp);
- if (!call->rxtx_buffer)
- goto nomem;
-
- call->rxtx_annotations = kcalloc(RXRPC_RXTX_BUFF_SIZE, sizeof(u8), gfp);
- if (!call->rxtx_annotations)
- goto nomem_2;
-
mutex_init(&call->user_mutex);
/* Prevent lockdep reporting a deadlock false positive between the afs
@@ -155,37 +145,39 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
INIT_LIST_HEAD(&call->accept_link);
INIT_LIST_HEAD(&call->recvmsg_link);
INIT_LIST_HEAD(&call->sock_link);
+ INIT_LIST_HEAD(&call->tx_buffer);
+ skb_queue_head_init(&call->recvmsg_queue);
+ skb_queue_head_init(&call->rx_oos_queue);
init_waitqueue_head(&call->waitq);
- spin_lock_init(&call->lock);
spin_lock_init(&call->notify_lock);
+ spin_lock_init(&call->tx_lock);
spin_lock_init(&call->input_lock);
+ spin_lock_init(&call->acks_ack_lock);
rwlock_init(&call->state_lock);
refcount_set(&call->ref, 1);
call->debug_id = debug_id;
call->tx_total_len = -1;
call->next_rx_timo = 20 * HZ;
call->next_req_timo = 1 * HZ;
+ atomic64_set(&call->ackr_window, 0x100000001ULL);
memset(&call->sock_node, 0xed, sizeof(call->sock_node));
- /* Leave space in the ring to handle a maxed-out jumbo packet */
call->rx_winsize = rxrpc_rx_window_size;
call->tx_winsize = 16;
- call->rx_expect_next = 1;
- call->cong_cwnd = 2;
- call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1;
+ if (RXRPC_TX_SMSS > 2190)
+ call->cong_cwnd = 2;
+ else if (RXRPC_TX_SMSS > 1095)
+ call->cong_cwnd = 3;
+ else
+ call->cong_cwnd = 4;
+ call->cong_ssthresh = RXRPC_TX_MAX_WINDOW;
call->rxnet = rxnet;
call->rtt_avail = RXRPC_CALL_RTT_AVAIL_MASK;
atomic_inc(&rxnet->nr_calls);
return call;
-
-nomem_2:
- kfree(call->rxtx_buffer);
-nomem:
- kmem_cache_free(rxrpc_call_jar, call);
- return NULL;
}
/*
@@ -206,7 +198,6 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
return ERR_PTR(-ENOMEM);
call->state = RXRPC_CALL_CLIENT_AWAIT_CONN;
call->service_id = srx->srx_service;
- call->tx_phase = true;
now = ktime_get_real();
call->acks_latest_ts = now;
call->cong_tstamp = now;
@@ -223,7 +214,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call)
unsigned long now = jiffies;
unsigned long j = now + MAX_JIFFY_OFFSET;
- call->ack_at = j;
+ call->delay_ack_at = j;
call->ack_lost_at = j;
call->resend_at = j;
call->ping_at = j;
@@ -510,16 +501,12 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
}
/*
- * Clean up the RxTx skb ring.
+ * Clean up the Rx skb ring.
*/
static void rxrpc_cleanup_ring(struct rxrpc_call *call)
{
- int i;
-
- for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) {
- rxrpc_free_skb(call->rxtx_buffer[i], rxrpc_skb_cleaned);
- call->rxtx_buffer[i] = NULL;
- }
+ skb_queue_purge(&call->recvmsg_queue);
+ skb_queue_purge(&call->rx_oos_queue);
}
/*
@@ -539,10 +526,8 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
- spin_lock_bh(&call->lock);
if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags))
BUG();
- spin_unlock_bh(&call->lock);
rxrpc_put_call_slot(call);
rxrpc_delete_call_timer(call);
@@ -656,8 +641,6 @@ static void rxrpc_destroy_call(struct work_struct *work)
rxrpc_put_connection(call->conn);
rxrpc_put_peer(call->peer);
- kfree(call->rxtx_buffer);
- kfree(call->rxtx_annotations);
kmem_cache_free(rxrpc_call_jar, call);
if (atomic_dec_and_test(&rxnet->nr_calls))
wake_up_var(&rxnet->nr_calls);
@@ -684,6 +667,8 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu)
*/
void rxrpc_cleanup_call(struct rxrpc_call *call)
{
+ struct rxrpc_txbuf *txb;
+
_net("DESTROY CALL %d", call->debug_id);
memset(&call->sock_node, 0xcd, sizeof(call->sock_node));
@@ -692,7 +677,13 @@ void rxrpc_cleanup_call(struct rxrpc_call *call)
ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags));
rxrpc_cleanup_ring(call);
- rxrpc_free_skb(call->tx_pending, rxrpc_skb_cleaned);
+ while ((txb = list_first_entry_or_null(&call->tx_buffer,
+ struct rxrpc_txbuf, call_link))) {
+ list_del(&txb->call_link);
+ rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned);
+ }
+ rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned);
+ rxrpc_free_skb(call->acks_soft_tbl, rxrpc_skb_cleaned);
call_rcu(&call->rcu, rxrpc_rcu_destroy_call);
}
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 3c9eeb5b750c..f11c97e28d2a 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -40,6 +40,8 @@ __read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ;
DEFINE_IDR(rxrpc_client_conn_ids);
static DEFINE_SPINLOCK(rxrpc_conn_id_lock);
+static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle);
+
/*
* Get a connection ID and epoch for a client connection from the global pool.
* The connection struct pointer is then recorded in the idr radix tree. The
@@ -123,6 +125,7 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_conn_parameters *cp,
bundle->params = *cp;
rxrpc_get_peer(bundle->params.peer);
refcount_set(&bundle->ref, 1);
+ atomic_set(&bundle->active, 1);
spin_lock_init(&bundle->channel_lock);
INIT_LIST_HEAD(&bundle->waiting_calls);
}
@@ -149,7 +152,7 @@ void rxrpc_put_bundle(struct rxrpc_bundle *bundle)
dead = __refcount_dec_and_test(&bundle->ref, &r);
- _debug("PUT B=%x %d", d, r);
+ _debug("PUT B=%x %d", d, r - 1);
if (dead)
rxrpc_free_bundle(bundle);
}
@@ -338,6 +341,7 @@ found_bundle_free:
rxrpc_free_bundle(candidate);
found_bundle:
rxrpc_get_bundle(bundle);
+ atomic_inc(&bundle->active);
spin_unlock(&local->client_bundles_lock);
_leave(" = %u [found]", bundle->debug_id);
return bundle;
@@ -363,7 +367,8 @@ static struct rxrpc_bundle *rxrpc_prep_call(struct rxrpc_sock *rx,
if (!cp->peer)
goto error;
- call->cong_cwnd = cp->peer->cong_cwnd;
+ call->tx_last_sent = ktime_get_real();
+ call->cong_ssthresh = cp->peer->cong_ssthresh;
if (call->cong_cwnd >= call->cong_ssthresh)
call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
else
@@ -435,6 +440,7 @@ static void rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, gfp_t gfp)
if (old)
trace_rxrpc_client(old, -1, rxrpc_client_replace);
candidate->bundle_shift = shift;
+ atomic_inc(&bundle->active);
bundle->conns[i] = candidate;
for (j = 0; j < RXRPC_MAXCALLS; j++)
set_bit(shift + j, &bundle->avail_chans);
@@ -725,6 +731,7 @@ granted_channel:
smp_rmb();
out_put_bundle:
+ rxrpc_deactivate_bundle(bundle);
rxrpc_put_bundle(bundle);
out:
_leave(" = %d", ret);
@@ -900,9 +907,8 @@ out:
static void rxrpc_unbundle_conn(struct rxrpc_connection *conn)
{
struct rxrpc_bundle *bundle = conn->bundle;
- struct rxrpc_local *local = bundle->params.local;
unsigned int bindex;
- bool need_drop = false, need_put = false;
+ bool need_drop = false;
int i;
_enter("C=%x", conn->debug_id);
@@ -921,15 +927,22 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn)
}
spin_unlock(&bundle->channel_lock);
- /* If there are no more connections, remove the bundle */
- if (!bundle->avail_chans) {
- _debug("maybe unbundle");
- spin_lock(&local->client_bundles_lock);
+ if (need_drop) {
+ rxrpc_deactivate_bundle(bundle);
+ rxrpc_put_connection(conn);
+ }
+}
- for (i = 0; i < ARRAY_SIZE(bundle->conns); i++)
- if (bundle->conns[i])
- break;
- if (i == ARRAY_SIZE(bundle->conns) && !bundle->params.exclusive) {
+/*
+ * Drop the active count on a bundle.
+ */
+static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle)
+{
+ struct rxrpc_local *local = bundle->params.local;
+ bool need_put = false;
+
+ if (atomic_dec_and_lock(&bundle->active, &local->client_bundles_lock)) {
+ if (!bundle->params.exclusive) {
_debug("erase bundle");
rb_erase(&bundle->local_node, &local->client_bundles);
need_put = true;
@@ -939,10 +952,6 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn)
if (need_put)
rxrpc_put_bundle(bundle);
}
-
- if (need_drop)
- rxrpc_put_connection(conn);
- _leave("");
}
/*
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 22089e37e97f..156bd26daf74 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -175,7 +175,7 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn,
trace_rxrpc_disconnect_call(call);
switch (call->completion) {
case RXRPC_CALL_SUCCEEDED:
- chan->last_seq = call->rx_hard_ack;
+ chan->last_seq = call->rx_highest_seq;
chan->last_type = RXRPC_PACKET_TYPE_ACK;
break;
case RXRPC_CALL_LOCALLY_ABORTED:
@@ -207,7 +207,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
{
struct rxrpc_connection *conn = call->conn;
- call->peer->cong_cwnd = call->cong_cwnd;
+ call->peer->cong_ssthresh = call->cong_ssthresh;
if (!hlist_unhashed(&call->error_link)) {
spin_lock_bh(&call->peer->lock);
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 721d847ba92b..bdf70b81addc 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -7,20 +7,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
-#include <linux/net.h>
-#include <linux/skbuff.h>
-#include <linux/errqueue.h>
-#include <linux/udp.h>
-#include <linux/in.h>
-#include <linux/in6.h>
-#include <linux/icmp.h>
-#include <linux/gfp.h>
-#include <net/sock.h>
-#include <net/af_rxrpc.h>
-#include <net/ip.h>
-#include <net/udp.h>
-#include <net/net_namespace.h>
#include "ar-internal.h"
static void rxrpc_proto_abort(const char *why,
@@ -46,7 +32,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
bool resend = false;
summary->flight_size =
- (call->tx_top - call->tx_hard_ack) - summary->nr_acks;
+ (call->tx_top - call->acks_hard_ack) - summary->nr_acks;
if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) {
summary->retrans_timeo = true;
@@ -72,9 +58,28 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
summary->cumulative_acks = cumulative_acks;
summary->dup_acks = call->cong_dup_acks;
+ /* If we haven't transmitted anything for >1RTT, we should reset the
+ * congestion management state.
+ */
+ if ((call->cong_mode == RXRPC_CALL_SLOW_START ||
+ call->cong_mode == RXRPC_CALL_CONGEST_AVOIDANCE) &&
+ ktime_before(ktime_add_us(call->tx_last_sent,
+ call->peer->srtt_us >> 3),
+ ktime_get_real())
+ ) {
+ change = rxrpc_cong_idle_reset;
+ summary->mode = RXRPC_CALL_SLOW_START;
+ if (RXRPC_TX_SMSS > 2190)
+ summary->cwnd = 2;
+ else if (RXRPC_TX_SMSS > 1095)
+ summary->cwnd = 3;
+ else
+ summary->cwnd = 4;
+ }
+
switch (call->cong_mode) {
case RXRPC_CALL_SLOW_START:
- if (summary->nr_nacks > 0)
+ if (summary->saw_nacks)
goto packet_loss_detected;
if (summary->cumulative_acks > 0)
cwnd += 1;
@@ -85,7 +90,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
goto out;
case RXRPC_CALL_CONGEST_AVOIDANCE:
- if (summary->nr_nacks > 0)
+ if (summary->saw_nacks)
goto packet_loss_detected;
/* We analyse the number of packets that get ACK'd per RTT
@@ -104,7 +109,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
goto out;
case RXRPC_CALL_PACKET_LOSS:
- if (summary->nr_nacks == 0)
+ if (!summary->saw_nacks)
goto resume_normality;
if (summary->new_low_nack) {
@@ -142,7 +147,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
} else {
change = rxrpc_cong_progress;
cwnd = call->cong_ssthresh;
- if (summary->nr_nacks == 0)
+ if (!summary->saw_nacks)
goto resume_normality;
}
goto out;
@@ -164,8 +169,8 @@ resume_normality:
out:
cumulative_acks = 0;
out_no_clear_ca:
- if (cwnd >= RXRPC_RXTX_BUFF_SIZE - 1)
- cwnd = RXRPC_RXTX_BUFF_SIZE - 1;
+ if (cwnd >= RXRPC_TX_MAX_WINDOW)
+ cwnd = RXRPC_TX_MAX_WINDOW;
call->cong_cwnd = cwnd;
call->cong_cumul_acks = cumulative_acks;
trace_rxrpc_congest(call, summary, acked_serial, change);
@@ -183,9 +188,8 @@ send_extra_data:
/* Send some previously unsent DATA if we have some to advance the ACK
* state.
*/
- if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] &
- RXRPC_TX_ANNO_LAST ||
- summary->nr_acks != call->tx_top - call->tx_hard_ack) {
+ if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) ||
+ summary->nr_acks != call->tx_top - call->acks_hard_ack) {
call->cong_extra++;
wake_up(&call->waitq);
}
@@ -198,53 +202,39 @@ send_extra_data:
static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
struct rxrpc_ack_summary *summary)
{
- struct sk_buff *skb, *list = NULL;
+ struct rxrpc_txbuf *txb;
bool rot_last = false;
- int ix;
- u8 annotation;
-
- if (call->acks_lowest_nak == call->tx_hard_ack) {
- call->acks_lowest_nak = to;
- } else if (before_eq(call->acks_lowest_nak, to)) {
- summary->new_low_nack = true;
- call->acks_lowest_nak = to;
- }
-
- spin_lock(&call->lock);
-
- while (before(call->tx_hard_ack, to)) {
- call->tx_hard_ack++;
- ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK;
- skb = call->rxtx_buffer[ix];
- annotation = call->rxtx_annotations[ix];
- rxrpc_see_skb(skb, rxrpc_skb_rotated);
- call->rxtx_buffer[ix] = NULL;
- call->rxtx_annotations[ix] = 0;
- skb->next = list;
- list = skb;
- if (annotation & RXRPC_TX_ANNO_LAST) {
+ list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) {
+ if (before_eq(txb->seq, call->acks_hard_ack))
+ continue;
+ summary->nr_rot_new_acks++;
+ if (test_bit(RXRPC_TXBUF_LAST, &txb->flags)) {
set_bit(RXRPC_CALL_TX_LAST, &call->flags);
rot_last = true;
}
- if ((annotation & RXRPC_TX_ANNO_MASK) != RXRPC_TX_ANNO_ACK)
- summary->nr_rot_new_acks++;
+ if (txb->seq == to)
+ break;
}
- spin_unlock(&call->lock);
+ if (rot_last)
+ set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags);
- trace_rxrpc_transmit(call, (rot_last ?
- rxrpc_transmit_rotate_last :
- rxrpc_transmit_rotate));
- wake_up(&call->waitq);
+ _enter("%x,%x,%x,%d", to, call->acks_hard_ack, call->tx_top, rot_last);
- while (list) {
- skb = list;
- list = skb->next;
- skb_mark_not_on_list(skb);
- rxrpc_free_skb(skb, rxrpc_skb_freed);
+ if (call->acks_lowest_nak == call->acks_hard_ack) {
+ call->acks_lowest_nak = to;
+ } else if (after(to, call->acks_lowest_nak)) {
+ summary->new_low_nack = true;
+ call->acks_lowest_nak = to;
}
+ smp_store_release(&call->acks_hard_ack, to);
+
+ trace_rxrpc_txqueue(call, (rot_last ?
+ rxrpc_txqueue_rotate_last :
+ rxrpc_txqueue_rotate));
+ wake_up(&call->waitq);
return rot_last;
}
@@ -284,9 +274,9 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun,
write_unlock(&call->state_lock);
if (state == RXRPC_CALL_CLIENT_AWAIT_REPLY)
- trace_rxrpc_transmit(call, rxrpc_transmit_await_reply);
+ trace_rxrpc_txqueue(call, rxrpc_txqueue_await_reply);
else
- trace_rxrpc_transmit(call, rxrpc_transmit_end);
+ trace_rxrpc_txqueue(call, rxrpc_txqueue_end);
_leave(" = ok");
return true;
@@ -307,13 +297,10 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call)
rxrpc_seq_t top = READ_ONCE(call->tx_top);
if (call->ackr_reason) {
- spin_lock_bh(&call->lock);
- call->ackr_reason = 0;
- spin_unlock_bh(&call->lock);
now = jiffies;
timo = now + MAX_JIFFY_OFFSET;
WRITE_ONCE(call->resend_at, timo);
- WRITE_ONCE(call->ack_at, timo);
+ WRITE_ONCE(call->delay_ack_at, timo);
trace_rxrpc_timer(call, rxrpc_timer_init_for_reply, now);
}
@@ -323,85 +310,230 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call)
return false;
}
}
- if (!rxrpc_end_tx_phase(call, true, "ETD"))
- return false;
- call->tx_phase = false;
- return true;
+ return rxrpc_end_tx_phase(call, true, "ETD");
+}
+
+static void rxrpc_input_update_ack_window(struct rxrpc_call *call,
+ rxrpc_seq_t window, rxrpc_seq_t wtop)
+{
+ atomic64_set_release(&call->ackr_window, ((u64)wtop) << 32 | window);
}
/*
- * Scan a data packet to validate its structure and to work out how many
- * subpackets it contains.
- *
- * A jumbo packet is a collection of consecutive packets glued together with
- * little headers between that indicate how to change the initial header for
- * each subpacket.
- *
- * RXRPC_JUMBO_PACKET must be set on all but the last subpacket - and all but
- * the last are RXRPC_JUMBO_DATALEN in size. The last subpacket may be of any
- * size.
+ * Push a DATA packet onto the Rx queue.
*/
-static bool rxrpc_validate_data(struct sk_buff *skb)
+static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb,
+ rxrpc_seq_t window, rxrpc_seq_t wtop,
+ enum rxrpc_receive_trace why)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- unsigned int offset = sizeof(struct rxrpc_wire_header);
- unsigned int len = skb->len;
- u8 flags = sp->hdr.flags;
+ bool last = sp->hdr.flags & RXRPC_LAST_PACKET;
- for (;;) {
- if (flags & RXRPC_REQUEST_ACK)
- __set_bit(sp->nr_subpackets, sp->rx_req_ack);
- sp->nr_subpackets++;
+ __skb_queue_tail(&call->recvmsg_queue, skb);
+ rxrpc_input_update_ack_window(call, window, wtop);
- if (!(flags & RXRPC_JUMBO_PACKET))
- break;
+ trace_rxrpc_receive(call, last ? why + 1 : why, sp->hdr.serial, sp->hdr.seq);
+}
- if (len - offset < RXRPC_JUMBO_SUBPKTLEN)
- goto protocol_error;
- if (flags & RXRPC_LAST_PACKET)
- goto protocol_error;
- offset += RXRPC_JUMBO_DATALEN;
- if (skb_copy_bits(skb, offset, &flags, 1) < 0)
- goto protocol_error;
- offset += sizeof(struct rxrpc_jumbo_header);
+/*
+ * Process a DATA packet.
+ */
+static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct sk_buff *oos;
+ rxrpc_serial_t serial = sp->hdr.serial;
+ u64 win = atomic64_read(&call->ackr_window);
+ rxrpc_seq_t window = lower_32_bits(win);
+ rxrpc_seq_t wtop = upper_32_bits(win);
+ rxrpc_seq_t wlimit = window + call->rx_winsize - 1;
+ rxrpc_seq_t seq = sp->hdr.seq;
+ bool last = sp->hdr.flags & RXRPC_LAST_PACKET;
+ int ack_reason = -1;
+
+ rxrpc_inc_stat(call->rxnet, stat_rx_data);
+ if (sp->hdr.flags & RXRPC_REQUEST_ACK)
+ rxrpc_inc_stat(call->rxnet, stat_rx_data_reqack);
+ if (sp->hdr.flags & RXRPC_JUMBO_PACKET)
+ rxrpc_inc_stat(call->rxnet, stat_rx_data_jumbo);
+
+ if (last) {
+ if (test_and_set_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
+ seq + 1 != wtop) {
+ rxrpc_proto_abort("LSN", call, seq);
+ goto err_free;
+ }
+ } else {
+ if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
+ after_eq(seq, wtop)) {
+ pr_warn("Packet beyond last: c=%x q=%x window=%x-%x wlimit=%x\n",
+ call->debug_id, seq, window, wtop, wlimit);
+ rxrpc_proto_abort("LSA", call, seq);
+ goto err_free;
+ }
}
- if (flags & RXRPC_LAST_PACKET)
- sp->rx_flags |= RXRPC_SKB_INCL_LAST;
- return true;
+ if (after(seq, call->rx_highest_seq))
+ call->rx_highest_seq = seq;
-protocol_error:
- return false;
+ trace_rxrpc_rx_data(call->debug_id, seq, serial, sp->hdr.flags);
+
+ if (before(seq, window)) {
+ ack_reason = RXRPC_ACK_DUPLICATE;
+ goto send_ack;
+ }
+ if (after(seq, wlimit)) {
+ ack_reason = RXRPC_ACK_EXCEEDS_WINDOW;
+ goto send_ack;
+ }
+
+ /* Queue the packet. */
+ if (seq == window) {
+ rxrpc_seq_t reset_from;
+ bool reset_sack = false;
+
+ if (sp->hdr.flags & RXRPC_REQUEST_ACK)
+ ack_reason = RXRPC_ACK_REQUESTED;
+ /* Send an immediate ACK if we fill in a hole */
+ else if (!skb_queue_empty(&call->rx_oos_queue))
+ ack_reason = RXRPC_ACK_DELAY;
+
+ window++;
+ if (after(window, wtop))
+ wtop = window;
+
+ spin_lock(&call->recvmsg_queue.lock);
+ rxrpc_input_queue_data(call, skb, window, wtop, rxrpc_receive_queue);
+ skb = NULL;
+
+ while ((oos = skb_peek(&call->rx_oos_queue))) {
+ struct rxrpc_skb_priv *osp = rxrpc_skb(oos);
+
+ if (after(osp->hdr.seq, window))
+ break;
+
+ __skb_unlink(oos, &call->rx_oos_queue);
+ last = osp->hdr.flags & RXRPC_LAST_PACKET;
+ seq = osp->hdr.seq;
+ if (!reset_sack) {
+ reset_from = seq;
+ reset_sack = true;
+ }
+
+ window++;
+ rxrpc_input_queue_data(call, oos, window, wtop,
+ rxrpc_receive_queue_oos);
+ }
+
+ spin_unlock(&call->recvmsg_queue.lock);
+
+ if (reset_sack) {
+ do {
+ call->ackr_sack_table[reset_from % RXRPC_SACK_SIZE] = 0;
+ } while (reset_from++, before(reset_from, window));
+ }
+ } else {
+ bool keep = false;
+
+ ack_reason = RXRPC_ACK_OUT_OF_SEQUENCE;
+
+ if (!call->ackr_sack_table[seq % RXRPC_SACK_SIZE]) {
+ call->ackr_sack_table[seq % RXRPC_SACK_SIZE] = 1;
+ keep = 1;
+ }
+
+ if (after(seq + 1, wtop)) {
+ wtop = seq + 1;
+ rxrpc_input_update_ack_window(call, window, wtop);
+ }
+
+ if (!keep) {
+ ack_reason = RXRPC_ACK_DUPLICATE;
+ goto send_ack;
+ }
+
+ skb_queue_walk(&call->rx_oos_queue, oos) {
+ struct rxrpc_skb_priv *osp = rxrpc_skb(oos);
+
+ if (after(osp->hdr.seq, seq)) {
+ __skb_queue_before(&call->rx_oos_queue, oos, skb);
+ goto oos_queued;
+ }
+ }
+
+ __skb_queue_tail(&call->rx_oos_queue, skb);
+ oos_queued:
+ trace_rxrpc_receive(call, last ? rxrpc_receive_oos_last : rxrpc_receive_oos,
+ sp->hdr.serial, sp->hdr.seq);
+ skb = NULL;
+ }
+
+send_ack:
+ if (ack_reason < 0 &&
+ atomic_inc_return(&call->ackr_nr_unacked) > 2 &&
+ test_and_set_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags)) {
+ ack_reason = RXRPC_ACK_IDLE;
+ } else if (ack_reason >= 0) {
+ set_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags);
+ }
+
+ if (ack_reason >= 0)
+ rxrpc_send_ACK(call, ack_reason, serial,
+ rxrpc_propose_ack_input_data);
+ else
+ rxrpc_propose_delay_ACK(call, serial,
+ rxrpc_propose_ack_input_data);
+
+err_free:
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
}
/*
- * Handle reception of a duplicate packet.
- *
- * We have to take care to avoid an attack here whereby we're given a series of
- * jumbograms, each with a sequence number one before the preceding one and
- * filled up to maximum UDP size. If they never send us the first packet in
- * the sequence, they can cause us to have to hold on to around 2MiB of kernel
- * space until the call times out.
- *
- * We limit the space usage by only accepting three duplicate jumbo packets per
- * call. After that, we tell the other side we're no longer accepting jumbos
- * (that information is encoded in the ACK packet).
+ * Split a jumbo packet and file the bits separately.
*/
-static void rxrpc_input_dup_data(struct rxrpc_call *call, rxrpc_seq_t seq,
- bool is_jumbo, bool *_jumbo_bad)
+static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb)
{
- /* Discard normal packets that are duplicates. */
- if (is_jumbo)
- return;
+ struct rxrpc_jumbo_header jhdr;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb), *jsp;
+ struct sk_buff *jskb;
+ unsigned int offset = sizeof(struct rxrpc_wire_header);
+ unsigned int len = skb->len - offset;
- /* Skip jumbo subpackets that are duplicates. When we've had three or
- * more partially duplicate jumbo packets, we refuse to take any more
- * jumbos for this call.
- */
- if (!*_jumbo_bad) {
- call->nr_jumbo_bad++;
- *_jumbo_bad = true;
+ while (sp->hdr.flags & RXRPC_JUMBO_PACKET) {
+ if (len < RXRPC_JUMBO_SUBPKTLEN)
+ goto protocol_error;
+ if (sp->hdr.flags & RXRPC_LAST_PACKET)
+ goto protocol_error;
+ if (skb_copy_bits(skb, offset + RXRPC_JUMBO_DATALEN,
+ &jhdr, sizeof(jhdr)) < 0)
+ goto protocol_error;
+
+ jskb = skb_clone(skb, GFP_ATOMIC);
+ if (!jskb) {
+ kdebug("couldn't clone");
+ return false;
+ }
+ rxrpc_new_skb(jskb, rxrpc_skb_cloned_jumbo);
+ jsp = rxrpc_skb(jskb);
+ jsp->offset = offset;
+ jsp->len = RXRPC_JUMBO_DATALEN;
+ rxrpc_input_data_one(call, jskb);
+
+ sp->hdr.flags = jhdr.flags;
+ sp->hdr._rsvd = ntohs(jhdr._rsvd);
+ sp->hdr.seq++;
+ sp->hdr.serial++;
+ offset += RXRPC_JUMBO_SUBPKTLEN;
+ len -= RXRPC_JUMBO_SUBPKTLEN;
}
+
+ sp->offset = offset;
+ sp->len = len;
+ rxrpc_input_data_one(call, skb);
+ return true;
+
+protocol_error:
+ return false;
}
/*
@@ -412,17 +544,15 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
enum rxrpc_call_state state;
- unsigned int j, nr_subpackets, nr_unacked = 0;
- rxrpc_serial_t serial = sp->hdr.serial, ack_serial = serial;
- rxrpc_seq_t seq0 = sp->hdr.seq, hard_ack;
- bool immediate_ack = false, jumbo_bad = false;
- u8 ack = 0;
+ rxrpc_serial_t serial = sp->hdr.serial;
+ rxrpc_seq_t seq0 = sp->hdr.seq;
- _enter("{%u,%u},{%u,%u}",
- call->rx_hard_ack, call->rx_top, skb->len, seq0);
+ _enter("{%llx,%x},{%u,%x}",
+ atomic64_read(&call->ackr_window), call->rx_highest_seq,
+ skb->len, seq0);
- _proto("Rx DATA %%%u { #%u f=%02x n=%u }",
- sp->hdr.serial, seq0, sp->hdr.flags, sp->nr_subpackets);
+ _proto("Rx DATA %%%u { #%u f=%02x }",
+ sp->hdr.serial, seq0, sp->hdr.flags);
state = READ_ONCE(call->state);
if (state >= RXRPC_CALL_COMPLETE) {
@@ -430,6 +560,24 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb)
return;
}
+ /* Unshare the packet so that it can be modified for in-place
+ * decryption.
+ */
+ if (sp->hdr.securityIndex != 0) {
+ struct sk_buff *nskb = skb_unshare(skb, GFP_ATOMIC);
+ if (!nskb) {
+ rxrpc_eaten_skb(skb, rxrpc_skb_unshared_nomem);
+ return;
+ }
+
+ if (nskb != skb) {
+ rxrpc_eaten_skb(skb, rxrpc_skb_received);
+ skb = nskb;
+ rxrpc_new_skb(skb, rxrpc_skb_unshared);
+ sp = rxrpc_skb(skb);
+ }
+ }
+
if (state == RXRPC_CALL_SERVER_RECV_REQUEST) {
unsigned long timo = READ_ONCE(call->next_req_timo);
unsigned long now, expect_req_by;
@@ -451,166 +599,18 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb)
if ((state == RXRPC_CALL_CLIENT_SEND_REQUEST ||
state == RXRPC_CALL_CLIENT_AWAIT_REPLY) &&
!rxrpc_receiving_reply(call))
- goto unlock;
-
- hard_ack = READ_ONCE(call->rx_hard_ack);
-
- nr_subpackets = sp->nr_subpackets;
- if (nr_subpackets > 1) {
- if (call->nr_jumbo_bad > 3) {
- ack = RXRPC_ACK_NOSPACE;
- ack_serial = serial;
- goto ack;
- }
- }
-
- for (j = 0; j < nr_subpackets; j++) {
- rxrpc_serial_t serial = sp->hdr.serial + j;
- rxrpc_seq_t seq = seq0 + j;
- unsigned int ix = seq & RXRPC_RXTX_BUFF_MASK;
- bool terminal = (j == nr_subpackets - 1);
- bool last = terminal && (sp->rx_flags & RXRPC_SKB_INCL_LAST);
- u8 flags, annotation = j;
-
- _proto("Rx DATA+%u %%%u { #%x t=%u l=%u }",
- j, serial, seq, terminal, last);
-
- if (last) {
- if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
- seq != call->rx_top) {
- rxrpc_proto_abort("LSN", call, seq);
- goto unlock;
- }
- } else {
- if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
- after_eq(seq, call->rx_top)) {
- rxrpc_proto_abort("LSA", call, seq);
- goto unlock;
- }
- }
-
- flags = 0;
- if (last)
- flags |= RXRPC_LAST_PACKET;
- if (!terminal)
- flags |= RXRPC_JUMBO_PACKET;
- if (test_bit(j, sp->rx_req_ack))
- flags |= RXRPC_REQUEST_ACK;
- trace_rxrpc_rx_data(call->debug_id, seq, serial, flags, annotation);
-
- if (before_eq(seq, hard_ack)) {
- ack = RXRPC_ACK_DUPLICATE;
- ack_serial = serial;
- continue;
- }
-
- if (call->rxtx_buffer[ix]) {
- rxrpc_input_dup_data(call, seq, nr_subpackets > 1,
- &jumbo_bad);
- if (ack != RXRPC_ACK_DUPLICATE) {
- ack = RXRPC_ACK_DUPLICATE;
- ack_serial = serial;
- }
- immediate_ack = true;
- continue;
- }
-
- if (after(seq, hard_ack + call->rx_winsize)) {
- ack = RXRPC_ACK_EXCEEDS_WINDOW;
- ack_serial = serial;
- if (flags & RXRPC_JUMBO_PACKET) {
- if (!jumbo_bad) {
- call->nr_jumbo_bad++;
- jumbo_bad = true;
- }
- }
-
- goto ack;
- }
-
- if (flags & RXRPC_REQUEST_ACK && !ack) {
- ack = RXRPC_ACK_REQUESTED;
- ack_serial = serial;
- }
-
- if (after(seq0, call->ackr_highest_seq))
- call->ackr_highest_seq = seq0;
-
- /* Queue the packet. We use a couple of memory barriers here as need
- * to make sure that rx_top is perceived to be set after the buffer
- * pointer and that the buffer pointer is set after the annotation and
- * the skb data.
- *
- * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window()
- * and also rxrpc_fill_out_ack().
- */
- if (!terminal)
- rxrpc_get_skb(skb, rxrpc_skb_got);
- call->rxtx_annotations[ix] = annotation;
- smp_wmb();
- call->rxtx_buffer[ix] = skb;
- if (after(seq, call->rx_top)) {
- smp_store_release(&call->rx_top, seq);
- } else if (before(seq, call->rx_top)) {
- /* Send an immediate ACK if we fill in a hole */
- if (!ack) {
- ack = RXRPC_ACK_DELAY;
- ack_serial = serial;
- }
- immediate_ack = true;
- }
-
- if (terminal) {
- /* From this point on, we're not allowed to touch the
- * packet any longer as its ref now belongs to the Rx
- * ring.
- */
- skb = NULL;
- sp = NULL;
- }
-
- nr_unacked++;
-
- if (last) {
- set_bit(RXRPC_CALL_RX_LAST, &call->flags);
- if (!ack) {
- ack = RXRPC_ACK_DELAY;
- ack_serial = serial;
- }
- trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq);
- } else {
- trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq);
- }
+ goto out;
- if (after_eq(seq, call->rx_expect_next)) {
- if (after(seq, call->rx_expect_next)) {
- _net("OOS %u > %u", seq, call->rx_expect_next);
- ack = RXRPC_ACK_OUT_OF_SEQUENCE;
- ack_serial = serial;
- }
- call->rx_expect_next = seq + 1;
- }
- if (!ack)
- ack_serial = serial;
+ if (!rxrpc_input_split_jumbo(call, skb)) {
+ rxrpc_proto_abort("VLD", call, sp->hdr.seq);
+ goto out;
}
+ skb = NULL;
-ack:
- if (atomic_add_return(nr_unacked, &call->ackr_nr_unacked) > 2 && !ack)
- ack = RXRPC_ACK_IDLE;
-
- if (ack)
- rxrpc_propose_ACK(call, ack, ack_serial,
- immediate_ack, true,
- rxrpc_propose_ack_input_data);
- else
- rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial,
- false, true,
- rxrpc_propose_ack_input_data);
-
+out:
trace_rxrpc_notify_socket(call->debug_id, serial);
rxrpc_notify_socket(call);
-unlock:
spin_unlock(&call->input_lock);
rxrpc_free_skb(skb, rxrpc_skb_freed);
_leave(" [queued]");
@@ -679,31 +679,8 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call,
*/
static void rxrpc_input_check_for_lost_ack(struct rxrpc_call *call)
{
- rxrpc_seq_t top, bottom, seq;
- bool resend = false;
-
- spin_lock_bh(&call->lock);
-
- bottom = call->tx_hard_ack + 1;
- top = call->acks_lost_top;
- if (before(bottom, top)) {
- for (seq = bottom; before_eq(seq, top); seq++) {
- int ix = seq & RXRPC_RXTX_BUFF_MASK;
- u8 annotation = call->rxtx_annotations[ix];
- u8 anno_type = annotation & RXRPC_TX_ANNO_MASK;
-
- if (anno_type != RXRPC_TX_ANNO_UNACK)
- continue;
- annotation &= ~RXRPC_TX_ANNO_MASK;
- annotation |= RXRPC_TX_ANNO_RETRANS;
- call->rxtx_annotations[ix] = annotation;
- resend = true;
- }
- }
-
- spin_unlock_bh(&call->lock);
-
- if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
+ if (after(call->acks_lost_top, call->acks_prev_seq) &&
+ !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
rxrpc_queue_call(call);
}
@@ -736,8 +713,8 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU),
rwind, ntohl(ackinfo->jumbo_max));
- if (rwind > RXRPC_RXTX_BUFF_SIZE - 1)
- rwind = RXRPC_RXTX_BUFF_SIZE - 1;
+ if (rwind > RXRPC_TX_MAX_WINDOW)
+ rwind = RXRPC_TX_MAX_WINDOW;
if (call->tx_winsize != rwind) {
if (rwind > call->tx_winsize)
wake = true;
@@ -776,40 +753,19 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks,
rxrpc_seq_t seq, int nr_acks,
struct rxrpc_ack_summary *summary)
{
- int ix;
- u8 annotation, anno_type;
-
- for (; nr_acks > 0; nr_acks--, seq++) {
- ix = seq & RXRPC_RXTX_BUFF_MASK;
- annotation = call->rxtx_annotations[ix];
- anno_type = annotation & RXRPC_TX_ANNO_MASK;
- annotation &= ~RXRPC_TX_ANNO_MASK;
- switch (*acks++) {
- case RXRPC_ACK_TYPE_ACK:
+ unsigned int i;
+
+ for (i = 0; i < nr_acks; i++) {
+ if (acks[i] == RXRPC_ACK_TYPE_ACK) {
summary->nr_acks++;
- if (anno_type == RXRPC_TX_ANNO_ACK)
- continue;
summary->nr_new_acks++;
- call->rxtx_annotations[ix] =
- RXRPC_TX_ANNO_ACK | annotation;
- break;
- case RXRPC_ACK_TYPE_NACK:
- if (!summary->nr_nacks &&
- call->acks_lowest_nak != seq) {
- call->acks_lowest_nak = seq;
+ } else {
+ if (!summary->saw_nacks &&
+ call->acks_lowest_nak != seq + i) {
+ call->acks_lowest_nak = seq + i;
summary->new_low_nack = true;
}
- summary->nr_nacks++;
- if (anno_type == RXRPC_TX_ANNO_NAK)
- continue;
- summary->nr_new_nacks++;
- if (anno_type == RXRPC_TX_ANNO_RETRANS)
- continue;
- call->rxtx_annotations[ix] =
- RXRPC_TX_ANNO_NAK | annotation;
- break;
- default:
- return rxrpc_proto_abort("SFT", call, 0);
+ summary->saw_nacks = true;
}
}
}
@@ -851,12 +807,10 @@ static bool rxrpc_is_ack_valid(struct rxrpc_call *call,
static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
{
struct rxrpc_ack_summary summary = { 0 };
+ struct rxrpc_ackpacket ack;
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- union {
- struct rxrpc_ackpacket ack;
- struct rxrpc_ackinfo info;
- u8 acks[RXRPC_MAXACKS];
- } buf;
+ struct rxrpc_ackinfo info;
+ struct sk_buff *skb_old = NULL, *skb_put = skb;
rxrpc_serial_t ack_serial, acked_serial;
rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt;
int nr_acks, offset, ioffset;
@@ -864,29 +818,28 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
_enter("");
offset = sizeof(struct rxrpc_wire_header);
- if (skb_copy_bits(skb, offset, &buf.ack, sizeof(buf.ack)) < 0) {
- _debug("extraction failure");
- return rxrpc_proto_abort("XAK", call, 0);
+ if (skb_copy_bits(skb, offset, &ack, sizeof(ack)) < 0) {
+ rxrpc_proto_abort("XAK", call, 0);
+ goto out_not_locked;
}
- offset += sizeof(buf.ack);
+ offset += sizeof(ack);
ack_serial = sp->hdr.serial;
- acked_serial = ntohl(buf.ack.serial);
- first_soft_ack = ntohl(buf.ack.firstPacket);
- prev_pkt = ntohl(buf.ack.previousPacket);
+ acked_serial = ntohl(ack.serial);
+ first_soft_ack = ntohl(ack.firstPacket);
+ prev_pkt = ntohl(ack.previousPacket);
hard_ack = first_soft_ack - 1;
- nr_acks = buf.ack.nAcks;
- summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ?
- buf.ack.reason : RXRPC_ACK__INVALID);
+ nr_acks = ack.nAcks;
+ summary.ack_reason = (ack.reason < RXRPC_ACK__INVALID ?
+ ack.reason : RXRPC_ACK__INVALID);
trace_rxrpc_rx_ack(call, ack_serial, acked_serial,
first_soft_ack, prev_pkt,
summary.ack_reason, nr_acks);
+ rxrpc_inc_stat(call->rxnet, stat_rx_acks[ack.reason]);
- switch (buf.ack.reason) {
+ switch (ack.reason) {
case RXRPC_ACK_PING_RESPONSE:
- rxrpc_input_ping_response(call, skb->tstamp, acked_serial,
- ack_serial);
rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial,
rxrpc_rtt_rx_ping_response);
break;
@@ -901,22 +854,20 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
break;
}
- if (buf.ack.reason == RXRPC_ACK_PING) {
+ if (ack.reason == RXRPC_ACK_PING) {
_proto("Rx ACK %%%u PING Request", ack_serial);
- rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE,
- ack_serial, true, true,
- rxrpc_propose_ack_respond_to_ping);
+ rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial,
+ rxrpc_propose_ack_respond_to_ping);
} else if (sp->hdr.flags & RXRPC_REQUEST_ACK) {
- rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED,
- ack_serial, true, true,
- rxrpc_propose_ack_respond_to_ack);
+ rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, ack_serial,
+ rxrpc_propose_ack_respond_to_ack);
}
/* If we get an EXCEEDS_WINDOW ACK from the server, it probably
* indicates that the client address changed due to NAT. The server
* lost the call because it switched to a different peer.
*/
- if (unlikely(buf.ack.reason == RXRPC_ACK_EXCEEDS_WINDOW) &&
+ if (unlikely(ack.reason == RXRPC_ACK_EXCEEDS_WINDOW) &&
first_soft_ack == 1 &&
prev_pkt == 0 &&
rxrpc_is_client_call(call)) {
@@ -929,10 +880,10 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
* indicate a change of address. However, we can retransmit the call
* if we still have it buffered to the beginning.
*/
- if (unlikely(buf.ack.reason == RXRPC_ACK_OUT_OF_SEQUENCE) &&
+ if (unlikely(ack.reason == RXRPC_ACK_OUT_OF_SEQUENCE) &&
first_soft_ack == 1 &&
prev_pkt == 0 &&
- call->tx_hard_ack == 0 &&
+ call->acks_hard_ack == 0 &&
rxrpc_is_client_call(call)) {
rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
0, -ENETRESET);
@@ -944,14 +895,19 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial,
first_soft_ack, call->acks_first_seq,
prev_pkt, call->acks_prev_seq);
- return;
+ goto out_not_locked;
}
- buf.info.rxMTU = 0;
+ info.rxMTU = 0;
ioffset = offset + nr_acks + 3;
- if (skb->len >= ioffset + sizeof(buf.info) &&
- skb_copy_bits(skb, ioffset, &buf.info, sizeof(buf.info)) < 0)
- return rxrpc_proto_abort("XAI", call, 0);
+ if (skb->len >= ioffset + sizeof(info) &&
+ skb_copy_bits(skb, ioffset, &info, sizeof(info)) < 0) {
+ rxrpc_proto_abort("XAI", call, 0);
+ goto out_not_locked;
+ }
+
+ if (nr_acks > 0)
+ skb_condense(skb);
spin_lock(&call->input_lock);
@@ -967,9 +923,22 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
call->acks_first_seq = first_soft_ack;
call->acks_prev_seq = prev_pkt;
+ switch (ack.reason) {
+ case RXRPC_ACK_PING:
+ break;
+ case RXRPC_ACK_PING_RESPONSE:
+ rxrpc_input_ping_response(call, skb->tstamp, acked_serial,
+ ack_serial);
+ fallthrough;
+ default:
+ if (after(acked_serial, call->acks_highest_serial))
+ call->acks_highest_serial = acked_serial;
+ break;
+ }
+
/* Parse rwind and mtu sizes if provided. */
- if (buf.info.rxMTU)
- rxrpc_input_ackinfo(call, skb, &buf.info);
+ if (info.rxMTU)
+ rxrpc_input_ackinfo(call, skb, &info);
if (first_soft_ack == 0) {
rxrpc_proto_abort("AK0", call, 0);
@@ -987,7 +956,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
goto out;
}
- if (before(hard_ack, call->tx_hard_ack) ||
+ if (before(hard_ack, call->acks_hard_ack) ||
after(hard_ack, call->tx_top)) {
rxrpc_proto_abort("AKW", call, 0);
goto out;
@@ -997,7 +966,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
goto out;
}
- if (after(hard_ack, call->tx_hard_ack)) {
+ if (after(hard_ack, call->acks_hard_ack)) {
if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) {
rxrpc_end_tx_phase(call, false, "ETA");
goto out;
@@ -1005,25 +974,38 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
}
if (nr_acks > 0) {
- if (skb_copy_bits(skb, offset, buf.acks, nr_acks) < 0) {
+ if (offset > (int)skb->len - nr_acks) {
rxrpc_proto_abort("XSA", call, 0);
goto out;
}
- rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks,
- &summary);
+
+ spin_lock(&call->acks_ack_lock);
+ skb_old = call->acks_soft_tbl;
+ call->acks_soft_tbl = skb;
+ spin_unlock(&call->acks_ack_lock);
+
+ rxrpc_input_soft_acks(call, skb->data + offset, first_soft_ack,
+ nr_acks, &summary);
+ skb_put = NULL;
+ } else if (call->acks_soft_tbl) {
+ spin_lock(&call->acks_ack_lock);
+ skb_old = call->acks_soft_tbl;
+ call->acks_soft_tbl = NULL;
+ spin_unlock(&call->acks_ack_lock);
}
- if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] &
- RXRPC_TX_ANNO_LAST &&
+ if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) &&
summary.nr_acks == call->tx_top - hard_ack &&
rxrpc_is_client_call(call))
- rxrpc_propose_ACK(call, RXRPC_ACK_PING, ack_serial,
- false, true,
- rxrpc_propose_ack_ping_for_lost_reply);
+ rxrpc_propose_ping(call, ack_serial,
+ rxrpc_propose_ack_ping_for_lost_reply);
rxrpc_congestion_management(call, skb, &summary, acked_serial);
out:
spin_unlock(&call->input_lock);
+out_not_locked:
+ rxrpc_free_skb(skb_put, rxrpc_skb_freed);
+ rxrpc_free_skb(skb_old, rxrpc_skb_freed);
}
/*
@@ -1096,7 +1078,7 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
case RXRPC_PACKET_TYPE_ACK:
rxrpc_input_ack(call, skb);
- break;
+ goto no_free;
case RXRPC_PACKET_TYPE_BUSY:
_proto("Rx BUSY %%%u", sp->hdr.serial);
@@ -1307,8 +1289,6 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
if (sp->hdr.callNumber == 0 ||
sp->hdr.seq == 0)
goto bad_message;
- if (!rxrpc_validate_data(skb))
- goto bad_message;
/* Unshare the packet so that it can be modified for in-place
* decryption.
@@ -1422,7 +1402,7 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
trace_rxrpc_rx_data(chan->call_debug_id,
sp->hdr.seq,
sp->hdr.serial,
- sp->hdr.flags, 0);
+ sp->hdr.flags);
rxrpc_post_packet_to_conn(conn, skb);
goto out;
}
diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c
index 9aae99d67833..0eb8471bfc53 100644
--- a/net/rxrpc/insecure.c
+++ b/net/rxrpc/insecure.c
@@ -25,16 +25,16 @@ static int none_how_much_data(struct rxrpc_call *call, size_t remain,
return 0;
}
-static int none_secure_packet(struct rxrpc_call *call, struct sk_buff *skb,
- size_t data_size)
+static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
{
return 0;
}
-static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
- unsigned int offset, unsigned int len,
- rxrpc_seq_t seq, u16 expected_cksum)
+static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb)
{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ sp->flags |= RXRPC_RX_VERIFIED;
return 0;
}
@@ -42,11 +42,6 @@ static void none_free_call_crypto(struct rxrpc_call *call)
{
}
-static void none_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
- unsigned int *_offset, unsigned int *_len)
-{
-}
-
static int none_respond_to_challenge(struct rxrpc_connection *conn,
struct sk_buff *skb,
u32 *_abort_code)
@@ -95,7 +90,6 @@ const struct rxrpc_security rxrpc_no_security = {
.how_much_data = none_how_much_data,
.secure_packet = none_secure_packet,
.verify_packet = none_verify_packet,
- .locate_data = none_locate_data,
.respond_to_challenge = none_respond_to_challenge,
.verify_response = none_verify_response,
.clear = none_clear,
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 38ea98ff426b..a943fdf91e24 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -24,6 +24,20 @@ static void rxrpc_local_processor(struct work_struct *);
static void rxrpc_local_rcu(struct rcu_head *);
/*
+ * Handle an ICMP/ICMP6 error turning up at the tunnel. Push it through the
+ * usual mechanism so that it gets parsed and presented through the UDP
+ * socket's error_report().
+ */
+static void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, int err,
+ __be16 port, u32 info, u8 *payload)
+{
+ if (ip_hdr(skb)->version == IPVERSION)
+ return ip_icmp_error(sk, skb, err, port, info, payload);
+ if (IS_ENABLED(CONFIG_AF_RXRPC_IPV6))
+ return ipv6_icmp_error(sk, skb, err, port, info, payload);
+}
+
+/*
* Compare a local to an address. Return -ve, 0 or +ve to indicate less than,
* same or greater than.
*
@@ -84,6 +98,8 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet,
local->rxnet = rxnet;
INIT_HLIST_NODE(&local->link);
INIT_WORK(&local->processor, rxrpc_local_processor);
+ INIT_LIST_HEAD(&local->ack_tx_queue);
+ spin_lock_init(&local->ack_tx_lock);
init_rwsem(&local->defrag_sem);
skb_queue_head_init(&local->reject_queue);
skb_queue_head_init(&local->event_queue);
@@ -419,6 +435,11 @@ static void rxrpc_local_processor(struct work_struct *work)
break;
}
+ if (!list_empty(&local->ack_tx_queue)) {
+ rxrpc_transmit_ack_packets(local);
+ again = true;
+ }
+
if (!skb_queue_empty(&local->reject_queue)) {
rxrpc_reject_packets(local);
again = true;
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index d4144fd86f84..056c428d8bf3 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -17,12 +17,6 @@
unsigned int rxrpc_max_backlog __read_mostly = 10;
/*
- * How long to wait before scheduling ACK generation after seeing a
- * packet with RXRPC_REQUEST_ACK set (in jiffies).
- */
-unsigned long rxrpc_requested_ack_delay = 1;
-
-/*
* How long to wait before scheduling an ACK with subtype DELAY (in jiffies).
*
* We use this when we've received new data packets. If those packets aren't
@@ -46,10 +40,7 @@ unsigned long rxrpc_idle_ack_delay = HZ / 2;
* limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further
* packets.
*/
-unsigned int rxrpc_rx_window_size = RXRPC_INIT_RX_WINDOW_SIZE;
-#if (RXRPC_RXTX_BUFF_SIZE - 1) < RXRPC_INIT_RX_WINDOW_SIZE
-#error Need to reduce RXRPC_INIT_RX_WINDOW_SIZE
-#endif
+unsigned int rxrpc_rx_window_size = 255;
/*
* Maximum Rx MTU size. This indicates to the sender the size of jumbo packet
@@ -62,15 +53,3 @@ unsigned int rxrpc_rx_mtu = 5692;
* sender that we're willing to handle.
*/
unsigned int rxrpc_rx_jumbo_max = 4;
-
-const s8 rxrpc_ack_priority[] = {
- [0] = 0,
- [RXRPC_ACK_DELAY] = 1,
- [RXRPC_ACK_REQUESTED] = 2,
- [RXRPC_ACK_IDLE] = 3,
- [RXRPC_ACK_DUPLICATE] = 4,
- [RXRPC_ACK_OUT_OF_SEQUENCE] = 5,
- [RXRPC_ACK_EXCEEDS_WINDOW] = 6,
- [RXRPC_ACK_NOSPACE] = 7,
- [RXRPC_ACK_PING_RESPONSE] = 8,
-};
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index bb4c25d6df64..84242c0e467c 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -101,6 +101,8 @@ static __net_init int rxrpc_init_net(struct net *net)
proc_create_net("locals", 0444, rxnet->proc_net,
&rxrpc_local_seq_ops,
sizeof(struct seq_net_private));
+ proc_create_net_single_write("stats", S_IFREG | 0644, rxnet->proc_net,
+ rxrpc_stats_show, rxrpc_stats_clear, NULL);
return 0;
err_proc:
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 9683617db704..c5eed0e83e47 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -13,15 +13,27 @@
#include <linux/export.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
+#include <net/udp.h>
#include "ar-internal.h"
-struct rxrpc_ack_buffer {
- struct rxrpc_wire_header whdr;
- struct rxrpc_ackpacket ack;
- u8 acks[255];
- u8 pad[3];
- struct rxrpc_ackinfo ackinfo;
-};
+extern int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
+
+static ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len)
+{
+ struct sockaddr *sa = msg->msg_name;
+ struct sock *sk = socket->sk;
+
+ if (IS_ENABLED(CONFIG_AF_RXRPC_IPV6)) {
+ if (sa->sa_family == AF_INET6) {
+ if (sk->sk_family != AF_INET6) {
+ pr_warn("AF_INET6 address on AF_INET socket\n");
+ return -ENOPROTOOPT;
+ }
+ return udpv6_sendmsg(sk, msg, len);
+ }
+ }
+ return udp_sendmsg(sk, msg, len);
+}
struct rxrpc_abort_buffer {
struct rxrpc_wire_header whdr;
@@ -68,66 +80,83 @@ static void rxrpc_set_keepalive(struct rxrpc_call *call)
*/
static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn,
struct rxrpc_call *call,
- struct rxrpc_ack_buffer *pkt,
- rxrpc_seq_t *_hard_ack,
- rxrpc_seq_t *_top,
- u8 reason)
+ struct rxrpc_txbuf *txb)
{
- rxrpc_serial_t serial;
- unsigned int tmp;
- rxrpc_seq_t hard_ack, top, seq;
- int ix;
+ struct rxrpc_ackinfo ackinfo;
+ unsigned int qsize;
+ rxrpc_seq_t window, wtop, wrap_point, ix, first;
+ int rsize;
+ u64 wtmp;
u32 mtu, jmax;
- u8 *ackp = pkt->acks;
+ u8 *ackp = txb->acks;
+ u8 sack_buffer[sizeof(call->ackr_sack_table)] __aligned(8);
- tmp = atomic_xchg(&call->ackr_nr_unacked, 0);
- tmp |= atomic_xchg(&call->ackr_nr_consumed, 0);
- if (!tmp && (reason == RXRPC_ACK_DELAY ||
- reason == RXRPC_ACK_IDLE))
- return 0;
+ atomic_set(&call->ackr_nr_unacked, 0);
+ atomic_set(&call->ackr_nr_consumed, 0);
+ rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill);
/* Barrier against rxrpc_input_data(). */
- serial = call->ackr_serial;
- hard_ack = READ_ONCE(call->rx_hard_ack);
- top = smp_load_acquire(&call->rx_top);
- *_hard_ack = hard_ack;
- *_top = top;
-
- pkt->ack.bufferSpace = htons(8);
- pkt->ack.maxSkew = htons(0);
- pkt->ack.firstPacket = htonl(hard_ack + 1);
- pkt->ack.previousPacket = htonl(call->ackr_highest_seq);
- pkt->ack.serial = htonl(serial);
- pkt->ack.reason = reason;
- pkt->ack.nAcks = top - hard_ack;
-
- if (reason == RXRPC_ACK_PING)
- pkt->whdr.flags |= RXRPC_REQUEST_ACK;
-
- if (after(top, hard_ack)) {
- seq = hard_ack + 1;
- do {
- ix = seq & RXRPC_RXTX_BUFF_MASK;
- if (call->rxtx_buffer[ix])
- *ackp++ = RXRPC_ACK_TYPE_ACK;
- else
- *ackp++ = RXRPC_ACK_TYPE_NACK;
- seq++;
- } while (before_eq(seq, top));
+retry:
+ wtmp = atomic64_read_acquire(&call->ackr_window);
+ window = lower_32_bits(wtmp);
+ wtop = upper_32_bits(wtmp);
+ txb->ack.firstPacket = htonl(window);
+ txb->ack.nAcks = 0;
+
+ if (after(wtop, window)) {
+ /* Try to copy the SACK ring locklessly. We can use the copy,
+ * only if the now-current top of the window didn't go past the
+ * previously read base - otherwise we can't know whether we
+ * have old data or new data.
+ */
+ memcpy(sack_buffer, call->ackr_sack_table, sizeof(sack_buffer));
+ wrap_point = window + RXRPC_SACK_SIZE - 1;
+ wtmp = atomic64_read_acquire(&call->ackr_window);
+ window = lower_32_bits(wtmp);
+ wtop = upper_32_bits(wtmp);
+ if (after(wtop, wrap_point)) {
+ cond_resched();
+ goto retry;
+ }
+
+ /* The buffer is maintained as a ring with an invariant mapping
+ * between bit position and sequence number, so we'll probably
+ * need to rotate it.
+ */
+ txb->ack.nAcks = wtop - window;
+ ix = window % RXRPC_SACK_SIZE;
+ first = sizeof(sack_buffer) - ix;
+
+ if (ix + txb->ack.nAcks <= RXRPC_SACK_SIZE) {
+ memcpy(txb->acks, sack_buffer + ix, txb->ack.nAcks);
+ } else {
+ memcpy(txb->acks, sack_buffer + ix, first);
+ memcpy(txb->acks + first, sack_buffer,
+ txb->ack.nAcks - first);
+ }
+
+ ackp += txb->ack.nAcks;
+ } else if (before(wtop, window)) {
+ pr_warn("ack window backward %x %x", window, wtop);
+ } else if (txb->ack.reason == RXRPC_ACK_DELAY) {
+ txb->ack.reason = RXRPC_ACK_IDLE;
}
mtu = conn->params.peer->if_mtu;
mtu -= conn->params.peer->hdrsize;
- jmax = (call->nr_jumbo_bad > 3) ? 1 : rxrpc_rx_jumbo_max;
- pkt->ackinfo.rxMTU = htonl(rxrpc_rx_mtu);
- pkt->ackinfo.maxMTU = htonl(mtu);
- pkt->ackinfo.rwind = htonl(call->rx_winsize);
- pkt->ackinfo.jumbo_max = htonl(jmax);
+ jmax = rxrpc_rx_jumbo_max;
+ qsize = (window - 1) - call->rx_consumed;
+ rsize = max_t(int, call->rx_winsize - qsize, 0);
+ ackinfo.rxMTU = htonl(rxrpc_rx_mtu);
+ ackinfo.maxMTU = htonl(mtu);
+ ackinfo.rwind = htonl(rsize);
+ ackinfo.jumbo_max = htonl(jmax);
*ackp++ = 0;
*ackp++ = 0;
*ackp++ = 0;
- return top - hard_ack + 3;
+ memcpy(ackp, &ackinfo, sizeof(ackinfo));
+ return txb->ack.nAcks + 3 + sizeof(ackinfo);
}
/*
@@ -176,26 +205,19 @@ static void rxrpc_cancel_rtt_probe(struct rxrpc_call *call,
/*
* Send an ACK call packet.
*/
-int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
- rxrpc_serial_t *_serial)
+static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf *txb)
{
struct rxrpc_connection *conn;
- struct rxrpc_ack_buffer *pkt;
+ struct rxrpc_call *call = txb->call;
struct msghdr msg;
- struct kvec iov[2];
+ struct kvec iov[1];
rxrpc_serial_t serial;
- rxrpc_seq_t hard_ack, top;
size_t len, n;
int ret, rtt_slot = -1;
- u8 reason;
if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
return -ECONNRESET;
- pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
- if (!pkt)
- return -ENOMEM;
-
conn = call->conn;
msg.msg_name = &call->peer->srx.transport;
@@ -204,83 +226,97 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
msg.msg_controllen = 0;
msg.msg_flags = 0;
- pkt->whdr.epoch = htonl(conn->proto.epoch);
- pkt->whdr.cid = htonl(call->cid);
- pkt->whdr.callNumber = htonl(call->call_id);
- pkt->whdr.seq = 0;
- pkt->whdr.type = RXRPC_PACKET_TYPE_ACK;
- pkt->whdr.flags = RXRPC_SLOW_START_OK | conn->out_clientflag;
- pkt->whdr.userStatus = 0;
- pkt->whdr.securityIndex = call->security_ix;
- pkt->whdr._rsvd = 0;
- pkt->whdr.serviceId = htons(call->service_id);
-
- spin_lock_bh(&call->lock);
- if (ping) {
- reason = RXRPC_ACK_PING;
- } else {
- reason = call->ackr_reason;
- if (!call->ackr_reason) {
- spin_unlock_bh(&call->lock);
- ret = 0;
- goto out;
- }
- call->ackr_reason = 0;
- }
- n = rxrpc_fill_out_ack(conn, call, pkt, &hard_ack, &top, reason);
+ if (txb->ack.reason == RXRPC_ACK_PING)
+ txb->wire.flags |= RXRPC_REQUEST_ACK;
+
+ if (txb->ack.reason == RXRPC_ACK_DELAY)
+ clear_bit(RXRPC_CALL_DELAY_ACK_PENDING, &call->flags);
+ if (txb->ack.reason == RXRPC_ACK_IDLE)
+ clear_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags);
- spin_unlock_bh(&call->lock);
- if (n == 0) {
- kfree(pkt);
+ n = rxrpc_fill_out_ack(conn, call, txb);
+ if (n == 0)
return 0;
- }
- iov[0].iov_base = pkt;
- iov[0].iov_len = sizeof(pkt->whdr) + sizeof(pkt->ack) + n;
- iov[1].iov_base = &pkt->ackinfo;
- iov[1].iov_len = sizeof(pkt->ackinfo);
- len = iov[0].iov_len + iov[1].iov_len;
+ iov[0].iov_base = &txb->wire;
+ iov[0].iov_len = sizeof(txb->wire) + sizeof(txb->ack) + n;
+ len = iov[0].iov_len;
serial = atomic_inc_return(&conn->serial);
- pkt->whdr.serial = htonl(serial);
+ txb->wire.serial = htonl(serial);
trace_rxrpc_tx_ack(call->debug_id, serial,
- ntohl(pkt->ack.firstPacket),
- ntohl(pkt->ack.serial),
- pkt->ack.reason, pkt->ack.nAcks);
- if (_serial)
- *_serial = serial;
+ ntohl(txb->ack.firstPacket),
+ ntohl(txb->ack.serial), txb->ack.reason, txb->ack.nAcks);
+ if (txb->ack_why == rxrpc_propose_ack_ping_for_lost_ack)
+ call->acks_lost_ping = serial;
- if (ping)
+ if (txb->ack.reason == RXRPC_ACK_PING)
rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_ping);
- ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
- conn->params.peer->last_tx_at = ktime_get_seconds();
+ rxrpc_inc_stat(call->rxnet, stat_tx_ack_send);
+
+ /* Grab the highest received seq as late as possible */
+ txb->ack.previousPacket = htonl(call->rx_highest_seq);
+
+ iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len);
+ ret = do_udp_sendmsg(conn->params.local->socket, &msg, len);
+ call->peer->last_tx_at = ktime_get_seconds();
if (ret < 0)
trace_rxrpc_tx_fail(call->debug_id, serial, ret,
rxrpc_tx_point_call_ack);
else
- trace_rxrpc_tx_packet(call->debug_id, &pkt->whdr,
+ trace_rxrpc_tx_packet(call->debug_id, &txb->wire,
rxrpc_tx_point_call_ack);
rxrpc_tx_backoff(call, ret);
if (call->state < RXRPC_CALL_COMPLETE) {
- if (ret < 0) {
+ if (ret < 0)
rxrpc_cancel_rtt_probe(call, serial, rtt_slot);
- rxrpc_propose_ACK(call, pkt->ack.reason,
- ntohl(pkt->ack.serial),
- false, true,
- rxrpc_propose_ack_retry_tx);
- }
-
rxrpc_set_keepalive(call);
}
-out:
- kfree(pkt);
return ret;
}
/*
+ * ACK transmitter for a local endpoint. The UDP socket locks around each
+ * transmission, so we can only transmit one packet at a time, ACK, DATA or
+ * otherwise.
+ */
+void rxrpc_transmit_ack_packets(struct rxrpc_local *local)
+{
+ LIST_HEAD(queue);
+ int ret;
+
+ trace_rxrpc_local(local->debug_id, rxrpc_local_tx_ack,
+ refcount_read(&local->ref), NULL);
+
+ if (list_empty(&local->ack_tx_queue))
+ return;
+
+ spin_lock_bh(&local->ack_tx_lock);
+ list_splice_tail_init(&local->ack_tx_queue, &queue);
+ spin_unlock_bh(&local->ack_tx_lock);
+
+ while (!list_empty(&queue)) {
+ struct rxrpc_txbuf *txb =
+ list_entry(queue.next, struct rxrpc_txbuf, tx_link);
+
+ ret = rxrpc_send_ack_packet(local, txb);
+ if (ret < 0 && ret != -ECONNRESET) {
+ spin_lock_bh(&local->ack_tx_lock);
+ list_splice_init(&queue, &local->ack_tx_queue);
+ spin_unlock_bh(&local->ack_tx_lock);
+ break;
+ }
+
+ list_del_init(&txb->tx_link);
+ rxrpc_put_call(txb->call, rxrpc_call_put);
+ rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx);
+ }
+}
+
+/*
* Send an ABORT call packet.
*/
int rxrpc_send_abort_packet(struct rxrpc_call *call)
@@ -299,7 +335,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
* channel instead, thereby closing off this call.
*/
if (rxrpc_is_client_call(call) &&
- test_bit(RXRPC_CALL_TX_LAST, &call->flags))
+ test_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags))
return 0;
if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
@@ -331,8 +367,8 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
serial = atomic_inc_return(&conn->serial);
pkt.whdr.serial = htonl(serial);
- ret = kernel_sendmsg(conn->params.local->socket,
- &msg, iov, 1, sizeof(pkt));
+ iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, sizeof(pkt));
+ ret = do_udp_sendmsg(conn->params.local->socket, &msg, sizeof(pkt));
conn->params.peer->last_tx_at = ktime_get_seconds();
if (ret < 0)
trace_rxrpc_tx_fail(call->debug_id, serial, ret,
@@ -347,19 +383,17 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
/*
* send a packet through the transport endpoint
*/
-int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
- bool retrans)
+int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
{
+ enum rxrpc_req_ack_trace why;
struct rxrpc_connection *conn = call->conn;
- struct rxrpc_wire_header whdr;
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct msghdr msg;
- struct kvec iov[2];
+ struct kvec iov[1];
rxrpc_serial_t serial;
size_t len;
int ret, rtt_slot = -1;
- _enter(",{%d}", skb->len);
+ _enter("%x,{%d}", txb->seq, txb->len);
if (hlist_unhashed(&call->error_link)) {
spin_lock_bh(&call->peer->lock);
@@ -369,28 +403,16 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
/* Each transmission of a Tx packet needs a new serial number */
serial = atomic_inc_return(&conn->serial);
-
- whdr.epoch = htonl(conn->proto.epoch);
- whdr.cid = htonl(call->cid);
- whdr.callNumber = htonl(call->call_id);
- whdr.seq = htonl(sp->hdr.seq);
- whdr.serial = htonl(serial);
- whdr.type = RXRPC_PACKET_TYPE_DATA;
- whdr.flags = sp->hdr.flags;
- whdr.userStatus = 0;
- whdr.securityIndex = call->security_ix;
- whdr._rsvd = htons(sp->hdr._rsvd);
- whdr.serviceId = htons(call->service_id);
+ txb->wire.serial = htonl(serial);
if (test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags) &&
- sp->hdr.seq == 1)
- whdr.userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE;
+ txb->seq == 1)
+ txb->wire.userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE;
- iov[0].iov_base = &whdr;
- iov[0].iov_len = sizeof(whdr);
- iov[1].iov_base = skb->head;
- iov[1].iov_len = skb->len;
- len = iov[0].iov_len + iov[1].iov_len;
+ iov[0].iov_base = &txb->wire;
+ iov[0].iov_len = sizeof(txb->wire) + txb->len;
+ len = iov[0].iov_len;
+ iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len);
msg.msg_name = &call->peer->srx.transport;
msg.msg_namelen = call->peer->srx.transport_len;
@@ -405,41 +427,56 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
* service call, lest OpenAFS incorrectly send us an ACK with some
* soft-ACKs in it and then never follow up with a proper hard ACK.
*/
- if ((!(sp->hdr.flags & RXRPC_LAST_PACKET) ||
- rxrpc_to_server(sp)
- ) &&
- (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) ||
- retrans ||
- call->cong_mode == RXRPC_CALL_SLOW_START ||
- (call->peer->rtt_count < 3 && sp->hdr.seq & 1) ||
- ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
- ktime_get_real())))
- whdr.flags |= RXRPC_REQUEST_ACK;
+ if (txb->wire.flags & RXRPC_REQUEST_ACK)
+ why = rxrpc_reqack_already_on;
+ else if (test_bit(RXRPC_TXBUF_LAST, &txb->flags) && rxrpc_sending_to_client(txb))
+ why = rxrpc_reqack_no_srv_last;
+ else if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events))
+ why = rxrpc_reqack_ack_lost;
+ else if (test_bit(RXRPC_TXBUF_RESENT, &txb->flags))
+ why = rxrpc_reqack_retrans;
+ else if (call->cong_mode == RXRPC_CALL_SLOW_START && call->cong_cwnd <= 2)
+ why = rxrpc_reqack_slow_start;
+ else if (call->tx_winsize <= 2)
+ why = rxrpc_reqack_small_txwin;
+ else if (call->peer->rtt_count < 3 && txb->seq & 1)
+ why = rxrpc_reqack_more_rtt;
+ else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real()))
+ why = rxrpc_reqack_old_rtt;
+ else
+ goto dont_set_request_ack;
+
+ rxrpc_inc_stat(call->rxnet, stat_why_req_ack[why]);
+ trace_rxrpc_req_ack(call->debug_id, txb->seq, why);
+ if (why != rxrpc_reqack_no_srv_last)
+ txb->wire.flags |= RXRPC_REQUEST_ACK;
+dont_set_request_ack:
if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
static int lose;
if ((lose++ & 7) == 7) {
ret = 0;
- trace_rxrpc_tx_data(call, sp->hdr.seq, serial,
- whdr.flags, retrans, true);
+ trace_rxrpc_tx_data(call, txb->seq, serial,
+ txb->wire.flags,
+ test_bit(RXRPC_TXBUF_RESENT, &txb->flags),
+ true);
goto done;
}
}
- trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, retrans,
- false);
+ trace_rxrpc_tx_data(call, txb->seq, serial, txb->wire.flags,
+ test_bit(RXRPC_TXBUF_RESENT, &txb->flags), false);
+ cmpxchg(&call->tx_transmitted, txb->seq - 1, txb->seq);
/* send the packet with the don't fragment bit set if we currently
* think it's small enough */
- if (iov[1].iov_len >= call->peer->maxdata)
+ if (txb->len >= call->peer->maxdata)
goto send_fragmentable;
down_read(&conn->params.local->defrag_sem);
- sp->hdr.serial = serial;
- smp_wmb(); /* Set serial before timestamp */
- skb->tstamp = ktime_get_real();
- if (whdr.flags & RXRPC_REQUEST_ACK)
+ txb->last_sent = ktime_get_real();
+ if (txb->wire.flags & RXRPC_REQUEST_ACK)
rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data);
/* send the packet by UDP
@@ -448,7 +485,8 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
* - in which case, we'll have processed the ICMP error
* message and update the peer record
*/
- ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
+ rxrpc_inc_stat(call->rxnet, stat_tx_data_send);
+ ret = do_udp_sendmsg(conn->params.local->socket, &msg, len);
conn->params.peer->last_tx_at = ktime_get_seconds();
up_read(&conn->params.local->defrag_sem);
@@ -457,7 +495,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
trace_rxrpc_tx_fail(call->debug_id, serial, ret,
rxrpc_tx_point_call_data_nofrag);
} else {
- trace_rxrpc_tx_packet(call->debug_id, &whdr,
+ trace_rxrpc_tx_packet(call->debug_id, &txb->wire,
rxrpc_tx_point_call_data_nofrag);
}
@@ -467,8 +505,9 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
done:
if (ret >= 0) {
- if (whdr.flags & RXRPC_REQUEST_ACK) {
- call->peer->rtt_last_req = skb->tstamp;
+ call->tx_last_sent = txb->last_sent;
+ if (txb->wire.flags & RXRPC_REQUEST_ACK) {
+ call->peer->rtt_last_req = txb->last_sent;
if (call->peer->rtt_count > 1) {
unsigned long nowj = jiffies, ack_lost_at;
@@ -480,7 +519,7 @@ done:
}
}
- if (sp->hdr.seq == 1 &&
+ if (txb->seq == 1 &&
!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER,
&call->flags)) {
unsigned long nowj = jiffies, expect_rx_by;
@@ -512,23 +551,21 @@ send_fragmentable:
down_write(&conn->params.local->defrag_sem);
- sp->hdr.serial = serial;
- smp_wmb(); /* Set serial before timestamp */
- skb->tstamp = ktime_get_real();
- if (whdr.flags & RXRPC_REQUEST_ACK)
+ txb->last_sent = ktime_get_real();
+ if (txb->wire.flags & RXRPC_REQUEST_ACK)
rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data);
switch (conn->params.local->srx.transport.family) {
case AF_INET6:
case AF_INET:
ip_sock_set_mtu_discover(conn->params.local->socket->sk,
- IP_PMTUDISC_DONT);
- ret = kernel_sendmsg(conn->params.local->socket, &msg,
- iov, 2, len);
+ IP_PMTUDISC_DONT);
+ rxrpc_inc_stat(call->rxnet, stat_tx_data_send_frag);
+ ret = do_udp_sendmsg(conn->params.local->socket, &msg, len);
conn->params.peer->last_tx_at = ktime_get_seconds();
ip_sock_set_mtu_discover(conn->params.local->socket->sk,
- IP_PMTUDISC_DO);
+ IP_PMTUDISC_DO);
break;
default:
@@ -540,7 +577,7 @@ send_fragmentable:
trace_rxrpc_tx_fail(call->debug_id, serial, ret,
rxrpc_tx_point_call_data_frag);
} else {
- trace_rxrpc_tx_packet(call->debug_id, &whdr,
+ trace_rxrpc_tx_packet(call->debug_id, &txb->wire,
rxrpc_tx_point_call_data_frag);
}
rxrpc_tx_backoff(call, ret);
@@ -610,8 +647,8 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
whdr.flags ^= RXRPC_CLIENT_INITIATED;
whdr.flags &= RXRPC_CLIENT_INITIATED;
- ret = kernel_sendmsg(local->socket, &msg,
- iov, ioc, size);
+ iov_iter_kvec(&msg.msg_iter, WRITE, iov, ioc, size);
+ ret = do_udp_sendmsg(local->socket, &msg, size);
if (ret < 0)
trace_rxrpc_tx_fail(local->debug_id, 0, ret,
rxrpc_tx_point_reject);
@@ -666,7 +703,8 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer)
_proto("Tx VERSION (keepalive)");
- ret = kernel_sendmsg(peer->local->socket, &msg, iov, 2, len);
+ iov_iter_kvec(&msg.msg_iter, WRITE, iov, 2, len);
+ ret = do_udp_sendmsg(peer->local->socket, &msg, len);
if (ret < 0)
trace_rxrpc_tx_fail(peer->debug_id, 0, ret,
rxrpc_tx_point_version_keepalive);
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index 32561e9567fe..cda3890657a9 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -16,258 +16,13 @@
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include <net/ip.h>
-#include <net/icmp.h>
#include "ar-internal.h"
-static void rxrpc_adjust_mtu(struct rxrpc_peer *, unsigned int);
static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *);
static void rxrpc_distribute_error(struct rxrpc_peer *, int,
enum rxrpc_call_completion);
/*
- * Find the peer associated with an ICMPv4 packet.
- */
-static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local,
- struct sk_buff *skb,
- unsigned int udp_offset,
- unsigned int *info,
- struct sockaddr_rxrpc *srx)
-{
- struct iphdr *ip, *ip0 = ip_hdr(skb);
- struct icmphdr *icmp = icmp_hdr(skb);
- struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset);
-
- _enter("%u,%u,%u", ip0->protocol, icmp->type, icmp->code);
-
- switch (icmp->type) {
- case ICMP_DEST_UNREACH:
- *info = ntohs(icmp->un.frag.mtu);
- fallthrough;
- case ICMP_TIME_EXCEEDED:
- case ICMP_PARAMETERPROB:
- ip = (struct iphdr *)((void *)icmp + 8);
- break;
- default:
- return NULL;
- }
-
- memset(srx, 0, sizeof(*srx));
- srx->transport_type = local->srx.transport_type;
- srx->transport_len = local->srx.transport_len;
- srx->transport.family = local->srx.transport.family;
-
- /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice
- * versa?
- */
- switch (srx->transport.family) {
- case AF_INET:
- srx->transport_len = sizeof(srx->transport.sin);
- srx->transport.family = AF_INET;
- srx->transport.sin.sin_port = udp->dest;
- memcpy(&srx->transport.sin.sin_addr, &ip->daddr,
- sizeof(struct in_addr));
- break;
-
-#ifdef CONFIG_AF_RXRPC_IPV6
- case AF_INET6:
- srx->transport_len = sizeof(srx->transport.sin);
- srx->transport.family = AF_INET;
- srx->transport.sin.sin_port = udp->dest;
- memcpy(&srx->transport.sin.sin_addr, &ip->daddr,
- sizeof(struct in_addr));
- break;
-#endif
-
- default:
- WARN_ON_ONCE(1);
- return NULL;
- }
-
- _net("ICMP {%pISp}", &srx->transport);
- return rxrpc_lookup_peer_rcu(local, srx);
-}
-
-#ifdef CONFIG_AF_RXRPC_IPV6
-/*
- * Find the peer associated with an ICMPv6 packet.
- */
-static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local,
- struct sk_buff *skb,
- unsigned int udp_offset,
- unsigned int *info,
- struct sockaddr_rxrpc *srx)
-{
- struct icmp6hdr *icmp = icmp6_hdr(skb);
- struct ipv6hdr *ip, *ip0 = ipv6_hdr(skb);
- struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset);
-
- _enter("%u,%u,%u", ip0->nexthdr, icmp->icmp6_type, icmp->icmp6_code);
-
- switch (icmp->icmp6_type) {
- case ICMPV6_DEST_UNREACH:
- *info = ntohl(icmp->icmp6_mtu);
- fallthrough;
- case ICMPV6_PKT_TOOBIG:
- case ICMPV6_TIME_EXCEED:
- case ICMPV6_PARAMPROB:
- ip = (struct ipv6hdr *)((void *)icmp + 8);
- break;
- default:
- return NULL;
- }
-
- memset(srx, 0, sizeof(*srx));
- srx->transport_type = local->srx.transport_type;
- srx->transport_len = local->srx.transport_len;
- srx->transport.family = local->srx.transport.family;
-
- /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice
- * versa?
- */
- switch (srx->transport.family) {
- case AF_INET:
- _net("Rx ICMP6 on v4 sock");
- srx->transport_len = sizeof(srx->transport.sin);
- srx->transport.family = AF_INET;
- srx->transport.sin.sin_port = udp->dest;
- memcpy(&srx->transport.sin.sin_addr,
- &ip->daddr.s6_addr32[3], sizeof(struct in_addr));
- break;
- case AF_INET6:
- _net("Rx ICMP6");
- srx->transport.sin.sin_port = udp->dest;
- memcpy(&srx->transport.sin6.sin6_addr, &ip->daddr,
- sizeof(struct in6_addr));
- break;
- default:
- WARN_ON_ONCE(1);
- return NULL;
- }
-
- _net("ICMP {%pISp}", &srx->transport);
- return rxrpc_lookup_peer_rcu(local, srx);
-}
-#endif /* CONFIG_AF_RXRPC_IPV6 */
-
-/*
- * Handle an error received on the local endpoint as a tunnel.
- */
-void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb,
- unsigned int udp_offset)
-{
- struct sock_extended_err ee;
- struct sockaddr_rxrpc srx;
- struct rxrpc_local *local;
- struct rxrpc_peer *peer;
- unsigned int info = 0;
- int err;
- u8 version = ip_hdr(skb)->version;
- u8 type = icmp_hdr(skb)->type;
- u8 code = icmp_hdr(skb)->code;
-
- rcu_read_lock();
- local = rcu_dereference_sk_user_data(sk);
- if (unlikely(!local)) {
- rcu_read_unlock();
- return;
- }
-
- rxrpc_new_skb(skb, rxrpc_skb_received);
-
- switch (ip_hdr(skb)->version) {
- case IPVERSION:
- peer = rxrpc_lookup_peer_icmp_rcu(local, skb, udp_offset,
- &info, &srx);
- break;
-#ifdef CONFIG_AF_RXRPC_IPV6
- case 6:
- peer = rxrpc_lookup_peer_icmp6_rcu(local, skb, udp_offset,
- &info, &srx);
- break;
-#endif
- default:
- rcu_read_unlock();
- return;
- }
-
- if (peer && !rxrpc_get_peer_maybe(peer))
- peer = NULL;
- if (!peer) {
- rcu_read_unlock();
- return;
- }
-
- memset(&ee, 0, sizeof(ee));
-
- switch (version) {
- case IPVERSION:
- switch (type) {
- case ICMP_DEST_UNREACH:
- switch (code) {
- case ICMP_FRAG_NEEDED:
- rxrpc_adjust_mtu(peer, info);
- rcu_read_unlock();
- rxrpc_put_peer(peer);
- return;
- default:
- break;
- }
-
- err = EHOSTUNREACH;
- if (code <= NR_ICMP_UNREACH) {
- /* Might want to do something different with
- * non-fatal errors
- */
- //harderr = icmp_err_convert[code].fatal;
- err = icmp_err_convert[code].errno;
- }
- break;
-
- case ICMP_TIME_EXCEEDED:
- err = EHOSTUNREACH;
- break;
- default:
- err = EPROTO;
- break;
- }
-
- ee.ee_origin = SO_EE_ORIGIN_ICMP;
- ee.ee_type = type;
- ee.ee_code = code;
- ee.ee_errno = err;
- break;
-
-#ifdef CONFIG_AF_RXRPC_IPV6
- case 6:
- switch (type) {
- case ICMPV6_PKT_TOOBIG:
- rxrpc_adjust_mtu(peer, info);
- rcu_read_unlock();
- rxrpc_put_peer(peer);
- return;
- }
-
- icmpv6_err_convert(type, code, &err);
-
- if (err == EACCES)
- err = EHOSTUNREACH;
-
- ee.ee_origin = SO_EE_ORIGIN_ICMP6;
- ee.ee_type = type;
- ee.ee_code = code;
- ee.ee_errno = err;
- break;
-#endif
- }
-
- trace_rxrpc_rx_icmp(peer, &ee, &srx);
-
- rxrpc_distribute_error(peer, err, RXRPC_CALL_NETWORK_ERROR);
- rcu_read_unlock();
- rxrpc_put_peer(peer);
-}
-
-/*
* Find the peer associated with a local error.
*/
static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local,
@@ -283,6 +38,9 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local,
srx->transport_len = local->srx.transport_len;
srx->transport.family = local->srx.transport.family;
+ /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice
+ * versa?
+ */
switch (srx->transport.family) {
case AF_INET:
srx->transport_len = sizeof(srx->transport.sin);
@@ -412,20 +170,38 @@ void rxrpc_error_report(struct sock *sk)
}
rxrpc_new_skb(skb, rxrpc_skb_received);
serr = SKB_EXT_ERR(skb);
+ if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) {
+ _leave("UDP empty message");
+ rcu_read_unlock();
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
+ return;
+ }
- if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) {
- peer = rxrpc_lookup_peer_local_rcu(local, skb, &srx);
- if (peer && !rxrpc_get_peer_maybe(peer))
- peer = NULL;
- if (peer) {
- trace_rxrpc_rx_icmp(peer, &serr->ee, &srx);
- rxrpc_store_error(peer, serr);
- }
+ peer = rxrpc_lookup_peer_local_rcu(local, skb, &srx);
+ if (peer && !rxrpc_get_peer_maybe(peer))
+ peer = NULL;
+ if (!peer) {
+ rcu_read_unlock();
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
+ _leave(" [no peer]");
+ return;
}
+ trace_rxrpc_rx_icmp(peer, &serr->ee, &srx);
+
+ if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP &&
+ serr->ee.ee_type == ICMP_DEST_UNREACH &&
+ serr->ee.ee_code == ICMP_FRAG_NEEDED)) {
+ rxrpc_adjust_mtu(peer, serr->ee.ee_info);
+ goto out;
+ }
+
+ rxrpc_store_error(peer, serr);
+out:
rcu_read_unlock();
rxrpc_free_skb(skb, rxrpc_skb_freed);
rxrpc_put_peer(peer);
+
_leave("");
}
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 26d2ae9baaf2..041a51225c5f 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -227,12 +227,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
rxrpc_peer_init_rtt(peer);
- if (RXRPC_TX_SMSS > 2190)
- peer->cong_cwnd = 2;
- else if (RXRPC_TX_SMSS > 1095)
- peer->cong_cwnd = 3;
- else
- peer->cong_cwnd = 4;
+ peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW;
trace_rxrpc_peer(peer->debug_id, rxrpc_peer_new, 1, here);
}
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index 245418943e01..fae22a8b38d6 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -54,8 +54,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
struct rxrpc_call *call;
struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
unsigned long timeout = 0;
- rxrpc_seq_t tx_hard_ack, rx_hard_ack;
+ rxrpc_seq_t acks_hard_ack;
char lbuff[50], rbuff[50];
+ u64 wtmp;
if (v == &rxnet->calls) {
seq_puts(seq,
@@ -90,8 +91,8 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
timeout -= jiffies;
}
- tx_hard_ack = READ_ONCE(call->tx_hard_ack);
- rx_hard_ack = READ_ONCE(call->rx_hard_ack);
+ acks_hard_ack = READ_ONCE(call->acks_hard_ack);
+ wtmp = atomic64_read_acquire(&call->ackr_window);
seq_printf(seq,
"UDP %-47.47s %-47.47s %4x %08x %08x %s %3u"
" %-8.8s %08x %08x %08x %02x %08x %02x %08x %06lx\n",
@@ -105,8 +106,8 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
rxrpc_call_states[call->state],
call->abort_code,
call->debug_id,
- tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack,
- rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack,
+ acks_hard_ack, READ_ONCE(call->tx_top) - acks_hard_ack,
+ lower_32_bits(wtmp), upper_32_bits(wtmp) - lower_32_bits(wtmp),
call->rx_serial,
timeout);
@@ -216,7 +217,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
seq_puts(seq,
"Proto Local "
" Remote "
- " Use CW MTU LastUse RTT RTO\n"
+ " Use SST MTU LastUse RTT RTO\n"
);
return 0;
}
@@ -234,7 +235,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
lbuff,
rbuff,
refcount_read(&peer->ref),
- peer->cong_cwnd,
+ peer->cong_ssthresh,
peer->mtu,
now - peer->last_tx_at,
peer->srtt_us >> 3,
@@ -397,3 +398,98 @@ const struct seq_operations rxrpc_local_seq_ops = {
.stop = rxrpc_local_seq_stop,
.show = rxrpc_local_seq_show,
};
+
+/*
+ * Display stats in /proc/net/rxrpc/stats
+ */
+int rxrpc_stats_show(struct seq_file *seq, void *v)
+{
+ struct rxrpc_net *rxnet = rxrpc_net(seq_file_single_net(seq));
+
+ seq_printf(seq,
+ "Data : send=%u sendf=%u\n",
+ atomic_read(&rxnet->stat_tx_data_send),
+ atomic_read(&rxnet->stat_tx_data_send_frag));
+ seq_printf(seq,
+ "Data-Tx : nr=%u retrans=%u\n",
+ atomic_read(&rxnet->stat_tx_data),
+ atomic_read(&rxnet->stat_tx_data_retrans));
+ seq_printf(seq,
+ "Data-Rx : nr=%u reqack=%u jumbo=%u\n",
+ atomic_read(&rxnet->stat_rx_data),
+ atomic_read(&rxnet->stat_rx_data_reqack),
+ atomic_read(&rxnet->stat_rx_data_jumbo));
+ seq_printf(seq,
+ "Ack : fill=%u send=%u skip=%u\n",
+ atomic_read(&rxnet->stat_tx_ack_fill),
+ atomic_read(&rxnet->stat_tx_ack_send),
+ atomic_read(&rxnet->stat_tx_ack_skip));
+ seq_printf(seq,
+ "Ack-Tx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u\n",
+ atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_REQUESTED]),
+ atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_DUPLICATE]),
+ atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_OUT_OF_SEQUENCE]),
+ atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_EXCEEDS_WINDOW]),
+ atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_NOSPACE]),
+ atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_PING]),
+ atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_PING_RESPONSE]),
+ atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_DELAY]),
+ atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_IDLE]));
+ seq_printf(seq,
+ "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u\n",
+ atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_REQUESTED]),
+ atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DUPLICATE]),
+ atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_OUT_OF_SEQUENCE]),
+ atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_EXCEEDS_WINDOW]),
+ atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_NOSPACE]),
+ atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING]),
+ atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING_RESPONSE]),
+ atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DELAY]),
+ atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE]));
+ seq_printf(seq,
+ "Why-Req-A: acklost=%u already=%u mrtt=%u ortt=%u\n",
+ atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_ack_lost]),
+ atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_already_on]),
+ atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_more_rtt]),
+ atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt]));
+ seq_printf(seq,
+ "Why-Req-A: nolast=%u retx=%u slows=%u smtxw=%u\n",
+ atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_no_srv_last]),
+ atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_retrans]),
+ atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_slow_start]),
+ atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_small_txwin]));
+ seq_printf(seq,
+ "Buffers : txb=%u rxb=%u\n",
+ atomic_read(&rxrpc_nr_txbuf),
+ atomic_read(&rxrpc_n_rx_skbs));
+ return 0;
+}
+
+/*
+ * Clear stats if /proc/net/rxrpc/stats is written to.
+ */
+int rxrpc_stats_clear(struct file *file, char *buf, size_t size)
+{
+ struct seq_file *m = file->private_data;
+ struct rxrpc_net *rxnet = rxrpc_net(seq_file_single_net(m));
+
+ if (size > 1 || (size == 1 && buf[0] != '\n'))
+ return -EINVAL;
+
+ atomic_set(&rxnet->stat_tx_data, 0);
+ atomic_set(&rxnet->stat_tx_data_retrans, 0);
+ atomic_set(&rxnet->stat_tx_data_send, 0);
+ atomic_set(&rxnet->stat_tx_data_send_frag, 0);
+ atomic_set(&rxnet->stat_rx_data, 0);
+ atomic_set(&rxnet->stat_rx_data_reqack, 0);
+ atomic_set(&rxnet->stat_rx_data_jumbo, 0);
+
+ atomic_set(&rxnet->stat_tx_ack_fill, 0);
+ atomic_set(&rxnet->stat_tx_ack_send, 0);
+ atomic_set(&rxnet->stat_tx_ack_skip, 0);
+ memset(&rxnet->stat_tx_acks, 0, sizeof(rxnet->stat_tx_acks));
+ memset(&rxnet->stat_rx_acks, 0, sizeof(rxnet->stat_rx_acks));
+
+ memset(&rxnet->stat_why_req_ack, 0, sizeof(rxnet->stat_why_req_ack));
+ return size;
+}
diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h
index d2cf8e1d218f..6760cb99c6d6 100644
--- a/net/rxrpc/protocol.h
+++ b/net/rxrpc/protocol.h
@@ -84,7 +84,7 @@ struct rxrpc_jumbo_header {
__be16 _rsvd; /* reserved */
__be16 cksum; /* kerberos security checksum */
};
-};
+} __packed;
#define RXRPC_JUMBO_DATALEN 1412 /* non-terminal jumbo packet data length */
#define RXRPC_JUMBO_SUBPKTLEN (RXRPC_JUMBO_DATALEN + sizeof(struct rxrpc_jumbo_header))
@@ -132,13 +132,6 @@ struct rxrpc_ackpacket {
} __packed;
-/* Some ACKs refer to specific packets and some are general and can be updated. */
-#define RXRPC_ACK_UPDATEABLE ((1 << RXRPC_ACK_REQUESTED) | \
- (1 << RXRPC_ACK_PING_RESPONSE) | \
- (1 << RXRPC_ACK_DELAY) | \
- (1 << RXRPC_ACK_IDLE))
-
-
/*
* ACK packets can have a further piece of information tagged on the end
*/
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 7e39c262fd79..efb85f983657 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -173,8 +173,9 @@ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg)
break;
}
- trace_rxrpc_recvmsg(call, rxrpc_recvmsg_terminal, call->rx_hard_ack,
- call->rx_pkt_offset, call->rx_pkt_len, ret);
+ trace_rxrpc_recvdata(call, rxrpc_recvmsg_terminal,
+ lower_32_bits(atomic64_read(&call->ackr_window)) - 1,
+ call->rx_pkt_offset, call->rx_pkt_len, ret);
return ret;
}
@@ -183,16 +184,14 @@ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg)
*/
static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
{
+ rxrpc_seq_t whigh = READ_ONCE(call->rx_highest_seq);
+
_enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]);
- trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top);
- ASSERTCMP(call->rx_hard_ack, ==, call->rx_top);
+ trace_rxrpc_receive(call, rxrpc_receive_end, 0, whigh);
- if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) {
- rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, serial, false, true,
- rxrpc_propose_ack_terminal_ack);
- //rxrpc_send_ack_packet(call, false, NULL);
- }
+ if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY)
+ rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_terminal_ack);
write_lock_bh(&call->state_lock);
@@ -203,12 +202,11 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
break;
case RXRPC_CALL_SERVER_RECV_REQUEST:
- call->tx_phase = true;
call->state = RXRPC_CALL_SERVER_ACK_REQUEST;
call->expect_req_by = jiffies + MAX_JIFFY_OFFSET;
write_unlock_bh(&call->state_lock);
- rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial, false, true,
- rxrpc_propose_ack_processing_op);
+ rxrpc_propose_delay_ACK(call, serial,
+ rxrpc_propose_ack_processing_op);
break;
default:
write_unlock_bh(&call->state_lock);
@@ -224,126 +222,66 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
struct rxrpc_skb_priv *sp;
struct sk_buff *skb;
rxrpc_serial_t serial;
- rxrpc_seq_t hard_ack, top;
- bool last = false;
- u8 subpacket;
- int ix;
+ rxrpc_seq_t old_consumed = call->rx_consumed, tseq;
+ bool last;
+ int acked;
_enter("%d", call->debug_id);
- hard_ack = call->rx_hard_ack;
- top = smp_load_acquire(&call->rx_top);
- ASSERT(before(hard_ack, top));
-
- hard_ack++;
- ix = hard_ack & RXRPC_RXTX_BUFF_MASK;
- skb = call->rxtx_buffer[ix];
+further_rotation:
+ skb = skb_dequeue(&call->recvmsg_queue);
rxrpc_see_skb(skb, rxrpc_skb_rotated);
- sp = rxrpc_skb(skb);
-
- subpacket = call->rxtx_annotations[ix] & RXRPC_RX_ANNO_SUBPACKET;
- serial = sp->hdr.serial + subpacket;
- if (subpacket == sp->nr_subpackets - 1 &&
- sp->rx_flags & RXRPC_SKB_INCL_LAST)
- last = true;
+ sp = rxrpc_skb(skb);
+ tseq = sp->hdr.seq;
+ serial = sp->hdr.serial;
+ last = sp->hdr.flags & RXRPC_LAST_PACKET;
- call->rxtx_buffer[ix] = NULL;
- call->rxtx_annotations[ix] = 0;
/* Barrier against rxrpc_input_data(). */
- smp_store_release(&call->rx_hard_ack, hard_ack);
+ if (after(tseq, call->rx_consumed))
+ smp_store_release(&call->rx_consumed, tseq);
rxrpc_free_skb(skb, rxrpc_skb_freed);
- trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack);
+ trace_rxrpc_receive(call, last ? rxrpc_receive_rotate_last : rxrpc_receive_rotate,
+ serial, call->rx_consumed);
if (last) {
rxrpc_end_rx_phase(call, serial);
- } else {
- /* Check to see if there's an ACK that needs sending. */
- if (atomic_inc_return(&call->ackr_nr_consumed) > 2)
- rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, serial,
- true, false,
- rxrpc_propose_ack_rotate_rx);
- if (call->ackr_reason && call->ackr_reason != RXRPC_ACK_DELAY)
- rxrpc_send_ack_packet(call, false, NULL);
+ return;
}
-}
-
-/*
- * Decrypt and verify a (sub)packet. The packet's length may be changed due to
- * padding, but if this is the case, the packet length will be resident in the
- * socket buffer. Note that we can't modify the master skb info as the skb may
- * be the home to multiple subpackets.
- */
-static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
- u8 annotation,
- unsigned int offset, unsigned int len)
-{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- rxrpc_seq_t seq = sp->hdr.seq;
- u16 cksum = sp->hdr.cksum;
- u8 subpacket = annotation & RXRPC_RX_ANNO_SUBPACKET;
- _enter("");
-
- /* For all but the head jumbo subpacket, the security checksum is in a
- * jumbo header immediately prior to the data.
+ /* The next packet on the queue might entirely overlap with the one we
+ * just consumed; if so, rotate that away also.
*/
- if (subpacket > 0) {
- __be16 tmp;
- if (skb_copy_bits(skb, offset - 2, &tmp, 2) < 0)
- BUG();
- cksum = ntohs(tmp);
- seq += subpacket;
+ skb = skb_peek(&call->recvmsg_queue);
+ if (skb) {
+ sp = rxrpc_skb(skb);
+ if (sp->hdr.seq != call->rx_consumed &&
+ after_eq(call->rx_consumed, sp->hdr.seq))
+ goto further_rotation;
}
- return call->security->verify_packet(call, skb, offset, len,
- seq, cksum);
+ /* Check to see if there's an ACK that needs sending. */
+ acked = atomic_add_return(call->rx_consumed - old_consumed,
+ &call->ackr_nr_consumed);
+ if (acked > 2 &&
+ !test_and_set_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags)) {
+ rxrpc_send_ACK(call, RXRPC_ACK_IDLE, serial,
+ rxrpc_propose_ack_rotate_rx);
+ rxrpc_transmit_ack_packets(call->peer->local);
+ }
}
/*
- * Locate the data within a packet. This is complicated by:
- *
- * (1) An skb may contain a jumbo packet - so we have to find the appropriate
- * subpacket.
- *
- * (2) The (sub)packets may be encrypted and, if so, the encrypted portion
- * contains an extra header which includes the true length of the data,
- * excluding any encrypted padding.
+ * Decrypt and verify a DATA packet.
*/
-static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
- u8 *_annotation,
- unsigned int *_offset, unsigned int *_len,
- bool *_last)
+static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- unsigned int offset = sizeof(struct rxrpc_wire_header);
- unsigned int len;
- bool last = false;
- int ret;
- u8 annotation = *_annotation;
- u8 subpacket = annotation & RXRPC_RX_ANNO_SUBPACKET;
-
- /* Locate the subpacket */
- offset += subpacket * RXRPC_JUMBO_SUBPKTLEN;
- len = skb->len - offset;
- if (subpacket < sp->nr_subpackets - 1)
- len = RXRPC_JUMBO_DATALEN;
- else if (sp->rx_flags & RXRPC_SKB_INCL_LAST)
- last = true;
-
- if (!(annotation & RXRPC_RX_ANNO_VERIFIED)) {
- ret = rxrpc_verify_packet(call, skb, annotation, offset, len);
- if (ret < 0)
- return ret;
- *_annotation |= RXRPC_RX_ANNO_VERIFIED;
- }
- *_offset = offset;
- *_len = len;
- *_last = last;
- call->security->locate_data(call, skb, _offset, _len);
- return 0;
+ if (sp->flags & RXRPC_RX_VERIFIED)
+ return 0;
+ return call->security->verify_packet(call, skb);
}
/*
@@ -357,69 +295,55 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
{
struct rxrpc_skb_priv *sp;
struct sk_buff *skb;
- rxrpc_serial_t serial;
- rxrpc_seq_t hard_ack, top, seq;
+ rxrpc_seq_t seq = 0;
size_t remain;
- bool rx_pkt_last;
unsigned int rx_pkt_offset, rx_pkt_len;
- int ix, copy, ret = -EAGAIN, ret2;
-
- if (test_and_clear_bit(RXRPC_CALL_RX_UNDERRUN, &call->flags) &&
- call->ackr_reason)
- rxrpc_send_ack_packet(call, false, NULL);
+ int copy, ret = -EAGAIN, ret2;
rx_pkt_offset = call->rx_pkt_offset;
rx_pkt_len = call->rx_pkt_len;
- rx_pkt_last = call->rx_pkt_last;
if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) {
- seq = call->rx_hard_ack;
+ seq = lower_32_bits(atomic64_read(&call->ackr_window)) - 1;
ret = 1;
goto done;
}
- /* Barriers against rxrpc_input_data(). */
- hard_ack = call->rx_hard_ack;
- seq = hard_ack + 1;
-
- while (top = smp_load_acquire(&call->rx_top),
- before_eq(seq, top)
- ) {
- ix = seq & RXRPC_RXTX_BUFF_MASK;
- skb = call->rxtx_buffer[ix];
- if (!skb) {
- trace_rxrpc_recvmsg(call, rxrpc_recvmsg_hole, seq,
- rx_pkt_offset, rx_pkt_len, 0);
- break;
- }
- smp_rmb();
+ /* No one else can be removing stuff from the queue, so we shouldn't
+ * need the Rx lock to walk it.
+ */
+ skb = skb_peek(&call->recvmsg_queue);
+ while (skb) {
rxrpc_see_skb(skb, rxrpc_skb_seen);
sp = rxrpc_skb(skb);
+ seq = sp->hdr.seq;
- if (!(flags & MSG_PEEK)) {
- serial = sp->hdr.serial;
- serial += call->rxtx_annotations[ix] & RXRPC_RX_ANNO_SUBPACKET;
- trace_rxrpc_receive(call, rxrpc_receive_front,
- serial, seq);
+ if (after_eq(call->rx_consumed, seq)) {
+ kdebug("obsolete %x %x", call->rx_consumed, seq);
+ goto skip_obsolete;
}
+ if (!(flags & MSG_PEEK))
+ trace_rxrpc_receive(call, rxrpc_receive_front,
+ sp->hdr.serial, seq);
+
if (msg)
sock_recv_timestamp(msg, sock->sk, skb);
if (rx_pkt_offset == 0) {
- ret2 = rxrpc_locate_data(call, skb,
- &call->rxtx_annotations[ix],
- &rx_pkt_offset, &rx_pkt_len,
- &rx_pkt_last);
- trace_rxrpc_recvmsg(call, rxrpc_recvmsg_next, seq,
- rx_pkt_offset, rx_pkt_len, ret2);
+ ret2 = rxrpc_verify_data(call, skb);
+ rx_pkt_offset = sp->offset;
+ rx_pkt_len = sp->len;
+ trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq,
+ rx_pkt_offset, rx_pkt_len, ret2);
if (ret2 < 0) {
ret = ret2;
goto out;
}
+ rxrpc_transmit_ack_packets(call->peer->local);
} else {
- trace_rxrpc_recvmsg(call, rxrpc_recvmsg_cont, seq,
- rx_pkt_offset, rx_pkt_len, 0);
+ trace_rxrpc_recvdata(call, rxrpc_recvmsg_cont, seq,
+ rx_pkt_offset, rx_pkt_len, 0);
}
/* We have to handle short, empty and used-up DATA packets. */
@@ -442,37 +366,34 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
}
if (rx_pkt_len > 0) {
- trace_rxrpc_recvmsg(call, rxrpc_recvmsg_full, seq,
- rx_pkt_offset, rx_pkt_len, 0);
+ trace_rxrpc_recvdata(call, rxrpc_recvmsg_full, seq,
+ rx_pkt_offset, rx_pkt_len, 0);
ASSERTCMP(*_offset, ==, len);
ret = 0;
break;
}
+ skip_obsolete:
/* The whole packet has been transferred. */
- if (!(flags & MSG_PEEK))
- rxrpc_rotate_rx_window(call);
+ if (sp->hdr.flags & RXRPC_LAST_PACKET)
+ ret = 1;
rx_pkt_offset = 0;
rx_pkt_len = 0;
- if (rx_pkt_last) {
- ASSERTCMP(seq, ==, READ_ONCE(call->rx_top));
- ret = 1;
- goto out;
- }
+ skb = skb_peek_next(skb, &call->recvmsg_queue);
- seq++;
+ if (!(flags & MSG_PEEK))
+ rxrpc_rotate_rx_window(call);
}
out:
if (!(flags & MSG_PEEK)) {
call->rx_pkt_offset = rx_pkt_offset;
call->rx_pkt_len = rx_pkt_len;
- call->rx_pkt_last = rx_pkt_last;
}
done:
- trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq,
- rx_pkt_offset, rx_pkt_len, ret);
+ trace_rxrpc_recvdata(call, rxrpc_recvmsg_data_return, seq,
+ rx_pkt_offset, rx_pkt_len, ret);
if (ret == -EAGAIN)
set_bit(RXRPC_CALL_RX_UNDERRUN, &call->flags);
return ret;
@@ -495,7 +416,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
DEFINE_WAIT(wait);
- trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0, 0, 0, 0);
+ trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0);
if (flags & (MSG_OOB | MSG_TRUNC))
return -EOPNOTSUPP;
@@ -532,8 +453,7 @@ try_again:
if (list_empty(&rx->recvmsg_q)) {
if (signal_pending(current))
goto wait_interrupted;
- trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait,
- 0, 0, 0, 0);
+ trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait, 0);
timeo = schedule_timeout(timeo);
}
finish_wait(sk_sleep(&rx->sk), &wait);
@@ -552,7 +472,7 @@ try_again:
rxrpc_get_call(call, rxrpc_call_got);
write_unlock_bh(&rx->recvmsg_lock);
- trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0);
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0);
/* We're going to drop the socket lock, so we need to lock the call
* against interference by sendmsg.
@@ -605,8 +525,8 @@ try_again:
if (ret == -EAGAIN)
ret = 0;
- if (after(call->rx_top, call->rx_hard_ack) &&
- call->rxtx_buffer[(call->rx_hard_ack + 1) & RXRPC_RXTX_BUFF_MASK])
+ rxrpc_transmit_ack_packets(call->peer->local);
+ if (!skb_queue_empty(&call->recvmsg_queue))
rxrpc_notify_socket(call);
break;
default:
@@ -636,7 +556,7 @@ try_again:
error_unlock_call:
mutex_unlock(&call->user_mutex);
rxrpc_put_call(call, rxrpc_call_put);
- trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret);
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, ret);
return ret;
error_requeue_call:
@@ -644,14 +564,14 @@ error_requeue_call:
write_lock_bh(&rx->recvmsg_lock);
list_add(&call->recvmsg_link, &rx->recvmsg_q);
write_unlock_bh(&rx->recvmsg_lock);
- trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0, 0, 0, 0);
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0);
} else {
rxrpc_put_call(call, rxrpc_call_put);
}
error_no_call:
release_sock(&rx->sk);
error_trace:
- trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret);
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, ret);
return ret;
wait_interrupted:
@@ -735,17 +655,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
read_phase_complete:
ret = 1;
out:
- switch (call->ackr_reason) {
- case RXRPC_ACK_IDLE:
- break;
- case RXRPC_ACK_DELAY:
- if (ret != -EAGAIN)
- break;
- fallthrough;
- default:
- rxrpc_send_ack_packet(call, false, NULL);
- }
-
+ rxrpc_transmit_ack_packets(call->peer->local);
if (_service)
*_service = call->service_id;
mutex_unlock(&call->user_mutex);
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 78fa0524156f..110a5550c0a6 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -233,16 +233,8 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn,
static struct skcipher_request *rxkad_get_call_crypto(struct rxrpc_call *call)
{
struct crypto_skcipher *tfm = &call->conn->rxkad.cipher->base;
- struct skcipher_request *cipher_req = call->cipher_req;
- if (!cipher_req) {
- cipher_req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!cipher_req)
- return NULL;
- call->cipher_req = cipher_req;
- }
-
- return cipher_req;
+ return skcipher_request_alloc(tfm, GFP_NOFS);
}
/*
@@ -250,20 +242,16 @@ static struct skcipher_request *rxkad_get_call_crypto(struct rxrpc_call *call)
*/
static void rxkad_free_call_crypto(struct rxrpc_call *call)
{
- if (call->cipher_req)
- skcipher_request_free(call->cipher_req);
- call->cipher_req = NULL;
}
/*
* partially encrypt a packet (level 1 security)
*/
static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
- struct sk_buff *skb, u32 data_size,
+ struct rxrpc_txbuf *txb,
struct skcipher_request *req)
{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct rxkad_level1_hdr hdr;
+ struct rxkad_level1_hdr *hdr = (void *)txb->data;
struct rxrpc_crypt iv;
struct scatterlist sg;
size_t pad;
@@ -271,22 +259,22 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
_enter("");
- check = sp->hdr.seq ^ call->call_id;
- data_size |= (u32)check << 16;
+ check = txb->seq ^ ntohl(txb->wire.callNumber);
+ hdr->data_size = htonl((u32)check << 16 | txb->len);
- hdr.data_size = htonl(data_size);
- memcpy(skb->head, &hdr, sizeof(hdr));
-
- pad = sizeof(struct rxkad_level1_hdr) + data_size;
+ txb->len += sizeof(struct rxkad_level1_hdr);
+ pad = txb->len;
pad = RXKAD_ALIGN - pad;
pad &= RXKAD_ALIGN - 1;
- if (pad)
- skb_put_zero(skb, pad);
+ if (pad) {
+ memset(txb->data + txb->offset, 0, pad);
+ txb->len += pad;
+ }
/* start the encryption afresh */
memset(&iv, 0, sizeof(iv));
- sg_init_one(&sg, skb->head, 8);
+ sg_init_one(&sg, txb->data, 8);
skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x);
@@ -301,87 +289,63 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
* wholly encrypt a packet (level 2 security)
*/
static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
- struct sk_buff *skb,
- u32 data_size,
+ struct rxrpc_txbuf *txb,
struct skcipher_request *req)
{
const struct rxrpc_key_token *token;
- struct rxkad_level2_hdr rxkhdr;
- struct rxrpc_skb_priv *sp;
+ struct rxkad_level2_hdr *rxkhdr = (void *)txb->data;
struct rxrpc_crypt iv;
- struct scatterlist sg[16];
- unsigned int len;
+ struct scatterlist sg;
size_t pad;
u16 check;
- int err;
-
- sp = rxrpc_skb(skb);
+ int ret;
_enter("");
- check = sp->hdr.seq ^ call->call_id;
+ check = txb->seq ^ ntohl(txb->wire.callNumber);
- rxkhdr.data_size = htonl(data_size | (u32)check << 16);
- rxkhdr.checksum = 0;
- memcpy(skb->head, &rxkhdr, sizeof(rxkhdr));
+ rxkhdr->data_size = htonl(txb->len | (u32)check << 16);
+ rxkhdr->checksum = 0;
- pad = sizeof(struct rxkad_level2_hdr) + data_size;
+ txb->len += sizeof(struct rxkad_level2_hdr);
+ pad = txb->len;
pad = RXKAD_ALIGN - pad;
pad &= RXKAD_ALIGN - 1;
- if (pad)
- skb_put_zero(skb, pad);
+ if (pad) {
+ memset(txb->data + txb->offset, 0, pad);
+ txb->len += pad;
+ }
/* encrypt from the session key */
token = call->conn->params.key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
- sg_init_one(&sg[0], skb->head, sizeof(rxkhdr));
+ sg_init_one(&sg, txb->data, txb->len);
skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, &sg[0], &sg[0], sizeof(rxkhdr), iv.x);
- crypto_skcipher_encrypt(req);
-
- /* we want to encrypt the skbuff in-place */
- err = -EMSGSIZE;
- if (skb_shinfo(skb)->nr_frags > 16)
- goto out;
-
- len = round_up(data_size, RXKAD_ALIGN);
-
- sg_init_table(sg, ARRAY_SIZE(sg));
- err = skb_to_sgvec(skb, sg, 8, len);
- if (unlikely(err < 0))
- goto out;
- skcipher_request_set_crypt(req, sg, sg, len, iv.x);
- crypto_skcipher_encrypt(req);
-
- _leave(" = 0");
- err = 0;
-
-out:
+ skcipher_request_set_crypt(req, &sg, &sg, txb->len, iv.x);
+ ret = crypto_skcipher_encrypt(req);
skcipher_request_zero(req);
- return err;
+ return ret;
}
/*
* checksum an RxRPC packet header
*/
-static int rxkad_secure_packet(struct rxrpc_call *call,
- struct sk_buff *skb,
- size_t data_size)
+static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
{
- struct rxrpc_skb_priv *sp;
struct skcipher_request *req;
struct rxrpc_crypt iv;
struct scatterlist sg;
+ union {
+ __be32 buf[2];
+ } crypto __aligned(8);
u32 x, y;
int ret;
- sp = rxrpc_skb(skb);
-
- _enter("{%d{%x}},{#%u},%zu,",
+ _enter("{%d{%x}},{#%u},%u,",
call->debug_id, key_serial(call->conn->params.key),
- sp->hdr.seq, data_size);
+ txb->seq, txb->len);
if (!call->conn->rxkad.cipher)
return 0;
@@ -398,39 +362,40 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
memcpy(&iv, call->conn->rxkad.csum_iv.x, sizeof(iv));
/* calculate the security checksum */
- x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT);
- x |= sp->hdr.seq & 0x3fffffff;
- call->crypto_buf[0] = htonl(call->call_id);
- call->crypto_buf[1] = htonl(x);
+ x = (ntohl(txb->wire.cid) & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT);
+ x |= txb->seq & 0x3fffffff;
+ crypto.buf[0] = txb->wire.callNumber;
+ crypto.buf[1] = htonl(x);
- sg_init_one(&sg, call->crypto_buf, 8);
+ sg_init_one(&sg, crypto.buf, 8);
skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x);
crypto_skcipher_encrypt(req);
skcipher_request_zero(req);
- y = ntohl(call->crypto_buf[1]);
+ y = ntohl(crypto.buf[1]);
y = (y >> 16) & 0xffff;
if (y == 0)
y = 1; /* zero checksums are not permitted */
- sp->hdr.cksum = y;
+ txb->wire.cksum = htons(y);
switch (call->conn->params.security_level) {
case RXRPC_SECURITY_PLAIN:
ret = 0;
break;
case RXRPC_SECURITY_AUTH:
- ret = rxkad_secure_packet_auth(call, skb, data_size, req);
+ ret = rxkad_secure_packet_auth(call, txb, req);
break;
case RXRPC_SECURITY_ENCRYPT:
- ret = rxkad_secure_packet_encrypt(call, skb, data_size, req);
+ ret = rxkad_secure_packet_encrypt(call, txb, req);
break;
default:
ret = -EPERM;
break;
}
+ skcipher_request_free(req);
_leave(" = %d [set %x]", ret, y);
return ret;
}
@@ -439,11 +404,11 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
* decrypt partial encryption on a packet (level 1 security)
*/
static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
- unsigned int offset, unsigned int len,
rxrpc_seq_t seq,
struct skcipher_request *req)
{
struct rxkad_level1_hdr sechdr;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_crypt iv;
struct scatterlist sg[16];
bool aborted;
@@ -453,9 +418,9 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
_enter("");
- if (len < 8) {
+ if (sp->len < 8) {
aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_hdr", "V1H",
- RXKADSEALEDINCON);
+ RXKADSEALEDINCON);
goto protocol_error;
}
@@ -463,7 +428,7 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
* directly into the target buffer.
*/
sg_init_table(sg, ARRAY_SIZE(sg));
- ret = skb_to_sgvec(skb, sg, offset, 8);
+ ret = skb_to_sgvec(skb, sg, sp->offset, 8);
if (unlikely(ret < 0))
return ret;
@@ -477,12 +442,13 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
skcipher_request_zero(req);
/* Extract the decrypted packet length */
- if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) {
+ if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) {
aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_len", "XV1",
RXKADDATALEN);
goto protocol_error;
}
- len -= sizeof(sechdr);
+ sp->offset += sizeof(sechdr);
+ sp->len -= sizeof(sechdr);
buf = ntohl(sechdr.data_size);
data_size = buf & 0xffff;
@@ -496,11 +462,12 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
goto protocol_error;
}
- if (data_size > len) {
+ if (data_size > sp->len) {
aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_datalen", "V1L",
RXKADDATALEN);
goto protocol_error;
}
+ sp->len = data_size;
_leave(" = 0 [dlen=%x]", data_size);
return 0;
@@ -515,12 +482,12 @@ protocol_error:
* wholly decrypt a packet (level 2 security)
*/
static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
- unsigned int offset, unsigned int len,
rxrpc_seq_t seq,
struct skcipher_request *req)
{
const struct rxrpc_key_token *token;
struct rxkad_level2_hdr sechdr;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_crypt iv;
struct scatterlist _sg[4], *sg;
bool aborted;
@@ -528,9 +495,9 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
u16 check;
int nsg, ret;
- _enter(",{%d}", skb->len);
+ _enter(",{%d}", sp->len);
- if (len < 8) {
+ if (sp->len < 8) {
aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_hdr", "V2H",
RXKADSEALEDINCON);
goto protocol_error;
@@ -550,7 +517,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
}
sg_init_table(sg, nsg);
- ret = skb_to_sgvec(skb, sg, offset, len);
+ ret = skb_to_sgvec(skb, sg, sp->offset, sp->len);
if (unlikely(ret < 0)) {
if (sg != _sg)
kfree(sg);
@@ -563,19 +530,20 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg, sg, len, iv.x);
+ skcipher_request_set_crypt(req, sg, sg, sp->len, iv.x);
crypto_skcipher_decrypt(req);
skcipher_request_zero(req);
if (sg != _sg)
kfree(sg);
/* Extract the decrypted packet length */
- if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) {
+ if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) {
aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_len", "XV2",
RXKADDATALEN);
goto protocol_error;
}
- len -= sizeof(sechdr);
+ sp->offset += sizeof(sechdr);
+ sp->len -= sizeof(sechdr);
buf = ntohl(sechdr.data_size);
data_size = buf & 0xffff;
@@ -589,12 +557,13 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
goto protocol_error;
}
- if (data_size > len) {
+ if (data_size > sp->len) {
aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_datalen", "V2L",
RXKADDATALEN);
goto protocol_error;
}
+ sp->len = data_size;
_leave(" = 0 [dlen=%x]", data_size);
return 0;
@@ -609,17 +578,20 @@ nomem:
}
/*
- * Verify the security on a received packet or subpacket (if part of a
- * jumbo packet).
+ * Verify the security on a received packet and the subpackets therein.
*/
-static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
- unsigned int offset, unsigned int len,
- rxrpc_seq_t seq, u16 expected_cksum)
+static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb)
{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct skcipher_request *req;
struct rxrpc_crypt iv;
struct scatterlist sg;
+ union {
+ __be32 buf[2];
+ } crypto __aligned(8);
+ rxrpc_seq_t seq = sp->hdr.seq;
bool aborted;
+ int ret;
u16 cksum;
u32 x, y;
@@ -639,22 +611,22 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
/* validate the security checksum */
x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT);
x |= seq & 0x3fffffff;
- call->crypto_buf[0] = htonl(call->call_id);
- call->crypto_buf[1] = htonl(x);
+ crypto.buf[0] = htonl(call->call_id);
+ crypto.buf[1] = htonl(x);
- sg_init_one(&sg, call->crypto_buf, 8);
+ sg_init_one(&sg, crypto.buf, 8);
skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x);
crypto_skcipher_encrypt(req);
skcipher_request_zero(req);
- y = ntohl(call->crypto_buf[1]);
+ y = ntohl(crypto.buf[1]);
cksum = (y >> 16) & 0xffff;
if (cksum == 0)
cksum = 1; /* zero checksums are not permitted */
- if (cksum != expected_cksum) {
+ if (cksum != sp->hdr.cksum) {
aborted = rxrpc_abort_eproto(call, skb, "rxkad_csum", "VCK",
RXKADSEALEDINCON);
goto protocol_error;
@@ -662,15 +634,22 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
switch (call->conn->params.security_level) {
case RXRPC_SECURITY_PLAIN:
- return 0;
+ ret = 0;
+ break;
case RXRPC_SECURITY_AUTH:
- return rxkad_verify_packet_1(call, skb, offset, len, seq, req);
+ ret = rxkad_verify_packet_1(call, skb, seq, req);
+ break;
case RXRPC_SECURITY_ENCRYPT:
- return rxkad_verify_packet_2(call, skb, offset, len, seq, req);
+ ret = rxkad_verify_packet_2(call, skb, seq, req);
+ break;
default:
- return -ENOANO;
+ ret = -ENOANO;
+ break;
}
+ skcipher_request_free(req);
+ return ret;
+
protocol_error:
if (aborted)
rxrpc_send_abort_packet(call);
@@ -678,52 +657,6 @@ protocol_error:
}
/*
- * Locate the data contained in a packet that was partially encrypted.
- */
-static void rxkad_locate_data_1(struct rxrpc_call *call, struct sk_buff *skb,
- unsigned int *_offset, unsigned int *_len)
-{
- struct rxkad_level1_hdr sechdr;
-
- if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0)
- BUG();
- *_offset += sizeof(sechdr);
- *_len = ntohl(sechdr.data_size) & 0xffff;
-}
-
-/*
- * Locate the data contained in a packet that was completely encrypted.
- */
-static void rxkad_locate_data_2(struct rxrpc_call *call, struct sk_buff *skb,
- unsigned int *_offset, unsigned int *_len)
-{
- struct rxkad_level2_hdr sechdr;
-
- if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0)
- BUG();
- *_offset += sizeof(sechdr);
- *_len = ntohl(sechdr.data_size) & 0xffff;
-}
-
-/*
- * Locate the data contained in an already decrypted packet.
- */
-static void rxkad_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
- unsigned int *_offset, unsigned int *_len)
-{
- switch (call->conn->params.security_level) {
- case RXRPC_SECURITY_AUTH:
- rxkad_locate_data_1(call, skb, _offset, _len);
- return;
- case RXRPC_SECURITY_ENCRYPT:
- rxkad_locate_data_2(call, skb, _offset, _len);
- return;
- default:
- return;
- }
-}
-
-/*
* issue a challenge
*/
static int rxkad_issue_challenge(struct rxrpc_connection *conn)
@@ -1232,9 +1165,10 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
eproto = tracepoint_string("rxkad_tkt_short");
abort_code = RXKADPACKETSHORT;
- if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response),
- ticket, ticket_len) < 0)
- goto protocol_error_free;
+ ret = skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response),
+ ticket, ticket_len);
+ if (ret < 0)
+ goto temporary_error_free_ticket;
ret = rxkad_decrypt_ticket(conn, server_key, skb, ticket, ticket_len,
&session_key, &expiry, _abort_code);
@@ -1397,7 +1331,6 @@ const struct rxrpc_security rxkad = {
.secure_packet = rxkad_secure_packet,
.verify_packet = rxkad_verify_packet,
.free_call_crypto = rxkad_free_call_crypto,
- .locate_data = rxkad_locate_data,
.issue_challenge = rxkad_issue_challenge,
.respond_to_challenge = rxkad_respond_to_challenge,
.verify_response = rxkad_verify_response,
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 3c3a626459de..e5fd8a95bf71 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -22,10 +22,26 @@
*/
static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win)
{
- unsigned int win_size =
- min_t(unsigned int, call->tx_winsize,
- call->cong_cwnd + call->cong_extra);
- rxrpc_seq_t tx_win = READ_ONCE(call->tx_hard_ack);
+ unsigned int win_size;
+ rxrpc_seq_t tx_win = smp_load_acquire(&call->acks_hard_ack);
+
+ /* If we haven't transmitted anything for >1RTT, we should reset the
+ * congestion management state.
+ */
+ if (ktime_before(ktime_add_us(call->tx_last_sent,
+ call->peer->srtt_us >> 3),
+ ktime_get_real())) {
+ if (RXRPC_TX_SMSS > 2190)
+ win_size = 2;
+ else if (RXRPC_TX_SMSS > 1095)
+ win_size = 3;
+ else
+ win_size = 4;
+ win_size += call->cong_extra;
+ } else {
+ win_size = min_t(unsigned int, call->tx_winsize,
+ call->cong_cwnd + call->cong_extra);
+ }
if (_tx_win)
*_tx_win = tx_win;
@@ -50,7 +66,12 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx,
if (signal_pending(current))
return sock_intr_errno(*timeo);
- trace_rxrpc_transmit(call, rxrpc_transmit_wait);
+ if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) {
+ rxrpc_shrink_call_tx_buffer(call);
+ continue;
+ }
+
+ trace_rxrpc_txqueue(call, rxrpc_txqueue_wait);
*timeo = schedule_timeout(*timeo);
}
}
@@ -71,12 +92,11 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx,
rtt = 2;
timeout = rtt;
- tx_start = READ_ONCE(call->tx_hard_ack);
+ tx_start = smp_load_acquire(&call->acks_hard_ack);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
- tx_win = READ_ONCE(call->tx_hard_ack);
if (rxrpc_check_tx_space(call, &tx_win))
return 0;
@@ -87,12 +107,17 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx,
tx_win == tx_start && signal_pending(current))
return -EINTR;
+ if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) {
+ rxrpc_shrink_call_tx_buffer(call);
+ continue;
+ }
+
if (tx_win != tx_start) {
timeout = rtt;
tx_start = tx_win;
}
- trace_rxrpc_transmit(call, rxrpc_transmit_wait);
+ trace_rxrpc_txqueue(call, rxrpc_txqueue_wait);
timeout = schedule_timeout(timeout);
}
}
@@ -112,7 +137,12 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx,
if (call->state >= RXRPC_CALL_COMPLETE)
return call->error;
- trace_rxrpc_transmit(call, rxrpc_transmit_wait);
+ if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) {
+ rxrpc_shrink_call_tx_buffer(call);
+ continue;
+ }
+
+ trace_rxrpc_txqueue(call, rxrpc_txqueue_wait);
*timeo = schedule_timeout(*timeo);
}
}
@@ -129,8 +159,8 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
DECLARE_WAITQUEUE(myself, current);
int ret;
- _enter(",{%u,%u,%u}",
- call->tx_hard_ack, call->tx_top, call->tx_winsize);
+ _enter(",{%u,%u,%u,%u}",
+ call->tx_bottom, call->acks_hard_ack, call->tx_top, call->tx_winsize);
add_wait_queue(&call->waitq, &myself);
@@ -155,24 +185,6 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
}
/*
- * Schedule an instant Tx resend.
- */
-static inline void rxrpc_instant_resend(struct rxrpc_call *call, int ix)
-{
- spin_lock_bh(&call->lock);
-
- if (call->state < RXRPC_CALL_COMPLETE) {
- call->rxtx_annotations[ix] =
- (call->rxtx_annotations[ix] & RXRPC_TX_ANNO_LAST) |
- RXRPC_TX_ANNO_RETRANS;
- if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
- rxrpc_queue_call(call);
- }
-
- spin_unlock_bh(&call->lock);
-}
-
-/*
* Notify the owner of the call that the transmit phase is ended and the last
* packet has been queued.
*/
@@ -188,38 +200,35 @@ static void rxrpc_notify_end_tx(struct rxrpc_sock *rx, struct rxrpc_call *call,
* the packet immediately. Returns the error from rxrpc_send_data_packet()
* in case the caller wants to do something with it.
*/
-static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
- struct sk_buff *skb, bool last,
- rxrpc_notify_end_tx_t notify_end_tx)
+static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
+ struct rxrpc_txbuf *txb,
+ rxrpc_notify_end_tx_t notify_end_tx)
{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
unsigned long now;
- rxrpc_seq_t seq = sp->hdr.seq;
- int ret, ix;
- u8 annotation = RXRPC_TX_ANNO_UNACK;
+ rxrpc_seq_t seq = txb->seq;
+ bool last = test_bit(RXRPC_TXBUF_LAST, &txb->flags);
+ int ret;
- _net("queue skb %p [%d]", skb, seq);
+ rxrpc_inc_stat(call->rxnet, stat_tx_data);
ASSERTCMP(seq, ==, call->tx_top + 1);
- if (last)
- annotation |= RXRPC_TX_ANNO_LAST;
-
/* We have to set the timestamp before queueing as the retransmit
* algorithm can see the packet as soon as we queue it.
*/
- skb->tstamp = ktime_get_real();
+ txb->last_sent = ktime_get_real();
- ix = seq & RXRPC_RXTX_BUFF_MASK;
- rxrpc_get_skb(skb, rxrpc_skb_got);
- call->rxtx_annotations[ix] = annotation;
- smp_wmb();
- call->rxtx_buffer[ix] = skb;
+ /* Add the packet to the call's output buffer */
+ rxrpc_get_txbuf(txb, rxrpc_txbuf_get_buffer);
+ spin_lock(&call->tx_lock);
+ list_add_tail(&txb->call_link, &call->tx_buffer);
call->tx_top = seq;
+ spin_unlock(&call->tx_lock);
+
if (last)
- trace_rxrpc_transmit(call, rxrpc_transmit_queue_last);
+ trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last);
else
- trace_rxrpc_transmit(call, rxrpc_transmit_queue);
+ trace_rxrpc_txqueue(call, rxrpc_txqueue_queue);
if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) {
_debug("________awaiting reply/ACK__________");
@@ -232,7 +241,7 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
case RXRPC_CALL_SERVER_ACK_REQUEST:
call->state = RXRPC_CALL_SERVER_SEND_REPLY;
now = jiffies;
- WRITE_ONCE(call->ack_at, now + MAX_JIFFY_OFFSET);
+ WRITE_ONCE(call->delay_ack_at, now + MAX_JIFFY_OFFSET);
if (call->ackr_reason == RXRPC_ACK_DELAY)
call->ackr_reason = 0;
trace_rxrpc_timer(call, rxrpc_timer_init_for_send_reply, now);
@@ -252,7 +261,7 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
if (seq == 1 && rxrpc_is_client_call(call))
rxrpc_expose_client_call(call);
- ret = rxrpc_send_data_packet(call, skb, false);
+ ret = rxrpc_send_data_packet(call, txb);
if (ret < 0) {
switch (ret) {
case -ENETUNREACH:
@@ -262,8 +271,6 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
0, ret);
goto out;
}
- _debug("need instant resend %d", ret);
- rxrpc_instant_resend(call, ix);
} else {
unsigned long now = jiffies;
unsigned long resend_at = now + call->peer->rto_j;
@@ -274,9 +281,7 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
}
out:
- rxrpc_free_skb(skb, rxrpc_skb_freed);
- _leave(" = %d", ret);
- return ret;
+ rxrpc_put_txbuf(txb, rxrpc_txbuf_put_trans);
}
/*
@@ -290,8 +295,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
rxrpc_notify_end_tx_t notify_end_tx,
bool *_dropped_lock)
{
- struct rxrpc_skb_priv *sp;
- struct sk_buff *skb;
+ struct rxrpc_txbuf *txb;
struct sock *sk = &rx->sk;
enum rxrpc_call_state state;
long timeo;
@@ -325,16 +329,15 @@ reload:
goto maybe_error;
}
- skb = call->tx_pending;
+ txb = call->tx_pending;
call->tx_pending = NULL;
- rxrpc_see_skb(skb, rxrpc_skb_seen);
+ if (txb)
+ rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more);
do {
- /* Check to see if there's a ping ACK to reply to. */
- if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE)
- rxrpc_send_ack_packet(call, false, NULL);
+ rxrpc_transmit_ack_packets(call->peer->local);
- if (!skb) {
+ if (!txb) {
size_t remain, bufsize, chunk, offset;
_debug("alloc");
@@ -355,53 +358,31 @@ reload:
_debug("SIZE: %zu/%zu @%zu", chunk, bufsize, offset);
/* create a buffer that we can retain until it's ACK'd */
- skb = sock_alloc_send_skb(
- sk, bufsize, msg->msg_flags & MSG_DONTWAIT, &ret);
- if (!skb)
+ ret = -ENOMEM;
+ txb = rxrpc_alloc_txbuf(call, RXRPC_PACKET_TYPE_DATA,
+ GFP_KERNEL);
+ if (!txb)
goto maybe_error;
- sp = rxrpc_skb(skb);
- sp->rx_flags |= RXRPC_SKB_TX_BUFFER;
- rxrpc_new_skb(skb, rxrpc_skb_new);
-
- _debug("ALLOC SEND %p", skb);
-
- ASSERTCMP(skb->mark, ==, 0);
-
- __skb_put(skb, offset);
-
- sp->remain = chunk;
- if (sp->remain > skb_tailroom(skb))
- sp->remain = skb_tailroom(skb);
-
- _net("skb: hr %d, tr %d, hl %d, rm %d",
- skb_headroom(skb),
- skb_tailroom(skb),
- skb_headlen(skb),
- sp->remain);
-
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ txb->offset = offset;
+ txb->space -= offset;
+ txb->space = min_t(size_t, chunk, txb->space);
}
_debug("append");
- sp = rxrpc_skb(skb);
/* append next segment of data to the current buffer */
if (msg_data_left(msg) > 0) {
- int copy = skb_tailroom(skb);
- ASSERTCMP(copy, >, 0);
- if (copy > msg_data_left(msg))
- copy = msg_data_left(msg);
- if (copy > sp->remain)
- copy = sp->remain;
-
- _debug("add");
- ret = skb_add_data(skb, &msg->msg_iter, copy);
- _debug("added");
- if (ret < 0)
+ size_t copy = min_t(size_t, txb->space, msg_data_left(msg));
+
+ _debug("add %zu", copy);
+ if (!copy_from_iter_full(txb->data + txb->offset, copy,
+ &msg->msg_iter))
goto efault;
- sp->remain -= copy;
- skb->mark += copy;
+ _debug("added");
+ txb->space -= copy;
+ txb->len += copy;
+ txb->offset += copy;
copied += copy;
if (call->tx_total_len != -1)
call->tx_total_len -= copy;
@@ -413,32 +394,22 @@ reload:
goto call_terminated;
/* add the packet to the send queue if it's now full */
- if (sp->remain <= 0 ||
+ if (!txb->space ||
(msg_data_left(msg) == 0 && !more)) {
- struct rxrpc_connection *conn = call->conn;
- uint32_t seq;
-
- seq = call->tx_top + 1;
-
- sp->hdr.seq = seq;
- sp->hdr._rsvd = 0;
- sp->hdr.flags = conn->out_clientflag;
-
- if (msg_data_left(msg) == 0 && !more)
- sp->hdr.flags |= RXRPC_LAST_PACKET;
- else if (call->tx_top - call->tx_hard_ack <
+ if (msg_data_left(msg) == 0 && !more) {
+ txb->wire.flags |= RXRPC_LAST_PACKET;
+ __set_bit(RXRPC_TXBUF_LAST, &txb->flags);
+ }
+ else if (call->tx_top - call->acks_hard_ack <
call->tx_winsize)
- sp->hdr.flags |= RXRPC_MORE_PACKETS;
+ txb->wire.flags |= RXRPC_MORE_PACKETS;
- ret = call->security->secure_packet(call, skb, skb->mark);
+ ret = call->security->secure_packet(call, txb);
if (ret < 0)
goto out;
- ret = rxrpc_queue_packet(rx, call, skb,
- !msg_data_left(msg) && !more,
- notify_end_tx);
- /* Should check for failure here */
- skb = NULL;
+ rxrpc_queue_packet(rx, call, txb, notify_end_tx);
+ txb = NULL;
}
} while (msg_data_left(msg) > 0);
@@ -451,12 +422,12 @@ success:
read_unlock_bh(&call->state_lock);
}
out:
- call->tx_pending = skb;
+ call->tx_pending = txb;
_leave(" = %d", ret);
return ret;
call_terminated:
- rxrpc_free_skb(skb, rxrpc_skb_freed);
+ rxrpc_put_txbuf(txb, rxrpc_txbuf_put_send_aborted);
_leave(" = %d", call->error);
return call->error;
@@ -645,7 +616,6 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
*/
int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
__releases(&rx->sk.sk_lock.slock)
- __releases(&call->user_mutex)
{
enum rxrpc_call_state state;
struct rxrpc_call *call;
diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c
index 580a5acffee7..0c827d5bb2b8 100644
--- a/net/rxrpc/skbuff.c
+++ b/net/rxrpc/skbuff.c
@@ -14,8 +14,7 @@
#include <net/af_rxrpc.h>
#include "ar-internal.h"
-#define is_tx_skb(skb) (rxrpc_skb(skb)->rx_flags & RXRPC_SKB_TX_BUFFER)
-#define select_skb_count(skb) (is_tx_skb(skb) ? &rxrpc_n_tx_skbs : &rxrpc_n_rx_skbs)
+#define select_skb_count(skb) (&rxrpc_n_rx_skbs)
/*
* Note the allocation or reception of a socket buffer.
@@ -24,8 +23,7 @@ void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
const void *here = __builtin_return_address(0);
int n = atomic_inc_return(select_skb_count(skb));
- trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n,
- rxrpc_skb(skb)->rx_flags, here);
+ trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here);
}
/*
@@ -36,8 +34,7 @@ void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
const void *here = __builtin_return_address(0);
if (skb) {
int n = atomic_read(select_skb_count(skb));
- trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n,
- rxrpc_skb(skb)->rx_flags, here);
+ trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here);
}
}
@@ -48,8 +45,7 @@ void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
const void *here = __builtin_return_address(0);
int n = atomic_inc_return(select_skb_count(skb));
- trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n,
- rxrpc_skb(skb)->rx_flags, here);
+ trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here);
skb_get(skb);
}
@@ -60,7 +56,7 @@ void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
const void *here = __builtin_return_address(0);
int n = atomic_inc_return(&rxrpc_n_rx_skbs);
- trace_rxrpc_skb(skb, op, 0, n, 0, here);
+ trace_rxrpc_skb(skb, op, 0, n, here);
}
/*
@@ -72,8 +68,7 @@ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
if (skb) {
int n;
n = atomic_dec_return(select_skb_count(skb));
- trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n,
- rxrpc_skb(skb)->rx_flags, here);
+ trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here);
kfree_skb(skb);
}
}
@@ -88,8 +83,7 @@ void rxrpc_purge_queue(struct sk_buff_head *list)
while ((skb = skb_dequeue((list))) != NULL) {
int n = atomic_dec_return(select_skb_count(skb));
trace_rxrpc_skb(skb, rxrpc_skb_purged,
- refcount_read(&skb->users), n,
- rxrpc_skb(skb)->rx_flags, here);
+ refcount_read(&skb->users), n, here);
kfree_skb(skb);
}
}
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 555e0910786b..cde3224a5cd2 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -14,7 +14,7 @@ static struct ctl_table_header *rxrpc_sysctl_reg_table;
static const unsigned int four = 4;
static const unsigned int max_backlog = RXRPC_BACKLOG_MAX - 1;
static const unsigned int n_65535 = 65535;
-static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1;
+static const unsigned int n_max_acks = 255;
static const unsigned long one_jiffy = 1;
static const unsigned long max_jiffies = MAX_JIFFY_OFFSET;
@@ -27,15 +27,6 @@ static const unsigned long max_jiffies = MAX_JIFFY_OFFSET;
static struct ctl_table rxrpc_sysctl_table[] = {
/* Values measured in milliseconds but used in jiffies */
{
- .procname = "req_ack_delay",
- .data = &rxrpc_requested_ack_delay,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_doulongvec_ms_jiffies_minmax,
- .extra1 = (void *)&one_jiffy,
- .extra2 = (void *)&max_jiffies,
- },
- {
.procname = "soft_ack_delay",
.data = &rxrpc_soft_ack_delay,
.maxlen = sizeof(unsigned long),
diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c
new file mode 100644
index 000000000000..96bfee89927b
--- /dev/null
+++ b/net/rxrpc/txbuf.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* RxRPC Tx data buffering.
+ *
+ * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include "ar-internal.h"
+
+static atomic_t rxrpc_txbuf_debug_ids;
+atomic_t rxrpc_nr_txbuf;
+
+/*
+ * Allocate and partially initialise a transmit buffer (struct rxrpc_txbuf).
+ */
+struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type,
+ gfp_t gfp)
+{
+ struct rxrpc_txbuf *txb;
+
+ txb = kmalloc(sizeof(*txb), gfp);
+ if (txb) {
+ INIT_LIST_HEAD(&txb->call_link);
+ INIT_LIST_HEAD(&txb->tx_link);
+ refcount_set(&txb->ref, 1);
+ txb->call = call;
+ txb->call_debug_id = call->debug_id;
+ txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids);
+ txb->space = sizeof(txb->data);
+ txb->len = 0;
+ txb->offset = 0;
+ txb->flags = 0;
+ txb->ack_why = 0;
+ txb->seq = call->tx_top + 1;
+ txb->wire.epoch = htonl(call->conn->proto.epoch);
+ txb->wire.cid = htonl(call->cid);
+ txb->wire.callNumber = htonl(call->call_id);
+ txb->wire.seq = htonl(txb->seq);
+ txb->wire.type = packet_type;
+ txb->wire.flags = call->conn->out_clientflag;
+ txb->wire.userStatus = 0;
+ txb->wire.securityIndex = call->security_ix;
+ txb->wire._rsvd = 0;
+ txb->wire.serviceId = htons(call->service_id);
+
+ trace_rxrpc_txbuf(txb->debug_id,
+ txb->call_debug_id, txb->seq, 1,
+ packet_type == RXRPC_PACKET_TYPE_DATA ?
+ rxrpc_txbuf_alloc_data :
+ rxrpc_txbuf_alloc_ack);
+ atomic_inc(&rxrpc_nr_txbuf);
+ }
+
+ return txb;
+}
+
+void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what)
+{
+ int r;
+
+ __refcount_inc(&txb->ref, &r);
+ trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, r + 1, what);
+}
+
+void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what)
+{
+ int r = refcount_read(&txb->ref);
+
+ trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, r, what);
+}
+
+static void rxrpc_free_txbuf(struct rcu_head *rcu)
+{
+ struct rxrpc_txbuf *txb = container_of(rcu, struct rxrpc_txbuf, rcu);
+
+ trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0,
+ rxrpc_txbuf_free);
+ kfree(txb);
+ atomic_dec(&rxrpc_nr_txbuf);
+}
+
+void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what)
+{
+ unsigned int debug_id, call_debug_id;
+ rxrpc_seq_t seq;
+ bool dead;
+ int r;
+
+ if (txb) {
+ debug_id = txb->debug_id;
+ call_debug_id = txb->call_debug_id;
+ seq = txb->seq;
+ dead = __refcount_dec_and_test(&txb->ref, &r);
+ trace_rxrpc_txbuf(debug_id, call_debug_id, seq, r - 1, what);
+ if (dead)
+ call_rcu(&txb->rcu, rxrpc_free_txbuf);
+ }
+}
+
+/*
+ * Shrink the transmit buffer by releasing buffers that have been hard-ACK'd.
+ */
+void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call)
+{
+ struct rxrpc_txbuf *txb;
+ rxrpc_seq_t hard_ack = smp_load_acquire(&call->acks_hard_ack);
+
+ _enter("%x/%x/%x", call->tx_bottom, call->acks_hard_ack, call->tx_top);
+
+ for (;;) {
+ spin_lock(&call->tx_lock);
+ txb = list_first_entry_or_null(&call->tx_buffer,
+ struct rxrpc_txbuf, call_link);
+ if (!txb)
+ break;
+ hard_ack = smp_load_acquire(&call->acks_hard_ack);
+ if (before(hard_ack, txb->seq))
+ break;
+
+ ASSERTCMP(txb->seq, ==, call->tx_bottom + 1);
+ call->tx_bottom++;
+ list_del_rcu(&txb->call_link);
+
+ trace_rxrpc_txqueue(call, rxrpc_txqueue_dequeue);
+
+ spin_unlock(&call->tx_lock);
+
+ rxrpc_put_txbuf(txb, rxrpc_txbuf_put_rotated);
+ }
+
+ spin_unlock(&call->tx_lock);
+}
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 1e8ab4749c6c..4662a6ce8a7e 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -976,7 +976,7 @@ config NET_ACT_TUNNEL_KEY
config NET_ACT_CT
tristate "connection tracking tc action"
- depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE
+ depends on NET_CLS_ACT && NF_CONNTRACK && (!NF_NAT || NF_NAT) && NF_FLOW_TABLE
help
Say Y here to allow sending the packets to conntrack module.
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 66b143bb04ac..d41002e4613f 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -61,7 +61,7 @@ static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a,
c = nf_ct_get(skb, &ctinfo);
if (c) {
- skb->mark = c->mark;
+ skb->mark = READ_ONCE(c->mark);
/* using overlimits stats to count how many packets marked */
ca->tcf_qstats.overlimits++;
goto out;
@@ -81,7 +81,7 @@ static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a,
c = nf_ct_tuplehash_to_ctrack(thash);
/* using overlimits stats to count how many packets marked */
ca->tcf_qstats.overlimits++;
- skb->mark = c->mark;
+ skb->mark = READ_ONCE(c->mark);
nf_ct_put(c);
out:
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index b38d91d6b249..dd5ae7551956 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -33,6 +33,7 @@
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/netfilter/nf_conntrack_act_ct.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
#include <uapi/linux/netfilter/nf_nat.h>
static struct workqueue_struct *act_ct_wq;
@@ -178,7 +179,7 @@ static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct,
entry = tcf_ct_flow_table_flow_action_get_next(action);
entry->id = FLOW_ACTION_CT_METADATA;
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
- entry->ct_metadata.mark = ct->mark;
+ entry->ct_metadata.mark = READ_ONCE(ct->mark);
#endif
ctinfo = dir == IP_CT_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
IP_CT_ESTABLISHED_REPLY;
@@ -345,11 +346,9 @@ static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
module_put(THIS_MODULE);
}
-static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
+static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft)
{
- struct tcf_ct_flow_table *ct_ft = params->ct_ft;
-
- if (refcount_dec_and_test(&params->ct_ft->ref)) {
+ if (refcount_dec_and_test(&ct_ft->ref)) {
rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
queue_rcu_work(act_ct_wq, &ct_ft->rwork);
@@ -657,7 +656,7 @@ struct tc_ct_action_net {
/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb,
- u16 zone_id, bool force)
+ struct tcf_ct_params *p)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
@@ -667,11 +666,19 @@ static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb,
return false;
if (!net_eq(net, read_pnet(&ct->ct_net)))
goto drop_ct;
- if (nf_ct_zone(ct)->id != zone_id)
+ if (nf_ct_zone(ct)->id != p->zone)
goto drop_ct;
+ if (p->helper) {
+ struct nf_conn_help *help;
+
+ help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
+ if (help && rcu_access_pointer(help->helper) != p->helper)
+ goto drop_ct;
+ }
/* Force conntrack entry direction. */
- if (force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
+ if ((p->ct_action & TCA_CT_ACT_FORCE) &&
+ CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
if (nf_ct_is_confirmed(ct))
nf_ct_kill(ct);
@@ -832,18 +839,30 @@ out_free:
return err;
}
-static void tcf_ct_params_free(struct rcu_head *head)
+static void tcf_ct_params_free(struct tcf_ct_params *params)
{
- struct tcf_ct_params *params = container_of(head,
- struct tcf_ct_params, rcu);
-
- tcf_ct_flow_table_put(params);
-
+ if (params->helper) {
+#if IS_ENABLED(CONFIG_NF_NAT)
+ if (params->ct_action & TCA_CT_ACT_NAT)
+ nf_nat_helper_put(params->helper);
+#endif
+ nf_conntrack_helper_put(params->helper);
+ }
+ if (params->ct_ft)
+ tcf_ct_flow_table_put(params->ct_ft);
if (params->tmpl)
nf_ct_put(params->tmpl);
kfree(params);
}
+static void tcf_ct_params_free_rcu(struct rcu_head *head)
+{
+ struct tcf_ct_params *params;
+
+ params = container_of(head, struct tcf_ct_params, rcu);
+ tcf_ct_params_free(params);
+}
+
#if IS_ENABLED(CONFIG_NF_NAT)
/* Modelled after nf_nat_ipv[46]_fn().
* range is only used for new, uninitialized NAT state.
@@ -936,9 +955,9 @@ static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask)
if (!mask)
return;
- new_mark = mark | (ct->mark & ~(mask));
- if (ct->mark != new_mark) {
- ct->mark = new_mark;
+ new_mark = mark | (READ_ONCE(ct->mark) & ~(mask));
+ if (READ_ONCE(ct->mark) != new_mark) {
+ WRITE_ONCE(ct->mark, new_mark);
if (nf_ct_is_confirmed(ct))
nf_conntrack_event_cache(IPCT_MARK, ct);
}
@@ -1023,13 +1042,14 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct net *net = dev_net(skb->dev);
- bool cached, commit, clear, force;
enum ip_conntrack_info ctinfo;
struct tcf_ct *c = to_ct(a);
struct nf_conn *tmpl = NULL;
struct nf_hook_state state;
+ bool cached, commit, clear;
int nh_ofs, err, retval;
struct tcf_ct_params *p;
+ bool add_helper = false;
bool skip_add = false;
bool defrag = false;
struct nf_conn *ct;
@@ -1040,7 +1060,6 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
retval = READ_ONCE(c->tcf_action);
commit = p->ct_action & TCA_CT_ACT_COMMIT;
clear = p->ct_action & TCA_CT_ACT_CLEAR;
- force = p->ct_action & TCA_CT_ACT_FORCE;
tmpl = p->tmpl;
tcf_lastuse_update(&c->tcf_tm);
@@ -1083,7 +1102,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
* actually run the packet through conntrack twice unless it's for a
* different zone.
*/
- cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
+ cached = tcf_ct_skb_nfct_cached(net, skb, p);
if (!cached) {
if (tcf_ct_flow_table_lookup(p, skb, family)) {
skip_add = true;
@@ -1116,6 +1135,22 @@ do_nat:
if (err != NF_ACCEPT)
goto drop;
+ if (!nf_ct_is_confirmed(ct) && commit && p->helper && !nfct_help(ct)) {
+ err = __nf_ct_try_assign_helper(ct, p->tmpl, GFP_ATOMIC);
+ if (err)
+ goto drop;
+ add_helper = true;
+ if (p->ct_action & TCA_CT_ACT_NAT && !nfct_seqadj(ct)) {
+ if (!nfct_seqadj_ext_add(ct))
+ goto drop;
+ }
+ }
+
+ if (nf_ct_is_confirmed(ct) ? ((!cached && !skip_add) || add_helper) : commit) {
+ if (nf_ct_helper(skb, ct, ctinfo, family) != NF_ACCEPT)
+ goto drop;
+ }
+
if (commit) {
tcf_ct_act_set_mark(ct, p->mark, p->mark_mask);
tcf_ct_act_set_labels(ct, p->labels, p->labels_mask);
@@ -1164,6 +1199,9 @@ static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
[TCA_CT_NAT_IPV6_MAX] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
[TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 },
[TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 },
+ [TCA_CT_HELPER_NAME] = { .type = NLA_STRING, .len = NF_CT_HELPER_NAME_LEN },
+ [TCA_CT_HELPER_FAMILY] = { .type = NLA_U8 },
+ [TCA_CT_HELPER_PROTO] = { .type = NLA_U8 },
};
static int tcf_ct_fill_params_nat(struct tcf_ct_params *p,
@@ -1253,8 +1291,9 @@ static int tcf_ct_fill_params(struct net *net,
{
struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id);
struct nf_conntrack_zone zone;
+ int err, family, proto, len;
struct nf_conn *tmpl;
- int err;
+ char *name;
p->zone = NF_CT_DEFAULT_ZONE_ID;
@@ -1315,10 +1354,31 @@ static int tcf_ct_fill_params(struct net *net,
NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template");
return -ENOMEM;
}
- __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
p->tmpl = tmpl;
+ if (tb[TCA_CT_HELPER_NAME]) {
+ name = nla_data(tb[TCA_CT_HELPER_NAME]);
+ len = nla_len(tb[TCA_CT_HELPER_NAME]);
+ if (len > 16 || name[len - 1] != '\0') {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to parse helper name.");
+ err = -EINVAL;
+ goto err;
+ }
+ family = tb[TCA_CT_HELPER_FAMILY] ? nla_get_u8(tb[TCA_CT_HELPER_FAMILY]) : AF_INET;
+ proto = tb[TCA_CT_HELPER_PROTO] ? nla_get_u8(tb[TCA_CT_HELPER_PROTO]) : IPPROTO_TCP;
+ err = nf_ct_add_helper(tmpl, name, family, proto,
+ p->ct_action & TCA_CT_ACT_NAT, &p->helper);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to add helper");
+ goto err;
+ }
+ }
+ __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
return 0;
+err:
+ nf_ct_put(p->tmpl);
+ p->tmpl = NULL;
+ return err;
}
static int tcf_ct_init(struct net *net, struct nlattr *nla,
@@ -1390,7 +1450,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
err = tcf_ct_flow_table_get(net, params);
if (err)
- goto cleanup_params;
+ goto cleanup;
spin_lock_bh(&c->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
@@ -1401,17 +1461,15 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
if (goto_ch)
tcf_chain_put_by_act(goto_ch);
if (params)
- call_rcu(&params->rcu, tcf_ct_params_free);
+ call_rcu(&params->rcu, tcf_ct_params_free_rcu);
return res;
-cleanup_params:
- if (params->tmpl)
- nf_ct_put(params->tmpl);
cleanup:
if (goto_ch)
tcf_chain_put_by_act(goto_ch);
- kfree(params);
+ if (params)
+ tcf_ct_params_free(params);
tcf_idr_release(*a, bind);
return err;
}
@@ -1423,7 +1481,7 @@ static void tcf_ct_cleanup(struct tc_action *a)
params = rcu_dereference_protected(c->params, 1);
if (params)
- call_rcu(&params->rcu, tcf_ct_params_free);
+ call_rcu(&params->rcu, tcf_ct_params_free_rcu);
}
static int tcf_ct_dump_key_val(struct sk_buff *skb,
@@ -1489,6 +1547,19 @@ static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p)
return 0;
}
+static int tcf_ct_dump_helper(struct sk_buff *skb, struct nf_conntrack_helper *helper)
+{
+ if (!helper)
+ return 0;
+
+ if (nla_put_string(skb, TCA_CT_HELPER_NAME, helper->name) ||
+ nla_put_u8(skb, TCA_CT_HELPER_FAMILY, helper->tuple.src.l3num) ||
+ nla_put_u8(skb, TCA_CT_HELPER_PROTO, helper->tuple.dst.protonum))
+ return -1;
+
+ return 0;
+}
+
static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
@@ -1541,6 +1612,9 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
if (tcf_ct_dump_nat(skb, p))
goto nla_put_failure;
+ if (tcf_ct_dump_helper(skb, p->helper))
+ goto nla_put_failure;
+
skip_dump:
if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index d4102f0a9abd..eaa02f098d1c 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -32,7 +32,7 @@ static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
{
u8 dscp, newdscp;
- newdscp = (((ct->mark & cp->dscpmask) >> cp->dscpmaskshift) << 2) &
+ newdscp = (((READ_ONCE(ct->mark) & cp->dscpmask) >> cp->dscpmaskshift) << 2) &
~INET_ECN_MASK;
switch (proto) {
@@ -72,7 +72,7 @@ static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
struct sk_buff *skb)
{
ca->stats_cpmark_set++;
- skb->mark = ct->mark & cp->cpmarkmask;
+ skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask;
}
static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a,
@@ -130,7 +130,7 @@ static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a,
}
if (cp->mode & CTINFO_MODE_DSCP)
- if (!cp->dscpstatemask || (ct->mark & cp->dscpstatemask))
+ if (!cp->dscpstatemask || (READ_ONCE(ct->mark) & cp->dscpstatemask))
tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto);
if (cp->mode & CTINFO_MODE_CPMARK)
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index abe1bcc5c797..62d682b96b88 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -25,7 +25,7 @@ static struct tc_action_ops act_gact_ops;
static int gact_net_rand(struct tcf_gact *gact)
{
smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */
- if (prandom_u32() % gact->tcfg_pval)
+ if (prandom_u32_max(gact->tcfg_pval))
return gact->tcf_action;
return gact->tcfg_paction;
}
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 5ba36f70e3a1..7a25477f5d99 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -168,7 +168,7 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
psample_group = rcu_dereference_bh(s->psample_group);
/* randomly sample packets according to rate */
- if (psample_group && (prandom_u32() % s->rate == 0)) {
+ if (psample_group && (prandom_u32_max(s->rate) == 0)) {
if (!skb_at_tc_ingress(skb)) {
md.in_ifindex = skb->skb_iif;
md.out_ifindex = skb->dev->ifindex;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 7f598784fd30..1710780c908a 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -148,6 +148,11 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
}
if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) {
+ if (is_tcf_skbedit_ingress(act_flags) &&
+ !(act_flags & TCA_ACT_FLAGS_SKIP_SW)) {
+ NL_SET_ERR_MSG_MOD(extack, "\"queue_mapping\" option on receive side is hardware only, use skip_sw");
+ return -EOPNOTSUPP;
+ }
flags |= SKBEDIT_F_QUEUE_MAPPING;
queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]);
}
@@ -374,9 +379,12 @@ static int tcf_skbedit_offload_act_setup(struct tc_action *act, void *entry_data
} else if (is_tcf_skbedit_priority(act)) {
entry->id = FLOW_ACTION_PRIORITY;
entry->priority = tcf_skbedit_priority(act);
- } else if (is_tcf_skbedit_queue_mapping(act)) {
- NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"queue_mapping\" option is used");
+ } else if (is_tcf_skbedit_tx_queue_mapping(act)) {
+ NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"queue_mapping\" option is used on transmit side");
return -EOPNOTSUPP;
+ } else if (is_tcf_skbedit_rx_queue_mapping(act)) {
+ entry->id = FLOW_ACTION_RX_QUEUE_MAPPING;
+ entry->rx_queue = tcf_skbedit_rx_queue_mapping(act);
} else if (is_tcf_skbedit_inheritdsfield(act)) {
NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"inheritdsfield\" option is used");
return -EOPNOTSUPP;
@@ -394,6 +402,8 @@ static int tcf_skbedit_offload_act_setup(struct tc_action *act, void *entry_data
fl_action->id = FLOW_ACTION_PTYPE;
else if (is_tcf_skbedit_priority(act))
fl_action->id = FLOW_ACTION_PRIORITY;
+ else if (is_tcf_skbedit_rx_queue_mapping(act))
+ fl_action->id = FLOW_ACTION_RX_QUEUE_MAPPING;
else
return -EOPNOTSUPP;
}
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 50566db45949..23d1cfa4f58c 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1953,6 +1953,11 @@ static void tfilter_put(struct tcf_proto *tp, void *fh)
tp->ops->put(tp, fh);
}
+static bool is_qdisc_ingress(__u32 classid)
+{
+ return (TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS));
+}
+
static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
struct netlink_ext_ack *extack)
{
@@ -2144,6 +2149,8 @@ replay:
flags |= TCA_ACT_FLAGS_REPLACE;
if (!rtnl_held)
flags |= TCA_ACT_FLAGS_NO_RTNL;
+ if (is_qdisc_ingress(parent))
+ flags |= TCA_ACT_FLAGS_AT_INGRESS;
err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
flags, extack);
if (err == 0) {
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index c98af0ada706..4a27dfb1ba0f 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1099,12 +1099,13 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
skip:
if (!ingress) {
- notify_and_destroy(net, skb, n, classid,
- rtnl_dereference(dev->qdisc), new);
+ old = rtnl_dereference(dev->qdisc);
if (new && !new->ops->attach)
qdisc_refcount_inc(new);
rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
+ notify_and_destroy(net, skb, n, classid, old, new);
+
if (new && new->ops->attach)
new->ops->attach(new);
} else {
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 55c6879d2c7e..3ed0c3342189 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -573,7 +573,7 @@ static bool cobalt_should_drop(struct cobalt_vars *vars,
/* Simple BLUE implementation. Lack of ECN is deliberate. */
if (vars->p_drop)
- drop |= (prandom_u32() < vars->p_drop);
+ drop |= (get_random_u32() < vars->p_drop);
/* Overload the drop_next field as an activity timeout */
if (!vars->count)
@@ -2092,11 +2092,11 @@ retry:
WARN_ON(host_load > CAKE_QUEUES);
- /* The shifted prandom_u32() is a way to apply dithering to
- * avoid accumulating roundoff errors
+ /* The get_random_u16() is a way to apply dithering to avoid
+ * accumulating roundoff errors
*/
flow->deficit += (b->flow_quantum * quantum_div[host_load] +
- (prandom_u32() >> 16)) >> 16;
+ get_random_u16()) >> 16;
list_move_tail(&flow->flowchain, &b->old_flows);
goto retry;
@@ -2224,8 +2224,12 @@ retry:
static void cake_reset(struct Qdisc *sch)
{
+ struct cake_sched_data *q = qdisc_priv(sch);
u32 c;
+ if (!q->tins)
+ return;
+
for (c = 0; c < CAKE_MAX_TINS; c++)
cake_clear_tin(sch, c);
}
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 99d318b60568..8c4fee063436 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -478,24 +478,26 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt,
if (opt) {
err = fq_codel_change(sch, opt, extack);
if (err)
- return err;
+ goto init_failure;
}
err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
if (err)
- return err;
+ goto init_failure;
if (!q->flows) {
q->flows = kvcalloc(q->flows_cnt,
sizeof(struct fq_codel_flow),
GFP_KERNEL);
- if (!q->flows)
- return -ENOMEM;
-
+ if (!q->flows) {
+ err = -ENOMEM;
+ goto init_failure;
+ }
q->backlogs = kvcalloc(q->flows_cnt, sizeof(u32), GFP_KERNEL);
- if (!q->backlogs)
- return -ENOMEM;
-
+ if (!q->backlogs) {
+ err = -ENOMEM;
+ goto alloc_failure;
+ }
for (i = 0; i < q->flows_cnt; i++) {
struct fq_codel_flow *flow = q->flows + i;
@@ -508,6 +510,13 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt,
else
sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
+
+alloc_failure:
+ kvfree(q->flows);
+ q->flows = NULL;
+init_failure:
+ q->flows_cnt = 0;
+ return err;
}
static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 18f4273a835b..fb00ac40ecb7 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -171,7 +171,7 @@ static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
static void init_crandom(struct crndstate *state, unsigned long rho)
{
state->rho = rho;
- state->last = prandom_u32();
+ state->last = get_random_u32();
}
/* get_crandom - correlated random number generator
@@ -184,9 +184,9 @@ static u32 get_crandom(struct crndstate *state)
unsigned long answer;
if (!state || state->rho == 0) /* no correlation */
- return prandom_u32();
+ return get_random_u32();
- value = prandom_u32();
+ value = get_random_u32();
rho = (u64)state->rho + 1;
answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
state->last = answer;
@@ -200,7 +200,7 @@ static u32 get_crandom(struct crndstate *state)
static bool loss_4state(struct netem_sched_data *q)
{
struct clgstate *clg = &q->clg;
- u32 rnd = prandom_u32();
+ u32 rnd = get_random_u32();
/*
* Makes a comparison between rnd and the transition
@@ -268,15 +268,15 @@ static bool loss_gilb_ell(struct netem_sched_data *q)
switch (clg->state) {
case GOOD_STATE:
- if (prandom_u32() < clg->a1)
+ if (get_random_u32() < clg->a1)
clg->state = BAD_STATE;
- if (prandom_u32() < clg->a4)
+ if (get_random_u32() < clg->a4)
return true;
break;
case BAD_STATE:
- if (prandom_u32() < clg->a2)
+ if (get_random_u32() < clg->a2)
clg->state = GOOD_STATE;
- if (prandom_u32() > clg->a3)
+ if (get_random_u32() > clg->a3)
return true;
}
@@ -513,8 +513,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
goto finish_segs;
}
- skb->data[prandom_u32() % skb_headlen(skb)] ^=
- 1<<(prandom_u32() % 8);
+ skb->data[prandom_u32_max(skb_headlen(skb))] ^=
+ 1<<prandom_u32_max(8);
}
if (unlikely(sch->q.qlen >= sch->limit)) {
@@ -632,7 +632,7 @@ static void get_slot_next(struct netem_sched_data *q, u64 now)
if (!q->slot_dist)
next_delay = q->slot_config.min_delay +
- (prandom_u32() *
+ (get_random_u32() *
(q->slot_config.max_delay -
q->slot_config.min_delay) >> 32);
else
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index 974038ba6c7b..265c238047a4 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -72,7 +72,7 @@ bool pie_drop_early(struct Qdisc *sch, struct pie_params *params,
if (vars->accu_prob >= (MAX_PROB / 2) * 17)
return true;
- prandom_bytes(&rnd, 8);
+ get_random_bytes(&rnd, 8);
if ((rnd >> BITS_PER_BYTE) < local_prob) {
vars->accu_prob = 0;
return true;
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index a5a401f93c1a..98129324e157 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -72,6 +72,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
+ unsigned int len;
int ret;
q->vars.qavg = red_calc_qavg(&q->parms,
@@ -126,9 +127,10 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
break;
}
+ len = qdisc_pkt_len(skb);
ret = qdisc_enqueue(skb, child, to_free);
if (likely(ret == NET_XMIT_SUCCESS)) {
- qdisc_qstats_backlog_inc(sch, skb);
+ sch->qstats.backlog += len;
sch->q.qlen++;
} else if (net_xmit_drop_count(ret)) {
q->stats.pdrop++;
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index e2389fa3cff8..1871a1c0224d 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -379,7 +379,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
goto enqueue;
}
- r = prandom_u32() & SFB_MAX_PROB;
+ r = get_random_u16() & SFB_MAX_PROB;
if (unlikely(r < p_min)) {
if (unlikely(p_min > SFB_MAX_PROB / 2)) {
@@ -455,7 +455,8 @@ static void sfb_reset(struct Qdisc *sch)
{
struct sfb_sched_data *q = qdisc_priv(sch);
- qdisc_reset(q->qdisc);
+ if (likely(q->qdisc))
+ qdisc_reset(q->qdisc);
q->slot = 0;
q->double_buffering = false;
sfb_zero_all_buckets(q);
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 3460abceba44..63ba5551c13f 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -226,8 +226,7 @@ static struct sctp_association *sctp_association_init(
/* Create an output queue. */
sctp_outq_init(asoc, &asoc->outqueue);
- if (!sctp_ulpq_init(&asoc->ulpq, asoc))
- goto fail_init;
+ sctp_ulpq_init(&asoc->ulpq, asoc);
if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, 0, gfp))
goto stream_free;
@@ -277,7 +276,6 @@ static struct sctp_association *sctp_association_init(
stream_free:
sctp_stream_free(&asoc->stream);
-fail_init:
sock_put(asoc->base.sk);
sctp_endpoint_put(asoc->ep);
return NULL;
diff --git a/net/sctp/diag.c b/net/sctp/diag.c
index d9c6d8f30f09..a557009e9832 100644
--- a/net/sctp/diag.c
+++ b/net/sctp/diag.c
@@ -426,6 +426,7 @@ static int sctp_diag_dump_one(struct netlink_callback *cb,
struct net *net = sock_net(skb->sk);
const struct nlmsghdr *nlh = cb->nlh;
union sctp_addr laddr, paddr;
+ int dif = req->id.idiag_if;
struct sctp_comm_param commp = {
.skb = skb,
.r = req,
@@ -454,7 +455,7 @@ static int sctp_diag_dump_one(struct netlink_callback *cb,
}
return sctp_transport_lookup_process(sctp_sock_dump_one,
- net, &laddr, &paddr, &commp);
+ net, &laddr, &paddr, &commp, dif);
}
static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index efffde7f2328..7e77b450697c 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -246,12 +246,15 @@ void sctp_endpoint_put(struct sctp_endpoint *ep)
/* Is this the endpoint we are looking for? */
struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep,
struct net *net,
- const union sctp_addr *laddr)
+ const union sctp_addr *laddr,
+ int dif, int sdif)
{
+ int bound_dev_if = READ_ONCE(ep->base.sk->sk_bound_dev_if);
struct sctp_endpoint *retval = NULL;
- if ((htons(ep->base.bind_addr.port) == laddr->v4.sin_port) &&
- net_eq(ep->base.net, net)) {
+ if (net_eq(ep->base.net, net) &&
+ sctp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif) &&
+ (htons(ep->base.bind_addr.port) == laddr->v4.sin_port)) {
if (sctp_bind_addr_match(&ep->base.bind_addr, laddr,
sctp_sk(ep->base.sk)))
retval = ep;
@@ -298,6 +301,7 @@ out:
bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep,
const union sctp_addr *paddr)
{
+ int bound_dev_if = READ_ONCE(ep->base.sk->sk_bound_dev_if);
struct sctp_sockaddr_entry *addr;
struct net *net = ep->base.net;
struct sctp_bind_addr *bp;
@@ -307,7 +311,8 @@ bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep,
* so the address_list can not change.
*/
list_for_each_entry(addr, &bp->address_list, list) {
- if (sctp_has_association(net, &addr->a, paddr))
+ if (sctp_has_association(net, &addr->a, paddr,
+ bound_dev_if, bound_dev_if))
return true;
}
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 4f43afa8678f..bf70371301ff 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -50,16 +50,19 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net,
struct sk_buff *skb,
const union sctp_addr *paddr,
const union sctp_addr *laddr,
- struct sctp_transport **transportp);
+ struct sctp_transport **transportp,
+ int dif, int sdif);
static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
struct net *net, struct sk_buff *skb,
const union sctp_addr *laddr,
- const union sctp_addr *daddr);
+ const union sctp_addr *daddr,
+ int dif, int sdif);
static struct sctp_association *__sctp_lookup_association(
struct net *net,
const union sctp_addr *local,
const union sctp_addr *peer,
- struct sctp_transport **pt);
+ struct sctp_transport **pt,
+ int dif, int sdif);
static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb);
@@ -92,11 +95,11 @@ int sctp_rcv(struct sk_buff *skb)
struct sctp_chunk *chunk;
union sctp_addr src;
union sctp_addr dest;
- int bound_dev_if;
int family;
struct sctp_af *af;
struct net *net = dev_net(skb->dev);
bool is_gso = skb_is_gso(skb) && skb_is_gso_sctp(skb);
+ int dif, sdif;
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
@@ -141,6 +144,8 @@ int sctp_rcv(struct sk_buff *skb)
/* Initialize local addresses for lookups. */
af->from_skb(&src, skb, 1);
af->from_skb(&dest, skb, 0);
+ dif = af->skb_iif(skb);
+ sdif = af->skb_sdif(skb);
/* If the packet is to or from a non-unicast address,
* silently discard the packet.
@@ -157,36 +162,16 @@ int sctp_rcv(struct sk_buff *skb)
!af->addr_valid(&dest, NULL, skb))
goto discard_it;
- asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport);
+ asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport, dif, sdif);
if (!asoc)
- ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src);
+ ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src, dif, sdif);
/* Retrieve the common input handling substructure. */
rcvr = asoc ? &asoc->base : &ep->base;
sk = rcvr->sk;
/*
- * If a frame arrives on an interface and the receiving socket is
- * bound to another interface, via SO_BINDTODEVICE, treat it as OOTB
- */
- bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
- if (bound_dev_if && (bound_dev_if != af->skb_iif(skb))) {
- if (transport) {
- sctp_transport_put(transport);
- asoc = NULL;
- transport = NULL;
- } else {
- sctp_endpoint_put(ep);
- ep = NULL;
- }
- sk = net->sctp.ctl_sock;
- ep = sctp_sk(sk)->ep;
- sctp_endpoint_hold(ep);
- rcvr = &ep->base;
- }
-
- /*
* RFC 2960, 8.4 - Handle "Out of the blue" Packets.
* An SCTP packet is called an "out of the blue" (OOTB)
* packet if it is correctly formed, i.e., passed the
@@ -485,6 +470,8 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb,
struct sctp_association *asoc;
struct sctp_transport *transport = NULL;
__u32 vtag = ntohl(sctphdr->vtag);
+ int sdif = inet_sdif(skb);
+ int dif = inet_iif(skb);
*app = NULL; *tpp = NULL;
@@ -500,7 +487,7 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb,
/* Look for an association that matches the incoming ICMP error
* packet.
*/
- asoc = __sctp_lookup_association(net, &saddr, &daddr, &transport);
+ asoc = __sctp_lookup_association(net, &saddr, &daddr, &transport, dif, sdif);
if (!asoc)
return NULL;
@@ -850,7 +837,8 @@ static inline __u32 sctp_hashfn(const struct net *net, __be16 lport,
static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
struct net *net, struct sk_buff *skb,
const union sctp_addr *laddr,
- const union sctp_addr *paddr)
+ const union sctp_addr *paddr,
+ int dif, int sdif)
{
struct sctp_hashbucket *head;
struct sctp_endpoint *ep;
@@ -863,7 +851,7 @@ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
head = &sctp_ep_hashtable[hash];
read_lock(&head->lock);
sctp_for_each_hentry(ep, &head->chain) {
- if (sctp_endpoint_is_match(ep, net, laddr))
+ if (sctp_endpoint_is_match(ep, net, laddr, dif, sdif))
goto hit;
}
@@ -990,14 +978,26 @@ void sctp_unhash_transport(struct sctp_transport *t)
sctp_hash_params);
}
+bool sctp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif)
+{
+ bool l3mdev_accept = true;
+
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ l3mdev_accept = !!READ_ONCE(net->sctp.l3mdev_accept);
+#endif
+ return inet_bound_dev_eq(l3mdev_accept, bound_dev_if, dif, sdif);
+}
+
/* return a transport with holding it */
struct sctp_transport *sctp_addrs_lookup_transport(
struct net *net,
const union sctp_addr *laddr,
- const union sctp_addr *paddr)
+ const union sctp_addr *paddr,
+ int dif, int sdif)
{
struct rhlist_head *tmp, *list;
struct sctp_transport *t;
+ int bound_dev_if;
struct sctp_hash_cmp_arg arg = {
.paddr = paddr,
.net = net,
@@ -1011,7 +1011,9 @@ struct sctp_transport *sctp_addrs_lookup_transport(
if (!sctp_transport_hold(t))
continue;
- if (sctp_bind_addr_match(&t->asoc->base.bind_addr,
+ bound_dev_if = READ_ONCE(t->asoc->base.sk->sk_bound_dev_if);
+ if (sctp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif) &&
+ sctp_bind_addr_match(&t->asoc->base.bind_addr,
laddr, sctp_sk(t->asoc->base.sk)))
return t;
sctp_transport_put(t);
@@ -1048,12 +1050,13 @@ static struct sctp_association *__sctp_lookup_association(
struct net *net,
const union sctp_addr *local,
const union sctp_addr *peer,
- struct sctp_transport **pt)
+ struct sctp_transport **pt,
+ int dif, int sdif)
{
struct sctp_transport *t;
struct sctp_association *asoc = NULL;
- t = sctp_addrs_lookup_transport(net, local, peer);
+ t = sctp_addrs_lookup_transport(net, local, peer, dif, sdif);
if (!t)
goto out;
@@ -1069,12 +1072,13 @@ static
struct sctp_association *sctp_lookup_association(struct net *net,
const union sctp_addr *laddr,
const union sctp_addr *paddr,
- struct sctp_transport **transportp)
+ struct sctp_transport **transportp,
+ int dif, int sdif)
{
struct sctp_association *asoc;
rcu_read_lock();
- asoc = __sctp_lookup_association(net, laddr, paddr, transportp);
+ asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif);
rcu_read_unlock();
return asoc;
@@ -1083,11 +1087,12 @@ struct sctp_association *sctp_lookup_association(struct net *net,
/* Is there an association matching the given local and peer addresses? */
bool sctp_has_association(struct net *net,
const union sctp_addr *laddr,
- const union sctp_addr *paddr)
+ const union sctp_addr *paddr,
+ int dif, int sdif)
{
struct sctp_transport *transport;
- if (sctp_lookup_association(net, laddr, paddr, &transport)) {
+ if (sctp_lookup_association(net, laddr, paddr, &transport, dif, sdif)) {
sctp_transport_put(transport);
return true;
}
@@ -1115,7 +1120,8 @@ bool sctp_has_association(struct net *net,
*/
static struct sctp_association *__sctp_rcv_init_lookup(struct net *net,
struct sk_buff *skb,
- const union sctp_addr *laddr, struct sctp_transport **transportp)
+ const union sctp_addr *laddr, struct sctp_transport **transportp,
+ int dif, int sdif)
{
struct sctp_association *asoc;
union sctp_addr addr;
@@ -1154,7 +1160,7 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net,
if (!af->from_addr_param(paddr, params.addr, sh->source, 0))
continue;
- asoc = __sctp_lookup_association(net, laddr, paddr, transportp);
+ asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif);
if (asoc)
return asoc;
}
@@ -1181,7 +1187,8 @@ static struct sctp_association *__sctp_rcv_asconf_lookup(
struct sctp_chunkhdr *ch,
const union sctp_addr *laddr,
__be16 peer_port,
- struct sctp_transport **transportp)
+ struct sctp_transport **transportp,
+ int dif, int sdif)
{
struct sctp_addip_chunk *asconf = (struct sctp_addip_chunk *)ch;
struct sctp_af *af;
@@ -1201,7 +1208,7 @@ static struct sctp_association *__sctp_rcv_asconf_lookup(
if (!af->from_addr_param(&paddr, param, peer_port, 0))
return NULL;
- return __sctp_lookup_association(net, laddr, &paddr, transportp);
+ return __sctp_lookup_association(net, laddr, &paddr, transportp, dif, sdif);
}
@@ -1217,7 +1224,8 @@ static struct sctp_association *__sctp_rcv_asconf_lookup(
static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net,
struct sk_buff *skb,
const union sctp_addr *laddr,
- struct sctp_transport **transportp)
+ struct sctp_transport **transportp,
+ int dif, int sdif)
{
struct sctp_association *asoc = NULL;
struct sctp_chunkhdr *ch;
@@ -1260,7 +1268,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net,
asoc = __sctp_rcv_asconf_lookup(
net, ch, laddr,
sctp_hdr(skb)->source,
- transportp);
+ transportp, dif, sdif);
break;
default:
break;
@@ -1285,7 +1293,8 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net,
static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net,
struct sk_buff *skb,
const union sctp_addr *laddr,
- struct sctp_transport **transportp)
+ struct sctp_transport **transportp,
+ int dif, int sdif)
{
struct sctp_chunkhdr *ch;
@@ -1309,9 +1318,9 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net,
/* If this is INIT/INIT-ACK look inside the chunk too. */
if (ch->type == SCTP_CID_INIT || ch->type == SCTP_CID_INIT_ACK)
- return __sctp_rcv_init_lookup(net, skb, laddr, transportp);
+ return __sctp_rcv_init_lookup(net, skb, laddr, transportp, dif, sdif);
- return __sctp_rcv_walk_lookup(net, skb, laddr, transportp);
+ return __sctp_rcv_walk_lookup(net, skb, laddr, transportp, dif, sdif);
}
/* Lookup an association for an inbound skb. */
@@ -1319,11 +1328,12 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net,
struct sk_buff *skb,
const union sctp_addr *paddr,
const union sctp_addr *laddr,
- struct sctp_transport **transportp)
+ struct sctp_transport **transportp,
+ int dif, int sdif)
{
struct sctp_association *asoc;
- asoc = __sctp_lookup_association(net, laddr, paddr, transportp);
+ asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif);
if (asoc)
goto out;
@@ -1331,7 +1341,7 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net,
* SCTP Implementors Guide, 2.18 Handling of address
* parameters within the INIT or INIT-ACK.
*/
- asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp);
+ asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp, dif, sdif);
if (asoc)
goto out;
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index d081858c2d07..097bd60ce964 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -680,9 +680,11 @@ static int sctp_v6_is_any(const union sctp_addr *addr)
/* Should this be available for binding? */
static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp)
{
- int type;
- struct net *net = sock_net(&sp->inet.sk);
const struct in6_addr *in6 = (const struct in6_addr *)&addr->v6.sin6_addr;
+ struct sock *sk = &sp->inet.sk;
+ struct net *net = sock_net(sk);
+ struct net_device *dev = NULL;
+ int type;
type = ipv6_addr_type(in6);
if (IPV6_ADDR_ANY == type)
@@ -696,8 +698,14 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp)
if (!(type & IPV6_ADDR_UNICAST))
return 0;
+ if (sk->sk_bound_dev_if) {
+ dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
+ if (!dev)
+ return 0;
+ }
+
return ipv6_can_nonlocal_bind(net, &sp->inet) ||
- ipv6_chk_addr(net, in6, NULL, 0);
+ ipv6_chk_addr(net, in6, dev, 0);
}
/* This function checks if the address is a valid address to be used for
@@ -834,7 +842,12 @@ static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr)
/* Where did this skb come from? */
static int sctp_v6_skb_iif(const struct sk_buff *skb)
{
- return IP6CB(skb)->iif;
+ return inet6_iif(skb);
+}
+
+static int sctp_v6_skb_sdif(const struct sk_buff *skb)
+{
+ return inet6_sdif(skb);
}
/* Was this packet marked by Explicit Congestion Notification? */
@@ -1134,6 +1147,7 @@ static struct sctp_af sctp_af_inet6 = {
.is_any = sctp_v6_is_any,
.available = sctp_v6_available,
.skb_iif = sctp_v6_skb_iif,
+ .skb_sdif = sctp_v6_skb_sdif,
.is_ce = sctp_v6_is_ce,
.seq_dump_addr = sctp_v6_seq_dump_addr,
.ecn_capable = sctp_v6_ecn_capable,
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index e213aaf45d67..20831079fb09 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -384,6 +384,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
{
struct sctp_outq *q = &asoc->outqueue;
struct sctp_chunk *chk, *temp;
+ struct sctp_stream_out *sout;
q->sched->unsched_all(&asoc->stream);
@@ -398,12 +399,14 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
sctp_sched_dequeue_common(q, chk);
asoc->sent_cnt_removable--;
asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
- if (chk->sinfo.sinfo_stream < asoc->stream.outcnt) {
- struct sctp_stream_out *streamout =
- SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream);
- streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
- }
+ sout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream);
+ sout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
+
+ /* clear out_curr if all frag chunks are pruned */
+ if (asoc->stream.out_curr == sout &&
+ list_is_last(&chk->frag_list, &chk->msg->chunks))
+ asoc->stream.out_curr = NULL;
msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk);
sctp_chunk_free(chk);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index bcd3384ab07a..909a89a1cff4 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -351,10 +351,13 @@ static int sctp_v4_addr_valid(union sctp_addr *addr,
/* Should this be available for binding? */
static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp)
{
- struct net *net = sock_net(&sp->inet.sk);
- int ret = inet_addr_type(net, addr->v4.sin_addr.s_addr);
-
+ struct sock *sk = &sp->inet.sk;
+ struct net *net = sock_net(sk);
+ int tb_id = RT_TABLE_LOCAL;
+ int ret;
+ tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ?: tb_id;
+ ret = inet_addr_type_table(net, addr->v4.sin_addr.s_addr, tb_id);
if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) &&
ret != RTN_LOCAL &&
!sp->inet.freebind &&
@@ -564,6 +567,11 @@ static int sctp_v4_skb_iif(const struct sk_buff *skb)
return inet_iif(skb);
}
+static int sctp_v4_skb_sdif(const struct sk_buff *skb)
+{
+ return inet_sdif(skb);
+}
+
/* Was this packet marked by Explicit Congestion Notification? */
static int sctp_v4_is_ce(const struct sk_buff *skb)
{
@@ -1182,6 +1190,7 @@ static struct sctp_af sctp_af_inet = {
.available = sctp_v4_available,
.scope = sctp_v4_scope,
.skb_iif = sctp_v4_skb_iif,
+ .skb_sdif = sctp_v4_skb_sdif,
.is_ce = sctp_v4_is_ce,
.seq_dump_addr = sctp_v4_seq_dump_addr,
.ecn_capable = sctp_v4_ecn_capable,
@@ -1385,6 +1394,10 @@ static int __net_init sctp_defaults_init(struct net *net)
/* Initialize maximum autoclose timeout. */
net->sctp.max_autoclose = INT_MAX / HZ;
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ net->sctp.l3mdev_accept = 1;
+#endif
+
status = sctp_sysctl_net_register(net);
if (status)
goto err_sysctl_register;
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index f6ee7f4040c1..ce5426171206 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -4044,7 +4044,7 @@ enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net,
(void *)err_param, commands);
if (last_asconf) {
- addip_hdr = (struct sctp_addiphdr *)last_asconf->subh.addip_hdr;
+ addip_hdr = last_asconf->subh.addip_hdr;
sent_serial = ntohl(addip_hdr->serial);
} else {
sent_serial = asoc->addip_serial - 1;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 171f1a35d205..5acbdf0d38f3 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5098,13 +5098,17 @@ static void sctp_destroy_sock(struct sock *sk)
}
/* Triggered when there are no references on the socket anymore */
-static void sctp_destruct_sock(struct sock *sk)
+static void sctp_destruct_common(struct sock *sk)
{
struct sctp_sock *sp = sctp_sk(sk);
/* Free up the HMAC transform. */
crypto_free_shash(sp->hmac);
+}
+static void sctp_destruct_sock(struct sock *sk)
+{
+ sctp_destruct_common(sk);
inet_sock_destruct(sk);
}
@@ -5311,14 +5315,14 @@ EXPORT_SYMBOL_GPL(sctp_for_each_endpoint);
int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net,
const union sctp_addr *laddr,
- const union sctp_addr *paddr, void *p)
+ const union sctp_addr *paddr, void *p, int dif)
{
struct sctp_transport *transport;
struct sctp_endpoint *ep;
int err = -ENOENT;
rcu_read_lock();
- transport = sctp_addrs_lookup_transport(net, laddr, paddr);
+ transport = sctp_addrs_lookup_transport(net, laddr, paddr, dif, dif);
if (!transport) {
rcu_read_unlock();
return err;
@@ -8319,7 +8323,7 @@ static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
- rover = prandom_u32() % remaining + low;
+ rover = prandom_u32_max(remaining) + low;
do {
rover++;
@@ -8394,6 +8398,7 @@ pp_found:
* in an endpoint.
*/
sk_for_each_bound(sk2, &pp->owner) {
+ int bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if);
struct sctp_sock *sp2 = sctp_sk(sk2);
struct sctp_endpoint *ep2 = sp2->ep;
@@ -8404,7 +8409,9 @@ pp_found:
uid_eq(uid, sock_i_uid(sk2))))
continue;
- if (sctp_bind_addr_conflict(&ep2->base.bind_addr,
+ if ((!sk->sk_bound_dev_if || !bound_dev_if2 ||
+ sk->sk_bound_dev_if == bound_dev_if2) &&
+ sctp_bind_addr_conflict(&ep2->base.bind_addr,
addr, sp2, sp)) {
ret = 1;
goto fail_unlock;
@@ -9427,7 +9434,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
sctp_sk(newsk)->reuse = sp->reuse;
newsk->sk_shutdown = sk->sk_shutdown;
- newsk->sk_destruct = sctp_destruct_sock;
+ newsk->sk_destruct = sk->sk_destruct;
newsk->sk_family = sk->sk_family;
newsk->sk_protocol = IPPROTO_SCTP;
newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
@@ -9448,7 +9455,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
newinet->inet_rcv_saddr = inet->inet_rcv_saddr;
newinet->inet_dport = htons(asoc->peer.port);
newinet->pmtudisc = inet->pmtudisc;
- newinet->inet_id = prandom_u32();
+ newinet->inet_id = get_random_u16();
newinet->uc_ttl = inet->uc_ttl;
newinet->mc_loop = 1;
@@ -9662,11 +9669,20 @@ struct proto sctp_prot = {
#if IS_ENABLED(CONFIG_IPV6)
-#include <net/transp_v6.h>
-static void sctp_v6_destroy_sock(struct sock *sk)
+static void sctp_v6_destruct_sock(struct sock *sk)
+{
+ sctp_destruct_common(sk);
+ inet6_sock_destruct(sk);
+}
+
+static int sctp_v6_init_sock(struct sock *sk)
{
- sctp_destroy_sock(sk);
- inet6_destroy_sock(sk);
+ int ret = sctp_init_sock(sk);
+
+ if (!ret)
+ sk->sk_destruct = sctp_v6_destruct_sock;
+
+ return ret;
}
struct proto sctpv6_prot = {
@@ -9676,8 +9692,8 @@ struct proto sctpv6_prot = {
.disconnect = sctp_disconnect,
.accept = sctp_accept,
.ioctl = sctp_ioctl,
- .init = sctp_init_sock,
- .destroy = sctp_v6_destroy_sock,
+ .init = sctp_v6_init_sock,
+ .destroy = sctp_destroy_sock,
.shutdown = sctp_shutdown,
.setsockopt = sctp_setsockopt,
.getsockopt = sctp_getsockopt,
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index ef9fceadef8d..ee6514af830f 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -52,6 +52,19 @@ static void sctp_stream_shrink_out(struct sctp_stream *stream, __u16 outcnt)
}
}
+static void sctp_stream_free_ext(struct sctp_stream *stream, __u16 sid)
+{
+ struct sctp_sched_ops *sched;
+
+ if (!SCTP_SO(stream, sid)->ext)
+ return;
+
+ sched = sctp_sched_ops_from_stream(stream);
+ sched->free_sid(stream, sid);
+ kfree(SCTP_SO(stream, sid)->ext);
+ SCTP_SO(stream, sid)->ext = NULL;
+}
+
/* Migrates chunks from stream queues to new stream queues if needed,
* but not across associations. Also, removes those chunks to streams
* higher than the new max.
@@ -70,16 +83,14 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream,
* sctp_stream_update will swap ->out pointers.
*/
for (i = 0; i < outcnt; i++) {
- kfree(SCTP_SO(new, i)->ext);
+ sctp_stream_free_ext(new, i);
SCTP_SO(new, i)->ext = SCTP_SO(stream, i)->ext;
SCTP_SO(stream, i)->ext = NULL;
}
}
- for (i = outcnt; i < stream->outcnt; i++) {
- kfree(SCTP_SO(stream, i)->ext);
- SCTP_SO(stream, i)->ext = NULL;
- }
+ for (i = outcnt; i < stream->outcnt; i++)
+ sctp_stream_free_ext(stream, i);
}
static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
@@ -174,9 +185,9 @@ void sctp_stream_free(struct sctp_stream *stream)
struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
int i;
- sched->free(stream);
+ sched->unsched_all(stream);
for (i = 0; i < stream->outcnt; i++)
- kfree(SCTP_SO(stream, i)->ext);
+ sctp_stream_free_ext(stream, i);
genradix_free(&stream->out);
genradix_free(&stream->in);
}
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index bb22b71df7a3..94727feb07b3 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -490,11 +490,8 @@ static int sctp_enqueue_event(struct sctp_ulpq *ulpq,
if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe))
goto out_free;
- if (skb_list)
- skb_queue_splice_tail_init(skb_list,
- &sk->sk_receive_queue);
- else
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_queue_splice_tail_init(skb_list,
+ &sk->sk_receive_queue);
if (!sp->data_ready_signalled) {
sp->data_ready_signalled = 1;
@@ -504,10 +501,7 @@ static int sctp_enqueue_event(struct sctp_ulpq *ulpq,
return 1;
out_free:
- if (skb_list)
- sctp_queue_purge_ulpevents(skb_list);
- else
- sctp_ulpevent_free(event);
+ sctp_queue_purge_ulpevents(skb_list);
return 0;
}
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index 1ad565ed5627..7c8f9d89e16a 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -46,6 +46,10 @@ static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid,
return 0;
}
+static void sctp_sched_fcfs_free_sid(struct sctp_stream *stream, __u16 sid)
+{
+}
+
static void sctp_sched_fcfs_free(struct sctp_stream *stream)
{
}
@@ -96,6 +100,7 @@ static struct sctp_sched_ops sctp_sched_fcfs = {
.get = sctp_sched_fcfs_get,
.init = sctp_sched_fcfs_init,
.init_sid = sctp_sched_fcfs_init_sid,
+ .free_sid = sctp_sched_fcfs_free_sid,
.free = sctp_sched_fcfs_free,
.enqueue = sctp_sched_fcfs_enqueue,
.dequeue = sctp_sched_fcfs_dequeue,
diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c
index 80b5a2c4cbc7..4fc9f2923ed1 100644
--- a/net/sctp/stream_sched_prio.c
+++ b/net/sctp/stream_sched_prio.c
@@ -204,6 +204,24 @@ static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid,
return sctp_sched_prio_set(stream, sid, 0, gfp);
}
+static void sctp_sched_prio_free_sid(struct sctp_stream *stream, __u16 sid)
+{
+ struct sctp_stream_priorities *prio = SCTP_SO(stream, sid)->ext->prio_head;
+ int i;
+
+ if (!prio)
+ return;
+
+ SCTP_SO(stream, sid)->ext->prio_head = NULL;
+ for (i = 0; i < stream->outcnt; i++) {
+ if (SCTP_SO(stream, i)->ext &&
+ SCTP_SO(stream, i)->ext->prio_head == prio)
+ return;
+ }
+
+ kfree(prio);
+}
+
static void sctp_sched_prio_free(struct sctp_stream *stream)
{
struct sctp_stream_priorities *prio, *n;
@@ -323,6 +341,7 @@ static struct sctp_sched_ops sctp_sched_prio = {
.get = sctp_sched_prio_get,
.init = sctp_sched_prio_init,
.init_sid = sctp_sched_prio_init_sid,
+ .free_sid = sctp_sched_prio_free_sid,
.free = sctp_sched_prio_free,
.enqueue = sctp_sched_prio_enqueue,
.dequeue = sctp_sched_prio_dequeue,
diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c
index ff425aed62c7..cc444fe0d67c 100644
--- a/net/sctp/stream_sched_rr.c
+++ b/net/sctp/stream_sched_rr.c
@@ -90,6 +90,10 @@ static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid,
return 0;
}
+static void sctp_sched_rr_free_sid(struct sctp_stream *stream, __u16 sid)
+{
+}
+
static void sctp_sched_rr_free(struct sctp_stream *stream)
{
sctp_sched_rr_unsched_all(stream);
@@ -177,6 +181,7 @@ static struct sctp_sched_ops sctp_sched_rr = {
.get = sctp_sched_rr_get,
.init = sctp_sched_rr_init,
.init_sid = sctp_sched_rr_init_sid,
+ .free_sid = sctp_sched_rr_free_sid,
.free = sctp_sched_rr_free,
.enqueue = sctp_sched_rr_enqueue,
.dequeue = sctp_sched_rr_dequeue,
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index b46a416787ec..7f40ed117fc7 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -347,6 +347,17 @@ static struct ctl_table sctp_net_table[] = {
.extra1 = &max_autoclose_min,
.extra2 = &max_autoclose_max,
},
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ {
+ .procname = "l3mdev_accept",
+ .data = &init_net.sctp.l3mdev_accept,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
{
.procname = "pf_enable",
.data = &init_net.sctp.pf_enable,
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 0a8510a0c5e6..b05daafd369a 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -38,8 +38,7 @@ static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq);
/* 1st Level Abstractions */
/* Initialize a ULP queue from a block of memory. */
-struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq,
- struct sctp_association *asoc)
+void sctp_ulpq_init(struct sctp_ulpq *ulpq, struct sctp_association *asoc)
{
memset(ulpq, 0, sizeof(struct sctp_ulpq));
@@ -48,8 +47,6 @@ struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq,
skb_queue_head_init(&ulpq->reasm_uo);
skb_queue_head_init(&ulpq->lobby);
ulpq->pd_mode = 0;
-
- return ulpq;
}
@@ -259,10 +256,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list)
return 1;
out_free:
- if (skb_list)
- sctp_queue_purge_ulpevents(skb_list);
- else
- sctp_ulpevent_free(event);
+ sctp_queue_purge_ulpevents(skb_list);
return 0;
}
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 3ccbf3c201cd..e12d4fa5aece 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -3380,14 +3380,14 @@ static int __init smc_init(void)
rc = register_pernet_subsys(&smc_net_stat_ops);
if (rc)
- return rc;
+ goto out_pernet_subsys;
smc_ism_init();
smc_clc_init();
rc = smc_nl_init();
if (rc)
- goto out_pernet_subsys;
+ goto out_pernet_subsys_stat;
rc = smc_pnet_init();
if (rc)
@@ -3480,6 +3480,8 @@ out_pnet:
smc_pnet_exit();
out_nl:
smc_nl_exit();
+out_pernet_subsys_stat:
+ unregister_pernet_subsys(&smc_net_stat_ops);
out_pernet_subsys:
unregister_pernet_subsys(&smc_net_ops);
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index e6ee797640b4..c305d8dd23f8 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -896,7 +896,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
}
memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1],
SMC_MAX_PNETID_LEN);
- if (smc_wr_alloc_lgr_mem(lgr))
+ rc = smc_wr_alloc_lgr_mem(lgr);
+ if (rc)
goto free_wq;
smc_llc_lgr_init(lgr, smc);
diff --git a/net/socket.c b/net/socket.c
index 00da9ce3dba0..55c5d536e5f6 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2199,13 +2199,7 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size,
static bool sock_use_custom_sol_socket(const struct socket *sock)
{
- const struct sock *sk = sock->sk;
-
- /* Use sock->ops->setsockopt() for MPTCP */
- return IS_ENABLED(CONFIG_MPTCP) &&
- sk->sk_protocol == IPPROTO_MPTCP &&
- sk->sk_type == SOCK_STREAM &&
- (sk->sk_family == AF_INET || sk->sk_family == AF_INET6);
+ return test_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags);
}
/*
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index a31a27816cc0..7bb247c51e2f 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1989,7 +1989,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
goto unwrap_failed;
mic.len = len;
mic.data = kmalloc(len, GFP_KERNEL);
- if (!mic.data)
+ if (ZERO_OR_NULL_PTR(mic.data))
goto unwrap_failed;
if (read_bytes_from_xdr_buf(rcv_buf, offset, mic.data, mic.len))
goto unwrap_failed;
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 5f96e75f9eec..48337687848c 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -130,8 +130,8 @@ gss_krb5_make_confounder(char *p, u32 conflen)
/* initialize to random value */
if (i == 0) {
- i = prandom_u32();
- i = (i << 32) | prandom_u32();
+ i = get_random_u32();
+ i = (i << 32) | get_random_u32();
}
switch (conflen) {
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index c3c693b51c94..f075a9fb5ccc 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -677,7 +677,7 @@ static void cache_limit_defers(void)
/* Consider removing either the first or the last */
if (cache_defer_cnt > DFR_MAX) {
- if (prandom_u32() & 1)
+ if (prandom_u32_max(2))
discard = list_entry(cache_defer_list.next,
struct cache_deferred_req, recent);
else
diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c
index c65c90ad626a..c1f559892ae8 100644
--- a/net/sunrpc/sysfs.c
+++ b/net/sunrpc/sysfs.c
@@ -518,13 +518,16 @@ void rpc_sysfs_client_setup(struct rpc_clnt *clnt,
struct net *net)
{
struct rpc_sysfs_client *rpc_client;
+ struct rpc_sysfs_xprt_switch *xswitch =
+ (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs;
+
+ if (!xswitch)
+ return;
rpc_client = rpc_sysfs_client_alloc(rpc_sunrpc_client_kobj,
net, clnt->cl_clid);
if (rpc_client) {
char name[] = "switch";
- struct rpc_sysfs_xprt_switch *xswitch =
- (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs;
int ret;
clnt->cl_sysfs = rpc_client;
@@ -558,6 +561,8 @@ void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch,
rpc_xprt_switch->xprt_switch = xprt_switch;
rpc_xprt_switch->xprt = xprt;
kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_ADD);
+ } else {
+ xprt_switch->xps_sysfs = NULL;
}
}
@@ -569,6 +574,9 @@ void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch,
struct rpc_sysfs_xprt_switch *switch_obj =
(struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs;
+ if (!switch_obj)
+ return;
+
rpc_xprt = rpc_sysfs_xprt_alloc(&switch_obj->kobject, xprt, gfp_flags);
if (rpc_xprt) {
xprt->xprt_sysfs = rpc_xprt;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 71dc26373444..656cec208371 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1865,7 +1865,7 @@ xprt_alloc_xid(struct rpc_xprt *xprt)
static void
xprt_init_xid(struct rpc_xprt *xprt)
{
- xprt->xid = prandom_u32();
+ xprt->xid = get_random_u32();
}
static void
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index f34d5427b66c..915b9902f673 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1619,7 +1619,7 @@ static int xs_get_random_port(void)
if (max < min)
return -EADDRINUSE;
range = max - min + 1;
- rand = (unsigned short) prandom_u32() % range;
+ rand = prandom_u32_max(range);
return rand + min;
}
diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c
index f09316a9035f..d67440de011e 100644
--- a/net/tipc/crypto.c
+++ b/net/tipc/crypto.c
@@ -1971,6 +1971,9 @@ rcv:
/* Ok, everything's fine, try to synch own keys according to peers' */
tipc_crypto_key_synch(rx, *skb);
+ /* Re-fetch skb cb as skb might be changed in tipc_msg_validate */
+ skb_cb = TIPC_SKB_CB(*skb);
+
/* Mark skb decrypted */
skb_cb->decrypted = 1;
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index da69e1abf68f..e8dcdf267c0c 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -148,8 +148,8 @@ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d,
{
struct net *net = d->net;
struct tipc_net *tn = tipc_net(net);
- bool trial = time_before(jiffies, tn->addr_trial_end);
u32 self = tipc_own_addr(net);
+ bool trial = time_before(jiffies, tn->addr_trial_end) && !self;
if (mtyp == DSC_TRIAL_FAIL_MSG) {
if (!trial)
@@ -211,7 +211,10 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
u32 self;
int err;
- skb_linearize(skb);
+ if (skb_linearize(skb)) {
+ kfree_skb(skb);
+ return;
+ }
hdr = buf_msg(skb);
if (caps & TIPC_NODE_ID128)
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index fc68733673ba..dfea27a906f2 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -880,7 +880,7 @@ static int tipc_nl_compat_name_table_dump_header(struct tipc_nl_compat_msg *msg)
};
ntq = (struct tipc_name_table_query *)TLV_DATA(msg->req);
- if (TLV_GET_DATA_LEN(msg->req) < sizeof(struct tipc_name_table_query))
+ if (TLV_GET_DATA_LEN(msg->req) < (int)sizeof(struct tipc_name_table_query))
return -EINVAL;
depth = ntohl(ntq->depth);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index f1c3b8eb4b3d..e902b01ea3cb 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -3010,7 +3010,7 @@ static int tipc_sk_insert(struct tipc_sock *tsk)
struct net *net = sock_net(sk);
struct tipc_net *tn = net_generic(net, tipc_net_id);
u32 remaining = (TIPC_MAX_PORT - TIPC_MIN_PORT) + 1;
- u32 portid = prandom_u32() % remaining + TIPC_MIN_PORT;
+ u32 portid = prandom_u32_max(remaining) + TIPC_MIN_PORT;
while (remaining--) {
portid++;
diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c
index 5522865deae9..e3b427a70398 100644
--- a/net/tipc/topsrv.c
+++ b/net/tipc/topsrv.c
@@ -176,7 +176,7 @@ static void tipc_conn_close(struct tipc_conn *con)
conn_put(con);
}
-static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s)
+static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s, struct socket *sock)
{
struct tipc_conn *con;
int ret;
@@ -202,10 +202,12 @@ static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s)
}
con->conid = ret;
s->idr_in_use++;
- spin_unlock_bh(&s->idr_lock);
set_bit(CF_CONNECTED, &con->flags);
con->server = s;
+ con->sock = sock;
+ conn_get(con);
+ spin_unlock_bh(&s->idr_lock);
return con;
}
@@ -450,17 +452,24 @@ static void tipc_conn_data_ready(struct sock *sk)
static void tipc_topsrv_accept(struct work_struct *work)
{
struct tipc_topsrv *srv = container_of(work, struct tipc_topsrv, awork);
- struct socket *lsock = srv->listener;
- struct socket *newsock;
+ struct socket *newsock, *lsock;
struct tipc_conn *con;
struct sock *newsk;
int ret;
+ spin_lock_bh(&srv->idr_lock);
+ if (!srv->listener) {
+ spin_unlock_bh(&srv->idr_lock);
+ return;
+ }
+ lsock = srv->listener;
+ spin_unlock_bh(&srv->idr_lock);
+
while (1) {
ret = kernel_accept(lsock, &newsock, O_NONBLOCK);
if (ret < 0)
return;
- con = tipc_conn_alloc(srv);
+ con = tipc_conn_alloc(srv, newsock);
if (IS_ERR(con)) {
ret = PTR_ERR(con);
sock_release(newsock);
@@ -472,11 +481,11 @@ static void tipc_topsrv_accept(struct work_struct *work)
newsk->sk_data_ready = tipc_conn_data_ready;
newsk->sk_write_space = tipc_conn_write_space;
newsk->sk_user_data = con;
- con->sock = newsock;
write_unlock_bh(&newsk->sk_callback_lock);
/* Wake up receive process in case of 'SYN+' message */
newsk->sk_data_ready(newsk);
+ conn_put(con);
}
}
@@ -489,7 +498,7 @@ static void tipc_topsrv_listener_data_ready(struct sock *sk)
read_lock_bh(&sk->sk_callback_lock);
srv = sk->sk_user_data;
- if (srv->listener)
+ if (srv)
queue_work(srv->rcv_wq, &srv->awork);
read_unlock_bh(&sk->sk_callback_lock);
}
@@ -568,19 +577,19 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
sub.seq.upper = upper;
sub.timeout = TIPC_WAIT_FOREVER;
sub.filter = filter;
- *(u32 *)&sub.usr_handle = port;
+ *(u64 *)&sub.usr_handle = (u64)port;
- con = tipc_conn_alloc(tipc_topsrv(net));
+ con = tipc_conn_alloc(tipc_topsrv(net), NULL);
if (IS_ERR(con))
return false;
*conid = con->conid;
- con->sock = NULL;
rc = tipc_conn_rcv_sub(tipc_topsrv(net), con, &sub);
- if (rc >= 0)
- return true;
+ if (rc)
+ conn_put(con);
+
conn_put(con);
- return false;
+ return !rc;
}
void tipc_topsrv_kern_unsubscr(struct net *net, int conid)
@@ -699,8 +708,9 @@ static void tipc_topsrv_stop(struct net *net)
__module_get(lsock->sk->sk_prot_creator->owner);
srv->listener = NULL;
spin_unlock_bh(&srv->idr_lock);
- sock_release(lsock);
+
tipc_topsrv_work_stop(srv);
+ sock_release(lsock);
idr_destroy(&srv->conn_idr);
kfree(srv);
}
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index cdb391a8754b..7fbb1d0b69b3 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -346,7 +346,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx,
salt = tls_ctx->crypto_send.aes_gcm_256.salt;
break;
default:
- return NULL;
+ goto free_req;
}
cipher_sz = &tls_cipher_size_desc[tls_ctx->crypto_send.info.cipher_type];
buf_len = cipher_sz->salt + cipher_sz->iv + TLS_AAD_SPACE_SIZE +
@@ -492,7 +492,8 @@ int tls_sw_fallback_init(struct sock *sk,
key = ((struct tls12_crypto_info_aes_gcm_256 *)crypto_info)->key;
break;
default:
- return -EINVAL;
+ rc = -EINVAL;
+ goto free_aead;
}
cipher_sz = &tls_cipher_size_desc[crypto_info->cipher_type];
diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
index 9b79e334dbd9..955ac3e0bf4d 100644
--- a/net/tls/tls_strp.c
+++ b/net/tls/tls_strp.c
@@ -273,7 +273,7 @@ static int tls_strp_read_copyin(struct tls_strparser *strp)
return desc.error;
}
-static int tls_strp_read_short(struct tls_strparser *strp)
+static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort)
{
struct skb_shared_info *shinfo;
struct page *page;
@@ -283,7 +283,7 @@ static int tls_strp_read_short(struct tls_strparser *strp)
* to read the data out. Otherwise the connection will stall.
* Without pressure threshold of INT_MAX will never be ready.
*/
- if (likely(!tcp_epollin_ready(strp->sk, INT_MAX)))
+ if (likely(qshort && !tcp_epollin_ready(strp->sk, INT_MAX)))
return 0;
shinfo = skb_shinfo(strp->anchor);
@@ -315,6 +315,27 @@ static int tls_strp_read_short(struct tls_strparser *strp)
return 0;
}
+static bool tls_strp_check_no_dup(struct tls_strparser *strp)
+{
+ unsigned int len = strp->stm.offset + strp->stm.full_len;
+ struct sk_buff *skb;
+ u32 seq;
+
+ skb = skb_shinfo(strp->anchor)->frag_list;
+ seq = TCP_SKB_CB(skb)->seq;
+
+ while (skb->len < len) {
+ seq += skb->len;
+ len -= skb->len;
+ skb = skb->next;
+
+ if (TCP_SKB_CB(skb)->seq != seq)
+ return false;
+ }
+
+ return true;
+}
+
static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len)
{
struct tcp_sock *tp = tcp_sk(strp->sk);
@@ -373,7 +394,7 @@ static int tls_strp_read_sock(struct tls_strparser *strp)
return tls_strp_read_copyin(strp);
if (inq < strp->stm.full_len)
- return tls_strp_read_short(strp);
+ return tls_strp_read_copy(strp, true);
if (!strp->stm.full_len) {
tls_strp_load_anchor_with_queue(strp, inq);
@@ -387,9 +408,12 @@ static int tls_strp_read_sock(struct tls_strparser *strp)
strp->stm.full_len = sz;
if (!strp->stm.full_len || inq < strp->stm.full_len)
- return tls_strp_read_short(strp);
+ return tls_strp_read_copy(strp, true);
}
+ if (!tls_strp_check_no_dup(strp))
+ return tls_strp_read_copy(strp, false);
+
strp->msg_ready = 1;
tls_rx_msg_ready(strp);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 15dbb392c875..b3545fc68097 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1147,7 +1147,7 @@ static int unix_autobind(struct sock *sk)
addr->name->sun_family = AF_UNIX;
refcount_set(&addr->refcnt, 1);
- ordernum = prandom_u32();
+ ordernum = get_random_u32();
lastnum = ordernum & 0xFFFFF;
retry:
ordernum = (ordernum + 1) & 0xFFFFF;
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index d45d5366115a..dc2763540393 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -204,6 +204,7 @@ void wait_for_unix_gc(void)
/* The external entry point: unix_gc() */
void unix_gc(void)
{
+ struct sk_buff *next_skb, *skb;
struct unix_sock *u;
struct unix_sock *next;
struct sk_buff_head hitlist;
@@ -297,11 +298,30 @@ void unix_gc(void)
spin_unlock(&unix_gc_lock);
+ /* We need io_uring to clean its registered files, ignore all io_uring
+ * originated skbs. It's fine as io_uring doesn't keep references to
+ * other io_uring instances and so killing all other files in the cycle
+ * will put all io_uring references forcing it to go through normal
+ * release.path eventually putting registered files.
+ */
+ skb_queue_walk_safe(&hitlist, skb, next_skb) {
+ if (skb->scm_io_uring) {
+ __skb_unlink(skb, &hitlist);
+ skb_queue_tail(&skb->sk->sk_receive_queue, skb);
+ }
+ }
+
/* Here we are. Hitlist is filled. Die. */
__skb_queue_purge(&hitlist);
spin_lock(&unix_gc_lock);
+ /* There could be io_uring registered files, just push them back to
+ * the inflight list
+ */
+ list_for_each_entry_safe(u, next, &gc_candidates, link)
+ list_move_tail(&u->link, &gc_inflight_list);
+
/* All candidates should have been detached by now. */
BUG_ON(!list_empty(&gc_candidates));
diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
index 7cf14c6b1725..e9bf15513961 100644
--- a/net/unix/unix_bpf.c
+++ b/net/unix/unix_bpf.c
@@ -145,12 +145,12 @@ int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool re
if (restore) {
sk->sk_write_space = psock->saved_write_space;
- WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+ sock_replace_proto(sk, psock->sk_proto);
return 0;
}
unix_dgram_bpf_check_needs_rebuild(psock->sk_proto);
- WRITE_ONCE(sk->sk_prot, &unix_dgram_bpf_prot);
+ sock_replace_proto(sk, &unix_dgram_bpf_prot);
return 0;
}
@@ -158,12 +158,12 @@ int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool r
{
if (restore) {
sk->sk_write_space = psock->saved_write_space;
- WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+ sock_replace_proto(sk, psock->sk_proto);
return 0;
}
unix_stream_bpf_check_needs_rebuild(psock->sk_proto);
- WRITE_ONCE(sk->sk_prot, &unix_stream_bpf_prot);
+ sock_replace_proto(sk, &unix_stream_bpf_prot);
return 0;
}
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index ee418701cdee..884eca7f6743 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1905,8 +1905,11 @@ static int vsock_connectible_wait_data(struct sock *sk,
err = 0;
transport = vsk->transport;
- while ((data = vsock_connectible_has_data(vsk)) == 0) {
+ while (1) {
prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE);
+ data = vsock_connectible_has_data(vsk);
+ if (data != 0)
+ break;
if (sk->sk_err != 0 ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
@@ -2092,8 +2095,6 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
const struct vsock_transport *transport;
int err;
- DEFINE_WAIT(wait);
-
sk = sock->sk;
vsk = vsock_sk(sk);
err = 0;
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 775e16cb99ed..af85d8909935 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -271,6 +271,8 @@ struct cfg80211_event {
} ij;
struct {
u8 bssid[ETH_ALEN];
+ const u8 *td_bitmap;
+ u8 td_bitmap_len;
} pa;
};
};
@@ -409,7 +411,8 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
bool wextev);
void __cfg80211_roamed(struct wireless_dev *wdev,
struct cfg80211_roam_info *info);
-void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid);
+void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid,
+ const u8 *td_bitmap, u8 td_bitmap_len);
int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev);
void cfg80211_autodisconnect_wk(struct work_struct *work);
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 581df7f4c524..58e1fb18f85a 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -42,6 +42,10 @@ void cfg80211_rx_assoc_resp(struct net_device *dev,
unsigned int link_id;
for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) {
+ cr.links[link_id].status = data->links[link_id].status;
+ WARN_ON_ONCE(cr.links[link_id].status != WLAN_STATUS_SUCCESS &&
+ (!cr.ap_mld_addr || !cr.links[link_id].bss));
+
cr.links[link_id].bss = data->links[link_id].bss;
if (!cr.links[link_id].bss)
continue;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 597c52236514..1ad0326ff4dc 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7780,6 +7780,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
int err;
memset(&params, 0, sizeof(params));
+ params.link_id = nl80211_link_id_or_invalid(info->attrs);
/* default to not changing parameters */
params.use_cts_prot = -1;
params.use_short_preamble = -1;
@@ -16139,7 +16140,8 @@ static u32 nl80211_internal_flags[] = {
#undef SELECTOR
};
-static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
+static int nl80211_pre_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
struct genl_info *info)
{
struct cfg80211_registered_device *rdev = NULL;
@@ -16240,7 +16242,8 @@ out_unlock:
return err;
}
-static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
+static void nl80211_post_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
struct genl_info *info)
{
u32 internal_flags = nl80211_internal_flags[ops->internal_flags];
@@ -16566,7 +16569,8 @@ static const struct genl_small_ops nl80211_small_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl80211_set_bss,
.flags = GENL_UNS_ADMIN_PERM,
- .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
+ .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_MLO_VALID_LINK_ID),
},
{
.cmd = NL80211_CMD_GET_REG,
@@ -17747,6 +17751,7 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
link_info_size += (cr->links[link].bssid ||
cr->links[link].bss) ?
nla_total_size(ETH_ALEN) : 0;
+ link_info_size += nla_total_size(sizeof(u16));
}
}
@@ -17815,7 +17820,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, bssid)) ||
(cr->links[link].addr &&
nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN,
- cr->links[link].addr)))
+ cr->links[link].addr)) ||
+ nla_put_u16(msg, NL80211_ATTR_STATUS_CODE,
+ cr->links[link].status))
goto nla_put_failure;
nla_nest_end(msg, nested_mlo_links);
@@ -17939,7 +17946,8 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
}
void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev,
- struct net_device *netdev, const u8 *bssid)
+ struct net_device *netdev, const u8 *bssid,
+ const u8 *td_bitmap, u8 td_bitmap_len)
{
struct sk_buff *msg;
void *hdr;
@@ -17959,6 +17967,11 @@ void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev,
nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid))
goto nla_put_failure;
+ if ((td_bitmap_len > 0) && td_bitmap)
+ if (nla_put(msg, NL80211_ATTR_TD_BITMAP,
+ td_bitmap_len, td_bitmap))
+ goto nla_put_failure;
+
genlmsg_end(msg, hdr);
genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 855d540ddfb9..ba9457e94c43 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -83,7 +83,8 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
struct net_device *netdev,
struct cfg80211_roam_info *info, gfp_t gfp);
void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev,
- struct net_device *netdev, const u8 *bssid);
+ struct net_device *netdev, const u8 *bssid,
+ const u8 *td_bitmap, u8 td_bitmap_len);
void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
struct net_device *netdev, u16 reason,
const u8 *ie, size_t ie_len, bool from_ap);
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index d5c7a5aa6853..c3d950d29432 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1084,6 +1084,8 @@ MODULE_FIRMWARE("regulatory.db");
static int query_regdb_file(const char *alpha2)
{
+ int err;
+
ASSERT_RTNL();
if (regdb)
@@ -1093,9 +1095,13 @@ static int query_regdb_file(const char *alpha2)
if (!alpha2)
return -ENOMEM;
- return request_firmware_nowait(THIS_MODULE, true, "regulatory.db",
- &reg_pdev->dev, GFP_KERNEL,
- (void *)alpha2, regdb_fw_cb);
+ err = request_firmware_nowait(THIS_MODULE, true, "regulatory.db",
+ &reg_pdev->dev, GFP_KERNEL,
+ (void *)alpha2, regdb_fw_cb);
+ if (err)
+ kfree(alpha2);
+
+ return err;
}
int reg_reload_regdb(void)
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 806a5f1330ff..26c74f5b473c 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -330,7 +330,8 @@ static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen,
* determine if they are the same ie.
*/
if (tmp_old[0] == WLAN_EID_VENDOR_SPECIFIC) {
- if (!memcmp(tmp_old + 2, tmp + 2, 5)) {
+ if (tmp_old[1] >= 5 && tmp[1] >= 5 &&
+ !memcmp(tmp_old + 2, tmp + 2, 5)) {
/* same vendor ie, copy from
* subelement
*/
@@ -1674,7 +1675,9 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev,
if (old == rcu_access_pointer(known->pub.ies))
rcu_assign_pointer(known->pub.ies, new->pub.beacon_ies);
- cfg80211_update_hidden_bsses(known, new->pub.beacon_ies, old);
+ cfg80211_update_hidden_bsses(known,
+ rcu_access_pointer(new->pub.beacon_ies),
+ old);
if (old)
kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head);
@@ -2524,10 +2527,15 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
const struct cfg80211_bss_ies *ies1, *ies2;
size_t ielen = len - offsetof(struct ieee80211_mgmt,
u.probe_resp.variable);
- struct cfg80211_non_tx_bss non_tx_data;
+ struct cfg80211_non_tx_bss non_tx_data = {};
res = cfg80211_inform_single_bss_frame_data(wiphy, data, mgmt,
len, gfp);
+
+ /* don't do any further MBSSID handling for S1G */
+ if (ieee80211_is_s1g_beacon(mgmt->frame_control))
+ return res;
+
if (!res || !wiphy->support_mbssid ||
!cfg80211_find_elem(WLAN_EID_MULTIPLE_BSSID, ie, ielen))
return res;
@@ -3229,8 +3237,9 @@ static int ieee80211_scan_results(struct cfg80211_registered_device *rdev,
int cfg80211_wext_giwscan(struct net_device *dev,
struct iw_request_info *info,
- struct iw_point *data, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_point *data = &wrqu->data;
struct cfg80211_registered_device *rdev;
int res;
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index d513536617bd..4b5b6ee0fe01 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -793,6 +793,10 @@ void __cfg80211_connect_result(struct net_device *dev,
}
for_each_valid_link(cr, link) {
+ /* don't do extra lookups for failures */
+ if (cr->links[link].status != WLAN_STATUS_SUCCESS)
+ continue;
+
if (cr->links[link].bss)
continue;
@@ -829,6 +833,16 @@ void __cfg80211_connect_result(struct net_device *dev,
}
memset(wdev->links, 0, sizeof(wdev->links));
+ for_each_valid_link(cr, link) {
+ if (cr->links[link].status == WLAN_STATUS_SUCCESS)
+ continue;
+ cr->valid_links &= ~BIT(link);
+ /* don't require bss pointer for failed links */
+ if (!cr->links[link].bss)
+ continue;
+ cfg80211_unhold_bss(bss_from_pub(cr->links[link].bss));
+ cfg80211_put_bss(wdev->wiphy, cr->links[link].bss);
+ }
wdev->valid_links = cr->valid_links;
for_each_valid_link(cr, link)
wdev->links[link].client.current_bss =
@@ -1237,7 +1251,8 @@ out:
}
EXPORT_SYMBOL(cfg80211_roamed);
-void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid)
+void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid,
+ const u8 *td_bitmap, u8 td_bitmap_len)
{
ASSERT_WDEV_LOCK(wdev);
@@ -1250,11 +1265,11 @@ void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid)
return;
nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev,
- bssid);
+ bssid, td_bitmap, td_bitmap_len);
}
void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid,
- gfp_t gfp)
+ const u8 *td_bitmap, u8 td_bitmap_len, gfp_t gfp)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
@@ -1264,12 +1279,15 @@ void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid,
if (WARN_ON(!bssid))
return;
- ev = kzalloc(sizeof(*ev), gfp);
+ ev = kzalloc(sizeof(*ev) + td_bitmap_len, gfp);
if (!ev)
return;
ev->type = EVENT_PORT_AUTHORIZED;
memcpy(ev->pa.bssid, bssid, ETH_ALEN);
+ ev->pa.td_bitmap = ((u8 *)ev) + sizeof(*ev);
+ ev->pa.td_bitmap_len = td_bitmap_len;
+ memcpy((void *)ev->pa.td_bitmap, td_bitmap, td_bitmap_len);
/*
* Use the wdev event list so that if there are pending
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 1f285b515028..8f403f9fe816 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -990,7 +990,9 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev)
__cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev);
break;
case EVENT_PORT_AUTHORIZED:
- __cfg80211_port_authorized(wdev, ev->pa.bssid);
+ __cfg80211_port_authorized(wdev, ev->pa.bssid,
+ ev->pa.td_bitmap,
+ ev->pa.td_bitmap_len);
break;
}
wdev_unlock(wdev);
@@ -1557,10 +1559,12 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
tmp = result;
tmp *= SCALE;
do_div(tmp, mcs_divisors[rate->mcs]);
- result = tmp;
/* and take NSS */
- result = (result * rate->nss) / 8;
+ tmp *= rate->nss;
+ do_div(tmp, 8);
+
+ result = tmp;
return result / 10000;
}
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index ddf340bfa07a..8a24dfca75af 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -25,16 +25,17 @@
int cfg80211_wext_giwname(struct net_device *dev,
struct iw_request_info *info,
- char *name, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
- strcpy(name, "IEEE 802.11");
+ strcpy(wrqu->name, "IEEE 802.11");
return 0;
}
EXPORT_WEXT_HANDLER(cfg80211_wext_giwname);
int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info,
- u32 *mode, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ __u32 *mode = &wrqu->mode;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev;
struct vif_params vifparams;
@@ -71,8 +72,9 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info,
EXPORT_WEXT_HANDLER(cfg80211_wext_siwmode);
int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info,
- u32 *mode, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ __u32 *mode = &wrqu->mode;
struct wireless_dev *wdev = dev->ieee80211_ptr;
if (!wdev)
@@ -108,8 +110,9 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_giwmode);
int cfg80211_wext_giwrange(struct net_device *dev,
struct iw_request_info *info,
- struct iw_point *data, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_point *data = &wrqu->data;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct iw_range *range = (struct iw_range *) extra;
enum nl80211_band band;
@@ -251,8 +254,9 @@ int cfg80211_wext_freq(struct iw_freq *freq)
int cfg80211_wext_siwrts(struct net_device *dev,
struct iw_request_info *info,
- struct iw_param *rts, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_param *rts = &wrqu->rts;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
u32 orts = wdev->wiphy->rts_threshold;
@@ -281,8 +285,9 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_siwrts);
int cfg80211_wext_giwrts(struct net_device *dev,
struct iw_request_info *info,
- struct iw_param *rts, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_param *rts = &wrqu->rts;
struct wireless_dev *wdev = dev->ieee80211_ptr;
rts->value = wdev->wiphy->rts_threshold;
@@ -295,8 +300,9 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_giwrts);
int cfg80211_wext_siwfrag(struct net_device *dev,
struct iw_request_info *info,
- struct iw_param *frag, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_param *frag = &wrqu->frag;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
u32 ofrag = wdev->wiphy->frag_threshold;
@@ -325,8 +331,9 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_siwfrag);
int cfg80211_wext_giwfrag(struct net_device *dev,
struct iw_request_info *info,
- struct iw_param *frag, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_param *frag = &wrqu->frag;
struct wireless_dev *wdev = dev->ieee80211_ptr;
frag->value = wdev->wiphy->frag_threshold;
@@ -339,8 +346,9 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_giwfrag);
static int cfg80211_wext_siwretry(struct net_device *dev,
struct iw_request_info *info,
- struct iw_param *retry, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_param *retry = &wrqu->retry;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
u32 changed = 0;
@@ -378,8 +386,9 @@ static int cfg80211_wext_siwretry(struct net_device *dev,
int cfg80211_wext_giwretry(struct net_device *dev,
struct iw_request_info *info,
- struct iw_param *retry, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_param *retry = &wrqu->retry;
struct wireless_dev *wdev = dev->ieee80211_ptr;
retry->disabled = 0;
@@ -588,8 +597,9 @@ static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
static int cfg80211_wext_siwencode(struct net_device *dev,
struct iw_request_info *info,
- struct iw_point *erq, char *keybuf)
+ union iwreq_data *wrqu, char *keybuf)
{
+ struct iw_point *erq = &wrqu->encoding;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
int idx, err;
@@ -664,8 +674,9 @@ out:
static int cfg80211_wext_siwencodeext(struct net_device *dev,
struct iw_request_info *info,
- struct iw_point *erq, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_point *erq = &wrqu->encoding;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
struct iw_encode_ext *ext = (struct iw_encode_ext *) extra;
@@ -767,8 +778,9 @@ static int cfg80211_wext_siwencodeext(struct net_device *dev,
static int cfg80211_wext_giwencode(struct net_device *dev,
struct iw_request_info *info,
- struct iw_point *erq, char *keybuf)
+ union iwreq_data *wrqu, char *keybuf)
{
+ struct iw_point *erq = &wrqu->encoding;
struct wireless_dev *wdev = dev->ieee80211_ptr;
int idx;
@@ -804,8 +816,9 @@ static int cfg80211_wext_giwencode(struct net_device *dev,
static int cfg80211_wext_siwfreq(struct net_device *dev,
struct iw_request_info *info,
- struct iw_freq *wextfreq, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_freq *wextfreq = &wrqu->freq;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
struct cfg80211_chan_def chandef = {
@@ -870,8 +883,9 @@ static int cfg80211_wext_siwfreq(struct net_device *dev,
static int cfg80211_wext_giwfreq(struct net_device *dev,
struct iw_request_info *info,
- struct iw_freq *freq, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_freq *freq = &wrqu->freq;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
struct cfg80211_chan_def chandef = {};
@@ -1147,8 +1161,9 @@ static int cfg80211_set_key_mgt(struct wireless_dev *wdev, u32 key_mgt)
static int cfg80211_wext_siwauth(struct net_device *dev,
struct iw_request_info *info,
- struct iw_param *data, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_param *data = &wrqu->param;
struct wireless_dev *wdev = dev->ieee80211_ptr;
if (wdev->iftype != NL80211_IFTYPE_STATION)
@@ -1180,7 +1195,7 @@ static int cfg80211_wext_siwauth(struct net_device *dev,
static int cfg80211_wext_giwauth(struct net_device *dev,
struct iw_request_info *info,
- struct iw_param *data, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
/* XXX: what do we need? */
@@ -1189,8 +1204,9 @@ static int cfg80211_wext_giwauth(struct net_device *dev,
static int cfg80211_wext_siwpower(struct net_device *dev,
struct iw_request_info *info,
- struct iw_param *wrq, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_param *wrq = &wrqu->power;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
bool ps;
@@ -1238,8 +1254,9 @@ static int cfg80211_wext_siwpower(struct net_device *dev,
static int cfg80211_wext_giwpower(struct net_device *dev,
struct iw_request_info *info,
- struct iw_param *wrq, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_param *wrq = &wrqu->power;
struct wireless_dev *wdev = dev->ieee80211_ptr;
wrq->disabled = !wdev->ps;
@@ -1249,8 +1266,9 @@ static int cfg80211_wext_giwpower(struct net_device *dev,
static int cfg80211_wext_siwrate(struct net_device *dev,
struct iw_request_info *info,
- struct iw_param *rate, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_param *rate = &wrqu->bitrate;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
struct cfg80211_bitrate_mask mask;
@@ -1307,8 +1325,9 @@ static int cfg80211_wext_siwrate(struct net_device *dev,
static int cfg80211_wext_giwrate(struct net_device *dev,
struct iw_request_info *info,
- struct iw_param *rate, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_param *rate = &wrqu->bitrate;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
struct station_info sinfo = {};
@@ -1430,8 +1449,9 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
static int cfg80211_wext_siwap(struct net_device *dev,
struct iw_request_info *info,
- struct sockaddr *ap_addr, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct sockaddr *ap_addr = &wrqu->ap_addr;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
int ret;
@@ -1455,8 +1475,9 @@ static int cfg80211_wext_siwap(struct net_device *dev,
static int cfg80211_wext_giwap(struct net_device *dev,
struct iw_request_info *info,
- struct sockaddr *ap_addr, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct sockaddr *ap_addr = &wrqu->ap_addr;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
int ret;
@@ -1480,8 +1501,9 @@ static int cfg80211_wext_giwap(struct net_device *dev,
static int cfg80211_wext_siwessid(struct net_device *dev,
struct iw_request_info *info,
- struct iw_point *data, char *ssid)
+ union iwreq_data *wrqu, char *ssid)
{
+ struct iw_point *data = &wrqu->data;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
int ret;
@@ -1505,8 +1527,9 @@ static int cfg80211_wext_siwessid(struct net_device *dev,
static int cfg80211_wext_giwessid(struct net_device *dev,
struct iw_request_info *info,
- struct iw_point *data, char *ssid)
+ union iwreq_data *wrqu, char *ssid)
{
+ struct iw_point *data = &wrqu->data;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
int ret;
@@ -1533,7 +1556,7 @@ static int cfg80211_wext_giwessid(struct net_device *dev,
static int cfg80211_wext_siwpmksa(struct net_device *dev,
struct iw_request_info *info,
- struct iw_point *data, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
@@ -1584,78 +1607,39 @@ static int cfg80211_wext_siwpmksa(struct net_device *dev,
return ret;
}
-#define DEFINE_WEXT_COMPAT_STUB(func, type) \
- static int __ ## func(struct net_device *dev, \
- struct iw_request_info *info, \
- union iwreq_data *wrqu, \
- char *extra) \
- { \
- return func(dev, info, (type *)wrqu, extra); \
- }
-
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwname, char)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwfreq, struct iw_freq)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwfreq, struct iw_freq)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwmode, u32)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwmode, u32)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwrange, struct iw_point)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwap, struct sockaddr)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwap, struct sockaddr)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwmlme, struct iw_point)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwscan, struct iw_point)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwessid, struct iw_point)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwessid, struct iw_point)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwrate, struct iw_param)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwrate, struct iw_param)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwrts, struct iw_param)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwrts, struct iw_param)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwfrag, struct iw_param)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwfrag, struct iw_param)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwretry, struct iw_param)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwretry, struct iw_param)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwencode, struct iw_point)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwencode, struct iw_point)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwpower, struct iw_param)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwpower, struct iw_param)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwgenie, struct iw_point)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwauth, struct iw_param)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwauth, struct iw_param)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwencodeext, struct iw_point)
-DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwpmksa, struct iw_point)
-
static const iw_handler cfg80211_handlers[] = {
- [IW_IOCTL_IDX(SIOCGIWNAME)] = __cfg80211_wext_giwname,
- [IW_IOCTL_IDX(SIOCSIWFREQ)] = __cfg80211_wext_siwfreq,
- [IW_IOCTL_IDX(SIOCGIWFREQ)] = __cfg80211_wext_giwfreq,
- [IW_IOCTL_IDX(SIOCSIWMODE)] = __cfg80211_wext_siwmode,
- [IW_IOCTL_IDX(SIOCGIWMODE)] = __cfg80211_wext_giwmode,
- [IW_IOCTL_IDX(SIOCGIWRANGE)] = __cfg80211_wext_giwrange,
- [IW_IOCTL_IDX(SIOCSIWAP)] = __cfg80211_wext_siwap,
- [IW_IOCTL_IDX(SIOCGIWAP)] = __cfg80211_wext_giwap,
- [IW_IOCTL_IDX(SIOCSIWMLME)] = __cfg80211_wext_siwmlme,
- [IW_IOCTL_IDX(SIOCSIWSCAN)] = cfg80211_wext_siwscan,
- [IW_IOCTL_IDX(SIOCGIWSCAN)] = __cfg80211_wext_giwscan,
- [IW_IOCTL_IDX(SIOCSIWESSID)] = __cfg80211_wext_siwessid,
- [IW_IOCTL_IDX(SIOCGIWESSID)] = __cfg80211_wext_giwessid,
- [IW_IOCTL_IDX(SIOCSIWRATE)] = __cfg80211_wext_siwrate,
- [IW_IOCTL_IDX(SIOCGIWRATE)] = __cfg80211_wext_giwrate,
- [IW_IOCTL_IDX(SIOCSIWRTS)] = __cfg80211_wext_siwrts,
- [IW_IOCTL_IDX(SIOCGIWRTS)] = __cfg80211_wext_giwrts,
- [IW_IOCTL_IDX(SIOCSIWFRAG)] = __cfg80211_wext_siwfrag,
- [IW_IOCTL_IDX(SIOCGIWFRAG)] = __cfg80211_wext_giwfrag,
- [IW_IOCTL_IDX(SIOCSIWTXPOW)] = cfg80211_wext_siwtxpower,
- [IW_IOCTL_IDX(SIOCGIWTXPOW)] = cfg80211_wext_giwtxpower,
- [IW_IOCTL_IDX(SIOCSIWRETRY)] = __cfg80211_wext_siwretry,
- [IW_IOCTL_IDX(SIOCGIWRETRY)] = __cfg80211_wext_giwretry,
- [IW_IOCTL_IDX(SIOCSIWENCODE)] = __cfg80211_wext_siwencode,
- [IW_IOCTL_IDX(SIOCGIWENCODE)] = __cfg80211_wext_giwencode,
- [IW_IOCTL_IDX(SIOCSIWPOWER)] = __cfg80211_wext_siwpower,
- [IW_IOCTL_IDX(SIOCGIWPOWER)] = __cfg80211_wext_giwpower,
- [IW_IOCTL_IDX(SIOCSIWGENIE)] = __cfg80211_wext_siwgenie,
- [IW_IOCTL_IDX(SIOCSIWAUTH)] = __cfg80211_wext_siwauth,
- [IW_IOCTL_IDX(SIOCGIWAUTH)] = __cfg80211_wext_giwauth,
- [IW_IOCTL_IDX(SIOCSIWENCODEEXT)]= __cfg80211_wext_siwencodeext,
- [IW_IOCTL_IDX(SIOCSIWPMKSA)] = __cfg80211_wext_siwpmksa,
+ IW_HANDLER(SIOCGIWNAME, cfg80211_wext_giwname),
+ IW_HANDLER(SIOCSIWFREQ, cfg80211_wext_siwfreq),
+ IW_HANDLER(SIOCGIWFREQ, cfg80211_wext_giwfreq),
+ IW_HANDLER(SIOCSIWMODE, cfg80211_wext_siwmode),
+ IW_HANDLER(SIOCGIWMODE, cfg80211_wext_giwmode),
+ IW_HANDLER(SIOCGIWRANGE, cfg80211_wext_giwrange),
+ IW_HANDLER(SIOCSIWAP, cfg80211_wext_siwap),
+ IW_HANDLER(SIOCGIWAP, cfg80211_wext_giwap),
+ IW_HANDLER(SIOCSIWMLME, cfg80211_wext_siwmlme),
+ IW_HANDLER(SIOCSIWSCAN, cfg80211_wext_siwscan),
+ IW_HANDLER(SIOCGIWSCAN, cfg80211_wext_giwscan),
+ IW_HANDLER(SIOCSIWESSID, cfg80211_wext_siwessid),
+ IW_HANDLER(SIOCGIWESSID, cfg80211_wext_giwessid),
+ IW_HANDLER(SIOCSIWRATE, cfg80211_wext_siwrate),
+ IW_HANDLER(SIOCGIWRATE, cfg80211_wext_giwrate),
+ IW_HANDLER(SIOCSIWRTS, cfg80211_wext_siwrts),
+ IW_HANDLER(SIOCGIWRTS, cfg80211_wext_giwrts),
+ IW_HANDLER(SIOCSIWFRAG, cfg80211_wext_siwfrag),
+ IW_HANDLER(SIOCGIWFRAG, cfg80211_wext_giwfrag),
+ IW_HANDLER(SIOCSIWTXPOW, cfg80211_wext_siwtxpower),
+ IW_HANDLER(SIOCGIWTXPOW, cfg80211_wext_giwtxpower),
+ IW_HANDLER(SIOCSIWRETRY, cfg80211_wext_siwretry),
+ IW_HANDLER(SIOCGIWRETRY, cfg80211_wext_giwretry),
+ IW_HANDLER(SIOCSIWENCODE, cfg80211_wext_siwencode),
+ IW_HANDLER(SIOCGIWENCODE, cfg80211_wext_giwencode),
+ IW_HANDLER(SIOCSIWPOWER, cfg80211_wext_siwpower),
+ IW_HANDLER(SIOCGIWPOWER, cfg80211_wext_giwpower),
+ IW_HANDLER(SIOCSIWGENIE, cfg80211_wext_siwgenie),
+ IW_HANDLER(SIOCSIWAUTH, cfg80211_wext_siwauth),
+ IW_HANDLER(SIOCGIWAUTH, cfg80211_wext_giwauth),
+ IW_HANDLER(SIOCSIWENCODEEXT, cfg80211_wext_siwencodeext),
+ IW_HANDLER(SIOCSIWPMKSA, cfg80211_wext_siwpmksa),
};
const struct iw_handler_def cfg80211_wext_handler = {
diff --git a/net/wireless/wext-compat.h b/net/wireless/wext-compat.h
index 8d3cc1552e2f..c02eb789e676 100644
--- a/net/wireless/wext-compat.h
+++ b/net/wireless/wext-compat.h
@@ -13,7 +13,7 @@
int cfg80211_ibss_wext_siwfreq(struct net_device *dev,
struct iw_request_info *info,
- struct iw_freq *freq, char *extra);
+ struct iw_freq *wextfreq, char *extra);
int cfg80211_ibss_wext_giwfreq(struct net_device *dev,
struct iw_request_info *info,
struct iw_freq *freq, char *extra);
@@ -32,7 +32,7 @@ int cfg80211_ibss_wext_giwessid(struct net_device *dev,
int cfg80211_mgd_wext_siwfreq(struct net_device *dev,
struct iw_request_info *info,
- struct iw_freq *freq, char *extra);
+ struct iw_freq *wextfreq, char *extra);
int cfg80211_mgd_wext_giwfreq(struct net_device *dev,
struct iw_request_info *info,
struct iw_freq *freq, char *extra);
@@ -51,10 +51,10 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev,
int cfg80211_wext_siwmlme(struct net_device *dev,
struct iw_request_info *info,
- struct iw_point *data, char *extra);
+ union iwreq_data *wrqu, char *extra);
int cfg80211_wext_siwgenie(struct net_device *dev,
struct iw_request_info *info,
- struct iw_point *data, char *extra);
+ union iwreq_data *wrqu, char *extra);
int cfg80211_wext_freq(struct iw_freq *freq);
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index 68f45afc352d..191c6d98c700 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -324,8 +324,9 @@ int cfg80211_mgd_wext_giwap(struct net_device *dev,
int cfg80211_wext_siwgenie(struct net_device *dev,
struct iw_request_info *info,
- struct iw_point *data, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
+ struct iw_point *data = &wrqu->data;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
u8 *ie = extra;
@@ -374,7 +375,7 @@ int cfg80211_wext_siwgenie(struct net_device *dev,
int cfg80211_wext_siwmlme(struct net_device *dev,
struct iw_request_info *info,
- struct iw_point *data, char *extra)
+ union iwreq_data *wrqu, char *extra)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct iw_mlme *mlme = (struct iw_mlme *)extra;
diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c
index 5259ef8f5242..748d8630ab58 100644
--- a/net/x25/x25_dev.c
+++ b/net/x25/x25_dev.c
@@ -117,7 +117,7 @@ int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev,
if (!pskb_may_pull(skb, 1)) {
x25_neigh_put(nb);
- return 0;
+ goto drop;
}
switch (skb->data[0]) {
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index acc8e52a4f5f..771d0fa90ef5 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -231,9 +231,9 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key)
return 0;
}
-static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+static int xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+ return __bpf_xdp_redirect_map(map, index, flags, 0,
__xsk_map_lookup_elem);
}
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 5f5aafd418af..21269e8f2db4 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -97,6 +97,18 @@ static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb)
}
}
+static inline bool xmit_xfrm_check_overflow(struct sk_buff *skb)
+{
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ __u32 seq = xo->seq.low;
+
+ seq += skb_shinfo(skb)->gso_segs;
+ if (unlikely(seq < xo->seq.low))
+ return true;
+
+ return false;
+}
+
struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again)
{
int err;
@@ -134,7 +146,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
return skb;
}
- if (skb_is_gso(skb) && unlikely(x->xso.dev != dev)) {
+ if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) ||
+ unlikely(xmit_xfrm_check_overflow(skb)))) {
struct sk_buff *segs;
/* Packet got rerouted, fixup features and segment it. */
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index 9f4d42eb090f..ce56d659c55a 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -714,7 +714,7 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff
oseq += skb_shinfo(skb)->gso_segs;
}
- if (unlikely(oseq < replay_esn->oseq)) {
+ if (unlikely(xo->seq.low < replay_esn->oseq)) {
XFRM_SKB_CB(skb)->seq.output.hi = ++oseq_hi;
xo->seq.hi = oseq_hi;
replay_esn->oseq_hi = oseq_hi;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index d0ae17e3bb38..9ec481fbfb63 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2081,7 +2081,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high,
} else {
u32 spi = 0;
for (h = 0; h < high-low+1; h++) {
- spi = low + prandom_u32()%(high-low+1);
+ spi = low + prandom_u32_max(high - low + 1);
x0 = xfrm_state_lookup(net, mark, &x->id.daddr, htonl(spi), x->id.proto, x->props.family);
if (x0 == NULL) {
newspi = htonl(spi);